Important: this is not a tutorial.
You could also check out Playwright, a new end-to-end testing tool.
It should work nicely for many scraping purposes.
Looks nice and promising; it was recommended to me by some people from the field.
https://playwright.dev/python/docs/intro
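As a quick taste, here is a minimal sketch of grabbing a page title with Playwright's sync API (the URL is just a placeholder; install with pip install playwright, then playwright install):

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://playwright.dev")   # placeholder URL
    print(page.title())
    browser.close()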
Scrapy form trick
import scrapy
from scrapy import Spider, FormRequest
from scrapy.utils.response import open_in_browser


class TheScraper(Spider):
    name = 'simp'
    start_urls = ['https://xxxxxxxxxxxxxxxx.com/login']

    def parse(self, response, **kwargs):
        # Submit the login form with our credentials
        yield FormRequest(url=self.start_urls[0],
                          formdata={
                              'email': 'login',
                              'password': 'pass'
                          },
                          callback=self.scrape_page)

    def scrape_page(self, response):
        print("Logged in!")
        # Do stuff with the authenticated session
        url = 'https://xxxxxxxxxxxxxxxxxxxxxxxxx.com'
        yield scrapy.Request(url=url, callback=self.parse_100)

    def parse_100(self, response):
        # Open the received response in a local browser to inspect it
        open_in_browser(response)
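If you want to try the spider outside a full Scrapy project, one option (just a sketch, assuming the class above sits in the same file) is to run it from a script with CrawlerProcess:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(TheScraper)
process.start()   # blocks until the crawl finishes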
Selenium headless mode
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def getOptions():
    options = Options()
    options.headless = True   # True = run without a visible browser window
    return options

browser = webdriver.Chrome(options=getOptions())
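Note that newer Selenium 4 releases deprecate the headless property for Chrome; if the attribute above stops working, the equivalent (as far as I know) is to pass the flag directly:

options = Options()
options.add_argument("--headless=new")   # new headless mode on recent Chrome
# or options.add_argument("--headless") on older Chrome builds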
Selenium basic setup and XPath selectors
# Getting an XPath selector for an element
# Install the library first: pip install selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Download the Chrome webdriver and drop it into the project directory.
if __name__ == "__main__":
    # Load the driver with defaults
    # browser = webdriver.Chrome()
    # Or load it with options (headless) - getOptions() is the helper from the section above
    browser = webdriver.Chrome(options=getOptions())
    # Open the desired page
    browser.get("https://www.markurion.eu")
WebDriverWait – makes the browser wait, so our dynamically rendered content is there before we touch it.
EC (expected_conditions) – tells Selenium which condition it should wait for.
By – lets us use XPath or CSS selectors to find our target.
You can probably find more about XPath and CSS selectors on this website (a combined example follows below):
https://saucelabs.com/resources/articles/selenium-tips-css-selectors
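For instance, the same wait pattern works with a CSS selector instead of XPath (a sketch; the "h1.page-title" selector is just a placeholder):

# Wait up to 10 seconds for an element matched by a CSS selector to appear
element = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "h1.page-title"))  # placeholder selector
)
print(element.text)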
Selecting one
# Let's try to select one attribute
main_title = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.XPATH, "ENTER_HERE_XPATH_SELECTOR"))
).get_attribute('innerText')

# For the last part you could use any attribute you want, like:
# .get_attribute('innerHTML')
# .get_attribute('src')
# .get_attribute('id')
Selecting multiple
try:
    # List of the videos to process
    the_list = WebDriverWait(browser, 10).until(
        EC.presence_of_all_elements_located((By.XPATH, ".//a[contains(@class,'block')]")))
    for item in the_list:
        # The inner <div> holds the title on the first line and the duration on the second
        info = item.find_element(By.XPATH, ".//div").get_attribute('innerText').split("\n")
        title = info[0]
        file_title = sanitize(title)   # sanitize() is a filename-cleaning helper defined elsewhere
        link = item.get_attribute('href')
        duration = info[1][1:]         # second line minus its first character
        print(title)
        print(link)
except Exception as e:
    print(e)
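One thing the snippets above never do is shut the driver down; when you are done scraping, close the browser so no Chrome processes are left hanging:

# Close all windows and end the webdriver session
browser.quit()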