如果在selenium中找不到元素，如何删除元素

import logging
import os
import urllib

import scrapy
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

CHROME_DRIVER_WINDOW_PATH = "C:/Users/RAJ/PycharmProjects/WebCrawler/WebCrawler/WebCrawler/spiders/chromedriver.exe"


class ProductSpider(scrapy.Spider):
    """Spider that drives Chrome through Selenium to visit each search result.

    The listing page is loaded in the browser, every result card is clicked,
    and the detail page that opens in a new tab is inspected by
    ``scrape_data``.
    """

    name = "product_spider"
    # Scrapy expects bare domain names here, not URLs.
    allowed_domains = ['www.startupindia.gov.in']
    start_urls = [
        'https://www.startupindia.gov.in/content/sih/en/search.html?industries=sih:industry/agriculture&sectors=sih:industry/agriculture/dairy-farming&states=sih:location/india/andhra-pradesh&roles=Startup&page=0']

    def __init__(self, *args, **kwargs):
        """Initialise the Scrapy spider and start a headless Chrome session."""
        super().__init__(*args, **kwargs)
        opts = ChromeOptions()
        opts.add_argument("--headless")
        # The options must be handed to the driver, otherwise "--headless"
        # has no effect.
        self.driver = webdriver.Chrome(
            executable_path=CHROME_DRIVER_WINDOW_PATH, options=opts)

    def closed(self, reason):
        """Quit the browser when Scrapy closes the spider."""
        self.driver.quit()

    def parse(self, response):
        """Open every result of the listing page and scrape its detail tab.

        A result whose click or scrape raises a Selenium error (including a
        missing element on the detail page) is logged and skipped so that
        the remaining results are still processed.
        """
        self.driver.get(response.url)
        listing_handle = self.driver.current_window_handle
        result_links = self.driver.find_elements_by_xpath(
            "//*[@id='persona-results']//a[@class='img-wrap']")
        for link in result_links:
            # Remember the open tabs so the newly opened one can be found
            # without relying on its position in window_handles.
            known_handles = set(self.driver.window_handles)
            try:
                link.click()
                new_handles = [handle for handle in self.driver.window_handles
                               if handle not in known_handles]
                if not new_handles:
                    logging.warning("click did not open a new tab; skipping result")
                    continue
                self.driver.switch_to.window(new_handles[0])
                try:
                    logging.info(self.driver.current_url)
                    self.driver.get(self.driver.current_url)
                    self.scrape_data()
                finally:
                    # Close the detail tab so tabs do not pile up.
                    self.driver.close()
            except WebDriverException as exc:
                logging.warning("skipping result: %s", exc)
            finally:
                # Always return to the listing tab before the next click.
                self.driver.switch_to.window(listing_handle)

    def scrape_data(self):
        """Log the company link text and name shown on the current page.

        Raises:
            NoSuchElementException: if either element is not on the page.
        """
        url_of_comp = self.driver.find_element_by_css_selector(
            'div.container div.company-name span > a').text
        name = self.driver.find_element_by_css_selector(
            'div.container div.company-name p').text
        logging.info(url_of_comp)
        logging.info(name)

1条回答

网友

1楼 · 发布于 2024-09-27 02:21:00

您不必逐个打开详情页来抓取名称和 URL，列表页面上的信息就足够了。

请查看下面更新后的 parse 函数：

    def parse(self, response):
        """Yield one ``{"url", "name"}`` dict per result card on the listing page.

        The page at ``response.url`` is loaded in ``self.driver`` and every
        result anchor is read in place; no detail page is opened. A card
        without a title element is skipped instead of raising, so one
        incomplete card does not stop the rest from being yielded.
        """
        self.driver.get(response.url)

        item_list = []
        list_items = self.driver.find_elements_by_xpath("//*[@id='persona-results']//a[@class='img-wrap']")
        for item in list_items:
            # The plural lookup returns an empty list when nothing matches,
            # whereas find_element_by_xpath would raise.
            titles = item.find_elements_by_xpath('./div/div[@class="events-details"]/h3')
            if not titles:
                continue
            items = {"url": item.get_attribute("href"),
                     "name": titles[0].text}
            item_list.append(items)
            yield items
        print(item_list)

相关问题更多 >

编程相关推荐

热门问题

热门文章