我正在从一个名为 Startup India 的网站上抓取数据,想提取每个公司资料(profile)的 URL 和名称。但有些资料没有 URL;遇到这种情况时,我希望把名称和 URL 设置为 None。我尝试过很多办法,例如 try-except 语句和 if-else 语句,但都不起作用,所以需要帮助。
代码如下:
import scrapy
import urllib
from selenium import webdriver
import os
import logging
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.wait import WebDriverWait
# Filesystem path of the ChromeDriver executable that the spider passes to
# webdriver.Chrome. It is an absolute, machine-specific Windows path and has
# to be adapted before running the spider elsewhere.
CHROME_DRIVER_WINDOW_PATH = "C:/Users/RAJ/PycharmProjects/WebCrawler/WebCrawler/WebCrawler/spiders/chromedriver.exe"
class ProductSpider(scrapy.Spider):
    """Collect the name and URL of startup profiles using a Selenium browser.

    The listing page is opened in Chrome, every profile thumbnail is clicked,
    and the profile tab that opens is scraped for the company name and the
    company URL. A field that is absent from a profile is reported as
    ``None`` instead of aborting the profile.
    """

    name = "product_spider"
    # Scrapy expects bare domain names here; a full URL is rejected by the
    # offsite filter with a warning and never matches any request.
    allowed_domains = ['startupindia.gov.in']
    start_urls = [
        'https://www.startupindia.gov.in/content/sih/en/search.html'
        '?industries=sih:industry/agriculture'
        # "&sectors" had been corrupted to the section sign plus "ors".
        '&sectors=sih:industry/agriculture/dairy-farming'
        '&states=sih:location/india/andhra-pradesh&roles=Startup&page=0']

    # Seconds to wait for the listing to render and for a profile tab to open.
    WAIT_TIMEOUT = 10

    def __init__(self, *args, **kwargs):
        """Start the Chrome driver.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so spider arguments supplied by Scrapy keep working.
        """
        super(ProductSpider, self).__init__(*args, **kwargs)
        opts = ChromeOptions()
        opts.add_argument("--headless")
        # The options object was previously built but never handed to the
        # driver, so the headless flag had no effect.
        self.driver = webdriver.Chrome(
            executable_path=CHROME_DRIVER_WINDOW_PATH, options=opts)

    def closed(self, reason):
        """Quit the browser when Scrapy closes the spider."""
        self.driver.quit()

    def parse(self, response):
        """Open each profile from the listing page and yield its data.

        Yields:
            dict: ``{'name': ..., 'url': ...}`` for every profile that could
            be opened; either value is ``None`` when the profile lacks it.
        """
        self.driver.get(response.url)
        main_window = self.driver.current_window_handle

        try:
            # Wait until the result thumbnails are present instead of
            # querying the DOM once immediately after navigation.
            profile_links = WebDriverWait(self.driver, self.WAIT_TIMEOUT).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH,
                     "//*[@id='persona-results']//a[@class='img-wrap']")))
        except TimeoutException:
            logging.warning("No profile links found on %s", response.url)
            return

        for link in profile_links:
            handles_before = self.driver.window_handles
            item = None
            try:
                link.click()  # click on image in page
                # Locate the tab created by this click rather than assuming
                # a position in window_handles.
                WebDriverWait(self.driver, self.WAIT_TIMEOUT).until(
                    EC.new_window_is_opened(handles_before))
                new_tabs = [handle for handle in self.driver.window_handles
                            if handle not in handles_before]
                self.driver.switch_to.window(new_tabs[0])
                logging.info(self.driver.current_url)
                # Kept from the original flow: re-requesting the URL blocks
                # until the profile page has finished loading.
                self.driver.get(self.driver.current_url)
                item = self.scrape_data()
            except Exception:
                # Per-profile boundary: record the failure with its
                # traceback and continue with the next profile.
                logging.exception("Failed to scrape a profile")
            finally:
                self._close_extra_tabs(main_window)
            if item is not None:
                yield item

    def scrape_data(self):
        """Read the company URL text and name from the current profile page.

        Returns:
            dict: ``{'name': ..., 'url': ...}``; a value is ``None`` when the
            matching element is missing or has no text.
        """
        url_of_comp = self._first_text(
            'div.container div.company-name span > a')
        name = self._first_text('div.container div.company-name p')
        logging.info(url_of_comp)
        logging.info(name)
        return {'name': name, 'url': url_of_comp}

    def _first_text(self, css_selector):
        """Return the text of the first match for *css_selector*, or None."""
        # find_elements_* returns an empty list when nothing matches, whereas
        # find_element_* raises; that exception is what broke profiles
        # without a URL.
        matches = self.driver.find_elements_by_css_selector(css_selector)
        if not matches:
            return None
        return matches[0].text or None

    def _close_extra_tabs(self, main_window):
        """Close every tab except *main_window* and switch back to it."""
        for handle in self.driver.window_handles:
            if handle != main_window:
                self.driver.switch_to.window(handle)
                self.driver.close()
        self.driver.switch_to.window(main_window)
如能提供示例代码,将不胜感激。
您不必逐个打开详情页来抓取 name 和 URL,列表页上的信息应该就足够了。请查看更新后的 parse 函数。
相关问题 更多 >
编程相关推荐