在启动CrawlerProcess/Scrapy时修改spider的CSV文件输入

def main():
    """Schedule every spider on a single CrawlerProcess and run them."""
    # ----- This part launch all given spiders ----- #
    process = CrawlerProcess(get_project_settings())

    process.crawl(FirstSpider)
    process.crawl(SecondSpider)
    process.crawl(ThirdSpider)
    process.crawl(EtcSpider)

    process.start()  # the script will block here until the crawling is finished

class FirstSpider(scrapy.Spider):
    """Spider that issues one search request per row of a CSV input file."""

    name = "first_bot"

    def start_requests(self):
        """Yield a search request for every non-empty row of the input CSV.

        The file ``files/to_collect_firstbot.csv`` is looked up next to this
        module. The first column of each row is appended to the base search
        URL to build the request URL.

        Yields:
            scrapy.Request: one request per non-empty CSV row, handled by
            ``self.parse``.
        """
        base_url = "https://example.fr/catalogsearch/result/?q="
        script_dir = osp.dirname(osp.realpath(__file__))
        file_path = osp.join(script_dir, 'files', 'to_collect_firstbot.csv')
        # Use a context manager so the file handle is released once the
        # generator is exhausted or closed; newline="" is what the csv module
        # recommends for files handed to csv.reader.
        with open(file_path, 'r', encoding="utf-8", errors="ignore",
                  newline="") as input_file:
            for row in csv.reader(input_file):
                if not row:
                    # Skip blank lines in the CSV.
                    continue
                absolute_url = base_url + row[0]
                print(absolute_url)
                yield scrapy.Request(
                    absolute_url,
                    meta={
                        # Ask Scrapy to hand responses with these statuses
                        # to the callback.
                        "handle_httpstatus_list": [302, 301, 502],
                    },
                    callback=self.parse
                )

2条回答

网友

1楼 · 编辑于 2024-09-26 18:10:28

crawl 方法可以接受参数，你可以在 spider 的 from_crawler 中使用它们。

网友

2楼 · 编辑于 2024-09-26 18:10:28

你可以在调用 crawl 时向 spider 传递参数，我认为这正是解决这个问题所需要的做法。

将代码更改为：

class FirstSpider(scrapy.Spider):
    """Spider that issues one search request per row of a CSV input file.

    The CSV file name defaults to the ``file_name`` class attribute and can
    be overridden per run by passing ``file_name=...`` as a spider argument.
    """

    name = "first_bot"

    file_name = 'to_collect_firstbot.csv' # <- we are gonna change this variable later

    def start_requests(self):
        """Yield a search request for every non-empty row of the input CSV.

        The file is looked up as ``files/<file_name>`` next to this module.
        The first column of each row is appended to the base search URL to
        build the request URL.

        Yields:
            scrapy.Request: one request per non-empty CSV row, handled by
            ``self.parse``.
        """
        base_url = "https://example.fr/catalogsearch/result/?q="
        script_dir = osp.dirname(osp.realpath(__file__))
        file_path = osp.join(script_dir, 'files', self.file_name) # here we use the argument
        # Use a context manager so the file handle is released once the
        # generator is exhausted or closed; newline="" is what the csv module
        # recommends for files handed to csv.reader.
        with open(file_path, 'r', encoding="utf-8", errors="ignore",
                  newline="") as input_file:
            for row in csv.reader(input_file):
                if not row:
                    # Skip blank lines in the CSV.
                    continue
                absolute_url = base_url + row[0]
                print(absolute_url)
                yield scrapy.Request(
                    absolute_url,
                    meta={
                        # Ask Scrapy to hand responses with these statuses
                        # to the callback.
                        "handle_httpstatus_list": [302, 301, 502],
                    },
                    callback=self.parse
                )

现在，在启动 spider 时，只需在 process.crawl 调用中把文件名作为参数传入：

def main():
    """Schedule every spider on a single CrawlerProcess and run them.

    Keyword arguments given to ``crawl`` after the spider class are passed
    on to that spider; ``file_name`` selects the CSV file it should read.
    """
    crawler_process = CrawlerProcess(get_project_settings())

    # Spiders scheduled with an explicit CSV file name.
    crawler_process.crawl(FirstSpider, file_name='custom_file1.csv')
    crawler_process.crawl(SecondSpider, file_name='custom_file2.csv')
    # No file_name here: this spider keeps the default from its own code.
    crawler_process.crawl(ThirdSpider)
    crawler_process.crawl(EtcSpider, file_name='custom_file_whatever.csv')

    # Blocks until all scheduled crawls have finished.
    crawler_process.start()

注意第三个调用没有设置 file_name 参数，这意味着该 spider 将使用其代码中指定的默认值：

file_name = 'to_collect_firstbot.csv'  # class-level default, used when crawl() is given no file_name

相关问题更多 >

编程相关推荐

热门问题

热门文章