Scrapy（Python）导出的 CSV 各行之间有空行

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import csv

from scrapy import signals
from scrapy.exporters import CsvItemExporter


class CSVPipeline(object):
    """Item pipeline that exports every item to ``<spider name>_items.csv``.

    The output file is opened when the spider starts and closed when it
    finishes; each processed item is handed to a ``CsvItemExporter``.
    """

    def __init__(self):
        # Open output files, keyed by the spider they were opened for.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the spider open/close signals."""
        instance = cls()
        subscriptions = (
            (instance.spider_opened, signals.spider_opened),
            (instance.spider_closed, signals.spider_closed),
        )
        for handler, signal in subscriptions:
            crawler.signals.connect(handler, signal)
        return instance

    def spider_opened(self, spider):
        """Open the spider's CSV file and start an exporter writing to it."""
        csv_file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = csv_file

        # NOTE(review): only one exporter is kept on the pipeline, while the
        # files are tracked per spider.
        exporter = self.exporter = CsvItemExporter(csv_file)
        # Restrict and order the exported columns.
        exporter.fields_to_export = ["plotid", "plotprice", "plotname", "name", "address"]
        exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the file that belongs to ``spider``."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export ``item`` and return it unchanged for any later pipeline."""
        self.exporter.export_item(item)
        return item

import scrapy
from urllib.parse import urljoin

from CharlesChurch.items import CharleschurchItem


class charleschurchSpider(scrapy.Spider):
    """Spider that yields one item per plot that is not marked "Sold"."""

    name = "charleschurch"
    allowed_domains = ["charleschurch.com"]
    start_urls = ["https://www.charleschurch.com/county-durham_willington/the-ridings-1111"]

    def parse(self, response):
        """Yield a ``CharleschurchItem`` for every unsold plot in ``response``.

        Each item carries the page-level ``name`` and ``address`` values plus
        the plot's own ``plotname``, ``plotid`` (the link's ``href``) and
        ``plotprice``. Plot names, ids and prices are paired positionally with
        ``zip``, so the shortest of the three lists decides how many items
        are produced.
        """
        # Shared XPath prefixes; the full expressions are unchanged.
        details = '//*[@id="XplodePage_ctl12_dsDetailsSnippet_pDetailsContainer"]'
        plots = (
            '//div[@class="housetype js-filter-housetype"]'
            '/div[@class="housetype__col-2"]'
            '/div[@class="housetype__plots"]'
            '/div[not(contains(@data-status,"Sold"))]'
        )

        for sel in response.xpath('//*[@id="aspnetForm"]/div[4]'):
            # NOTE: every expression below starts with '//', so it is an
            # absolute XPath and is not restricted to the subtree of ``sel``.
            name = sel.xpath(details + '/span[1]/b/text()').extract()
            address = sel.xpath(
                details + '/div/*[@itemprop="postalCode"]/text()'
            ).extract()

            plotnames = [
                text.strip()
                for text in sel.xpath(
                    plots + '/div[@class="plot__name"]/a/text()'
                ).extract()
            ]
            plotids = [
                href.strip()
                for href in sel.xpath(
                    plots + '/div[@class="plot__name"]/a/@href'
                ).extract()
            ]
            plotprices = [
                text.strip()
                for text in sel.xpath(
                    plots + '/div[@class="plot__price"]/text()'
                ).extract()
            ]

            for plotname, plotid, plotprice in zip(plotnames, plotids, plotprices):
                # Build a fresh item per plot: re-yielding one mutated item
                # would make every yielded item the same object, so a later
                # plot's values would overwrite those of earlier items.
                item = CharleschurchItem()
                item['name'] = list(name)
                item['address'] = list(address)
                item['plotname'] = plotname
                item['plotid'] = plotid
                item['plotprice'] = plotprice
                yield item

2条回答

网友

1楼 · 编辑于 2024-06-26 13:41:03

w+b 中的 b 很可能是问题的一部分，因为它会使文件被当作二进制文件处理，换行符因此按原样写入。

所以第一步是删除b。然后通过添加U还可以激活通用换行支持（请参见：https://docs.python.org/3/glossary.html#term-universal-newlines）

所以这条线应该是这样的：

# NOTE(review): in Python 3 the 'U' flag cannot be combined with 'w' or '+'
# (open() raises ValueError), and 'U' was removed entirely in Python 3.11 —
# confirm this mode string on the interpreter in use before relying on it.
file = open('%s_items.csv' % spider.name, 'Uw+')

网友

2楼 · 编辑于 2024-06-26 13:41:03

这个办法可能不太理想，但我已经找到了解决这个问题的办法。我在 pipelines.py 文件中添加了一些代码，基本上是把带有空白行的 CSV 文件读入列表，删除其中的空白行，然后将清理后的列表写入新文件。

我添加的代码是：

# Copy the exported CSV to a second file, dropping the blank rows.
# Both files are opened with newline='' so that the csv module handles line
# endings itself, as its documentation asks for file objects passed to
# csv.reader / csv.writer.
with open('%s_items.csv' % spider.name, 'r', newline='') as f:
    # csv.reader yields an empty list for a blank line; keep non-empty rows only.
    cleaned_list = [row for row in csv.reader(f) if row]

with open('%s_items_cleaned.csv' % spider.name, 'w', newline='') as output_file:
    wr = csv.writer(output_file, dialect='excel')
    wr.writerows(cleaned_list)

所以整个 pipelines.py 文件是：

^{pr2}$

不太理想，但现在能解决问题。

相关问题更多 >

编程相关推荐

热门问题

热门文章