Scrapy URL fetching

Hi, I'm new to Python and Scrapy. I'm trying to write a spider, but I can't find where the error is, or how to fix it, in the way the links from the rules are processed.

I don't know whether it's an encoding problem, a relative-path problem, or something else.

When I run the script, the start URL yields 94 item links to follow and, as you can see, I get 94 'spider_exceptions/ValueError' entries.

The error:

ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters
2017-07-17 16:12:49 [scrapy.core.scraper] ERROR: Spider error processing <GET https://subastas.boe.es/detalleSubasta.php?idSub=SUB-JA-2017-68197&idBus=_VDFMQktMNXdpU0loK3B1UjZhMzhzUHdTUmdiTW9DNjBhM3lkMWpZWDBGbXdtOEVmWW13VmlhSC8vQUR5V1RNRjY0NWhVcjd2aDRMbkVyMkFLbmN4Ym0wc1E4eHVHWHlxSURJSTVBeGhzNGFIRzNkOUpBbW9SRG5RZExsbUNNeFFORSs1R21vaEJIeVhrMkdKdGRYUzg5N1laT2NPUTBwYUI0SVlHTm8vRkF4UEpleHE0b2U2MmZTdFhvZlIyUzgyemg0ekhOSEVoWEtuaVFMbXdBei92MytWaXNhWGtUTVd4SDJZUk9KUUJpVnExa01TeUhOcGZFQ1JqZDIxVU9BTWpHMGJVRU9rNmljVVN4UFFkNUp4SG1FR3dYWGlrVGgxWVJnWkRIQVJXZWxadVRpYWRUcm81WUgxeW4xb3RxQWJXV3JSNUl1N0NYZFoyVlhDaldGWU5RPT0,> (referer: https://subastas.boe.es/subastas_ava.php?campo%5B0%5D=SUBASTA.ORIGEN&dato%5B0%5D=&campo%5B1%5D=SUBASTA.ESTADO&dato%5B1%5D=EJ&campo%5B2%5D=BIEN.TIPO&dato%5B2%5D=I&dato%5B3%5D=501&campo%5B4%5D=BIEN.DIRECCION&dato%5B4%5D=&campo%5B5%5D=BIEN.CODPOSTAL&dato%5B5%5D=&campo%5B6%5D=BIEN.LOCALIDAD&dato%5B6%5D=&campo%5B7%5D=BIEN.COD_PROVINCIA&dato%5B7%5D=28&campo%5B8%5D=SUBASTA.POSTURA_MINIMA_MINIMA_LOTES&dato%5B8%5D=&campo%5B9%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_1&dato%5B9%5D=&campo%5B10%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_2&dato%5B10%5D=&campo%5B11%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_3&dato%5B11%5D=&campo%5B12%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_4&dato%5B12%5D=&campo%5B13%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_5&dato%5B13%5D=&campo%5B14%5D=SUBASTA.ID_SUBASTA_BUSCAR&dato%5B14%5D=&campo%5B15%5D=SUBASTA.FECHA_FIN_YMD&dato%5B15%5D%5B0%5D=&dato%5B15%5D%5B1%5D=&campo%5B16%5D=SUBASTA.FECHA_INICIO_YMD&dato%5B16%5D%5B0%5D=&dato%5B16%5D%5B1%5D=&page_hits=1000&sort_field%5B0%5D=SUBASTA.FECHA_FIN_YMD&sort_order%5B0%5D=desc&sort_field%5B1%5D=SUBASTA.FECHA_FIN_YMD&sort_order%5B1%5D=asc&sort_field%5B2%5D=SUBASTA.HORA_FIN&sort_order%5B2%5D=asc&accion=Buscar)

and the final stats:

2017-07-17 16:12:49 [scrapy.core.engine] INFO: Closing spider (finished)
2017-07-17 16:12:49 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 176632,
 'downloader/request_count': 95,
 'downloader/request_method_count/GET': 95,
 'downloader/response_bytes': 1279009,
 'downloader/response_count': 95,
 'downloader/response_status_count/200': 95,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2017, 7, 17, 14, 12, 49, 900000),
 'log_count/DEBUG': 96,
 'log_count/ERROR': 94,
 'log_count/INFO': 7,
 'request_depth_max': 1,
 'response_received_count': 95,
 'scheduler/dequeued': 95,
 'scheduler/dequeued/memory': 95,
 'scheduler/enqueued': 95,
 'scheduler/enqueued/memory': 95,
 'spider_exceptions/ValueError': 94,
 'start_time': datetime.datetime(2017, 7, 17, 14, 12, 46, 66000)}
2017-07-17 16:12:49 [scrapy.core.engine] INFO: Spider closed (finished)
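
That particular ValueError is raised by the lxml-backed parser behind Scrapy's selectors: it rejects input containing NULL bytes or other control characters that are illegal in XML. As a quick check (a sketch of my own, not part of the original question; it assumes the requests library is installed), one of the failing detail pages can be fetched directly and its raw body scanned for such characters:

import re
import requests  # assumption: any HTTP client would do

# Paste the full failing URL (including the idBus parameter) from the log above.
url = 'https://subastas.boe.es/detalleSubasta.php?idSub=SUB-JA-2017-68197&idBus=...'

body = requests.get(url).content
# XML 1.0 forbids NULL bytes and most other C0 control characters.
bad = re.findall(b'[\x00-\x08\x0b\x0c\x0e-\x1f]', body)
print('%d forbidden control characters found' % len(bad))

If the count is non-zero, stripping those bytes before the response reaches the selector (for example in a downloader middleware) is one common workaround.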

The code:

Spyder.py:

# -*- coding: utf-8 -*-

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
from boe.items import boeItem

class boeSpider(CrawlSpider):
    name = 'boe'
    item_count = 0
    allowed_domains = ['subastas.boe.es']
    start_urls = ['https://subastas.boe.es/subastas_ava.php?campo[0]=SUBASTA.ORIGEN&dato[0]=&campo[1]=SUBASTA.ESTADO&dato[1]=EJ&campo[2]=BIEN.TIPO&dato[2]=I&dato[3]=501&campo[4]=BIEN.DIRECCION&dato[4]=&campo[5]=BIEN.CODPOSTAL&dato[5]=&campo[6]=BIEN.LOCALIDAD&dato[6]=&campo[7]=BIEN.COD_PROVINCIA&dato[7]=28&campo[8]=SUBASTA.POSTURA_MINIMA_MINIMA_LOTES&dato[8]=&campo[9]=SUBASTA.NUM_CUENTA_EXPEDIENTE_1&dato[9]=&campo[10]=SUBASTA.NUM_CUENTA_EXPEDIENTE_2&dato[10]=&campo[11]=SUBASTA.NUM_CUENTA_EXPEDIENTE_3&dato[11]=&campo[12]=SUBASTA.NUM_CUENTA_EXPEDIENTE_4&dato[12]=&campo[13]=SUBASTA.NUM_CUENTA_EXPEDIENTE_5&dato[13]=&campo[14]=SUBASTA.ID_SUBASTA_BUSCAR&dato[14]=&campo[15]=SUBASTA.FECHA_FIN_YMD&dato[15][0]=&dato[15][1]=&campo[16]=SUBASTA.FECHA_INICIO_YMD&dato[16][0]=&dato[16][1]=&page_hits=1000&sort_field[0]=SUBASTA.FECHA_FIN_YMD&sort_order[0]=desc&sort_field[1]=SUBASTA.FECHA_FIN_YMD&sort_order[1]=asc&sort_field[2]=SUBASTA.HORA_FIN&sort_order[2]=asc&accion=Buscar']

    rules = (
        # One rule: follow each item link on the search-results page
        Rule(LinkExtractor(restrict_xpaths="//a[contains(@class,'resultado-busqueda-link-defecto')]"),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        DATAQ = boeItem()
        # General info section of the detail page
        DATAQ['Gen_Id'] = response.xpath('//th[text()="Identificador"]/following-sibling::td[1]/strong/text()').extract_first()
        DATAQ['Gen_Tipo'] = response.xpath('//th[text()="Tipo de subasta"]/following-sibling::td[1]/strong/text()').extract()
        DATAQ['Gen_Inicio'] = response.xpath('//th[text()="Fecha de inicio"]/following-sibling::td[1]/span/text()').extract()
        DATAQ['Gen_Fin'] = response.xpath('//th[text()="Fecha de conclusión"]/following-sibling::td[1]/span/text()').extract()
        DATAQ['Gen_Deuda'] = response.xpath('//th[text()="Cantidad reclamada"]/following-sibling::td[1]/text()').extract()
        DATAQ['Gen_Lotes'] = response.xpath('//th[text()="Lotes"]/following-sibling::td[1]/text()').extract()
        DATAQ['Gen_Anuncio'] = response.xpath('//th[text()="Anuncio BOE"]/following-sibling::td[1]/a/@href').extract()
        DATAQ['Gen_Valor'] = response.xpath('//th[text()="Valor subasta"]/following-sibling::td[1]/text()').extract()
        DATAQ['Gen_Tasacion'] = response.xpath('//th[text()="Tasación"]/following-sibling::td[1]/text()').extract()
        DATAQ['Gen_Minimo'] = response.xpath('//th[text()="Puja mínima"]/following-sibling::td[1]/text()').extract_first()
        DATAQ['Gen_Tramos'] = response.xpath('//th[text()="Tramos entre pujas"]/following-sibling::td[1]/text()').extract_first()
        DATAQ['Gen_Deposito'] = response.xpath('//th[text()="Importe del depósito"]/following-sibling::td[1]/text()').extract()

        # Stop the crawl after 10 items while testing.
        self.item_count += 1
        if self.item_count > 10:
            raise CloseSpider('item_exceeded')
        yield DATAQ
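
items.py is not included in the question; for parse_item above to run, boeItem has to declare one Field per key it assigns. A minimal version consistent with the spider code (a reconstruction, not the asker's actual file) would be:

# -*- coding: utf-8 -*-
import scrapy

class boeItem(scrapy.Item):
    # One Field per key assigned in parse_item.
    Gen_Id = scrapy.Field()
    Gen_Tipo = scrapy.Field()
    Gen_Inicio = scrapy.Field()
    Gen_Fin = scrapy.Field()
    Gen_Deuda = scrapy.Field()
    Gen_Lotes = scrapy.Field()
    Gen_Anuncio = scrapy.Field()
    Gen_Valor = scrapy.Field()
    Gen_Tasacion = scrapy.Field()
    Gen_Minimo = scrapy.Field()
    Gen_Tramos = scrapy.Field()
    Gen_Deposito = scrapy.Field()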

settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for boe project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'boe'

SPIDER_MODULES = ['boe.spiders']
NEWSPIDER_MODULE = 'boe.spiders'

# CSV import
ITEM_PIPELINES = {'boe.pipelines.boePipeline': 500, }


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'boe (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
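
pipelines.py is not included either; judging from the '# CSV import' comment and the ITEM_PIPELINES entry, boePipeline presumably writes the scraped items to a CSV file. A minimal sketch of such a pipeline (the class body and output path are assumptions, not the asker's code):

# -*- coding: utf-8 -*-
import csv

class boePipeline(object):
    # Minimal CSV-export pipeline; only a plausible reconstruction,
    # since the real pipelines.py is not shown in the question.

    def open_spider(self, spider):
        self.file = open('boe_items.csv', 'w')  # assumed output path
        self.writer = None

    def process_item(self, item, spider):
        if self.writer is None:
            # Use the item's declared fields as the CSV header.
            self.writer = csv.DictWriter(self.file, fieldnames=list(item.fields))
            self.writer.writeheader()
        self.writer.writerow(dict(item))
        return item

    def close_spider(self, spider):
        self.file.close()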

Thanks in advance.

