爬网0页

from scrapy.spider import BaseSpider from scrapy import Field from scrapy import Item from scrapy.selector import HtmlXPathSelector def Yeezy(Item): price = Field() class YeezySpider(BaseSpider): name = "yeezy" allowed_domains = ["https://www.grailed.com/"] start_url = ['https://www.grailed.com/feed/0Qu8Gh1qHQ?page=2'] def parse(self, response): hxs = HtmlXPathSelector(response) price = hxs.css('.listing-price .sub-title:nth-child(1) span').extract() items = [] for price in price: item = Yeezy() item["price"] = price.select(".listing-price .sub-title:nth-child(1) span").extract() items.append(item) yield item

ScrapyDeprecationWarning: YeezyScrape.spiders.yeezy_spider.YeezySpider inherits from deprecated class scrapy.spider.BaseSpider, please inherit from scrapy.spider.Spider. (warning only on first subclass, there may be others) class YeezySpider(BaseSpider): 2017-08-02 14:45:25-0700 [scrapy] INFO: Scrapy 0.25.1 started (bot: YeezyScrape) 2017-08-02 14:45:25-0700 [scrapy] INFO: Optional features available: ssl, http11 2017-08-02 14:45:25-0700 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'YeezyScrape.spiders', 'SPIDER_MODULES': ['YeezyScrape.spiders'], 'BOT_NAME': 'YeezyScrape'} 2017-08-02 14:45:25-0700 [scrapy] INFO: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState 2017-08-02 14:45:26-0700 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats 2017-08-02 14:45:26-0700 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware 2017-08-02 14:45:26-0700 [scrapy] INFO: Enabled item pipelines: 2017-08-02 14:45:26-0700 [yeezy] INFO: Spider opened 2017-08-02 14:45:26-0700 [yeezy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 2017-08-02 14:45:26-0700 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023 2017-08-02 14:45:26-0700 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080 2017-08-02 14:45:26-0700 [yeezy] INFO: Closing spider (finished) 2017-08-02 14:45:26-0700 [yeezy] INFO: Dumping Scrapy stats: {'finish_reason': 'finished', 'finish_time': datetime.datetime(2017, 8, 2, 21, 45, 26, 127000), 'log_count/DEBUG': 2, 'log_count/INFO': 7, 'start_time': datetime.datetime(2017, 8, 2, 21, 45, 26, 125000)} 2017-08-02 14:45:26-0700 [yeezy] INFO: Spider closed (finished) Process finished with exit code 0

import scrapy from scrapy.http import FormRequest from scrapy.selector import HtmlXPathSelector #from YeezyScrape import YeezyscrapeItem class YeezySpider(scrapy.Spider): name = "yeezy" allowed_domains = ["www.grailed.com"] start_url = ["https://www.grailed.com/feed/0Qu8Gh1qHQ?page=2"] def parse(self, response): for i in range(0,2): yield FormRequest(url = 'https://mnrwefss2q- dsn.algolia.net/1/indexes/Listing_production/query?x-algolia- agent=Algolia%20for%20vanilla%20JavaScript%203.21.1&x-algolia-application- id=MNRWEFSS2Q&x-algolia-api-key=a3a4de2e05d9e9b463911705fb6323ad', method="post", formdata={"params":"query:boost filters:(strata:'basic' OR strata:'grailed' OR strata:'hype') AND (category_path:'footwear.slip_ons' OR category_path:'footwear.sandals' OR category_path:'footwear.lowtop_sneakers' OR category_path:'footwear.leather' OR category_path:'footwear.hitop_sneakers' OR category_path:'footwear.formal_shoes' OR category_path:'footwear.boots') AND (marketplace:grailed) hitsPerPage:40 facets ["strata","size","category","category_size", "category_path","category_path_size", "category_path_root_size","price_i","designers.id", "location","marketplace"] page:2"}, callback=self.data_parse()) def data_parse(self, response): hxs = HtmlXPathSelector(response) prices = hxs.xpath("//p").extract() for prices in prices: price = prices.select("a/text()").extract() print price

C:\Python27\python.exe C:/Python27/Lib/site-packages/scrapy/cmdline.py crawl yeezy -o price.json 2017-08-04 13:23:27-0700 [scrapy] INFO: Scrapy 0.25.1 started (bot: YeezyScrape) 2017-08-04 13:23:27-0700 [scrapy] INFO: Optional features available: ssl, http11 2017-08-04 13:23:27-0700 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'YeezyScrape.spiders', 'FEED_FORMAT': 'json', 'SPIDER_MODULES': ['YeezyScrape.spiders'], 'FEED_URI': 'price.json', 'BOT_NAME': 'YeezyScrape'} 2017-08-04 13:23:27-0700 [scrapy] INFO: Enabled extensions: FeedExporter, LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState 2017-08-04 13:23:27-0700 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats 2017-08-04 13:23:27-0700 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware 2017-08-04 13:23:27-0700 [scrapy] INFO: Enabled item pipelines: 2017-08-04 13:23:27-0700 [yeezy] INFO: Spider opened 2017-08-04 13:23:28-0700 [yeezy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 2017-08-04 13:23:28-0700 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023 2017-08-04 13:23:28-0700 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080 2017-08-04 13:23:28-0700 [yeezy] INFO: Closing spider (finished) 2017-08-04 13:23:28-0700 [yeezy] INFO: Dumping Scrapy stats: {'finish_reason': 'finished', 'finish_time': datetime.datetime(2017, 8, 4, 20, 23, 28, 3000), 'log_count/DEBUG': 2, 'log_count/INFO': 7, 'start_time': datetime.datetime(2017, 8, 4, 20, 23, 28, 1000)} 2017-08-04 13:23:28-0700 [yeezy] INFO: Spider closed (finished) Process finished with exit code 0

1条回答

网友

1楼 · 发布于 2024-10-03 19:24:17

似乎产品是由AJAX检索的（参见相关的：Can scrapy be used to scrape dynamic content from websites that are using AJAX?）。
如果打开浏览器webinspector，选择“网络”选项卡并在页面加载时查找XHR请求，则可以看到以下内容：

似乎正在使用categories、filter等发出POST类型请求，并返回json个产品。你可以对它进行反向工程，然后在scrapy中复制它。你知道吗

相关问题更多 >

编程相关推荐

热门问题

热门文章

爬网0页

相关问题 更多 >

编程相关推荐

热门问题

热门文章

相关问题更多 >