Scraping data from a div as it appears on the page

Posted on 2024-10-03 19:26:11


I am trying to extract data from this URL https://eksisozluk.com/mortingen-sitraze--1277239 : the title, and then all of the comments under it. If you open the site you will see that the first comment under the title is (bkz: mortingen). The problem is that "(bkz:" sits directly in a div, while "mortingen" inside that div is wrapped in an anchor link, so it is hard to scrape the text the way it is displayed on the website. Can anyone help me with a CSS selector or XPath that scrapes every comment as it appears on the page? My code is below, but it gives me "(bkz:" in one column, then "akhisar", and so on, in three separate columns instead of one.

def parse(self, response):
    data = {}
    title = response.css('[itemprop="name"]::text').get()
    data["title"] = title
    count = 0
    for content in response.css('li .content ::text'):
        text = content.get().strip()
        key = "content" + str(count)
        data[key] = text
        count = count + 1
    yield data
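
To illustrate why the output comes back in pieces: every ::text match is a separate text node, so a comment that mixes plain text with anchor links is split up. A minimal, self-contained sketch, assuming the comment markup looks roughly like the (bkz: mortingen) example (the real page's HTML may differ):

from scrapy.selector import Selector

# hypothetical markup approximating one comment; the real site's HTML may differ
html = '<li><div class="content">(bkz: <a href="/mortingen">mortingen</a>)</div></li>'

sel = Selector(text=html)

# '::text' matches every text node separately, so the comment is split into pieces
print(sel.css('li .content ::text').getall())
# expected: ['(bkz: ', 'mortingen', ')']

# selecting the element first and joining its text nodes restores the full comment
print("".join(sel.css('li .content')[0].css('::text').getall()))
# expected: (bkz: mortingen)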

Tags: data, text, https, div, url, title, data, get
1 Answer
User
#1 · Posted on 2024-10-03 19:26:11

You should first get all the .content elements without ::text and handle each .content separately in a for loop. For each .content you then run ::text to get only the text nodes inside that element, collect them in a list, and join them into a single string.

        for count, content in enumerate(response.css('li .content')):
            text = []

            # get all `::text` nodes in the current `.content`
            for item in content.css('::text'):
                item = item.get()
                # put it on the list
                text.append(item)

            # join all items into a single string
            text = "".join(text)
            text = text.strip()

            print(count, '|', text)
            data[f"content {count}"] = text

Minimal working code

You can put all of the code in a single file and run it with python script.py, without creating a project in Scrapy.

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    start_urls = ['https://eksisozluk.com/mortingen-sitraze--1277239']

    def parse(self, response):
        print('url:', response.url)

        data = {}  # PEP8: spaces around `=`

        title = response.css('[itemprop="name"]::text').get()
        data["title"] = title

        for count, content in enumerate(response.css('li .content')):
            text = []

            for item in content.css('::text'):
                item = item.get()
                text.append(item)

            text = "".join(text)
            text = text.strip()

            print(count, '|', text)
            data[f"content {count}"] = text

        yield data
    
# --- run the spider without creating a project and save the results in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(MySpider)
c.start()
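
After the run, output.csv holds one row per scraped page, with the title plus one "content N" column per comment. A minimal sketch for inspecting that file, assuming it was written next to the script:

import csv

# read the row(s) produced by the spider and print each field on its own line
with open('output.csv', newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        for key, value in row.items():
            print(key, '|', value)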

EDIT:

A version with getall() is slightly shorter:

        for count, content in enumerate(response.css('li .content')):

            text = content.css('::text').getall()

            text = "".join(text)
            text = text.strip()

            print(count, '|', text)
            data[f"content {count}"] = text
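
As a further alternative (not from the original answer): the XPath string() function already concatenates every text node under a node in document order, so the explicit join can be dropped. A sketch under that assumption:

        for count, content in enumerate(response.css('li .content')):
            # string(.) concatenates every text node under the current element
            text = content.xpath('string(.)').get().strip()

            print(count, '|', text)
            data[f"content {count}"] = text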
