Scrapy Item 没有被正确处理

import scrapy


class ResortItem(scrapy.Item):
    """Container for the data scraped for a single resort room."""

    # images
    images = scrapy.Field()
    image_urls = scrapy.Field()

    # room details and amenities
    room_title = scrapy.Field()
    square_feet = scrapy.Field()
    kitchen = scrapy.Field()
    num_baths = scrapy.Field()
    max_guests = scrapy.Field()
    beds = scrapy.Field()
    washer_dryer = scrapy.Field()
    room_amenities = scrapy.Field()

import scrapy
from items import ResortItem


class ScraperSpider(scrapy.Spider):
    """Spider that collects room details and image URLs into ``ResortItem``."""

    name = 'scraper'
    allowed_domains = ['domains']
    start_urls = [
        'urls'
    ]

    def parse(self, response):
        """Populate a ``ResortItem`` and yield it once, after every field is set.

        NOTE(review): ``units``, ``units_img``, ``imgs_path``,
        ``unit_image_selector`` and the ``*Selector`` names are not defined
        in this excerpt; they are presumably set up by surrounding code
        (selectors and a per-unit loop) that was elided -- confirm before
        running.
        """
        item = ResortItem()

        # Image paths are relative, so each one is prefixed to form a full URL.
        unit_img_path = units_img.xpath(unit_image_selector).getall()
        url_list = imgs_path + unit_img_path
        item['image_urls'] = ["url" + x for x in url_list]

        # The item used to be yielded here as well, which emitted an item
        # holding only image_urls; it is now yielded a single time at the end.

        # default='' keeps .strip() from failing when a selector matches
        # nothing (.get() would otherwise return None).
        item['room_title'] = units.xpath(room_nameSelector).get(default='').strip()
        item['beds'] = units.xpath(bedSelector).getall()
        item['num_baths'] = units.xpath(bathsSelector).get(default='').strip()
        item['square_feet'] = units.xpath(sqftSelector).get(default='').strip()

        room_amenities = units.xpath(room_amenitiesSelector).getall()

        # Pull the Washer/Dryer amenity (if any) into its own field; it was
        # computed before but never stored on the item.
        washer_amenity = 'Washer'
        item['washer_dryer'] = [x for x in room_amenities if washer_amenity in x]

        # Keep the remaining amenities, without the washer/dryer entries.
        item['room_amenities'] = [
            x for x in room_amenities if not x.startswith('Washer')
        ]

        item['kitchen'] = units.xpath(kitchenSelector).get(default='').strip()

        yield item

1条回答

网友

1楼 · 发布于 2024-09-29 04:21:48

移动下面这段代码:

        # XPath for the src attribute of every <img> inside the <figure>
        # elements of the list under div.orbit-wrapper.
        unit_image_selector = './/div[@class = "orbit-wrapper"]/ul//li/figure/img//@src'
        # NOTE(review): units_img is not defined in this snippet; presumably
        # the node being iterated -- confirm against the surrounding loop.
        unit_img_path = units_img.xpath(unit_image_selector).getall()

        # NOTE(review): imgs_path is not defined in this snippet; it is
        # concatenated with the extracted list, so presumably another list
        # of image paths -- confirm.
        url_list = imgs_path + unit_img_path
        # Prefix every path with the site origin to build the full image URL.
        image_urls = [
            "https://clubwyndham.wyndhamdestinations.com" + x for x in url_list]
        item['image_urls'] = image_urls

放到第二个循环内部:

# XPath matching every <div> whose id attribute contains "unit-details".
unit = './/div[contains(@id, "unit-details")]'
# Iterate over each matching node; the loop body is not shown in this snippet.
for units in response.xpath(unit):

然后删除第一个循环以及不再需要的变量。

相关问题更多 >

编程相关推荐

热门问题

热门文章