WebCrawler：只有少数商品有折扣价，导致按列表索引取值出错

import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

myUrl = 'https://www.zalando.de/rucksaecke-herren/'

# Open the connection, read the page, and close the connection again
# (the with-block closes it even if read() fails).
with uReq(myUrl) as uClient:
    page_html = uClient.read()

# Parse the downloaded HTML.
page_soup = soup(page_html, "html.parser")

# Collect each field as its own page-wide list of <div> elements.
brand_Names = page_soup.findAll("div", {"class": "z-nvg-cognac_brandName-2XZRz z-nvg-cognac_textFormat-16QFn"})
articale_Names = page_soup.findAll("div", {"class": "z-nvg-cognac_articleName--arFp z-nvg-cognac_textFormat-16QFn"})
original_Prices = page_soup.findAll("div", {"class": "z-nvg-cognac_originalPrice-2Oy4G"})
new_Prices = page_soup.findAll("div", {"class": "z-nvg-cognac_promotionalPrice-3GRE7"})

# Open the CSV file and write its header; the with-block closes the file.
filename = "XXX.csv"
with open(filename, "w") as file:
    headers = "BRAND, ARTICALE NAME, OLD PRICE, NEW PRICE\n"
    file.write(headers)

    # How many brands are on the page?
    products_on_page = len(brand_Names)

    # Loop through all brands, articles and prices and write them to the CSV.
    # NOTE: the four lists are indexed in lockstep. Whenever a product has no
    # promotional-price element, new_Prices is shorter than brand_Names, so
    # new_Prices[i] either raises IndexError or belongs to another product.
    for i in range(products_on_page):
        brand = brand_Names[i].text
        articale_Name = articale_Names[i].text
        price = original_Prices[i].text
        new_Price = new_Prices[i].text
        # Decimal commas become dots so they do not add extra CSV columns.
        # The "," between the two prices keeps them in separate columns.
        file.write(
            brand + "," + articale_Name + ","
            + price.replace(",", ".") + ","
            + new_Price.replace(",", ".") + "\n"
        )

1条回答

网友

1楼 · 发布于 2024-10-08 18:27:59

由于某些商品没有 'div.z-nvg-cognac_promotionalPrice-3GRE7' 标签，各个列表的长度并不一致，因此无法可靠地使用列表索引。
但是，您可以先选出所有容器标签（'div.z-nvg-cognac_infoContainer-MvytX'），再对每个容器使用 find 查找该商品内的各个标签。

from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import csv

my_url = 'https://www.zalando.de/sporttaschen-reisetaschen-herren/'
# The with-block closes the HTTP response once the body has been read.
with urlopen(my_url) as client:
    page_html = client.read().decode(errors='ignore')
page_soup = soup(page_html, "html.parser")

headers = ["BRAND", "ARTICALE NAME", "OLD PRICE", "NEW PRICE"]
filename = "test.csv"


def _text_or_none(parent, css_class):
    """Return the text of the first tag under *parent* with *css_class*, or None."""
    tag = parent.find(class_=css_class)
    return tag.text if tag is not None else None


# The page text was decoded as UTF-8, so write the CSV as UTF-8 too instead of
# relying on the platform default encoding.
with open(filename, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(headers)

    # Each info container holds the tags of exactly one product, so looking the
    # fields up inside it keeps brand, name and prices aligned per row.
    for item in page_soup.find_all(class_='z-nvg-cognac_infoContainer-MvytX'):
        # A missing element yields None, which csv.writer emits as an empty
        # cell, instead of raising AttributeError on `.text`.
        writer.writerow([
            _text_or_none(item, "z-nvg-cognac_brandName-2XZRz z-nvg-cognac_textFormat-16QFn"),
            _text_or_none(item, "z-nvg-cognac_articleName--arFp z-nvg-cognac_textFormat-16QFn"),
            _text_or_none(item, "z-nvg-cognac_originalPrice-2Oy4G"),
            _text_or_none(item, "z-nvg-cognac_promotionalPrice-3GRE7"),
        ])

如果您希望每页获取超过 24 个商品，则必须使用能够运行 JavaScript 的客户端，如 selenium。

from selenium import webdriver
from bs4 import BeautifulSoup as soup
import csv

my_url = 'https://www.zalando.de/sporttaschen-reisetaschen-herren/'
driver = webdriver.Firefox()
try:
    # Let the browser load the page, then take the resulting HTML.
    driver.get(my_url)
    page_html = driver.page_source
finally:
    # Always shut the browser down, even if loading the page fails.
    driver.quit()
page_soup = soup(page_html, "html.parser")
...

脚注：
函数和变量的命名约定（naming conventions）是小写字母加下划线。
读写 csv 文件时，最好使用 csv 库。
处理文件时，可以使用 with 语句。

相关问题更多 >

编程相关推荐

热门问题

热门文章