我有一套显示在多个页面上的产品。我需要浏览每一页,了解详细信息。我编写了下面的代码,但是循环似乎有问题,因为条目被多次获取
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
def cpap_spider(max_pages):
    """Scrape auto-CPAP product listings from respshop.com.

    Walks listing pages 1..max_pages, and for every product collects its
    name, SKU, URL and the spec table returned by the product_info
    endpoint, then writes all rows to ACPAP.csv once at the end.

    :param max_pages: number of listing pages to crawl (inclusive).
    :returns: None; side effect is the ACPAP.csv file.
    """
    # Loop-invariant values: build once, not on every page iteration.
    product_info_url = 'https://www.respshop.com/product_info.php'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0'}

    # Accumulate rows across ALL pages. The original re-created this list
    # (and rewrote the CSV) inside the page loop, so ACPAP.csv was
    # overwritten on every page and only ever held the last page's rows —
    # which is why items appeared to be fetched/processed repeatedly.
    all_data = []

    page = 1
    while page <= max_pages:
        url = ("https://www.respshop.com/cpap-machines/auto-cpap/"
               "?cpapmachines=autocpap&page=" + str(page) +
               "&redirectCancelled=1&sort=6a")
        soup = BeautifulSoup(requests.get(url, headers=headers).content,
                             'html.parser')

        for item in soup.select('td.name a'):
            # The original split this chain over bare newlines (a
            # SyntaxError); parentheses make it one expression.
            sku = (item.find_parent('table', class_="prod2_t")
                       .select_one('b:contains("SKU:")')
                       .find_next('td').text)
            print(item.text, sku)

            # Product id is embedded in the listing href as "...p-<id>.html".
            products_id = re.search(r'p-(\d+)\.html', item['href'])[1]

            # Fetch the spec tab (tab=3) for this product via POST.
            s = BeautifulSoup(
                requests.post(product_info_url,
                              data={'products_id': products_id, 'tab': 3},
                              headers=headers).content,
                'html.parser')

            row = {'Name': item.text, 'SKU': sku, 'URL': item['href']}
            # Spec table: column 1 holds the attribute name, column 2 the
            # value; zip them into key/value pairs on the row.
            for k, v in zip(s.select('#cont_3 td.main:nth-child(1)'),
                            s.select('#cont_3 td.main:nth-child(2)')):
                row[k.get_text(strip=True)] = v.get_text(strip=True)
            all_data.append(row)

        page += 1

    # Write once, after every page has been collected; index=False keeps
    # the spurious integer index column out of the CSV.
    df = pd.DataFrame(all_data)
    df.to_csv('ACPAP.csv', index=False)
cpap_spider(3)
输出:
第1页CSV:
相关问题 更多 >
编程相关推荐