网页抓取未返回正确的值

import requests
from bs4 import BeautifulSoup

# Browser-like (desktop Chrome) User-Agent sent with the request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    ),
}
url = "https://www.adidas.com.sg/yeezy/"

# Download the page and parse the raw response body with the lxml parser.
productsource = requests.get(url, headers=headers, timeout=15)
productinfo = BeautifulSoup(productsource.content, "lxml")

for item in productinfo.select('div', class_='src-components-___coming-soon__row___NfXc3'):
    # This is the statement quoted in the traceback: find() returned None for
    # this item, so subscripting the result with ['id'] raises TypeError.
    sku = item.find('div', class_="src-components-___coming-soon__product___2Gai4")['id']
    link = item.a['href']
    print(sku, '\n', link)

Traceback (most recent call last):
  File "c:\Users\matta\OneDrive\xeonon\testing monitors\test.py", line 14, in <module>
    sku = item.find('div', class_="src-components-___coming-soon__product___2Gai4")['id']
TypeError: 'NoneType' object is not subscriptable

"imageUrls": [ "https://assets.adidas.com/images/w_840,h_840,q_auto:sensitive/3d37a43625ce413ea6d3ad44013560db_9366/GZ0954_01_standard.jpg", "https://assets.adidas.com/images/w_840,h_840,q_auto:sensitive/e1748ff26ad54f559ffbad4401356122_9366/GZ0954_01_standard1_hover.jpg", "https://assets.adidas.com/images/w_840,h_840,q_auto:sensitive/3da89e0f71064a958377ad4401355e12_9366/GZ0954_01_standard2.jpg", "https://assets.adidas.com/images/w_840,h_840,q_auto:sensitive/43136245b78840e9901bad44013561bf_9366/GZ0954_02_standard.jpg", "https://assets.adidas.com/images/w_840,h_840,q_auto:sensitive/c116076d86b34098bf9cad4401355ee8_9366/GZ0954_03_standard.jpg" ],

2条回答

网友

1楼 · 编辑于 2024-10-02 10:20:38

import requests
from bs4 import BeautifulSoup
import json

# Request headers: present a desktop Firefox User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) '
        'Gecko/20100101 Firefox/89.0'
    ),
}


def main(url):
    """Fetch *url* and print the ``productIds`` list embedded in the page.

    The first ``<script>`` element of the page is expected to hold a
    JavaScript assignment of the form ``<name> = {...}``; everything after
    the first ``=`` is parsed as JSON and its ``productIds`` entry printed.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        ValueError: if the first ``<script>`` is missing, has no inline
            text, contains no ``=``, or its payload is not valid JSON.
        KeyError: if the parsed object has no ``productIds`` key.
    """
    # A timeout keeps the call from blocking forever on a stalled connection.
    r = requests.get(url, headers=headers, timeout=15)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')

    # select_one() returns None when nothing matches, and .string is None
    # for a tag without a single text child; report both cases explicitly.
    script = soup.select_one('script')
    source = script.string if script is not None else None
    if not source or "=" not in source:
        raise ValueError("no inline '<name> = {...}' script found in page")

    goal = source.split("=", 1)[1]
    print(json.loads(goal)['productIds'])


# Run the scraper against the adidas.com.sg /yeezy page.
main('https://www.adidas.com.sg/yeezy')

输出：

['GZ0953', 'GZ0954', 'GZ0955', 'GZ5551', 'GZ5554']

网友

2楼 · 编辑于 2024-10-02 10:20:38

数据以JavaScript嵌入到页面中。您可以使用以下示例来解析它：

import re
import json
import requests


# Page to scrape.
url = "https://www.adidas.com.sg/yeezy"

# Request headers: present a desktop Firefox (Linux) User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) "
        "Gecko/20100101 Firefox/89.0"
    ),
}

# Download the page. The timeout keeps the request from hanging forever,
# and raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url, headers=headers, timeout=15)
response.raise_for_status()
html_doc = response.text

# The product data is assigned to window.ENV as a JSON object literal.
# re.search() returns None when the pattern is absent, so check before
# calling .group() to get a clear error instead of an AttributeError.
match = re.search(r"window\.ENV = ({.*})", html_doc)
if match is None:
    raise ValueError("window.ENV assignment not found in page")
data = json.loads(match.group(1))

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

# Print one summary line and one product URL per entry in productData.
for id_, product in data["productData"].items():
    print(id_, product["shared"]["trackingName"], product["localized"]["color"])
    print("https://www.adidas.com.sg/yeezy/product/{}".format(id_))

输出：

GZ0953 YEEZY SLIDE ADULTS ENFLAME ORANGE
https://www.adidas.com.sg/yeezy/product/GZ0953
GZ0954 YEEZY SLIDE KIDS ENFLAME ORANGE
https://www.adidas.com.sg/yeezy/product/GZ0954
GZ0955 YEEZY SLIDE INFANTS ENFLAME ORANGE
https://www.adidas.com.sg/yeezy/product/GZ0955
GZ5551 YEEZY SLIDE RESIN
https://www.adidas.com.sg/yeezy/product/GZ5551
GZ5554 YEEZY SLIDE PURE
https://www.adidas.com.sg/yeezy/product/GZ5554

相关问题更多 >

编程相关推荐

热门问题

热门文章