使用 Python 抓取 ASOS 网页时出错

# Question script: download an ASOS product page and print the CSS classes of
# the price <span> found inside each matching wrapper <div>.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import requests

# Browser-like request headers sent along with the page request.
headers = {
    'authority': 'www.asos.com',
    'method': 'GET',
    'path': '/us/hollister/hollister-v-neck-knitted-sweater/prd/14148890',
    'scheme': 'https',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
}

# NOTE: this SSL context is configured here but is not passed to
# requests.get() below.
ctx = ssl.create_default_context()
ctx.check_hostname = False

url = "https://www.asos.com/us/hollister/hollister-v-neck-knitted-sweater/prd/14148890"

# Fetch the page and parse the returned markup.
html = requests.get(url, headers=headers).text
soup = BeautifulSoup(html, 'html.parser')

# Calling the soup object is shorthand for find_all().
wrappers = soup("div", {"class": "grid-row rendered"})

for wrap in wrappers:
    # find() returns None when nothing matches, in which case the ["class"]
    # subscript raises TypeError.
    print(wrap.find("span", {"data-id": "rrp-price"})["class"])

1条回答

网友

1楼 · 发布于 2024-07-02 11:15:37

我修改了代码中的两个主要部分，现在可以正常工作了

BS4 解析：这里您需要遍历找到的每个“div”，并使用 find() 获取其中的“h1”元素。此外，“aside-content”是您要抓取的 div 的“id”，而不是“class”。

绕过潜在的反爬虫防护：程序现在会模拟 Chrome 浏览器。您可以在 Chrome 开发者工具中查看请求头，并将其用作 Python requests 的请求头。顺便说一句，还有一个专门用于此目的的 Python 模块，叫做“fake-useragent”。由于您的请求超时了，我认为真正起作用的是我修改的第二部分。

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import requests


# Browser-like request headers so the request resembles an ordinary Chrome
# page navigation.
#
# The entries 'authority', 'method', 'path' and 'scheme' were dropped: they
# mirror the HTTP/2 pseudo-header fields (:authority, :method, :path, :scheme)
# that browser dev tools display, not regular headers. requests derives host,
# method and path from the URL, and a fixed 'path' value would be wrong for
# any other URL these headers are reused with.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # 'br' is deliberately not advertised: requests only decodes Brotli when
    # the optional brotli/brotlicffi package is installed; without it the
    # body would arrive as undecoded compressed data.
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
}

url = "https://www.asos.com/us/hollister/hollister-v-neck-knitted-sweater/prd/14148890"

# A timeout keeps the script from hanging indefinitely when the site stalls.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error status instead of parsing an error page.
response.raise_for_status()

html = response.text
soup = BeautifulSoup(html, 'html.parser')

# "aside-content" is the id (not a class) of the <div> holding the title.
wraps = soup("div", {"id": "aside-content"})

for wrap in wraps:
    # Prints None for a wrapper that contains no <h1>.
    print(wrap.find("h1"))

输出：<h1>Hollister v-neck knitted sweater</h1>

新问题的解决方案：您需要直接请求 ASOS 的 API，因为网站是通过 XHR 动态生成价格部分的 HTML 的。

# Query the stock/price endpoint directly instead of scraping the page HTML.
# Reuses the `headers` dict defined for the page request.
response = requests.get(
    "https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=14148890&store=US&currency=USD&keyStoreDataversion=j42uv2x-26",
    headers=headers,
    timeout=10,  # do not hang indefinitely if the endpoint stalls
)
# Fail loudly on an HTTP error status instead of parsing an error body.
response.raise_for_status()

# Response.json() decodes the body directly; the previous json.loads() call
# relied on a `json` module that is not imported in the visible snippet.
response_json = response.json()

# NOTE(review): assumes the payload is a JSON object with 'productPrice' at
# the top level - confirm against a live response; if the endpoint returns a
# list of products, the first element must be selected before this lookup.
print(response_json['productPrice']['current']['text'])

相关问题更多 >

编程相关推荐

热门问题

热门文章