Scraping a page with requests does not return all the HTML tags

Posted 2024-05-19 18:41:49


I am trying to scrape this page in order to extract the details of each "li" tag inside the "ol" element with id "prices":

import requests
import time
from bs4 import BeautifulSoup

headers = {
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

url = "https://www.skroutz.gr/s/11706397/Guy-Laroche-Linda-Red.html"
page = requests.get(url, headers=headers )

# I also tried the following two commands in order to wait for the page to load:
#seconds = 10
#page = requests.get(url, time.sleep(seconds), headers=headers)
# (note: time.sleep() returns None, so this only sleeps before the request is sent)

soup = BeautifulSoup(page.content, 'html.parser')

eshops_grid = soup.find("ol", id="prices")
eshops_product = eshops_grid.find_all("li", class_='cf card js-product-card')
for eshop in eshops_product:
    eshop_name = eshop.find("div", class_="shop-name").text
    print(eshop_name) # I need to print the eshop_name for each eshop

Although I need to achieve this with the requests library, because of this problem I also tried selenium, but the same thing happened:

from selenium import webdriver
from pyvirtualdisplay import Display
from bs4 import BeautifulSoup

# Open a browser that is not visible (runs inside a virtual display)
print('- Open a browser that is not visible')
display = Display(visible=0, size=(1920, 1080))
display.start()

driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")

url = 'https://www.skroutz.gr/s/11706397/Guy-Laroche-Linda-Red.html?o=%CE%9C%CF%80%CE%BF%CF%85%CF%81%CE%BD%CE%BF%CF%8D%CE%B6%CE%B9%20Guy%20Laroche%20Linda%20Red'
#print('- Get the initial url of brandwatch')
driver.get(url)

page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')

eshops_grid = soup.find("ol", id="prices")
eshops_product = eshops_grid.find_all("li", class_='cf card js-product-card')
for eshop in eshops_product:
    eshop_name = eshop.find("div", class_="shop-name").text
    print(eshop_name) # I need to print the eshop_name for each eshop
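
If the list is rendered by JavaScript after the initial load, reading page_source immediately can be too early. A minimal sketch of an explicit wait for the target element, assuming the same driver setup as above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# block for up to 10 seconds until the ol element with id="prices" is present
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "prices"))
)
page = driver.page_source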

Is there a way to get the full contents of each "li" so that I can extract and print the "eshop_name"?


2 Answers

This is what you want:

import requests
from bs4 import BeautifulSoup as bs

headers = {
    'authority': 'www.skroutz.gr',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Google Chrome"; v="83"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'cookie': '_helmet_couch=eyJzZXNzaW9uX2lkIjoiNjgzNzhmMmNmNjI5OTcxNjI5NzU2ZWNmMTM5MzE5MmIiLCJidWNrZXRfaWQiOiJmNTk1ZGRhYy00ZmVhLTQ5NmYtODNkNS00OWQzODgzMWFhYTAiLCJsYXN0X3NlZW4iOjE1OTEyNjgwNTUsInZvbCI6MSwiX2NzcmZfdG9rZW4iOiI1a3Yxb3FKTmhXTCs1YUxzdjYzRFk3TlNXeGs5TlhXYmZhM0UzSmtEL0NBPSJ9 22dfbfe582c0f3a7485e20d9d3932b32fbfb721b',
    'if-none-match': 'W/"e6fb8187391e99a90270c2351f9d17cd"',
}

params = (
    ('o', '\u039C\u03C0\u03BF\u03C5\u03C1\u03BD\u03BF\u03CD\u03B6\u03B9 Guy Laroche Linda Red'),
)

response = requests.get('https://www.skroutz.gr/s/11706397/Guy-Laroche-Linda-Red.html', headers=headers, params=params)

data = bs(response.text,'lxml')
# the first shop name is embedded in an inline script as: SKR.page.first_shop_name = "..."
s = data.find_all('script')[5].text.split('SKR.page.first_shop_name = ')[1].split(';')[0].replace('"','')
print(s)

The output is:

Spitishop
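
Hard-coding the script index ([5]) is fragile if the page layout changes; a slightly more defensive sketch, reusing the response object from the code above, searches for the marker with a regular expression:

import re

# look for the inline assignment SKR.page.first_shop_name = "..."
match = re.search(r'SKR\.page\.first_shop_name\s*=\s*"([^"]+)"', response.text)
if match:
    print(match.group(1))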

I think the proposed solution only works for the first item. The list of shops is supplied by a separate POST request sent via JS, and is then rendered separately by the browser.

You can replicate that POST request to https://www.skroutz.gr/personalization/product_prices.json with the appropriate headers and POST data (i.e. the product IDs found in the "js-product-link content-placeholder" items of the original request); the returned value is in JSON format. A rough sketch is shown below.
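
The following sketch is based on the commented-out lines in the code further down; the headers are trimmed here, and the payload shape is an assumption taken from the description above:

import json
import requests
from lxml import html

sess = requests.Session()
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
page = sess.get('https://www.skroutz.gr/s/11706397/Guy-Laroche-Linda-Red.html', headers=headers)
tree = html.fromstring(page.content)

# collect the product IDs from the placeholder links, then POST them as JSON
hrefs = tree.xpath('//a[@class="js-product-link content-placeholder"]/@href')
ids = [href.split('/')[-1] for href in hrefs]
resp = sess.post('https://www.skroutz.gr/personalization/product_prices.json',
                 data=json.dumps({'product_ids': ids}),
                 headers={'content-type': 'application/json'})
print(resp.json())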

EDIT

OK, the information is "hidden" in base64-encoded HTML comments within the "raw" page. The following code identifies the required information about the shops. The commented-out lines can be used to make the follow-up POST request, in case any further information is needed that is not present in the base64-encoded string (to be honest, I did not bother to check).

import base64
import requests
from lxml import html
import simplejson as json
import copy

sess = requests.Session()
sess.head('https://www.skroutz.gr')

headers = {
    'authority': 'www.skroutz.gr',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Google Chrome"; v="83"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'if-none-match': 'W/"e6fb8187391e99a90270c2351f9d17cd"'
}

params = (('o', '\u039C\u03C0\u03BF\u03C5\u03C1\u03BD\u03BF\u03CD\u03B6\u03B9 Guy Laroche Linda Red'),)

response = sess.get('https://www.skroutz.gr/s/11706397/Guy-Laroche-Linda-Red.html', headers=headers, params=params)
tree = html.fromstring(response.content)
# hrefs = tree.xpath('//a[@class="js-product-link content-placeholder"]/@href')
# ids = [x.split('/')[-1] for x in hrefs]
# headers2 = copy.deepcopy(headers)
# headers2['content-type'] = 'application/json'
# ret = sess.post('https://www.skroutz.gr/personalization/product_prices.json', data=json.dumps({'product_ids': ids}), headers=headers2)
details = tree.xpath('//script[@type="application/json"]')[0]
# details.text_content() contains the base64 encoded elements within HTML comments
details_b64 = details.text_content()[4:-3] # strip off the HTML comment markers
details_d = json.loads(base64.b64decode(details_b64).decode('utf-8')) # base64-decode the string, decode it as utf-8 and parse the resulting JSON object
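
The structure of the decoded payload is not documented here, so it is worth inspecting it before trying to extract the shop details; for example:

# inspect the decoded structure before pulling out shop information
print(type(details_d))
if isinstance(details_d, dict):
    print(list(details_d.keys()))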
