当 URL 保持不变时，无法抓取网页第二页上的名称

import requests
from bs4 import BeautifulSoup
from pprint import pprint

# Page that is loaded first to obtain the session cookies and the JSF view state.
link = 'https://www.gebiz.gov.sg/ptn/opportunity/BOListing.xhtml?origin=menu'
# Endpoint that receives the partial (AJAX) form post used for pagination.
url = 'https://www.gebiz.gov.sg/ptn/opportunity/BOListing.xhtml'

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    r = s.get(link)
    # Fail early on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")

    # The hidden ViewState input must be echoed back in the form post.
    # Check for it explicitly so a changed page layout produces a clear
    # error rather than a TypeError from subscripting None.
    view_state = soup.select_one('input[name="javax.faces.ViewState"]')
    if view_state is None:
        raise RuntimeError('javax.faces.ViewState input not found in the listing page')

    # NOTE(review): the j_idt* component ids below are hard-coded from one
    # capture of the page; presumably they are generated by the server and
    # may differ between deployments -- confirm against the live page.
    payload = {
        'contentForm': 'contentForm',
        'contentForm:j_idt171_windowName': '',
        'contentForm:j_idt187_listButton2_HIDDEN-INPUT': '',
        'contentForm:j_idt192_searchBar_INPUT-SEARCH': '',
        'contentForm:j_idt192_searchBarList_HIDDEN-SUBMITTED-VALUE': '',
        'contentForm:j_id135_0': 'Title',
        'contentForm:j_id135_1': 'Document No.',
        'contentForm:j_id136': 'Match All',
        'contentForm:j_idt853_select': 'ON',
        'contentForm:j_idt859_select': '0',
        'javax.faces.ViewState': view_state['value'],
        'javax.faces.source': 'contentForm:j_idt902:j_idt955_2_2',
        'javax.faces.partial.event': 'click',
        'javax.faces.partial.execute': 'contentForm:j_idt902:j_idt955_2_2 contentForm:j_idt902',
        'javax.faces.partial.render': 'contentForm:j_idt902:j_idt955 contentForm dialogForm',
        'javax.faces.behavior.event': 'action',
        'javax.faces.partial.ajax': 'true',
    }

    s.headers['Referer'] = 'https://www.gebiz.gov.sg/ptn/opportunity/BOListing.xhtml?origin=menu'
    s.headers['Faces-Request'] = 'partial/ajax'
    s.headers['Origin'] = 'https://www.gebiz.gov.sg'
    s.headers['Host'] = 'www.gebiz.gov.sg'
    # NOTE(review): advertising 'br' lets the server answer with Brotli;
    # requests only decodes that when the brotli/brotlicffi package is
    # installed -- confirm, otherwise res.text may be unreadable.
    s.headers['Accept-Encoding'] = 'gzip, deflate, br'

    res = s.post(url, data=payload, allow_redirects=False)
    # soup = BeautifulSoup(res.text,"lxml")
    # for item in soup.select(".commandLink_TITLE-BLUE"):
    #     print(item.get_text(strip=True))
    print(res.text)

1条回答

网友

1楼 · 发布于 2024-10-03 11:19:54

您可以使用Selenium在页面之间进行遍历。下面的代码将允许您执行此操作

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.options import Options
import time


chrome_options = Options()
# Uncomment to run without a visible browser window / with a custom user agent.
#chrome_options.add_argument("--headless")
#chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36")

# XPath of the pager's "Next" button (an <input> whose value starts with "Next").
NEXT_BUTTON_XPATH = "//input[starts-with(@value, 'Next')]"
# Seconds to wait after each navigation before looking at the page.
PAGE_LOAD_WAIT_SECONDS = 5

driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
try:
    driver.get("https://www.gebiz.gov.sg/ptn/opportunity/BOListing.xhtml?origin=menu")

    # Walk through the listing by clicking "Next" until it is no longer present.
    while True:
        # Give the current page time to finish rendering.
        time.sleep(PAGE_LOAD_WAIT_SECONDS)

        # TODO: extract the data needed from the current page here.

        # find_elements_* returns an empty list when nothing matches, whereas
        # find_element_* raises NoSuchElementException and never returns None,
        # so the plural form is what allows a clean stop on the last page.
        # NOTE(review): assumes the button is absent (not merely disabled) on
        # the last page -- confirm against the site.
        next_buttons = driver.find_elements_by_xpath(NEXT_BUTTON_XPATH)
        if not next_buttons:
            break
        next_buttons[0].click()
finally:
    # Always close the browser, even if scraping fails part-way through.
    driver.quit()

我没有添加提取机构名称的代码。我想这对你来说并不难

请确保已安装 Selenium 并下载了 ChromeDriver。驱动程序的版本必须与本机安装的 Chrome 浏览器版本一致；您可以在 Chrome 浏览器的“关于”页面中查看浏览器版本

相关问题更多 >

编程相关推荐

热门问题

热门文章