Scraping the next page with BeautifulSoup and webdriver


I am trying to scrape all of the job links from https://www.vietnamworks.com/tim-viec-lam/tat-ca-viec-lam using BeautifulSoup and Selenium.

The problem is that I can only scrape the links on the first page, and I don't know how to get the links from the following pages.
Here is the code I have tried:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
import time

import requests
from bs4 import BeautifulSoup
import array as arr
import pandas as pd
#The first line imports the Web Driver, and the second imports Chrome Options
#-----------------------------------#

#Chrome Options
all_link = []

chrome_options = Options()
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--incognito')
chrome_options.add_argument('--window-size=1920x1080')
chrome_options.add_argument('--headless')
#-----------------------------------#

driver = webdriver.Chrome(chrome_options=chrome_options, executable_path="C:/webdriver/chromedriver.exe")
 
#Open url
url = "https://www.vietnamworks.com/tim-viec-lam/tat-ca-viec-lam"
driver.get(url)
time.sleep(2)
#-----------------------------------#

page_source = driver.page_source

soup = BeautifulSoup(page_source, "html.parser")


block_job_list = soup.find_all("div",{"class":"d-flex justify-content-center align-items-center logo-area-wrapper logo-border"})
for i in block_job_list:
    link = i.find("a")
    all_link.append("https://www.vietnamworks.com/"+ link.get("href"))

1 Answer

Since your problem is about iterating through the pages, the code below will help you do that. As noted, insert your scraping code inside the while loop.

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time
from webdriver_manager.chrome import ChromeDriverManager   # use pip install webdriver_manager if not installed

option = webdriver.ChromeOptions()
CDM = ChromeDriverManager()
driver = webdriver.Chrome(CDM.install(), options=option)

url = 'https://www.vietnamworks.com/tim-viec-lam/tat-ca-viec-lam'
driver.get(url)
time.sleep(3)

page_num = 1
links = []
driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")

while True:

    # create the soup element here so that it can get the page source of every page        

    # sample scraping of url's of the jobs posted
    for i in driver.find_elements_by_class_name('job-title'):
        links.append(i.get_attribute('href'))

    # moves to next page
    try:
        print(f'On page {str(page_num)}')
        print()
        page_num+=1
        driver.find_element_by_link_text(str(page_num)).click()
        time.sleep(3)

    # checks only at the end of the page
    except NoSuchElementException:
        print('End of pages')
        break

driver.quit()
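
As a small follow-up, not part of the original answer: once the loop finishes, the collected links could be written to a file, for example with pandas (which the question already imports); the file name job_links.csv is only an illustration.

import pandas as pd

# drop duplicates while keeping the original order, then save to disk
unique_links = list(dict.fromkeys(links))
pd.DataFrame({"url": unique_links}).to_csv("job_links.csv", index=False)
print(f"Saved {len(unique_links)} links to job_links.csv")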

Edit:

  1. Simplified and modified the pagination approach.
  2. If you are using BeautifulSoup, the page_source and soup variables have to be created inside the while loop, because the page source changes after every iteration. In your code you only extracted the source of the first page, which is why you got the same output repeated once per page. See the sketch after this list.
  3. With ChromeDriverManager from the webdriver-manager package there is no need to specify the driver location or executable path. You can simply copy-paste this code and run it on any machine that has Chrome installed. If the package is missing, run pip install webdriver_manager from cmd before running the code.
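
To make point 2 concrete, here is a minimal sketch of re-parsing the page source with BeautifulSoup inside the while loop. It assumes the same page layout and CSS class names as the question's own code, which may need adjusting:

from bs4 import BeautifulSoup

while True:
    # re-parse the page source on every iteration, because it changes after each page click
    soup = BeautifulSoup(driver.page_source, "html.parser")
    block_job_list = soup.find_all("div", {"class": "d-flex justify-content-center align-items-center logo-area-wrapper logo-border"})
    for block in block_job_list:
        link = block.find("a")
        if link and link.get("href"):
            links.append("https://www.vietnamworks.com/" + link.get("href"))

    # ...pagination code from the answer above (click the next page number) goes here...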

Warning: avoid exposing any account's real username and password, just as you would in code published on GitHub.
