如何遍历站点中的所有页面？

# Scrape the first page of therapistlocator.net search results for ZIP 10001
# and print name, e-mail, phone and location for every therapist listed.
#
# The script replays one ASP.NET AJAX postback (the "btnFilter" event) and then
# fetches each profile page linked from the result rows.
import sys

import requests
from bs4 import BeautifulSoup as bs

# Running number of result rows seen so far; used only for the printed banner.
count = 0

# NOTE(review): the cookie and token values below look like they were captured
# from one browser session - presumably they expire and must be refreshed.
cookies = {
    'ASP.NET_SessionId': 'uij03wnehlax221msxy4jkno',
    '__RequestVerificationToken': 'ReASHPRKAhth_7S9C1U7qg7de4AxnkIdFxUt6yhMKTdWPHsZl_1vC-pJOJZ8fQwopOL56MS3yjVi1D6WhrKm2ZyKoNU1',
    'LoginGuid': '',
    '_ga': 'GA1.2.1257196513.1587105612',
    'Asi.Web.Browser.CookiesEnabled': 'true',
    'tltos': '1',
    '_gid': 'GA1.2.1385127198.1587230995',
    '__utmxst': '180',
}

headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '"Google Chrome 80"',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cache-Control': 'no-cache',
    'Sec-Fetch-Dest': 'empty',
    'X-Requested-With': 'XMLHttpRequest',
    'X-MicrosoftAjax': 'Delta=true',
    'Accept': '*/*',
    'Origin': 'https://www.therapistlocator.net',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001&name=',
    'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
}

# Query-string parameters of the search page.
params = (
    ('zip', '10001'),
    ('name', ''),
)

# Form body of the postback that submits the search filter.
data = {
    'ctl01$ScriptManager1': 'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$rapLoadingPanel|ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$btnFilter',
    '__WPPS': 's',
    '__ClientContext': '{"baseUrl":"/","isAnonymous":true,"loggedInPartyId":"132791","selectedPartyId":"132791","websiteRoot":"http://www.therapistlocator.net/","virtualDir":""}',
    '__CTRLKEY': '',
    '__SHIFTKEY': '',
    'ctl01_ScriptManager1_TSM': '',
    'PageInstanceKey': '54d43052-a674-4b86-bebe-f3635b68db37',
    '__RequestVerificationToken': 'Q0PHslrV-Kffbpo7LCbjPe8RMOcT59p8PRLefKE93uc6G4hfz6Ewpjg_bCI3SV2MPNfGUd1VirBZ3igc1rB51IPZTvc1',
    'TemplateUserMessagesID': 'ctl01_TemplateUserMessages_ctl00_Messages',
    'PageIsDirty': 'false',
    'IsControlPostBackctl01$HeaderLogo$HeaderLogoSpan': '1',
    'IsControlPostBackctl01$SearchField': '1',
    '__EVENTTARGET': 'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$btnFilter',
    '__EVENTARGUMENT': '',
    'NavMenuClientID': 'ctl01_Primary_NavMenu',
    'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciTitleandintro_9bb3191967f941e883b2c501791a2061$ciTitleandintro_9bb3191967f941e883b2c501791a2061': '1',
    'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciStyles_59e895c08d4f407aa0ada09911013fd2$ciStyles_59e895c08d4f407aa0ada09911013fd2': '1',
    'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon': '1',
    'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401$ciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401': '1',
    'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewContentHtml_0be4f96424fb47de90d1c22db2588e85$ciNewContentHtml_0be4f96424fb47de90d1c22db2588e85': '1',
    'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSGeoCodingCommon$ciNewATSGeoCodingCommon': '1',
    'IsControlPostBackctl01$TemplateBody$ContentPage1': '1',
    'IsControlPostBackctl01$TemplateBody$ContentPage2': '1',
    'IsControlPostBackctl01$TemplateBody$ContentPage3': '1',
    'IsControlPostBackctl01$TemplateBody$ContentPageFooter1': '1',
    'IsControlPostBackctl01$FooterCopyright$FooterCopyright': '1',
    'IsControlPostBackctl01$FooterCopyright$tosol': '1',
    '__VIEWSTATE': '/wEPaA8FDzhkN2UyOWRmZGE0ZGQ4NxgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WBwUYY3RsMDEkTG9naW5TdGF0dXMxJGN0bDAxBRhjdGwwMSRMb2dpblN0YXR1czEkY3RsMDMFFWN0bDAxJFByaW1hcnkkTmF2TWVudQUUY3RsMDEkV2luZG93TWFuYWdlcjEFE2N0bDAxJEdlbmVyaWNXaW5kb3cFE2N0bDAxJE9iamVjdEJyb3dzZXIFGWN0bDAxJE9iamVjdEJyb3dzZXJEaWFsb2fx/JLd/+XByre34VShpvA4WynsKA==',
    '__VIEWSTATEGENERATOR': '37E773F2',
    'ctl01$lastClickedElementId': '',
    'ctl01$SearchField$SearchTerms': 'Keyword Search',
    'ctl01_Primary_NavMenu_ClientState': '',
    'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtPOSTALCODE0': '10001',
    'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ddlDISTANCE0': '5',
    'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtName_TL0': '',
    '__ASYNCPOST': 'true',
    '': '',
}

response = requests.post(
    'https://www.therapistlocator.net/tl/therapist-finder.aspx',
    headers=headers,
    params=params,
    cookies=cookies,
    data=data,
)

# Parse the raw response bytes. The previous version parsed
# str(response.content), i.e. the repr of a bytes object, in which quotes and
# newlines are backslash-escaped; it then had to strip backslashes and quotes
# from every href to compensate. Parsing the real bytes makes that unnecessary.
con = bs(response.content, 'lxml')

wrapper = con.find('div', class_='QueryDisplayWrapper')
if wrapper is None:
    # Without this guard the script died with an AttributeError on None.
    print('No QueryDisplayWrapper element in the response; nothing to scrape.',
          file=sys.stderr)
    therapists = []
else:
    therapists = wrapper.find_all('div', class_='row')

for therapist in therapists:
    count += 1

    name_cell = therapist.find('div', class_='item name')
    profile_anchor = name_cell.find('a') if name_cell is not None else None
    if profile_anchor is None or not profile_anchor.get('href'):
        # Rows without a profile link used to abort the whole run.
        print('Skipping result row {}: no profile link found'.format(count),
              file=sys.stderr)
        continue

    name = profile_anchor.text.strip()
    therapist_link = 'https://www.therapistlocator.net{}'.format(
        profile_anchor.get('href'))

    therapist_info = requests.get(therapist_link)
    if not therapist_info.ok:
        continue

    dataa = bs(therapist_info.text, 'lxml')
    try:
        email = dataa.find('a', class_='PanelField').text.strip()
        # Look the value panels up once instead of once per field.
        panels = dataa.find_all('div', class_='PanelFieldValue')
        location = panels[0].find('span').text.strip()
        # Text node following the first <br> of the address panel.
        loc1 = panels[0].find('br').next_sibling.strip()
        phone = panels[1].find('span').text.strip()
    except (AttributeError, IndexError, TypeError) as exc:
        # A missing element surfaces as one of these; report it instead of
        # silently dropping the therapist as the bare `except: pass` did.
        print('Skipping {}: profile page is missing a field ({})'.format(name, exc),
              file=sys.stderr)
        continue

    if loc1:
        # Separate the part after the <br> from the text before it. Guarded
        # because str.replace('', ' ') would insert a space between every
        # character of the location.
        location = location.replace(loc1, ' {}'.format(loc1))

    print('\n*********** ' + str(count) + ' ************\n')
    print('Name: {}'.format(name))
    print('Email: {}'.format(email))
    print('Phone: {}'.format(phone))
    print('Location: {}'.format(location))

2条回答

网友

1楼 · 编辑于 2024-10-03 06:32:11

解决方案

该网站使用 JavaScript 加载结果：单击“下一页”时，会触发一个 JavaScript 函数来填充结果列表。您可以使用浏览器自动化工具，以编程方式访问其余页面。

请看这些：

Selenium 分页

navigating through pagination with selenium in python

核心步骤

您需要将问题分解为以下步骤：

使用 Selenium（配合 Python）进行浏览器自动化，访问目标页面
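获取总页数（页面源代码的末尾有一个分页部分，可从中读取）。或者，也可以自行计算：total_pages = total_results // max_results + 1，其中 max_results = 25（默认值）。注意：当 total_results 恰好是 max_results 的整数倍时，该公式会多算一页，此时可改用 (total_results + max_results - 1) // max_results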
对于每页：
1. 使用 BeautifulSoup 从 Selenium 获取到的页面源代码（driver.page_source）中提取数据
2. 使用selenium点击下一页链接
3. 如果愿意，将结果附加到dict或list或pandas.DataFrame中

网友

2楼 · 编辑于 2024-10-03 06:32:11

import re
from urllib.parse import unquote
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Control-ID fragments that recur throughout the postback form below.
_WEB_PARTS = "ctl01$TemplateBody$WebPartManager1"
_QUERY_CTRL = (
    f"{_WEB_PARTS}$gwpciNewATSCustomQueryDisplayCommon"
    "$ciNewATSCustomQueryDisplayCommon"
)
# Same control path written with "_" instead of "$" as the separator.
_QUERY_ID = _QUERY_CTRL.replace("$", "_")
_FIRST_PAGE_LINK = f"{_QUERY_CTRL}$lnkFirstPage"

# Form keys whose values name the paging link to request; main() rewrites
# these between page requests.
fish = [
    "ctl01$ScriptManager1",
    "ctl01$lastClickedElementId",
    "__EVENTTARGET",
]

# Controls that are flagged with an "IsControlPostBack<control>" = "1" field.
_POSTBACK_CONTROLS = (
    "ctl01$HeaderLogo$HeaderLogoSpan",
    "ctl01$SearchField",
    f"{_WEB_PARTS}$gwpciTitleandintro_9bb3191967f941e883b2c501791a2061$ciTitleandintro_9bb3191967f941e883b2c501791a2061",
    f"{_WEB_PARTS}$gwpciStyles_59e895c08d4f407aa0ada09911013fd2$ciStyles_59e895c08d4f407aa0ada09911013fd2",
    _QUERY_CTRL,
    f"{_WEB_PARTS}$gwpciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401$ciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401",
    f"{_WEB_PARTS}$gwpciNewContentHtml_0be4f96424fb47de90d1c22db2588e85$ciNewContentHtml_0be4f96424fb47de90d1c22db2588e85",
    f"{_WEB_PARTS}$gwpciNewATSGeoCodingCommon$ciNewATSGeoCodingCommon",
    "ctl01$TemplateBody$ContentPage1",
    "ctl01$TemplateBody$ContentPage2",
    "ctl01$TemplateBody$ContentPage3",
    "ctl01$FooterCopyright$FooterCopyright",
    "ctl01$FooterCopyright$tosol",
)

# Static part of the postback form, initially targeting the "lnkFirstPage"
# link. Key order is kept exactly as it was so the encoded body is unchanged.
data = {
    "ctl01$ScriptManager1": f"{_QUERY_CTRL}${_QUERY_CTRL}$rapLoadingPanel|{_FIRST_PAGE_LINK}",
    "__WPPS": "s",
    "__CTRLKEY": "",
    "__SHIFTKEY": "",
    "NavMenuClientID": "ctl01_Primary_NavMenu",
    "IsControlPostBackctl01$TemplateBody$ContentPageFooter1": "1",
    "ctl01$lastClickedElementId": f"id|{_QUERY_ID}_lnkFirstPage",
    "ctl01$SearchField$SearchTerms": "Keyword Search",
    "ctl01_Primary_NavMenu_ClientState": "",
    # Search filter: ZIP code, distance, name and results per page.
    f"{_QUERY_CTRL}$txtPOSTALCODE0": "10001",
    f"{_QUERY_CTRL}$ddlDISTANCE0": "5",
    f"{_QUERY_CTRL}$txtName_TL0": "",
    f"{_QUERY_CTRL}$ddlResultsPerPage": "25",
    "ctl01_GenericWindow_ClientState": "",
    "ctl01_ObjectBrowser_ClientState": "",
    "ctl01_ObjectBrowserDialog_ClientState": "",
    "ctl01_WindowManager1_ClientState": "",
    "__EVENTTARGET": _FIRST_PAGE_LINK,
    "__EVENTARGUMENT": "",
    "__LASTFOCUS": "",
    "__VIEWSTATEGENERATOR": "37E773F2",
    "__ClientContext": '{"baseUrl":"/","isAnonymous":true,"loggedInPartyId":"132791","selectedPartyId":"132791","websiteRoot":"http://www.therapistlocator.net/","virtualDir":""}',
    "TemplateUserMessagesID": "ctl01_TemplateUserMessages_ctl00_Messages",
    "PageIsDirty": "false",
    **{f"IsControlPostBack{control}": "1" for control in _POSTBACK_CONTROLS},
    "__ASYNCPOST": "true",
    "RadAJAXControlID": f"{_QUERY_ID}_rapLoading",
}

# Headers sent with every postback.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0',
    'Referer': 'https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001',
}


def _advance_paging_target(form, page_number):
    """Rewrite the paging fields of *form* for the request after *page_number*.

    Every field named in ``fish`` holds a control path ending in a paging link
    name (initially ``lnkFirstPage``). The text after the first "k" is replaced
    so the next postback asks for ``lnkSecondPage`` (after page 1) or
    ``lnkLast`` (after any later page).
    """
    suffix = "SecondPage" if page_number == 1 else "Last"
    for key in fish:
        form[key] = re.sub(r'(k)(.+)', r"\1" + suffix, form[key])


def _field_text(fields, index):
    """Return the ``<span>`` text of ``fields[index]``, or "N/A" if absent."""
    try:
        return fields[index].span.text
    except (IndexError, AttributeError):
        # Panel missing, or panel without a <span>.
        return "N/A"


def _scrape_profile(session, profile_url):
    """Fetch one profile page and return ``[name, address, phone, email]``.

    Fields that cannot be found on the page are reported as "N/A", so a single
    incomplete profile no longer aborts the run before the CSV is written.
    """
    r = session.get(profile_url)
    soup = BeautifulSoup(r.content, 'html.parser')
    load = soup.select("div.PanelFieldValue")
    # Panel order as used by the page: 0 = address, 1 = phone, 2 = name.
    name = _field_text(load, 2)
    add = _field_text(load, 0)
    ph = _field_text(load, 1)
    email_anchor = soup.select_one("a.PanelField")
    em = email_anchor.text if email_anchor is not None else "N/A"
    return [name, add, ph, em]


def main(url):
    """Scrape three result pages of *url* and save the profiles to data.csv.

    Steps:
      1. GET *url* and read the per-page tokens the postback needs.
      2. POST the search form three times, collecting every link whose href
         contains "viewprofile".
      3. Fetch each profile, then print the table and write ``data.csv``.

    Args:
        url: Address of the therapist-finder search page, including its query
            string.
    """
    with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')

        # Work on a copy: writing into the module-level ``data`` would leave
        # it pointing at the last page, so a second call to main() would
        # start paging from the wrong link.
        form = dict(data)
        # Everything after the third "=" of the unquoted Telerik script URL.
        form['ctl01_ScriptManager1_TSM'] = unquote(soup.select_one(
            "script[src*=Telerik]").get("src")).split("=", 3)[-1]
        form['__VIEWSTATE'] = soup.find(
            "input", id="__VIEWSTATE").get("value")
        form['PageInstanceKey'] = re.search(
            r'PageInstanceKey=(.+?)"', r.text).group(1)
        form['__RequestVerificationToken'] = soup.find(
            "input", id="__RequestVerificationToken").get("value")

        urls = []
        for num in range(1, 4):
            print(f"Extracting Links From Page {num}")

            r = req.post(url, data=form, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')

            # urljoin resolves the href against the page URL, replacing the
            # hard-coded url[:32] slice that only worked for this exact host.
            urls.extend(urljoin(url, link.get("href"))
                        for link in soup.select("a[href*=viewprofile]"))

            _advance_paging_target(form, num)

        print(f"Collected {len(urls)} Links")
        done = [_scrape_profile(req, x) for x in urls]

    df = pd.DataFrame.from_records(
        done, columns=["Name", "Address", "Phone", "Email"])
    print(df)
    df.to_csv("data.csv", index=False)


if __name__ == "__main__":
    main("https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001")

输出：view-online

解决方案

Selenium 分页

核心步骤

相关问题更多 >

编程相关推荐

热门问题

热门文章