<pre class="lang-py prettyprint-override"><code>import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote
import re
import pandas as pd
# Keys of ``data`` whose values embed the pager link name (initially
# ``lnkFirstPage``); ``main`` rewrites these values between POSTs to request
# the following result pages.
fish = [
    "ctl01$ScriptManager1",
    "ctl01$lastClickedElementId",
    "__EVENTTARGET",
]
# Form payload template that ``main`` sends as the body of each paging POST.
# ``main`` adds the per-session fields (``ctl01_ScriptManager1_TSM``,
# ``__VIEWSTATE``, ``PageInstanceKey``, ``__RequestVerificationToken``) at
# runtime and rewrites the pager-link values between pages.
# Key order is kept exactly as captured, since it determines the order of the
# encoded form fields.
data = {
    # Pager link being "clicked" (first page to start with).
    "ctl01$ScriptManager1": "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$rapLoadingPanel|ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$lnkFirstPage",
    "__WPPS": "s",
    "__CTRLKEY": "",
    "__SHIFTKEY": "",
    "NavMenuClientID": "ctl01_Primary_NavMenu",
    "IsControlPostBackctl01$TemplateBody$ContentPageFooter1": "1",
    "ctl01$lastClickedElementId": "id|ctl01_TemplateBody_WebPartManager1_gwpciNewATSCustomQueryDisplayCommon_ciNewATSCustomQueryDisplayCommon_lnkFirstPage",
    "ctl01$SearchField$SearchTerms": "Keyword Search",
    "ctl01_Primary_NavMenu_ClientState": "",
    # Search form inputs (presumably postal code, distance, name filter and
    # page size, judging by the field names -- confirm against the page).
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtPOSTALCODE0": "10001",
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ddlDISTANCE0": "5",
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtName_TL0": "",
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ddlResultsPerPage": "25",
    "ctl01_GenericWindow_ClientState": "",
    "ctl01_ObjectBrowser_ClientState": "",
    "ctl01_ObjectBrowserDialog_ClientState": "",
    "ctl01_WindowManager1_ClientState": "",
    "__EVENTTARGET": "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$lnkFirstPage",
    "__EVENTARGUMENT": "",
    "__LASTFOCUS": "",
    "__VIEWSTATEGENERATOR": "37E773F2",
    # JSON text sent verbatim; single quotes avoid escaping the inner quotes.
    "__ClientContext": '{"baseUrl":"/","isAnonymous":true,"loggedInPartyId":"132791","selectedPartyId":"132791","websiteRoot":"http://www.therapistlocator.net/","virtualDir":""}',
    "TemplateUserMessagesID": "ctl01_TemplateUserMessages_ctl00_Messages",
    "PageIsDirty": "false",
    # Per-control postback flags, all set to "1".
    "IsControlPostBackctl01$HeaderLogo$HeaderLogoSpan": "1",
    "IsControlPostBackctl01$SearchField": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciTitleandintro_9bb3191967f941e883b2c501791a2061$ciTitleandintro_9bb3191967f941e883b2c501791a2061": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciStyles_59e895c08d4f407aa0ada09911013fd2$ciStyles_59e895c08d4f407aa0ada09911013fd2": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401$ciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewContentHtml_0be4f96424fb47de90d1c22db2588e85$ciNewContentHtml_0be4f96424fb47de90d1c22db2588e85": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSGeoCodingCommon$ciNewATSGeoCodingCommon": "1",
    "IsControlPostBackctl01$TemplateBody$ContentPage1": "1",
    "IsControlPostBackctl01$TemplateBody$ContentPage2": "1",
    "IsControlPostBackctl01$TemplateBody$ContentPage3": "1",
    "IsControlPostBackctl01$FooterCopyright$FooterCopyright": "1",
    "IsControlPostBackctl01$FooterCopyright$tosol": "1",
    "__ASYNCPOST": "true",
    "RadAJAXControlID": "ctl01_TemplateBody_WebPartManager1_gwpciNewATSCustomQueryDisplayCommon_ciNewATSCustomQueryDisplayCommon_rapLoading",
}
# Extra HTTP headers ``main`` attaches to every paging POST: a desktop
# Firefox User-Agent and the search page as Referer.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Referer": "https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001",
}
def main(url):
    """Scrape therapist profiles reachable from the search page at ``url``.

    The search page is fetched once to harvest the per-session form fields,
    the paging postback is replayed three times to collect profile links, and
    every linked profile is then visited. The resulting table (name, address,
    phone, e-mail) is printed and written to ``data.csv``.

    The module-level ``data`` template is copied rather than modified, so the
    function can be called repeatedly and always starts from the first page.

    Args:
        url: Address of the therapist-finder search page. Profile links found
            in the results are resolved relative to it.

    Raises:
        AttributeError: If the Telerik script tag, one of the hidden inputs or
            the ``PageInstanceKey`` marker is missing from the search page, or
            a profile field has no ``<span>``.
        IndexError: If a profile page has fewer than three
            ``div.PanelFieldValue`` elements.
    """
    # Function-scope import: keeps this block self-contained without touching
    # the file's import header.
    from urllib.parse import urljoin

    # Matches from the first "k" of a value to its end. In the pager values
    # listed in ``fish`` that "k" is the one in "lnk<Page>", so substituting
    # swaps the link name that follows it.
    pager_link = re.compile(r'(k)(.+)')

    # Work on a copy so the shared template is not left pointing at the last
    # page (and carrying stale session tokens) after the call.
    form = dict(data)

    with requests.Session() as session:
        response = session.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Per-session values that must accompany each postback.
        telerik_src = soup.select_one("script[src*=Telerik]").get("src")
        form['ctl01_ScriptManager1_TSM'] = unquote(
            telerik_src).split("=", 3)[-1]
        form['__VIEWSTATE'] = soup.find(
            "input", id="__VIEWSTATE").get("value")
        form['PageInstanceKey'] = re.search(
            r'PageInstanceKey=(.+?)"', response.text).group(1)
        form['__RequestVerificationToken'] = soup.find(
            "input", id="__RequestVerificationToken").get("value")

        profile_urls = []
        for num in range(1, 4):
            print(f"Extracting Links From Page {num}")
            response = session.post(url, data=form, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            profile_urls.extend(
                urljoin(url, link.get("href"))
                for link in soup.select("a[href*=viewprofile]")
            )

            # Point the pager fields at the link to "click" on the next POST:
            # the second page after page 1, the last page afterwards.
            next_link = "SecondPage" if num == 1 else "Last"
            for key in fish:
                form[key] = pager_link.sub(rf"\1{next_link}", form[key])

        print(f"Collected {len(profile_urls)} Links")

        records = []
        for profile_url in profile_urls:
            response = session.get(profile_url)
            soup = BeautifulSoup(response.content, 'html.parser')
            fields = soup.select("div.PanelFieldValue")
            # The code relies on the field order address, phone, name.
            name = fields[2].span.text
            address = fields[0].span.text
            phone = fields[1].span.text
            # Not every profile has an e-mail link; select_one returns None
            # in that case.
            email_link = soup.select_one("a.PanelField")
            email = email_link.text if email_link is not None else "N/A"
            records.append([name, address, phone, email])

    df = pd.DataFrame.from_records(
        records, columns=["Name", "Address", "Phone", "Email"])
    print(df)
    df.to_csv("data.csv", index=False)
if __name__ == "__main__":
    # Run the scrape only when executed as a script, so that importing this
    # module does not trigger network requests or write data.csv.
    main("https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001")
</code></pre>
<p>输出:<a href="http://www.sharecsv.com/s/f24282821fa1fee65cd0cb1d0564762c/data.csv" rel="nofollow noreferrer">view-online</a></p>
<p><a href="https://i.stack.imgur.com/kcspn.png" rel="nofollow noreferrer"><img src="https://i.stack.imgur.com/kcspn.png" alt="enter image description here"/></a></p>