我正在从一个网站上抓取治疗师的姓名、电子邮件、电话和位置。我已经从第一页抓取了数据,但我无法对其余的页面进行分页。
我正在使用 requests 和 BeautifulSoup。
该网站是 therapistlocator.net(原文此处的"Here"为超链接,搜索页的完整地址见下方代码)。
第一页的代码为:
import sys

import requests
from bs4 import BeautifulSoup as bs
# Request configuration for the therapist-finder search form.
#
# NOTE(review): the cookie, token, PageInstanceKey and __VIEWSTATE values below
# are hard-coded; presumably they were captured from one browser session and
# may stop working once that session expires -- verify before relying on them.


def _web_part(part_id):
    """Return the control ID of the web part *part_id* under WebPartManager1."""
    return 'ctl01$TemplateBody$WebPartManager1$gwpci{0}$ci{0}'.format(part_id)


def _postback(control_id):
    """Return the 'IsControlPostBack...' form key for *control_id*."""
    return 'IsControlPostBack' + control_id


# Control ID of the query web part; it prefixes the search inputs and button.
_QUERY_CTRL = _web_part('NewATSCustomQueryDisplayCommon')

# Counter used to number the entries that get printed.
count = 0

cookies = {
    'ASP.NET_SessionId': 'uij03wnehlax221msxy4jkno',
    '__RequestVerificationToken': 'ReASHPRKAhth_7S9C1U7qg7de4AxnkIdFxUt6yhMKTdWPHsZl_1vC-pJOJZ8fQwopOL56MS3yjVi1D6WhrKm2ZyKoNU1',
    'LoginGuid': '',
    '_ga': 'GA1.2.1257196513.1587105612',
    'Asi.Web.Browser.CookiesEnabled': 'true',
    'tltos': '1',
    '_gid': 'GA1.2.1385127198.1587230995',
    '__utmxst': '180',
}

# Headers sent with the form POST; X-Requested-With / X-MicrosoftAjax mark it
# as an asynchronous (XMLHttpRequest) request.
headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '"Google Chrome 80"',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cache-Control': 'no-cache',
    'Sec-Fetch-Dest': 'empty',
    'X-Requested-With': 'XMLHttpRequest',
    'X-MicrosoftAjax': 'Delta=true',
    'Accept': '*/*',
    'Origin': 'https://www.therapistlocator.net',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001&name=',
    'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
}

# Query-string parameters: search by ZIP code 10001 with an empty name filter.
params = (('zip', '10001'), ('name', ''))

# Form body of the POST. Key order is kept exactly as in the original literal.
data = {
    'ctl01$ScriptManager1': '{0}${0}$rapLoadingPanel|{0}$btnFilter'.format(_QUERY_CTRL),
    '__WPPS': 's',
    '__ClientContext': '{"baseUrl":"/","isAnonymous":true,"loggedInPartyId":"132791","selectedPartyId":"132791","websiteRoot":"http://www.therapistlocator.net/","virtualDir":""}',
    '__CTRLKEY': '',
    '__SHIFTKEY': '',
    'ctl01_ScriptManager1_TSM': '',
    'PageInstanceKey': '54d43052-a674-4b86-bebe-f3635b68db37',
    '__RequestVerificationToken': 'Q0PHslrV-Kffbpo7LCbjPe8RMOcT59p8PRLefKE93uc6G4hfz6Ewpjg_bCI3SV2MPNfGUd1VirBZ3igc1rB51IPZTvc1',
    'TemplateUserMessagesID': 'ctl01_TemplateUserMessages_ctl00_Messages',
    'PageIsDirty': 'false',
    _postback('ctl01$HeaderLogo$HeaderLogoSpan'): '1',
    _postback('ctl01$SearchField'): '1',
    '__EVENTTARGET': _QUERY_CTRL + '$btnFilter',
    '__EVENTARGUMENT': '',
    'NavMenuClientID': 'ctl01_Primary_NavMenu',
    _postback(_web_part('Titleandintro_9bb3191967f941e883b2c501791a2061')): '1',
    _postback(_web_part('Styles_59e895c08d4f407aa0ada09911013fd2')): '1',
    _postback(_QUERY_CTRL): '1',
    _postback(_web_part('ScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401')): '1',
    _postback(_web_part('NewContentHtml_0be4f96424fb47de90d1c22db2588e85')): '1',
    _postback(_web_part('NewATSGeoCodingCommon')): '1',
    _postback('ctl01$TemplateBody$ContentPage1'): '1',
    _postback('ctl01$TemplateBody$ContentPage2'): '1',
    _postback('ctl01$TemplateBody$ContentPage3'): '1',
    _postback('ctl01$TemplateBody$ContentPageFooter1'): '1',
    _postback('ctl01$FooterCopyright$FooterCopyright'): '1',
    _postback('ctl01$FooterCopyright$tosol'): '1',
    '__VIEWSTATE': '/wEPaA8FDzhkN2UyOWRmZGE0ZGQ4NxgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WBwUYY3RsMDEkTG9naW5TdGF0dXMxJGN0bDAxBRhjdGwwMSRMb2dpblN0YXR1czEkY3RsMDMFFWN0bDAxJFByaW1hcnkkTmF2TWVudQUUY3RsMDEkV2luZG93TWFuYWdlcjEFE2N0bDAxJEdlbmVyaWNXaW5kb3cFE2N0bDAxJE9iamVjdEJyb3dzZXIFGWN0bDAxJE9iamVjdEJyb3dzZXJEaWFsb2fx/JLd/+XByre34VShpvA4WynsKA==',
    '__VIEWSTATEGENERATOR': '37E773F2',
    'ctl01$lastClickedElementId': '',
    'ctl01$SearchField$SearchTerms': 'Keyword Search',
    'ctl01_Primary_NavMenu_ClientState': '',
    _QUERY_CTRL + '$txtPOSTALCODE0': '10001',
    _QUERY_CTRL + '$ddlDISTANCE0': '5',
    _QUERY_CTRL + '$txtName_TL0': '',
    '__ASYNCPOST': 'true',
    '': '',
}
# Submit the search form and print name, email, phone and location for every
# therapist listed in the response. Diagnostics go to stderr so that stdout
# carries only the scraped entries.
#
# The timeouts keep the script from hanging forever on an unresponsive server.
response = requests.post(
    'https://www.therapistlocator.net/tl/therapist-finder.aspx',
    headers=headers,
    params=params,
    cookies=cookies,
    data=data,
    timeout=30,
)
# Parse the decoded body. str(response.content) would be the repr of a bytes
# object ("b'...'" with escaped quotes, newlines and non-ASCII bytes) rather
# than the markup itself.
html = response.text
con = bs(html, 'lxml')

results_wrapper = con.find('div', class_='QueryDisplayWrapper')
if results_wrapper is None:
    # Without the wrapper there is no result list to walk.
    print('No result list found in the response', file=sys.stderr)
    therapists = []
else:
    therapists = results_wrapper.find_all('div', class_='row')

for count, therapist in enumerate(therapists, start=1):
    name_cell = therapist.find('div', class_='item name')
    profile_anchor = name_cell.find('a') if name_cell is not None else None
    if profile_anchor is None or not profile_anchor.get('href'):
        # Row without a profile link: there is no detail page to fetch.
        continue

    name = profile_anchor.text.strip()
    # Kept from the original script: strip stray backslashes and single
    # quotes from the href before building the absolute URL.
    therapist_href = profile_anchor.get('href')
    therapist_href = therapist_href.replace('\\', '').replace("'", '')
    therapist_link = 'https://www.therapistlocator.net{}'.format(therapist_href)

    therapist_info = requests.get(therapist_link, timeout=30)
    if not therapist_info.ok:
        print(
            'Skipping {}: HTTP {}'.format(name, therapist_info.status_code),
            file=sys.stderr,
        )
        continue

    dataa = bs(therapist_info.text, 'lxml')
    try:
        email = dataa.find('a', class_='PanelField').text.strip()
        # Looked up once and reused: index 0 holds the address, 1 the phone.
        field_values = dataa.find_all('div', class_='PanelFieldValue')
        location = field_values[0].find('span').text.strip()
        loc1 = field_values[0].find('br').next_sibling.strip()
        phone = field_values[1].find('span').text.strip()
    except (AttributeError, IndexError, TypeError) as exc:
        # A missing element surfaces as None (AttributeError), a short list
        # (IndexError) or a non-text sibling after <br> (TypeError).
        print(
            'Skipping {}: profile page is missing a field ({})'.format(name, exc),
            file=sys.stderr,
        )
        continue

    if loc1:
        # .text glues the two address lines together; re-insert a space before
        # the second one. An empty loc1 must be skipped, because
        # str.replace('', ' ') would put a space between every character.
        location = location.replace(loc1, ' {}'.format(loc1))

    print('\n*********** ' + str(count) + ' ************\n')
    print('Name: {}'.format(name))
    print('Email: {}'.format(email))
    print('Phone: {}'.format(phone))
    print('Location: {}'.format(location))
其余的页面似乎都有相同的URL,所以我无法遍历它们
每页有25个条目。我希望把它们都弄到手
每个条目的示例输出:
Name: Marya B . Slater
Email: nycitytherapist@gmail.com
Phone: (646) 265-1555
Location: 360 W 34th St Apt 5P New York, NY 10001-2407
解决方案
你的网站使用了 JavaScript。当您单击"下一页"时,它会触发一个 JavaScript 函数来填充结果,因此各页的 URL 相同。
您可以使用浏览器自动化以编程方式访问其他页面。请参考:
Selenium 分页(Selenium pagination)
核心步骤
您需要将问题分解为以下步骤:
- 使用 Selenium(与 Python 一起使用)进行浏览器自动化来访问您的页面。
- 获取总页数(请参见页面源代码,其末尾有一个分页部分)。另外,默认情况下,您可以使用
total_pages = (total_results + max_results - 1) // max_results
其中,max_results = 25(即向上取整;若写成 total_results//max_results + 1,当结果总数恰为 25 的整数倍时会多算一页)。
- 对于每页:
  - 使用 BeautifulSoup 从使用 Selenium 获得的响应对象(页面源代码)中提取数据;
  - 使用 Selenium 点击下一页链接;
  - 如果愿意,将结果附加到 dict、list 或 pandas.DataFrame 中。
输出:view-online
相关问题 更多 >
编程相关推荐