在会话中发出后续 POST 请求不起作用（网页抓取）

<input type="hidden" name="__VIEWSTATEGENERATOR" id="__VIEWSTATEGENERATOR" value="4424DBE6"> <input type="hidden" name="__VIEWSTATEENCRYPTED" id="__VIEWSTATEENCRYPTED" value=""> <input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="TlIgNH

# NOTE(review): the original paste lost its line breaks; the line structure
# and indentation below are reconstructed -- confirm against the author's file.
# This is the failing script from the question: it runs one search POST and
# then tries to request the next results page.
import requests
from lxml import html
from bs4 import BeautifulSoup

# Fetched outside the session; `page` is not used again in this script.
page = requests.get('http://search.cpsa.ca/physiciansearch')
print('got page!')

# Form fields posted for the search; the token fields are filled in below.
d = {"ctl00$ctl13": "ctl00$ctl13|ctl00$MainContent$physicianSearchView$btnSearch",
     "ctl00$MainContent$physicianSearchView$txtLastName": "",
     'ctl00$MainContent$physicianSearchView$txtFirstName': "",
     'ctl00$MainContent$physicianSearchView$txtCity': "",
     "__VIEWSTATEENCRYPTED":"",
     'ctl00$MainContent$physicianSearchView$txtPostalCode': "",
     'ctl00$MainContent$physicianSearchView$rblPractice': "",
     'ctl00$MainContent$physicianSearchView$ddDiscipline': "",
     'ctl00$MainContent$physicianSearchView$rblGender': "",
     'ctl00$MainContent$physicianSearchView$txtPracticeInterests': "",
     'ctl00$MainContent$physicianSearchView$ddApprovals': "",
     'ctl00$MainContent$physicianSearchView$ddLanguage': "",
     "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$btnSearch",
     "__EVENTARGUMENT": "",
     'ctl00$MainContent$physicianSearchView$hfPrefetchUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=",
     'ctl00$MainContent$physicianSearchView$hfRemoveUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=%QUERY",
     '__ASYNCPOST': 'true'}

# Headers sent with the session GET and with both POSTs.
h ={ "X-MicrosoftAjax":"Delta = true",
     "X-Requested-With":"XMLHttpRequest",
     "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36" }

# Accumulates the result links found so far.
urls = []

with requests.session() as s:
    # Load the search page and parse it twice: with BeautifulSoup for the
    # hidden fields and with lxml for the XPath query further down.
    r = s.get("http://search.cpsa.ca/PhysicianSearch",headers=h)
    soup = BeautifulSoup(r.content, "lxml")
    tree = html.fromstring(r.content)
    html.open_in_browser(tree)

    # Copy the page's hidden token fields into the form before posting.
    ev = soup.select("#__EVENTVALIDATION" )[0]["value"]
    vs = soup.select("#__VIEWSTATE")[0]["value"]
    vsg = soup.select("#__VIEWSTATEGENERATOR")[0]["value"]
    d["__EVENTVALIDATION"] = ev
    d["__VIEWSTATEGENERATOR"] = vsg
    d["__VIEWSTATE"] = vs
    r = s.post('http://search.cpsa.ca/PhysicianSearch', data=d,headers=h)

    print('opening in browser')
    # NOTE(review): `tree` was built from the GET response above, not from
    # the POST response `r` that was just received.
    retrievedUrls = tree.xpath('//*[@id="MainContent_physicianSearchView_gvResults"]/tr/td[2]/a/@href')
    print(retrievedUrls)

    for url in retrievedUrls:
        urls.append(url)

    endSearch = False
    while endSearch == False:
        # Re-parse the latest POST response to pick up tokens for the next
        # request.
        tree = html.fromstring(r.content)
        html.open_in_browser(tree)

        soup = BeautifulSoup(r.content, "lxml")
        print('soup2:')
        ## BREAKS HERE
        # The select() below returns an empty list for this response, which
        # is the IndexError shown in the traceback at the end.
        ev = soup.select("#__EVENTVALIDATION" )[0]["value"]
        ## BREAKS HERE,
        vs = soup.select("#__VIEWSTATE")[0]["value"]
        vsg = soup.select("#__VIEWSTATEGENERATOR")[0]["value"]

        # Re-target the form at the pager's "Next" button.
        d["ctl00$ctl13"] = "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl01$btnNextPage"
        d["__EVENTVALIDATION"] = ev
        d["__EVENTTARGET"] = ""
        d["__VIEWSTATEGENERATOR"] = vsg
        d["__VIEWSTATE"] = vs
        # The same key is assigned twice; the second assignment is redundant.
        d["ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager"] = 1
        d["ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager"] = 1
        d["ctl00$MainContent$physicianSearchView$gvResults$ctl01$btnNextPage"] = "Next"
        # NOTE(review): this POST goes through module-level requests.post,
        # not the session `s` used for the first POST.
        r = requests.post('http://search.cpsa.ca/PhysicianSearch', data=d,headers=h)
        tree = html.fromstring(r.content)
        tree = html.fromstring(r.content)
        retrievedUrls = tree.xpath('//*[@id="MainContent_physicianSearchView_gvResults"]/tr/td[2]/a/@href')
        print(urls)
        print(retrievedUrls)

        # Stops the loop after a single pass.
        endSearch = True
...
Traceback (most recent call last):
  File "C:\Users\daniel.bak\workspace\Alberta Physician Scraper\main\main.py", line 63, in <module>
    ev = soup.select("#__EVENTVALIDATION" )[0]["value"]
IndexError: list index out of range

1条回答

网友

1楼 · 发布于 2024-09-28 22:28:36

好吧，这几乎让我发疯了，但它终于奏效了：您必须在每次 POST 之前先发出一次 GET 请求，以获取新的 __EVENTVALIDATION 令牌：

import requests

from bs4 import BeautifulSoup

# Headers sent with the requests: the first two mark the request as an AJAX
# call and the User-Agent imitates a desktop Chrome browser.
h = {"X-MicrosoftAjax": "Delta = true",
     "X-Requested-With": "XMLHttpRequest",
     "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
     }

# Form fields for the initial search POST (targets the search button).
d = {
    "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$btnSearch",
    "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$btnSearch",
    'ctl00$MainContent$physicianSearchView$hfPrefetchUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=",
    'ctl00$MainContent$physicianSearchView$hfRemoveUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=%QUERY",
    '__ASYNCPOST': 'true'}

# Form fields for the follow-up POST (targets the results pager drop-down).
nxt_d = {
    "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager",
    "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
    "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1",
    "__ASYNCPOST": "true",
    "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}

url = "http://search.cpsa.ca/PhysicianSearch"


def _hidden_field(page, field_id):
    """Return the ``value`` attribute of the element with id *field_id*.

    Args:
        page: parsed BeautifulSoup document to search.
        field_id: element id, e.g. ``"__VIEWSTATE"``.

    Raises:
        LookupError: if *page* contains no element with that id. This
            replaces the bare "list index out of range" error that
            indexing the empty result list would otherwise produce.
    """
    matches = page.select("#" + field_id)
    if not matches:
        raise LookupError(
            "hidden field {!r} not found in the response".format(field_id))
    return matches[0]["value"]


def _add_state_tokens(page, form):
    """Copy the page's __EVENTVALIDATION and __VIEWSTATE values into *form*."""
    for field_id in ("__EVENTVALIDATION", "__VIEWSTATE"):
        form[field_id] = _hidden_field(page, field_id)


with requests.session() as s:
    # Load the search page to obtain the tokens for the first POST.
    r = s.get(url, headers=h)
    soup = BeautifulSoup(r.content, "lxml")
    _add_state_tokens(soup, d)
    r = s.post(url, data=d, headers=h)

    # Fetch the page again before the next POST so that it carries fresh
    # tokens; as before, this GET is sent without the AJAX headers.
    soup = BeautifulSoup(s.get(url).content, "lxml")
    _add_state_tokens(soup, nxt_d)
    # `r` ends up holding the response to the pager POST.
    r = s.post(url, data=nxt_d, headers=h)

如果你查看最后一次 POST 响应的源码，会看到你已经到了第 2 页。我们还需要添加更多逻辑来遍历所有页面，我稍后会补充。

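参数：

"ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
"ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1",

分别是要跳转到的页码和当前所在的页码，因此在每次 GET 之后只需要更改这两项即可。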

下面的代码会获取所有页面，并以编程方式提取大部分值；借助正则表达式或许还能提取更多，但它已经在不硬编码的情况下取到了大部分值：

^{3}$

相关问题更多 >

编程相关推荐

热门问题

热门文章