我正在做一项任务,需要使用 Python 的 requests 库从网页获取详细信息。我已经成功获取了页面上的详细信息,但页面上还有一个“ShowDetails”按钮,它通过 Ajax 调用加载更多详细信息,而我也需要获取这些额外的信息。有人能帮我吗?
这是网站的链接:http://ipindiaonline.gov.in/tmrpublicsearch/frmmain.aspx
以及网站的屏幕截图:
这是我写的代码
from bs4 import BeautifulSoup
import requests,json
def returnJson(wordmark, page_class, timeout=30):
    """Submit a word-mark search to the trademark public-search form.

    The form page is fetched first so that the hidden ``__EVENTVALIDATION``
    and ``__VIEWSTATE`` inputs can be read and echoed back in the search
    POST. Both requests go through one ``requests.Session`` so that any
    cookies set by the GET are sent with the POST (presumably the server
    ties the hidden form state to a session cookie -- verify against the
    live site).

    Args:
        wordmark: Text placed in the form's word-mark field.
        page_class: Value placed in the form's class field.
        timeout: Seconds to wait on each HTTP request before
            ``requests`` raises a timeout error. Without a timeout a
            stalled server would block the caller indefinitely.

    Returns:
        The ``requests.Response`` of the search POST. Despite the
        function's name this is the raw response (HTML body), not JSON.

    Raises:
        TypeError: If either hidden input is absent from the form page
            (``soup.find`` returns ``None``, which cannot be subscripted).
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    url = "http://ipindiaonline.gov.in/tmrpublicsearch/frmmain.aspx"
    with requests.Session() as session:
        r_init = session.get(url, timeout=timeout)
        soup = BeautifulSoup(r_init.text, 'html.parser')
        event_validation = soup.find(
            "input", attrs={"name": "__EVENTVALIDATION"})['value']
        view_state = soup.find(
            "input", attrs={"name": "__VIEWSTATE"})['value']
        search_type = 'WM'
        postdata = {
            'ctl00$ContentPlaceHolder1$DDLFilter': '0',
            'ctl00$ContentPlaceHolder1$DDLSearchType': search_type,
            'ctl00$ContentPlaceHolder1$TBWordmark': wordmark,
            'ctl00$ContentPlaceHolder1$TBClass': page_class,
            '__EVENTVALIDATION': event_validation,
            "__EVENTTARGET": "ctl00$ContentPlaceHolder1$BtnSearch",
            "__VIEWSTATE": view_state,
        }
        # The body is fully read before the session closes (no streaming),
        # so the returned response stays usable after the ``with`` block.
        return session.post(url, data=postdata, timeout=timeout)
def scraping(r):
    """Extract the search-result rows from a results-page response.

    The number of result rows is the count of ``<tr>`` elements with class
    ``row`` plus those with class ``alt``. For each row index ``i`` the
    five ``<span>`` elements whose ids end in ``_<i>`` are looked up and
    their text collected.

    Args:
        r: Object exposing the page HTML as ``r.text``.

    Returns:
        A list with one dict per row, keyed by ``'Wordmark'``,
        ``'Proprietor'``, ``'Application Number'``, ``'Class '`` (the
        trailing space is part of the key) and ``'Status'``.

    Raises:
        AttributeError: If an expected ``<span>`` is missing for some row
            (``find`` returns ``None``, which has no ``.text``).
    """
    page = BeautifulSoup(r.text, 'html.parser')

    # Result rows alternate between the two CSS classes; count both kinds.
    row_total = sum(
        len(page.findAll('tr', attrs={'class': css_class}))
        for css_class in ('row', 'alt')
    )

    # (output key, span-id prefix) pairs, in the order the keys are emitted.
    columns = (
        ('Wordmark',
         'ContentPlaceHolder1_MGVSearchResult_lblsimiliarmark_'),
        ('Proprietor',
         'ContentPlaceHolder1_MGVSearchResult_LblVProprietorName_'),
        ('Application Number',
         'ContentPlaceHolder1_MGVSearchResult_lblapplicationnumber_'),
        ('Class ',
         'ContentPlaceHolder1_MGVSearchResult_lblsearchclass_'),
        ('Status',
         'ContentPlaceHolder1_MGVSearchResult_Label6_'),
    )

    return [
        {
            key: page.find('span', attrs={'id': prefix + str(index)}).text
            for key, prefix in columns
        }
        for index in range(row_total)
    ]
def showDetails(wordmark, page_class):
    """Run a word-mark search and return the result rows as a JSON string.

    Args:
        wordmark: Search text; must be longer than two characters.
        page_class: Class value as a string; must be non-empty and
            alphanumeric (``str.isalnum``).

    Returns:
        A JSON-encoded list of row dicts when the arguments pass
        validation. When they do not, a message is printed and ``None``
        is returned.
    """
    # Guard clause: reject bad input before any network request is made.
    if len(wordmark) <= 2 or not page_class.isalnum():
        print("Please Enter Valid Parameters\n")
        return None
    return json.dumps(scraping(returnJson(wordmark, page_class)))
# Example invocation: searches for the word mark 'AIWA' in class '2'.
# NOTE(review): the returned JSON string is discarded here; wrap the call
# in print() (or assign it) to actually see the result.
showDetails('AIWA','2')
您需要使用第一个 POST 请求返回的信息来构造另一个 POST 请求。下面展示了如何从返回的数据中提取“Goods & Services Description”:
这将显示:
注意:返回的数据还包含由 `|` 字符分隔的其他字段。详细信息的 HTML 中碰巧也含有此字符,因此需要提取字段 7 和字段 8,才能只得到 HTML。
我不会尝试使用 BeautifulSoup。不过,我认为你可以用 Selenium 来做(见 https://selenium-python.readthedocs.io/)。
借助 Selenium 的功能,您可以单击所选的“Show Details”按钮,等待请求的信息出现在右侧面板中,然后大致像使用 BeautifulSoup 那样从该面板中提取所需的信息。
相关问题 更多 >
编程相关推荐