使用Python抓取gmc-uk.org网站

2024-06-28 19:42:28 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试抓取这个网站:https://www.gmc-uk.org/doctors/register/LRMP.asp

以下是我编写的代码,它不起作用:

import requests, csv, re, sys
from lxml import html

def parser1(keyword,source):
    """Extract result rows from a results page and save them to a CSV file.

    Every ``<tr>`` that starts with ``<td class="listapplettablerows" >`` is
    located with a regular expression, re-parsed with lxml, and the text
    nodes found inside its ``listapplettablerows`` cells are written as one
    row of ``<keyword>.csv`` in the current working directory. The file is
    created (or truncated) even when no row matches.

    Args:
        keyword: Search term; ``str(keyword) + '.csv'`` is the output path.
        source: HTML text of the results page.

    Every written row has seven columns:
        * 7 or more text nodes -> the first seven are written;
        * exactly 6 text nodes -> column 6 is left empty and the sixth
          text node is written as column 7;
        * 1 to 5 text nodes -> the row is padded with empty columns;
        * no text nodes -> the row is skipped.
    """
    row_pattern = '<tr><td class="listapplettablerows" >(.+?)</tr>'
    # The csv module needs a text-mode file opened with newline='' on
    # Python 3; a binary-mode file makes writerow() raise TypeError.
    with open(str(keyword)+'.csv','w',newline='',encoding='utf-8') as export:
        writer = csv.writer(export)
        for each in re.findall(row_pattern,source,re.DOTALL):
            # The regex consumed the opening <td>; put it back so the first
            # cell is still matched by the XPath below.
            new_each = '<td class="listapplettablerows" >'+each
            # Use a separate name so the ``source`` parameter is not rebound.
            row = html.fromstring(new_each)
            lines = row.xpath('//td[@class="listapplettablerows"]//text()')
            if not lines:
                continue
            if len(lines) >= 7:
                writer.writerow(lines[:7])
            elif len(lines) == 6:
                writer.writerow(lines[:5]+[None,lines[5]])
            else:
                # Short row: pad instead of failing with IndexError.
                writer.writerow(lines+[None]*(7-len(lines)))

def make_requests(url,keyword,SWETS):
    """Submit a doctor search to the GMC register and save any results.

    Steps performed, all on one ``requests.Session``:
        1. GET the public LRMP landing page (presumably to pick up session
           cookies -- confirm against the site).
        2. GET ``url`` and read the ``navigator.id`` value from the response
           text; it is sent back as the ``SWEBID`` form field.
        3. POST the search form to ``start.swe`` on webcache.gmc-uk.org.
        4. Unless the response contains the site's "cannot find a record"
           message, pass the response text to ``parser1`` to write the CSV.

    Args:
        url: Address of the cached search frame. It must contain an
            ``_sn=`` query parameter, which is copied into the form.
        keyword: Search term, sent as form field ``s_3_1_9_0`` and used by
            ``parser1`` to name the output file.
        SWETS: Value sent unchanged as the ``SWETS`` form field.

    Raises:
        IndexError: If ``url`` has no ``_sn=`` parameter, or the page at
            ``url`` contains no ``navigator.id = "..."`` assignment.
    """
    s = requests.Session()
    # Browser-like headers for the webcache requests. No explicit Host entry:
    # requests derives the correct Host from each request URL.
    headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
               'Accept-Encoding':'gzip, deflate',
               'Accept-Language':'en-US,en;q=0.9',
               'Referer':'http://webcache.gmc-uk.org/gmclrmp_enu/start.swe?SWENeedContext=false&SWECmd=GetCachedFrame&W=t&SWEACn=7691&_sn=AVN6CAdOO0TLfHYEWmkfiCc5NXsWqEWnu1QinbOLc8NU.5VYcL46LP-V1h1wBqvlQYqNVBRCbMk6wOV9ByGHIw6-NgaeeOCxe-VxSekkxnLHXZZSKGnrBiJaYUTe-S7K.d3nInri.S4wG6fk0CD4JAEKBxpsYv8C0hibwdV3LcAlTqBpiFSlHFjguoh8q8WZOtzdmX07Geg_&SWEC=1&SWEFrame=top._sweclient._sweview&SWEBID=-1&SRN=&SWETS=',
               'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    s.get('https://www.gmc-uk.org/doctors/register/LRMP.asp')
    # A headers dict only takes effect when it is handed to the request.
    r = s.get(url,headers=headers)
    formdata = {'s_3_1_5_0':'',
                's_3_1_3_0':'',
                's_3_1_9_0':keyword,
                's_3_1_6_0':'60',
                's_3_1_4_0':'40',
                's_3_1_7_0':'',
                'SWEFo':'SWEForm3_0',
                'SWEField':'s_3_1_10_0',
                'SWENeedContext':'true',
                'SWENoHttpRedir':'true',
                'W':'t',
                'SWECmd':'InvokeMethod',
                'SWEMethod':'NewQuerySearch',
                'SWERowIds':'',
                'SWESP':'false',
                'SWEVI':'',
                'SWESPNR':'',
                'SWEPOC':'',
                'SWESPNH':'',
                'SWEH':'',
                'SWETargetView':'',
                'SWEDIC':'false',
                # Session token taken from the ``_sn`` query parameter of url.
                '_sn':url.split('_sn=')[1].split('&')[0],
                'SWEReqRowId':'1',
                'SWEView':'GMC WEB Doctor Search',
                'SWEC':'1',
                'SWERowId':'VRId-0',
                'SWETVI':'',
                'SWEW':'',
                # Value assigned to ``navigator.id`` in the fetched page.
                'SWEBID':re.findall('navigator.id = "(.+?)"',r.text,re.DOTALL)[0],
                'SWEM':'',
                'SRN':'',
                'SWESPa':'',
                'SWETS':SWETS,
                'SWEContainer':'',
                'SWEWN':'',
                'SWEKeepContext':'0',
                'SWEApplet':'GMC WEB Health Provider Search Applet',
                'SWETA':''}
    # The form is submitted from the page at ``url``, so use it as Referer.
    headers['Referer'] = url
    r1 = s.post('http://webcache.gmc-uk.org/gmclrmp_enu/start.swe',data=formdata,headers=headers)

    if 'Sorry but we cannot find a record that matches your search' not in r1.text:
        parser1(keyword,r1.text)


# Command-line usage: <script> URL KEYWORD SWETS
# The first three arguments are forwarded, in order, to make_requests.
make_requests(*[sys.argv[position] for position in (1, 2, 3)])

问题出在formdata字典中的SWETS键。检查浏览器的网络请求时,我发现网站接收一个POST请求,其中SWETS是一个13位的GMT Unix时间戳。但是我不知道如何得到正确的13位时间戳:我在服务器返回的JS响应中找不到任何相关内容,而当我发送自己生成的13位GMT Unix时间戳时,服务器提示输入无效。请帮忙看看,并建议可行的解决步骤。


Tags: csv text org re url source sys requests