Python web scraper: same link with different texts, counting

Published 2024-10-02 18:15:29


So I made a web scraper with Python and a few of its libraries. It goes to a given site and grabs all the links and their texts from that site. I've filtered the results so that only the external links on that site are printed.
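The external-link check can be sketched with just the standard library (a Python 3 sketch, unlike the Python 2 code below; this simplified version compares hosts only, whereas the code below uses PublicSuffixList to work with registered domains):

```python
from urllib.parse import urljoin, urlsplit

base = "http://www.ananda-pur.de/23.html"
base_host = urlsplit(base).netloc

def is_external(href):
    # Resolve relative hrefs against the base page, then compare hosts.
    return urlsplit(urljoin(base, href)).netloc != base_host

print(is_external("http://www.kriteachings.org/"))  # True: different host
print(is_external("/impressum.html"))               # False: same host as base
```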

Here is the code:

import urllib
import re
import mechanize
from bs4 import BeautifulSoup
import urlparse
import cookielib
from urlparse import urlsplit
from publicsuffix import PublicSuffixList

link = "http://www.ananda-pur.de/23.html"

newesturlDict = {}
baseAdrInsArray = []



br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
br.set_handle_robots(False)
br.set_handle_equiv(False)
br.set_handle_redirect(True)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
page = br.open(link, timeout=10)


for linkins in br.links():

    newesturl = urlparse.urljoin(linkins.base_url, linkins.url)

    linkTxt = linkins.text
    baseAdrIns = linkins.base_url

    if baseAdrIns not in baseAdrInsArray:
        baseAdrInsArray.append(baseAdrIns)

    netLocation = urlsplit(baseAdrIns)
    psl = PublicSuffixList()
    publicAddress = psl.get_public_suffix(netLocation.netloc)

    if publicAddress not in newesturl:

        if newesturl not in newesturlDict:
            newesturlDict[newesturl,linkTxt] = 1
        if newesturl in newesturlDict:
            newesturlDict[newesturl,linkTxt] += 1

newesturlCount = sorted(newesturlDict.items(),key=lambda(k,v):(v,k),reverse=True)
for newesturlC in newesturlCount:
    print baseAdrInsArray[0]," - ",newesturlC[0],"- count: ", newesturlC[1]

The result looks like this:

(output omitted)

My problem is the identical links that have different texts. According to the printed example, the given site has 4 links to http://www.kriteachings.org/, but as you can see, each of those 4 links has a different text: the first is http://www.sat-nam-rasayan.de, the second is http://www.kriteachings.org, the third is http://www.gurudevsnr.com, and the fourth is http://www.3ho.de.

I'd like the printout to show how many times each link appears on the given page, but when the same link has different texts, those texts should simply be appended to one another under that one link. To make the example concrete, I'd like to print something like this:

http://www.ananda-pur.de/23.html  -  http://www.yogibhajan.com/ - http://www.yogibhajan.com - count:  1
http://www.ananda-pur.de/23.html  -  http://www.kundalini-yoga-zentrum-berlin.de - http://www.kundalini-yoga-zentrum-berlin.de - count:  1
http://www.ananda-pur.de/23.html  -  http://www.kriteachings.org/ - http://www.sat-nam-rasayan.de, http://www.kriteachings.org, http://www.gurudevsnr.com, http://www.3ho.de  - count:  4

Explanation:

(the first link is the given page, the second is the found link, the third item is the actual text of that found link, and the 4th item is how many times that link appears on the given site)
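The desired grouping can be sketched with a plain dict keyed by URL. This Python 3 sketch uses the (url, text) pairs from the example above as hard-coded sample data:

```python
# Sample (url, text) pairs, taken from the desired output in the question.
found_links = [
    ("http://www.yogibhajan.com/", "http://www.yogibhajan.com"),
    ("http://www.kundalini-yoga-zentrum-berlin.de", "http://www.kundalini-yoga-zentrum-berlin.de"),
    ("http://www.kriteachings.org/", "http://www.sat-nam-rasayan.de"),
    ("http://www.kriteachings.org/", "http://www.kriteachings.org"),
    ("http://www.kriteachings.org/", "http://www.gurudevsnr.com"),
    ("http://www.kriteachings.org/", "http://www.3ho.de"),
]

grouped = {}
for url, text in found_links:
    # setdefault creates the entry on first sight, reuses it afterwards,
    # so duplicate URLs accumulate into one entry.
    entry = grouped.setdefault(url, {"count": 0, "texts": []})
    entry["count"] += 1
    entry["texts"].append(text)

base = "http://www.ananda-pur.de/23.html"
for url, data in grouped.items():
    print("%s - %s - %s - count: %d"
          % (base, url, ", ".join(data["texts"]), data["count"]))
```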

My main problem is that I don't know how to compare, sort, or tell the program that this is the same link and that it should append the different texts.

Is something like this possible without too much code? I'm a Python newbie, so I'm a bit lost.

Any help or advice is welcome.


1 Answer

Collect the links into a dictionary, gathering the link texts and keeping the counts:

import cookielib

import mechanize


base_url = "http://www.ananda-pur.de/23.html"

br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
br.set_handle_robots(False)
br.set_handle_equiv(False)
br.set_handle_redirect(True)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent',
                  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
page = br.open(base_url, timeout=10)

links = {}
for link in br.links():
    if link.url not in links:
        # first time we see this URL: start the count and the text list
        links[link.url] = {'count': 1, 'texts': [link.text]}
    else:
        # same URL again: bump the count and collect the extra text
        links[link.url]['count'] += 1
        links[link.url]['texts'].append(link.text)

# printing
for link, data in links.iteritems():
    print "%s - %s - %s - %d" % (base_url, link, ",".join(data['texts']), data['count'])

Prints:

(output omitted)
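Note that mechanize and cookielib are Python 2 era libraries. The same counting idea can be sketched in Python 3 with only the standard library; the HTML snippet and host names below are made up for illustration:

```python
from html.parser import HTMLParser

class LinkCollector(HTMLParser):
    """Collects (href, text) pairs from anchor tags."""
    def __init__(self):
        super().__init__()
        self._href = None   # href of the <a> tag we are currently inside
        self._text = []     # text chunks seen inside that tag
        self.links = []
    def handle_starttag(self, tag, attrs):
        if tag == "a":
            self._href = dict(attrs).get("href")
            self._text = []
    def handle_data(self, data):
        if self._href is not None:
            self._text.append(data)
    def handle_endtag(self, tag):
        if tag == "a" and self._href is not None:
            self.links.append((self._href, "".join(self._text).strip()))
            self._href = None

page = """<a href="http://a.example/">first</a>
<a href="http://a.example/">second</a>
<a href="http://b.example/">only</a>"""

parser = LinkCollector()
parser.feed(page)

# Same grouping as in the answer: one entry per URL, texts appended.
counts = {}
for href, text in parser.links:
    entry = counts.setdefault(href, {"count": 0, "texts": []})
    entry["count"] += 1
    entry["texts"].append(text)

for href, data in counts.items():
    print(href, "-", ", ".join(data["texts"]), "- count:", data["count"])
```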
