无法使用删除重复的结果

import requests from lxml.html import fromstring from urllib.parse import urljoin link = "http://tennishub.co.uk/" processed_links = set() processed_nextpage_links = set() def get_links(url): response = requests.get(url) tree = fromstring(response.text) unprocessed_links = [urljoin(link,item.xpath('.//a/@href')[0]) for item in tree.xpath('//*[@class="countylist"]')] for nlink in unprocessed_links: if nlink not in processed_links: processed_links.add(nlink) get_nextpage_links(processed_links) def get_nextpage_links(itemlinks): for ilink in itemlinks: response = requests.get(ilink) tree = fromstring(response.text) titles = [title.xpath('.//a/@href')[0] for title in tree.xpath('//div[@class="pagination"]') if title.xpath('.//a/@href')] for ititle in titles: if ititle not in processed_nextpage_links: processed_nextpage_links.add(ititle) for rlink in processed_nextpage_links: print(rlink) if __name__ == '__main__': get_links(link)

/tennis-clubs-by-county/Durham/2 /tennis-clubs-by-county/Durham/2 /tennis-clubs-by-county/Durham/2 /tennis-clubs-by-county/Cheshire/2 /tennis-clubs-by-county/Derbyshire/2 /tennis-clubs-by-county/Durham/2 /tennis-clubs-by-county/Cheshire/2 /tennis-clubs-by-county/Derbyshire/2 /tennis-clubs-by-county/Durham/2

3条回答

网友

1楼 · 编辑于 2024-06-28 23:50:03

请尝试以下脚本。结果发现，你的xapth在解析一个特定的块时有一些缺陷，正如@tripleee在他的评论中已经提到的那样。我在follong脚本中使用set()的方式略有不同。现在，它应该产生独特的联系。你知道吗

import requests
from lxml.html import fromstring
from urllib.parse import urljoin

link = "http://tennishub.co.uk/"

def get_links(url):
    response = requests.get(url)
    tree = fromstring(response.text)
    crude_links = set([urljoin(link,item) for item in tree.xpath('//*[@class="countylist"]//a/@href') if item])
    return crude_links

def get_nextpage(link):
    response = requests.get(link)
    tree = fromstring(response.text)
    titles = set([title for title in tree.xpath('//div[@class="pagination"]//a/@href') if title])
    return titles

if __name__ == '__main__':
    for next_page in get_links(link):
        for unique_link in get_nextpage(next_page):
            print(unique_link)

网友

2楼 · 编辑于 2024-06-28 23:50:03

每次调用get_nextpage_links时，都会打印到目前为止收集的所有链接。你知道吗

我想您应该完全删除print，完成后只打印列表，最好是在任何def之外（使函数可重用，并将任何外部副作用推迟到调用代码）。你知道吗

没有全局变量的更好的解决方案可能是让get_links收集一个集合并返回它，无论何时调用它，都将对集合的引用传递给get_nextpage_links，并且（显然）让它添加任何新链接。你知道吗

因为您使用的是集合，所以在添加链接之前，不需要特别检查链接是否已经在集合中。无法将副本添加到此数据类型。你知道吗

网友

3楼 · 编辑于 2024-06-28 23:50:03

每次你打电话的时候

        for rlink in processed_nextpage_links:
            print(rlink)

打印它是因为for循环在for循环中，在集合中添加链接

相关问题更多 >

编程相关推荐

热门问题

热门文章