如何让我们的爬虫更快地解决维基百科游戏?

2024-10-01 02:25:32 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在做一个维基百科爬虫,但它的速度很慢。我怎样才能让它更快?

我使用requests模块和beautifulsoup4来解析html页面。我试过实现多线程,但仍然很慢

import sys
from queue import Queue

import requests
from bs4 import BeautifulSoup as bs

# Scheme and host that are prepended to every site-relative article path.
baseURL = "https://en.wikipedia.org"

# The two ends of the game, written as site-relative paths: the crawl starts
# at startURL and stops as soon as a link to endURL is seen.
startURL = "/wiki/French_battleship_Courbet_(1911)"
endURL = "/wiki/Royal_Navy"

# FIFO frontier of paths that still have to be crawled.
tovisit = Queue()
# Paths that have already been crawled.
visited = list()

def main():
    """Validate the configured start/end paths, then start the crawl.

    Both ``startURL`` and ``endURL`` must be accepted by ``checkValid``.
    If either is rejected, a message is printed and the process exits with
    status 1 so that callers (shells, CI) can detect the failure.
    """
    if not (checkValid(startURL) and checkValid(endURL)):
        print("Invalid URLs entered.")
        # sys.exit instead of quit(): quit() is injected by the ``site``
        # module for interactive sessions and is absent under ``python -S``.
        sys.exit(1)

    initCrawler(startURL)

def initCrawler(startURL):
    """Crawl breadth-first from *startURL* until the frontier is empty.

    Paths are taken from the module-level ``tovisit`` queue in FIFO order;
    every link returned by ``linkCrawl`` for a page is appended to that
    queue, and each crawled path is recorded in the module-level
    ``visited`` list.

    Args:
        startURL: Site-relative path of the first page to crawl.

    Returns:
        None. The loop ends when no queued paths remain.
    """
    tovisit.put(startURL)

    while not tovisit.empty():
        url = tovisit.get()

        # Record the page *before* crawling it. At this point it has already
        # left the queue, so if it were not in ``visited`` yet, a link from
        # the page to itself would pass linkCrawl's duplicate checks and the
        # page would be queued and fetched a second time.
        visited.append(url)

        for link in linkCrawl(url):
            tovisit.put(link)

def linkCrawl(url):
    """Fetch one page and return the new article links found on it.

    The page at ``baseURL + url`` is downloaded and parsed, and every
    ``<a href=...>`` is examined. A link is returned only if it is not
    already in ``visited`` or ``tovisit``, has not been seen earlier on this
    same page, and is accepted by ``checkValid``. If a link equal to
    ``endURL`` passes those checks, ``"yay"`` is printed and the process
    exits.

    Args:
        url: Site-relative path of the page to fetch.

    Returns:
        A list of site-relative paths, without duplicates, in document
        order. An empty list is returned if the page could not be fetched.
    """
    print("crawling "+ url + "\n")

    try:
        # Without a timeout a single stalled connection blocks the crawl
        # forever.
        r = requests.get(baseURL + url, timeout=10)
    except requests.RequestException as err:
        # One unreachable page should not abort the whole search.
        print("failed to fetch " + url + ": " + str(err))
        return []
    soup = bs(r.content, "html.parser")

    # Build the set of already-known paths once per page. Copying the queue
    # and scanning the visited list for every single link made each page
    # cost (links on page) x (size of frontier) comparisons.
    known = set(visited)
    known.update(tovisit.queue)

    refinedlinks = []
    for anchor in soup.find_all('a', href=True):
        href = anchor["href"]
        # ensure what we have is a string
        if not isinstance(href, str):
            continue
        # "/wiki/Foo#Section" is the same page as "/wiki/Foo"; drop the
        # fragment so it is neither crawled twice nor missed as the target.
        href = href.partition("#")[0]
        if href in known:
            continue
        if not checkValid(href):
            continue
        if href == endURL:
            print("yay")
            sys.exit()
        # Remember the link so a second occurrence on this page is skipped.
        known.add(href)
        refinedlinks.append(href)

    return refinedlinks

def checkValid(url):
    """Return True if *url* is a site-relative path this crawler follows.

    A path is accepted when it starts with ``/wiki/``, does not start with
    one of the excluded ``Special:``, ``Wikipedia:``, ``Portal:`` or
    ``File:`` prefixes, and does not end with ``(disambiguation)``.

    Args:
        url: The path string to test.

    Returns:
        bool: True when the path is accepted, False otherwise.
    """
    excluded_prefixes = (
        "/wiki/Special:",
        "/wiki/Wikipedia:",
        "/wiki/Portal:",
        "/wiki/File:",
    )
    return (
        url.startswith("/wiki/")
        and not url.startswith(excluded_prefixes)
        and not url.endswith("(disambiguation)")
    )

# Start the crawler only when this file is run as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

我希望爬虫能跑得更快,但实际上它的速度很慢。我查到的资料表明,仅靠多线程是不够的。


Tags: false url return if def wiki not global