A BeautifulSoup loop that returns links without specific words

Published 2024-10-03 06:21:20


I'm trying to write a scraper that randomly picks a Wikipedia article link from a page, follows it, grabs another link there, and loops. I want to exclude links whose href contains "Category:", "File:", or "List". I'm fairly sure the links I want are all inside p tags, but when I include "p" in find_all I get an "int object is not subscriptable" error. The code below walks through wiki pages but does not exclude the things I want to filter out.
This is a learning journey for me. Any help is appreciated.

import requests
from bs4 import BeautifulSoup
import random
import time

def scrapeWikiArticle(url):
    response = requests.get(
        url=url,
    )

    soup = BeautifulSoup(response.content, 'html.parser')

    title = soup.find(id="firstHeading")
    print(title.text)
    print(url)

    allLinks = soup.find(id="bodyContent").find_all("a")
    random.shuffle(allLinks)
    linkToScrape = 0

    for link in allLinks:
        # Trying to select hrefs containing /wiki/ and exclude hrefs with
        # "Category:" etc. It selects wiki links but does not exclude anything.
        if link['href'].find("/wiki/") == -1:
            if link['href'].find("Category:") == 1:
                if link['href'].find("File:") == 1:
                    if link['href'].find("List") == 1:
                        continue

        # Use this link to scrape
        linkToScrape = link

        articleTitles = open("savedArticles.txt", "a+")
        articleTitles.write(title.text + ", ")
        articleTitles.close()
        time.sleep(6)
        break

    scrapeWikiArticle("https://en.wikipedia.org" + linkToScrape['href'])

scrapeWikiArticle("https://en.wikipedia.org/wiki/Anarchism")

3 Answers

This part looks problematic:

if link['href'].find("/wiki/") == -1:
    if link['href'].find("Category:") == 1:
        if link['href'].find("File:") == 1:
            if link['href'].find("List") == 1:
                continue

find returns the index of the searched substring, or -1 when it is absent, but the code also compares against the wrong values (== 1 instead of != -1).

So: continue if "/wiki/" is not found, or if "Category:", "File:", etc. do appear in the href:
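As a quick illustration of the str.find semantics the fix relies on (index of the match when found, -1 when absent):

```python
href = "/wiki/Category:Physics"

print(href.find("/wiki/"))     # 0  -> found at the very start
print(href.find("Category:"))  # 6  -> found at index 6 (not 1)
print(href.find("Missing"))    # -1 -> not found
```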

if link['href'].find("/wiki/") == -1 or \
    link['href'].find("Category:") != -1 or \
    link['href'].find("File:") != -1 or \
    link['href'].find("List")!= -1 :
    print("skipped " + link["href"])
    continue
Saint Petersburg
https://en.wikipedia.org/wiki/St._Petersburg
National Diet Library
https://en.wikipedia.org/wiki/NDL_(identifier)
Template talk:Authority control files
https://en.wikipedia.org/wiki/Template_talk:Authority_control_files
skipped #searchInput
skipped /w/index.php?title=Template_talk:Authority_control_files&action=edit&section=1
User: Tom.Reding
https://en.wikipedia.org/wiki/User:Tom.Reding
skipped http://toolserver.org/~dispenser/view/Main_Page
Iapetus (moon)
https://en.wikipedia.org/wiki/Iapetus_(moon)
87 Sylvia
https://en.wikipedia.org/wiki/87_Sylvia
skipped /wiki/List_of_adjectivals_and_demonyms_of_astronomical_bodies
Asteroid belt
https://en.wikipedia.org/wiki/Main_asteroid_belt
Detached object
https://en.wikipedia.org/wiki/Detached_object

Use :not() alongside the href *= (contains) attribute operator to handle the exclusion list. This filters out hrefs that contain (*=) the specified substrings. I added the i flag to make the match case-insensitive for the first two; it can be removed:

import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://en.wikipedia.org/wiki/2018_FIFA_World_Cup#Prize_money')
soup = bs(r.content, 'lxml') # 'html.parser'
links = [i['href'] for i in soup.select('#bodyContent a[href*="/wiki/"]:not([href*="Category:" i], [href*="File:" i], [href*="List"])')]
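A minimal, self-contained sketch of the same selector logic on inline HTML (the markup here is my own example, and it uses html.parser so no lxml install is needed):

```python
from bs4 import BeautifulSoup

html = """
<div id="bodyContent">
  <a href="/wiki/Python_(programming_language)">keep</a>
  <a href="/wiki/Category:Snakes">category</a>
  <a href="/wiki/File:Logo.png">file</a>
  <a href="/wiki/List_of_snakes">list</a>
  <a href="#cite_note-1">footnote</a>
</div>
"""
soup = BeautifulSoup(html, "html.parser")
# Keep /wiki/ links, drop any href containing the excluded substrings
links = [a["href"] for a in soup.select(
    '#bodyContent a[href*="/wiki/"]'
    ':not([href*="Category:" i], [href*="File:" i], [href*="List"])'
)]
print(links)  # only the plain article link survives
```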

You need to modify the for loop; use .attrs to access any tag's attributes. To exclude links whose href value contains specific keywords, use the != -1 comparison.

Modified code:

import requests
from bs4 import BeautifulSoup
import random
import time

def scrapeWikiArticle(url):
    response = requests.get(url=url)
    soup = BeautifulSoup(response.content, 'html.parser')

    title = soup.find(id="firstHeading")

    allLinks = soup.find(id="bodyContent").find_all("a")
    random.shuffle(allLinks)
    linkToScrape = 0

    for link in allLinks:
        # Some anchors have no href at all; skip them to avoid a KeyError
        if "href" in link.attrs:
            if link.attrs['href'].find("/wiki/") == -1 \
                    or link.attrs['href'].find("Category:") != -1 \
                    or link.attrs['href'].find("File:") != -1 \
                    or link.attrs['href'].find("List") != -1:
                continue

            linkToScrape = link

            articleTitles = open("savedArticles.txt", "a+")
            articleTitles.write(title.text + ", ")
            articleTitles.close()
            time.sleep(6)
            break

    if linkToScrape:
        scrapeWikiArticle("https://en.wikipedia.org" + linkToScrape.attrs['href'])
scrapeWikiArticle("https://en.wikipedia.org/wiki/Anarchism")
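As a design note, the chained find comparisons can be avoided entirely by filtering the candidates up front and then choosing one at random. This is my own variant, not one of the answers above; the function name and test markup are illustrative:

```python
import random

from bs4 import BeautifulSoup

BANNED = ("Category:", "File:", "List")

def pick_wiki_link(html):
    """Return the href of one random qualifying article link, or None."""
    soup = BeautifulSoup(html, "html.parser")
    body = soup.find(id="bodyContent")
    candidates = [
        a["href"]
        for a in body.find_all("a", href=True)  # href=True skips bare anchors
        if "/wiki/" in a["href"]
        and not any(word in a["href"] for word in BANNED)
    ]
    return random.choice(candidates) if candidates else None
```

Using find_all("a", href=True) skips anchors without an href, which also removes the KeyError risk the second answer guards against with "href" in link.attrs.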
