I scraped links from a page with Beautiful Soup. How do I now iterate over those links?

Posted 2024-09-20 00:11:09


Here is the code I use to retrieve the links from a page:

from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import re

def getExternalLinks(includeURL):
    html = urlopen(includeURL)
    bsObj = soup(html, "html.parser")
    externalLinks = []
    links = bsObj.findAll("a",
                          href=re.compile("^(http://www.homedepot.com/b)"))
    for link in links:
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])

    print(externalLinks)

getExternalLinks("http://www.homedepot.com/")

The scraped links end up in the externalLinks array, which the function prints.


Now I am trying to iterate over those links, go to each page, and pull information from it. When I run the next piece of code, I get errors:

def getInternalLinks(includeLinks):
    internalHTML = urlopen(includeLinks)
    Inner_bsObj = soup(internalHTML, "html.parser")
    internalLinks = []
    inner_links = Inner_bsObj.findAll("a", "href")

    for inner_link in inner_links:
        if inner_link.attrs['href'] is not None:
            if inner_link.attrs['href'] not in internalLinks:
                internalLinks.append(inner_link.attrs['href'])
    print(internalLinks)

getInternalLinks(getExternalLinks("http://www.homedepot.com"))




File "C:/Users/anag/Documents/Python 
Scripts/Webscrapers/BeautifulSoup/HomeDepot/HomeDepotScraper.py", line 20, 
in getInternalLinks
internalHTML = urlopen(includeLinks)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 517, in open
req.timeout = timeout
AttributeError: 'NoneType' object has no attribute 'timeout'

How should I extract information from each of the web pages stored in the externalLinks array?


1 Answer

Posted 2024-09-20 00:11:09

It is a list, not an array. In Python, "array" usually means a NumPy array, which is quite different from a list.
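
For illustration, a minimal sketch of the difference (assuming NumPy is installed; the list contents are placeholders):

import numpy as np

links = ['http://example.com/a', 'http://example.com/b']  # a plain Python list
arr = np.array(links)                                     # a NumPy array

print(type(links))  # <class 'list'>
print(type(arr))    # <class 'numpy.ndarray'>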

The problem with your code is that getExternalLinks() returns None, and that None is then passed as the argument to getInternalLinks(), which expects a URL. The first function needs to return the list (or set) of URLs instead of just printing them; then you loop over the return value and feed each URL to the second function.
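
A minimal sketch of that fix, assuming the getInternalLinks() function from the question is already defined; only the return statement and the calling loop change:

from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import re

def getExternalLinks(includeURL):
    html = urlopen(includeURL)
    bsObj = soup(html, "html.parser")
    externalLinks = []
    links = bsObj.findAll("a",
                          href=re.compile("^(http://www.homedepot.com/b)"))
    for link in links:
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks  # return the list instead of printing it

# Loop over the returned list and hand each URL to the second function.
for url in getExternalLinks("http://www.homedepot.com/"):
    getInternalLinks(url)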

Beyond that minimal fix, both functions contain almost identical code; apart from the names, only the arguments to findAll() differ. I would refactor this into one shared function.

import re
from urllib.request import urlopen
from bs4 import BeautifulSoup


def get_links(url, attrs=None):
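    """Fetch `url` and return the unique href values of matching <a> tags."""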
    if attrs is None:
        attrs = dict()
    links = set()
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    for a_node in soup.find_all('a', attrs):
        link = a_node.get('href')
        if link is not None:
            links.add(link)
    return links


def main():
    external_links = get_links(
        'http://www.homedepot.com/',
        {'href': re.compile('^(http://www.homedepot.com/b)')},
    )
    print(external_links)
    for link in external_links:
        # TODO: I am not sure you really want to filter on <a> elements
        #       with a class of 'href', but that is what your code did, so...
        internal_links = get_links(link, {'class': 'href'})
        print(internal_links)


if __name__ == '__main__':
    main()
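
One practical caveat: when you iterate over many scraped URLs, a single dead link or HTTP error will abort the whole loop. A sketch of how the inner loop could skip failures instead, assuming the get_links() helper above; crawl() is a hypothetical name, and the exception types come from the standard urllib.error module:

from urllib.error import HTTPError, URLError


def crawl(external_links):
    # Visit each scraped URL; log and skip any page that fails to load.
    for link in external_links:
        try:
            internal_links = get_links(link, {'class': 'href'})
        except (HTTPError, URLError) as error:
            print('skipping', link, '-', error)
            continue
        print(internal_links)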
