在Python中进行抓取时出错，需要绕过

# Scrape batting game logs from baseball-reference.com into ./battingall.csv.
#
# Steps, in order:
#   1. Build one player-index URL per letter of the alphabet.
#   2. Download every index page and collect all hyperlinks found on it.
#   3. Turn the links that look like player pages into game-log URLs,
#      one per season listed in `years`.
#   4. Download each game-log page, read the batting table and append its
#      rows to the CSV file.
#
# This is Python 2 code (`urlparse`, `urllib2`, binary-mode csv output).
import requests
from bs4 import BeautifulSoup
import csv
from urlparse import urljoin
import urllib2

# Python 2's csv module expects the output file to be opened in binary mode.
outfile = open("./battingall.csv", "wb")
writer = csv.writer(outfile)

base_url = 'http://www.baseball-reference.com'
player_url = 'http://www.baseball-reference.com/players/'
alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
players = 'shtml'
gamel = '&t=b&year='
game_logs = 'http://www.baseball-reference.com/players/gl.cgi?id='
years = ['2015','2014','2013','2012','2011','2010','2009','2008']

# One index URL per letter, e.g. <player_url>a, <player_url>b, ...
drounders = []
for dround in alphabet:
    drounders.append(player_url + dround)

# Every href found on every index page, prefixed with the site root.
# NOTE: requests.get() is not guarded here, so a connection failure
# propagates and aborts the whole script.
urlz = []
for ab in drounders:
    data = requests.get(ab)
    soup = BeautifulSoup(data.content)
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            urlz.append(base_url + link['href'])

# Keep links that contain 'shtml', are shorter than 60 characters and contain
# one of the index URLs, then emit one game-log URL per season for each.
yent = []
for ant in urlz:
    for d in drounders:
        for y in years:
            if players in ant:
                if len(ant) < 60:
                    if d in ant:
                        # ant[44:-6] drops the first 44 characters and the
                        # last 6; presumably this leaves the player id
                        # between '/players/<letter>/' and '.shtml' -- confirm
                        # against the live link format.
                        yent.append(game_logs + ant[44:-6] + gamel + y)

for j in yent:
    try:
        data = requests.get(j)
        soup = BeautifulSoup(data.content)
        table = soup.find('table', attrs={'id': 'batting_gamelogs'})
        # `game_logs` is 52 characters long, so j[52:59] is the first seven
        # characters that follow 'id=' in the URL.
        tablea = j[52:59]
        tableb= soup.find("b", text='Throws:').next_sibling.strip()
        tablec= soup.find("b", text='Height:').next_sibling.strip()
        tabled= soup.find("b", text='Weight:').next_sibling.strip()
        list_of_rows = []
        for row in table.findAll('tr'):
            list_of_cells = []
            list_of_cells.append(tablea)
            # The URL ends with the four-character season appended above.
            list_of_cells.append(j[len(j)-4:])
            list_of_cells.append(tableb)
            list_of_cells.append(tablec)
            list_of_cells.append(tabled)
            for cell in row.findAll('td'):
                # NOTE(review): the character being stripped may have been a
                # non-breaking space before the page was extracted -- confirm.
                text = cell.text.replace(' ', '').encode("utf-8")
                list_of_cells.append(text)
            list_of_rows.append(list_of_cells)
        print(list_of_rows)
        writer.writerows(list_of_rows)
    except (AttributeError,NameError):
        # soup.find() returns None when the table or a label is missing, and
        # the attribute access on None raises AttributeError; such pages are
        # skipped without any message.
        pass

# Flush and release the CSV file once every page has been processed.
outfile.close()

1条回答

网友

1楼 · 发布于 2024-10-02 20:42:25

您可以将requests.get()代码块包装在try/except中。您需要捕获所产生的requests.exceptions.ConnectionError异常。

# Visit each per-letter index page and collect the absolute URL of every
# hyperlink on it; pages that cannot be reached are skipped.
for ab in drounders:
    try:
        response = requests.get(ab)
        page = BeautifulSoup(response.content)
        urlz.extend(
            base_url + anchor['href']
            for anchor in page.find_all('a')
            if anchor.has_attr('href')
        )
    except requests.exceptions.ConnectionError:
        # Connection failed for this index page: ignore it and move on.
        pass

发生这种情况是因为连接本身有问题，而不是因为表中没有数据。程序根本还没有执行到解析表格的那一步。

注意：这里只是简单地使用pass（正如您在代码后面部分所做的那样），这会把异常完全吞掉。像下面这样做可能更好：

    except requests.exceptions.ConnectionError:
        print("Failed to open {}".format(ab))

这将在控制台上为您提供一条消息，说明是哪个URL打开失败。

相关问题更多 >

编程相关推荐

热门问题

热门文章