Unable to loop through multiple pages to scrape data



I need to move on to the next URL link (each page has about 20 rows that I need to extract, and I then need to append those same rows to the next set of results from the following URL).

There are about 360 URLs in total, and I'd like to run through all of them to extract the data. My code is below. I want to write the results to a CSV file later. Any suggestions would be much appreciated, as I'm new to Python.

from urlparse import urljoin
import requests
from bs4 import BeautifulSoup
import csv

base_url = 'http://cricket.inhs.uiuc.edu/edwipweb/FMPro?-db=nvpassoc.fp5&-format=nvp_search_results.htm&-lay=web%20form&-max=20&-findall='
list_of_rows = []

next_page = 'http://cricket.inhs.uiuc.edu/edwipweb/FMPro?-db=nvpassoc.fp5&-format=nvp_search_results.htm&-lay=web%20form&-max=20&-skip=20&-findall='

while True:
    soup = BeautifulSoup(requests.get(next_page).content)
    for row in soup.findAll('table')[1].findAll('tr'):
        list_of_cells = []
        for cell in row.findAll('p'):
            text = cell.text.replace(' ', '')
            list_of_cells.append(text)
        list_of_rows.append(list_of_cells)

    try:
        next_page = urljoin(base_url, soup.select('/FMPro?-db=nvpassoc.fp5&-format=nvp_search_results.htm&-lay=web%20form&-max=20&-skip=20&-findall=')[1].get('href'))
    except IndexError:
        break


print list_of_rows

outfile = open("./trialpage.csv", "wb")
writer = csv.writer(outfile)
writer.writerows(list_of_rows)

1 Answer

I made a few changes to your code. I build the original URL with a variable named skip, which is incremented by 20 on each pass.

import requests
from bs4 import BeautifulSoup
import csv

list_of_rows = []

# Build the URL from a skip offset; skip advances by 20 (the page size) each pass.
skip = 0
next_page = 'http://cricket.inhs.uiuc.edu/edwipweb/FMPro?-db=nvpassoc.fp5&-format=nvp_search_results.htm&-lay=web%20form&-max=20&-skip=' + str(skip) + '&-findall='
print next_page
while True:
    soup = BeautifulSoup(requests.get(next_page).content)
    # The result rows live in the second table on the page.
    for row in soup.findAll('table')[1].findAll('tr'):
        list_of_cells = []
        for cell in row.findAll('p'):
            text = cell.text.replace(' ', '')
            list_of_cells.append(text)
        list_of_rows.append(list_of_cells)

    skip += 20
    if skip > 300:
        break
    next_page = 'http://cricket.inhs.uiuc.edu/edwipweb/FMPro?-db=nvpassoc.fp5&-format=nvp_search_results.htm&-lay=web%20form&-max=20&-skip=' + str(skip) + '&-findall='
    print next_page


# print list_of_rows

outfile = open("./trialpage.csv", "wb")
writer = csv.writer(outfile)
writer.writerows(list_of_rows)
outfile.close()

You can grab bigger chunks, since you're not limited by the screen view, and I think it will run faster. Try max=200 and then increment skip by 200.
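Here is a minimal sketch of that idea (untested against the live site): fetch 200 records per request and keep going until a page comes back without a results table or without any rows, rather than hard-coding a skip > 300 cutoff. The page_size name and the empty-page stopping rule are my assumptions, not something the site documents.

import requests
from bs4 import BeautifulSoup
import csv

list_of_rows = []
skip = 0
page_size = 200  # assumption: the FMPro -max parameter accepts values larger than 20

while True:
    next_page = ('http://cricket.inhs.uiuc.edu/edwipweb/FMPro?-db=nvpassoc.fp5'
                 '&-format=nvp_search_results.htm&-lay=web%20form'
                 '&-max=' + str(page_size) + '&-skip=' + str(skip) + '&-findall=')
    soup = BeautifulSoup(requests.get(next_page).content)
    try:
        rows = soup.findAll('table')[1].findAll('tr')
    except IndexError:
        break  # no second table on the page: assume we've run out of results
    if not rows:
        break  # table present but empty: also treat as the end
    for row in rows:
        list_of_cells = [cell.text.replace(' ', '') for cell in row.findAll('p')]
        list_of_rows.append(list_of_cells)
    skip += page_size

outfile = open("./trialpage.csv", "wb")
writer = csv.writer(outfile)
writer.writerows(list_of_rows)
outfile.close()

If you ever move this to Python 3, the only change the sketch needs is opening the CSV with open('./trialpage.csv', 'w', newline='') instead of 'wb'.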
