<p>You can build the URL string dynamically like this. You may also want to add a timed delay on each iteration of the loop to avoid being blocked by the server.</p>
<pre><code>import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

path_of_csv = '/Users/gfidarov/Desktop/Python/MKB/mkb.csv'
first_string = 'https://www.rlsnet.ru/mkb_index_id_'
third_string = '.htm'
df = pd.DataFrame(columns=['scraping results'])

try:
    for second_string in range(1, 11001):
        url = first_string + str(second_string) + third_string
        page_sc = requests.get(url)
        soup_sc = BeautifulSoup(page_sc.content, 'html.parser')
        items_sc = soup_sc.find_all(class_='subcatlist__item')
        mkb_names_sc = [item_sc.find(class_='subcatlist__link').get_text()
                        for item_sc in items_sc]
        # df.append returned a new frame (and is removed in pandas 2.x),
        # so build the row explicitly and concatenate it back
        df = pd.concat([df, pd.DataFrame({'scraping results': [mkb_names_sc]})],
                       ignore_index=True)
        # write after every page so nothing is lost if the run dies mid-loop
        df.to_csv(path_or_buf=path_of_csv)
        # timed delay so the server does not block you
        time.sleep(1)
except Exception:
    # If it fails in the middle of the process, the results won't be lost.
    # Prefix the file name, not the whole path, so the backup path stays valid.
    backup_path = os.path.join(os.path.dirname(path_of_csv),
                               'backup_' + os.path.basename(path_of_csv))
    df.to_csv(path_or_buf=backup_path)
    print(f'Failed at index {second_string}. Start again from here by setting '
          'the beginning of the range to this index. A backup was made of the '
          'results already scraped; you may want to rename it to avoid '
          'overwriting it on the next run.')
</code></pre>
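<p>If the server is flaky, retrying with a growing pause is often more robust than a single <code>requests.get</code>. Here is a minimal sketch; the <code>fetch_with_backoff</code> helper, the <code>max_tries</code> value, and the 10-second timeout are illustrative choices of mine, not part of the original code:</p>
<pre><code>import time
import requests

def fetch_with_backoff(url, max_tries=3):
    # hypothetical helper: retry transient failures with exponential backoff
    for attempt in range(max_tries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # turn HTTP 4xx/5xx into exceptions
            return response
        except requests.RequestException:
            time.sleep(2 ** attempt)  # wait 1s, 2s, 4s, ... between tries
    raise RuntimeError('Giving up on ' + url + ' after '
                       + str(max_tries) + ' tries')

# usage inside the loop above:
# page_sc = fetch_with_backoff(url)
</code></pre>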