Basically, the script takes a large list of URLs, sends a request to each one, finds all the links on that page, then checks each of those links for a 404 and saves the results to the corresponding CSV files.
I use CSV files because I like to run multiple instances of the script at the same time from a .bat file, so the list gets worked through faster.
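A minimal sketch of an equivalent launcher, written in Python with subprocess rather than as a .bat file (link_checker.py is a placeholder name for the script):

import subprocess

# launch three independent instances of the checker; they coordinate
# only through the shared worked_on.csv file
procs = [subprocess.Popen(["python", "link_checker.py"]) for _ in range(3)]
for p in procs:
    p.wait()  # block until every instance has exited

Each instance claims URLs through the shared worked_on.csv, which is why everything goes through CSV files. The script itself is below.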
import csv
import random
import time

import requests
from bs4 import BeautifulSoup

# links_to_search (the master list of URLs from the combined sitemaps) is
# loaded earlier in the script; broken_links starts at zero
broken_links = 0
link_pages_checked = []

for i in links_to_search:
    # pick a random URL so parallel instances spread across the list
    i = random.choice(links_to_search)
    print("working on ", i)
    print("broken links :", broken_links)
    # re-read the shared CSV of URLs that any instance has already claimed
    worked_on = []
    with open('worked_on.csv', 'r', encoding="utf-8") as f:
        reader = csv.reader(f)
        for row in reader:
            for url in row:
                worked_on.append(url)
    if i not in worked_on:
        # claim this URL before processing it
        with open('worked_on.csv', 'a+') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow([i])
        try:
            # go to each url in master csv of combined sitemaps
            r = requests.get(i)
            if r.status_code == 404:
                broken_links = broken_links + 1
                with open('broken_links_found.csv', 'a+') as csv_file:
                    writer = csv.writer(csv_file)
                    writer.writerow(["BROKEN FROM SITEMAP!!! :", i])
            else:
                data = r.content  # content of response
                soup = BeautifulSoup(data, "html.parser")
                links_on_page = []
                # grab all links on the page, skipping images, PDFs and URLs
                # already worked on (the parentheses matter here: the original
                # mixed `or`/`and` so the filters only applied to the "www."
                # branch; .get avoids a KeyError on anchors without an href)
                for alinks in soup.find_all('a'):
                    href = alinks.get('href', '')
                    if ((href.startswith("http") or href.startswith("www."))
                            and href not in worked_on
                            and not href.endswith(".jpg")
                            and not href.endswith(".png")
                            and not href.endswith(".pdf")):
                        links_on_page.append(href)
                # check each link on the page
                for link in links_on_page:
                    try:
                        if link not in link_pages_checked:
                            link_pages_checked.append(link)
                            r2 = requests.get(link)
                            if r2.status_code == 404:
                                broken_links = broken_links + 1
                                with open('broken_links_found.csv', 'a+') as csv_file:
                                    writer = csv.writer(csv_file)
                                    writer.writerow([link, i])
                                named_tuple = time.localtime()  # get struct_time
                                time_string = time.strftime("%H:%M:%S", named_tuple)
                                print(time_string)
                                print("broken link found")
                            else:
                                print("Checking links on page: ", i)
                                print("Number of links on page: ", len(links_on_page))
                                print("Checking link: ", link)
                                print("broken links found", broken_links)
                                print("Total links checked from each page:", len(link_pages_checked))
                                print("working on :", len(worked_on), " of ", len(links_to_search))
                                number_worked_on = len(worked_on)
                                number_total_to_search = len(links_to_search)
                                print("Number left to search:", number_total_to_search - number_worked_on)
                                named_tuple = time.localtime()  # get struct_time
                                time_string = time.strftime("%H:%M:%S", named_tuple)
                                print(time_string)
                        else:
                            print("already searched")
                    except Exception as e:
                        print(e)
                        break
        except Exception as e:
            print(e)
    else:
        print("already searched")
The problem is that the script runs fine, but only for about half an hour; then it seems to pause, and the terminal window won't accept Ctrl-C to end the process, so it has to be closed.
I'm just wondering how I would go about figuring out where things go wrong and what is causing the crash/freeze.
You can put the entire script inside a try:/except: block. Then you can write the exception to a text file:
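A minimal sketch of that idea, assuming the crawling loop from the question has been wrapped in a function called main() (a hypothetical name) and that crash_log.txt is the log file:

import time
import traceback

try:
    main()  # main() is the crawling loop from the question, wrapped in a function
except Exception:
    # append the full traceback with a timestamp, so the log records
    # what the script was doing when it died
    with open('crash_log.txt', 'a', encoding="utf-8") as log:
        log.write(time.strftime("%H:%M:%S") + "\n")
        log.write(traceback.format_exc() + "\n")
    raise

One caveat: this only catches exceptions, not hangs. By default requests.get() has no timeout and can block forever on an unresponsive server, in which case no exception is raised and nothing is logged; passing a timeout= argument to each request turns a silent hang into a catchable exception.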