嵌套循环不断重复

# YellScrape: asks for a company type, then queries yell.com for every
# postcode / page-number combination and writes the listings to a CSV file.
from bs4 import BeautifulSoup
import time
from time import sleep
from datetime import datetime
import requests
import csv

print(" Initializing ...")
print(" Loading Keywords")

# One search location per line.
with open("pcodes.txt") as pcodes:
    postkeys = [line.strip() for line in pcodes]

# One result-page number per line.
with open("pcodnum.txt") as pcodnum:
    postkeynum = [line.strip() for line in pcodnum]

print(" Welcome to YellScrape v1.0")
print(" You ar searching yell.com ")
comtype = input(" Please enter a Company Type (e.g Newsagent, Barber): ")

pagesnum = 0
listinnum = 0
comloc = " "

f = csv.writer(open(datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv', 'w'))
f.writerow(['Business Name', 'Business Type', 'Phone Number', 'Street Address', 'Locality', 'Region', 'Website'])

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}

data_list = []
for x in postkeys:
    print(" Searching " + x + " for " + comtype + " companies")
    for y in postkeynum:
        url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
        data_list.append(url)

        # data_list holds every URL appended so far and is walked from its
        # first entry after each append, so earlier URLs are requested again
        # on every pass.
        for item in data_list:
            site = requests.get(item, headers=headers)
            soup = BeautifulSoup(site.content, 'html.parser')
            questions = soup.select('.businessCapsule--mainContent')

            for question in questions:
                listinnum += 1
                busname = question.find(class_='businessCapsule--name').get_text()
                bustype = question.find(class_='businessCapsule--classification').get_text()

                # Optional fields fall back to the literal string 'None'.
                phone_tag = question.select_one('span.business--telephoneNumber')
                busnumber = 'None' if phone_tag is None else phone_tag.text

                street_tag = question.find('span', attrs={"itemprop": "streetAddress"})
                busadd = 'None' if street_tag is None else street_tag.text.replace(',', ' ')

                locality_tag = question.find('span', attrs={"itemprop": "addressLocality"})
                buslocal = 'None' if locality_tag is None else locality_tag.text

                postcode_tag = question.find('span', attrs={"itemprop": "postalCode"})
                buspost = 'None' if postcode_tag is None else postcode_tag.text

                link_tag = question.find('a', attrs={"rel": "nofollow noopener"})
                busweb = 'None' if link_tag is None else link_tag.attrs['href']

                print(busweb)
                f.writerow([busname, bustype, busnumber, busadd, buslocal, buspost, busweb])

            pagesnum += 1
            print(" Finsihed Page " + str(y) + ". For " + x + " . " + str(listinnum) + " listings so far. Moving To Next Page")
            print(" Waiting 30 seconds for security reasons.")
            sleep(30)

print(" Finished. \n Total: " + str(pagesnum) + " pages with " + str(listinnum) + " listings. \n Please look for file: " + datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv')

2条回答

网友

1楼 · 编辑于 2024-09-29 19:35:02

在for循环内部初始化pageNum：

for x in postkeys:
   # Restart the page counter at 1 for each entry in postkeys.
   pageNum = 1

在循环中用pageNum格式化URL，并在每次请求后递增pageNum：

# Request consecutive result pages for the current location x, starting from
# the current value of pageNum. The loop is open-ended and driven by pageNum
# rather than by a prebuilt list of URLs; it ends on the first response whose
# status code is not 200.
# NOTE(review): assumes the site answers with a non-200 status past the last
# page -- confirm, otherwise also stop when a page contains no listings.
while True:
    # format website url
    url = "https://www.yell.com/ucs/UcsSearchAction.do?keywords={}&pageNum={}&location={}".format(comtype, pageNum, x)
    site = requests.get(url, headers=headers)

    # check response status code:
    if site.status_code != 200:
        break

    # (parse site.content and write the rows for this page here)

    pageNum += 1

应删除此for循环：

# Builds one search URL per entry in postkeynum for the current location x
# and appends each URL to data_list.
for y in postkeynum:
        url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
        data_list.append(url)

网友

2楼 · 编辑于 2024-09-29 19:35:02

这是因为您每次向data_list追加一个新链接之后，都会立即用for循环把整个列表从头遍历一遍。

所以它先请求第1页，然后请求第1页和第2页，接着是第1、2、3页，再然后是第1、2、3、4页……依此类推。

所以有两种方法可以解决这个问题：1）不再向data_list追加，把这个列表完全去掉；或者2）先把所有链接都追加到data_list，然后再遍历它（也就是把“追加到data_list”的循环和“遍历data_list”的循环分开）。

我选择选项2）

from bs4 import BeautifulSoup
import time
from time import sleep
from datetime import datetime
import requests
import csv

# YellScrape: load the postcodes and page numbers, build every yell.com
# search URL first, then request each URL exactly once and store the listings
# in a dated CSV file.


def _text_or_none(tag):
    """Return the text of a parsed element, or the string 'None' when it is missing."""
    return 'None' if tag is None else tag.get_text()


print(" Initializing ...")
print(" Loading Keywords")
# Blank lines are skipped so they cannot turn into searches with an empty
# location or page number.
with open("C:/pcodes.txt") as pcodes:
    postkeys = [line.strip() for line in pcodes if line.strip()]

with open("C:/pcodnum.txt") as pcodnum:
    postkeynum = [line.strip() for line in pcodnum if line.strip()]

print(" Welcome to YellScrape v1.0")
print(" You are searching yell.com ")

comtype = input(" Please enter a Company Type (e.g Newsagent, Barber): ")
pagesnum = 0
listinnum = 0
# Computed once so the file that is written and the name reported at the end
# always match, even if the run crosses midnight.
outname = datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv'

headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }

# Step 1: collect every (location, page, url) combination before any request
# is made. Keeping location and page next to the URL avoids slicing them back
# out of the URL string later.
data_list = []
for x in postkeys:
    print(" Searching " + x + " for " + comtype + " companies")
    for y in postkeynum:
        url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
        data_list.append((x, y, url))

# Step 2: walk the finished list. This loop is deliberately NOT nested inside
# the loops above; nesting it would re-request the URLs of earlier postcodes
# every time a new postcode is processed.
# newline='' is what the csv module expects for files handed to csv.writer,
# and the with-block guarantees the file is flushed and closed.
with open('C:/' + outname, 'w', newline='') as csv_file:
    f = csv.writer(csv_file)
    # The 'Region' column carries the value read from the postalCode element.
    f.writerow(['Business Name', 'Business Type', 'Phone Number', 'Street Address', 'Locality', 'Region', 'Website'])

    for index, (location, page, item) in enumerate(data_list):
        site = requests.get(item, headers=headers)
        soup = BeautifulSoup(site.content, 'html.parser')

        # The class names contain a double hyphen. Writing a space instead
        # would be parsed as a descendant combinator and match nothing.
        questions = soup.select('.businessCapsule--mainContent')
        for question in questions:
            listinnum += 1
            busname = _text_or_none(question.find(class_='businessCapsule--name'))
            bustype = _text_or_none(question.find(class_='businessCapsule--classification'))
            busnumber = _text_or_none(question.select_one('span.business--telephoneNumber'))
            # Commas are replaced with spaces in the street address.
            busadd = _text_or_none(question.find('span', attrs={"itemprop": "streetAddress"})).replace(',', ' ')
            buslocal = _text_or_none(question.find('span', attrs={"itemprop": "addressLocality"}))
            buspost = _text_or_none(question.find('span', attrs={"itemprop": "postalCode"}))
            weblink = question.find('a', attrs={"rel": "nofollow noopener"})
            busweb = 'None' if weblink is None else weblink.attrs.get('href', 'None')
            print(busweb)
            f.writerow([busname, bustype, busnumber, busadd, buslocal, buspost, busweb])

        pagesnum += 1
        print(" Finished Page " + page + ". For " + location + " . " + str(listinnum) + " listings so far. Moving To Next Page")

        # Pause between requests, but not after the very last one.
        if index < len(data_list) - 1:
            print(" Waiting 30 seconds for security reasons.")
            sleep(30)

print(" Finished. \n Total: " + str(pagesnum) + " pages with " + str(listinnum) + " listings. \n Please look for file: " + outname)

相关问题更多 >

编程相关推荐

热门问题

热门文章