Nested loop keeps repeating

Posted 2024-09-29 19:35:02


What I'm trying to do:

Read a list of postcodes from a text file into an array.

For each postcode in the array, search 10 pages and pull out certain content.

Instead, I seem to get results like this: Page 1, Page 2, Page 2, Page 3, Page 3, Page 3, Page 4, Page 4, Page 4, Page 4

and so on.

I've tried rearranging the code a few times without much luck; everything works fine except for this step.


from bs4 import BeautifulSoup
from time import sleep
from datetime import datetime
import requests
import csv

print(" Initializing ...")
print(" Loading Keywords")
with open("pcodes.txt") as pcodes:
    postkeys = []
    for line in pcodes:
        postkeys.append(line.strip())

with open("pcodnum.txt") as pcodnum:
    postkeynum = []
    for line in pcodnum:
        postkeynum.append(line.strip())

print(" Welcome to YellScrape v1.0")
print(" You ar searching yell.com ")

comtype = input(" Please enter a Company Type (e.g Newsagent, Barber): ")
pagesnum = 0
listinnum = 0
comloc = " "
f = csv.writer(open(datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv', 'w', newline=''))
f.writerow(['Business Name', 'Business Type', 'Phone Number', 'Street Address', 'Locality', 'Region', 'Website'])

headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }

data_list = []
for x in postkeys:
    print(" Searching " + x + " for " + comtype + " companies")
    for y in postkeynum:
        url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
        data_list.append(url)
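        # BUG: data_list keeps growing, and the loop below re-visits every
        # URL appended so far on each pass -- this is what repeats the pages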
        for item in data_list:
            site = requests.get(item, headers=headers)
            soup = BeautifulSoup(site.content, 'html.parser')
            questions = soup.select('.businessCapsule--mainContent')
            for question in questions:
                listinnum += 1
                busname = question.find(class_='businessCapsule--name').get_text()
                bustype = question.find(class_='businessCapsule--classification').get_text()
                busnumber = question.select_one('span.business--telephoneNumber')
                if busnumber is None:
                    busnumber = 'None'
                else:
                    busnumber = busnumber.text
                busadd = question.find('span', attrs={"itemprop": "streetAddress"})
                if busadd is None:
                    busadd = 'None'
                else:
                    busadd = busadd.text.replace(',',' ')
                buslocal = question.find('span', attrs={"itemprop": "addressLocality"})
                if buslocal is None:
                    buslocal = 'None'
                else:
                    buslocal = buslocal.text
                buspost = question.find('span', attrs={"itemprop": "postalCode"})
                if buspost is None:
                    buspost = 'None'
                else:
                    buspost = buspost.text
                busweb = question.find('a', attrs={"rel": "nofollow noopener"})
                if busweb is None:
                    busweb = 'None'
                else:
                    busweb = busweb.attrs['href']
                print(busweb)
                f.writerow([busname, bustype, busnumber, busadd, buslocal, buspost, busweb])


        pagesnum += 1
        print(" Finsihed Page " + str(y) + ". For " + x + " . " + str(listinnum) + " listings so far. Moving To Next Page")
    print(" Waiting 30 seconds for security reasons.")
    sleep(30)
print(" Finished. \n Total: " + str(pagesnum) + " pages with " + str(listinnum) + " listings. \n Please look for file: " + datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv')

Expected result:

Finished Page 1, Finished Page 2, Finished Page 3

and so on.


2 Answers

Initialize pageNum inside the for loop:

for x in postkeys:
    pageNum = 1

Loop through the items and build the URL, incrementing pageNum as you go:

for item in data_list:
    #format website url
    url = "https://www.yell.com/ucs/UcsSearchAction.do?keywords={}&pageNum={}&location={}".format(comtype, pageNum, x)
    site = requests.get(url, headers=headers)

    # check response status code:
    if site.status_code != 200:
        break

    pageNum += 1
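
Putting the two fragments together, the per-postcode loop might look like this (a minimal sketch; it assumes a non-200 response is how yell.com signals that there are no more result pages):

for x in postkeys:
    pageNum = 1
    while True:
        # format the website url with the current page counter
        url = "https://www.yell.com/ucs/UcsSearchAction.do?keywords={}&pageNum={}&location={}".format(comtype, pageNum, x)
        site = requests.get(url, headers=headers)

        # stop paging this postcode once the site stops returning 200 OK
        if site.status_code != 200:
            break

        soup = BeautifulSoup(site.content, 'html.parser')
        # ... extract the .businessCapsule--mainContent capsules as in the original ...

        pageNum += 1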

You should remove this for loop:

for y in postkeynum:
        url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
        data_list.append(url)

This is because you append to data_list and then, after every new link is appended, iterate over the whole list with a for loop.

So you make requests for page 1, then for pages 1 and 2, then for pages 1, 2 and 3, then for pages 1, 2, 3 and 4, and so on.
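You can see the pattern with a tiny standalone example (hypothetical page numbers standing in for the URLs):

data_list = []
for y in [1, 2, 3]:
    data_list.append(y)
    for item in data_list:
        # re-walks everything appended so far
        print("requesting page", item)

# prints pages 1, 1, 2, 1, 2, 3 -- every earlier page is fetched again on each pass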

There are two ways to fix this: 1) don't append to data_list at all and request each URL directly, or 2) build data_list first and only then loop through it (i.e. separate the loop that appends to data_list from the loop that iterates over it).
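For reference, option 1 would request each URL as soon as it is built, with no intermediate list at all (a sketch reusing the names from the original script):

for x in postkeys:
    for y in postkeynum:
        url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
        # request immediately instead of appending to data_list
        site = requests.get(url, headers=headers)
        soup = BeautifulSoup(site.content, 'html.parser')
        # ... parse the capsules and write the CSV row as before ...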

I chose option 2:

from bs4 import BeautifulSoup
from time import sleep
from datetime import datetime
import requests
import csv

print(" Initializing ...")
print(" Loading Keywords")
with open("C:/pcodes.txt") as pcodes:
    postkeys = []
    for line in pcodes:
        postkeys.append(line.strip())

with open("C:/pcodnum.txt") as pcodnum:
    postkeynum = []
    for line in pcodnum:
        postkeynum.append(line.strip())

print(" Welcome to YellScrape v1.0")
print(" You are searching yell.com ")

comtype = input(" Please enter a Company Type (e.g Newsagent, Barber): ")
pagesnum = 0
listinnum = 0
comloc = " "
f = csv.writer(open('C:/'+datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv', 'w', newline=''))
f.writerow(['Business Name', 'Business Type', 'Phone Number', 'Street Address', 'Locality', 'Region', 'Website'])

headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }

for x in postkeys:
    data_list = []  # reset per postcode so URLs from earlier postcodes are not re-visited
    print(" Searching " + x + " for " + comtype + " companies")
    for y in postkeynum:
        url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
        data_list.append(url)

    # Now that you have created the list of URLs, you can loop through them


    for item in data_list:

        page = item.split('pageNum=')[-1].split('&')[0]
        location = item.split('location=')[-1]

        site = requests.get(item, headers=headers)
        soup = BeautifulSoup(site.content, 'html.parser')
        questions = soup.select('.businessCapsule--mainContent')
        for question in questions:
            listinnum += 1
            busname = question.find(class_='businessCapsule--name').get_text()
            bustype = question.find(class_='businessCapsule--classification').get_text()
            busnumber = question.select_one('span.business--telephoneNumber')
            if busnumber is None:
                busnumber = 'None'
            else:
                busnumber = busnumber.text
            busadd = question.find('span', attrs={"itemprop": "streetAddress"})
            if busadd is None:
                busadd = 'None'
            else:
                busadd = busadd.text.replace(',',' ')
            buslocal = question.find('span', attrs={"itemprop": "addressLocality"})
            if buslocal is None:
                buslocal = 'None'
            else:
                buslocal = buslocal.text
            buspost = question.find('span', attrs={"itemprop": "postalCode"})
            if buspost is None:
                buspost = 'None'
            else:
                buspost = buspost.text
            busweb = question.find('a', attrs={"rel": "nofollow noopener"})
            if busweb is None:
                busweb = 'None'
            else:
                busweb = busweb.attrs['href']
            print(busweb)
            f.writerow([busname, bustype, busnumber, busadd, buslocal, buspost, busweb])


        pagesnum += 1
        print(" Finished Page " + page + ". For " + location + " . " + str(listinnum) + " listings so far. Moving To Next Page")


    # Pause between postcodes, but not after the very last one
    if x != postkeys[-1]:
        print(" Waiting 30 seconds for security reasons.")
        sleep(30)
print(" Finished. \n Total: " + str(pagesnum) + " pages with " + str(listinnum) + " listings. \n Please look for file: " + datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv')
