Scraping pagination with Beautiful Soup in Python

Published 2024-10-01 09:21:17


I have my scraper working, and it correctly pulls the data from all 9 pages of the site. However, I have a problem: I don't think my current approach is ideal, because if the site ever has more pages than the range I typed in, those results are silently lost.

My code is as follows:

import requests
import time
import csv
from bs4 import BeautifulSoup

houses = []

url = "https://www.propertypal.com/property-to-rent/newtownabbey/"
page = requests.get(url)
soup = BeautifulSoup(page.text, "lxml")
g_data = soup.find_all("div", {"class": "propbox-details"})
for item in g_data:
    title = town = price = period = ""  # reset so a missing field can't carry over the previous listing's value
    try:
        title = item.find_all("span", {"class": "propbox-addr"})[0].text
    except IndexError:
        pass
    try:
        town = item.find_all("span", {"class": "propbox-town"})[0].text
    except IndexError:
        pass
    try:
        price = item.find_all("span", {"class": "price-value"})[0].text
    except IndexError:
        pass
    try:
        period = item.find_all("span", {"class": "price-period"})[0].text
    except IndexError:
        pass
    course = [title, town, price, period]
    houses.append(course)


for i in range(1, 15):
    time.sleep(2)  # delay between requests so we don't get kicked by the server
    url2 = "https://www.propertypal.com/property-to-rent/newtownabbey/page-{0}".format(i)
    page2 = requests.get(url2)
    print(url2)
    soup = BeautifulSoup(page2.text, "lxml")
    g_data = soup.find_all("div", {"class": "propbox-details"})
    for item in g_data:
        title = town = price = period = ""  # reset so a missing field can't carry over the previous listing's value
        try:
            title = item.find_all("span", {"class": "propbox-addr"})[0].text
        except IndexError:
            pass
        try:
            town = item.find_all("span", {"class": "propbox-town"})[0].text
        except IndexError:
            pass
        try:
            price = item.find_all("span", {"class": "price-value"})[0].text
        except IndexError:
            pass
        try:
            period = item.find_all("span", {"class": "price-period"})[0].text
        except IndexError:
            pass
        course = [title, town, price, period]
        houses.append(course)


with open('newtownabbeyrentalproperties.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Address', 'Town', 'Price', 'Period'])
    for row in houses:
        writer.writerow(row)

As you can see from the code, the loop

    for i in range(1, 15):

appends the numbers 1 to 14 to the page- part of the URL.

This is not ideal: if the site gains extra pages, say 15, 16 and 17, the scraper will miss the data on them, because it only ever looks as far as page 14.

Can anyone help me use the site's pagination to find out how many pages there are to scrape, or suggest a better way to set up this for loop?

Many thanks.


3 Answers

See my modification below. This solution keeps looping through the pages until it requests a page that does not exist. It is also an improvement because your current code always tries 15 pages, even when there are only one, two, three, and so on.

page_num = 0
http_status_okay = True
while http_status_okay:
    page_num = page_num + 1
    time.sleep(2)  # delay between requests so we don't get kicked by the server
    url2 = "https://www.propertypal.com/property-to-rent/newtownabbey/page-{0}".format(page_num)
    page2 = requests.get(url2)

    # continue only while we get a 200 response code
    if page2.status_code == 200:
        http_status_okay = True
    else:
        http_status_okay = False
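One caveat with the status-code approach: some sites return 200 even for out-of-range page numbers and simply render an empty results list, so a loop keyed only on the status code might never stop. A small sketch of a combined stop condition (the helper name and the empty-page assumption are mine, not part of the answer above):

```python
def keep_paging(status_code, num_listings):
    """Continue only while the request succeeded and the page produced listings."""
    return status_code == 200 and num_listings > 0

# Inside the scraping loop, after parsing the page:
#   g_data = soup.find_all("div", {"class": "propbox-details"})
#   if not keep_paging(page2.status_code, len(g_data)):
#       break

print(keep_paging(200, 24))  # a page with 24 listings: keep going
print(keep_paging(200, 0))   # 200 but no listings: stop
print(keep_paging(404, 0))   # missing page: stop
```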

Make a request to a page that does not exist, for example https://www.propertypal.com/property-to-rent/newtownabbey/page-999999, work out how an existing page differs from a non-existent one, then parse successive pages until you detect that difference.
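One way to put this idea into practice (a hypothetical sketch; the marker string is an assumption, so inspect what page-999999 actually returns before relying on it) is to treat the absence of the listing markup as the "difference" that marks the end:

```python
def is_missing_page(html, marker='propbox-details'):
    """Treat a page without the listing markup as past the last page.
    The marker is an assumption; check the real page-999999 response."""
    return marker not in html

# Usage inside the loop (network call omitted):
#   page2 = requests.get(url2)
#   if is_missing_page(page2.text):
#       break

print(is_missing_page('<div class="propbox-details">...</div>'))  # a real results page
print(is_missing_page('<h1>No properties found</h1>'))            # past the end
```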

Something like this (I have not tested it, so it may or may not work; it is just to show the principle):

button_next = soup.find("a", {"class": "btn paging-next"}, href=True)
while button_next:
    time.sleep(2)  # delay between requests so we don't get kicked by the server
    url2 = "https://www.propertypal.com{0}".format(button_next["href"])
    page2 = requests.get(url2)
    print(url2)
    soup = BeautifulSoup(page2.text, "lxml")
    g_data = soup.find_all("div", {"class": "propbox-details"})
    for item in g_data:
        title = town = price = period = ""  # reset so a missing field can't carry over the previous listing's value
        try:
            title = item.find_all("span", {"class": "propbox-addr"})[0].text
        except IndexError:
            pass
        try:
            town = item.find_all("span", {"class": "propbox-town"})[0].text
        except IndexError:
            pass
        try:
            price = item.find_all("span", {"class": "price-value"})[0].text
        except IndexError:
            pass
        try:
            period = item.find_all("span", {"class": "price-period"})[0].text
        except IndexError:
            pass
        course = [title, town, price, period]
        houses.append(course)

    button_next = soup.find("a", {"class": "btn paging-next"}, href=True)
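A small robustness note on building the next-page URL: if the site ever emits an absolute href for the next button, prefixing it with the domain by string formatting produces a broken URL. The standard library's urllib.parse.urljoin handles relative and absolute hrefs alike (a sketch, not part of the original answer):

```python
from urllib.parse import urljoin

BASE = "https://www.propertypal.com/property-to-rent/newtownabbey/"

# A relative href is resolved against the base URL...
print(urljoin(BASE, "/property-to-rent/newtownabbey/page-2"))
# ...while an absolute href is returned unchanged.
print(urljoin(BASE, "https://www.propertypal.com/property-to-rent/newtownabbey/page-3"))
```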
