<p>您还有另一种选择：无需使用 Selenium，因为可以直接通过 API 获取数据。</p>
<p>该网站有一个限制：每次搜索最多只能翻页浏览 10000 条房源（listings），因此下面的代码按价格区间拆分查询。接口返回的数据远多于这里用到的字段，您可以查看 JSON 响应，看看是否还有其他想要添加的数据：</p>
<p><strong>代码:</strong></p>
<pre><code>import pandas as pd
import requests
import math
import time
import random
# Endpoint that serves the listing search results as JSON.
url = 'https://glue-api.vivareal.com/v2/listings'

# Headers sent with every request to the endpoint above.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/88.0.4324.190 Safari/537.36'
    ),
    'x-domain': 'www.vivareal.com.br',
}

# Query-string parameters of the search. The script later adds 'priceMin'
# and 'priceMax' and rewrites 'from' / 'size' while paginating.
# NOTE(review): the exact meaning of most fields is defined by the remote
# API and is not visible here - confirm against a real browser request.
payload = {
    # Location filter.
    'addressCity': 'Salvador',
    'addressLocationId': 'BR>Bahia>NULL>Salvador',
    'addressNeighborhood': '',
    'addressState': 'Bahia',
    'addressCountry': 'Brasil',
    'addressStreet': '',
    'addressZone': '',
    'addressPointLat': '-12.977738',
    'addressPointLon': '-38.501636',
    # Kind of offer / property.
    'business': 'SALE',
    'facets': 'amenities',
    'unitTypes': 'APARTMENT',
    'unitSubTypes': 'UnitSubType_NONE,DUPLEX,LOFT,STUDIO,TRIPLEX',
    'unitTypesV3': 'APARTMENT',
    'usageTypes': 'RESIDENTIAL',
    'listingType': 'USED',
    'parentId': 'null',
    'categoryPage': 'RESULT',
    # Paging: number of results per request and offset of the first one.
    'size': '350',
    'from': '0',
    # Remaining fields are passed through unchanged.
    'q': '',
    'developmentsSize': '5',
    '__vt': '',
    'levels': 'CITY,UNIT_TYPE',
    'ref': '/venda/bahia/salvador/apartamento_residencial/',
    'pointRadius': '',
}
def get_num_of_listings(priceMin, priceMax, payload, url, previous_priceMax, jsonData, previous_jsonData):
    """Widen a price window until it holds as many listings as can be paged.

    The site only allows paging through 10000 results per search, so the
    upper price bound is raised in steps of 25000 for as long as the API
    reports fewer than 10000 matches. The widest window that stayed under
    the limit is the one the caller should download.

    Args:
        priceMin: Lower price bound of the window (inclusive).
        priceMax: First upper price bound to try. Any value above 2500000
            is replaced by 100000000 so the last window is open-ended.
        payload: Query parameters; mutated in place. On return it describes
            the accepted window ('from' reset to '0', 'priceMin' and
            'priceMax' set) so the caller can request further pages.
        url: Endpoint to query. Requests use the module-level ``headers``.
        previous_priceMax: Upper bound already known to be under the limit,
            or None when nothing is known yet.
        jsonData: Ignored; kept so existing callers keep working.
        previous_jsonData: First-page response for ``previous_priceMax``,
            or None.

    Returns:
        A tuple ``(listings_count, priceMin, priceMax, previous_priceMax,
        jsonData, previous_jsonData)`` where ``listings_count`` and
        ``jsonData`` belong to the last probe, ``previous_priceMax`` and
        ``previous_jsonData`` describe the accepted window, and
        ``priceMin`` / ``priceMax`` are the starting bounds of the next
        window.

    Raises:
        RuntimeError: If the very first probe already reaches the paging
            limit and no previous bound was supplied (the original code
            failed here with an opaque TypeError).
        requests.HTTPError: If the API answers with an error status.
    """
    paging_limit = 10000
    open_ended_max = 100000000
    widen_step = 25000
    next_window_width = 250000

    # NOTE: add a pause here (e.g. time.sleep) if the API starts throttling.
    while True:
        if priceMax > 2500000:
            priceMax = open_ended_max

        payload.update({
            'from': '0',
            'priceMin': '%s' % priceMin,
            'priceMax': '%s' % priceMax,
        })
        response = requests.get(url, headers=headers, params=payload, timeout=30)
        response.raise_for_status()
        jsonData = response.json()
        listings_count = jsonData['search']['totalCount']

        if listings_count >= paging_limit:
            # Too many results to page through: fall back to the last
            # window that was still under the limit.
            break

        previous_jsonData = jsonData
        if priceMax >= open_ended_max:
            # The open-ended window fits; nothing left to widen.
            previous_priceMax = open_ended_max
            break

        print('Price range %s - %s returns %s listings.' % (priceMin, priceMax, listings_count))
        previous_priceMax = priceMax
        priceMax += widen_step

    if previous_priceMax is None:
        raise RuntimeError(
            'Price range %s - %s already returns %s listings; '
            'it cannot be paged and no smaller range is known.'
            % (priceMin, priceMax, listings_count)
        )

    # The last probe may have used a wider window than the accepted one.
    # Point the payload back at the accepted window so that follow-up page
    # requests match previous_jsonData.
    payload.update({'priceMax': '%s' % previous_priceMax})

    priceMin = previous_priceMax + 1
    priceMax = priceMin + next_window_width - 1

    return listings_count, priceMin, priceMax, previous_priceMax, jsonData, previous_jsonData
def _first_or_none(values):
    """Return the first element of ``values``, or None when it is empty."""
    return values[0] if values else None


# One dict per listing; becomes the DataFrame at the end.
rows = []
priceMin = 1
priceMax = 250000
finished = False
# Ids already stored. A set keeps the duplicate check cheap as rows grow.
aquired = set()

while not finished:
    # The last page of the previous window may have left 'size' at 200;
    # restore the full page size before probing the next window.
    payload.update({'size': 350})
    listings_count, priceMin, priceMax, previous_priceMax, jsonData, previous_jsonData = get_num_of_listings(
        priceMin, priceMax, payload, url, None, None, None)

    # previous_jsonData was fetched for the window ending at
    # previous_priceMax; make sure further pages query that same window
    # and not the wider one of the last probe.
    payload.update({'priceMax': '%s' % previous_priceMax})

    total_pages = math.ceil(previous_jsonData['search']['totalCount'] / 350)
    for page in range(1, total_pages + 1):
        if page == 1:
            # The first page was already downloaded while probing.
            jsonData = previous_jsonData
        else:
            # Page 1 is offset 0, so page N starts at 350 * (N - 1).
            idx = 350 * (page - 1)
            if idx > 9800:
                # Beyond what the site lets us page through.
                break
            # The final reachable page is shortened so offset plus size
            # stays within the 10000-result paging limit.
            payload.update({
                'from': '%s' % idx,
                'size': 200 if idx == 9800 else 350,
            })
            # NOTE: add a pause here (e.g. time.sleep) if the API starts throttling.
            response = requests.get(url, headers=headers, params=payload, timeout=30)
            response.raise_for_status()
            jsonData = response.json()

        for listing in jsonData['search']['result']['listings']:
            details = listing['listing']
            listingId = details['id']
            if listingId in aquired:
                continue

            pricing = details['pricingInfos'][0]
            rows.append({
                'Id': listingId,
                'Zone': details['address']['zone'],
                'Size': _first_or_none(details['usableAreas']),
                'Bedrooms': _first_or_none(details['bedrooms']),
                'Bathrooms': _first_or_none(details['bathrooms']),
                'Garage': _first_or_none(details['parkingSpaces']),
                'Price': pricing['price'],
                # Not every listing reports a condo fee.
                'Condominio': pricing.get('monthlyCondoFee'),
                'Amenidades': details['amenities'],
                'url': 'https://www.vivareal.com.br' + listing['link']['href'],
            })
            aquired.add(listingId)

        print('Page %s of %s' % (page, total_pages))

    # get_num_of_listings returns a next upper bound above 100000000 only
    # after the open-ended window has been accepted.
    if priceMax > 100000000:
        print('Done')
        finished = True

df = pd.DataFrame(rows)
</code></pre>
<p><strong>输出:</strong></p>
<pre><code>IPdb [3]: print(df)
Id ... url
0 2511396476 ... https://www.vivareal.com.br/imovel/apartamento...
1 2494354474 ... https://www.vivareal.com.br/imovel/apartamento...
2 2504461896 ... https://www.vivareal.com.br/imovel/apartamento...
3 2508574459 ... https://www.vivareal.com.br/imovel/apartamento...
4 2511489082 ... https://www.vivareal.com.br/imovel/apartamento...
... ... ...
26244 94618731 ... https://www.vivareal.com.br/imovel/apartamento...
26245 93437597 ... https://www.vivareal.com.br/imovel/apartamento...
26246 79341843 ... https://www.vivareal.com.br/imovel/apartamento...
26247 2455978575 ... https://www.vivareal.com.br/imovel/apartamento...
26248 2509913182 ... https://www.vivareal.com.br/imovel/apartamento...
[26249 rows x 10 columns]
</code></pre>