从 TripAdvisor 获取“要做的事情”（Things to Do）列表

# Print the URL of every attraction on TripAdvisor's paginated
# "Things to Do" listing for Canberra.
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.request import urlopen

BASE_URL = 'https://www.tripadvisor.com/'
# One template for every request; only the "oa<offset>" segment varies.
LISTING_URL = (
    'https://www.tripadvisor.com/Attractions-g255057-Activities-oa{}'
    '-Canberra_Australian_Capital_Territory-Hotels.html#ATTRACTION_LIST_CONTENTS'
)
PAGE_SIZE = 30  # the listing offset advances by 30 per page

urls = []

# Fetch the first page only to discover how many pages there are.
r = requests.get(LISTING_URL.format(0))
soup = BeautifulSoup(r.text, "html.parser")

# Fall back to a single page so the loop below still runs (instead of
# failing with NameError) when no "last page" link is present.
last_offset = PAGE_SIZE
for link in soup.find_all('a', class_='last'):
    page_number = link.get('data-page-number')
    if page_number is None:
        continue
    last_offset = int(page_number) * PAGE_SIZE
    print('last offset:', last_offset)

for offset in range(0, last_offset, PAGE_SIZE):
    print('--- page offset:', offset, '---')

    r = requests.get(LISTING_URL.format(offset))
    soup = BeautifulSoup(r.text, "html.parser")

    for link in soup.find_all('a', class_='property_title'):
        href = link.get('href')
        if not href:
            continue
        # urljoin resolves the href against the site root whether or not it
        # starts with '/', instead of gluing it onto a partial path.
        iurl = urljoin(BASE_URL, href)
        urls.append(iurl)
        print(iurl)

# Print the URL of every restaurant on TripAdvisor's paginated
# restaurant listing for Canberra.
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.request import urlopen

BASE_URL = 'https://www.tripadvisor.com/'
LISTING_URL = (
    'https://www.tripadvisor.com/Restaurants-g255057-oa{0}'
    '-Canberra_Australian_Capital_Territory.html#EATERY_LIST_CONTENTS'
)
PAGE_SIZE = 30      # the listing offset advances by 30 per page
END_OFFSET = 1050   # hard-coded upper bound (exclusive) of the offsets to visit

# A single Session is reused for all page requests.
with requests.Session() as session:
    for offset in range(0, END_OFFSET, PAGE_SIZE):
        url = LISTING_URL.format(offset)
        soup = BeautifulSoup(session.get(url).content, "html.parser")

        for link in soup.select('a.property_title'):
            href = link.get('href')
            if not href:
                # Anchor without a target: nothing to print.
                continue
            # urljoin avoids a doubled slash when the href is site-relative
            # ("/Restaurant_Review-...") and still works when it is not.
            iurl = urljoin(BASE_URL, href)
            print(iurl)

https://www.tripadvisor.com/Restaurant_Review-g255057-d1054676-Reviews-Lanterne_Rooms-Canberra_Australian_Capital_Territory.html https://www.tripadvisor.com/Restaurant_Review-g255057-d755055-Reviews-Courgette_Restaurant-Canberra_Australian_Capital_Territory.html https://www.tripadvisor.com/Restaurant_Review-g255057-d6893178-Reviews-Pomegranate-Canberra_Australian_Capital_Territory.html https://www.tripadvisor.com/Restaurant_Review-g255057-d7262443-Reviews-Les_Bistronomes-Canberra_Australian_Capital_Territory.html . . . .

1条回答

网友

1楼 · 发布于 2024-09-28 21:00:29

其实这并不难，你只需要知道该选取哪些标签。
下面用一个例子来说明：

# Collect and print the link of every attraction on TripAdvisor's paginated
# "Things to Do" listing for Canberra.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base_url = 'https://www.tripadvisor.com/'  # listing hrefs are resolved against this
main_page = 'https://www.tripadvisor.com/Attractions-g255057-Activities-oa{}-Canberra_Australian_Capital_Territory-Hotels.html#ATTRACTION_LIST_CONTENTS'
links = []

# Get the initial page to find out how many pages there are.
r = requests.get(main_page.format(0))
soup = BeautifulSoup(r.text, "html.parser")

# The pagination anchors ('a', class 'pageNum taLnk') carry the offset of the
# page they point to; the largest one is the offset of the last page.
# default=0 keeps a listing without pagination anchors from raising
# ValueError: only the first page (offset 0) is scraped in that case.
page_offsets = (
    page.get('data-offset')
    for page in soup.find_all('a', {'class': 'pageNum taLnk'})
)
last_page = max(
    (int(page_offset) for page_offset in page_offsets if page_offset is not None),
    default=0,
)

# Visit every page offset (first page, last page inclusive, 30 links per page)
# and extract the links from each page.
for i in range(0, last_page + 30, 30):
    page = main_page.format(i)
    soup = BeautifulSoup(requests.get(page).text, "html.parser")

    # Each ('div', class 'listing_title') wraps the anchor of one attraction.
    for title in soup.find_all('div', {'class': 'listing_title'}):
        anchor = title.find('a')
        if anchor is None or not anchor.get('href'):
            # Title block without a usable link: skip it rather than crash.
            continue
        # urljoin avoids a doubled slash when the href starts with '/'.
        links.append(urljoin(base_url, anchor.get('href')))

for link in links:
    print(link)

总共有 8 页、212 个链接（前 7 页每页 30 个，最后一页 2 个）。
希望这能帮你把问题弄清楚一些。

相关问题更多 >

编程相关推荐

热门问题

热门文章