如何使用Python将网页抓取（web scraping）得到的文本写入CSV文件？

#! python3
# Script as posted in the question, reformatted onto separate lines.
# Behaviour is reproduced as posted; NOTE(review) comments flag oddities
# without changing them.
import bs4, os, requests, csv

# Get URL of the page
URL = ('https://www.tripadvisor.com/Attraction_Review-g294265-d2149128-Reviews-Gardens_by_the_Bay-Singapore.html')

# Looping until the 5th page of reviews
pagecounter = 0
while pagecounter != 5:

    # Request the current page
    res = requests.get(URL)
    # NOTE(review): the attribute is only referenced here, not called.
    res.raise_for_status

    # Parse the downloaded html and pick out the review elements
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    reviewElems = soup.select('.partial_entry')

    if reviewElems == []:
        print('Could not find clue.')
    else:
        #for i in range(len(reviewElems)):
            #print(reviewElems[i].getText())
        # Append mode: rows accumulate across pages (and across runs)
        with open('GardensbytheBay.csv', 'a', newline='') as csvfile:
            for row in reviewElems:
                writer = csv.writer(csvfile, delimiter=' ', quoting=csv.QUOTE_ALL)
                # NOTE(review): `row` is the selected element itself, not
                # its text, so writerow iterates over the element.
                writer.writerow(row)
            print('Writing page')

    # Find URL of next page and update URL
    if pagecounter == 0:
        nextLink = soup.select('a[data-offset]')[0]
    else:
        nextLink = soup.select('a[data-offset]')[1]
    URL = 'http://www.tripadvisor.com' + nextLink.get('href')
    pagecounter += 1

print('Download complete')
# NOTE(review): the `with` block above has already closed the file; this
# name only exists if at least one page yielded reviews.
csvfile.close()

1条回答

网友

1楼 · 发布于 2024-10-03 06:18:11

您可以使用row.get_text(strip=True)从选定的p.partial_entry中获取文本。尝试以下操作：

import bs4, os, requests, csv

# Download the text of the first few review pages of a TripAdvisor attraction
# and write one review per row to GardensbytheBay.csv.

# Landing page of the reviews; every later page is discovered by following
# the pagination link found on the page before it.
URL = ('https://www.tripadvisor.com/Attraction_Review-g294265-d2149128-Reviews-Gardens_by_the_Bay-Singapore.html')

# Number of review pages to download (the script's stated goal is 5 pages).
MAX_PAGES = 5

# encoding='utf-8' lets the csv module write non-ASCII review text as-is;
# re-encoding the text by hand (utf8 -> latin-1) would corrupt it.
with open('GardensbytheBay.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=' ')

    for pagecounter in range(MAX_PAGES):

        # Fetch the current page; the timeout keeps a stalled connection
        # from hanging the script forever.
        res = requests.get(URL, timeout=30)
        # Must be *called* to raise on 4xx/5xx responses.
        res.raise_for_status()

        # Parse the html and collect the review paragraphs
        soup = bs4.BeautifulSoup(res.text, "html.parser")
        reviewElems = soup.select('p.partial_entry')

        if reviewElems:
            for row in reviewElems:
                writer.writerow([row.get_text(strip=True)])
            print('Writing page', pagecounter + 1)
        else:
            print('Could not find clue.')

        # No need to resolve a next page after the last one we want.
        if pagecounter == MAX_PAGES - 1:
            break

        # Find URL of next page and update URL.
        # The first page uses the first pagination link, later pages the
        # second one (presumably because a "previous" link comes first
        # there -- confirm against the live markup).
        nextLinks = soup.select('a[data-offset]')
        nextIndex = 0 if pagecounter == 0 else 1
        if len(nextLinks) <= nextIndex:
            print('No further pages found.')
            break

        href = nextLinks[nextIndex].get('href')
        if not href:
            print('No further pages found.')
            break

        # Same scheme as the landing page (https) for every request.
        URL = 'https://www.tripadvisor.com' + href

print('Download complete')

相关问题更多 >

编程相关推荐

热门问题

热门文章