Dataframe from two classes


I want to extract the Arbeitsatmosphäre rating and the Stadt information from the review data on all pages of the website below. The desired output should look like this example:

         Arbeitsatmosphäre | Stadt
   1.      4.00            | Berlin
   2.      5.00            | Frankfurt
   3.      3.00            | Munich
   4.      5.00            | Berlin
   5.      4.00            | Berlin

The code below extracts the Pro data from all pages and works well. I tried to update it by adding two lists, Arbeitsatmosphäre and Stadt, and breaking the loop when the Arbeitsatmosphäre rating is missing, but my code does not work. Can you help?

import requests
from bs4 import BeautifulSoup

pro = []

with requests.Session() as session:
    # mark requests as XHR so the endpoint serves the review fragment
    session.headers = {
        'x-requested-with': 'XMLHttpRequest'
    }
    page = 1
    while True:
        print(f"Processing page {page}..")
        url = f'https://www.kununu.com/de/volkswagen/kommentare/{page}'
        response = session.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # text of the <p> that follows each <h2>Pro</h2> heading
        new_comments = [
            pro.find_next_sibling('p').get_text()
            for pro in soup.find_all('h2', text='Pro')
        ]
        if not new_comments:
            print(f"No more comments. Page: {page}")
            break
        pro += new_comments
        print(pro)
        #print(len(pro))
        page += 1
print(pro)
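
For reference, the sibling-walking pattern above works because find_all('h2', text='Pro') only matches <h2> tags whose entire text is 'Pro', and find_next_sibling('p') then steps to the next <p> on the same level. A minimal, self-contained sketch (the HTML is hand-written for illustration, not the real kununu markup):

from bs4 import BeautifulSoup

# illustrative markup only -- not the actual kununu page structure
html = """
<h2>Pro</h2>
<p>Nice colleagues</p>
<h2>Contra</h2>
<p>Long hours</p>
"""
soup = BeautifulSoup(html, 'html.parser')

pros = [h.find_next_sibling('p').get_text()
        for h in soup.find_all('h2', text='Pro')]
print(pros)  # ['Nice colleagues']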

UPD: added the code that does not work below; I think there should be a simpler solution, though.

import requests
from bs4 import BeautifulSoup

Arbeit = []
Stadt = []

with requests.Session() as session:
    session.headers = {
        'x-requested-with': 'XMLHttpRequest'
    }
    page = 1
    while True:
        print(f"Processing page {page}..")
        url = f'https://www.kununu.com/de/volkswagen/kommentare/{page}'
        response = session.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        new_comments1 = [
            Arbeit.find_next_sibling('span').get_text()
            for Arbeit in soup.find_all('span', text='Arbeitsatmosphäre')
        ]
        new_comments2 = [
            Stadt.find_next_sibling('div').get_text()
            for Stadt in soup.find_all('div', text='Stadt')
        ]
        if not new_comments1:
            print(f"No more comments. Page: {page}")
            break
        Arbeit += new_comments1
        Stadt += new_comments2
        print(Arbeit)
        print(Stadt)
        #print(len(pro))
        page += 1
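
A likely reason the block above finds nothing is that find_all(..., text='Arbeitsatmosphäre') only matches tags whose entire string content is exactly that text; surrounding whitespace or nested markup makes the exact match fail. A more tolerant option is matching with a regular expression, shown here on illustrative HTML (not the real kununu markup):

import re
from bs4 import BeautifulSoup

# illustrative markup: the label carries extra whitespace, so an exact
# text='Arbeitsatmosphäre' match finds nothing
html = '<span> Arbeitsatmosphäre </span><span>4,00</span>'
soup = BeautifulSoup(html, 'html.parser')

exact = soup.find_all('span', text='Arbeitsatmosphäre')
fuzzy = soup.find_all('span', text=re.compile('Arbeitsatmosphäre'))
print(len(exact), len(fuzzy))  # 0 1

print(fuzzy[0].find_next_sibling('span').get_text())  # 4,00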

1 Answer

You can try:

import requests
from bs4 import BeautifulSoup
import pandas as pd

arbeit = []
firma = []
stadt = []
with requests.Session() as session:
    session.headers = {
        'x-requested-with': 'XMLHttpRequest'
    }
    page = 1
    while True:
        print(f"Processing page {page}..")
        url = f'https://www.kununu.com/de/volkswagen/kommentare/{page}'
        response = session.get(url)

        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('article')
        print("Number of articles: " + str(len(articles)))
        for article in articles:
            # the first rating badge on each review card is the
            # Arbeitsatmosphäre rating
            rating_tags = article.find_all('span', {'class': 'rating-badge'})
            arbeit.append(rating_tags[0].text.strip())
            # the review-details list holds the Firma entry first, then Stadt
            detail_div = article.find_all('div', {'class': 'review-details'})[0]
            nodes = detail_div.find_all('li')
            firma_node = nodes[0]
            stadt_node = nodes[1]
            firma_node_div = firma_node.find_all('div')
            firma_name = firma_node_div[1].text.strip()
            firma.append(firma_name)

            stadt_node_div = stadt_node.find_all('div')
            stadt_name = stadt_node_div[1].text.strip()
            stadt.append(stadt_name)
        page += 1

        # stop once the response no longer contains pagination controls
        pagination = soup.find_all('div', {'class': 'paginationControl'})
        if not pagination:
            break

df = pd.DataFrame({'Arbeitsatmosphäre' : arbeit, 'Stadt' : stadt})
print(df)
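
If you then want to persist the table or compute a summary, a small follow-up sketch (the file name is only an example, and the decimal-comma handling assumes German-locale ratings like '4,00'):

# file name is only an example
df.to_csv('kununu_volkswagen.csv', index=False)

# ratings arrive as strings, possibly with a decimal comma, so convert
# before averaging
df['Arbeitsatmosphäre'] = pd.to_numeric(
    df['Arbeitsatmosphäre'].str.replace(',', '.', regex=False),
    errors='coerce')
print(df['Arbeitsatmosphäre'].mean())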
