在一次运行过程中抓取多个网页

from bs4 import BeautifulSoup
import requests
from collections import ChainMap

# Collect the URLs of result pages 0 through 4.
pages = []
for i in range(0, 5):
    url = 'https://sportmedbc.com/practitioners?field_profile_first_name_value=&field_profile_last_name_value=&field_pract_profession_tid=All&city=&taxonomy_vocabulary_5_tid=All&page=' + str(i)
    pages.append(url)

# Download and parse every page in turn. `soup` is rebound on each pass,
# so once this loop finishes it refers only to the last page fetched.
for item in pages:
    page = requests.get(item)
    soup = BeautifulSoup(page.text, 'lxml')


def get_data(soup):
    """Yield one ChainMap per practitioner entry selected from ``soup``.

    Each mapping exposes the keys 'name', 'clinic', 'profession', 'region'
    and 'city'. A key whose element text is empty or whitespace-only falls
    through to the 'n/a' default.
    """
    default_data = {'name': 'n/a', 'clinic': 'n/a', 'profession': 'n/a', 'region': 'n/a', 'city': 'n/a'}
    selectors = (
        ('name', '.practitioner__name'),
        ('clinic', '.practitioner__clinic'),
        ('profession', '.practitioner__profession'),
        ('region', '.practitioner__region'),
        ('city', '.practitioner__city'),
    )

    for doctor in soup.select('.view-practitioners .practitioner'):
        doctor_data = {}
        for key, selector in selectors:
            text = doctor.select_one(selector).text
            # Keep the raw text, but only when it is not blank.
            if text.strip():
                doctor_data[key] = text
        yield ChainMap(doctor_data, default_data)


# Print every practitioner found in the page currently bound to `soup`.
for doctor in get_data(soup):
    for label, key in (
        ('name:\t\t', 'name'),
        ('clinic:\t\t', 'clinic'),
        ('profession:\t', 'profession'),
        ('city:\t\t', 'city'),
        ('region:\t\t', 'region'),
    ):
        print(label, doctor[key])
    print('-' * 80)

2条回答

网友

1楼 · 编辑于 2024-09-22 20:20:21

代码基本没有问题，只需把调用 get_data() 的循环放到第一个循环（抓取页面的循环）内部：

from bs4 import BeautifulSoup
import requests
from collections import ChainMap

def get_data(soup):
    """Yield one ChainMap per practitioner entry selected from ``soup``.

    Args:
        soup: A parsed document exposing ``select()``; each selected entry
            must expose ``select_one()`` (e.g. a BeautifulSoup object).

    Yields:
        A ChainMap with the keys 'name', 'clinic', 'profession', 'region'
        and 'city'. A key falls back to 'n/a' when its element is missing
        from the entry or contains only whitespace; otherwise the value is
        the element's text exactly as found in the document.
    """
    default_data = {'name': 'n/a', 'clinic': 'n/a', 'profession': 'n/a', 'region': 'n/a', 'city': 'n/a'}
    selectors = (
        ('name', '.practitioner__name'),
        ('clinic', '.practitioner__clinic'),
        ('profession', '.practitioner__profession'),
        ('region', '.practitioner__region'),
        ('city', '.practitioner__city'),
    )

    for doctor in soup.select('.view-practitioners .practitioner'):
        doctor_data = {}

        for key, selector in selectors:
            # Look each element up once instead of once for the check and
            # once more for the assignment.
            element = doctor.select_one(selector)
            # select_one() gives None when nothing matches; treat that the
            # same as an empty field rather than failing on `.text`.
            if element is None:
                continue
            text = element.text
            if text.strip():
                doctor_data[key] = text

        yield ChainMap(doctor_data, default_data)

# URL template; the trailing %s is filled with the zero-based page index.
url = 'https://sportmedbc.com/practitioners?field_profile_first_name_value=&field_profile_last_name_value=&field_pract_profession_tid=All&city=&taxonomy_vocabulary_5_tid=All&page=%s'

# Label / key pairs, listed in the order they are printed for each entry.
output_fields = (
    ('name:\t\t', 'name'),
    ('clinic:\t\t', 'clinic'),
    ('profession:\t', 'profession'),
    ('city:\t\t', 'city'),
    ('region:\t\t', 'region'),
)

for page_index in range(5):
    response = requests.get(url % page_index)
    soup = BeautifulSoup(response.text, 'lxml')

    # Page header, numbered from 1 for display.
    print('Page {}'.format(page_index + 1))
    print('#' * 80)

    # Process this page's entries now, before the next request rebinds soup.
    for doctor in get_data(soup):
        for label, key in output_fields:
            print(label, doctor[key])
        print('-' * 80)

输出：

Page 1
################################################################################
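name:        Jaimie Ackerman
clinic:      n/a
profession:  n/a
city:        n/a
region:      n/a
--------------------------------------------------------------------------------
name:        Marilyn Adams
clinic:      Fortius Sport & Health
profession:  Physiotherapist
city:        n/a
region:      Fraser River Delta
--------------------------------------------------------------------------------
name:        Mahsa Ahmadi
clinic:      Wellpoint Acupuncture (Sports Medicine)
profession:  Acupuncturist
city:        Vancouver
region:      Vancouver & Sea to Sky
--------------------------------------------------------------------------------
name:        Tracie Albisser
clinic:      Pacific Sport Northern BC, Tracie Albisser
profession:  Strength and Conditioning Specialist, Exercise Physiologist
city:        n/a
region:      Cariboo - North East
--------------------------------------------------------------------------------
name:        Christine Alder
clinic:      n/a
profession:  n/a
city:        Vancouver
region:      Vancouver & Sea to Sky
--------------------------------------------------------------------------------
name:        Steacy Alexander
clinic:      Go! Physiotherapy Sports and Wellness Centre
profession:  Physiotherapist
city:        Vancouver
region:      Vancouver & Sea to Sky
--------------------------------------------------------------------------------
name:        Page Allison
clinic:      AET Clinic, .
profession:  Athletic Therapist
city:        Victoria
region:      Vancouver Island - Central Coast
--------------------------------------------------------------------------------
name:        Dana Alumbaugh
clinic:      n/a
profession:  Podiatrist
city:        Squamish
region:      Vancouver & Sea to Sky
--------------------------------------------------------------------------------
name:        Manouch Amel
clinic:      Mountainview Kinesiology Ltd.
profession:  Strength and Conditioning Specialist
city:        Anmore
region:      Vancouver & Sea to Sky
--------------------------------------------------------------------------------
name:        Janet Ames
clinic:      Dr. Janet Ames
profession:  Physician
city:        Prince George
region:      Cariboo - North East
--------------------------------------------------------------------------------
name:        Greg Anderson
clinic:      University of the Fraser Valley
profession:  Exercise Physiologist
city:        Mission
region:      Fraser Valley
--------------------------------------------------------------------------------
name:        Sandi Anderson
clinic:      n/a
profession:  n/a
city:        Coquitlam
region:      Fraser Valley
--------------------------------------------------------------------------------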
Page 2
################################################################################

... and so on.

网友

2楼 · 编辑于 2024-09-22 20:20:21

# The fetch loop as written in the question.
for item in pages:
    page=requests.get(item)
    # soup is rebound on every iteration, so once the loop ends it refers
    # only to the last page fetched (the 5th one).
    soup = BeautifulSoup(page.text, 'lxml')

如上面的注释所述，循环结束后 soup 只保存了第 5 页的内容，所以最终只能得到第 5 页的结果。

解决方案：

# Put this loop at the very end of the script.
for page_url in pages:
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, 'lxml')
    # Print this page's entries here, while soup still refers to this page.
    for doctor in get_data(soup):
        print('name:\t\t', doctor['name'])
        print('clinic:\t\t', doctor['clinic'])
        print('profession:\t', doctor['profession'])
        print('city:\t\t', doctor['city'])
        print('region:\t\t', doctor['region'])
        print('-' * 80)

相关问题更多 >

编程相关推荐

热门问题

热门文章