How to scrape data with Python's BeautifulSoup library from a website that has a "View More" option

Published 2024-09-28 16:59:02


I am trying to parse the comments from the link to this website. I need to get 1000 comments, but by default only 10 are shown. After clicking "View More" I cannot find a way to get the content that then appears on the page.

The code I currently have runs fine:

import urllib.request
from bs4 import BeautifulSoup
import sys

# Map every character outside the Basic Multilingual Plane to U+FFFD
# (the replacement character) so the comments can be printed safely.
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

response = urllib.request.urlopen("https://www.mygov.in/group-issue/share-your-ideas-pm-narendra-modis-mann-ki-baat-26th-march-2017/")
srcode = response.read()

soup = BeautifulSoup(srcode, "html.parser")

# Each comment lives in a <div class="comment_body"> element.
all_comments_div = soup.find_all('div', class_="comment_body")

all_comments = []
for div in all_comments_div:
    all_comments.append(div.find('p').text.translate(non_bmp_map))

print(all_comments)
print(len(all_comments))

2 Answers

You can use a while loop to fetch the next page (i.e. keep going while there is a next page and fewer than 1000 comments have been collected):

import urllib.request
from bs4 import BeautifulSoup
import sys

non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
all_comments = []
max_comments = 1000
base_url = 'https://www.mygov.in'
next_page = base_url + '/group-issue/share-your-ideas-pm-narendra-modis-mann-ki-baat-26th-march-2017/'

while next_page and len(all_comments) < max_comments:
    response = urllib.request.urlopen(next_page)
    srcode = response.read()
    soup = BeautifulSoup(srcode, "html.parser")

    all_comments_div = soup.find_all('div', class_="comment_body")
    for div in all_comments_div:
        all_comments.append(div.find('p').text.translate(non_bmp_map))

    # The pager's "next" link is absent on the last page, which ends the loop.
    next_page = soup.find('li', class_='pager-next first last')
    if next_page:
        next_page = base_url + next_page.find('a').get('href')
    print('comments: {}'.format(len(all_comments)))

print(all_comments)
print(len(all_comments))
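
The loop above fetches pages one at a time, and the site can respond slowly (the second answer notes this as well), so a request may occasionally time out mid-loop. If that happens you could wrap the fetch in a small retry helper; this is a minimal sketch, where fetch and its retries/delay/timeout values are hypothetical additions rather than part of the original answer:

import time
import urllib.error
import urllib.request

def fetch(url, retries=3, delay=5, timeout=30):
    # Hypothetical helper: retry a slow or failed request a few times,
    # pausing between attempts, and re-raise if every attempt fails.
    for attempt in range(retries):
        try:
            return urllib.request.urlopen(url, timeout=timeout).read()
        except urllib.error.URLError:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

Inside the loop, srcode = fetch(next_page) would then replace the urlopen/read pair.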

The new comments are loaded via AJAX, so we need to parse the JSON response first and then hand its HTML payload to BeautifulSoup, i.e.:

import json
import requests
import sys
from bs4 import BeautifulSoup

how_many_pages = 5  # how many comment pages do you want to parse?
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
all_comments = []

for x in range(how_many_pages):
    # note: mygov.in seems very slow...
    json_data = requests.get(
        "https://www.mygov.in/views/ajax/?view_name=view_comments"
        "&view_display_id=block_2&view_args=267721&view_path=node%2F267721"
        "&view_base_path=comment_pdf_export"
        "&view_dom_id=f3a7ae636cabc2c47a14cebc954a2ff0&pager_element=1"
        "&sort_by=created&sort_order=DESC&page=0,{}".format(x)).content
    d = json.loads(json_data.decode())  # remove .decode() for Python < 3
    print(len(d))
    if len(d) == 3:    # sometimes the JSON list has length 3
        comments = d[2]['data']  # 'data' is the key that contains the comments HTML
    elif len(d) == 2:  # other times just 2...
        comments = d[1]['data']

    # From here, we can use your BeautifulSoup code.
    soup = BeautifulSoup(comments, "html.parser")
    all_comments_div = soup.find_all('div', class_="comment_body")

    for div in all_comments_div:
        all_comments.append(div.find('p').text.translate(non_bmp_map))

print(all_comments)
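
Since the index of the 'data' entry varies with the length of the JSON list (d[2] vs d[1] above), a slightly more defensive option is to scan the list for the first command dict that actually carries a 'data' payload. A minimal sketch; extract_comments_html is a hypothetical helper, not part of the original answer:

def extract_comments_html(commands):
    # The AJAX endpoint returns a list of command dicts; return the first
    # 'data' payload found, or None if the response shape changes again.
    for cmd in commands:
        if isinstance(cmd, dict) and 'data' in cmd:
            return cmd['data']
    return None

With this helper, the if/elif on len(d) collapses to comments = extract_comments_html(d), plus a check that the result is not None before parsing.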

