<p>新的评论是通过<code>ajax</code>加载的,我们需要先解析返回的 JSON,然后再用<code>bs</code>(BeautifulSoup)处理其中的 HTML,即:</p>
<pre><code>import json
import requests
import sys
from bs4 import BeautifulSoup
# Fetch several pages of comments from the site's AJAX endpoint, pull the
# HTML fragment out of each JSON response and collect the text of the first
# paragraph of every comment body into all_comments.
how_many_pages = 5  # how many comments pages you want to parse?
# Translation table replacing every code point above U+FFFF with U+FFFD
# (the replacement character); applied to each comment before it is stored.
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
# AJAX endpoint returning the comments; "{}" is filled with the page index.
# Built once from adjacent literals so indentation cannot leak into the URL.
comments_url = (
    "https://www.mygov.in/views/ajax/?view_name=view_comments&view_display_id=block_2&view_args=267721&view_path=node%2"
    "F267721&view_base_path=comment_pdf_export&view_dom_id=f3a7ae636cabc2c47a14cebc954a2ff0&pager_element=1&sort_by=created&sort_order=DESC&page=0,{}"
)
all_comments = []
for x in range(how_many_pages):
    # note: mygov.in seems very slow, so use a generous timeout instead of
    # waiting forever on a stalled connection.
    response = requests.get(comments_url.format(x), timeout=120)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    d = json.loads(response.content.decode())  # Remove .decode() for python < 3
    print(len(d))
    if len(d) == 3:  # sometimes json length is 3
        comments = d[2]['data']  # data is the key that contains the comments html
    elif len(d) == 2:  # others just 2...
        comments = d[1]['data']
    else:
        # Unknown payload shape: skip this page rather than crash on an
        # unbound name or silently re-parse the previous page's HTML.
        sys.stderr.write(
            "page {}: unexpected response length {}, skipped\n".format(x, len(d)))
        continue
    # From here, we can use your BeautifulSoup code.
    soup = BeautifulSoup(comments, "html.parser")
    all_comments_div = soup.find_all('div', class_="comment_body")
    for div in all_comments_div:
        paragraph = div.find('p')
        if paragraph is None:
            # A comment body without a paragraph has no text to collect.
            continue
        all_comments.append(paragraph.text.translate(non_bmp_map))
print(all_comments)
</code></pre>
<hr/>
<p><strong>输出</strong>:</p>
^{pr2}$