<p>谷歌是一个非常复杂的搜索引擎,无法简单地用一个 GET 请求来抓取;它还具有反机器人(反爬虫)机制,以防止人们抓取该网站(因为谷歌希望开发者付费使用其官方 API)。下面是我用 Python 编写的一个 Google 搜索模块,它是我正在做的一个项目的一部分。</p>
<p>通过此代码发送的请求会被 Google 服务器接受,因为它模拟了真实 Web 浏览器的行为:发送带有 User-Agent 标头的 GET 请求,并生成必要的 cookie。</p>
<pre><code>from bs4 import BeautifulSoup
import requests, json, os
import datetime
class google_search():
    """Fetch Google pages using a browser-like User-Agent header and cookies.

    Attributes (all may be overridden after construction):
        user_agent: value sent in the ``User-Agent`` request header.
        url: prefix that ``search`` prepends to its ``query`` argument.
        domain: cookie domain written into the generated cookie strings.
        output_filename: file that receives the response body when
            ``write_2_file`` is true.
        write_2_file: when true, ``search`` writes the body to disk and
            returns ``None``.
        return_content: when true (and ``write_2_file`` is false),
            ``search`` returns the response body.
    """

    def __init__(self):
        self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0"
        self.url = "https://www.google.co.uk"
        self.domain = ".google.co.uk"
        self.output_filename = "output.html"
        self.write_2_file = False
        self.return_content = True

    def _build_cookies(self, now):
        """Build the ``1P_JAR`` and ``CONSENT`` cookie strings for time *now*.

        The intermediate values (``current_date``, ``todays_date``,
        ``date_in_month``, ``expiry_date``, ``consent_cookie_fname``) are kept
        as instance attributes for callers that read them.

        Args:
            now: ``datetime.datetime`` used as the cookie creation time.

        Returns:
            dict mapping cookie name to the cookie string.
        """
        self.current_date = now
        self.todays_date = now.strftime("%Y-%m-%d-%S")

        # timedelta handles month/year rollover. Building the date from
        # ``month + 1`` and ``day - 1`` raises ValueError in December, on the
        # 1st of a month, and whenever the next month lacks that day.
        expires_at = now + datetime.timedelta(days=30)
        self.date_in_month = expires_at.strftime("%a, %d-%b-%Y %H:%M:%S")
        # NOTE(review): ``now`` is whatever the caller passes (local time in
        # ``search``) yet is labelled GMT here - confirm whether UTC is wanted.
        self.expiry_date = f"expires={self.date_in_month} GMT"

        # Must be an f-string: without the prefix the placeholder text itself
        # would be sent as the cookie value.
        consent_stamp = now.strftime("%Y%m%d-%m-p0")
        self.consent_cookie_fname = f"YES+cb.{consent_stamp}.en+FX+949"

        return {
            "1P_JAR": f"={self.todays_date}; {self.expiry_date}; path=/; domain={self.domain}; Secure; SameSite=none",
            "CONSENT": f"{self.consent_cookie_fname}; Domain={self.domain}; {self.expiry_date}; Path=/; Secure; SameSite=none",
        }

    def search(self, query):
        """Send a GET request to ``self.url + query`` and deliver the body.

        Args:
            query: string appended verbatim to ``self.url``.

        Returns:
            The raw response body (``bytes``) when ``write_2_file`` is false
            and ``return_content`` is true; otherwise ``None``. When
            ``write_2_file`` is true the body is written to
            ``self.output_filename`` instead of being returned.
        """
        # generate header
        self.head = {
            "User-Agent": self.user_agent,
        }

        # generate cookies
        self.cookie = self._build_cookies(datetime.datetime.now())

        # send request through the session that is created for it
        self.s = requests.Session()
        self.res = self.s.get(f"{self.url}{query}", headers=self.head, cookies=self.cookie)
        html = self.res.content

        # write to file, or hand the content back to the caller
        if self.write_2_file:
            with open(self.output_filename, "wb") as file:
                file.write(html)
        elif self.return_content:
            return html
        return None
# Example usage: fetch a Google local-results page for "bar hong kong central".
# The URL below already carries the full path and query string.
url = "https://www.google.com/search?q=bar%20hong%20kong%20central&biw=1246&bih=714&sz=16&tbm=lcl&sxsrf=ALeKk02B3dHjl422M1wOkUldNgdUeC6RVA%3A1621869556252&ei=9MOrYMzsDobZ-QbhyK6YDA&oq=bar+hong+kong+central&gs_l=psy-ab.12...0.0.0.2313.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.vxIZeVhM24g&tbs=lrf:!1m4!1u3!2m2!3m1!1e1!1m4!1u2!2m2!2m1!1e1!1m4!1u16!2m2!16m1!1e1!1m4!1u16!2m2!16m1!1e2!2m1!1e2!2m1!1e16!2m1!1e3!3sIAE,lf:1,lf_ui:9&rlst=f#rlfi=hd:;si:;mv:[[22.287261599999997,114.1668826],[22.2769662,114.1507584]];tbs:lrf:!1m4!1u3!2m2!3m1!1e1!1m4!1u2!2m2!2m1!1e1!1m4!1u16!2m2!16m1!1e1!1m4!1u16!2m2!16m1!1e2!2m1!1e2!2m1!1e16!2m1!1e3!3sIAE,lf:1,lf_ui:9"
req = google_search()
# Replace the default base URL with the complete URL above ...
req.url = url
# ... and pass an empty query, because search() requests self.url + query.
# With the default flags (write_2_file False, return_content True) the
# response body is returned and stored in html.
html = req.search("")
</code></pre>
<p>您可以在我的 GitHub 存储库(<a href="https://github.com/JaredTurck/google-search-scraper" rel="nofollow noreferrer">点击此处</a>)中查看完整代码。</p>