import re # regex library
# Regex capturing the object literal assigned to
# window.searchPageData['Product'].
# The dots in the JavaScript name are escaped so they match a literal "."
# instead of any character, and the braces are escaped to make it explicit
# that they are literal characters rather than a quantifier.
pat = r"window\.searchPageData\['Product'\] = (\{\s*.*\s*\})\s*\n.*"
# Compile once so the same pattern object can be reused on other inputs.
pat = re.compile(pat)
# Extract JSON Data
# findall() returns the captured group of every match; [0] keeps the first
# one and raises IndexError when the pattern is not found in the text.
json_data = pat.findall(s)[0]  # s = dummy data
## Output
# { SOME JSON DATA }
import requests
from bs4 import BeautifulSoup as bsp
## Get response object from the target URL
url = "http://your.target.url.here" # "http://www.google.com"
# Without a timeout requests may wait forever on an unresponsive server.
r = requests.get(url=url, timeout=10)
## Use BeautifulSoup to get a list of Scripts
soup = bsp(r.text, 'html.parser')
scripts = soup.find_all(name='script')
## ID your script from scripts
# Now you need to find out some pattern or
# specify which script in scripts (list) has
# your intended content
# TODO.... FOR YOU
## Special Case: Pattern located in a single place only
# If your regex pattern could be found only
# in one place in the entire html page, you
# can do this alternatively.
# find_all() yields Tag objects, not strings, so joining them directly
# raises TypeError. Join each tag's text instead; .string is None for a
# <script> without inline content (e.g. one that only has a src attribute),
# hence the fallback to an empty string.
scripts_text = '\n'.join(script.string or '' for script in scripts)
# Extract JSON Data
# [0] keeps the first match and raises IndexError when nothing matches.
json_data = pat.findall(scripts_text)[0]
虚拟数据
# Dummy data: a stand-in for the text of the <script> element that holds
# the embedded JSON. Built line by line; every piece ends with a newline.
s = (
    "\n"
    "<script>\n"
    "window.searchPageData = window.searchPageData || {};\n"
    "window.searchPageData['Product'] = { SOME JSON DATA }\n"
    "};\n"
    "</script>\n"
)
解决方案
概述:
步骤 1:首先获取 <script></script> 标记之间的文本。您可以很容易地使用 requests + BeautifulSoup 库来实现这一点,不一定需要 Selenium。
步骤 2:从 <script> 标记对内的文本(s)中提取以 window.searchPageData['Product'] = { 开头、以 } 结尾的字符串。为此,我们将使用 regex(正则表达式)库。
假设您已经可以访问网站中的内容,因此也可以将某个 <script></script> 标记中的内容提取为文本,下面是一种提取 JSON 数据的方法;据我所知,这些 JSON 数据嵌入在 HTML 源码中。
如何在这种情况下使用
requests + BeautifulSoup 库
由于我无法访问您的数据(URL 或该 URL 返回的 HTML),这里只给出一个脚手架,说明要得到想要的结果需要做些什么。显然,我无法对它进行测试。希望这对您有帮助。
虚拟数据
相关问题 更多 >
编程相关推荐