Python Beautifulsoup4无法从源文档提取json数据

2024-06-28 11:33:42 发布

您现在位置:Python中文网/ 问答频道 /正文

我对Python和BeautifulSoup库都是新手。我正在编写一个脚本，从网页上抓取一些图像。但该网站将图像以JSON的形式存储在页面源代码中。另外还有一个问题：他们把相关房源列表的图像也存储在同一个页面中

我需要获取所有带有“full_screen”属性的图像，但只需要源代码中的第一组，因为我不想要其他房源列表的图像，只需要当前页面所列房源的图像

我的代码:

import os
import requests
from bs4 import BeautifulSoup, Tag
import json


def getResponse(url):
    """Fetch ``url`` and return its HTML parsed into a BeautifulSoup tree.

    A failed request is retried indefinitely; every failed attempt prints
    "retrying...".

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        built-in ``html.parser``.
    """
    while True:
        try:
            # The timeout keeps a stalled connection from blocking forever; it
            # surfaces as requests.Timeout, which is retried like any other
            # request failure.
            page = requests.get(url, timeout=30)
        except requests.RequestException:
            # Only request failures are retried.  A bare ``except`` would also
            # swallow KeyboardInterrupt and programming errors, leaving a loop
            # that cannot be interrupted.
            print("retrying...")
            continue
        # Parsing happens outside the try block so a parser problem is
        # reported instead of being retried forever.
        return BeautifulSoup(page.text, 'html.parser')


# Listing page to scrape.
url = "https://www.propertyfinder.ae/en/rent/apartment-for-rent-dubai-dubai-marina-botanica-tower-7469382.html"

soup = getResponse(url)

# Every <script> element found in the parsed page.
script = soup.find_all("script")

# The whole text of the eighth <script> element (index 7) is handed to the
# JSON decoder as-is.
# NOTE(review): this is the call reported to fail with "No JSON object could
# be decoded" -- presumably the tag holds JavaScript around the JSON.
eighth_script = script[7]
val = json.loads(eighth_script.text)

print(val)

源文档示例:

{"homepage":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/338\/248\/MODE\/6cf3ec\/7481797-75cceo.jpg","cts":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/668\/452\/MODE\/782bc1\/7481797-75cceo.jpg","small":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/260\/185\/MODE\/686c22\/7481797-75cceo.jpg","medium":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/668\/452\/MODE\/782bc1\/7481797-75cceo.jpg","thumb":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/95\/95\/MODE\/2f9a70\/7481797-75cceo.jpg","new_big":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/856\/550\/MODE\/7cbb67\/7481797-75cceo.jpg","new_small":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/416\/272\/MODE\/724ffe\/7481797-75cceo.jpg","full_screen":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/1312\/894\/MODE\/57d3b7\/7481797-75cceo.jpg"}},{"type":"property_image","id":"118819718","attributes":{"id":"118819718","path":"7481797-a0120o.jpg","number":2,"version":"537f08c43e0437e41778534772d1659a","is_default":false},"links":{"homepage":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/338\/248\/MODE\/a56d8f\/7481797-a0120o.jpg","cts":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/668\/452\/MODE\/094349\/7481797-a0120o.jpg","small":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/260\/185\/MODE\/b5637b\/7481797-a0120o.jpg","medium":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/668\/452\/MODE\/094349\/7481797-a0120o.jpg","thumb":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/95\/95\/MODE\/8d79d7\/7481797-a0120o.jpg","new_big":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/856\/550\/MODE\/30ee
0f\/7481797-a0120o.jpg","new_small":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/416\/272\/MODE\/ee84d8\/7481797-a0120o.jpg","full_screen":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/1312\/894\/MODE\/8afdf1\/7481797-a0120o.jpg"}},{"type":"property_image","id":"118819719","attributes":{"id":"118819719","path":"7481797-f337do.jpg","number":3,"version":"3523f4921a89e87ea7d4b752038e93ef","is_default":false},"links":

错误:

No JSON object could be decoded

请问有人能帮我获取第一组键为“full_screen”的图像吗？

Pyfiddle链接：https://pyfiddle.io/fiddle/8e039908-e713-43be-9513-ef4bab9dfb9d/?i=true


Tags: https 图像 import id url new mode www
2条回答

最简单的方法是通过API获取，但也可以通过<script>标签来完成。并非所有条目都带有“full_screen”属性：

带有<script>标记:

import os
import requests
from bs4 import BeautifulSoup, Tag
import json

def getResponse(url):
    """Fetch ``url`` and return its HTML parsed into a BeautifulSoup tree.

    A failed request is retried indefinitely; every failed attempt prints
    "retrying...".

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        built-in ``html.parser``.
    """
    while True:
        try:
            # The timeout keeps a stalled connection from blocking forever; it
            # surfaces as requests.Timeout, which is retried like any other
            # request failure.
            page = requests.get(url, timeout=30)
        except requests.RequestException:
            # Only request failures are retried.  A bare ``except`` would also
            # swallow KeyboardInterrupt and programming errors, leaving a loop
            # that cannot be interrupted.
            print("retrying...")
            continue
        # Parsing happens outside the try block so a parser problem is
        # reported instead of being retried forever.
        return BeautifulSoup(page.text, 'html.parser')

url = "https://www.propertyfinder.ae/en/rent/apartment-for-rent-dubai-dubai-marina-botanica-tower-7469382.html"
soup = getResponse(url)
script = soup.find_all("script")

# Cut the JSON text out of the eighth <script> element (index 7) step by step.
raw = script[7].text
tail = raw.split('payload: ')[-1]     # text following the last "payload: "
head = tail.split(';')[0]             # keep everything before the first ";"
jsonStr = head.rsplit('}', 1)[0]      # drop the final "}" and whatever follows it
val = json.loads(jsonStr)


# Print the "full_screen" link of every entry under "included" that has one.
properties = val['included']
for prop in properties:
    if 'links' not in prop:
        continue
    links = prop['links']
    if 'full_screen' in links:
        print(links['full_screen'])

使用API:

import requests

url = 'https://www.propertyfinder.ae/en/api/search'
# Query-string parameters sent to the search endpoint.
payload = {
    # Was 'er[category_id]': the key appears to have lost the start of the
    # "filter" prefix that every other filter key in this dict carries.
    # NOTE(review): confirm the parameter name against the site's own request.
    'filter[category_id]': '2',
    'filter[furnished]': '0',
    'filter[locations_ids][]': '3037',
    'filter[price_type]': 'y',
    'filter[property_type_id]': '1',
    'page[limit]': '9999',
    'page[number]': '1',
    'sort': 'mr',
    'include': 'properties,properties.property_type,properties.property_images,properties.location_tree,properties.agent,properties.broker,smart_ads,smart_ads.agent,smart_ads.broker,smart_ads.property_type,smart_ads.property_images,smart_ads.location_tree,direct_from_developer,direct_from_developer.property_type,direct_from_developer.property_images,direct_from_developer.location_tree,direct_from_developer.agent,direct_from_developer.broker,cts,cts.agent,cts.broker,cts.property_type,cts.property_images,cts.location_tree,similar_properties,similar_properties.agent,similar_properties.broker,similar_properties.property_type,similar_properties.property_images,similar_properties.location_tree,agent_smart_ads,agent_smart_ads.broker,agent_smart_ads.languages,agent_properties_smart_ads,agent_properties_smart_ads.agent,agent_properties_smart_ads.broker,agent_properties_smart_ads.location_tree,agent_properties_smart_ads.property_type'}

response = requests.get(url, params=payload)
# Report an HTTP error status directly instead of letting an error page fail
# later inside the JSON decoder.
response.raise_for_status()
val = response.json()

# Print the "full_screen" link of every entry under "included" that has one.
properties = val['included']
for prop in properties:
    links = prop.get('links') or {}
    if 'full_screen' in links:
        print(links['full_screen'])

输出:

https://www.propertyfinder.ae/property/eaefddc999df314f589016fbb9df0c1e/1312/894/MODE/62d644/7468078-5a70co.jpg
https://www.propertyfinder.ae/property/9dae7d23cc50000baa36f55cde632fec/1312/894/MODE/b59dc8/7468078-d15a9o.jpg
https://www.propertyfinder.ae/property/2aaaa29b083099436f8dea3d018ba0f0/1312/894/MODE/94e390/7468078-84666o.jpg
https://www.propertyfinder.ae/property/a6ab186660629c4b6d494c2e66bd2b71/1312/894/MODE/97a052/7468078-eb879o.jpg
....
import requests
import re


def main(url):
    """Download ``url`` and print the text assigned to ``location`` in it.

    Args:
        url: Address of the page to download.

    Raises:
        ValueError: If the page text contains no ``location = ...`` fragment.
    """
    r = requests.get(url)
    # Capture everything after "location = " up to, but not including, the
    # next semicolon.
    found = re.search(r'location = ([^;]+)', r.text)
    if found is None:
        # re.search returns None on a miss; calling .group() on it would only
        # produce an unhelpful AttributeError.
        raise ValueError("no 'location = ...' assignment found in " + url)
    match = found.group(1)
    print(match)


main("https://www.propertyfinder.ae/en/rent/apartment-for-rent-dubai-dubai-marina-botanica-tower-7469382.html")

输出:

{
    id: "3037",
    payload: {"data":{"type":"location","id":"3037","attributes":{"name":"Botanica Tower","path":"1.50.3037","path_name":"Dubai, Dubai Marina","location_type":"TOWER","review_score":3.7142856000000002,"reviews_count":3,"image_token":"60040b9695bbc9b791d1c121e17a91366de3eba1","coordinates":{"lon":55.142415,"lat":25.085046999999999},"level":2,"abbreviation":"","url_slug":"dubai-marina-botanica-tower","children_count":0},"links":{"building_reviews":"\/en\/building-reviews\/dubai\/dubai-marina-botanica-tower.html","image_location":"https:\/\/www.propertyfinder.ae\/images\/pf_portal\/tower\/60040b9695bbc9b791d1c121e17a91366de3eba1\/desktop"}}}
  }

正则表达式演示版

Check

或者，如果您的目标是 payload 中的 data 部分

因此,请使用以下版本:

import requests
import re
import json


def main(url):
    """Download ``url``, decode the ``payload`` JSON of ``location`` and print it.

    Args:
        url: Address of the page to download.

    Raises:
        ValueError: If the page text contains no ``location = {... payload: {...}``
            fragment. ``json.loads`` raises its own error if the captured text
            is not valid JSON.
    """
    r = requests.get(url)
    # Skip lazily from "location = {" to the first "payload: ", then capture
    # from its opening "{" to the last "}" on that line ("." does not match
    # newlines here).
    found = re.search(r'location = {[\s\S]+?payload: ({.+})', r.text)
    if found is None:
        # re.search returns None on a miss; calling .group() on it would only
        # produce an unhelpful AttributeError.
        raise ValueError("no 'location' payload found in " + url)
    goal = json.loads(found.group(1))
    print(goal)


main("https://www.propertyfinder.ae/en/rent/apartment-for-rent-dubai-dubai-marina-botanica-tower-7469382.html")

输出:

{
    "data": {
        "type": "location",
        "id": "3037",
        "attributes": {
            "name": "Botanica Tower",
            "path": "1.50.3037",
            "path_name": "Dubai, Dubai Marina",
            "location_type": "TOWER",
            "review_score": 3.7142856,
            "reviews_count": 3,
            "image_token": "60040b9695bbc9b791d1c121e17a91366de3eba1",        
            "coordinates": {
                "lon": 55.142415,
                "lat": 25.085047
            },
            "level": 2,
            "abbreviation": "",
            "url_slug": "dubai-marina-botanica-tower",
            "children_count": 0
        },
        "links": {
            "building_reviews": "/en/building-reviews/dubai/dubai-marina-botanica-tower.html",
            "image_location": "https://www.propertyfinder.ae/images/pf_portal/tower/60040b9695bbc9b791d1c121e17a91366de3eba1/desktop"
        }
    }
}

相关问题 更多 >