用Python将数据保存在XML文件中

2024-10-04 01:37:09 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试将数据保存到XML文件中。这些数据来自一个我想收集评论的网站。每页总有五篇评论,我想用XML格式保存在一个文件中。问题是,如果我用print(ET.tostring(root, encoding='utf8').decode('utf8'))打印XML树,那么我希望有五个评论。但是如果我用tree.write("test.xml", encoding='unicode')将它们保存到文件中,那么我只会看到一个评论。。。这是我的密码:

import requests
from bs4 import BeautifulSoup
import re
import json
import xml.etree.cElementTree as ET

source = requests.get('https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS').text

soup = BeautifulSoup(source, 'lxml')
pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
script = soup.find("script", text=pattern)
dictData = pattern.search(script.text).group(1)
jsonData = json.loads(dictData)

def get_countrycitydata():

    countrycity_dict = dict()

    country_data = jsonData['urqlCache']['3960485871']['data']['locations']
    for data in country_data:
        data1 = data['parents']
        countrycity_dict["country_name"] = data1[2]['name']
        countrycity_dict["tripadvisorid_city"] = data1[0]['locationId']
        countrycity_dict["city_name"] = data1[0]['name']

    return countrycity_dict

def get_hoteldata():

    hotel_dict = dict()

    locations = jsonData['urqlCache']['669061039']['data']['locations']
    for data in locations:
        hotel_dict["tripadvisorid_hotel"] = data['locationId']
        hotel_dict["hotel_name"] = data['name']

    return hotel_dict

def get_reviews():  

    all_dictionaries = []

    for locations in jsonData['urqlCache']['669061039']['data']['locations']:
        for reviews in locations['reviewListPage']['reviews']:

            review_dict = {}

            review_dict["reviewid"] = reviews['id']
            review_dict["reviewurl"] =  reviews['absoluteUrl']
            review_dict["reviewlang"] = reviews['language']
            review_dict["reviewtitle"] = reviews['title']
            reviewtext = reviews['text']
            clean_reviewtext = reviewtext.replace('\n', ' ')
            review_dict["reviewtext"] = clean_reviewtext

            all_dictionaries.append(review_dict)

    return all_dictionaries

def xml_tree(new_dict): # should I change something here???

    root = ET.Element("countries")
    country = ET.SubElement(root, "country")

    ET.SubElement(country, "name").text = new_dict["country_name"]
    city = ET.SubElement(country, "city")

    ET.SubElement(city, "tripadvisorid").text = str(new_dict["tripadvisorid_city"])
    ET.SubElement(city, "name").text = new_dict["city_name"]
    hotels = ET.SubElement(city, "hotels")

    hotel = ET.SubElement(hotels, "hotel")
    ET.SubElement(hotel, "tripadvisorid").text = str(new_dict["tripadvisorid_hotel"])
    ET.SubElement(hotel, "name").text = new_dict["hotel_name"]
    reviews = ET.SubElement(hotel, "reviews")

    review = ET.SubElement(reviews, "review")
    ET.SubElement(review, "reviewid").text = str(new_dict["reviewid"])
    ET.SubElement(review, "reviewurl").text = new_dict["reviewurl"]
    ET.SubElement(review, "reviewlang").text = new_dict["reviewlang"]
    ET.SubElement(review, "reviewtitle").text = new_dict["reviewtitle"]
    ET.SubElement(review, "reviewtext").text = new_dict["reviewtext"]

    tree = ET.ElementTree(root)
    tree.write("test.xml", encoding='unicode')  

    print(ET.tostring(root, encoding='utf8').decode('utf8'))

##########################################################  

def main():

    city_dict = get_countrycitydata()
    hotel_dict = get_hoteldata()
    review_list = get_reviews()

    for index in range(len(review_list)):
        new_dict = {**city_dict, **hotel_dict, **review_list[index]}

        xml_tree(new_dict)

if __name__ == "__main__":
    main()  

如何更改XML树以将所有五个评论保存在文件中?XML文件应如下所示:

<countries>
    <country>
        <name>Schweiz</name>
        <city>
            <tripadvisorid>188113</tripadvisorid>
            <name>Zürich</name>
            <hotels>
                <hotel>
                    <tripadvisorid>228146</tripadvisorid>
                    <name>Hotel Coronado</name>
                    <reviews>
                        <review>
                            <reviewid>672052111</reviewid> 
                            <reviewurl>https://www.tripadvisor.ch/ShowUserReviews-g188113-d228146-r672052111-Coronado Hotel-Zurich.html</reviewurl>
                            <reviewlang>de</reviewlang>
                            <reviewtitle>Optimale Lage und Preis</reviewtitle>
                            <reviewtext>Hervorragendes Hotel.Beste Erfahrun mit Service und Zimme.Die Qalität der Betten ist optimalr. Zimmer sind trotz geringer Größe sehr gut ausgestattet.Der Föhn war in diesem Fall (nicht in früheren)etwas lahm</reviewtext>
                        </review>
                        <review>
                         second review here ...
                        </review>
                        <review>
                         third review here ...
                        </review>
                        ...
                    </reviews>
                </hotel>
            </hotels>
        </city>
    </country>
</countries>

提前感谢您的所有建议


Tags: textnamecitynewdatagetcountryhotel
1条回答
网友
1楼 · 发布于 2024-10-04 01:37:09

由于xml_tree(new_dict)存在于for循环中,因此tree.write()方法被多次调用以覆盖文件

open()a(追加)模式打开文件:

tree.write(open('test.xml', 'a'), encoding='unicode')

见文件here

相关问题 更多 >