使用具有多个标记的Pandas将XML转换为CSV

2024-09-30 08:28:09 发布

您现在位置:Python中文网/ 问答频道 /正文

需要使用python(Pandas)或任何其他库将XML文件展平为CSV。我有多个子标签下面和一个单一的标题要跨子元素复制。这是我用Python编写的第一个程序,无法成功执行。请告诉我这里的任何方向

-我的输入数据的预期输出。

chName,envText,rptCod,rptNam,membId,membLglNam,rptPrntEffDat,rptPrntRunDat,membClgIdCod,membClgIdNam,clgCurrTypCod,poolId,currTypCod,membChgIdCod,membChgIdNam,acctTypFlexible,prtMgnUnadj,prtMgnReqt,ChgRat,totMgnClgCurr,sumChgMbrPrtgbReq,sumChgMbrClgCurr,sumPoolIdTotal,sumPoolIdClgCurr,sumClgMbrTotMgnClgCurr
DEBSIL,S,PTR001,DD01 Report,POP,ABC LT,08/04/2021,08/04/2021,UVBV,YEZZE,USD,POPXCLIENTECM,USD,POP123,ABC LT123,A1,5656.6,5634.6,1,34543.6,5656.6,5634.6,5656.6,5634.6,700090
DEBSIL,S,PTR001,DD01 Report,POP,ABC LT,08/04/2021,08/04/2021,UVBV,YEZZE,USD,POPXCLIENTNET,NZD,POP676,SEEN TD,A9,8989.9,45656.9,1,1780688.9,8989.9,45656.9,8989.9,45656.9,700090
DEBSIL,S,PTR001,DD01 Report,POP,ABC LT,08/04/2021,08/04/2021,UVBV,YEZZE,USD,POPXSTANDARD,MXM,POP343,ACMI,P1,345.5,4545.5,1,4545.5,345.5,4545.5,345.5,4545.5,700090

输入数据

<PTR001>
<rptHdr>
    <chName>DEBSIL</chName>
    <envText>S</envText>
    <rptCod>PTR001</rptCod>
    <rptNam>DD01 Report</rptNam>
    <membId>POP</membId>
    <membLglNam>ABC LT</membLglNam>
    <rptPrntEffDat>2021-04-08</rptPrntEffDat>
    <rptPrntRunDat>2021-04-08</rptPrntRunDat>
</rptHdr>

<ptr001Grp>
    <ptr001KeyGrp>
        <membClgIdCod>UVBV</membClgIdCod>
        <membClgIdNam>YEZZE</membClgIdNam>
    </ptr001KeyGrp>
    <clgCurrTypCod>USD</clgCurrTypCod>
    <ptr001Grp1>
    <ptr001KeyGrp1>
        <poolId>CSBXCLIENTECM</poolId>
        <currTypCod>USD</currTypCod>
    </ptr001KeyGrp1>
    <ptr001Grp3>
        <ptr001KeyGrp3>
            <membChgIdCod>POP123</membChgIdCod>
            <membChgIdNam>ABC LT123</membChgIdNam>
        </ptr001KeyGrp3>
        <ptr001Rec>
            <acctTypFlexible>A1</acctTypFlexible>
            <prtMgnUnadj>5656.60</prtMgnUnadj>
            <prtMgnReqt>5634.60</prtMgnReqt>
            <ChgRat>1.000000</ChgRat>
            <totMgnClgCurr>34543.60</totMgnClgCurr>
        </ptr001Rec>

        <sumChgMbrPrtMgbReq>5656.60</sumChgMbrPrtMgbReq>
        <sumChgMbrClgCurr>5634.60</sumChgMbrClgCurr>
    </ptr001Grp3>

    <sumPoolIdTotal>5656.60</sumPoolIdTotal>
    <sumPoolIdClgCurr>5634.60</sumPoolIdClgCurr>
    </ptr001Grp1>

    <ptr001Grp1>
        <ptr001KeyGrp1>
            <poolId>POPXCLIENTNET</poolId>
            <currTypCod>NZD</currTypCod>
        </ptr001KeyGrp1>
    <ptr001Grp3>
        <ptr001KeyGrp3>
            <membChgIdCod>POP676</membChgIdCod>
            <membChgIdNam>SEEN TD</membChgIdNam>
        </ptr001KeyGrp3>
        <ptr001Rec>
            <acctTypFlexible>A9</acctTypFlexible>
            <prtMgnUnadj>8989.90</prtMgnUnadj>
            <prtMgnReqt>45656.90</prtMgnReqt>
            <ChgRat>1.000000</ChgRat>
            <totMgnClgCurr>1780688.90</totMgnClgCurr>
        </ptr001Rec>
        <sumChgMbrPrtMgbReq>8989.90</sumChgMbrPrtMgbReq>
        <sumChgMbrClgCurr>45656.90</sumChgMbrClgCurr>
    </ptr001Grp3>

    <sumPoolIdTotal>8989.90</sumPoolIdTotal>
    <sumPoolIdClgCurr>45656.90</sumPoolIdClgCurr>

    </ptr001Grp1>

    <ptr001Grp1>
        <ptr001KeyGrp1>
            <poolId>POPXSTANDARD</poolId>
            <currTypCod>MXM</currTypCod>
        </ptr001KeyGrp1>
    <ptr001Grp3>
        <ptr001KeyGrp3>
            <membChgIdCod>POP343</membChgIdCod>
            <membChgIdNam>ACMI</membChgIdNam>
        </ptr001KeyGrp3>
        <ptr001Rec>
            <acctTypFlexible>P1</acctTypFlexible>
            <prtMgnUnadj>345.50</prtMgnUnadj>
            <prtMgnReqt>4545.50</prtMgnReqt>
            <ChgRat>1.000000</ChgRat>
            <totMgnClgCurr>4545.50</totMgnClgCurr>
        </ptr001Rec>
        <sumChgMbrPrtMgbReq>345.50</sumChgMbrPrtMgbReq>
        <sumChgMbrClgCurr>4545.50</sumChgMbrClgCurr>
    </ptr001Grp3>
    <sumPoolIdTotal>345.50</sumPoolIdTotal>
    <sumPoolIdClgCurr>4545.50</sumPoolIdClgCurr>
    </ptr001Grp1>
    <sumClgMbrTotMgnClgCurr>700090.00</sumClgMbrTotMgnClgCurr>

</ptr001Grp>

</PTR001>
# Importing the required libraries
import xml.etree.ElementTree as Xet
import pandas as pd
  
cols = ["poolId", "currTypCod", "membChgIdCod", "membChgIdNam", "acctTypFlexible", "prtMgnReqt", "prtMgnReqt", "ChgRat", "totMgnClgCurr", "sumChgMbrPrtgbReq", "sumChgMbrClgCurr", "sumPoolIdTotal", "sumPoolIdClgCurr"]
rows = []
  
# Parsing the XML file
xmlparse = Xet.parse('myReport.xml')
root = xmlparse.getroot()
for i in root:
    poolId = i.find("poolId").text
    currTypCod = i.find("currTypCod").text
    membChgIdCod = i.find("membChgIdCod").text
    membChgIdNam = i.find("membChgIdNam").text
    acctTypFlexible = i.find("acctTypFlexible").text
    prtMgnReqt = i.find("prtMgnReqt").text
    ChgRat = i.find("ChgRat").text
    totMgnClgCurr = i.find("totMgnClgCurr").text
    sumChgMbrPrtgbReq = i.find("sumChgMbrPrtgbReq").text
    sumChgMbrClgCurr = i.find("sumChgMbrClgCurr").text
    sumPoolIdTotal = i.find("sumPoolIdTotal").text
    sumPoolIdClgCurr = i.find("sumPoolIdClgCurr").text
    
    rows.append({"poolId": poolId,
                 "currTypCod": currTypCod,
                 "membChgIdCod": membChgIdCod,
                 "membChgIdNam": membChgIdNam,
                 "acctTypFlexible": acctTypFlexible,
                 "prtMgnReqt": prtMgnReqt,
                 "ChgRat": ChgRat,
                 "totMgnClgCurr": totMgnClgCurr,
                 "sumChgMbrPrtgbReq":  sumChgMbrPrtgbReq,
                 "sumChgMbrClgCurr": sumChgMbrClgCurr,
                 "sumPoolIdTotal": sumPoolIdTotal,
                 "sumPoolIdClgCurr": sumPoolIdClgCurr})
  
df = pd.DataFrame(rows, columns=cols)
  
# Writing dataframe to csv
df.to_csv('myReport.csv')

Tags: textfindpoolidmembchgidnamsumchgmbrprtgbreqcurrtypcodaccttypflexibleprtmgnreqt
1条回答
网友
1楼 · 发布于 2024-09-30 08:28:09

我将把CSV列的排序留给您来解决。在这里搜索SO应该会得到很多帖子

from xml.etree import ElementTree as ET
import pandas as pd

data = '''\
<PTR001>
    <rptHdr>
        <chName>DEBSIL</chName>
        <envText>S</envText>
        <rptCod>PTR001</rptCod>
        <rptNam>DD01 Report</rptNam>
        <membId>POP</membId>
        <membLglNam>ABC LT</membLglNam>
        <rptPrntEffDat>2021-04-08</rptPrntEffDat>
        <rptPrntRunDat>2021-04-08</rptPrntRunDat>
    </rptHdr>
    <ptr001Grp>
        <ptr001KeyGrp>
            <membClgIdCod>UVBV</membClgIdCod>
            <membClgIdNam>YEZZE</membClgIdNam>
        </ptr001KeyGrp>
        <clgCurrTypCod>USD</clgCurrTypCod>
        <ptr001Grp1>
            <ptr001KeyGrp1>
                <poolId>CSBXCLIENTECM</poolId>
                <currTypCod>USD</currTypCod>
            </ptr001KeyGrp1>
            <ptr001Grp3>
                <ptr001KeyGrp3>
                    <membChgIdCod>POP123</membChgIdCod>
                    <membChgIdNam>ABC LT123</membChgIdNam>
                </ptr001KeyGrp3>
                <ptr001Rec>
                    <acctTypFlexible>A1</acctTypFlexible>
                    <prtMgnUnadj>5656.60</prtMgnUnadj>
                    <prtMgnReqt>5634.60</prtMgnReqt>
                    <ChgRat>1.000000</ChgRat>
                    <totMgnClgCurr>34543.60</totMgnClgCurr>
                </ptr001Rec>
                <sumChgMbrPrtMgbReq>5656.60</sumChgMbrPrtMgbReq>
                <sumChgMbrClgCurr>5634.60</sumChgMbrClgCurr>
            </ptr001Grp3>
            <sumPoolIdTotal>5656.60</sumPoolIdTotal>
            <sumPoolIdClgCurr>5634.60</sumPoolIdClgCurr>
        </ptr001Grp1>
        <ptr001Grp1>
            <ptr001KeyGrp1>
                <poolId>POPXCLIENTNET</poolId>
                <currTypCod>NZD</currTypCod>
            </ptr001KeyGrp1>
            <ptr001Grp3>
                <ptr001KeyGrp3>
                    <membChgIdCod>POP676</membChgIdCod>
                    <membChgIdNam>SEEN TD</membChgIdNam>
                </ptr001KeyGrp3>
                <ptr001Rec>
                    <acctTypFlexible>A9</acctTypFlexible>
                    <prtMgnUnadj>8989.90</prtMgnUnadj>
                    <prtMgnReqt>45656.90</prtMgnReqt>
                    <ChgRat>1.000000</ChgRat>
                    <totMgnClgCurr>1780688.90</totMgnClgCurr>
                </ptr001Rec>
                <sumChgMbrPrtMgbReq>8989.90</sumChgMbrPrtMgbReq>
                <sumChgMbrClgCurr>45656.90</sumChgMbrClgCurr>
            </ptr001Grp3>
            <sumPoolIdTotal>8989.90</sumPoolIdTotal>
            <sumPoolIdClgCurr>45656.90</sumPoolIdClgCurr>
        </ptr001Grp1>
        <ptr001Grp1>
            <ptr001KeyGrp1>
                <poolId>POPXSTANDARD</poolId>
                <currTypCod>MXM</currTypCod>
            </ptr001KeyGrp1>
            <ptr001Grp3>
                <ptr001KeyGrp3>
                    <membChgIdCod>POP343</membChgIdCod>
                    <membChgIdNam>ACMI</membChgIdNam>
                </ptr001KeyGrp3>
                <ptr001Rec>
                    <acctTypFlexible>P1</acctTypFlexible>
                    <prtMgnUnadj>345.50</prtMgnUnadj>
                    <prtMgnReqt>4545.50</prtMgnReqt>
                    <ChgRat>1.000000</ChgRat>
                    <totMgnClgCurr>4545.50</totMgnClgCurr>
                </ptr001Rec>
                <sumChgMbrPrtMgbReq>345.50</sumChgMbrPrtMgbReq>
                <sumChgMbrClgCurr>4545.50</sumChgMbrClgCurr>
            </ptr001Grp3>
            <sumPoolIdTotal>345.50</sumPoolIdTotal>
            <sumPoolIdClgCurr>4545.50</sumPoolIdClgCurr>
        </ptr001Grp1>
        <sumClgMbrTotMgnClgCurr>700090.00</sumClgMbrTotMgnClgCurr>
    </ptr001Grp>
</PTR001>
'''

tree = ET.fromstring(data)
header = tree.find('rptHdr')
hd = {}
for c in header:
    hd[c.tag] = c.text.strip() if c.text else ''

group = tree.find('ptr001Grp')
gpaths = ['ptr001KeyGrp/*', 'clgCurrTypCod', 'sumClgMbrTotMgnClgCurr']
gd = {}
for x in gpaths:
    for c in group.findall(x):
        gd[c.tag] = c.text.strip() if c.text else ''

rows = []

dpaths = [
    'ptr001KeyGrp1/poolId', 'ptr001KeyGrp1/currTypCod',
    'ptr001Grp3/ptr001KeyGrp3/*', 'ptr001Grp3/ptr001Rec/*',
    'ptr001Grp3/sumChgMbrPrtMgbReq', 'ptr001Grp3/sumChgMbrClgCurr'
]
for e in group.findall('ptr001Grp1'):
    d = {}
    for x in dpaths:
        for c in e.findall(x):
            d[c.tag] = c.text.strip() if c.text else ''
    d.update(gd)
    d.update(hd)
    rows.append(d)

df = pd.DataFrame(rows)

print(df[['poolId', 'currTypCod', 'sumChgMbrPrtMgbReq', 'sumChgMbrClgCurr']])

import sys

df.to_csv(sys.stdout, index=False)

相关问题 更多 >

    热门问题