Python：在数千个字典/XML/JSON 文件之间比较并统计字典结构

{ "Header": { "Ts": {}, "PeriodEndDt": {}, "PreparedBy": { "PreparerID": {}, "PreparerFirmName": { "BusinessNameLine1Txt": {} }, "PreparerAddress": { "AddLn1Txt": {}, "CityName": {}, "StateAbbreviationCd": {}, "ZIPCd": {} } }, "FormTypeCd": {}, "PeriodBeginDt": {}, "Filer": { "UniqueID": {}, "BusinessName": { "BusinessNameLine1Txt": {} }, "BusinessNameControlTxt": {}, "PhoneNum": {}, "USAddress": { "AddressLine1Txt": {}, "CityNm": {}, "StateAbbreviationCd": {}, "ZIPCd": {} } }, "FormData": { "FormCodeType": { "BizType": {}, "AssetsAtEOY": {}, "AccountingMethod": {}, "RevenueAndExpenses": { "ScheduleBNotReqd": {}, "DivsRevAndExpenses": {}, "DivsNetInvstIncomeAmt": {}, "NetGainSaleAstRevAndExpnssAmt": {}, "RevsOvrExpenses": {}, "NetInvestmentIncomeAmt": {} }, "BalanceSheetGroup": { "CashInvstBOYAmt": {}, "CashInvstEOYAmt": {}, "CashInvstEOYFMVAmt": {}, "OtherInvestmentsBOYAmt": {}, "OtherInvestmentsEOYAmt": {}, "CapitalStockEOYAmt": {}, "TotalLiabilitiesNetAstEOYAmt": {} }, "ChangeNetAssetsFundGroup": { "NetAssettFundBalancesBOYAmt": {}, "ExcessRevExpensesAmt": {}, "OtherIncreasesAmt": {}, "SubtotalAmt": {}, "OtherDecreasesAmt": {}, "TotNetAstOrFundBalancesEOYAmt": {} }, "CapGainsLossTxInvstIncmDetail": { "CapGainsLossTxInvstIncmGrp": { "PropertyDesc": {}, "HowAcquiredCd": {}, "GrossSalesPriceAmt": {}, "GainOrLossAmt": {}, "GainsMinusExcessOrLossesAmt": {} }, "StatementsRegardingActyGrp": { "LegislativePoliticalActyInd": {}, "MoreThan100SpentInd": {} }, "PhoneNum": {}, "LocationOfBooksUSAddress": { "AddressLine1Txt": {}, "CityNm": {}, "StateAbbreviationCd": {}, "ZIPCd": {} }, "CorporateDirectorsGrp": { "DirectorsGrp": { "PersonNm": {}, "USAddress": { "AddressLine1Txt": {}, "CityNm": {}, "StateAbbreviationCd": {}, "ZIPCd": {} }, "EmpPrograms": { "EmployeeBenefitGroupNum": {}, "GroupType": { "GroupElement": {}, "GroupCharacter": { "GroupNames": {} } } }, "EmpOffice1": {}, "EmpOffice2": {}, "EmpOffice3": {}, "EmpOffice4": {} } } } } } } }

import xml.etree.ElementTree as ET


def strip_ns(xx):
    """Return the tag name with a leading ``{namespace}`` prefix removed.

    Tags that carry no namespace are returned unchanged; indexing the split
    result with ``[1]`` raised IndexError for them.
    """
    return str(xx).split('}', 1)[-1]


def build_tiers(element, depth):
    """Return a nested dict of namespace-stripped child tag names.

    Args:
        element: The element whose children are recorded.
        depth: How many levels below ``element`` to record; the deepest
            recorded level maps each tag to an empty dict.

    Returns:
        ``{tag: {child_tag: {...}}}``. Repeated sibling tags collapse into a
        single key whose value comes from the last occurrence.
    """
    if depth <= 0:
        return {}
    return {strip_ns(child.tag): build_tiers(child, depth - 1)
            for child in element}


tree = ET.parse('xmlpath.xml')
root = tree.getroot()

# Four tiers below the root, matching the four hand-written nested loops this
# replaces; raise the depth to capture deeper structures.
tierdict = build_tiers(root, 4)

1条回答

网友

1楼 · 发布于 2024-10-02 12:36:20

我可能会对您想要的元素进行递归搜索，定义如下：

def get_elements(json_entry, child_elements=None):
    """Accumulate per-tag occurrence counts for a list of sibling elements.

    For every element in ``child_elements`` the record ``json_entry[tag]`` is
    created if missing, its ``"Count"`` is incremented, and the element's own
    children are merged into that same record recursively. Existing records
    are updated in place, so calling this repeatedly with the same
    ``json_entry`` (e.g. once per file) accumulates counts.

    Args:
        json_entry: Dict mapping tag -> record; mutated in place.
        child_elements: Iterable of elements exposing ``.tag`` and iteration
            over their direct children. ``None`` means no elements.

    Returns:
        ``json_entry``, with records shaped like
        ``{"Count": n, <child tag>: {"Count": m, ...}, ...}``.

    Note:
        A child element literally tagged ``Count`` would collide with the
        counter key; that case is not handled.
    """
    if child_elements is None:
        return json_entry

    # Iterate over the siblings and recurse only into children, so the
    # recursion depth follows the XML nesting depth, not the sibling count.
    for el in child_elements:
        rec = json_entry.get(el.tag)
        if rec is None:
            rec = json_entry[el.tag] = {"Count": 0}
        rec["Count"] = rec.get("Count", 0) + 1

        # list(el) yields the direct children; it replaces the deprecated
        # Element.getchildren(). Merging into ``rec`` keeps counts gathered
        # from earlier occurrences of the same tag.
        get_elements(rec, list(el))

    return json_entry

这样，您只需传递xml的根元素：

from lxml import etree

# Open in binary mode so the parser itself honours the encoding declared in
# the XML prolog instead of relying on the locale's text decoding.
with open("myxml.xml", "rb") as fh:
    tree = etree.parse(fh)

root = tree.getroot()

# list(element) yields the direct children; it replaces the deprecated
# Element.getchildren().
root_children = list(root)

# Nested {tag: {"Count": ..., <child tags>...}} summary of everything below
# the root element.
child_recs = get_elements({}, root_children)

{'tagOne': {'Count': 1}, 'tagTwo': {'Count': 1, 'tagThree': {'Count': 1}, 'tagFour': {'Count': 1, 'tagFive': {'Count': 1}}}}

如果想把根元素也包含进来，作为结果的最外层，可以这样做：

# Wrap the child records under the root tag: the root counts once, then the
# per-child records are layered on top of that counter.
master_lookup = {root.tag: {"Count": 1}}
master_lookup[root.tag].update(child_recs)

这很容易扩展为一个遍历多个文件的 for 循环：

import os

master_lookup = {}

# os.walk yields (dirpath, dirnames, filenames) triples, so each file name has
# to be joined onto its directory before it can be opened.
for dirpath, _dirnames, filenames in os.walk(path):
    for filename in filenames:
        # Binary mode lets the parser honour the encoding declared in the file.
        with open(os.path.join(dirpath, filename), "rb") as fh:
            tree = etree.parse(fh)

        root = tree.getroot()
        root_entry = master_lookup.get(root.tag, {"Count": 0})

        # list(root) yields the direct children; it replaces the deprecated
        # Element.getchildren().
        root_children = list(root)

        # Each parsed file contributes one occurrence of its root tag.
        root_count = root_entry.pop("Count") + 1

        master_lookup[root.tag] = {
            "Count": root_count,
            **get_elements({**root_entry}, root_children),
        }

这样解释清楚了吗？

相关问题更多 >

编程相关推荐

热门问题

热门文章