我将成千上万的XML文件解析成字典,并将其结构存储在JSON中。你知道吗
它们的结构基本相同,但不同的标记命名方案的数目不详。在这数千个文件中,有各种不同的缩写用于命名标记。你知道吗
我需要找出有多少不同的标签来描述每一条信息,正确地解析它们。你知道吗
为此,我想创建一个XMLs/字典的主字典,其中包括标记名的所有变体,最好是它们在数千个XMLs/字典中的计数。你知道吗
以下是其中一本词典的小样本:
{
"Header": {
"Ts": {},
"PeriodEndDt": {},
"PreparedBy": {
"PreparerID": {},
"PreparerFirmName": {
"BusinessNameLine1Txt": {}
},
"PreparerAddress": {
"AddLn1Txt": {},
"CityName": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
}
},
"FormTypeCd": {},
"PeriodBeginDt": {},
"Filer": {
"UniqueID": {},
"BusinessName": {
"BusinessNameLine1Txt": {}
},
"BusinessNameControlTxt": {},
"PhoneNum": {},
"USAddress": {
"AddressLine1Txt": {},
"CityNm": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
}
},
"FormData": {
"FormCodeType": {
"BizType": {},
"AssetsAtEOY": {},
"AccountingMethod": {},
"RevenueAndExpenses": {
"ScheduleBNotReqd": {},
"DivsRevAndExpenses": {},
"DivsNetInvstIncomeAmt": {},
"NetGainSaleAstRevAndExpnssAmt": {},
"RevsOvrExpenses": {},
"NetInvestmentIncomeAmt": {}
},
"BalanceSheetGroup": {
"CashInvstBOYAmt": {},
"CashInvstEOYAmt": {},
"CashInvstEOYFMVAmt": {},
"OtherInvestmentsBOYAmt": {},
"OtherInvestmentsEOYAmt": {},
"CapitalStockEOYAmt": {},
"TotalLiabilitiesNetAstEOYAmt": {}
},
"ChangeNetAssetsFundGroup": {
"NetAssettFundBalancesBOYAmt": {},
"ExcessRevExpensesAmt": {},
"OtherIncreasesAmt": {},
"SubtotalAmt": {},
"OtherDecreasesAmt": {},
"TotNetAstOrFundBalancesEOYAmt": {}
},
"CapGainsLossTxInvstIncmDetail": {
"CapGainsLossTxInvstIncmGrp": {
"PropertyDesc": {},
"HowAcquiredCd": {},
"GrossSalesPriceAmt": {},
"GainOrLossAmt": {},
"GainsMinusExcessOrLossesAmt": {}
},
"StatementsRegardingActyGrp": {
"LegislativePoliticalActyInd": {},
"MoreThan100SpentInd": {}
},
"PhoneNum": {},
"LocationOfBooksUSAddress": {
"AddressLine1Txt": {},
"CityNm": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
},
"CorporateDirectorsGrp": {
"DirectorsGrp": {
"PersonNm": {},
"USAddress": {
"AddressLine1Txt": {},
"CityNm": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
},
"EmpPrograms": {
"EmployeeBenefitGroupNum": {},
"GroupType": {
"GroupElement": {},
"GroupCharacter": {
"GroupNames": {}
}
}
},
"EmpOffice1": {},
"EmpOffice2": {},
"EmpOffice3": {},
"EmpOffice4": {}
}
}
}
}
}
}
}
首先,我用来创建dictionaries/JSON的代码如下:
import xml.etree.ElementTree as ET
strip_ns = lambda xx: str(xx).split('}', 1)[1]
tree = ET.parse('xmlpath.xml')
root = tree.getroot()
tierdict = {}
for tier1 in root:
tier1var = strip_ns(tier1.tag)
tierdict[tier1var] = {}
for tier2 in tier1:
tier2var = strip_ns(tier2.tag)
tierdict[tier1var][tier2var] = {}
for tier3 in tier2:
tier3var = strip_ns(tier3.tag)
tierdict[tier1var][tier2var][tier3var] = {}
for tier4 in tier3:
tier4var = strip_ns(tier4.tag)
tierdict[tier1var][tier2var][tier3var][tier4var] = {}
我想看到的输出是:
{
"Header": {
"Header.Count": 5672,
"Ts": {
"Ts.Count": 3365
},
"Ss": {
"Ss.Count": 2328
},
我可能会对您想要的元素进行递归搜索,定义如下:
这样,您只需传递xml的根元素:
如果要将根元素环绕在它周围,请按如下方式操作:
这可以很容易地扩展到许多文件的
for
循环有这样的意思吗
相关问题 更多 >
编程相关推荐