如何使用python minidom从XML中提取数据

2024-09-30 06:19:49 发布

您现在位置:Python中文网/ 问答频道 /正文

我想从下面这个XML文件中提取数据。但是,从<LandmarkPointListXml>开始往后的数据我提取不出来。

XML文件:

  <?xml version="1.0" encoding="utf-8"?>
  <Map xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
  <MapName>er</MapName>
  <MapURL>er.gif</MapURL>
  <Name>er</Name>
  <URL>er.gif</URL>
  <LandmarkPointListXml>
    <anyType xsi:type="LandmarkPointProperty">
      <LandmarkPointX>400</LandmarkPointX>
      <LandmarkPointY>292</LandmarkPointY>
      <LandmarkDesc>my room door</LandmarkDesc>
    </anyType>
    <anyType xsi:type="LandmarkPointProperty">
      <LandmarkPointX>399</LandmarkPointX>
      <LandmarkPointY>219</LandmarkPointY>
      <LandmarkDesc>bro room door</LandmarkDesc>
    </anyType>
  </LandmarkPointListXml>
  <RegionPointListXml />
</Map>

Python程序:

^{pr2}$

在程序到达<LandmarkPointListXml>之前,我能取回结果“er,er.gif,er,er.gif”。


Tags: 文件 数据 http map www xml gif er
3条回答

这段代码相当脆弱。它对XML输入做了很强的假设,如果以合法的方式修改XML(例如,某个元素不再紧跟在另一个元素之后),代码就会失败。

我建议在解析XML时使用现成的库,比如elementtree(http://docs.python.org/library/xml.etree.elementtree.html)或lxml(http://lxml.de),后者还可以验证XML输入。

下面我编写的代码使用ElementTree来处理您的XML输入(我去掉了所属类的“self”参数)。它还能容忍(忽略)XML元素中的空值。

import xml.etree.ElementTree as ET

def GetMapData( xmlfile ):
    """Flatten the direct children of an XML file's root into one string.

    The text of each direct child of the root element is appended followed
    by a comma.  A ``LandmarkPointListXml`` child is instead expanded with
    ``loopLandmark`` and wrapped in ``|`` separators.  Children whose text
    is ``None`` (for example ``<RegionPointListXml />``) are skipped.

    Args:
        xmlfile: Source handed unchanged to ``ET.parse``.

    Returns:
        The assembled string, or an empty string when the file cannot be
        read.  Only ``IOError`` is handled; errors raised for malformed
        XML are not caught here.
    """
    try:
        tree = ET.parse( xmlfile )
    except IOError as e:
        # "except ... as" and a parenthesised single-argument print are
        # accepted by both Python 2.6+ and Python 3.
        print("Failure Parsing %s: %s" % (xmlfile, e))
        # Without a parsed tree there is nothing to walk; falling through
        # would fail on the unbound name ``tree``.
        return ""

    parts = []
    for child in tree.getroot():
        if child.tag == 'LandmarkPointListXml':
            parts.append('|' + loopLandmark(child) + '|')
        elif child.text is not None:
            parts.append(child.text + ',')
    return "".join(parts)

def loopLandmark( landmarks ):
    """Join the X/Y coordinate text found under each ``anyType`` child.

    Args:
        landmarks: Element whose direct children are scanned.

    Returns:
        A string in which every non-empty ``LandmarkPointX`` or
        ``LandmarkPointY`` text is followed by a comma, in document
        order.  Other tags (such as ``LandmarkDesc``) are ignored.
    """
    coordinate_tags = [ 'LandmarkPointX', 'LandmarkPointY' ]
    pieces = []
    for entry in landmarks:
        # NOTE(review): only the tag is checked; the xsi:type attribute
        # ("LandmarkPointProperty") is not verified.
        if entry.tag != 'anyType':
            continue
        for field in entry:
            if field.text and field.tag in coordinate_tags:
                pieces.append(field.text + ',')
    return "".join(pieces)

# Example invocation on the file 'xml.in'; the returned string is not used here.
GetMapData( 'xml.in' )

我设法从发布的XML文件中提取出了数据。但我觉得应该有比我给出的答案更简单的方法。为了得到每一项数据,需要做很多循环。

import sys
import socket
import os
from xml.dom.minidom import Document, parse, parseString

class mapDataClass:
    """Hold a minidom ``Map`` document and flatten it into one string.

    A new instance starts with an empty document containing only a
    ``Map`` root element; ``LoadXMLFile`` replaces it with a parsed file.
    """

    def __init__(self):
        self.XMLdoc = Document()
        self.MakeRootNode()

    def MakeRootNode(self):
        """Create the ``Map`` root element and append it to the document."""
        self.RootNode = self.XMLdoc.createElement('Map')
        self.XMLdoc.appendChild(self.RootNode)

    def GetXML_Doc(self):
        """Return the minidom document currently held."""
        return self.XMLdoc

    def LoadXMLFile(self, AbsFileName):
        """Parse ``AbsFileName`` and make it the current document.

        Args:
            AbsFileName: Path of the XML file to parse.

        Returns:
            True when the file was parsed, False when it could not be
            opened (a message is printed in that case).  Errors raised
            for malformed XML are not caught here.
        """
        try:
            # Parse first so that a failure leaves the current document
            # untouched instead of already unlinked.
            new_doc = parse(AbsFileName)
        except IOError:
            print('File ' + AbsFileName + ' not found')
            return False

        self.XMLdoc.unlink()
        self.XMLdoc = new_doc
        root = new_doc.documentElement
        # Only adopt the root element when it is the expected <Map>.
        if root is not None and root.nodeName == 'Map':
            self.RootNode = root
        return True

    @staticmethod
    def _node_text(node):
        """Return the value of ``node``'s first child, or "" when absent."""
        first = node.firstChild
        if first is None or first.nodeValue is None:
            return ""
        return first.nodeValue

    def GetMapData(self):
        """Flatten the document into ``fields|landmarks|EMPTY`` form.

        Returns:
            The text of each simple child element of the root joined by
            commas (``URL`` is not followed by a comma), then ``|``, then
            the output of ``loopLandmark`` for ``LandmarkPointListXml``
            (empty when that element is absent), then the literal
            ``EMPTY``.
        """
        fields = ""
        landmarks = ""  # stays empty when no landmark list is present
        root = self.XMLdoc.documentElement
        children = root.childNodes if root is not None else []
        for child in children:
            # Pretty-printed XML puts whitespace text nodes between the
            # elements; they carry no data and have no firstChild.
            if child.nodeType != child.ELEMENT_NODE:
                continue
            name = child.nodeName
            if name == 'LandmarkPointListXml':
                landmarks = self.loopLandmark(child)
            elif name == 'RegionPointListXml':
                print('Empty')
            elif name == 'URL':
                fields = fields + self._node_text(child)
            else:
                fields = fields + self._node_text(child) + ','
        return fields + "|" + landmarks + "EMPTY"

    def loopLandmark(self, landmarks):
        """Format every ``anyType`` element found under ``landmarks``.

        Returns:
            The ``loopAnyType`` results separated by ``;`` and terminated
            by ``|``, or an empty string when there is no ``anyType``
            element.
        """
        entries = [self.loopAnyType(node)
                   for node in landmarks.getElementsByTagName('anyType')]
        if not entries:
            return ""
        return ";".join(entries) + "|"

    def loopAnyType(self, anyType):
        """Return ``X,Y,Desc`` for one landmark element.

        Raises:
            IndexError: when one of the three expected child tags is
                missing from ``anyType``.
        """
        tags = ('LandmarkPointX', 'LandmarkPointY', 'LandmarkDesc')
        return ",".join(
            self._node_text(anyType.getElementsByTagName(tag)[0])
            for tag in tags
        )

# Example driver: load a map file and print its flattened contents.
profile = mapDataClass()
# Raw string keeps the backslash literal; "\e" is not a valid escape.
boolean = profile.LoadXMLFile(r'upload\er.m')
print(boolean)
# Only query the document when loading succeeded.
if boolean:
    result = profile.GetMapData()
    print(result)

我之前的回答还不完整。下面这个版本我认为应该可以了。

import sys
import socket
import os
from xml.dom.minidom import Document, parse, parseString, Node

class mapDataClass:
    """Hold a minidom ``Map`` document and flatten it into one string.

    A new instance starts with an empty document containing only a
    ``Map`` root element; ``LoadXMLFile`` replaces it with a parsed file.
    """

    def __init__(self):
        self.XMLdoc = Document()
        self.MakeRootNode()

    def MakeRootNode(self):
        """Create the ``Map`` root element and append it to the document."""
        self.RootNode = self.XMLdoc.createElement('Map')
        self.XMLdoc.appendChild(self.RootNode)

    def GetXML_Doc(self):
        """Return the minidom document currently held."""
        return self.XMLdoc

    def LoadXMLFile(self, AbsFileName):
        """Parse ``AbsFileName`` and make it the current document.

        Args:
            AbsFileName: Path of the XML file to parse.

        Returns:
            True when the file was parsed, False when it could not be
            opened (a message is printed in that case).  Errors raised
            for malformed XML are not caught here.
        """
        try:
            # Parse before unlinking so a failed load keeps the current
            # document usable.
            new_doc = parse(AbsFileName)
        except IOError:
            print('File ' + AbsFileName + ' not found')
            return False

        self.XMLdoc.unlink()
        self.XMLdoc = new_doc
        root = new_doc.documentElement
        if root is not None and root.nodeName == 'Map':
            self.RootNode = root
        return True

    @staticmethod
    def _node_text(node):
        """Return the value of ``node``'s first child, or "" when absent."""
        first = node.firstChild
        if first is None or first.nodeValue is None:
            return ""
        return first.nodeValue

    @staticmethod
    def _has_element_children(node):
        """Tell whether ``node`` has at least one element child."""
        return any(child.nodeType == Node.ELEMENT_NODE
                   for child in node.childNodes)

    def GetMapData(self):
        """Flatten the document into ``fields|landmarks|regions`` form.

        Returns:
            The text of each simple child element of the root joined by
            commas (``URL`` is not followed by a comma), then ``|``, then
            the landmark section (``EMPTY|`` when the landmark list has
            no element children), then ``EMPTY`` when the region list has
            no element children.
        """
        fields = ""
        landmarks = ""
        regions = ""
        root = self.XMLdoc.documentElement
        children = root.childNodes if root is not None else []
        for child in children:
            # Whitespace between elements shows up as text nodes; skip them.
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            name = child.nodeName
            if name == 'LandmarkPointListXml':
                # A list holding only whitespace counts as empty too.
                if self._has_element_children(child):
                    landmarks = self.loopLandmark(child)
                else:
                    landmarks = 'EMPTY|'
            elif name == 'RegionPointListXml':
                if not self._has_element_children(child):
                    regions = 'EMPTY'
            elif name == 'URL':
                fields = fields + self._node_text(child)
            else:
                fields = fields + self._node_text(child) + ','
        return fields + "|" + landmarks + regions

    def loopLandmark(self, landmarks):
        """Format every ``anyType`` element found under ``landmarks``.

        Returns:
            The ``loopAnyType`` results separated by ``;`` and terminated
            by ``|``, or an empty string when there is no ``anyType``
            element.
        """
        entries = [self.loopAnyType(node)
                   for node in landmarks.getElementsByTagName('anyType')]
        if not entries:
            return ""
        return ";".join(entries) + "|"

    def loopAnyType(self, anyType):
        """Return ``X,Y,Desc`` for one landmark element.

        Raises:
            IndexError: when one of the three expected child tags is
                missing from ``anyType``.
        """
        tags = ('LandmarkPointX', 'LandmarkPointY', 'LandmarkDesc')
        return ",".join(
            self._node_text(anyType.getElementsByTagName(tag)[0])
            for tag in tags
        )

# Example driver: load a map file and print its flattened contents.
data = mapDataClass()
# Raw string keeps the backslash literal; "\h" is not a valid escape.
success = data.LoadXMLFile(r"upload\homeTest.m")
if success:
    print("file loaded")
    print(data.GetMapData())
else:
    print("no such file found")

相关问题 更多 >

    热门问题