我有两个轻量级 XML 解析器(见下文)。第一个(一)效率更高,但它的结果与 pandas 交互时会出现奇怪的行为。第二个(二)较慢,但不会引发异常的错误。问题是,我找不出这两者究竟在什么地方产生了不同的结果。有人知道原因吗?
(一)
# Parser (1): split every "<Record ... />" line on single quotes, so the
# pieces alternate between "name=" fragments (even positions) and attribute
# values (odd positions). One dict per record is collected in `keep`.
# NOTE(review): `inputfile` and `id17s` are not defined in this snippet; they
# are assumed to already exist (a file path and a list) -- confirm.
keep = []
# A context manager closes the file handle as soon as parsing is finished;
# the bare open() used previously left the handle open.
with open(inputfile, 'r') as handle:
    for a in handle:
        if not a.startswith("<Record"):
            continue
        line = a.strip().split("'")
        line[0] = line[0].replace("<Record ", '')
        line.pop()  # deletes the final "/>"
        store = {}
        # Step by two: line[x] is the attribute name, line[x+1] its value.
        for x in range(0, len(line), 2):
            if not line[x]:
                continue
            # Remove surrounding whitespace and the trailing "=" from the name.
            line[x] = line[x].strip().rstrip("=")
            print(line[x])
            print(line[x+1])
            store[line[x]] = line[x+1]
            if line[x] == "id17":
                # The id17 values later serve as the DataFrame index.
                id17s.append(line[x+1])
        keep.append(store)
doc = pandas.DataFrame(keep, index=id17s)  # contains all XML records.
(二)
# Parser (2): split every "<Record ... />" line on "' " (a quote followed by a
# space), then split each piece once on "=" into an attribute name and value.
# One dict per record is collected in `keep`.
# NOTE(review): `inputfile` and `id17s` are not defined in this snippet; they
# are assumed to already exist (a file path and a list) -- confirm.
keep = []
# A context manager closes the file handle as soon as parsing is finished;
# the bare open() used previously left the handle open.
with open(inputfile, 'r') as handle:
    for a in handle:
        if not a.startswith("<Record"):
            continue
        line = a.strip().split("' ")
        line[0] = line[0].replace("<Record ", '')
        # Discards the last piece. If the record ends in "'/>" with no space
        # before the slash, that piece still holds the final attribute, which
        # is therefore dropped as well.
        line.pop()
        store = {}
        for piece in line:
            if not piece:
                continue
            try:
                # Remove every quote, then split on the first "=" only so that
                # "=" characters inside the value are preserved.
                trait, evalx = piece.replace("'", '').strip().split('=', 1)
            except ValueError:
                # The piece did not split into exactly two parts (no "=" in
                # it): show the offending record and stop.
                print(line)
                print(piece.replace("'", '').strip().split('='))
                sys.exit()
            store[trait] = evalx
            if trait == "id17":
                # The id17 values later serve as the DataFrame index.
                id17s.append(evalx)
        keep.append(store)
doc = pandas.DataFrame(keep, index=id17s)  # contains all XML records.
两个解析器之后都接着执行以下代码:
# Run pandas.to_numeric over each column of `doc`. With the 'ignore' error
# mode, input that cannot be parsed as numeric is returned unchanged.
doc_df = doc.apply(pandas.to_numeric, errors='ignore')
# Column labels of the converted frame as a plain list (the header).
header = list(doc_df.columns)
目前没有回答
相关问题 更多 >
编程相关推荐