import pandas as pd
from io import StringIO
# Mimic opening a file with StringIO.
in_file = StringIO(u"""\
key1=abc||key2=ajdskj||name=ankush||contact=123444
key1=def||name=reddy||contact=456778
key1=aef||address=ashaskawe||name=john
""")

# First pass: parse each "key=value||key=value" line into a dict and record
# every field name in first-seen order, so the resulting column order is
# deterministic (a set would give an arbitrary order).
all_line_dicts = []
all_fields = []
for line in in_file:
    line = line.strip()
    if not line:
        # A blank line carries no pairs and would make dict() raise.
        continue
    # Split on the first '=' only, so a value may itself contain '='.
    pairs = [pair.split('=', 1) for pair in line.split('||')]
    all_line_dicts.append(dict(pairs))
    for pair in pairs:
        if pair[0] not in all_fields:
            all_fields.append(pair[0])

# Second pass: build one column per field, using 'NULL' wherever a line
# did not supply that field.
field_dicts = {
    field: [line_dict.get(field, 'NULL') for line_dict in all_line_dicts]
    for field in all_fields
}

# Convert to a pandas DataFrame; `columns` pins the first-seen field order.
df = pd.DataFrame(field_dicts, columns=all_fields)
print(df)  # <-- look at it
df.to_csv('test.csv', index=False)  # <-- save it as a CSV
from collections import defaultdict, OrderedDict
# Rewrite the "key=value||key=value" records of messy_data.txt as a
# "||"-delimited table in cleaner_data.txt: one header line listing every
# key in first-seen order, then one line per record with 'NULL' for any
# key the record does not have.
with open('messy_data.txt') as infile, open('cleaner_data.txt', 'w') as outfile:
    records = []
    headers = []
    seen_headers = set()  # O(1) membership test alongside the ordered list
    for line in infile:
        line = line.strip()
        if not line:
            # A blank line carries no pairs and would make dict() raise.
            continue
        # Split on the first "=" only, so a value may itself contain "=".
        pairs = [pair.split("=", 1) for pair in line.split("||")]
        records.append(dict(pairs))
        for pair in pairs:
            key = pair[0]
            if key not in seen_headers:
                seen_headers.add(key)
                headers.append(key)

    outfile.write("||".join(headers) + "\n")
    for record in records:
        row = [record.get(header, 'NULL') for header in headers]
        outfile.write("||".join(row) + "\n")
这将产生:
^{pr2}$
编辑:
更易于调试的脚本:
from collections import defaultdict, OrderedDict
# Debug-friendly variant: rewrite the "key=value||key=value" records of
# messy_data.txt as a "||"-delimited table in cleaner_data.txt. Lines that
# cannot be parsed are printed and skipped instead of aborting the run.
with open('messy_data.txt') as infile, open('cleaner_data.txt', 'w') as outfile:
    whole_data2 = []
    headers = []
    for line in infile:
        # Split on the first "=" only, so a value may itself contain "=".
        temp_list = [y.split("=", 1) for y in line.strip().split("||")]
        try:
            temp_dict = dict(temp_list)
        except ValueError:
            # A token without "=" (including a blank line) yields a
            # one-element item that dict() rejects; show it and move on.
            print(temp_list)
            continue
        whole_data2.append(temp_dict)
        # Collect headers only from rows that parsed, so a skipped row
        # cannot leave behind a column that is 'NULL' everywhere.
        for pair in temp_list:
            if pair[0] not in headers:
                headers.append(pair[0])

    # One column of values per header, in first-seen header order.
    output = OrderedDict(
        (header, [d.get(header, 'NULL') for d in whole_data2])
        for header in headers
    )
    print(output)
    outfile.write("||".join(output.keys()) + "\n")
    # Transpose the columns back into rows for writing.
    for row in zip(*output.values()):
        outfile.write("||".join(row) + "\n")
Pandas 解决方案:
正在读取文件:
制作数据帧:
^{pr2}$我不确定 Pandas 本身能否做到这一点,但我自己用了一种较长的方法(其实也没那么糟)把各个键拆分出来。
代码:
输出:
^{pr2}$这是一种使用标准库中的工具并维护列顺序的方法。
messy_data.txt
文件包含原始数据,cleaner_data.txt
是保存清理后数据的位置。这将产生:
^{pr2}$编辑:
更易于调试的脚本:
希望这对你有帮助。
相关问题 更多 >
编程相关推荐