Python:合并多个键并转换为字典

2024-10-03 17:26:31 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在 Python 2.6.6 中导入一个 txt 文件,需要做一些数据整理(data wrangling)。我是 Python 新手,每一步都在靠谷歌搜索摸索着完成。你能帮忙或者给点建议吗?

下面是我的输入myData.txt,如下所示。标题不在数据中,但为了便于阅读,我将其放在这里

key1|key2|group|v1|v2|v3|v4
1|A|-1|10|100|1|2
1|A|2|20|35|2|3
1|B|1|15|5|3|5
2|B|5|23|25|4|2
2|B|2|33|20|22|98    
2|D|4|23|21|20|32
...

下面是我希望在 pandas 数据框中得到的输出。基本上,我想把 key1 和 key2 合并成一个组合键,然后将 group、v1 和 v2 放入一个字典中:以 group 作为键,以 [v1, v2] 列表作为值(v1 是第一个元素,v2 是第二个元素)。输出中不需要 v3 或 v4。

     comboKey1  new_v1
     1_A        {"-1":[10,100], "2":[20,35]}
     1_B        {"1":[15,5]}
     2_B        {"2":[33,20], "5":[23,25]}
     2_D        {"4":[23,21]}

这是我现在拥有的。有人能给点建议吗

import pandas as pd

# Each row of myData.txt has seven '|'-separated fields and the file has no
# header line. Loading all seven columns and then assigning only five names
# fails with a length mismatch, so pick the first five fields while parsing
# and name them in the same call; v3 and v4 are never loaded.
df1 = pd.read_csv('myData.txt', header=None, sep='|',
                  usecols=[0, 1, 2, 3, 4],
                  names=['key1', 'key2', 'group', 'v1', 'v2'])
# Build the combined key, e.g. key1=1 and key2='A' -> '1_A'. Both parts are
# converted to text so the concatenation also works if key2 is parsed as a
# number.
df1['comboKey1'] = df1['key1'].map(str) + "_" + df1['key2'].map(str)

Tags: 数据txt元素groupv3建议v2v4
2条回答
 import pandas as pd

 # Dict where the info will be stored:
 # combined key ("key1_key2") -> {group: [v1, v2]}
 main_dict = {}

 # The with-block closes the file even if a malformed row raises below.
 with open('data.txt', 'r') as handle:
     for line in handle:
         # Skip blank lines (for example an empty last line); splitting them
         # would give a single field and fail on the index lookups below.
         if not line.strip():
             continue

         # values -> [key1, key2, group, v1, v2, v3, v4]
         # index  ->   0     1      2     3   4   5   6
         values = line.split('|')

         combo_key = values[0] + "_" + values[1]

         # Create the inner dict the first time a combined key is seen.
         groups = main_dict.setdefault(combo_key, {})

         # split() yields text; convert so the stored pair is numeric
         # ([10, 100]) rather than ['10', '100']. The sample data is all
         # integers -- switch to float() if the real file has decimals.
         groups[values[2]] = [int(values[3]), int(values[4])]

 # One output row per combined key; the inner dict is stored as its text form.
 data = [[key, str(groups)] for key, groups in main_dict.items()]

 df = pd.DataFrame(data, columns = ['ComboKey1', "new_v1"])

 print(df)

然后只需对这个字典排序即可 (:

如果只是想得到期望的输出,也可以使用下面的代码

import pandas as pd
from io import StringIO

YOUR_TXT_DATA = """\
1|A|-1|10|100|1|2
1|A|2|20|35|2|3
1|B|1|15|5|3|5
2|B|5|23|25|4|2
2|B|2|33|20|22|98    
2|D|4|23|21|20|32
"""

# Only the first five '|'-separated fields are needed; v3 and v4 are dropped
# at parse time by usecols.
df = pd.read_csv(StringIO(YOUR_TXT_DATA), header=None,
                 usecols=list(range(5)),
                 names=['key1', 'key2', 'group', 'v1', 'v2'],
                 sep='|')

# Column-oriented accumulator: the two lists stay aligned by position.
result_dict = dict(comboKey1=[], new_v1=[])

# Maps each combined key to its position in the two lists above, so a
# repeated key is found with one dict lookup instead of scanning the list
# for every input row.
row_of_key = {}

for key1, key2, group, v1, v2 in df.values:
    key = str(key1) + '_' + str(key2)
    if key not in row_of_key:
        # First occurrence: open a new output row, keeping first-seen order.
        row_of_key[key] = len(result_dict['comboKey1'])
        result_dict['comboKey1'].append(key)
        result_dict['new_v1'].append({})
    # A repeated group under the same key overwrites the earlier values.
    result_dict['new_v1'][row_of_key[key]][str(group)] = [v1, v2]

result_df = pd.DataFrame.from_dict(result_dict)
print(result_df)

输出

  comboKey1                            new_v1
0       1_A  {'-1': [10, 100], '2': [20, 35]}
1       1_B                    {'1': [15, 5]}
2       2_B    {'5': [23, 25], '2': [33, 20]}
3       2_D                   {'4': [23, 21]}

关于测试数据

我认为有一些特殊情况需要考虑,假设数据如下:
key1|key2|group|v1|v2|v3|v4
1|A|-1|10|100|1|2
1|A|-1|10|100|1|2
1|A|-1|20|35|2|3

你的预期产出是多少?(案例1~3)

  • 案例1:以最后一个为准:1_A {'-1': [20, 35]}(解决方案:dict)
  • 案例2:保留全部但不重复:1_A {('-1', (10, 100)), ('-1', (20, 35))}(解决方案:set)
  • 案例3:保留所有:1_A [('-1', (10, 100)), ('-1', (10, 100)), ('-1', (20, 35))](解决方案:列表)

代码:

from unittest import TestCase
import pandas as pd
from io import StringIO

OTHER_TXT_DATA = """\
1|A|-1|10|100|1|2
1|A|-1|10|100|1|2
1|A|-1|20|35|2|3
"""


class MyTests(TestCase):
    """Show three ways to collect rows that share a combined key and a group.

    Every row of the fixture has the combined key '1_A' and group -1; the
    first two rows are exact duplicates. Each test picks a different
    container for the per-key values (dict, set, list) and checks what
    survives.
    """

    @classmethod
    def setUpClass(cls) -> None:
        # unittest builds a new instance for every test method, so parsing
        # the fixture in __init__ would repeat the work per test. Parse it
        # once here; tests only read it, through self.df.
        cls.df = pd.read_csv(StringIO(OTHER_TXT_DATA), header=None,
                             usecols=list(range(5)),
                             names=['key1', 'key2', 'group', 'v1', 'v2'],
                             sep='|')

    def setUp(self) -> None:
        # Fresh accumulator for every test case.
        self.result_dict = dict(comboKey1=[], new_v1=[])

    def solution_base(self, new_v1_fun, update_v1_fun) -> pd.DataFrame:
        """Group the fixture rows by combined key using the given callbacks.

        Expects the empty accumulator created by setUp.

        Args:
            new_v1_fun: called as ``new_v1_fun(group, v1, v2)`` the first
                time a combined key is seen; it must append that key's
                value container to ``self.result_dict['new_v1']``.
            update_v1_fun: called as ``update_v1_fun(index, group, v1, v2)``
                for every later row with the same key; ``index`` is the
                key's position in ``self.result_dict['comboKey1']``.

        Returns:
            The DataFrame built from ``self.result_dict`` (also printed).
        """
        result_dict = self.result_dict
        # Combined key -> position in result_dict['comboKey1'], so repeated
        # keys are located without scanning the list for every row.
        row_of_key = {}

        for key1, key2, group, v1, v2 in self.df.values:
            key = str(key1) + '_' + str(key2)
            if key in row_of_key:
                update_v1_fun(row_of_key[key], group, v1, v2)
            else:
                row_of_key[key] = len(result_dict['comboKey1'])
                result_dict['comboKey1'].append(key)
                new_v1_fun(group, v1, v2)

        df = pd.DataFrame.from_dict(result_dict)
        print(df)
        return df

    def test_case_1_dict(self):
        """Case 1 (dict): a repeated group keeps only its last values."""
        rows = self.result_dict['new_v1']

        def start(group, v1, v2):
            rows.append({str(group): [v1, v2]})

        def extend(index, group, v1, v2):
            rows[index].update({str(group): [v1, v2]})

        df = self.solution_base(new_v1_fun=start, update_v1_fun=extend)
        self.assertTrue(df.equals(pd.DataFrame(
            columns=['comboKey1', 'new_v1'],
            data=[
                ['1_A', {'-1': [20, 35]}],
            ]
        )))

    def test_case_2_set(self):
        """Case 2 (set): keep every distinct (group, values) pair once."""
        rows = self.result_dict['new_v1']

        def start(group, v1, v2):
            rows.append({(str(group), (v1, v2))})

        def extend(index, group, v1, v2):
            rows[index].add((str(group), (v1, v2)))

        df = self.solution_base(new_v1_fun=start, update_v1_fun=extend)
        self.assertTrue(df.equals(pd.DataFrame(
            columns=['comboKey1', 'new_v1'],
            data=[
                ['1_A', {('-1', (20, 35)), ('-1', (10, 100))}],
            ]
        )))

    def test_case_3_list(self):
        """Case 3 (list): keep every row, duplicates included, in order."""
        rows = self.result_dict['new_v1']

        def start(group, v1, v2):
            rows.append([(str(group), (v1, v2))])

        def extend(index, group, v1, v2):
            rows[index].append((str(group), (v1, v2)))

        df = self.solution_base(new_v1_fun=start, update_v1_fun=extend)
        self.assertTrue(df.equals(pd.DataFrame(
            columns=['comboKey1', 'new_v1'],
            data=[
                ['1_A', [('-1', (10, 100)), ('-1', (10, 100)), ('-1', (20, 35))]],
            ]
        )))

注意:Python 2 不支持类型注解(请参见 PEP 484)

相关问题 更多 >