将数组转换为数据帧

2024-09-30 05:18:59 发布

您现在位置:Python中文网/ 问答频道 /正文

我有以下数组:

['preprocessed\\AB_30624_badchannels.set', {'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111'}, 'preprocessed\\ACM_98630_badchannels.set', {'E88', 'E186', 'E91', 'E139', 'E102', 'E216', 'E111', 'E238'}, 'preprocessed\\AL_96705_badchannels.set', {'E88', 'E37', 'E91'}]

您能告诉我如何创建一个数据帧吗?其中列名是不含路径和扩展名的文件名(即 preprocessed\AB_30624_badchannels.set -> AB_30624_badchannels),行是相应集合中包含的通道列表(例如 AB_30624 对应 'E88'、'E91'、'E248'、'E139'、'E245'、'E216'、'E111')

所需的输出如下所示:

AB_30624 | ACM_98630 | AL_96705|
E88      |E88        |E88
E91      |E37        |E37
E248     |E91        |E91
E139     |E73
E245     |E232
E216     |E256
E111     |E139
         |E235
         |E216
         |E46

谢谢你的帮助


Tags: ab 数组 al set acm preprocessed e91 e111
3条回答

由于我不清楚请求的df结构,您可以在下面找到2个选项

选项1为每个文件创建一行,包含文件名及其通道列表
选项2为通道集中的每个通道创建一行,包含文件名和该通道

更新
选项3(自OP添加请求输出的描述后创建)

import pandas as pd

data = ['preprocessed\\AB_30624_badchannels.set', {'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111'}, 'preprocessed\\ACM_98630_badchannels.set', {'E88', 'E186', 'E91', 'E139', 'E102', 'E216', 'E111', 'E238'}, 'preprocessed\\AL_96705_badchannels.set', {'E88', 'E37', 'E91', 'E73', 'E232', 'E256', 'E139', 'E235', 'E216', 'E46'}, 'preprocessed\\AP_43781_badchannels.set', {'E25', 'E164', 'E253', 'E104', 'E230', 'E237', 'E18', 'E198', 'E120', 'E10', 'E233', 'E46', 'E54', 'E82', 'E31'}]

# option 1: one row per file, holding the file name and its list of channels
entries = []
file = None
channels = None
for x in data:
    if isinstance(x, str):
        # keep the last path component and drop the 4-character ".set" suffix
        file = x.split('\\')[-1][:-4]
    else:
        channels = list(x)
    if file and channels:
        # Append one record per file. Rebinding `entries` to the dict here
        # would throw away every file except the last one.
        entries.append({'file': file, 'channels': channels})
        # reset so the next record needs a fresh name AND a fresh channel set
        file = None
        channels = None

df = pd.DataFrame(entries)
print(df.to_string())


# option 2: long format, one row for every (file, channel) combination

entries = []
file = None
channels = None
for item in data:
    if isinstance(item, str):
        # remember the file name (last path component, ".set" suffix cut off)
        # for the channel set that follows it
        file = item.split('\\')[-1][:-4]
        continue
    # every channel of the set becomes its own record under the current file
    entries.extend({'file': file, 'channel': name} for name in item)
    # forget the name so a set without a preceding file name is not
    # attributed to the previous file
    file = None

df = pd.DataFrame(entries)
print(df.to_string())

# option 3

import pandas as pd

data = [
    'preprocessed\\AB_30624_badchannels.set',
    {'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111'},
    'preprocessed\\ACM_98630_badchannels.set',
    {'E88', 'E186', 'E91', 'E139', 'E102', 'E216', 'E111', 'E238'},
    'preprocessed\\AL_96705_badchannels.set',
    {'E88', 'E37', 'E91'},
]

# Wide format: one column per file, one channel per cell.
# Even positions of `data` hold the file names, odd positions the channel sets.
entries = {}
for path, channel_set in zip(data[::2], data[1::2]):
    file_name = path.split('\\')[1]
    # the first two underscore-separated tokens form the column label,
    # e.g. "AB_30624_badchannels.set" -> "AB_30624"
    label = '_'.join(file_name.split('_')[:2])
    entries[label] = list(channel_set)

# DataFrame columns must be equally long, so pad the shorter ones with 'NA'
longest = max((len(column) for column in entries.values()), default=0)
for column in entries.values():
    column.extend(['NA'] * (longest - len(column)))

df = pd.DataFrame(entries)

print(df)

产出(1)

                    file channels
0   AP_43781_badchannels      E46
1   AP_43781_badchannels     E233
2   AP_43781_badchannels     E237
3   AP_43781_badchannels      E18
4   AP_43781_badchannels     E164
5   AP_43781_badchannels     E104
6   AP_43781_badchannels      E82
7   AP_43781_badchannels     E253
8   AP_43781_badchannels     E120
9   AP_43781_badchannels      E10
10  AP_43781_badchannels      E54
11  AP_43781_badchannels     E198
12  AP_43781_badchannels      E25
13  AP_43781_badchannels      E31
14  AP_43781_badchannels     E230

产出(2)

   channel                   file
0      E88   AB_30624_badchannels
1     E216   AB_30624_badchannels
2     E248   AB_30624_badchannels
3     E111   AB_30624_badchannels
4     E139   AB_30624_badchannels
5     E245   AB_30624_badchannels
6      E91   AB_30624_badchannels
7      E88  ACM_98630_badchannels
8     E216  ACM_98630_badchannels
9     E111  ACM_98630_badchannels
10    E186  ACM_98630_badchannels
11    E139  ACM_98630_badchannels
12    E238  ACM_98630_badchannels
13    E102  ACM_98630_badchannels
14     E91  ACM_98630_badchannels
15     E88   AL_96705_badchannels
16    E216   AL_96705_badchannels
17    E232   AL_96705_badchannels
18    E235   AL_96705_badchannels
19     E46   AL_96705_badchannels
20     E73   AL_96705_badchannels
21    E139   AL_96705_badchannels
22    E256   AL_96705_badchannels
23     E37   AL_96705_badchannels
24     E91   AL_96705_badchannels
25     E46   AP_43781_badchannels
26    E233   AP_43781_badchannels
27    E237   AP_43781_badchannels
28     E18   AP_43781_badchannels
29    E164   AP_43781_badchannels
30    E104   AP_43781_badchannels
31     E82   AP_43781_badchannels
32    E253   AP_43781_badchannels
33    E120   AP_43781_badchannels
34     E10   AP_43781_badchannels
35     E54   AP_43781_badchannels
36    E198   AP_43781_badchannels
37     E25   AP_43781_badchannels
38     E31   AP_43781_badchannels
39    E230   AP_43781_badchannels

产出(3)

  AB_30624 ACM_98630 AL_96705
0      E91      E111      E37
1     E245      E216      E91
2     E111       E91      E88
3     E248      E238       NA
4      E88      E186       NA
5     E216       E88       NA
6     E139      E139       NA
7       NA      E102       NA

我的解决方案有点简单粗暴:

import pandas as pd

array = ['preprocessed\\AB_30624_badchannels.set', {'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111'},
         'preprocessed\\ACM_98630_badchannels.set', {'E88', 'E186', 'E91', 'E139', 'E102', 'E216', 'E111', 'E238'},
         'preprocessed\\AL_96705_badchannels.set',
         {'E88', 'E37', 'E91', 'E73', 'E232', 'E256', 'E139', 'E235', 'E216', 'E46'},
         'preprocessed\\AP_43781_badchannels.set',
         {'E25', 'E164', 'E253', 'E104', 'E230', 'E237', 'E18', 'E198', 'E120', 'E10', 'E233', 'E46', 'E54', 'E82',
          'E31'}]

row_names = []
rows = []
# collect the file names as row labels and every channel set as one row of values
for i in array:
    if isinstance(i, str):
        # drop the directory part; the label keeps the ".set" extension
        colum_name = i.split("\\")
        row_names.append(colum_name[1])
    elif isinstance(i, set):
        rows.append(list(i))

# Size the frame from the longest channel set rather than a fixed width, so a
# set of any length fits; shorter rows keep NaN in their unused cells.
max_length = max((len(row) for row in rows), default=0)
df = pd.DataFrame(index=row_names, columns=list(range(max_length)))

# copy the channels cell by cell into the pre-allocated frame
for i, row in enumerate(rows):
    for j, val in enumerate(row):
        df.iloc[i, j] = val
# transpose so that every file becomes a column and every channel a row
df = df.T

在我看来,数据帧不是存储不同大小信息的正确格式。这里推荐一个dict

你有几个问题要处理。第一个是列表在名称字符串和值集合之间交替出现,用一个把这种交替序列转换成(名称, 集合)对的简单生成器即可解决。可以使用正则表达式将数据中的名称转换为所需的列名。各组值的长度不同,而且由于它们是集合,元素的顺序是随机的;我们可以将它们转换为命名的Series,但无法解决顺序问题。最后,把这些Series拼接起来,就得到了数据帧

import re
import pandas as pd
import numpy as np

# Captures the column name, i.e. the text between the backslash and the
# "_badchannels.set" suffix: "preprocessed\\AB_30624_badchannels.set" -> "AB_30624".
# The dot is escaped so that only a literal ".set" ending matches.
colname_re = re.compile(r"\\(.*?)_badchannels\.set")

# test data set: file names alternating with their sets of bad channels
data = ['preprocessed\\AB_30624_badchannels.set', {'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111'}, 'preprocessed\\ACM_98630_badchannels.set', {'E88', 'E186', 'E91', 'E139', 'E102', 'E216', 'E111', 'E238'}, 'preprocessed\\AL_96705_badchannels.set', {'E88', 'E37', 'E91'}]

def iter_pairs(seq):
    """Yield consecutive, non-overlapping 2-tuples taken from seq.

    For example (1, 2, 3, 4) produces (1, 2) and then (3, 4). A trailing
    element that has no partner is dropped.
    """
    source = iter(seq)
    # zipping one iterator with itself pulls two items per step
    yield from zip(source, source)

# create one named Series per (file name, channel set) pair; the Series name
# is the regex capture and becomes the column label
interim_series = [pd.Series(list(values), name=colname_re.search(name).group(1))
        for name, values in iter_pairs(data)]

# build dataframe: concatenating along axis=1 aligns on the index, so shorter
# Series are padded with NaN, which is then replaced by ""
df = pd.concat(interim_series, axis=1)
del interim_series
# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
df = df.replace(np.nan, "")
print(df)

相关问题 更多 >

    热门问题