在多索引中连接级别

{0: {0: nan, 1: nan, 2: nan, 3: 'A', 4: 'A', 5: 'B', 6: 'B', 7: 'C', 8: 'C'}, 1: {0: nan, 1: nan, 2: nan, 3: 1.0, 4: 2.0, 5: 1.0, 6: 2.0, 7: 1.0, 8: 2.0}, 2: {0: 'AA1', 1: 'a', 2: 'ng/mL', 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1}, 3: {0: 'AA2', 1: 'a', 2: nan, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1}, 4: {0: 'BB1', 1: 'b', 2: nan, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1}, 5: {0: 'BB2', 1: 'b', 2: 'mL', 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1}, 6: {0: 'CC1', 1: 'c', 2: nan, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1}, 7: {0: 'CC2', 1: 'c', 2: nan, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1}}

level_0 AA1 AA2 BB1 BB2 CC1 CC2 new a ng/mL a N/A b N/A b mL c N/A c N/A 0 1 A 1 1 1 1 1 1 1 2 1 1 1 1 1 1 B 1 1 1 1 1 1 1 2 1 1 1 1 1 1 C 1 1 1 1 1 1 1 2 1 1 1 1 1 1

# Read the column index separately to stop pandas from inserting
# "Unnamed: ..." labels for the NaN header cells.
df = pd.read_excel(file_path, skiprows=3, index_col=None, header=None)
df.set_index([0, 1], inplace=True)

# The column index: the first three rows, minus the two row-label columns.
cols = pd.read_excel(file_path, nrows=3, index_col=None, header=None).loc[:, 2:]
cols = cols.fillna('N/A')
idx = pd.MultiIndex.from_arrays(cols.values)
df.columns = idx

AA1 AA2 BB1 BB2 CC1 CC2 a a b b c c ng/mL N/A N/A mL N/A N/A 0 1 A 1 1 1 1 1 1 1 2 1 1 1 1 1 1 B 1 1 1 1 1 1 1 2 1 1 1 1 1 1 C 1 1 1 1 1 1 1 2 1 1 1 1 1 1

level_0 AA1 AA2 BB1 BB2 CC1 CC2 new a ng/mL a N/A b N/A b mL c N/A c N/A 0 1 A 1 1 1 1 1 1 1 2 1 1 1 1 1 1 B 1 1 1 1 1 1 1 2 1 1 1 1 1 1 C 1 1 1 1 1 1 1 2 1 1 1 1 1 1

1条回答

网友

1楼 · 发布于 2024-05-19 16:25:21

使用：

# Build the sample workbook from the data given in the question.
# The imports make the snippet runnable on its own.
import numpy as np
import pandas as pd

# Outer keys are column positions, inner keys are row positions; rows 0-2
# hold the three header rows and columns 0-1 hold the two row-label columns.
d = {0: {0: np.nan, 1: np.nan, 2: np.nan, 3: 'A', 4: 'A', 5: 'B', 6: 'B', 7: 'C', 8: 'C'},
     1: {0: np.nan, 1: np.nan, 2: np.nan, 3: 1.0, 4: 2.0, 5: 1.0, 6: 2.0, 7: 1.0, 8: 2.0},
     2: {0: 'AA1', 1: 'a', 2: 'ng/mL', 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1},
     3: {0: 'AA2', 1: 'a', 2: np.nan, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1},
     4: {0: 'BB1', 1: 'b', 2: np.nan, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1},
     5: {0: 'BB2', 1: 'b', 2: 'mL', 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1},
     6: {0: 'CC1', 1: 'c', 2: np.nan, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1},
     7: {0: 'CC2', 1: 'c', 2: np.nan, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1}}

df = pd.DataFrame(d)

# The positional labels are not part of the sheet, so write the cells only.
df.to_excel('file.xlsx', header=False, index=False)

首先用 header=[0,1,2] 读取文件，得到列为 MultiIndex 的 DataFrame，然后用 DataFrame.set_index 把前 2 列设为行的 MultiIndex，并用 DataFrame.rename_axis 删除索引名：

# Treat the top three sheet rows as a three-level column header.
raw = pd.read_excel('file.xlsx', header=[0, 1, 2])

# Turn the first two columns into the row index and clear its level names.
row_keys = raw.columns[:2].tolist()
df = raw.set_index(row_keys).rename_axis((None, None))

然后用列表推导式遍历列索引的每个元组：如果第三级不是以 Unnamed 开头，就把第二级与第三级连接起来，否则把第三级替换为 N/A；最后使用 MultiIndex.from_tuples：

# Join the second and third header levels of each column; third-level labels
# that start with 'Unnamed' are replaced by N/A.
tuples = [
    (top, f"{mid} {'N/A' if unit.startswith('Unnamed') else unit}")
    for top, mid, unit in df.columns
]

print(tuples)
[('AA1', 'a ng/mL'), ('AA2', 'a N/A'), 
 ('BB1', 'b N/A'), ('BB2', 'b mL'),
 ('CC1', 'c N/A'), ('CC2', 'c N/A')]

# Install the two-level labels as the new column index.
df.columns = pd.MultiIndex.from_tuples(tuples)
print(df)
        AA1   AA2   BB1  BB2   CC1   CC2
    a ng/mL a N/A b N/A b mL c N/A c N/A
A 1       1     1     1    1     1     1
  2       1     1     1    1     1     1
B 1       1     1     1    1     1     1
  2       1     1     1    1     1     1
C 1       1     1     1    1     1     1
  2       1     1     1    1     1     1

另一个想法是使用：

# Alternative: operate on whole index levels instead of looping over tuples.
raw = pd.read_excel('file.xlsx', header=[0, 1, 2])
df = raw.set_index(raw.columns[:2].tolist()).rename_axis((None, None))

# Pull out the three column levels in order.
top, mid, unit = (df.columns.get_level_values(i) for i in range(3))

# Keep third-level labels unless they start with 'Unnamed'; those become N/A.
is_unnamed = unit.str.startswith('Unnamed')
unit = unit.where(~is_unnamed, 'N/A')

# Second and third levels are joined with a space into one label per column.
joined = mid.to_series() + ' ' + unit
df.columns = [top, joined]

相关问题更多 >

编程相关推荐

热门问题

热门文章