# Load the sample data: skip the first column and read only the first nine
# rows, because the last row of the file appears to contain invalid data.
df = pd.read_csv(
    'https://pastebin.com/raw/RgcE69KC',
    usecols=range(1, 12),
    nrows=9,
)
# Relabel the first five rows as a second user so the sample covers more
# than one group.
df.loc[range(5), 'User'] = 'Jan'
print(df)
User cb43/cb431c cb43/cb431j cb43/cb432c cb43/cb433c cb43/cb434c \
0 Jan n/a n/a yes yes yes
1 Jan n/a n/a n/a n/a n/a
2 Jan n/a n/a yes yes yes
3 Jan n/a n/a yes yes yes
4 Jan n/a n/a yes yes yes
5 Dan n/a n/a n/a n/a n/a
6 Dan n/a n/a n/a n/a n/a
7 Dan n/a n/a n/a n/a n/a
8 Dan n/a n/a yes yes yes
cb43/cb435c cb43/cb436c cb43/cb437c cb43/cb437j cb44/cb441c
0 yes yes no No records available yes
1 n/a n/a n/a n/a n/a
2 yes yes no No records available yes
3 yes yes yes No fertilizer applied yes
4 yes yes yes No fertilizer applied yes
5 n/a n/a n/a n/a n/a
6 n/a n/a n/a n/a n/a
7 n/a n/a n/a n/a n/a
8 yes yes na No fertilizer applied yes
# Move 'User' into the index so that only the answer columns remain as data.
df = df.set_index('User')
# Keep only the 'yes'/'no' answers (every other value becomes NaN) and reshape
# to long format: one row per (User, column name) pair.
# The explicit dropna() removes the masked cells on every pandas version;
# stack() on its own only discards NaN with its legacy implementation.
df = (
    df.where(df.isin(['yes', 'no']))
    .stack()
    .dropna()
    .reset_index(name='val')
)
# For each User, collect the unique column names that held a 'yes'/'no' answer.
# 'level_1' is the label reset_index() assigns to the unnamed column-name level.
df = df.groupby('User')['level_1'].unique().reset_index(name='un_val')
print(df)
User un_val
0 Dan [cb43/cb432c, cb43/cb433c, cb43/cb434c, cb43/c...
1 Jan [cb43/cb432c, cb43/cb433c, cb43/cb434c, cb43/c...
我想你需要:
相关问题 更多 >
编程相关推荐