Setting some rules for a pandas groupby


I need to set some rules for a pandas groupby. If the ['keep'] column already contains "dup by" before the grouping on datetime, I would like those rows to be ignored.

Here is my code:

import pandas as pd
import numpy as np

df = pd.read_csv("sample.csv",delimiter='|')

df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
# per-VIP maximum datetime, taken over ALL rows (including rows already marked "dup by")
most_recent_date = df.groupby(df['VIP_ID'])['datetime'].max()
most_recent_date = most_recent_date.rename("most_recent_date")
df = df.join(most_recent_date, on="VIP_ID")

df['both'] = np.where(
    # NOTE: `dup` is referenced here but never defined in the snippet as posted
    (df['keep'] == 'same tier') & (dup == 'yes'),
    df['VIP_ID'] + df['datetime'].astype(str),
    df['ID']
)
df['keep'] = np.where(
    df['keep'] != 'same tier',df['keep'],
    (np.where(
         df['most_recent_date'] == df['datetime'],
         'yes',
         'dup by ' + df['VIP_ID'].astype(str)))
)



df.loc[df.duplicated(subset=['both'], keep = False),'keep'] = 'same time'
df = df.drop(columns = ['both','most_recent_date'])
print(df)

This code ends up turning every value in the keep column into "dup by".

Sample CSV:

ID|VIP_ID|TIER|datetime|keep
1|F08210020403|GO|2014-05-17 00:00:00|same tier
2|F08210020403|GO|2014-04-18 00:00:00|same tier
3|F08210020403|FO||dup by F08210020403
4|F08210020403|FO||dup by F08210020403
5|F08210020403|FO|2016-09-18 00:00:00|dup by F08210020403
6|F08210020403|FO|2016-05-10 00:00:00|dup by F08210020403
7|F08210020403|FO||dup by F08210020403
8|F08210020403|FO||dup by F08210020403
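
For anyone reproducing this without sample.csv, the same data can be loaded straight from a string. This is only a convenience sketch (the io.StringIO wrapper and the sample variable are my additions; the rows are exactly the sample above):

import io
import pandas as pd

sample = """ID|VIP_ID|TIER|datetime|keep
1|F08210020403|GO|2014-05-17 00:00:00|same tier
2|F08210020403|GO|2014-04-18 00:00:00|same tier
3|F08210020403|FO||dup by F08210020403
4|F08210020403|FO||dup by F08210020403
5|F08210020403|FO|2016-09-18 00:00:00|dup by F08210020403
6|F08210020403|FO|2016-05-10 00:00:00|dup by F08210020403
7|F08210020403|FO||dup by F08210020403
8|F08210020403|FO||dup by F08210020403"""

# same as pd.read_csv("sample.csv", delimiter='|'), but self-contained
df = pd.read_csv(io.StringIO(sample), delimiter='|')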

Because the largest datetime for F08210020403 (2016-09-18 00:00:00) belongs to a row that is already marked "dup by", every row's keep column ends up as "dup by F08210020403". I would like a rule that ignores rows whose keep column already contains "dup", and only then groups by datetime to decide which of the remaining rows gets "yes".

This is my output:

1|F08210020403|GO|2014-05-17 00:00:00|dup by F08210020403
2|F08210020403|GO|2014-04-18 00:00:00|dup by F08210020403
3|F08210020403|FO||dup by F08210020403
4|F08210020403|FO||dup by F08210020403
5|F08210020403|FO|2016-09-18 00:00:00|dup by F08210020403
6|F08210020403|FO|2016-05-10 00:00:00|dup by F08210020403
7|F08210020403|FO||dup by F08210020403
8|F08210020403|FO||dup by F08210020403

Expected output:

1|F08210020403|GO|2014-05-17 00:00:00|yes
2|F08210020403|GO|2014-04-18 00:00:00|dup by F08210020403
3|F08210020403|FO||dup by F08210020403
4|F08210020403|FO||dup by F08210020403
5|F08210020403|FO|2016-09-18 00:00:00|dup by F08210020403
6|F08210020403|FO|2016-05-10 00:00:00|dup by F08210020403
7|F08210020403|FO||dup by F08210020403
8|F08210020403|FO||dup by F08210020403
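
To make the mismatch concrete: the per-VIP maximum over all rows is 2016-09-18 00:00:00, which sits in a row already marked "dup by", while the maximum over only the not-yet-flagged rows is 2014-05-17 00:00:00 (row 1). A small illustrative sketch on a freshly loaded df (the mask variable name is my own):

import pandas as pd

df = pd.read_csv("sample.csv", delimiter='|')
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

# max over every row, including rows already flagged "dup by" -> 2016-09-18
print(df.groupby('VIP_ID')['datetime'].max())

# max over only the rows whose keep does not yet contain "dup" -> 2014-05-17
mask = df['keep'].str.contains('dup')
print(df.loc[~mask].groupby('VIP_ID')['datetime'].max())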

Any help would be greatly appreciated.


1 answer

IIUC:

Try:

c = df['keep'].str.contains('dup by')
# condition that checks whether the 'keep' column already contains 'dup by'
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
most_recent_date = df[~c].groupby('VIP_ID')['datetime'].max()
# rows whose 'keep' contains 'dup by' are excluded from the groupby
df['most_recent_date'] = df['VIP_ID'].map(most_recent_date)
df['both'] = np.where((df['keep'] == 'same tier') & c, df['VIP_ID'] + df['datetime'].astype(str), df['ID'])
# for 'same tier' rows: 'yes' if the row has the most recent (filtered) datetime, otherwise flag it as a duplicate
df['keep'] = np.where(
    df['keep'] != 'same tier',df['keep'],
    (np.where(
         df['most_recent_date'] == df['datetime'],
         'yes',
         'dup by ' + df['VIP_ID'].astype(str)))
)
# rows that share the same 'both' key are labelled 'same time'
df.loc[df.duplicated(subset=['both'], keep=False), 'keep'] = 'same time'
df = df.drop(columns = ['both','most_recent_date'])
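
The same exclusion rule can also be written a bit more compactly. This is only a sketch under the same column assumptions (ID, VIP_ID, datetime, keep), not part of the original answer; the tie-breaking 'same time' step is left out because it never triggers on this sample:

import numpy as np
import pandas as pd

dup = df['keep'].str.contains('dup by')            # rows that should be ignored
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

# per-VIP maximum datetime computed only over the non-"dup by" rows
most_recent = df.loc[~dup].groupby('VIP_ID')['datetime'].max()

same_tier = df['keep'].eq('same tier')
is_latest = df['datetime'].eq(df['VIP_ID'].map(most_recent))
df.loc[same_tier, 'keep'] = np.where(
    is_latest[same_tier], 'yes', 'dup by ' + df.loc[same_tier, 'VIP_ID']
)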
