在 split-apply-combine（拆分-应用-合并）中被省略的列

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
from sklearn.cross_validation import train_test_split
from sklearn import linear_model


# this function takes the drugcount dataframe as input and output a tuple of
# 3 data frames: DrugCount_Y1,DrugCount_Y2,DrugCount_Y3
def process_DrugCount(drugcount):
    """Split the drug-count data by year and return one frame per year.

    NOTE: the ``drugcount`` argument is not used; the data is re-read from
    "DrugCount.csv" inside the function.

    Returns a tuple ``(DrugCount_Y1, DrugCount_Y2, DrugCount_Y3)`` with the
    'Year' column removed from each frame.
    """
    dc = pd.read_csv("DrugCount.csv")
    # The raw DrugCount values are strings ('7+' is one of them), so they are
    # mapped to integers before the astype() cast.
    sub_map = {'1' : 1, '2':2, '3':3, '4':4, '5':5, '6':6, '7+' : 7}
    dc['DrugCount'] = dc.DrugCount.map(sub_map)
    dc['DrugCount'] = dc.DrugCount.astype(int)
    dc_grouped = dc.groupby(dc.Year, as_index=False)
    DrugCount_Y1 = dc_grouped.get_group('Y1')
    DrugCount_Y2 = dc_grouped.get_group('Y2')
    DrugCount_Y3 = dc_grouped.get_group('Y3')
    DrugCount_Y1.drop('Year', axis=1, inplace=True)
    DrugCount_Y2.drop('Year', axis=1, inplace=True)
    DrugCount_Y3.drop('Year', axis=1, inplace=True)
    return (DrugCount_Y1,DrugCount_Y2,DrugCount_Y3)


# this function converts strings such as "1- 2 month" to "1_2"
def replaceMonth(string):
    """Map DSFS labels to short codes via ``string.map``.

    ``string`` must provide a ``.map`` method (main() passes a DataFrame
    column). Values missing from the table are whatever ``.map`` yields for
    unmapped keys.
    """
    replace_map = {'0- 1 month' : "0_1", "1- 2 months": "1_2",
                   "2- 3 months": "2_3", "3- 4 months": '3_4',
                   "4- 5 months": "4_5", "5- 6 months": "5_6",
                   "6- 7 months": "6_7", "7- 8 months" : "7_8",
                   "8- 9 months": "8_9", "9-10 months": "9_10",
                   "10-11 months": "10_11", "11-12 months": "11_12"}
    a_new_string = string.map(replace_map)
    return a_new_string


# this function processes a yearly drug count data
def process_yearly_DrugCount(aframe):
    """One-hot encode DSFS, sum per MemberID, and print the aggregate.

    ``aframe`` is modified in place (its 'Year' column is dropped).

    NOTE: ``processed_frame`` is never assigned, so this always returns None.
    NOTE: DrugCount is not converted to a numeric type here, unlike in
    process_DrugCount().
    """
    processed_frame = None
    aframe.drop("Year", axis = 1, inplace = True)
    reformed = aframe[['DSFS']].apply(replaceMonth)
    gd = pd.get_dummies(reformed)
    joined = pd.concat([aframe, gd], axis = 1)
    joined.drop("DSFS", axis = 1, inplace = True)
    joined_grouped = joined.groupby("MemberID", as_index = False)
    joined_grouped_agg = joined_grouped.agg(np.sum)
    # Parenthesised form behaves the same on Python 2 and parses on Python 3.
    print(joined_grouped_agg)
    return processed_frame


def main():
    """Load the CSV inputs and run the processing steps in sequence."""
    # Disable pandas' chained-assignment warning for the in-place drops.
    pd.options.mode.chained_assignment = None
    daysinhospital = pd.read_csv('DaysInHospital_Y2.csv')
    drugcount = pd.read_csv('DrugCount.csv')
    process_DrugCount(drugcount)
    process_yearly_DrugCount(drugcount)
    replaceMonth(drugcount['DSFS'])


if __name__ == '__main__':
    main()

1条回答

网友

1楼 · 发布于 2024-09-30 18:27:42

简单地说，直接从 CSV 读入的 DrugCount 列并不是数值类型（int/float）；如果是数值类型，它就会保留在 .agg(np.sum) 的结果中。请在聚合之前检查该列的数据类型，看它是否为 object 类型（即字符串列）：

print joined['DrugCount'].dtype

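实际上，你在 process_DrugCount() 函数中用 astype 显式地把 DrugCount 列转换成了整数，但在 process_yearly_DrugCount() 函数中没有这样做。在后一个函数中加上同样的一行，DrugCount 列就会保留在聚合结果中（注意：如果该列含有 '7+' 这样的取值，需要先像 process_DrugCount() 那样用 map 把它映射为数字，否则 astype(int) 会报错）：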

aframe['DrugCount'] = aframe['DrugCount'].astype(int)

或者更好的做法是在 main() 中只转换一次，以免在后面的两个函数中重复转换：

drugcount['DrugCount'] = drugcount['DrugCount'].astype(int)

另外，请注意，read_csv()允许使用其dtype参数显式指定列类型：

drugcount = pd.read_csv('DrugCount.csv', dtype={'DrugCount': np.int64})

相关问题更多 >

编程相关推荐

热门问题

热门文章