我正在尝试创建一个更小的分层样本,以减少处理时间
运行此代码:
df_strat= stratified_sample(df, ["Parental Status","Gender", "Age", "Geographical Residence", "Highest Level of Education", "Industry","725", "899","1125", "1375", "1625", "1875", "2500","3000"], size=None, keep_index=True)
这就是功能:
def stratified_sample(df, strata, size=None, seed=None, keep_index= True): population = len(df)
size = __smpl_size(population, size)
tmp = df[strata]
tmp['size'] = 1
tmp_grpd = tmp.groupby(strata).count().reset_index()
tmp_grpd['samp_size'] = round(size/population * tmp_grpd['size']).astype(int)
# controlling variable to create the dataframe or append to it
first = True
for i in range(len(tmp_grpd)):
# query generator for each iteration
qry=''
for s in range(len(strata)):
stratum = strata[s]
value = tmp_grpd.iloc[i][stratum]
n = tmp_grpd.iloc[i]['samp_size']
if type(value) == str:
value = "'" + str(value) + "'"
if s != len(strata)-1:
qry = qry + stratum + ' == ' + str(value) +' & '
else:
qry = qry + stratum + ' == ' + str(value)
# final dataframe
if first:
stratified_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
first = False
else:
tmp_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
stratified_df = stratified_df.append(tmp_df, ignore_index=True)
return stratified_df
我要退回这个:
File "<unknown>", line 1
Parental Status =='False'and Gender =='F'and Age =='20-29'and Geographical Residence =='Adelaide'and Highest Level of Education =='1'and Industry =='A'and 725 ==13 and 899 ==14 and 1125 ==5 and 1375 ==0 and 1625 ==0 and 1875 ==0 and 2500 ==0 and 3000 ==0
^
SyntaxError: Python keyword not valid identifier in numexpr query
使用此错误代码的其他人有导致此问题的符号,但我的数据是干净的,并且不是object就是int32数据
有人知道是什么导致了这个问题吗
似乎可以通过删除列标题之间的空格来解决此问题。
例如
"Parental Status"
到"Parental_Status"
解决了这个问题相关问题 更多 >
编程相关推荐