我有下面这个 df,我想使用交叉验证对它的特征进行训练、验证和测试:
RangeIndex: 370 entries, 0 to 369
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 round 370 non-null int64 <---- will be dropped
1 home_team_goal 370 non-null int64 <---- will be dropped
2 away_team_goal 370 non-null int64 <---- will be dropped
3 home_best_attack 370 non-null float64
4 home_best_defense 370 non-null float64
5 home_avg_attack 370 non-null float64
6 home_avg_defense 370 non-null float64
7 home_std_attack 370 non-null float64
8 home_std_defense 370 non-null float64
9 gk_home_player_1 370 non-null float64
10 away_avg_attack 370 non-null float64
11 away_avg_defense 370 non-null float64
12 away_std_attack 370 non-null float64
13 away_std_defense 370 non-null float64
14 away_best_attack 370 non-null float64
15 away_best_defense 370 non-null float64
16 gk_away_player_1 370 non-null float64
dtypes: float64(14), int64(3)
Dataset
然而,我的数据集还必须包含一些额外添加的拓扑特征(这些特征刻画了上述原始特征集之间的图结构和几何关系)。
我需要数据集最终在 pandas 数据帧(DataFrame)中呈现为如下形式:
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 home_best_attack 2565 non-null float64
1 home_best_defense 2565 non-null float64
2 home_avg_attack 2565 non-null float64
3 home_avg_defense 2565 non-null float64
4 home_std_attack 2565 non-null float64
5 home_std_defense 2565 non-null float64
6 gk_home_player_1 2565 non-null float64
7 away_avg_attack 2565 non-null float64
8 away_avg_defense 2565 non-null float64
9 away_std_attack 2565 non-null float64
10 away_std_defense 2565 non-null float64
11 away_best_attack 2565 non-null float64
12 away_best_defense 2565 non-null float64
13 gk_away_player_1 2565 non-null float64
14 bottleneck_metric 2565 non-null float64 <---- will be added
15 wasserstein_metric 2565 non-null float64 <---- will be added
16 landscape_metric 2565 non-null float64 <---- will be added
17 betti_metric 2565 non-null float64 <---- will be added
18 heat_metric 2565 non-null float64 <---- will be added
19 label 2565 non-null float64 <---- will be added
Test Feature Extraction
我用以下方法从原始数据帧中提取这些新增的特征:
def extract_topological_features(diagrams):
    """Build amplitude features from persistence diagrams, one block per metric.

    An ``Amplitude`` transformer is created for each of the five metrics
    below, fitted on ``diagrams``, and the transformed outputs are joined
    column-wise (axis 1) in the order the metrics are listed.

    Args:
        diagrams: Input passed unchanged to ``Amplitude.fit_transform``.

    Returns:
        The per-metric outputs concatenated along axis 1.
    """
    metric_names = ('bottleneck', 'wasserstein', 'landscape', 'betti', 'heat')
    amplitude_blocks = [
        Amplitude(metric=name).fit_transform(diagrams) for name in metric_names
    ]
    return np.concatenate(amplitude_blocks, axis=1)
def extract_features_for_fantasy_prediction(x_train, y_train, x_test, y_test, pipeline):
    """Append topological features to ``x_test``, processing it in batches.

    The test rows are consumed in batches of 10. Each batch is stacked under
    the rows seen so far (the training rows plus all previous test batches),
    the whole stack is pushed through ``pipeline.fit_transform_resample``,
    and the diagrams of the trailing rows are turned into features with
    ``extract_topological_features``.

    Args:
        x_train: 2-D array of training rows. Only its first
            ``x_test.shape[1]`` columns are used, so it may already carry
            extra (e.g. topological) columns on the right.
        y_train: Labels matching ``x_train``.
        x_test: 2-D array of rows to extend with topological features.
        y_test: Labels matching ``x_test``.
        pipeline: Object exposing ``fit_transform_resample(X, y)`` and
            returning ``(diagrams, y)``.

    Returns:
        ``x_test`` with the extracted feature columns appended along axis 1.

    Raises:
        ValueError: If ``x_test`` contains no rows.
    """
    n_test = len(x_test)
    if n_test == 0:
        # Without rows there is nothing to concatenate; fail with a clear message.
        raise ValueError("x_test is empty: no rows to extract topological features for")

    batch_size = 10
    # Keep only the raw feature columns so train and test rows can be stacked.
    n_raw_features = x_test.shape[1]
    all_x_train = x_train[:, :n_raw_features]
    all_y_train = y_train

    top_features = []
    for start in tqdm(range(0, n_test, batch_size)):
        # Slicing clips at the end of the array, so the last batch may be shorter.
        x_batch = x_test[start:start + batch_size]
        y_batch = y_test[start:start + batch_size].reshape((-1,))
        n_batch = len(x_batch)

        batch = np.concatenate([all_x_train, x_batch])
        batch_y = np.concatenate([all_y_train, y_batch])
        diagrams_batch, _ = pipeline.fit_transform_resample(batch, batch_y)
        # NOTE(review): assumes the pipeline keeps one diagram per input row,
        # in order, so the trailing diagrams belong to this test batch - confirm.
        top_features.append(extract_topological_features(diagrams_batch[-n_batch:]))

        # The stacked batch already is "previous rows + this batch".
        all_x_train = batch
        all_y_train = batch_y

    return np.concatenate([x_test, np.concatenate(top_features, axis=0)], axis=1)
Cross Validation
这是我这样做的代码:
def cross_validate(self, full_x, full_y, splitting_dates):
    """Tune and score a random forest with and without topological features.

    The rows of ``full_x`` are split by their ``round`` value:

    * validation: ``train_split_date <= round < val_split_date``
    * test:       ``val_split_date <= round < end_date``
    * train:      every remaining row (``round < train_split_date`` or
      ``round >= end_date``), so no validation or test row is used for
      training.

    Three scores are then computed on the test split: the tuned model on the
    raw features, the same model refitted with topological features, and a
    model re-tuned on the data with topological features.

    Args:
        full_x: DataFrame of features; must contain a ``round`` column, which
            is dropped before modelling.
        full_y: Labels indexable with the boolean masks built from ``full_x``.
        splitting_dates: Sequence ``(train_split_date, val_split_date,
            end_date)`` of ``round`` thresholds.

    Returns:
        Tuple ``(best_model_params, best_topo, best_model_config_with_topo,
        score, score_top, score_best_topo_and_model)``.
    """
    train_split_date = splitting_dates[0]
    val_split_date = splitting_dates[1]
    end_date = splitting_dates[2]

    # Build each mask once; the train mask is the complement of the
    # validation + test window so the evaluation rows never leak into training.
    rounds = full_x['round']
    train_mask = (rounds < train_split_date) | (rounds >= end_date)
    val_mask = (rounds >= train_split_date) & (rounds < val_split_date)
    test_mask = (rounds >= val_split_date) & (rounds < end_date)

    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    train_x = full_x[train_mask].drop(columns='round').values
    train_y = full_y[train_mask].values
    val_x = full_x[val_mask].drop(columns='round').values
    val_y = full_y[val_mask].values
    test_x = full_x[test_mask].drop(columns='round').values
    test_y = full_y[test_mask].values

    # Step 1: tune the model on the raw features only.
    print("START VALIDATING MODEL")
    models_cv = self._validate_k_fold_model(train_x, train_y, val_x, val_y)
    best_model_params = best_combination(models_cv)
    best_model_params.pop("score")
    best_model = RandomForestClassifier(**best_model_params)
    best_model.fit(train_x, train_y)
    score = best_model.score(test_x, test_y)
    print(f'score no_top {score}')
    print(f'best model parameters no_top {best_model_params}')

    # Step 2: tune the topological pipeline parameters with that model.
    print("START VALIDATING PARAMS")
    topo_cv = self._validate_k_fold_top(best_model, train_x, train_y, val_x, val_y)
    best_topo = best_combination(topo_cv)
    best_topo.pop("score")
    best_topo_pipeline_list = [('extract_subspaces', SubSpaceExtraction(**best_topo)),
                               ('compute_diagrams', VietorisRipsPersistence(n_jobs=-1))]
    best_topo_pipeline = Pipeline(best_topo_pipeline_list)

    # Step 3: refit on train + validation with topological features appended.
    train_x_for_test = np.concatenate([train_x, val_x], axis=0)
    train_y_for_test = np.concatenate([train_y, val_y], axis=0)
    diagrams_train, _ = best_topo_pipeline.fit_transform_resample(train_x_for_test, train_y_for_test)
    print("EXTRACTING TOPOLOGICAL FEATURES TRAIN")
    top_features_train = extract_topological_features(diagrams_train)
    x_train_model = np.concatenate([train_x_for_test, top_features_train], axis=1)
    best_model.fit(x_train_model, train_y_for_test)

    print("EXTRACTING TOPOLOGICAL FEATURES TEST")
    x_test_model = extract_features_for_fantasy_prediction(x_train_model, train_y_for_test,
                                                           test_x, test_y, best_topo_pipeline)
    score_top = best_model.score(x_test_model, test_y)

    # Step 4: re-tune the model hyper-parameters on the enriched feature set.
    val_x_with_topo = extract_features_for_fantasy_prediction(train_x, train_y, val_x, val_y, best_topo_pipeline)
    print('START VALIDATING MODEL WITH OPTIMAL TOPOLOGY')
    # The first train_x.shape[0] rows of x_train_model are the training rows.
    model_config_with_topo = self._validate_k_fold_model(x_train_model[:train_x.shape[0]], train_y,
                                                         val_x_with_topo, val_y)
    best_model_config_with_topo = best_combination(model_config_with_topo)
    best_model_config_with_topo.pop('score')
    best_model_with_topo = RandomForestClassifier(**best_model_config_with_topo)
    best_model_with_topo.fit(x_train_model, train_y_for_test)
    score_best_topo_and_model = best_model_with_topo.score(x_test_model, test_y)
    print(f'score best model and topo_feat {score_best_topo_and_model}')

    return (best_model_params, best_topo, best_model_config_with_topo,
            score, score_top, score_best_topo_and_model)
问题:
我运行的是以下代码:
# Labels are computed first; presumably compute_match_result needs the goal
# columns that are removed right after - confirm against its definition.
y = compute_match_result(df)
del df['home_team_goal']
del df['away_team_goal']

cv = CrossValidation(
    k_mins=k_mins,
    k_maxs=k_maxs,
    dist_percentages=distances,
    **model_params,
)
cv_output = cv.cross_validate(
    df,
    y,
    (train_split_date, val_split_date, end_date),
)
……由于大多数特征都是以数组(arrays)的形式计算出来的,我应该在上面 cross_validate() 的哪个位置(以及如何)把拓扑特征添加到原始 df 中,最终得到上面所需的数据集(dataframe)?
注意:完整代码可以在here中找到
对于数据集示例:
pip install openml
然后在脚本的顶部:
from openml.datasets import get_dataset
以及:
# Fetch OpenML dataset 42188 in DataFrame format and keep the first element
# of what get_data() returns.
x_y_df = get_dataset(42188).get_data(dataset_format='dataframe')[0]
目前没有回答
相关问题 更多 >
编程相关推荐