基于支持向量机系数函数的特征重要性分析

X = df1[features]
y = df1['label']


# Selector transformers: each one pulls a single column out of the input so
# that every branch of the FeatureUnion below only sees its own data.
class TextSelector(BaseEstimator, TransformerMixin):
    """Select a single column from the data frame for further transformation.

    ``transform`` returns ``X[self.key]``, i.e. the column indexed with a
    plain key (what a text vectorizer expects as input).
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, X):
        return X[self.key]


class NumberSelector(BaseEstimator, TransformerMixin):
    """Select the subset of the data stored under a provided key.

    ``transform`` returns ``X[[self.key]]``, i.e. the column indexed with a
    one-element list, which keeps it two-dimensional for the scaler.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, X):
        return X[[self.key]]


def _scaled_column(key):
    """Return a pipeline that selects column *key* and standardizes it.

    A new StandardScaler is created on every call. Pipeline fits its steps
    in place, so one scaler instance shared by several pipelines would be
    refit by each of them and would end up holding only the statistics of
    the column fitted last.
    """
    return Pipeline([
        ('selector', NumberSelector(key=key)),
        ('standard', StandardScaler()),
    ])


text = Pipeline([
    ('selector', TextSelector(key='title_mainText')),
    ('vect', TfidfVectorizer(ngram_range=(1, 2))),
])

upper_title = _scaled_column('upper_title')
upper_mainText = _scaled_column('upper_mainText')
punct_title = _scaled_column('punct_title')
punct_mainText = _scaled_column('punct_mainText')
exclamations_title = _scaled_column('exclamations_title')
exclamations_text = _scaled_column('exclamations_text')

# The original snippet never closed this list; the closing "])" is restored.
feats = FeatureUnion([
    ('title_mainText', text),
    ('upper_title', upper_title),
    ('upper_mainText', upper_mainText),
    ('punct_title', punct_title),
    ('punct_mainText', punct_mainText),
    ('exclamations_text', exclamations_text),
    ('exclamations_title', exclamations_title),
])

feature_processing = Pipeline([('feats', feats)])

pipeline = Pipeline([
    ('features', feats),
    ('classifier', SVC(C=1, kernel='linear', max_iter=1000, tol=0.0001,
                       probability=True)),
])


def f_importances(coef, names):
    """Draw a horizontal bar chart of *coef*, sorted in ascending order.

    Args:
        coef: one importance value per feature.
        names: feature labels, paired with *coef* by position.
    """
    imp = coef
    # Sort the (value, label) pairs together so labels follow their bars.
    imp, names = zip(*sorted(zip(imp, names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()


features_names = ['title_mainText', 'upper_title', 'upper_mainText',
                  'punct_title', 'punct_mainText', 'exclamations_title',
                  'exclamations_text']

pipeline.fit(X, y)
clf = pipeline.named_steps['classifier']
# NOTE(review): per the scikit-learn docs, coef_ is two-dimensional (one row
# per pair of classes), while f_importances expects one value per name. The
# TF-IDF branch presumably also contributes one column per term, so the seven
# names above would not line up with the columns of coef_ - confirm and
# aggregate/select coefficients before plotting.
f_importances(clf.coef_, features_names)

1条回答

网友

1楼 · 发布于 2024-09-25 00:22:55

scikit-learn 的文档指出，coef_ 属性是 shape = [n_class * (n_class - 1) / 2, n_features] 的数组。假设有 4 个类和 9 个特征，coef_ 的形状就是 6 x 9（6 行 9 列）。另一方面，barh 期望每个特征只有一个值，而不是六个，因此您得到了一个错误。如果将每一列的系数相加，就可以消除这个错误，如下例所示。

import numpy as np
import matplotlib.pyplot as plt

def f_importances(coef, names):
    """Plot feature importances as a horizontal bar chart, smallest first.

    Args:
        coef: one importance value per feature.
        names: feature labels, paired with *coef* by position.
    """
    # Sort (value, label) pairs together so each label stays with its bar.
    ranked = sorted(zip(coef, names))
    values, labels = zip(*ranked)
    positions = range(len(labels))
    plt.barh(positions, values, align='center')
    plt.yticks(positions, labels)
    plt.show()

# Labels for the nine features shown in the chart.
features_names = [
    'title_mainText',
    'upper_title',
    'upper_mainText',
    'punct_title',
    'punct_mainText',
    'exclamations_title',
    'exclamations_text',
    'title_words_not_stopword',
    'text_words_not_stopword',
]

n_classes = 4
n_features = len(features_names)

# Random stand-in for a fitted classifier's coef_: one row per pair of
# classes, one column per feature.
n_pairs = n_classes * (n_classes - 1) // 2
clf_coef_ = np.random.randint(1, 30, size=(n_pairs, n_features))

# Collapse the rows into a single value per feature before plotting.
f_importances(clf_coef_.sum(axis=0), features_names)

相关问题更多 >

编程相关推荐

热门问题

热门文章