基于支持向量机系数函数的特征重要性分析

2024-09-25 00:22:55 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在进行一个文本分类项目,并尝试使用SVC(kernel='linear')来获得特性的重要性。这是我的代码:
(我从this post更改了代码)

X = df1[features]
y = df1['label']


# Create selector class for text and numbers
class TextSelector(BaseEstimator, TransformerMixin):
    """Transformer to select a single column from the data frame to perform additional transformations on"""
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]

class NumberSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key."""
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

scaler = StandardScaler()    
text = Pipeline([
                ('selector', TextSelector(key='title_mainText')),
                ('vect', TfidfVectorizer(ngram_range=(1, 2))),                
            ])

upper_title =  Pipeline([
                ('selector', NumberSelector(key='upper_title')),
                ('standard', scaler),
            ])

upper_mainText =  Pipeline([
                ('selector', NumberSelector(key='upper_mainText')),
                ('standard', scaler),
            ])

punct_title =  Pipeline([
                ('selector', NumberSelector(key='punct_title')),
                ('standard', scaler),
            ])

punct_mainText =  Pipeline([
                ('selector', NumberSelector(key='punct_mainText')),
                ('standard', scaler),
            ])


exclamations_title =  Pipeline([
                ('selector', NumberSelector(key='exclamations_title')),
                ('standard', scaler),
            ])


exclamations_text =  Pipeline([
                ('selector', NumberSelector(key='exclamations_text')),
                ('standard', scaler),
            ])


feats = FeatureUnion([('title_mainText', text), 
                      ('upper_title', upper_title),
                      ('upper_mainText', upper_mainText),
                      ('punct_title', punct_title),
                      ('punct_mainText', punct_mainText),                    
                      ('exclamations_text', exclamations_text),
                      ('exclamations_title', exclamations_title),                        

feature_processing = Pipeline([('feats', feats)])

pipeline = Pipeline([
        ('features', feats),
        ('classifier', SVC(C=1, kernel= 'linear', max_iter= 1000, tol=0.0001, probability=True))
    ])


    def f_importances(coef, names):
        imp = coef
        imp,names = zip(*sorted(zip(imp,names)))
        plt.barh(range(len(names)), imp, align='center')
        plt.yticks(range(len(names)), names)
        plt.show()

    features_names = ['title_mainText', 'upper_title', 'upper_mainText', 'punct_title', 'punct_mainText',
                      'exclamations_title', 'exclamations_text']
    pipeline.fit(X, y)
    clf = pipeline.named_steps['classifier']
    f_importances(clf.coef_, features_names)

但是,它显示了一个错误消息,我不知道我在哪里做错了。 以前有人有过这样的经历吗?在

ValueError Traceback (most recent call last) in () 13 pipeline.fit(X, y) 14 clf = pipeline.named_steps['classifier'] ---> 15 f_importances((clf.coef_[0]), features_names) 16

in f_importances(coef, names) 5 imp = coef 6 imp,names = zip(*sorted(zip(imp,names))) ----> 7 plt.barh(range(len(names)), imp, align='center') 8 plt.yticks(range(len(names)), names) 9 plt.show()

/anaconda3/lib/python3.6/site-packages/matplotlib/pyplot.py in barh(*args, **kwargs) 2667 mplDeprecation)
2668 try: -> 2669 ret = ax.barh(*args, **kwargs) 2670 finally: 2671 ax._hold = washold

/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py in barh(self, *args, **kwargs) 2281
kwargs.setdefault('orientation', 'horizontal') 2282 patches = self.bar(x=left, height=height, width=width, -> 2283 bottom=y, **kwargs) 2284 return patches 2285

/anaconda3/lib/python3.6/site-packages/matplotlib/init.py in inner(ax, *args, **kwargs) 1715
warnings.warn(msg % (label_namer, func.name), 1716
RuntimeWarning, stacklevel=2) -> 1717 return func(ax, *args, **kwargs) 1718 pre_doc = inner.doc 1719 if pre_doc is None:

/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py in bar(self, *args, **kwargs) 2091 elif orientation == 'horizontal': 2092 r.sticky_edges.x.append(l) -> 2093 self.add_patch(r) 2094 patches.append(r) 2095

/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_base.py in add_patch(self, p) 1852 if p.get_clip_path() is None:
1853 p.set_clip_path(self.patch) -> 1854 self._update_patch_limits(p) 1855 self.patches.append(p) 1856 p._remove_method = lambda h: self.patches.remove(h)

/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_base.py in _update_patch_limits(self, patch) 1868 # or height. 1869 if (isinstance(patch, mpatches.Rectangle) and -> 1870 ((not patch.get_width()) and (not patch.get_height()))): 1871 return 1872
vertices = patch.get_path().vertices

/anaconda3/lib/python3.6/site-packages/scipy/sparse/base.py in bool(self) 286 return self.nnz != 0 287 else: --> 288 raise ValueError("The truth value of an array with more than one " 289 "element is ambiguous. Use a.any() or a.all().") 290 nonzero = bool

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().

谢谢你!在


Tags: keytextinselfreturnpipelinenamestitle
1条回答
网友
1楼 · 发布于 2024-09-25 00:22:55

scikitlearn的文档states指出coef_U属性是shape=[n_class*(n_class-1)/2,n_features]的数组。假设有4个类和9个特性,_coef的形状是6 x 9(6行9列)。另一方面,barh期望每个特性有一个值,而不是六个,因此您得到了一个错误。如果您将每个列的系数相加,就可以消除它,如下例所示。在

import numpy as np
import matplotlib.pyplot as plt

def f_importances(coef, names):
    imp = coef
    imp,names = zip(*sorted(zip(imp,names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()

features_names = ['title_mainText', 'upper_title', 'upper_mainText', 'punct_title', 'punct_mainText',
                  'exclamations_title', 'exclamations_text', 'title_words_not_stopword', 'text_words_not_stopword']

n_classes = 4
n_features = len(features_names)

clf_coef_ = np.random.randint(1, 30, size=(int(0.5*n_classes*(n_classes-1)), n_features))

f_importances(clf_coef_.sum(axis=0), features_names)

enter image description here

相关问题 更多 >