我已经尝试在我的模型上获得功能名称很长一段时间了,但是很难理解如何做到这一点。我在这里试过很多帖子,但都没能成功。这是我的密码:
加载将tfidfvectorizer与其他功能组合所需的类
from sklearn.base import TransformerMixin, BaseEstimator
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key]
class FeatureTypeSelector(TransformerMixin, BaseEstimator):
FEATURE_TYPES = {
'categorical': [
'COLUMN_A','COLUMN_B'
],
'continuous': [
'COLULMN_C','COLUMN_D'
]
}
def __init__(self, feature_type):
self.columns = self.FEATURE_TYPES[feature_type]
def fit(self, X, y=None):
return self
def transform(self, X):
return X[self.columns]
class RowToDictTransformer(TransformerMixin, BaseEstimator):
def fit(self, X, y=None):
return self
def transform(self, X):
return (row[1] for row in X.iterrows())
然后将代码放入管道中并运行回归器
from sklearn.pipeline import make_union, make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
# Create the preprocessor
preprocessor = make_union(
make_pipeline(
ItemSelector(key='TEXT_COLUMN'),
TfidfVectorizer(lowercase=False, min_df=1),
),
make_pipeline(
FeatureTypeSelector('continuous'),
MinMaxScaler(),
),
make_pipeline(
FeatureTypeSelector('categorical'),
RowToDictTransformer(),
DictVectorizer(sparse=False), # set sparse=True if you get MemoryError
),
)
# fit and transform the data
preprocessor.fit_transform(x_train)
# choose some estimator
# estimator = MultinomialNB()
estimator = LinearRegression()
# Create the model
model = make_pipeline(preprocessor, estimator)
# Training the model
model.fit(x_train, y_train)
# Predicting the model
predicted = model.predict(x_test)
我可以运行model.coef来获得所有的系数,但我想看看文本列的每一项是如何受到哪个权重的影响的。我曾尝试调用get_feature_names()或尝试在管道中传递它们,但没有成功(到目前为止,谷歌的大部分结果都是紫色的)
有谁能给我一些指导,告诉我如何将特性名称传递到管道的末尾?理想的结果是一个数据框,其中feature(TEXT_列中的行)和feature_权重作为值
目前没有回答
相关问题 更多 >
编程相关推荐