<p>我认为,要进行虚拟变量编码(dummy coding),下面这个对最初答案的更新版本效果更好(代码开头需要 <code>import logging</code>):</p>
<pre><code>import logging

import pandas as pd
from sklearn.base import TransformerMixin

log = logging.getLogger(__name__)
class CategoricalDummyCoder(TransformerMixin):
    """Encode object-dtype columns using the category set learned in ``fit``.

    ``fit`` records every column of dtype ``object`` together with the
    categories observed in it. ``transform`` then casts those columns to
    ``pandas.Categorical`` with exactly those categories, so the encoded
    output has the same layout no matter which values occur in the frame
    being transformed.

    Parameters
    ----------
    only_categoricals : bool, default False
        If False, ``transform`` returns ``pd.get_dummies(X, sparse=True)``,
        i.e. one indicator column per learned category.
        If True, no dummy columns are created; each categorical column is
        replaced by its integer category codes (``.cat.codes``) instead.
    """

    def __init__(self, only_categoricals=False):
        self.categorical_variables = []
        self.categories_per_column = {}
        self.only_categoricals = only_categoricals

    def fit(self, X, y=None):
        """Learn the categorical columns of ``X`` and their categories.

        Parameters
        ----------
        X : pandas.DataFrame
            Training frame; columns of dtype ``object`` are treated as
            categorical.
        y : ignored
            Accepted for pipeline compatibility. It defaults to ``None`` so
            that ``fit_transform(X)`` works without a target.

        Returns
        -------
        CategoricalDummyCoder
            ``self``, to allow call chaining.
        """
        self.categorical_variables = list(
            X.select_dtypes(include=['object']).columns
        )
        log.debug(
            'identified the following categorical variables: %s',
            self.categorical_variables,
        )
        # Rebuild the mapping from scratch so a refit on a different frame
        # does not keep categories of columns that no longer exist.
        self.categories_per_column = {
            col: X[col].astype('category').cat.categories
            for col in self.categorical_variables
        }
        log.debug('fitted categories')
        return self

    def transform(self, X):
        """Encode the categorical columns learned during ``fit``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing at least the columns identified in ``fit``.
            It is not modified; the encoding is applied to a copy.

        Returns
        -------
        pandas.DataFrame
            Dummy-coded frame (sparse indicator columns) when
            ``only_categoricals`` is False, otherwise a frame whose
            categorical columns hold integer category codes.
        """
        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()
        for col in self.categorical_variables:
            log.debug('transforming cat col: %s', col)
            # Pinning the categories keeps the output layout identical to
            # what was seen during fit.
            X[col] = pd.Categorical(
                X[col], categories=self.categories_per_column[col]
            )
            if self.only_categoricals:
                X[col] = X[col].cat.codes
        if self.only_categoricals:
            return X
        return pd.get_dummies(X, sparse=True)
</code></pre>