<p>你也可以试试 <a href="http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder" rel="nofollow noreferrer"><code>LabelEncoder</code></a>。如文档所述,它将标签编码为介于 0 和 n_classes-1 之间的整数值。</p>
<pre><code>from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the city names, then map every name to its integer code.
le = LabelEncoder()
le.fit(df['city'])
df['city_num'] = le.transform(df['city'])
print(df.head())
# city city_num
# 0 LIMA 72
# 1 VACAVILLE 122
# 2 CINCINNATI 21
# 3 GLASGOW 50
# 4 BOWLING GREEN 10

# Sanity check: the count of distinct values is the same before and after
# encoding (132 in the sample output below).
print(len(df['city'].unique()))
# 132
print(len(set(df['city_num'])))
# 132
</code></pre>
<p>然后可以将该数字列转换为指示符(one-hot)列:</p>
<pre><code>from sklearn.preprocessing import OneHotEncoder
# Reshape the integer codes into a single-column 2-D array before encoding.
city_codes = df['city_num'].values.reshape(-1, 1)
ohe = OneHotEncoder()
city_ind = ohe.fit_transform(city_codes)

# The encoder returns a sparse matrix (see the printed type below).
print(type(city_ind))
# <class 'scipy.sparse.csr.csr_matrix'>
print(city_ind.shape)
# (132, 132)

# Convert only the first two rows to a dense array for display; each row
# holds a single 1. in the column of its city code.
first_two_rows = city_ind[:2]
print(first_two_rows.toarray())
# [[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0.]
# [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
# 0. 0. 0. 0. 0. 0.]]
</code></pre>