我正在做一个简历筛选的机器学习项目,我是熊猫和Python的初学者。在清理数据集阶段,我使用了下面的脚本,它在原始数据帧上工作,但不是我的。所以我试着自己解决这个错误,但我做不到。所以,如果有人能帮助我,我会心存感激
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from pandas.plotting import scatter_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
cols = ['Resume_title','City','State','Description','work_experiences','Educations','Skills','Links','Certificates','Additional information']
resumeDataSet = pd.read_csv(r"C:\Users\A\A\resume_data.csv", header=None, names=cols)
# above line will be different depending on where you saved your data, and your file name
resumeDataSet['cleaned_resume'] = ''
resumeDataSet.head()
获取错误的块:
import re
def cleanResume(resumeText):
resumeText = re.sub('http\S+\s*', ' ', resumeText) # remove URLs
resumeText = re.sub('RT|cc', ' ', resumeText) # remove RT and cc
resumeText = re.sub('#\S+', '', resumeText) # remove hashtags
resumeText = re.sub('@\S+', ' ', resumeText) # remove mentions
resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText) # remove punctuations
resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText)
resumeText = re.sub('\s+', ' ', resumeText) # remove extra whitespace
return resumeText
resumeDataSet['cleaned_resume'] = resumeDataSet.Educations.apply(lambda x: cleanResume(x))
错误:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-16-cfdba0f1a095> in <module>
11
12
---> 13 resumeDataSet['cleaned_resume'] = resumeDataSet.Educations.apply(lambda x: cleanResume(x))
G:\Anaconda\envs\Pandas\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
4136 else:
4137 values = self.astype(object)._values
-> 4138 mapped = lib.map_infer(values, f, convert=convert_dtype)
4139
4140 if len(mapped) and isinstance(mapped[0], Series):
pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()
<ipython-input-16-cfdba0f1a095> in <lambda>(x)
11
12
---> 13 resumeDataSet['cleaned_resume'] = resumeDataSet.Educations.apply(lambda x: cleanResume(x))
<ipython-input-16-cfdba0f1a095> in cleanResume(resumeText)
1 import re
2 def cleanResume(resumeText):
----> 3 resumeText = re.sub('http\S+\s*', ' ', resumeText) # remove URLs
4 resumeText = re.sub('RT|cc', ' ', resumeText) # remove RT and cc
5 resumeText = re.sub('#\S+', '', resumeText) # remove hashtags
G:\Anaconda\envs\Pandas\lib\re.py in sub(pattern, repl, string, count, flags)
208 a callable, it's passed the Match object and must return
209 a replacement string to be used."""
--> 210 return _compile(pattern, flags).sub(repl, string, count)
211
212 def subn(pattern, repl, string, count=0, flags=0):
TypeError: expected string or bytes-like object
字符串转义在url字符串中处理,为了避免使用原始字符串,请使用原始字符串替换re.sub中的模式
相关问题 更多 >
编程相关推荐