Traceback (most recent call last) 与 TypeError: expected string or bytes-like object

2024-09-30 06:31:19 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在做一个简历筛选的机器学习项目,我是 pandas 和 Python 的初学者。在数据清洗阶段,我使用了下面的脚本;它在原始 DataFrame 上可以正常运行,但在我自己的数据上却会报错。我试过自己解决这个错误,但没有成功。如果有人能帮助我,我将不胜感激。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from pandas.plotting import scatter_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
# Column labels to assign explicitly, because the CSV is read with header=None.
cols = [
    'Resume_title',
    'City',
    'State',
    'Description',
    'work_experiences',
    'Educations',
    'Skills',
    'Links',
    'Certificates',
    'Additional information',
]
# NOTE: the path below will be different depending on where you saved your
# data, and on your file name.
resumeDataSet = pd.read_csv(
    r"C:\Users\A\A\resume_data.csv",
    header=None,
    names=cols,
)
# Start with an empty 'cleaned_resume' column on every row.
resumeDataSet['cleaned_resume'] = ''
resumeDataSet.head()

出错的代码块:

import re
def cleanResume(resumeText):
    """Strip URLs, RT/cc markers, hashtags, mentions, punctuation and non-ASCII characters.

    Args:
        resumeText: The text to clean. It is passed straight to ``re.sub``,
            so it must already be a ``str``.

    Returns:
        The cleaned text, with every run of whitespace collapsed to one space.

    Raises:
        TypeError: If ``resumeText`` is not a string ("expected string or
            bytes-like object", raised by the first ``re.sub`` call).
            NOTE(review): presumably triggered by missing (NaN) cells in the
            column -- verify against the data.
    """
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    # NOTE: the pattern has no word boundaries, so 'RT'/'cc' are also removed inside words.
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText)  # replace each non-ASCII character with a space
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText
    

# Run cleanResume on every value of the Educations column and store the result in 'cleaned_resume'.
resumeDataSet['cleaned_resume'] = resumeDataSet.Educations.apply(lambda x: cleanResume(x))

错误:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-16-cfdba0f1a095> in <module>
     11 
     12 
---> 13 resumeDataSet['cleaned_resume'] = resumeDataSet.Educations.apply(lambda x: cleanResume(x))

G:\Anaconda\envs\Pandas\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
   4136             else:
   4137                 values = self.astype(object)._values
-> 4138                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   4139 
   4140         if len(mapped) and isinstance(mapped[0], Series):

pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()

<ipython-input-16-cfdba0f1a095> in <lambda>(x)
     11 
     12 
---> 13 resumeDataSet['cleaned_resume'] = resumeDataSet.Educations.apply(lambda x: cleanResume(x))

<ipython-input-16-cfdba0f1a095> in cleanResume(resumeText)
      1 import re
      2 def cleanResume(resumeText):
----> 3     resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
      4     resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
      5     resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags

G:\Anaconda\envs\Pandas\lib\re.py in sub(pattern, repl, string, count, flags)
    208     a callable, it's passed the Match object and must return
    209     a replacement string to be used."""
--> 210     return _compile(pattern, flags).sub(repl, string, count)
    211 
    212 def subn(pattern, repl, string, count=0, flags=0):

TypeError: expected string or bytes-like object

Tags: and, in, from, import, re, pandas, string, lib
1条回答
网友
1楼 · 发布于 2024-09-30 06:31:19

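Python 会先处理普通字符串字面量中的反斜杠转义,因此 re.sub 中的正则模式应写成原始字符串(r'...')。另外,Educations 列中可能含有非字符串值(例如缺失值 NaN),这正是 re.sub 抛出 TypeError 的原因,所以下面的代码先用 str() 把输入转换为字符串: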

import re
def cleanResume(resumeText):
    """Normalize one resume field into plain ASCII text.

    Removes URLs, "RT"/"cc" markers, hashtags and @mentions, replaces
    punctuation and non-ASCII characters with spaces, and finally collapses
    every run of whitespace into a single space.

    Args:
        resumeText: The value to clean. Non-string values are converted with
            ``str()``; missing values (``None`` or a float NaN) are treated
            as empty text.

    Returns:
        The cleaned string. Returns ``''`` for missing values instead of the
        literal text ``"None"`` / ``"nan"``.
    """
    import math  # local import: the surrounding snippet only imports `re`

    # str() would turn a missing value into the word "None"/"nan", which would
    # then leak into the cleaned text as if it were real content.
    if resumeText is None or (isinstance(resumeText, float) and math.isnan(resumeText)):
        return ''

    resumeText = str(resumeText)
    resumeText = re.sub(r'http\S+\s*', r' ', resumeText)  # remove URLs
    # NOTE: no word boundaries, so 'RT'/'cc' are also removed inside words
    # (kept as-is to preserve the existing output for string input).
    resumeText = re.sub(r'RT|cc', r' ', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', r'', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', r'  ', resumeText)  # remove mentions
    # Raw literal: same characters as before, without the invalid '\]' escape.
    resumeText = re.sub(r'[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), r' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]', r' ', resumeText)  # replace non-ASCII characters
    resumeText = re.sub(r'\s+', r' ', resumeText)  # remove extra whitespace
    return resumeText
    

# Series.apply has no ``axis`` parameter: extra keyword arguments are forwarded
# to the applied function, so ``axis=1`` would raise a TypeError in the lambda.
# Pass the cleaner directly; it is called once per value of the column.
resumeDataSet['cleaned_resume'] = resumeDataSet.Educations.apply(cleanResume)

相关问题 更多 >

    热门问题