<p>有点晚了,但是这个解决方案防止了诸如hashtag1、hashtag2(不带空格)之类的标点错误,而且实现非常简单</p>
<pre><code>import re,string
def strip_links(text):
link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
links = re.findall(link_regex, text)
for link in links:
text = text.replace(link[0], ', ')
return text
def strip_all_entities(text):
entity_prefixes = ['@','#']
for separator in string.punctuation:
if separator not in entity_prefixes :
text = text.replace(separator,' ')
words = []
for word in text.split():
word = word.strip()
if word:
if word[0] not in entity_prefixes:
words.append(word)
return ' '.join(words)
tests = [
"@peter I really love that shirt at #Macy. http://bet.ly//WjdiW4",
"@shawn Titanic tragedy could have been prevented Economic Times: Telegraph.co.ukTitanic tragedy could have been preve... http://bet.ly/tuN2wx",
"I am at Starbucks http://4sh.com/samqUI (7419 3rd ave, at 75th, Brooklyn)",
]
for t in tests:
strip_all_entities(strip_links(t))
#'I really love that shirt at'
#'Titanic tragedy could have been prevented Economic Times Telegraph co ukTitanic tragedy could have been preve'
#'I am at Starbucks 7419 3rd ave at 75th Brooklyn'
</code></pre>