擅长:python、mysql、java
<p>这是一个基于大家的回答和我自己的研究得出的解决方案。</p>
<pre><code>import html2text
import urllib2
import re
import nltk
useragent = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'}
request = urllib2.Request('SomeURL',None,useragent)
myreq = urllib2.urlopen(request, timeout = 5)
html = myreq.read()
html = html.decode("utf-8")
textList = re.findall(r'(?<=<p>).*?(?=</p>)',html, re.MULTILINE|re.DOTALL)
mytext = ""
for par in textList:
if len(str(par))<2000:
par = re.sub('<[^<]+?>', '', par)
mytext +=" " + html2text.html2text(par)
print "the text is ", mytext
</code></pre>