使用lxml解析html文档时出现编码问题

from bs4 import UnicodeDammit import re import requests import lxml import lxml.html from time import sleep urls = [ "http://mathprofi.ru/zadachi_po_kombinatorike_primery_reshenij.html", "http://ru.onlinemschool.com/math/assistance/statistician/", "http://mathprofi.ru/zadachi_po_kombinatorike_primery_reshenij.html", "http://universarium.org/courses/info/332", "http://compsciclub.ru/course/wordscombinatorics", "http://ru.onlinemschool.com/math/assistance/statistician/", "http://lectoriy.mipt.ru/course/Maths-Combinatorics-AMR-Lects/", "http://www.youtube.com/watch?v=SLPrGWQBX0I" ] def check(url): print "That is url {}".format(url) r = requests.get(url) ud = UnicodeDammit(r.content, is_html=True) content = ud.unicode_markup.encode(ud.original_encoding, "ignore") root = lxml.html.fromstring(content) lxml.html.etree.strip_elements(root, lxml.etree.Comment, "script", "style") text = lxml.html.tostring(root, method="text", encoding=unicode) text = re.sub('\s+', ' ', text) print "Text type is {}!".format(type(text)) print text[:200] sleep(1) if __name__ == '__main__': for url in urls: check(url)

In [319]: r = requests.get(urls[-1])

In [320]: chardet.detect(r.content)
Out[320]: {'confidence': 0.99, 'encoding': 'utf-8'}

In [321]: UnicodeDammit(r.content, is_html=True).original_encoding
Out[321]: 'utf-8'

In [322]: r = requests.get(urls[-2])

In [323]: chardet.detect(r.content)
Out[323]: {'confidence': 0.99, 'encoding': 'utf-8'}

In [324]: UnicodeDammit(r.content, is_html=True).original_encoding
Out[324]: u'utf-8'

1条回答

网友

1楼 · 发布于 2024-06-28 19:36:34

我终于明白了。解决办法是不使用

root = lxml.html.fromstring(content)

而是配置一个显式的解析器对象，并告知它使用特定的编码 enc：

htmlparser = etree.HTMLParser(encoding=enc)
root = etree.HTML(content, parser=htmlparser)

另外，我发现即使UnicodeDammit在决定页面编码时也会犯明显的错误。因此，我添加了另一个if块：

if (declared_enc and enc != declared_enc):

以下是结果片段：

from lxml import html
from lxml.html import etree
import requests
from bs4 import UnicodeDammit
import chardet 


# Fetch the page; a requests-level failure is logged and re-raised as
# ParsingError.
try:
    self.log.debug("Try to get content from page {}".format(url))
    r = requests.get(url)
except requests.exceptions.RequestException as e:
    self.log.warn("Unable to get page content of the url: {url}. "
                  "The reason: {exc!r}".format(url=url, exc=e))
    raise ParsingError(e.message)

ud = UnicodeDammit(r.content, is_html=True)

# Encoding declared inside the document itself, if any.
declared_enc = ud.declared_html_encoding
if declared_enc:
    declared_enc = declared_enc.lower()
# UnicodeDammit may leave original_encoding as None (e.g. for an empty body);
# fall back to the declared encoding, then to UTF-8, instead of failing on
# None.lower().
enc = (ud.original_encoding or declared_enc or "utf-8").lower()
# Possible misrecognition of the encoding: the detected and the declared
# encodings disagree, so ask chardet for a second opinion.
if declared_enc and enc != declared_enc:
    detect_dict = chardet.detect(r.content)
    det_conf = detect_dict["confidence"]
    # chardet reports None as the encoding when it has no guess.
    det_enc = (detect_dict["encoding"] or "").lower()
    # Trust the declaration only when chardet agrees with UnicodeDammit
    # but is not confident about it.
    if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
        enc = declared_enc
# Drop any bytes that are not valid in the chosen encoding so the parser
# receives input consistent with the encoding it is told to use.
content = r.content.decode(enc, "ignore").encode(enc)
# An explicit parser is used so the encoding can be forced rather than
# guessed by lxml.
htmlparser = etree.HTMLParser(encoding=enc)
root = etree.HTML(content, parser=htmlparser)
etree.strip_elements(root, html.etree.Comment, "script", "style")
text = html.tostring(root, method="text", encoding=unicode)

相关问题更多 >

编程相关推荐

热门问题

热门文章