如何解析电子邮件消息中的HTML

2024-09-23 06:35:17 发布

您现在位置:Python中文网/ 问答频道 /正文

我想从邮件中获取主题(Subject)、发件人(From)、收件人(To)和日期(Date)等信息,这部分已经完成。我还需要从同一封邮件的 HTML 正文中提取所有 <span> 标签,我尝试使用 BeautifulSoup,但做法有误。那么怎样才能只从 HTML 中取出这些 <span> 呢?感谢您的帮助,我的代码如下:

import imaplib
import email
import mimetypes
from email import header
from bs4 import BeautifulSoup


def decodeHeader(headerMsg):
    """Decode an RFC 2047 encoded header value into a plain string.

    Args:
        headerMsg: Raw header value (``str`` or ``email.header.Header``),
            or ``None`` when the message does not carry that header.

    Returns:
        The decoded header text. An empty string is returned when
        ``headerMsg`` is ``None``.
    """
    if headerMsg is None:
        # Message lookups such as msg['Subject'] yield None for absent headers.
        return ''

    pieces = []
    for chunk, charset in header.decode_header(headerMsg):
        if not isinstance(chunk, bytes):
            # Unencoded headers come back from decode_header() as str already.
            pieces.append(chunk)
            continue
        try:
            pieces.append(chunk.decode(charset or 'utf-8', errors='replace'))
        except LookupError:
            # The charset label is not a codec Python knows (for example
            # "unknown-8bit"); fall back to UTF-8 instead of failing.
            pieces.append(chunk.decode('utf-8', errors='replace'))
    return ''.join(pieces)


# Connection settings for the IMAP account (placeholder credentials).
host = 'imap.yandex.ru'
userid = 'myid'
passwd = 'mypass'

# CSS class of the <span> elements extracted from every HTML body.
SPAN_CLASS = 'mail-ThreadSidebar-List-Item_content'


def _html_bodies(msg):
    """Yield the decoded text of every ``text/html`` part of *msg*.

    ``walk()`` visits the message itself and all nested parts, so plain
    (non-multipart) messages and nested multipart messages are both handled.
    """
    for part in msg.walk():
        if part.get_content_type() != 'text/html':
            continue
        # decode=True undoes the Content-Transfer-Encoding (base64 or
        # quoted-printable); without it the parser would see encoded text.
        raw = part.get_payload(decode=True)
        if raw is None:
            continue
        charset = part.get_content_charset() or 'utf-8'
        try:
            yield raw.decode(charset, errors='replace')
        except LookupError:
            # Charset label that Python has no codec for.
            yield raw.decode('utf-8', errors='replace')


def _span_texts(html):
    """Return the text of every ``<span class=SPAN_CLASS>`` found in *html*."""
    soup = BeautifulSoup(html, 'lxml')
    return [span.get_text() for span in soup.find_all('span', {'class': SPAN_CLASS})]


# The context manager issues LOGOUT when the block exits, even on errors.
with imaplib.IMAP4_SSL(host) as imap:
    imap.login(userid, passwd)
    imap.select('INBOX')
    status, email_ids = imap.search(None, '(ALL)')
    if status != 'OK' or not email_ids or email_ids[0] is None:
        # Treat a failed search as an empty mailbox.
        email_ids = [b'']

    for num in email_ids[0].split():
        type1, data = imap.fetch(num, '(RFC822)')
        # A successful fetch returns a (header, body) tuple as first item.
        if type1 != 'OK' or not data or not isinstance(data[0], tuple):
            continue
        email_msg = email.message_from_bytes(data[0][1])

        for html in _html_bodies(email_msg):
            for text in _span_texts(html):
                print(text)

        # Absent headers are replaced by '' so decoding never receives None.
        print('Subject: ', decodeHeader(email_msg.get('Subject', '')))
        print('From: ', decodeHeader(email_msg.get('From', '')))
        print('To: ', decodeHeader(email_msg.get('To', '')))
        print('Date: ', decodeHeader(email_msg.get('Date', '')))
        print('----\n\n')

Tags: import source data raw doc if email code