BeautifulSoup 4: scraping mailto links from an HTML page and exporting them to a spreadsheet

Published 2024-10-06 07:49:24


I am trying to scrape all email addresses from this index page: http://www.uschess.org/assets/msa_joomla/AffiliateSearch/clubresultsnew.php?st=AL

I adapted a Python script to define the URL string, parse the content with BS4, and save each unique address to an .xls file:

import requests
from bs4 import BeautifulSoup
import xlwt

wb = xlwt.Workbook()
ws = wb.add_sheet('Emails')
ws.write(0, 0, 'Emails')

emailList = []
r = 0

#add url of the page you want to scrape to urlString
urlString = 'http://www.uschess.org/assets/msa_joomla/AffiliateSearch/clubresultsnew.php?st=AL'

#function that extracts all emails from a page you provided and stores them in a list
def emailExtractor(urlString):
    getH = requests.get(urlString)
    h = getH.content
    soup = BeautifulSoup(h, 'html.parser')
    mailtos = soup.select('a[href^=mailto]')
    for i in mailtos:
        href = i['href']
        try:
            str1, str2 = href.split(':')
        except ValueError:
            break
        emailList.append(str2)

emailExtractor(urlString)

#adding scraped emails to an excel sheet
for email in emailList:
    r = r + 1
    ws.write(r, 0, email)

wb.save('emails.xls')
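For reference, the mailto-selection logic itself works when a page contains ordinary mailto links; here is a minimal, self-contained check using a made-up HTML snippet (the addresses are hypothetical):

```python
from bs4 import BeautifulSoup

# A tiny HTML sample containing one plain mailto link and one ordinary link.
html = '<a href="mailto:club@example.com">Email us</a><a href="http://example.com">Site</a>'
soup = BeautifulSoup(html, 'html.parser')

emails = []
# Select only anchors whose href starts with "mailto", then strip the scheme.
for a in soup.select('a[href^=mailto]'):
    scheme, address = a['href'].split(':')
    emails.append(address)

print(emails)  # ['club@example.com']
```

If this pattern returns an empty list on a real page, the mailto links are likely generated by JavaScript or obfuscated, rather than present in the raw HTML.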

The .xls file is exported as expected, but it contains no email values. I would appreciate it if someone could explain why, or suggest how to simplify this solution.


2 Answers

You can do this with pandas. Here is the complete code:

from bs4 import BeautifulSoup
import requests
import pandas as pd

urlString = 'http://www.uschess.org/assets/msa_joomla/AffiliateSearch/clubresultsnew.php?st=AL'


# function that extracts all links from a page you provided and stores them in a list
def emailExtractor(urlString):

    emailList = []
    getH = requests.get(urlString)
    h = getH.content
    soup = BeautifulSoup(h, 'html.parser')

    # href=True skips anchors without an href attribute, avoiding a KeyError
    mailtos = soup.find_all('a', href=True)

    href_lst = []
    for i in mailtos:
        href_lst.append(i['href'])

    # keep only hrefs that contain a scheme separator
    for href in href_lst:
        if ':' in href:
            emailList.append(href)
    print(emailList)

    s = pd.Series(emailList)
    s = s.rename('Emails')
    s.to_excel('D:\\Emails.xls', index=False)

emailExtractor(urlString)

Output:

['http://msa.uschess.org/AffDtlMain.php?T6006791', 'https://alabamachess.org', 'http://msa.uschess.org/AffDtlMain.php?A6029262', 'http://www.caesarchess.com/', 'http://msa.uschess.org/AffDtlMain.php?A6045660', 'http://msa.uschess.org/AffDtlMain.php?H6046485', 'http://msa.uschess.org/AffDtlMain.php?A6040580']

Screenshot of the Excel sheet: (image not preserved)

If you want the links written to the Excel sheet as hyperlinks (so that clicking one redirects to the website), change emailList.append(href) to emailList.append('=HYPERLINK("'+href+'")'). You should also change the file extension to .xlsx; only then will the links be clickable hyperlinks.
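A minimal sketch of that change, using two sample links taken from the output above in place of the scraped hrefs:

```python
import pandas as pd

# Sample hrefs standing in for the scraped links.
links = ['https://alabamachess.org', 'http://www.caesarchess.com/']

# Wrap each href in an Excel HYPERLINK formula so the cell becomes clickable.
formulas = ['=HYPERLINK("' + href + '")' for href in links]

s = pd.Series(formulas, name='Emails')
print(s[0])  # =HYPERLINK("https://alabamachess.org")

# Saving must target .xlsx for Excel to evaluate the formulas:
# s.to_excel('Emails.xlsx', index=False)
```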

Output: screenshot of the hyperlinked cells (image not preserved)

Hope this helps!

That's because the emails are protected. I've only added the email-scraping part; I'm leaving out the Excel part since that isn't where your problem is. Credit for converting the protected emails to text goes to https://stackoverflow.com/a/36913154/7518304

import requests
from bs4 import BeautifulSoup

emailList = []

#add url of the page you want to scrape to urlString
urlString = 'http://www.uschess.org/assets/msa_joomla/AffiliateSearch/clubresultsnew.php?st=AL'

def decodeEmail(e): #https://stackoverflow.com/a/36913154/7518304
    de = ""
    k = int(e[:2], 16)

    for i in range(2, len(e)-1, 2):
        de += chr(int(e[i:i+2], 16)^k)

    return de

#function that extracts all emails from a page you provided and stores them in a list
def emailExtractor(urlString):
    getH = requests.get(urlString)
    h = getH.content
    soup = BeautifulSoup(h, 'html.parser')
    mailtos = soup.select('a[href]')
    for i in mailtos:
        href = i['href']
        if "email-protect" in href:
            emailList.append(decodeEmail(href.split("#")[1]))

emailExtractor(urlString)
print(emailList)
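The decoder can be checked without hitting the site: Cloudflare's obfuscation scheme prefixes a two-hex-digit key and XORs each address byte with it, so encoding is its own inverse. The sketch below verifies decodeEmail round-trips a made-up address (encodeEmail and the sample address are mine, not part of the original answer):

```python
def decodeEmail(e):  # https://stackoverflow.com/a/36913154/7518304
    de = ""
    k = int(e[:2], 16)
    for i in range(2, len(e) - 1, 2):
        de += chr(int(e[i:i + 2], 16) ^ k)
    return de

def encodeEmail(address, key):
    # Inverse of decodeEmail, for testing: emit the key as two hex digits,
    # then each character XOR-ed with the key as two hex digits.
    return format(key, '02x') + ''.join(format(ord(c) ^ key, '02x') for c in address)

encoded = encodeEmail('club@example.com', 0x5a)
print(decodeEmail(encoded))  # club@example.com
```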
