Improving BeautifulSoup parsing speed in Python

Posted 2024-10-01 11:30:33


I've just written my first Python script to loop through a set of URLs listed in a CSV (over 14,000 links). I'm trying to 1) grab all the keyword meta tags, 2) check the page status (404 links need to be flagged), and 3) convert YouTube links into embedded YouTube links (probably after visiting the page to grab the keywords, then converting to the embed link).

It runs painfully slowly, and I can't think of a faster approach. I suspect the bottleneck is the requests.get() call, but I don't know how to speed it up. I only need the metadata, so is there a way to fetch just the beginning of each page instead of the whole thing? How can I make this code better/faster/more optimized?
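A minimal sketch of one possibility, assuming requests' streaming API (the URL is a placeholder): the status line arrives before the body, and reading a single chunk is usually enough to cover <head> and its meta tags.

import requests

url = 'https://example.com/'  # placeholder URL, not from the original post
r = requests.get(url, stream=True, timeout=10)
status = r.status_code  # the status code is known before the body downloads
head_chunk = next(r.iter_content(chunk_size=8192), b'')  # first ~8 KB of HTML
r.close()  # discard the rest of the body
print(status, len(head_chunk))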

Also, when compiling with pyinstaller I get a collections-related error. I have the feeling I'm using Python 2 code under Python 3, even though I'm writing in Python 3.5.

import requests
from bs4 import BeautifulSoup
import csv
import re
import time

linkLocation = r'C:\Users\JCModern\Desktop\content_links.csv'
timestr = time.strftime("%Y%m%d_%H%M%S")
newfilename = r'C:\Users\JCModern\Desktop\content_links_and_keywords_' + timestr + '.csv'

# write the header row once; the with block closes the file on exit
with open(newfilename, "w", newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(('cmsid', 'filepath', 'metatags', 'pageurl', 'pageurlchange'))
with open(linkLocation, "r", encoding="utf-8-sig") as f:
    csv_f = csv.reader(f, delimiter=",")
    next(csv_f, None)
    for row in csv_f:
        if len(row) != 0:
            # init variables
            myKeywords = ""
            myTitle = ''
            myURL = ''
            pageUrlChange = ''
            pageStatus = ''
            pageUrl = ''
            myCmsid = row[0]
            myURL = row[2]
            if "https://www.youtube.com/embed/" in myURL:
                youtubeurl = myURL.split('/')
                youtubeurl = youtubeurl[4]
                youtubeurl = re.sub(
                    r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom','', youtubeurl)
                myURL = 'https://www.youtube.com/watch?v=' + youtubeurl
            try:
                source_code = requests.get(myURL)
            except Exception:
                # log the bad link and skip to the next row instead of
                # falling through with a stale response object
                with open('errors.txt', 'a', newline='') as file:
                    writer = csv.writer(file, delimiter=',')
                    writer.writerow((myCmsid, myURL))
                continue
            pageStatus = source_code.status_code
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, 'html.parser')
            # keep only the first digit: the HTTP status class (2xx/3xx/4xx)
            pageStatus = pageStatus // 100
            if pageStatus == 2:
                pageUrlChange = 0
            else:
                pageUrlChange = 1
            if pageStatus == 3:
                pageUrl = source_code.url
            l = soup.findAll("meta", attrs={"name": "keywords"})
            # findAll() returns a (possibly empty) list, never None
            if not l:
                myKeywords = ""
            else:
                try:
                    myKeywords = l[0]['content']
                except KeyError:
                    # tag present but without a content attribute
                    pass
                myKeywords = myKeywords.replace(', ', '~')
                myKeywords = myKeywords.replace(',', '~')
                myKeywords = myKeywords.replace('(', '')
                myKeywords = myKeywords.replace(')', '')
            if soup.find('title'):
                myTitle = soup.find('title').string
            if "https://www.youtube.com/" in myURL:
                youtubeurl = myURL.split('/')
                youtubeurl = youtubeurl[3]
                youtubeurl = re.sub(r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom','', youtubeurl)
                myURL = 'https://www.youtube.com/embed/' + youtubeurl
#                print(youtubeurl)
            if "https://youtu.be/" in myURL:
                youtubeurl = myURL.split('/')
                youtubeurl = youtubeurl[3]
                youtubeurl = re.sub(
                    r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom','', youtubeurl)
                myURL = 'https://www.youtube.com/embed/' + youtubeurl
#                print(youtubeurl)
#            print((myCmsid, myURL, myKeywords, pageUrl, pageUrlChange))
            with open(newfilename, "a", newline='') as file:
                writer = csv.writer(file, delimiter=',')
                writer.writerow((myCmsid, myURL, myKeywords, pageUrl, pageUrlChange))

Tags: csv, https, import, re, source, if, youtube, code
2 Answers

html.parser is a pure-Python implementation that uses regular expressions. You really don't want to use it. Install lxml and have the parsing done in C code (remember to use BeautifulSoup(plain_text, 'lxml')).
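A minimal sketch of that switch, assuming lxml is installed (pip install lxml); bs4's SoupStrainer can additionally restrict parsing to the only tags this script actually reads:

from bs4 import BeautifulSoup, SoupStrainer

head_only = SoupStrainer(['title', 'meta'])  # parse nothing but these tags
# plain_text is the response body, as in the question's code
soup = BeautifulSoup(plain_text, 'lxml', parse_only=head_only)
keywords_tag = soup.find('meta', attrs={'name': 'keywords'})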

You also don't want to keep re-opening your output CSV file. Open it once, outside the loop, and just write new rows to the csv.writer() object inside the loop.
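A sketch of that pattern, reusing the names from the question:

with open(newfilename, 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(('cmsid', 'filepath', 'metatags', 'pageurl', 'pageurlchange'))
    for row in csv_f:
        ...  # fetch and parse as before
        writer.writerow((myCmsid, myURL, myKeywords, pageUrl, pageUrlChange))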

Otherwise you can't really speed up the loading of the URLs, not by much; network speed is always going to be a bottleneck. You could use the very low-level ^{} library, but I doubt the speedup it can offer would make a difference here.
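A related low-effort tweak: a requests.Session pools keep-alive connections per host, so repeated requests to the same site skip connection setup. A minimal sketch (the urls iterable is a stand-in for the links read from the CSV):

import requests

session = requests.Session()  # reuses connections across requests to the same host
for url in urls:  # stand-in for the rows read from the CSV
    resp = session.get(url, timeout=10)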

Besides the suggestion to use a faster XML parser, this is also a good candidate for parallelization via the multiprocessing module. I've rearranged your code so the request/parse work is done in a worker that can be delegated to a subprocess. The worker returns the row that needs to be added to the CSV. I prepend a 0/-1 error code to the front of the returned row so the parent process knows which CSV the result goes to.

import requests
from bs4 import BeautifulSoup
import csv
import re
import time
import multiprocessing
import traceback

def grabber(myCmsid, myURL):
    try:
        return grabber_impl(myCmsid, myURL)
    except:
        return (-1, myCmsid, myURL, traceback.format_exc())

def grabber_impl(myCmsid, myURL):
    # init variables; myURL is the argument and must not be reset here
    myKeywords = ""
    myTitle = ''
    pageUrlChange = ''
    pageStatus = ''
    pageUrl = ''
    if "https://www.youtube.com/embed/" in myURL:
        youtubeurl = myURL.split('/')
        youtubeurl = youtubeurl[4]
        youtubeurl = re.sub(
            r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom','', youtubeurl)
        myURL = 'https://www.youtube.com/watch?v=' + youtubeurl

    source_code = requests.get(myURL)
    pageStatus = source_code.status_code
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    # keep only the first digit: the HTTP status class (2xx/3xx/4xx)
    pageStatus = pageStatus // 100
    if pageStatus == 2:
        pageUrlChange = 0
    else:
        pageUrlChange = 1
    if pageStatus == 3:
        pageUrl = source_code.url
    l = soup.findAll("meta", attrs={"name": "keywords"})
    # findAll() returns a (possibly empty) list, never None
    if not l:
        myKeywords = ""
    else:
        try:
            myKeywords = l[0]['content']
        except KeyError:
            # tag present but without a content attribute
            pass
        myKeywords = myKeywords.replace(', ', '~')
        myKeywords = myKeywords.replace(',', '~')
        myKeywords = myKeywords.replace('(', '')
        myKeywords = myKeywords.replace(')', '')
    if soup.find('title'):
        myTitle = soup.find('title').string
    if "https://www.youtube.com/" in myURL:
        youtubeurl = myURL.split('/')
        youtubeurl = youtubeurl[3]
        youtubeurl = re.sub(r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom','', youtubeurl)
        myURL = 'https://www.youtube.com/embed/' + youtubeurl
#                print(youtubeurl)
    if "https://youtu.be/" in myURL:
        youtubeurl = myURL.split('/')
        youtubeurl = youtubeurl[3]
        youtubeurl = re.sub(
            r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom','', youtubeurl)
        myURL = 'https://www.youtube.com/embed/' + youtubeurl
#                print(youtubeurl)
#            print((myCmsid, myURL, myKeywords, pageUrl, pageUrlChange))
    return (0, myCmsid, myURL, myKeywords, pageUrl, pageUrlChange)


linkLocation = r'C:\Users\JCModern\Desktop\content_links.csv'
timestr = time.strftime("%Y%m%d_%H%M%S")
newfilename = r'C:\Users\JCModern\Desktop\content_links_and_keywords_' + timestr + '.csv'

with open(linkLocation, "r", encoding="utf-8-sig") as f:
    csv_f = csv.reader(f, delimiter=",")
    next(csv_f, None)
    pool = multiprocessing.Pool()

    with open(newfilename, 'a', newline='') as out, open('errors.txt', 'a', newline='') as err:
        writer = csv.writer(out, delimiter=',')
        err_writer = csv.writer(err, delimiter=',')
        for result in pool.imap_unordered(grabber, ((row[0], row[2]) for row in csv_f if row), chunksize=1):
            if result[0] == 0:
                # success: (0, cmsid, url, keywords, pageurl, pageurlchange)
                writer.writerow(result[1:])
            else:
                # failure: (-1, cmsid, url, traceback)
                print(result[3])
                err_writer.writerow(result[1:3])
pool.close()
pool.join()
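One caveat: on Windows (which the C:\Users\... paths suggest), multiprocessing starts workers by re-importing the module, so the CSV-reading and pool code above needs to live behind the standard entry-point guard:

if __name__ == '__main__':
    main()  # hypothetical wrapper around the CSV-reading and pool code above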
