Downloading files from multiple websites.


This is my first Python project, so it's very basic and rudimentary. I often have to clean viruses off friends' computers, and the free programs I use are updated frequently. Rather than manually downloading each program, I'm trying to create a simple way to automate the process. Since I'm also trying to learn Python, I figured this was a good opportunity to practice.

The problem:

I have to find the .exe files among a number of links. I can find the correct URLs, but I get an error when the script tries to download them.

Is there a way to add all the links to a list, then create a function that goes through the list and runs a download on each URL? I've done a lot of Googling, but I can't seem to make it work. Maybe I'm thinking about it the wrong way?

import urllib, urllib2, re, os
from BeautifulSoup import BeautifulSoup

# Website List
sas = 'http://cdn.superantispyware.com/SUPERAntiSpyware.exe'
tds = 'http://support.kaspersky.com/downloads/utils/tdsskiller.exe'
mbam = 'http://www.bleepingcomputer.com/download/malwarebytes-anti-malware/dl/7/?1'
tr = 'http://www.simplysup.com/tremover/download.html'
urllist = [sas, tds, mbam, tr]
urllist2 = []

# Find exe files to download

match = re.compile(r'\.exe')
data = urllib2.urlopen(urllist)
page = BeautifulSoup(data)

# Check links
#def findexe():
for link in page.findAll('a'):
    try:
        href = link['href']
        if re.search(match, href):
            urllist2.append(href)

    except KeyError:
        pass

os.chdir(r"C:\_VirusFixes")
urllib.urlretrieve(urllist2, os.path.basename(urllist2))

As you can see, I've commented out the function because I couldn't get it to work.

Should I give up on the list and just download each file individually? I'm trying to be efficient.

Any suggestions, or a pointer in the right direction, would be greatly appreciated.


3 Answers

In addition to mikez302's answer, here's a more readable way to write your code:

import os
import re
import urllib
import urllib2

from BeautifulSoup import BeautifulSoup

websites = [
    'http://cdn.superantispyware.com/SUPERAntiSpyware.exe',
    'http://support.kaspersky.com/downloads/utils/tdsskiller.exe',
    'http://www.bleepingcomputer.com/download/malwarebytes-anti-malware/dl/7/?1',
    'http://www.simplysup.com/tremover/download.html'
]

download_links = []

for url in websites:
    connection = urllib2.urlopen(url)
    soup = BeautifulSoup(connection)
    connection.close()

    for link in soup.findAll('a', {'href': re.compile(r'\.exe$')}):
        download_links.append(link['href'])

for url in download_links:
    urllib.urlretrieve(url, os.path.join(r'C:\_VirusFixes', os.path.basename(url)))
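
If one bad link still aborts the run (the error mentioned in the question), a small variation is to wrap each download in a try/except so the remaining files are still fetched. This is only a sketch, not part of the answer above; download_all is a hypothetical helper, and the C:\_VirusFixes folder is taken from the question:

import os
import urllib

def download_all(urls, dest_dir=r'C:\_VirusFixes'):
    # Fetch each URL in turn; a failed download is reported and
    # skipped instead of aborting the whole run.
    for url in urls:
        target = os.path.join(dest_dir, os.path.basename(url))
        try:
            urllib.urlretrieve(url, target)
        except IOError as exc:  # urlretrieve reports network/HTTP failures as IOError
            print 'Failed to download %s: %s' % (url, exc)

download_all(download_links)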

The code above didn't work for me; in my case that was because the pages assemble their links with a script rather than including them in the HTML. When I ran into that problem, I used the following code, which is just a scraper:

import os
import re
import urllib
import urllib2

from bs4 import BeautifulSoup

url = ''  # set this to the page that lists the downloads

connection = urllib2.urlopen(url)
soup = BeautifulSoup(connection)  # everything the same up to here
regex = re.compile(r'(.+?)\.zip')  # the pattern we are looking for
link = re.findall(regex, str(soup))  # finds every .zip (.exe in your case) in the page text
# The matches usually come back with a lot of undesirable text around them;
# luckily the file name is almost always separated from the rest by a space,
# which is why we split on spaces and keep the last token.
link = [i.split(' ')[-1] for i in link]

os.chdir("F:\Documents")
# This is the filepath where I want to save everything I download

for i in link:
    # The matched text doesn't include the .zip (or .exe in your case),
    # so put the extension back, and download the file itself rather
    # than the page it was found on.
    urllib.urlretrieve(i + '.zip', filename=os.path.basename(i) + '.zip')

This isn't as efficient as the code in the previous answers, but it will work for most websites.
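One caveat the scraper doesn't handle: the matched text is often a relative path, so it can help to resolve it against the page URL before downloading. A minimal sketch, reusing the url and link variables from the code above:

import os
import urllib
import urlparse  # this is the Python 2 module; use urllib.parse on Python 3

for i in link:
    # urljoin turns a relative path like 'files/tool' into a full URL
    absolute = urlparse.urljoin(url, i + '.zip')
    urllib.urlretrieve(absolute, filename=os.path.basename(i) + '.zip')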

urllib2.urlopen is a function for accessing a single URL. If you want to access several, you should loop over the list. You should do something like this:

for url in urllist:
    data = urllib2.urlopen(url)
    page = BeautifulSoup(data)

    # Check links
    for link in page.findAll('a'):
        try:
            href = link['href']
            if re.search(match, href):
                urllist2.append(href)

        except KeyError:
            pass

    os.chdir(r"C:\_VirusFixes")
    urllib.urlretrieve(urllist2, os.path.basename(urllist2))
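
For anyone reading this on current Python, here is a rough Python 3 equivalent of the same idea. It is only a sketch: it assumes bs4 in place of the old BeautifulSoup module, and it reuses the Trojan Remover download page from the question:

import os
import re
import urllib.request

from bs4 import BeautifulSoup

urllist = ['http://www.simplysup.com/tremover/download.html']
match = re.compile(r'\.exe')
urllist2 = []

for url in urllist:
    with urllib.request.urlopen(url) as data:
        page = BeautifulSoup(data, 'html.parser')
    # Passing the compiled regex as href= keeps only anchors whose
    # href attribute contains ".exe".
    for link in page.find_all('a', href=match):
        urllist2.append(link['href'])

os.chdir(r'C:\_VirusFixes')
for url in urllist2:
    urllib.request.urlretrieve(url, os.path.basename(url))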
