Python如何在https上搜索驻留在iframe中的zip文件

import requests, zipfile, StringIO, time, arcpy, urllib2, urlparse from BeautifulSoup import BeautifulSoup arcpy.env.overwriteOutput = True workPath = -- #The output GDB timestr = time.strftime("%Y%m%d") gdbName = "GlobalSDEUpdate_" + timestr gdbPath = workPath + "\\" + gdbName + ".gdb" class global_DataFinder(object): def __init__(self): object.__init__(self) self.gdbSetup() self.metro() def gdbSetup(self): arcpy.CreateFileGDB_management(workPath, gdbName) def fileDownload(self, key, url, dlPath, dsName): page = urllib2.urlopen(url).read() urlList = [] soup = BeautifulSoup(page) soup.prettify() for link in soup.findAll('a', href = True): if not 'http://' in link['href']: if urlparse.urljoin(url, link['href']) not in urlList: zipDL = urlparse.urljoin(url, link['href']) if zipDL.endswith(".zip"): if key in zipDL: urlList.append(zipDL) for x in urlList: print x r = requests.get(x, stream=True) z = zipfile.ZipFile(StringIO.StringIO(r.content)) z.extractall(dlPath) arcpy.CreateFeatureDataset_management(gdbPath, dsName) arcpy.env.workspace = dlPath shpList = [] for shp in arcpy.ListFeatureClasses(): shpList.append(shp) arcpy.FeatureClassToGeodatabase_conversion(shpList, (gdbPath + "\\" + dsName)) del shpList[:] def metro(self): key = "METRO_GIS_Data_Layers" url = "http://www.ridemetro.org/Pages/NewsDownloads.aspx" dlPath = -- *#Where my zipfiles output to* dsName = "Metro" self.fileDownload(key, url, dlPath, dsName) global_DataFinder()

1条回答

网友
1楼 · 发布于 2024-09-29 19:29:34

我使用requests模块成功下载了这些zip文件，并且选择用PyQuery代替BeautifulSoup。我认为您遇到的问题与SSL证书验证有关：如果将verify参数设置为False，requests模块就会跳过对证书的检查。
下面的函数将下载所有zip文件并解压，您可以从那里将shapefile导入您的geodatabase：
import requests
import os
import zipfile
from pyquery import PyQuery
from requests.packages.urllib3.exceptions import InsecureRequestWarning, InsecurePlatformWarning, SNIMissingWarning

# Disable SSL warnings (certificates are not verified at this time...
# future enhancement?).
for warning in [SNIMissingWarning, InsecurePlatformWarning, InsecureRequestWarning]:
    requests.packages.urllib3.disable_warnings(warning)


def download_zips(out_path):
    """Download every zip linked from the NFHL status table and extract it.

    The status page embeds the listing in an iframe, so the iframe's `src`
    is fetched and the anchors inside its table are followed. Each archive
    is saved in `out_path` and extracted into a sibling folder named after
    the archive without its ".zip" suffix.

    out_path -- existing directory that receives the zips and extracted folders
    """
    url = 'http://www.floodmaps.fema.gov/NFHL/status.shtml'
    download_prefix = 'https://hazards.fema.gov/femaportal/NFHL'

    # verify=False skips SSL certificate verification on every request.
    pq = PyQuery(requests.get(url, verify=False).content)
    src = pq.find('iframe').attr('src')
    pq = PyQuery(requests.get(src, verify=False).content)
    table = pq.find('table')

    for a in table.find('a'):
        href = a.attrib.get('href')
        if not href:
            # Anchor without a target: nothing to download.
            continue

        zip_url = '/'.join([download_prefix, href])
        zip_name = href.split('=')[-1]
        r = requests.get(zip_url, stream=True, verify=False)
        out_zip = os.path.join(out_path, zip_name)
        with open(out_zip, 'wb') as f:
            for chunk in r.iter_content(1024 * 16):  # grab 16 KB at a time
                if chunk:
                    f.write(chunk)
        print('downloaded zip: "{}"'.format(zip_name))

        # Unzip next to the archive; the class is ZipFile (capital F).
        unzipped = out_zip.split('.zip')[0]
        with zipfile.ZipFile(out_zip, 'r') as f:
            f.extractall(unzipped)

相关问题更多 >

编程相关推荐

热门问题

热门文章