如何使用 Python + Selenium + PhantomJS 从 Google 下载高分辨率图像

2024-09-29 23:31:36 发布

您现在位置:Python中文网/ 问答频道 /正文

我想使用 Python 2.7 + Selenium + PhantomJS 从谷歌获取 100 多张高分辨率图片。

但按照别人介绍的方法去做之后,我只能得到一个只包含小图片的网页,也找不到任何直接指向高分辨率图片的链接。我该如何解决这个问题?

我的代码如下。

from bs4 import BeautifulSoup
from selenium import webdriver
import time

class ImgCrawler:
    """Load a search-result page in PhantomJS and extract image URLs from it.

    Intended call order: construct with the page URL, call
    ``getPhantomJSDriver()`` to open the page, optionally call
    ``scrollDownUsePhatomJS()`` to scroll the page, then ``getSoup()`` and
    ``getActualUrl()`` to parse it.
    """

    def __init__(self, searchlink=None):
        """Remember the URL to crawl; no browser is started here.

        Args:
            searchlink: URL that ``getPhantomJSDriver()`` will load.
        """
        self.link = searchlink
        # Desktop-Chrome User-Agent header; stored only, no method of this
        # class reads it.
        self.soupheader = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
        # Stored only; no method of this class reads it.
        self.scrolldown = None
        # Selenium driver, set by getPhantomJSDriver().
        self.jsdriver = None

    def getPhantomJSDriver(self):
        """Start a PhantomJS driver and navigate it to ``self.link``."""
        self.jsdriver = webdriver.PhantomJS()
        self.jsdriver.get(self.link)

    def scrollDownUsePhatomJS(self, scrolltimes=1, sleeptime=10):
        """Scroll to the bottom of the page repeatedly.

        Args:
            scrolltimes: number of scroll-to-bottom operations to perform.
            sleeptime: seconds to sleep after each scroll.
        """
        for _ in range(scrolltimes):
            self.jsdriver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
            time.sleep(sleeptime)

    def getSoup(self, parser=None):
        """Print the current page source and return it parsed by BeautifulSoup.

        Args:
            parser: parser name passed to ``BeautifulSoup`` (e.g. 'html.parser').

        Returns:
            A ``BeautifulSoup`` object built from the driver's page source.
        """
        # Read the source once so the printed text and the parsed text are
        # guaranteed to be the same snapshot of the page.
        source = self.jsdriver.page_source
        # Debug output, prefixed with 'a'. Written as a single parenthesised
        # expression so it prints the same under Python 2 and Python 3.
        print('a ' + source)
        return BeautifulSoup(source, parser)

    def getActualUrl(self, soup=None, flag=None, classflag=None, jsonflaglink=None, jsonflagtype=None):
        """Build image URLs from JSON metadata embedded in matching tags.

        Every ``flag`` tag whose class is ``classflag`` is expected to contain
        a JSON object as its text. For each one, the values under
        ``jsonflaglink`` and ``jsonflagtype`` are joined with a dot.

        Args:
            soup: parsed page offering ``find_all(name, attrs)``.
            flag: tag name to search for.
            classflag: CSS class the tag must have.
            jsonflaglink: JSON key holding the link part.
            jsonflagtype: JSON key holding the file-type part.

        Returns:
            List of ``link + '.' + filetype`` strings, one per matching tag.

        Raises:
            ValueError: if a tag's text is not valid JSON.
            KeyError: if a JSON object lacks one of the two keys.
        """
        # The original snippet used json without importing it (NameError);
        # import locally so this class works with the file's existing imports.
        import json

        actualurl = []
        for a in soup.find_all(flag, {"class": classflag}):
            # Parse each tag's JSON once and read both keys from the result.
            meta = json.loads(a.text)
            actualurl.append(meta[jsonflaglink] + u'.' + meta[jsonflagtype])
        return actualurl


if __name__ == '__main__':
    # Script entry: read a query from stdin, load the Google result page for
    # it, and report how many image URLs were extracted.
    search_url = "https://www.google.com.hk/search?safe=strict&hl=zh-CN&site=imghp&tbm=isch&source=hp&biw=&bih=&btnG=Google+%E6%90%9C%E7%B4%A2&q="
    queryword = raw_input()
    # Collapse whitespace-separated words into the '+'-joined form that is
    # appended after "q=".
    query = '+'.join(queryword.split())
    weblink = search_url + query
    img = ImgCrawler(weblink)
    try:
        img.getPhantomJSDriver()
        img.scrollDownUsePhatomJS(2,5)
        soup = img.getSoup('html.parser')
        print(weblink)
        print(soup)
        actualurllist = img.getActualUrl(soup,'div','rg_meta','ou','ity')
        print(len(actualurllist))
    finally:
        # Always shut the browser down, even if crawling failed; otherwise
        # the browser process outlives the script.
        if img.jsdriver is not None:
            img.jsdriver.quit()

Tags: import self none parser source img search def
1条回答
网友
1楼 · 发布于 2024-09-29 23:31:36

我尝试了很长时间使用 PhantomJS,但最终改用了 Chrome。我知道这不是你想要的,但它确实管用;我没能让它在 PhantomJS 下正常工作。

首先下载一个驱动程序:https://sites.google.com/a/chromium.org/chromedriver/downloads 。如果你在 Windows 上,可以使用无头版本的 Chrome“Chrome Canary”。

from bs4 import BeautifulSoup
from selenium import webdriver
import time
import re
import urlparse

class ImgCrawler:
    """Load a search-result page in Chrome and extract full-size image URLs.

    Intended call order: construct with the page URL, call
    ``getPhantomJSDriver()`` to open the page, optionally call
    ``scrollDownUsePhatomJS()`` to scroll the page, then ``getSoup()`` and
    ``getActualUrl()`` to parse it.
    """

    def __init__(self, searchlink=None):
        """Remember the URL to crawl; no browser is started here.

        Args:
            searchlink: URL that ``getPhantomJSDriver()`` will load.
        """
        self.link = searchlink
        # Desktop-Chrome User-Agent header; stored only, no method of this
        # class reads it.
        self.soupheader = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
        # Stored only; no method of this class reads it.
        self.scrolldown = None
        # Selenium driver, set by getPhantomJSDriver().
        self.jsdriver = None

    def getPhantomJSDriver(self):
        """Start a Chrome driver and navigate it to ``self.link``.

        The method name is kept for compatibility with existing callers even
        though the browser used here is Chrome, not PhantomJS.
        """
        self.jsdriver = webdriver.Chrome()
        self.jsdriver.get(self.link)

    def scrollDownUsePhatomJS(self, scrolltimes=1, sleeptime=10):
        """Scroll to the bottom of the page repeatedly.

        Args:
            scrolltimes: number of scroll-to-bottom operations to perform.
            sleeptime: seconds to sleep after each scroll.
        """
        for _ in range(scrolltimes):
            self.jsdriver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
            time.sleep(sleeptime)

    def getSoup(self, parser=None):
        """Print the current page source and return it parsed by BeautifulSoup.

        Args:
            parser: parser name passed to ``BeautifulSoup`` (e.g. 'html.parser').

        Returns:
            A ``BeautifulSoup`` object built from the driver's page source.
        """
        # Read the source once so the printed text and the parsed text are
        # guaranteed to be the same snapshot of the page.
        source = self.jsdriver.page_source
        # Debug output, prefixed with 'a'. Written as a single parenthesised
        # expression so it prints the same under Python 2 and Python 3.
        print('a ' + source)
        return BeautifulSoup(source, parser)

    def getActualUrl(self, soup=None):
        """Collect the ``imgurl`` query values of all ``/imgres?imgurl=`` links.

        Args:
            soup: parsed page offering ``find_all(name, href=pattern)``.

        Returns:
            A list with one entry per matching link. Each entry is the list
            of values that ``parse_qs`` returns for the ``imgurl`` parameter
            (normally a single-element list holding the decoded image URL).
            Links whose ``imgurl`` parameter is empty are skipped.
        """
        actualurl = []
        r = re.compile(r"/imgres\?imgurl=")
        for a in soup.find_all('a', href=r):
            parsed = urlparse.urlparse(a['href'])
            # parse_qs drops parameters with blank values, so an href such as
            # "/imgres?imgurl=&..." has no 'imgurl' key; skip it instead of
            # aborting the whole extraction with a KeyError.
            url = urlparse.parse_qs(parsed.query).get('imgurl')
            if not url:
                continue
            actualurl.append(url)
            print(url)
        return actualurl


if __name__ == '__main__':
    # Script entry: read a query from stdin, load the Google result page for
    # it, and report how many image URLs were extracted.
    search_url = "https://www.google.com.hk/search?safe=strict&hl=zh-CN&site=imghp&tbm=isch&source=hp&biw=&bih=&btnG=Google+%E6%90%9C%E7%B4%A2&q="
    queryword = raw_input()
    # Collapse whitespace-separated words into the '+'-joined form that is
    # appended after "q=".
    query = '+'.join(queryword.split())
    weblink = search_url + query
    img = ImgCrawler(weblink)
    try:
        img.getPhantomJSDriver()
        img.scrollDownUsePhatomJS(2,5)
        soup = img.getSoup('html.parser')
        print(weblink)
        print(soup)
        actualurllist = img.getActualUrl(soup)
        print(len(actualurllist))
    finally:
        # Always shut the browser down, even if crawling failed; otherwise
        # the browser process outlives the script.
        if img.jsdriver is not None:
            img.jsdriver.quit()

我更改了 getActualUrl(),改为从“a”元素中获取图像 URL,这些元素的“href”属性以“/imgres?imgurl=”开头。

输出(当在终端输入“危险”时):

^{pr2}$

相关问题 更多 >

    热门问题