《Automate the Boring Stuff》练习：从图片网站批量下载图像

import os
import re

import bs4
import requests

# Script: download the full-size JPEG behind every photo link found in a
# locally saved Flickr search page.
#
# 'flickrHtml.html' is the outerHTML of the <html> tag, copied from the
# browser's page-source view. Each <a class="overlay"> in it links to a photo
# page; the photo page in turn contains the //live...jpg image URL.

FLICKR_ROOT = "https://www.flickr.com"
OUTPUT_DIR = 'FlickrImages'
REQUEST_TIMEOUT = 30  # seconds; without a timeout one stalled request hangs the whole run

# Compiled once, outside the loop. The match stops at the first ".jpg" and
# cannot cross quotes or whitespace, so it never spans several <img> tags.
imageRegex = re.compile(r'//live[^"\s]*?\.jpg')

# Parse the saved HTML document; the context manager closes the file handle.
with open('flickrHtml.html', encoding="utf8") as flickrFile:
    flickrSoup = bs4.BeautifulSoup(flickrFile, 'html.parser')

# categoryElem holds the anchor elements that link to the individual photo pages.
categoryElem = flickrSoup.select("a[class='overlay']")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Iterate over every anchor (the index loop used before skipped the last one).
for anchor in categoryElem:
    # Read the href straight from the parsed tag instead of regex-matching
    # the serialized HTML.
    imageUrl = anchor.get('href')
    if not imageUrl:
        print('There was a problem: anchor has no href')
        continue
    imageUrlFlickr = FLICKR_ROOT + str(imageUrl)

    # Fetch the photo page; on any failure report it and move on.
    try:
        res = requests.get(imageUrlFlickr, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as exc:
        print('There was a problem: %s' % (exc))
        continue

    imageSoup = bs4.BeautifulSoup(res.text, 'html.parser')
    picElem = imageSoup.select(
        'div[class="view photo-well-media-scrappy-view requiredToShowOnServer"] img'
    )

    # Search the selected <img> elements for the jpg URL.
    mo = imageRegex.search(str(picElem))
    if mo is None:
        print('There was a problem: no jpg URL found on %s' % (imageUrlFlickr))
        continue
    imageUrlRegex = mo.group()

    # Fetch the image itself; skip the file write if the download failed.
    try:
        res1 = requests.get('https:' + imageUrlRegex, timeout=REQUEST_TIMEOUT)
        res1.raise_for_status()
    except requests.RequestException as exc:
        print('There was a problem: %s' % (exc))
        continue

    # Save the jpg into the output folder; the file is closed on exit.
    imagePath = os.path.join(OUTPUT_DIR, os.path.basename(imageUrlRegex))
    with open(imagePath, 'wb') as imageFile:
        for chunk in res1.iter_content(100000):
            imageFile.write(chunk)

2条回答

网友

1楼 · 编辑于 2024-09-22 10:31:46

如果要使用 requests + BeautifulSoup，请尝试以下做法（通过传递参数 page 来翻页）：

import re, requests, threading, os
from bs4 import BeautifulSoup

def download_image(url, timeout=30):
    """Download ``url`` into the current directory.

    The file is named after the last path component of ``url``.

    Args:
        url: Absolute URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. A failed download is reported on stdout and no file is written.
    """
    # Fetch first and check the status, so a failed request neither leaves an
    # empty file behind nor saves an HTML error page under an image name.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report instead of raising: this runs in worker threads, where an
        # uncaught exception would only produce a traceback.
        print(url, "download failed:", exc)
        return
    with open(os.path.basename(url), "wb") as f:
        f.write(response.content)
    print(url, "download successfully")

original_url = "https://www.flickr.com/search/?text=sea&view_all=1&page={}"

pages = range(1, 5000)  # the real number of result pages is unknown; 5000 is a guess

# Extracts the address inside a CSS url(...) value, with or without quotes.
# The capture stops at the first quote or ")", so a later parenthesised value
# in the same style attribute cannot be swallowed.
style_url_re = re.compile(r'url\(["\']?([^"\')]+)["\']?\)')

for page in pages:
    concat_url = original_url.format(page)
    print("Now it is page", page)
    # Time out rather than hang forever on one unresponsive request.
    soup = BeautifulSoup(requests.get(concat_url, timeout=30).content, "lxml")
    soup_list = soup.select(".photo-list-photo-view")
    for element in soup_list:
        # A tile without a style attribute, or without a url(...) in it,
        # is skipped instead of crashing the whole crawl.
        match = style_url_re.search(element.get("style") or "")
        if match is None:
            continue
        img_url = 'https:' + match.group(1)
        # The URL looks like https://live.staticflickr.com/xxx/xxxxx_m.jpg.
        # For a larger, sharper picture, remove the "_m" suffix from the URL.
        # Download in a separate thread so slow network I/O does not block
        # the page loop; the image URL is passed as the thread argument.
        threading.Thread(target=download_image, args=(img_url,)).start()

如果使用selenium，可能会更简单，示例代码如下：

from selenium import webdriver
import re, requests, threading, os

# download_image
def download_image(url, timeout=30):
    """Download ``url`` into the current directory.

    The file is named after the last path component of ``url``.

    Args:
        url: Absolute URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. A failed download is reported on stdout and no file is written.
    """
    # Fetch first and check the status, so a failed request neither leaves an
    # empty file behind nor saves an HTML error page under an image name.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report instead of raising: this runs in worker threads, where an
        # uncaught exception would only produce a traceback.
        print(url, "download failed:", exc)
        return
    with open(os.path.basename(url), "wb") as f:
        f.write(response.content)


# Locator constants for find_elements(); the find_elements_by_* helpers used
# previously were removed in Selenium 4.3.
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
original_url = "https://www.flickr.com/search/?text=sea&view_all=1&page={}"

pages = range(1, 5000)  # the real number of result pages is unknown; 5000 is a guess

# Extracts the address inside a CSS url(...) value, with or without quotes.
# The capture stops at the first quote or ")", so a later parenthesised value
# in the same style attribute cannot be swallowed.
style_url_re = re.compile(r'url\(["\']?([^"\')]+)["\']?\)')

try:
    for page in pages:
        concat_url = original_url.format(page)
        print("Now it is page", page)
        driver.get(concat_url)
        for element in driver.find_elements(By.CSS_SELECTOR, ".photo-list-photo-view"):
            # A tile without a style attribute, or without a url(...) in it,
            # is skipped instead of crashing the whole crawl.
            match = style_url_re.search(element.get_attribute("style") or "")
            if match is None:
                continue
            img_url = 'https:' + match.group(1)
            # The URL looks like https://live.staticflickr.com/xxx/xxxxx_m.jpg.
            # For a larger, sharper picture, remove the "_m" suffix from the URL.
            # Download in a separate thread so slow network I/O does not block
            # the page loop; the image URL is passed as the thread argument.
            threading.Thread(target=download_image, args=(img_url, )).start()
finally:
    # Always shut the browser down, even if the crawl is interrupted;
    # download threads already started do not use the driver.
    driver.quit()

以上代码已在我的电脑上成功下载图片。

网友

2楼 · 编辑于 2024-09-22 10:31:46

首先，从 Flickr 这样的网站上抓取 400 万条搜索结果可能是不道德的。网络爬虫应该尽量减轻服务器的负载，以尊重被抓取的网站。在短时间内发出 400 万个请求很可能会导致你的 IP 被封禁。使用代理可以绕过这一限制，但这同样是极不道德的。此外，你还会面临版权方面的风险，因为 Flickr 上的很多图像都受版权保护。

如果你要继续这样做，你就必须使用Scrapy和Scrapy Selenium组合。Scrapy非常适合运行并发请求，这意味着您可以同时请求大量图像。您可以在此处了解有关Scrapy的更多信息：https://docs.scrapy.org/en/latest/

工作流程如下所示：

Scrapy向网站请求html解析，通过它查找class='overlay no outline'的所有标记
Scrapy 同时向每个 URL 发出请求。也就是说，这些 URL 不是一个接一个地依次请求，而是并发请求
当图像返回时，它们会被添加到数据库/存储空间中
Scrapy（可能是Selenium）滚动无限滚动页面并重复，而不重复已检查的图像（保留上次扫描项目的索引）

以上就是使用 Scrapy 所需的步骤，但我强烈建议不要尝试抓取 400 万个元素。您很可能会发现，由此带来的性能问题不值得花费这些时间，尤其是这本应是一次学习经历，您可能永远不需要抓取那么多元素

相关问题更多 >

编程相关推荐

热门问题

热门文章