Stuck on web scraping code

Posted 2024-05-02 18:11:53


I have the code below; I want it to go to a webpage, pull all the relevant comics off the site, and store them on my computer. The first image downloads fine, but there seems to be a problem with looping back to the previous page of the site. If anyone could look over the code and help, it would be much appreciated. The error I get is:

Traceback (most recent call last):
  File "C:\Users\528000\Desktop\kids print\Comic-gather.py", line 41, in <module>
    prevLink = soup.select('a[class="prevLink"]')[0]
IndexError: list index out of range


import requests, os, bs4

url = 'http://darklegacycomics.com'
os.makedirs('darklegacy', exist_ok=True)
while not url.endswith('#'):
    # Download the page.
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()

    soup = bs4.BeautifulSoup(res.text)
    comicElem = soup.select('.comic img')
    if comicElem == []:
        print('Could not find comic image.')
    else:
        try:
            comicUrl = 'http://darklegacycomics.com' + comicElem[0].get('src')
            # Download the image.
            print('Downloading image %s...' % (comicUrl))
            res = requests.get(comicUrl)
            res.raise_for_status()
        except requests.exceptions.MissingSchema:
            # Skip this comic.
            prevLink = soup.select('.prevlink')[0]
            url = 'http://darklegacycomics.com' + prevLink.get('href')
            continue

        # Save the image to ./darklegacy.
        imageFile = open(os.path.join('darklegacy', os.path.basename(comicUrl)), 'wb')
        for chunk in res.iter_content(100000):
            imageFile.write(chunk)
        imageFile.close()

    # Get the Prev button's url.
    prevLink = soup.select('a[class="prevLink"]')[0]
    url = 'http://darklegacycomics.com' + prevLink.get('href')
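
The IndexError in the traceback means soup.select('a[class="prevLink"]') returned an empty list, so indexing it with [0] fails. Note that the code queries .prevlink in the except branch but prevLink at the bottom of the loop; CSS class matching is case-sensitive, so at most one of the two can ever match. A minimal guard for the end of the loop (a sketch; which spelling the site actually uses is an assumption here):

# Try both spellings and end the loop cleanly if neither matches
# (which class name the site really uses is unverified).
matches = soup.select('a.prevLink') or soup.select('a.prevlink')
if not matches:
    print('No previous link found; stopping.')
    url = url + '#'  # makes the while condition false, ending the loop
else:
    url = 'http://darklegacycomics.com' + matches[0].get('href')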

1 Answer

#1 · Posted 2024-05-02 18:11:53

This will get all the images:

import requests, os, bs4
from urllib.parse import urljoin  # Python 3; on Python 2 this was "from urlparse import urljoin"

url = 'http://darklegacycomics.com'

soup = bs4.BeautifulSoup(requests.get(url).content, 'html.parser')

# Get all img links whose src value starts with /image.
links = soup.select('.comic img[src^="/image"]')

for img in links:
    # Extract the link.
    src = img['src']
    # Use the image name as the file name; open in binary mode,
    # since response.content is bytes.
    with open(os.path.basename(src), 'wb') as f:
        # Join the base and image URLs and write the content to disk.
        f.write(requests.get(urljoin(url, src)).content)
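
If you want the files in a darklegacy folder, as in the question, the same loop can write there, with raise_for_status() added so HTTP errors surface instead of silently saving an error page (a sketch reusing url and links from above):

os.makedirs('darklegacy', exist_ok=True)
for img in links:
    src = img['src']
    res = requests.get(urljoin(url, src))
    res.raise_for_status()  # fail loudly on HTTP errors
    # Save under ./darklegacy using the image's base name.
    with open(os.path.join('darklegacy', os.path.basename(src)), 'wb') as f:
        f.write(res.content)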
