urlretrieve 在下载若干文件后抛出 HTTP 404 错误

from bs4 import BeautifulSoup
import requests

# Site root that every scraped src value is appended to.
url_ds3_part1 = 'https://darksouls3.wiki.fextralife.com'

# Weapon category -> wiki page listing the weapons of that category.
url_ds3 = {
    "daggers": "https://darksouls3.wiki.fextralife.com/Daggers",
    "straight_swords": "https://darksouls3.wiki.fextralife.com/Straight+Swords",
    "great_swords": "https://darksouls3.wiki.fextralife.com/Greatswords",
    "ultra_great_swords": "https://darksouls3.wiki.fextralife.com/Ultra+Greatswords",
    "curved_swords": "https://darksouls3.wiki.fextralife.com/Curved+Swords",
    "katanas": "https://darksouls3.wiki.fextralife.com/Katanas",
    "curved_great_swords": "https://darksouls3.wiki.fextralife.com/Curved+Greatswords",
    "piercing_swords": "https://darksouls3.wiki.fextralife.com/Piercing+Swords",
}

# Download every <img> of every category page whose src contains none of the
# excluded substrings.  'None' matches the string form of a missing src.
# NOTE: urllib.request is used below but is never imported in this snippet.
for url in url_ds3.values():
    page_html = requests.get(url).content
    soup = BeautifulSoup(page_html, 'lxml')
    for image_tag in soup.findAll('img'):
        src_text = str(image_tag.get('src'))
        if any(word in src_text for word in (
                'forum', 'None', 'avatar', 'Damage', 'Resist', 'STR',
                'DEX', 'INT', 'FTH', 'attack', 'normal')):
            continue
        # The site root is prepended unconditionally, whatever form src has.
        urllib.request.urlretrieve(url_ds3_part1 + src_text,
                                   'images_swords' + src_text)

# Diagnostic pass: print the URL that would be requested for every <img>
# whose src contains none of the excluded substrings.
for url in url_ds3.values():
    page_html = requests.get(url).content
    soup = BeautifulSoup(page_html, 'lxml')
    for image_tag in soup.findAll('img'):
        src_text = str(image_tag.get('src'))
        if any(word in src_text for word in (
                'forum', 'None', 'avatar', 'Damage', 'Resist', 'STR',
                'DEX', 'INT', 'FTH', 'attack', 'normal')):
            continue
        print(url_ds3_part1 + src_text)

# Same download loop as the first attempt, with each page's downloads wrapped
# in a single try block.
for url in url_ds3.values():
    page_html = requests.get(url).content
    soup = BeautifulSoup(page_html, 'lxml')
    try:
        for image_tag in soup.findAll('img'):
            src_text = str(image_tag.get('src'))
            if any(word in src_text for word in (
                    'forum', 'None', 'avatar', 'Damage', 'Resist', 'STR',
                    'DEX', 'INT', 'FTH', 'attack', 'normal')):
                continue
            urllib.request.urlretrieve(url_ds3_part1 + src_text,
                                       'images_swords' + src_text)
    except:
        # The first failure abandons the remaining images of this page and is
        # discarded without any report; the loop moves on to the next page.
        pass

1条回答

网友

1楼 · 发布于 2024-04-16 16:49:07

运行代码后可以看到，部分图片的 src 是绝对 URL（以 https:// 开头），其余则是相对路径。把站点前缀再拼接到绝对 URL 前面会得到无效地址，服务器因此返回 404，所以需要区分这两种情况：

import os
import urllib.error
import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup

# Scheme and host of the wiki (site root, no trailing slash).
url_ds3_part1 = 'https://darksouls3.wiki.fextralife.com'

# (category key, wiki page name) pairs, in the order the pages are visited.
_category_pages = (
    ("daggers", "Daggers"),
    ("straight_swords", "Straight+Swords"),
    ("great_swords", "Greatswords"),
    ("ultra_great_swords", "Ultra+Greatswords"),
    ("curved_swords", "Curved+Swords"),
    ("katanas", "Katanas"),
    ("curved_great_swords", "Curved+Greatswords"),
    ("piercing_swords", "Piercing+Swords"),
)

# Weapon category -> full URL of the wiki page listing that category.
url_ds3 = {
    category: url_ds3_part1 + "/" + page
    for category, page in _category_pages
}

# Substrings marking images that should not be downloaded.  'None' is kept
# from the original filter, where it matched the string form of a missing src
# attribute.
EXCLUDED_SRC_PARTS = ('forum', 'None', 'avatar', 'Damage', 'Resist', 'STR',
                      'DEX', 'INT', 'FTH', 'attack', 'normal')

# Local directory prefix under which downloaded images are stored.
IMAGE_DIR = 'images_swords'


def is_weapon_image(src):
    """Return True if *src* should be downloaded.

    A src is accepted when it is a non-empty string that contains none of
    the substrings in EXCLUDED_SRC_PARTS (case-sensitive).  A missing
    (None) or empty src is rejected.
    """
    if not src:
        return False
    return not any(part in src for part in EXCLUDED_SRC_PARTS)


def local_image_path(image_url):
    """Return the local destination for the absolute URL *image_url*.

    Only the path component of the URL is used, so host, query string and
    fragment never end up in the file name.  For images hosted on the wiki
    this yields the same 'images_swords/<src path>' layout as before.
    """
    url_path = urllib.parse.urlparse(image_url).path
    return IMAGE_DIR + '/' + url_path.lstrip('/')


def download_weapon_images():
    """Download the accepted images of every page listed in url_ds3.

    Each src is resolved against the URL of the page it was found on, so
    absolute, protocol-relative, root-relative and path-relative values all
    produce a valid request URL.  A download that fails is reported and
    skipped, so the remaining images are still fetched.
    """
    for url in url_ds3.values():
        print(url)

        # A timeout keeps one unresponsive page from hanging the whole run.
        content = requests.get(url, timeout=30).content
        soup = BeautifulSoup(content, 'lxml')

        for image_tag in soup.find_all('img'):
            src = image_tag.get('src')
            if not is_weapon_image(src):
                continue

            image_url = urllib.parse.urljoin(url, src)
            target = local_image_path(image_url)
            if not os.path.basename(target):
                # The URL path ends in '/', so there is no file name to save to.
                continue

            # urlretrieve does not create missing directories itself.
            os.makedirs(os.path.dirname(target), exist_ok=True)
            try:
                urllib.request.urlretrieve(image_url, target)
            except urllib.error.URLError as err:
                # HTTPError (e.g. 404) is a subclass of URLError.
                print('skipped {}: {}'.format(image_url, err))


if __name__ == '__main__':
    download_weapon_images()

相关问题更多 >

编程相关推荐

热门问题

热门文章