无法定位某些<div>，尽管它们存在

# main method (excerpt: `self`, `url` and `product_name` come from the
# enclosing class/method, which is not shown in the snippet)
d = webdriver.Chrome('/Users/fatima.arshad/Downloads/chromedriver')
d.get(url)
start = BeautifulSoup(d.page_source, 'html.parser')
Image_URL = self.saveImage("./products/", product_name, start)
recommendations = self.getRecommendations(start, d)


def getRecommendations(self, start, d):
    """Collect links found inside the horizontal swiper containers.

    Args:
        start: BeautifulSoup document parsed from the driver's page source.
        d: the Selenium driver; its ``page_source`` is re-parsed while the
            containers have not shown up yet.

    Returns:
        A list of absolute ``https://loft.com`` URLs, one per ``<a>`` found.
        Empty if the containers never appear.
    """
    import time

    # code to scroll to the bottom of page
    recommended = []
    css_class = 'swiper-container swiper-container-horizontal'
    containers = start.find_all('div', class_=css_class)
    # The original loop re-queried the same static soup, so it could never
    # terminate. Re-parse the live page instead, and give up after a bounded
    # number of attempts rather than spinning forever.
    attempts = 0
    while not containers and attempts < 20:
        time.sleep(0.5)
        start = BeautifulSoup(d.page_source, 'html.parser')
        containers = start.find_all('div', class_=css_class)
        attempts += 1
    for data in containers:
        for a in data.find_all('a'):
            print(a.get('href'))  # for getting link
            print(a.text)  # for getting text between the link
            recommended.append("https://loft.com" + str(a.get('href')))
    # The caller assigns the result, so the list must be returned.
    return recommended


def saveImage(self, foldername, product_name, start):
    """Download every ``<img>`` found inside the ``swiper-wrapper`` divs.

    Files are written as ``<foldername>/<product_name><c>.jpg``.
    """
    ## some other code
    # NOTE(review): `c` (the running image counter) is presumably initialised
    # in the elided code above — confirm.
    for i in start.find_all('div', class_='swiper-wrapper'):
        for img in i.select('img'):
            print(img['src'])
            urllib.request.urlretrieve(
                "http://" + img['src'],
                foldername + "/" + product_name + str(c) + ".jpg",
            )
            c = c + 1

1条回答

网友

1楼 · 发布于 2024-09-29 23:17:44

这些图片链接是由页面动态构建的，因此不会出现在初始的 HTML 源码中。您可以在浏览器开发者工具的 Network 选项卡中找到对应的 GET 请求，该请求以 JSON（JSONP）格式返回用于构建各个图片 URL 的信息。

您可以模仿这些步骤：

from bs4 import BeautifulSoup as bs
import requests, re, json

# The image-set endpoint is queried with a JSONP callback (callback=s7R_1), so
# the payload is wrapped in a function call. Capture the text between the
# first "(" and the last "," on that line, which is expected to be the JSON.
p = re.compile(r'\((.*),')

# Seconds to wait for each HTTP response; without it requests can block forever.
TIMEOUT = 30

with requests.Session() as s:
    r = s.get('https://www.loft.com/loft-plus-floral-maxi-shirtdress/514793', timeout=TIMEOUT)
    r.raise_for_status()
    soup = bs(r.content, 'lxml')
    img = soup.select_one('.product-image img')
    if img is None or not img.get('src'):
        raise ValueError('product image not found in the page')
    # Drop the query string to get the base image URL.
    src = img['src'].split('?')[0]
    r = s.get(f'{src}_IS?req=set,json&callback=s7R_1&handler=s7R_1&_=1', timeout=TIMEOUT)
    r.raise_for_status()

match = p.search(r.text)
if match is None:
    raise ValueError('unexpected image-set response: JSONP wrapper not found')
data = json.loads(match.group(1))
items = data['set']['item']
# NOTE(review): a set with a single image may come back as one object rather
# than a list — confirm against a real response.
if isinstance(items, dict):
    items = [items]
for item in items:
    i = item['i']['n']
    image_url = f'https://anninc.scene7.com/is/image/{i}?$pdp$'
    print(image_url)

相关问题更多 >

编程相关推荐

热门问题

热门文章