Parsing multiple pages with BeautifulSoup?

Posted 2024-06-26 13:59:31


I have seen this answer in several places, but I can't seem to get it to work in my script below. I want to parse several pages here, right through to the last one:

My script should sit inside the page loop, but whenever I put it inside, I get an indentation error. Does that mean I need to indent the whole script? Or is the loop simply not suited to my script?

from bs4 import BeautifulSoup
import requests

page = 1
urldes = "https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4808&siteid=1&h=0&pageno={page}"
#"https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4740&siteid=1&h=0&pageno=14"

# add header
mozila_agent = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64)\
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
headers = {'User-Agent': mozila_agent}


with requests.Session() as session:
    while True:
        response = session.get(urldes.format(page=page), headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")

########## HOW TO parse the pages and collect the results here ?

    if page is 3 : #soup.find('u') is None:
        break  # last page

    page += 1

############################################################

the_whole_table = soup.find('table', width='97%')

datalist = []

for tr in the_whole_table.find_all('tr')[1:]:
    # you want to start from the 1st item not the 0th so [1:]
    # Because the first is the thead i.e. Lot no, Picture, Lot Title...
    index_num = tr.find('td', width='8%')
    picture_link = index_num.next_sibling.a['data-img']
    text_info = tr.find('td', width='41%')
    current_bid = tr.find('td', width='13%')
    time_left = tr.find('td', width='19%')
    datalist.append([index_num.text, picture_link,
                     text_info.text, current_bid.text, time_left.text])

    # for pic do ... print(picture_link) as for partial text only first 20
    # characters

index = datalist[0][0]
picture = datalist[0][1]
info = datalist[0][2]
bid = datalist[0][3]
time = datalist[0][4]


df = ['Index Number', 'Picture', 'Informational text',
      'Current BID', 'Time Left now']

theads = BeautifulSoup('<table style="width:50%; color: blue; font-family: verdana; font-size: 60%;"></table>', 'lxml')
thekeys = BeautifulSoup('<thead style="color: blue; font-family: verdana; font-size: 60%;"></thead>', 'html.parser')

#counter = 0
for i in df:
    tag = theads.new_tag('th')
    tag.append(i)
    thekeys.thead.append(tag)

theads.table.append(thekeys)
###############################################################
# The code above will initiate a table
# after that the for loop will create and populate the first row (thead)


for i in datalist:
#    thedata = BeautifulSoup('<tr style="color: blue; font-family: verdana; font-size: 50%;"></tr>', 'html.parser')
    thedata = BeautifulSoup('<tr></tr>', 'html.parser')

    # we loop through the data we collected
    # initiate a <td> </td> tag everytime we finish with one collection
    for j in i:
        if j.startswith('https'):
            img_tag = theads.new_tag('img', src=j, width='300')
            td_tag = theads.new_tag('td')
            td_tag.append(img_tag)
            thedata.append(td_tag)
#            counter += 1
        else:
  #            tag = theads.new_tag('td', style="color: blue; font-family: verdana; font-size: 50%;")
            tag = theads.new_tag('td')
            tag.append(j)
            thedata.append(tag)
#            counter += 1
#            if counter is 5:
#                counter = 0


    theads.table.append(thedata)



#print(counter)

css = "<style>{color: blue; font-family: verdana; font-size: 50%;}</style>"
#css.string = css


with open('asdf.html', 'w+') as f:
    f.write(theads.prettify())


print(css)

# each of these if you print them you'll get a information that you can store
# to test do print(index_num.text, text_info.text)
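The loop structure the question is after can be sketched as follows. The HTTP call is stubbed out (fetch and its fake rows are stand-ins for session.get plus the row parsing) so that only the control flow, and the indentation the parsing code needs, is shown:

```python
# Minimal sketch of the intended control flow. fetch() is a stand-in
# for session.get(urldes.format(page=page), headers=headers) + parsing.
def fetch(page):
    return ['row %d-%d' % (page, i) for i in range(2)]  # fake rows

datalist = []
page = 1
while True:
    rows = fetch(page)

    # the row-parsing code goes HERE, indented one level so that it
    # sits inside the while loop -- this avoids the IndentationError
    for row in rows:
        datalist.append(row)

    if page == 3:  # note: compare integers with ==, not `is`
        break
    page += 1

print(len(datalist))  # 6 rows collected from 3 pages
```

The break test and `page += 1` must be at the same indentation level as the parsing code, i.e. inside the while loop, or the break statement ends up outside any loop and raises a SyntaxError.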


Regarding template.html/CSS: I can see that I need to fill a template with the collected data, but if, for example, I want to assign each of the 5 elements in an "auction" value its own class, I can't find a way to assign them separately. My current code loops over every value, but since each value needs a different class tag, I have to tell the 5 elements apart and I don't know how.


As you can see, I can append tags, but the same value gets repeated instead of the loop moving through each value.
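The repetition described above is typically caused by reusing a single Tag object across iterations: appending a tag that is already in the tree moves it rather than copying it. A minimal illustration with made-up lot values, creating a fresh tag per value:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div class="auction"></div>', 'html.parser')
values = ['lot 1', 'lot 2', 'lot 3']  # made-up sample data

# A new <p> must be created inside the loop; reusing one tag object
# would just move the same element around instead of adding three.
for v in values:
    p = soup.new_tag('p')
    p.append(v)
    soup.div.append(p)

print([p.text for p in soup.find_all('p')])
```

This prints ['lot 1', 'lot 2', 'lot 3'], one paragraph per value.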


2 answers

So I managed to solve the second part of the problem with the code below. Instead of trying to modify elements already stored in the "auction" objects, I access the source directly and add the class attributes while extracting the tags.

Another problem I ran into was adding a class to a div tag, which has to be done with dict unpacking (**), e.g. soup.new_tag("div", **{'class':'auction'})
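The reason for the ** unpacking is that class is a reserved word in Python, so it cannot be passed as a normal keyword argument. Setting the attribute after creating the tag works just as well; a quick sketch:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup('', 'html.parser')

# `class` is a Python keyword, so new_tag('div', class='auction') is a
# SyntaxError; unpacking a dict sidesteps that:
div = soup.new_tag('div', **{'class': 'auction'})

# ...or create the tag first and set the attribute like a dict entry:
div2 = soup.new_tag('div')
div2['class'] = 'auction'

print(div)  # <div class="auction"></div>
```

The same trick applies to any attribute whose name is not a valid Python identifier, such as data-img.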

import itertools
from collections import namedtuple

import requests
from bs4 import BeautifulSoup

#saleid = '4793'
saleid = '4811'

url = 'https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=' + saleid + '&siteid=1&h=0&pageno={}'

auctions = []
Auction = namedtuple('auction',
                     ['index', 'picture_link', 'description', 'current_bid', 'time_left'])

for page in itertools.count(start=1):
    response = requests.get(url.format(page))
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', width='97%')

    for tr in table.find_all('tr')[1:]:  # skip the table header
        tds = tr.contents

        index = tds[0].find('h5').text
        index_tag = soup.new_tag("h5", **{'class':'index'})
        index_tag.append(index)

        pic_elem = tds[1].find('img')  # html of this img tag is broken, so I create a new one below
        picture = soup.new_tag('img', **{'class':'image'}, src=pic_elem['src'], width="160")
#        picture = soup.new_tag('img', **{'class':'image'}, src=pic_elem['src'], width=pic_elem['width'])

        description = tds[2].find('h5').text
        description_tag = soup.new_tag("h4", **{'class':'title'})
        description_tag.append(description)

        current_bid = tds[3].find('h5').text
        current_bid_tag = soup.new_tag("h4", **{'class':'price'})
        current_bid_tag.append(current_bid)

        time_left = tds[4].find('h5').text
        time_left_tag = soup.new_tag("h5", **{'class':'time'})
        time_left_tag.append(time_left)



        auction = Auction(index_tag, picture, description_tag, current_bid_tag, time_left_tag)
        auctions.append(auction)

    if not soup.find_all('a', string='Next'):
        break


soup = BeautifulSoup(
    '''
    <div class="container">
    </div>
    ''', 'lxml')

for auction in auctions:
    div_a = soup.new_tag("div", **{'class':'auction'})
    soup.div.append(div_a)

    for value in auction:    
        div_a.append(value)    


# this can also be included in the initial html, no need to do it programmatically
head = soup.new_tag('head')
head.append(soup.new_tag('meta', charset='utf-8'))
head.append(soup.new_tag('style', type='text/css'))
head.style.append(
    '''
* {
  margin: 0;
}

.container {
  font-family: "Arial";
  padding: 5px;
  display: grid;
  justify-items: center;
  grid-gap: 5px;
  grid-template-columns: repeat(5, 1fr);
  text-transform: capitalize;
}

.auction {
  display: grid;
  grid-template-columns: 140px auto;
  grid-template-areas:
    "title title time"
    "image image image"
    "image image image"
    "image image image"
    "price price index";

  width: 300px;
  height: 300px;
  border: 2px black solid;
  font-size: 12px;
}

.image {
  grid-area: image;
  margin: 0; /* 'margin: left' is not valid CSS */
}
.title {
  grid-area: title;
  text-transform: lowercase;
}
.price {
  grid-area: price;
}
.time {
  grid-area: time;
}
.index {
  grid-area: index;
}

.title, .price, .time, .index {
    padding: 10px;
}
    ''')

soup.html.insert(0, head)

with open('auctions.html', 'w') as f:
    f.write(soup.prettify())

You could either put your script inside a page loop and build the HTML between requests, or first fetch all the auctions, store them in some data structure such as a list, and then loop over it, appending rows to the HTML. I followed the second approach since it is closer to what you already have. I refactored the code and added:

if not soup.find_all('a', string='Next'):
    break

which breaks the loop once no a tag containing the text Next is found, meaning we are on the last page. What I don't like is creating the table headers and inserting the styles programmatically. If I were you, I would create a "template" HTML containing all the styles, the page title and the table, and read it from a file. You could then append the rows to the table's tbody. You could also include links to the auctions instead of plain text.
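The template idea from the last paragraph can be sketched as follows. The template contents and the sample rows here are made up, and in practice the string would be read from a file, e.g. with open('template.html'):

```python
from bs4 import BeautifulSoup

# Hypothetical template -- in practice this would come from a file,
# keeping the styles and table header out of the Python code.
template = '''
<html><head><style>.price { color: blue; }</style></head>
<body><table>
  <thead><tr><th>Lot</th><th>Current bid</th></tr></thead>
  <tbody></tbody>
</table></body></html>
'''
soup = BeautifulSoup(template, 'html.parser')
tbody = soup.find('tbody')

for lot, bid in [('1001', '£25'), ('1002', '£40')]:  # sample data
    tr = soup.new_tag('tr')
    for text in (lot, bid):
        td = soup.new_tag('td')
        td.append(text)
        tr.append(td)
    tbody.append(tr)

print(len(tbody.find_all('tr')))  # 2 rows appended to the template
```

Only the scraped data is built programmatically; everything presentational stays in the template file.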

The full script above works and, when last checked, produced an HTML file containing more than 1300 auctions.
