如何让表头中只保留不重复的列名，并把每个条目对应的值写入不同的行？

import csv

import requests
from bs4 import BeautifulSoup


def cpap_spider(max_pages):
    """Crawl listing pages 1..max_pages and scrape every product they link to.

    Each anchor with class ``facets-item-cell-grid-title`` is treated as a
    product link; its page is passed to ``each_item`` and its URL is printed.
    """
    page = 1
    while page <= max_pages:
        url = "https://www.1800cpap.com/cpap-masks/nasal?page=" + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll("a", {"class": "facets-item-cell-grid-title"}):
            href = "https://www.1800cpap.com" + link.get("href")
            title = link.string
            each_item(href)
            print(href)
            #print(title)
        page += 1


# Table rows gathered from every product page visited so far.
data = []


def each_item(item_url):
    """Append the rows of one product page's table to ``data`` and rewrite the CSV.

    Every ``<tr>`` of the ``table.table`` element contributes one list holding
    the stripped, non-empty text of its ``<td>`` cells. The CSV file is then
    rewritten from the full accumulated ``data`` list.
    """
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    table = soup.find("table", {"class": "table"})
    table_rows = table.find_all('tr')
    for row in table_rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # Get rid of empty values
    # newline='' is what the csv module expects for files handed to csv.writer;
    # the with-block closes the file even if writing fails.
    with open('all_appended.csv', 'w', newline='') as b:
        a = csv.writer(b)
        a.writerows(data)


cpap_spider(1)

2条回答

网友

1楼 · 编辑于 2024-07-01 07:41:25

假设表头始终是每个表的第一行，那么只需在除第一个表之外的每个表中跳过该行即可。一种简单的方法是：用一个初始值为 0 的变量保存要处理的起始行号，并在第一次调用 each_item 之后将其设为 1。示例代码如下：

def cpap_spider(max_pages):
    """Crawl the listing pages, keeping the header row of the first table only.

    ``start_row`` is 0 for the first ``each_item`` call and 1 for every later
    call, so only the first scraped table keeps its first (header) row.
    The ``...`` lines stand for code the answer left out.
    """
    page=1
    start_row = 0
    while page<=max_pages:
        ...
        for link in soup.findAll("a", {"class":"facets-item-cell-grid-title"}):
            ...
            each_item(href, start_row)
            start_row = 1        # only the first call to each_item gets start_row=0; every later call gets 1
            print(href)
            #print(title)
        page+=1
...
def each_item(item_url, start_row):
    """Process the rows of one product table, starting at index ``start_row``.

    ``start_row`` is used as the slice start over the table's ``<tr>`` rows:
    0 processes every row, 1 skips the first one. The ``...`` lines stand for
    code the answer left out.
    """
    ...    
    table_rows= table.find_all('tr')
    for row in table_rows[start_row:]:        # skip first row if start_row==1
        ...

网友

2楼 · 编辑于 2024-07-01 07:41:25

建议使用 xlsxwriter 而不是 csv：如果文本中包含后面不带空格的逗号（即 "," 而不是 ", "），生成的 csv 文件就会出问题，因为各列的值是以 "," 分隔的。例如，当 text = "aa,bb" 时，csv 会认为这段文本包含 "aa" 和 "bb" 两列。

这就是你需要的

import requests
import xlsxwriter
from bs4 import BeautifulSoup 
def cpap_spider(max_pages):
    """Walk the nasal-mask listing pages and record every product found.

    For each listing page from 1 through ``max_pages``, every anchor with
    class ``facets-item-cell-grid-title`` is treated as a product link: its
    text is written to column 0 of the current worksheet row, the product
    page is handed to ``each_item``, and the product URL is printed.

    Reads the module-level ``worksheet`` and ``row_i``.
    """
    for page_number in range(1, max_pages + 1):
        listing_url = "https://www.1800cpap.com/cpap-masks/nasal?page=" + str(page_number)
        listing_html = requests.get(listing_url).text
        listing_soup = BeautifulSoup(listing_html, 'html.parser')
        product_links = listing_soup.findAll("a", {"class": "facets-item-cell-grid-title"})
        for anchor in product_links:
            product_url = "https://www.1800cpap.com" + anchor.get("href")
            product_title = anchor.string
            # The title goes in first; each_item fills the rest of the row.
            worksheet.write(row_i, 0, product_title)
            each_item(product_url)
            print(product_url)

def each_item(item_url):
    """Scrape one product page's spec table into the current worksheet row.

    Each ``<tr>`` of the page's ``table.table`` element is read as a
    label cell followed by value cells. The label (minus one trailing ``:``)
    selects a worksheet column; a label seen for the first time is appended to
    ``cols_names`` and written into header row 0. Non-empty values are written
    into row ``row_i`` of that column. ``row_i`` is advanced once the table has
    been processed.

    Pages without such a table are skipped and leave ``row_i`` unchanged.
    Rows with no ``<td>`` cells or an empty label cell are ignored.

    Args:
        item_url: URL of the product page to scrape.
    """
    global row_i
    page_html = requests.get(item_url).text
    soup = BeautifulSoup(page_html, 'html.parser')
    table = soup.find("table", {"class": "table"})
    if table is None:
        return

    for row in table.find_all('tr'):
        cells = []
        for cell in row.find_all('td'):
            text = cell.text.strip()
            # Drop one trailing colon so "Actual Weight:" and "Actual Weight"
            # end up in the same column.
            if text.endswith(":"):
                text = text[:-1]
            cells.append(text)

        # Without a label there is no column to put the values in.
        if not cells or not cells[0]:
            continue
        label, values = cells[0], cells[1:]

        try:
            column = cols_names.index(label)
        except ValueError:
            # First time this label is seen: give it the next column and
            # write its header.
            cols_names.append(label)
            column = len(cols_names) - 1
            worksheet.write(0, column + 1, label)

        # Only values go into the data row; the label itself is never
        # written there, so an empty value leaves the cell blank.
        for value in values:
            if value:
                worksheet.write(row_i, column + 1, value)

    row_i += 1
    
# Column labels seen so far; a label's index + 1 is its worksheet column
# (column 0 holds the product title).
cols_names=[]
# Kept as-is; each_item does not declare this name global.
cols_names_i = 0
# Next worksheet row to fill; row 0 is the header row.
row_i = 1
workbook = xlsxwriter.Workbook('all_appended.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write(0, 0, "Title")

try:
    cpap_spider(1)
    # Debug alternative: scrape a single product page instead of crawling.
    #each_item("https://www.1800cpap.com/viva-nasal-cpap-mask-by-3b-medical")
finally:
    # The .xlsx file is only written on close(), so close even when the
    # crawl raises; otherwise everything scraped so far is lost.
    workbook.close()

相关问题更多 >

编程相关推荐

热门问题

热门文章