How to loop through BS4 data and correctly print the div tags

Published 2024-09-30 18:23:43


I'm trying to copy all of the data from an HTML page that has the specific class "chapter_header_styling", using BS4.

This works when I enter the URL manually, but it's tedious when there are multiple books and different chapters. So I created another script that generates all of the chapter URLs for a book and combines them into a text file, bchap.txt (book chapters).

Since then I've modified the file and added various breakpoints, so ignore my missing comments and unused arrays/lists. I've narrowed it down to the ###Comment### section where it doesn't work. It's probably not quite right, but I'm not sure... I got it working up to a point, but I can't figure out why it won't write the mydivs data into the BOOK.html file. If anyone with more experience could point me in the right direction, it would be greatly appreciated.

#mkbook.py
# coding: utf-8
from bs4 import BeautifulSoup
import requests

LINK = "https://codes.iccsafe.org/content/FAC2017"
pop = ""
#z = ""
chapters = open("bchap.txt",'r')
a = []
for aline in chapters:
  chap = aline
  #print (chap)
  #pop = ""
  pop = LINK+chap
  #print (pop)
  r = requests.get(pop)
  data = r.text
  #print(data)

  soup = BeautifulSoup(data, 'html.parser')

  mydivs = soup.findAll("div", {"class": ["annotator", "chapter_header_styling"]})

  f = open("BOOK.html","a")
  f.write("test <br/>")

########################################
#MY PROBLEM IS BELOW NOT PRINTING DIV DATA INTO TXT FILE
########################################
  for div in mydivs:
      print (div)
      z = str(div)
      print(z)  #doesn't printout...why???
      f.write(z)
  print len(mydivs)

  f.close()

chapters.close()



##############################################
## this is the old mkbook.py code before I looped it - inputting one URL at a time
#
# coding: utf-8
from bs4 import BeautifulSoup
import requests
r = requests.get("https://codes.iccsafe.org/content/FAC2017/preface")
data = r.text
soup = BeautifulSoup(data, 'html.parser')
a = []
mydivs = soup.findAll("div",{"class":["annotator", 
"chapter_header_styling"]})
f = open("BOOK.html","a")
for div in mydivs:
  z = str(div)
  f.write(z)
f.close()
print len(mydivs) #outputs 1 if copied div data.

#######################################
#mkchap.py
# coding: utf-8
from bs4 import BeautifulSoup
import requests
r = requests.get("https://codes.iccsafe.org/content/FAC2017")
data = r.text
soup = BeautifulSoup(data, 'html.parser')
a = []
soup.findAll('option',{"value":True})
list = soup.findAll('option')
with open('bchap.txt', 'w') as filehandle:
  for l in list:
    filehandle.write(l['value'])
    filehandle.write("\n")
    print l['value']
#with open('bchap.txt', 'w') as filehandle:
#   filehandle.write("%s\n" % list)
filehandle.close()

1 Answer

The problem seems to be that you're building your URLs with the wrong base URL.

LINK = "https://codes.iccsafe.org/content/FAC2017"

You can see this clearly if you look at your first request.

print(pop)
print(r.status_code)

Output:

https://codes.iccsafe.org/content/FAC2017/content/FAC2017

404

After running the code that populates bchap.txt, its contents are:

/content/FAC2017
/content/FAC2017/legend
/content/FAC2017/copyright
/content/FAC2017/preface
/content/FAC2017/chapter-1-application-and-administration
/content/FAC2017/chapter-2-scoping-requirements
/content/FAC2017/chapter-3-building-blocks
/content/FAC2017/chapter-4-accessible-routes
/content/FAC2017/chapter-5-general-site-and-building-elements
/content/FAC2017/chapter-6-plumbing-elements-and-facilities
/content/FAC2017/chapter-7-communication-elements-and-features
/content/FAC2017/chapter-8-special-rooms-spaces-and-elements
/content/FAC2017/chapter-9-built-in-elements
/content/FAC2017/chapter-10-recreation-facilities
/content/FAC2017/list-of-figures
/content/FAC2017/fair-housing-accessibility-guidelines-design-guidelines-for-accessible-adaptable-dwellings
/content/FAC2017/advisory
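As an aside (not part of the fix below): since bchap.txt holds site-relative paths, `urllib.parse.urljoin` from the standard library is a common alternative to concatenating strings by hand. A minimal sketch:

```python
from urllib.parse import urljoin

BASE = "https://codes.iccsafe.org"
# A line as it is read from bchap.txt, trailing newline included
line = "/content/FAC2017/preface\n"

# strip() drops the trailing newline; urljoin handles joining the
# base URL with a site-relative path
url = urljoin(BASE, line.strip())
print(url)  # https://codes.iccsafe.org/content/FAC2017/preface
```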

Let's change the base URL first and try again.

from bs4 import BeautifulSoup
import requests

LINK = "https://codes.iccsafe.org"
pop = ""
chapters = open("bchap.txt",'r')
a = []
for aline in chapters:
  chap = aline
  pop = LINK+chap
  r = requests.get(pop)
  print(pop)
  print(r.status_code)
chapters.close()

Output:

https://codes.iccsafe.org/content/FAC2017

404
...

Why? Because of the \n. If we do a

print(repr(pop))

it will output

'https://codes.iccsafe.org/content/FAC2017\n'
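A minimal illustration of the problem and the fix, using `str.strip()` to remove the newline:

```python
# Each line read from bchap.txt keeps its trailing newline
pop = "https://codes.iccsafe.org/content/FAC2017\n"

print(repr(pop))          # 'https://codes.iccsafe.org/content/FAC2017\n'
print(repr(pop.strip()))  # 'https://codes.iccsafe.org/content/FAC2017'
```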

You still have to strip that \n away. The final code becomes:

from bs4 import BeautifulSoup
import requests
LINK = "https://codes.iccsafe.org"
pop = ""
chapters = open("bchap.txt",'r')
a = []
for aline in chapters:
  chap = aline
  pop = LINK+chap
  r = requests.get(pop.strip())
  data = r.text
  soup = BeautifulSoup(data, 'html.parser')
  mydivs = soup.findAll("div", class_="annotator chapter_header_styling")
  f = open("BOOK.html","a")
  for div in mydivs:
      z = str(div)
      f.write(z)
  f.close()
chapters.close()
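One BeautifulSoup detail worth noting: the question's list form `{"class": ["annotator", "chapter_header_styling"]}` matches a div carrying *either* class, while the string form `class_="annotator chapter_header_styling"` used above matches only a class attribute that is exactly that string. A small sketch of the difference (the HTML here is made up for illustration):

```python
from bs4 import BeautifulSoup

html = ('<div class="annotator chapter_header_styling">both</div>'
        '<div class="annotator">one</div>')
soup = BeautifulSoup(html, "html.parser")

# List form: a div matches if it has ANY of the listed classes
either = soup.find_all("div", {"class": ["annotator", "chapter_header_styling"]})

# String form: matches only the exact class attribute string
exact = soup.find_all("div", class_="annotator chapter_header_styling")

print(len(either), len(exact))  # 2 1
```

If you want divs that carry both classes regardless of order, a CSS selector such as `soup.select("div.annotator.chapter_header_styling")` is the usual approach.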
