TypeError: 'Request' object is not iterable when parsing HTML

Posted 2024-04-19 06:40:20


I wrote a script with BeautifulSoup to parse information from a website, but I'm running into problems with it.

As you can see in the code, with the commented-out websiteReq line the error it gives is

raise HTTPError(req.full_url, code, msg, hdrs, fp)
HTTPError: Forbidden

I searched online and using Request was recommended. I tried it, as shown in the code, but then I get the error TypeError: 'Request' object is not iterable.

I would like to know where I am going wrong. Thank you, and have a nice day.

Code:

# -*- coding: utf-8 -*-
"""
Created on Tue Jul 23 22:39:05 2019

@author: Nijat
"""

try:
    # For Python 3.0 and later
    from urllib.request import Request, urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen

from bs4 import BeautifulSoup
import csv 
from datetime import datetime
import requests

webData = []

#websiteReq = ['https://boss.az/vacancies'] #FORBIDDEN ERROR WITH THIS 

websiteReq = Request('https://boss.az/vacancies', headers={'User-Agent': 'Mozilla/5.0'}) #Request not iterable

#for pg in websiteReq: #ERROR OCCURS HERE
page = urlopen(websiteReq)
soupping = BeautifulSoup(page, 'html.parser').encode("utf-8")

takingTitle = soupping.find_all('h3', attrs={'class':'results-i-title'})
takingCompany = soupping.find_all('a', attrs={'class':'results-i-company'})
takingDescription = soupping.find_all('div', attrs={'class':'results-i-summary'})

nameofVac = takingTitle.text.strip()
nameofComp = takingCompany.text.strip()
nameofDescript = takingDescription.text.strip()

webData.append((nameofVac, nameofComp, nameofDescript))

with open('Vacancies.csv','a') as csvfile:
    writingtocsv = csv.writer(csvfile)
    for nameofVac, nameofComp in webData:
        writingtocsv.writerow([nameofVac, nameofComp, nameofDescript,datetime.now()])

Updated code, based on the answer:

So the result gives me all the information parsed from the first page, roughly 30 vacancies. If you want to add the other pages, you just do +=

^{pr2}$

But now it doesn't read all of the pages. Any clue why?
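For context, the kind of multi-page loop described above ("just do +=") would look roughly like the sketch below. This is only a sketch, not the original updated code: the ?page=N query parameter and the page count of 10 are assumptions about how the site paginates.

# Sketch of a multi-page loop with urllib + BeautifulSoup.
# ASSUMED: the site accepts a ?page=N query parameter and has about 10 pages.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

webData = []
for pageNum in range(1, 11):  # assumed page count
    req = Request('https://boss.az/vacancies?page={}'.format(pageNum),
                  headers={'User-Agent': 'Mozilla/5.0'})
    page = urlopen(req)
    soup = BeautifulSoup(page, 'html.parser')  # parse the response directly; no .encode()
    titles = soup.find_all('h3', attrs={'class': 'results-i-title'})
    companies = soup.find_all('a', attrs={'class': 'results-i-company'})
    summaries = soup.find_all('div', attrs={'class': 'results-i-summary'})
    # find_all returns lists, so take .text element by element
    for t, c, s in zip(titles, companies, summaries):
        webData.append((t.text.strip(), c.text.strip(), s.text.strip()))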


Tags: csv, code, from, import, datetime, request, all, find
3 Answers

find_all returns a list, so you can't call .text on any of the variables it returns. I would use requests, and since your selectors return lists of equal length, zip them together and put them into a DataFrame with pandas. Then you can write that to csv.

from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

url = 'https://boss.az/vacancies'
r = requests.get(url)
soup = bs(r.content, 'lxml')
titles = [i.text for i in soup.select('.results-i-title')]
companies = [i.text for i in soup.select('.results-i-company')]
summaries = [i.text for i in soup.select('.results-i-summary')]
df = pd.DataFrame(list(zip(titles, companies, summaries)), columns=['Title', 'Company', 'Summary'])
df.to_csv(r'Data.csv', sep=',', encoding='utf-8-sig', index=False)
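If the timestamp column from the original script is still wanted, it can be added to the DataFrame built above before writing; the 'Scraped' column name here is just an illustrative choice, not part of the answer.

from datetime import datetime

# Continues from the DataFrame built above: add a scrape timestamp column,
# mirroring the datetime.now() field written to the CSV in the question's code.
df['Scraped'] = datetime.now()
df.to_csv(r'Data.csv', sep=',', encoding='utf-8-sig', index=False)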

Request() returns an object of the class Request. That object is not iterable. You should use one of the methods of this class to get the text/content back; then you can parse it with BS.

>>> import requests
>>> response = requests.get('https://httpbin.org/get')
>>> print(response.content)
b'{\n  "args": {}, \n  "headers": {\n    "Accept": "*/*", \n    "Accept-Encoding": "gzip, deflate", \n    "Host": "httpbin.org", \n    "User-Agent": "python-requests/2.9.1"\n  }, \n  "origin": "95.56.82.136", \n  "url": "https://httpbin.org/get"\n}\n'
>>> response.json()
{'headers': {'Accept-Encoding': 'gzip, deflate', 'User-Agent': 'python-requests/2.9.1', 'Host': 'httpbin.org', 'Accept': '*/*'}, 'args': {}, 'origin': '95.56.82.136', 'url': 'https://httpbin.org/get'}
>>> response.headers
{'Connection': 'keep-alive', 'Content-Type': 'application/json', 'Server': 'nginx', 'Access-Control-Allow-Credentials': 'true', 'Access-Control-Allow-Origin': '*', 'Content-Length': '237', 'Date': 'Wed, 23 Dec 2015 17:56:46 GMT'}
>>> response.headers.get('Server')
'nginx'
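To tie that back to the original page: the response's text (or content) is what gets handed to BeautifulSoup. A minimal sketch, reusing the title selector from the question:

import requests
from bs4 import BeautifulSoup

# Fetch the page with requests, then parse the response body with BeautifulSoup.
response = requests.get('https://boss.az/vacancies',
                        headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(response.text, 'html.parser')
titles = soup.find_all('h3', attrs={'class': 'results-i-title'})
print([t.text.strip() for t in titles])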

Just passing by... Instead of BeautifulSoup, why not a regular expression? Something like this:

import re
import requests

data = []
s = requests.Session()
# Grab the raw HTML of the listing page (with a browser User-Agent to avoid the 403)
page_html = s.get('https://en.boss.az/vacancies',
                  headers={'User-Agent': 'Mozilla/5.0'}).text

# Match each vacancy title heading followed by its company link, and capture the link's text
regex_logic = re.compile(
    r'<h3 class="results-i-title">.*?<\/h3><a target=.*?class="results-i-company" href=.*?>((\w|\s)+)')
myiter = re.finditer(regex_logic, page_html)
# Walk the iterator manually, collecting group(1) until it is exhausted
while True:
    try:
        data.append(next(myiter).group(1))
    except StopIteration:
        break

The code above is the more verbose version, with a fairly long regex.

The idea is to take the text of the GET response and build an iterator over it with the regular expression. Using your example, I get what I believe are the company names. Then you just loop over the iterator.

If you want the data from every page, just find out how many pages there are and adjust the GET url according to this particular site's logic.
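A rough sketch of that page loop, reusing the same Session and regex; the ?page=N parameter and the range of 5 pages are guesses about this site's pagination and would need to be adjusted to whatever the site actually uses.

import re
import requests

regex_logic = re.compile(
    r'<h3 class="results-i-title">.*?<\/h3><a target=.*?class="results-i-company" href=.*?>((\w|\s)+)')

data = []
s = requests.Session()
for page in range(1, 6):  # assumed number of pages
    page_html = s.get('https://en.boss.az/vacancies',
                      params={'page': page},  # assumed pagination parameter
                      headers={'User-Agent': 'Mozilla/5.0'}).text
    # Collect group(1) of every match on this page
    data.extend(m.group(1) for m in re.finditer(regex_logic, page_html))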
