Indeed：如何提取职位的 href 链接？

import re
import time
import urllib
from random import randint

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options

# Ask the Chrome driver to accept unexpected alerts instead of failing on them.
webdriver.DesiredCapabilities.CHROME["unexpectedAlertBehaviour"] = "accept"

# The options object must exist before the driver that consumes it is created.
options = Options()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument("--disable-extensions")

driver = webdriver.Chrome(chrome_options=options, executable_path='chromedriver')
try:
    driver.get("https://www.indeed.co.uk/automotive-engineer-jobs-in-uk")
    # page_source is a plain string, so parsing does not need a live browser.
    soup = BeautifulSoup(driver.page_source, "lxml")
finally:
    # Always shut the browser and the chromedriver process down, even on error.
    driver.quit()

title = [tag.text.strip() for tag in soup.select('.jobtitle')]
company = [tag.text.strip() for tag in soup.select('.company')]
location = [tag.text.strip() for tag in soup.select('.location')]

# Binary append mode lets the UTF-8 encoded record be written as-is; the
# context manager guarantees the file is closed.
with open('FileDump', 'ab') as f:
    # zip stops at the shortest list, so a posting that lacks a company or
    # location cannot trigger an IndexError.
    for job_title, job_company, job_location in zip(title, company, location):
        tmpstring = job_title + ',' + job_company + ',' + job_location + ",0"
        f.write(tmpstring.encode("utf-8"))

2条回答

网友

1楼 · 编辑于 2024-09-27 23:25:40

您可以使用下面的代码来提取链接

import re
import urllib.request

from bs4 import BeautifulSoup

# Fetch the page; the context manager closes the HTTP response once parsed.
with urllib.request.urlopen("http://arstechnica.com") as html_page:
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html_page, "html.parser")

# Print the href of every <a> whose link starts with "http://".
for link in soup.find_all('a', attrs={'href': re.compile("^http://")}):
    print(link.get('href'))

参考 https://pythonspot.com/extract-links-from-webpage-beautifulsoup/

网友

2楼 · 编辑于 2024-09-27 23:25:40

可以使用以下代码获取子元素：

# Collect the href of the <a> inside every <h2 class="jobtitle"> heading.
title_href = []
for heading in soup.findAll("h2", {"class": "jobtitle"}):
    title_href.append(heading.find("a")["href"])

我试了一下你的代码，并修改了几处，因为我发现完整的职位名称可以从 <a> 标签的 title 属性中获取：

import requests  
import time
from random import randint
from bs4 import BeautifulSoup
import urllib, requests, re, pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException

# Ask the Chrome driver to accept unexpected alerts instead of failing on them.
webdriver.DesiredCapabilities.CHROME["unexpectedAlertBehaviour"] = "accept"


options = Options()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
# Chrome switches need the leading "--"; a leading space is not a valid flag.
options.add_argument("--disable-extensions")

driver = webdriver.Chrome(chrome_options=options,executable_path='chromedriver')

# Job links on the results page are site-relative, so they are prefixed with
# the site root to make them absolute.
domain = "https://www.indeed.co.uk"

try:
    driver.get("https://www.indeed.co.uk/automotive-engineer-jobs-in-uk")
    # page_source is a plain string, so parsing does not need a live browser.
    soup = BeautifulSoup(driver.page_source, "lxml")
finally:
    # quit() ends the whole browser session, and runs even if loading fails.
    driver.quit()

# Look up each job heading's <a> once; it carries both the full title and the link.
job_links = [tag.find("a") for tag in soup.findAll("h2", {"class": "jobtitle"})]

title = [link["title"] for link in job_links]
title_href = [domain + link["href"] for link in job_links]
company = [tag.text.strip() for tag in soup.findAll("span", {"class": "company"})]
location = [tag.text.strip() for tag in soup.findAll("span", {"class": "location"})]

print(title_href)

相关问题更多 >

编程相关推荐

热门问题

热门文章