如何从div内部提取文本

import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.crawler import CrawlerProcess
import googletrans
# from googletrans import Translator
from translate import Translator


class Myspider(SitemapSpider):
    """Sitemap spider that follows arabam.com listing links and prints one detail node."""

    name = 'spidername'
    sitemap_urls = ['https://www.arabam.com/sitemap/otomobil_1.xml']
    sitemap_rules = [
        ('/otomobil/', 'parse'),
        # ('/category/', 'parse_category'),
    ]

    def parse(self, response):
        """Yield a request for each listing link whose slug holds a number above 2001.

        The fourth "/"-separated piece of every extracted href is split on "-";
        the link is followed when any all-digit token in it is greater than 2001
        (presumably the model year -- confirm against real listing URLs).
        """
        hrefs = response.xpath(
            "/html/body/div[3]/div[6]/div[4]/div/div[2]/table/tbody/tr/td[4]/div/a/@href"
        ).extract()
        for href in hrefs:
            segments = href.split("/")
            if len(segments) < 4:
                # Too few path segments to contain the slug that is inspected below.
                continue
            tokens = segments[3].split("-")
            # Request each link at most once, even if several tokens qualify.
            if any(token.isdigit() and int(token) > 2001 for token in tokens):
                # urljoin resolves the href against the page URL, so a leading "/"
                # in the href no longer produces "https://www.arabam.com//...".
                yield scrapy.Request(
                    response.urljoin(href),
                    callback=self.parse_dir_contents,
                )

    def parse_dir_contents(self, response):
        """Print the response and the selector matched on a listing detail page."""
        ##some other stuff im scraping
        overview1 = response.xpath(
            "/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div/div/div[2]/dl[1]/dd/span"
        )
        print(response)
        # NOTE: xpath() returns a SelectorList, so this prints selector reprs,
        # not the text contained in the matched node.
        print("s" + str(overview1))


process = CrawlerProcess({
})
process.crawl(Myspider)
process.start()  # the script will block here until the crawling is finished

[......or Kaputu: ', ' Orijinal ', ' ', 'Sol Ön Çamurluk: ', ' Boyanmış ', ' ', 'Ön Tampon: ', ' Orijinal ', ' ', 'Arka Tampon: ', ' Orijinal ', ' ', 'Belirtilmemiş', 'Orijinal', 'Boyalı', 'Değişmiş', ' ', ' ', ' Tramer tutarı yok ', ' ', ' ', ' ', 'ARAÇ BİLGİLERİ', ' ', ' ', 'DONANIM', '\xa0', ' ', '\xa0', ' ', '\xa0', ' ', '\xa0', ' ', '\xa0', ' ', '\xa0', ' ', ' ', 'KREDİ', ' ', ' ', 'SPONSORLU BAĞLANTILAR', " googletag.cmd.push(function () { googletag.display('div-gpt-ad-1547030262883-0'); }); ", " googletag.cmd.push(function () { googletag.display('div-gpt-ad-1547030358839-0'); }); "]

# Find the container, scroll it into view, look it up again and read its text.
container_xpath = "/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div/div"

element = d.find_element_by_xpath(container_xpath)
d.execute_script("arguments[0].scrollIntoView();", element)

# Same lookup repeated after the scroll, as in the original snippet.
element = d.find_element_by_xpath(container_xpath)
print(element)
overview1 = element.text

# Any locator strategy can be used to find the element; an absolute XPath is used here.
element = d.find_element_by_xpath(
    '/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div/div'
)

# Dict holding the element's X and Y coordinates.
coordinates = element.location_once_scrolled_into_view

# Build the scroll command from those coordinates, then run it in the page.
scroll_js = 'window.scrollTo({}, {});'.format(coordinates['x'], coordinates['y'])
d.execute_script(scroll_js)

1条回答

网友

1楼 · 发布于 2024-10-04 01:28:25

我使用selenium编写了以下代码来测试XPath（我以前没有使用scrapy）：

from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://www.arabam.com/ilan/sahibinden-satilik-peugeot-407-2-0-hdi-comfort/sahibinden-peugeot-407-1-6-hdi-comfort-2008-model/12776039'


def section_lines(driver, xpath):
    """Return the text lines of every element matching *xpath*, in document order."""
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
    # removed from recent Selenium releases; this form works on old ones too.
    return [
        line
        for section in driver.find_elements(By.XPATH, xpath)
        for line in section.text.split("\n")
    ]


driver = webdriver.Chrome()
try:
    driver.get(url)
    driver.execute_script("window.scrollTo(0, 1080);")

    # Fixed pause after scrolling -- presumably to let the sections render.
    sleep(1)

    overview_info = section_lines(driver, "//div[@class='col-md-6 genel-bakis']")
    engine_info = section_lines(driver, "//div[@class='col-md-6 motor-ve-performans']")
finally:
    # Always close the browser, even when loading or scraping fails.
    driver.quit()

print("VEHICLE INFORMATION")
for info in (overview_info, engine_info):
    # Lines are printed in consecutive pairs as "first: second"; a trailing
    # unpaired line is dropped.
    for label, value in zip(info[::2], info[1::2]):
        print(label + ": " + value)

这将提供以下输出：

输出是您在图片中突出显示的内容，因此我建议您使用以下路径：

# XPath selecting every text node inside the div whose class is 'col-md-6 genel-bakis'
"//div[@class='col-md-6 genel-bakis']//text()"
# XPath selecting every text node inside the div whose class is 'col-md-6 motor-ve-performans'
"//div[@class='col-md-6 motor-ve-performans']//text()"

相关问题更多 >

编程相关推荐

热门问题

热门文章