Python urllib does not return the HTML code I see with Inspect Element

Posted 2024-10-16 20:48:44


I am trying to scrape the results from this link:

url = "http://topsy.com/trackback?url=http%3A%2F%2Fmashable.com%2F2014%2F08%2F27%2Faustralia-retail-evolution-lab-aopen-shopping%2F"

When I inspect it with Firebug I can see the HTML code and I know what I need to do to extract the tweets. The problem is that when I fetch the page with urllib I do not get the same HTML; I only get the bare tags. What am I missing?

Sample code below:

from urllib2 import urlopen
from bs4 import BeautifulSoup

def get_tweets(section_url):
    html = urlopen(section_url).read()
    soup = BeautifulSoup(html, "lxml")
    tweets = soup.find("div", "results")
    # collect the link of every tweet block inside the results div
    category_links = [tweet.a["href"] for tweet in tweets.findAll("div", "result-tweet")]
    return category_links

url =  "http://topsy.com/trackback?url=http%3A%2F%2Fmashable.com%2F2014%2F08%2F27%2Faustralia-retail-evolution-lab-aopen-shopping%2F"
cat_links = get_tweets(url)

Thanks, YB


Tags: code, com, http, url, get, html, lab, section
1 Answer
User
#1 · Posted 2024-10-16 20:48:44

The problem is that the contents of the results div are filled in by additional HTTP calls and javascript code executed on the browser side. urllib only "sees" the initial HTML page, which does not contain the data you need.
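
A quick way to see this for yourself (a minimal Python 2 sketch, mirroring the urllib2 code used below) is to fetch the page with urllib and check that the tweet markup is not in the raw response:

import urllib2

url = "http://topsy.com/trackback?url=http%3A%2F%2Fmashable.com%2F2014%2F08%2F27%2Faustralia-retail-evolution-lab-aopen-shopping%2F"
html = urllib2.urlopen(url).read()

# the tweet markup is rendered later by browser-side javascript,
# so it should be absent from the raw response urllib receives
print 'result-tweet' in html  # expected: False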

One option is to follow @Himal's suggestion and simulate the underlying request to trackbacks.js that fetches the data with the tweets. The result is in JSON format, which you can load with the json module that comes with the standard library:

import json
import urllib2

# request the trackbacks.js endpoint directly; it responds with JSON rather than HTML
url = 'http://otter.topsy.com/trackbacks.js?url=http%3A%2F%2Fmashable.com%2F2014%2F08%2F27%2Faustralia-retail-evolution-lab-aopen-shopping%2F&infonly=0&call_timestamp=1411090809443&apikey=09C43A9B270A470B8EB8F2946A9369F3'
data = json.load(urllib2.urlopen(url))

# every item in the list is a single tweet with its metadata
for tweet in data['response']['list']:
    print tweet['permalink_url']

This prints the permalink URL of each tweet found in the JSON response.

This is the "down to the metal" option.
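
The snippet above uses Python 2's urllib2; on Python 3, a roughly equivalent sketch (same endpoint and same JSON structure assumed) would use urllib.request instead:

import json
from urllib.request import urlopen

url = ('http://otter.topsy.com/trackbacks.js?url=http%3A%2F%2Fmashable.com'
       '%2F2014%2F08%2F27%2Faustralia-retail-evolution-lab-aopen-shopping%2F'
       '&infonly=0&call_timestamp=1411090809443&apikey=09C43A9B270A470B8EB8F2946A9369F3')

# decode the raw bytes before parsing so this works on any Python 3 version
data = json.loads(urlopen(url).read().decode('utf-8'))

for tweet in data['response']['list']:
    print(tweet['permalink_url'])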


Otherwise, you can take a "high-level" approach and not worry about what happens under the hood: let a real browser load the page and interact with it through selenium WebDriver:

from selenium import webdriver

driver = webdriver.Chrome()  # can be Firefox(), PhantomJS() and more
driver.get("http://topsy.com/trackback?url=http%3A%2F%2Fmashable.com%2F2014%2F08%2F27%2Faustralia-retail-evolution-lab-aopen-shopping%2F")

for tweet in driver.find_elements_by_class_name('result-tweet'):
    print tweet.find_element_by_xpath('.//div[@class="media-body"]//ul[@class="inline"]/li//a').get_attribute('href')

driver.close()

This prints the same permalink URLs as the first approach.
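
Because the tweets are injected by javascript, the result-tweet elements may not be in the DOM yet right after get(); if you run into that, here is a hedged sketch that adds the same explicit wait used in the paginated version below:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("http://topsy.com/trackback?url=http%3A%2F%2Fmashable.com%2F2014%2F08%2F27%2Faustralia-retail-evolution-lab-aopen-shopping%2F")

# block for up to 5 seconds until the results container is present in the DOM
WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.ID, "results"))
)

for tweet in driver.find_elements_by_class_name('result-tweet'):
    print tweet.find_element_by_xpath('.//div[@class="media-body"]//ul[@class="inline"]/li//a').get_attribute('href')

driver.close()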

Here is how to scale the second option up to get all of the tweets across pagination:

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

BASE_URL = 'http://topsy.com/trackback?url=http%3A%2F%2Fmashable.com%2F2014%2F08%2F27%2Faustralia-retail-evolution-lab-aopen-shopping%2F&offset={offset}'

driver = webdriver.Chrome()

# get tweets count
driver.get('http://topsy.com/trackback?url=http%3A%2F%2Fmashable.com%2F2014%2F08%2F27%2Faustralia-retail-evolution-lab-aopen-shopping%2F')
tweets_count = int(driver.find_element_by_xpath('//li[@data-name="all"]/a/span').text)

for x in xrange(0, tweets_count, 10):
    driver.get(BASE_URL.format(offset=x))

    # page header appears in case no more tweets found
    try:
        driver.find_element_by_xpath('//div[@class="page-header"]/h3')
    except NoSuchElementException:
        pass
    else:
        break

    # wait for results
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.ID, "results"))
    )

    # get tweets
    for tweet in driver.find_elements_by_class_name('result-tweet'):
        print tweet.find_element_by_xpath('.//div[@class="media-body"]//ul[@class="inline"]/li//a').get_attribute('href')

driver.close()
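
If, as in the original get_tweets(), you would rather collect the links into a list than print them, here is a minimal sketch (hypothetical helper name get_tweet_links, same markup assumed) that wraps the single-page Selenium approach in a function:

from selenium import webdriver

def get_tweet_links(section_url):
    # load the page in a real browser so the javascript-rendered tweets exist
    driver = webdriver.Chrome()
    try:
        driver.get(section_url)
        return [tweet.find_element_by_xpath(
                    './/div[@class="media-body"]//ul[@class="inline"]/li//a'
                ).get_attribute('href')
                for tweet in driver.find_elements_by_class_name('result-tweet')]
    finally:
        driver.close()

url = "http://topsy.com/trackback?url=http%3A%2F%2Fmashable.com%2F2014%2F08%2F27%2Faustralia-retail-evolution-lab-aopen-shopping%2F"
cat_links = get_tweet_links(url)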
