我想在下面的url页面中获取整个pdf链接列表: 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
问题是网页内部使用JavaScript来动态显示链接,因此静态抓取无法直接获得PDF链接。有什么思路吗?
事实上,我尝试过用在Google上找到的各种方法来解析,但都失败了。你能提出解决这个问题的正确方法吗?
下面是我尝试但失败的代码:
def crawle_kiwoom_mletter():
if not os.path.exists(dir_output_mletter):
os.makedirs(dir_output_mletter)
#urlformat = 'https://www.kiwoom.com/nkw.template.do?m=m0601010101&s_menu=ML&s_sqno=4784'
urlformat = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
index = -1
while True:
index = index + 1
url = urlformat.format(index)
print('processing {}...'.format(url))
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, 'lxml')
#print_anchors(soup)
print(soup.prettify())
'''
if browse_mbriefing_linkpages(soup) == False:
break
'''
break
'''
https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/
'''
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
def crawl_kiwoom_mletter2():
url = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
url='http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000&source=&xdr='
#This does the magic.Loads everything
r = Render(url)
#result is a QString.
result = r.frame.toHtml()
print(result)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
'''
http://stackoverflow.com/questions/28289699/python-web-scraping-for-javascript-generated-content
'''
def crawl_kiwoom_mletter3():
browser = webdriver.Firefox()
url = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
browser.get(url)
res = browser.page_source
print(res)
driver.close()
使用python2和BeautifulSoup4尝试以下代码:
它从PDF列表页获取所有链接,并用getParams函数解析它们。
使用urllib2这个Python模块,把解析出的参数连同附加的basePath参数一起发送到下载URL。
我建议您在每个请求之间添加一个延迟,以防止给服务器造成过大负载。
更新:
它现在会遍历第1到58页(实际页数),并解析出所有链接。
相关问题 更多 >
编程相关推荐