如何使用Selenium、Beautiful Soup、Pandas从网站的多个页面提取实际数据？

# Script from the question: checks whether the site already shows tomorrow's
# date and, if so, drives Chrome through hours 0-23 and prints the tables.
import requests
r = requests.get('https://seffaflik.epias.com.tr/transparency/piyasalar/gop/arz-talep.xhtml')
from bs4 import BeautifulSoup
# Static snapshot of the page, parsed once from the plain HTTP response.
source = BeautifulSoup(r.content,"lxml")
metin =source.title.get_text()
source.find("input",attrs={"id":"j_idt206:txt1"})
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
# Date string currently filled into the page's date input.
tarih = source.find("input",attrs={"id":"j_idt206:date1_input"})["value"]
import datetime
import time
x = datetime.datetime.now()
today = datetime.date.today()
# print(today)
tomorrow = today + datetime.timedelta(days = 1)
tomorrow = str(tomorrow)
# Reverse the dot-separated parts and join them with "-" so the value can be
# compared with str(date).
words = tarih.split('.')
yeni_tarih = '.'.join(reversed(words))
yeni_tarih =yeni_tarih.replace(".","-")
def tablo_cek():
    # Print the tables found in `source` and return the matched <table> tags.
    # NOTE: `source` is the module-level soup built once from the requests
    # response, so every call parses the same HTML regardless of what the
    # browser is showing.
    tablo = source.find_all("table")  # tables on the page
    dfs = pd.read_html(str(tablo))  # pull the tables into DataFrames
    dfs.append(dfs)  # meant to add the newly pulled table; appends the list to itself
    print(dfs)
    return tablo
if tomorrow == yeni_tarih :
    print(yeni_tarih == tomorrow)
    driver = webdriver.Chrome("C:/Users/tugba.ozkan/AppData/Local/SeleniumBasic/chromedriver.exe")
    driver.get("https://seffaflik.epias.com.tr/transparency/piyasalar/gop/arz-talep.xhtml")
    time.sleep(1)
    driver.find_element_by_xpath("//select/option[@value='96']").click()
    time.sleep(1)
    user = driver.find_element_by_name("j_idt206:txt1")
    nextpage = driver.find_element_by_xpath("//a/span[@class ='ui-icon ui-icon-seek-next']")
    num=0
    while num < 24 :
        user.send_keys(num)  # type the hour into the hour field
        driver.find_element_by_id('j_idt206:goster').click()  # apply the hour
        nextpage = driver.find_element_by_xpath("//a/span[@class ='ui-icon ui-icon-seek-next']")  # next-page button for that hour
        nextpage.click()  # go to the next page
        user = driver.find_element_by_name("j_idt206:txt1")  # look the hour field up again
        time.sleep(1)
        tablo_cek()
        num = num + 1  # advance the hour by one
        user.clear()  # reset the hour field
else:
    print("Güncelleme gelmedi")

[ Fiyat (TL/MWh) Talep (MWh) Arz (MWh) 0 0 25.0101 19.15990 1 1 24.9741 19.16390 2 2 24.9741 19.18510 3 85 24.9741 19.18512 4 86 24.9736 19.20762 5 99 24.9736 19.20763 6 100 24.6197 19.20763 7 101 24.5697 19.20763 8 300 24.5697 19.20768 9 301 24.5697 19.20768 10 363 24.5697 19.20770 11 364 24.5497 19.20770 12 400 24.5497 19.20771 13 401 24.5297 19.20771 14 498 24.5297 19.20773 15 499 24.5297 19.36473 16 500 24.5297 19.36473 17 501 24.4097 19.36473 18 563 24.4097 19.36475 19 564 24.3897 19.36475 20 999 24.3897 19.36487 21 1000 24.3097 19.36487 22 1001 24.1897 19.36487 23 1449 24.1897 19.36499, [...]] [ Fiyat (TL/MWh) Talep (MWh) Arz (MWh) 0 0 25.0101 19.15990 1 1 24.9741 19.16390 2 2 24.9741 19.18510 3 85 24.9741 19.18512 4 86 24.9736 19.20762 5 99 24.9736 19.20763 6 100 24.6197 19.20763 7 101 24.5697 19.20763 8 300 24.5697 19.20768 9 301 24.5697 19.20768 10 363 24.5697 19.20770 11 364 24.5497 19.20770 12 400 24.5497 19.20771 13 401 24.5297 19.20771 14 498 24.5297 19.20773 15 499 24.5297 19.36473 16 500 24.5297 19.36473 17 501 24.4097 19.36473 18 563 24.4097 19.36475 19 564 24.3897 19.36475 20 999 24.3897 19.36487 21 1000 24.3097 19.36487 22 1001 24.1897 19.36487 23 1449 24.1897 19.36499, [...]] [ Fiyat (TL/MWh) Talep (MWh) Arz (MWh) 0 0 25.0101 19.15990 1 1 24.9741 19.16390 2 2 24.9741 19.18510 3 85 24.9741 19.18512 4 86 24.9736 19.20762 5 99 24.9736 19.20763 6 100 24.6197 19.20763 7 101 24.5697 19.20763 8 300 24.5697 19.20768 9 301 24.5697 19.20768 10 363 24.5697 19.20770 11 364 24.5497 19.20770 12 400 24.5497 19.20771 13 401 24.5297 19.20771 14 498 24.5297 19.20773 15 499 24.5297 19.36473 16 500 24.5297 19.36473 17 501 24.4097 19.36473 18 563 24.4097 19.36475 19 564 24.3897 19.36475 20 999 24.3897 19.36487 21 1000 24.3097 19.36487 22 1001 24.1897 19.36487 23 1449 24.1897 19.36499, [...]] [ Fiyat (TL/MWh) Talep (MWh) Arz (MWh) 0 0 25.0101 19.15990 1 1 24.9741 19.16390 2 2 24.9741 19.18510 3 85 24.9741 19.18512 4 86 24.9736 19.20762 5 99 24.9736 19.20763 6 100 
24.6197 19.20763 7 101 24.5697 19.20763 8 300 24.5697 19.20768 9 301 24.5697 19.20768 10 363 24.5697 19.20770 11 364 24.5497 19.20770 12 400 24.5497 19.20771 13 401 24.5297 19.20771 14 498 24.5297 19.20773 15 499 24.5297 19.36473 16 500 24.5297 19.36473 17 501 24.4097 19.36473 18 563 24.4097 19.36475 19 564 24.3897 19.36475 20 999 24.3897 19.36487 21 1000 24.3097 19.36487 22 1001 24.1897 19.36487 23 1449 24.1897 19.36499, [...]]

2条回答

网友

1楼 · 编辑于 2024-09-29 19:34:09

我再提供另一种解决方案：您可以直接通过HTTP请求获取这些数据。该请求还允许您指定每页返回多少行（也可以逐页迭代）；不过，只要把这个上限设置得足够大，一次请求就能拿到全部内容。这里大约有400多行，所以我把上限设为1000行，这样只需要请求第0页：

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Pull the supply/demand table for every hour of one day straight from the
# AJAX endpoint behind the page's data table, and collect it in one DataFrame.
url = 'https://seffaflik.epias.com.tr/transparency/piyasalar/gop/arz-talep.xhtml'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}

# Offset of the first row to return; with a 1000-row page size a single
# request at offset 0 is used per hour.
page = '0'
payload = {
    'javax.faces.partial.ajax': 'true',
    'javax.faces.source': 'j_idt206:dt',
    'javax.faces.partial.execute': 'j_idt206:dt',
    'javax.faces.partial.render': 'j_idt206:dt',
    'j_idt206:dt': 'j_idt206:dt',
    'j_idt206:dt_pagination': 'true',
    'j_idt206:dt_first': page,
    'j_idt206:dt_rows': '1000',
    'j_idt206:dt_skipChildren': 'true',
    'j_idt206:dt_encodeFeature': 'true',
    'j_idt206': 'j_idt206',
    'j_idt206:date1_input': '04.02.2021',
    'j_idt206:txt1': '0',
    'j_idt206:dt_rppDD': '1000',
}

# The column labels do not change per hour, so define them once before the loop.
columns = ['Fiyat (TL/MWh)', 'Talep (MWh)', 'Arz (MWh)', 'hour']

rows = []
hours = list(range(24))
for hour in hours:
    payload['j_idt206:txt1'] = str(hour)
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, headers=headers, params=payload, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    # Drop the CDATA opener so the parser sees the wrapped markup as tags.
    soup = BeautifulSoup(response.text.replace('![CDATA[', ''), 'lxml')

    for row in soup.find_all('tr'):
        cells = [cell.text for cell in row.find_all('td')]
        if not cells:
            # A <tr> without <td> cells carries no data; skipping it avoids
            # a bogus row holding only the hour.
            continue
        rows.append(cells + [str(hour)])

df = pd.DataFrame(rows, columns=columns)

输出：

print(df)
    Fiyat (TL/MWh) Talep (MWh)  Arz (MWh)
0             0,00   25.113,70  17.708,10
1             0,01   25.077,69  17.712,10
2             0,02   25.077,67  17.723,10
3             0,85   25.076,57  17.723,12
4             0,86   25.076,05  17.746,12
..             ...         ...        ...
448         571,01   19.317,10  29.529,60
449         571,80   19.316,86  29.529,60
450         571,90   19.316,83  29.529,70
451         571,99   19.316,80  29.529,70
452         572,00   19.316,80  29.540,70

[453 rows x 3 columns]

要找到这个请求只需要做一点调查工作。打开开发者工具 -> 网络（Network）-> XHR，查看数据是否包含在这些请求中（参见图）。如果在那里找到了，请转到Headers选项卡，在底部可以看到url和参数。

在大多数情况下，您会看到数据是以规整的json格式返回的。这里的情况并非如此：它是以稍有不同的方式、用xml返回的，因此需要做一些额外的工作来提取标签等内容。但这并非不可能。

网友

2楼 · 编辑于 2024-09-29 19:34:09

这是因为您只在 source = BeautifulSoup(r.content,"lxml") 这一行获取了初始html，之后一直在解析这份相同的内容。

您需要为每个页面重新获取html，只需添加一行代码即可。我在添加的位置加了注释：

import datetime
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Fetch the page once over plain HTTP to read the date it currently shows.
r = requests.get('https://seffaflik.epias.com.tr/transparency/piyasalar/gop/arz-talep.xhtml')
source = BeautifulSoup(r.content,"lxml")
metin = source.title.get_text()
source.find("input", attrs={"id": "j_idt206:txt1"})
tarih = source.find("input", attrs={"id": "j_idt206:date1_input"})["value"]

x = datetime.datetime.now()
today = datetime.date.today()
# str() of a date gives the ISO form "YYYY-MM-DD".
tomorrow = str(today + datetime.timedelta(days=1))

# Reverse the dot-separated parts of the page's date and join them with "-"
# so the result can be compared with `tomorrow` (presumably the page uses
# DD.MM.YYYY).
words = tarih.split('.')
yeni_tarih = '-'.join(words[::-1])
# Every DataFrame scraped so far, accumulated across calls to tablo_cek().
scraped_tables = []


def tablo_cek():
    """Scrape the tables from the page the browser is currently showing.

    Reads ``page_source`` from the module-level ``driver``, prints the
    DataFrames parsed from this page and adds them to ``scraped_tables``.

    Returns:
        The BeautifulSoup result set of ``<table>`` tags found on the page.
    """
    source = BeautifulSoup(driver.page_source,"lxml")  #<  get the current html
    tablo = source.find_all("table")  # tables on the page
    dfs = pd.read_html(str(tablo))  # one DataFrame per parsed table
    # Collect this page's frames in the shared list. Appending `dfs` to
    # itself would only nest the list inside itself and keep nothing
    # between calls.
    scraped_tables.extend(dfs)
    print(dfs)
    return tablo
# Scrape only when the date shown on the site equals tomorrow's date.
if tomorrow == yeni_tarih :
    print(yeni_tarih == tomorrow)
    # NOTE: this uses the Selenium 3 style API (driver path passed
    # positionally, find_element_by_*), as in the original script.
    driver = webdriver.Chrome("C:/chromedriver_win32/chromedriver.exe")
    try:
        driver.get("https://seffaflik.epias.com.tr/transparency/piyasalar/gop/arz-talep.xhtml")
        time.sleep(1)
        # Pick the <option> whose value is 96 (presumably rows per page).
        driver.find_element_by_xpath("//select/option[@value='96']").click()
        time.sleep(1)
        user = driver.find_element_by_name("j_idt206:txt1")

        tablo_cek() #<  need to get that data before moving to next page
        for num in range(24):
            user.send_keys(str(num))  # type the hour into the hour field
            driver.find_element_by_id('j_idt206:goster').click()  # apply the hour
            # Look the next-page button up again after the click above.
            nextpage = driver.find_element_by_xpath("//a/span[@class ='ui-icon ui-icon-seek-next']")
            nextpage.click()  # go to the next page
            user = driver.find_element_by_name("j_idt206:txt1")  # look the hour field up again
            time.sleep(1)
            tablo_cek()
            user.clear()  # reset the hour field for the next iteration
    finally:
        # Always close the browser and stop chromedriver, even when a
        # lookup or click raises.
        driver.quit()
else:
    print("Güncelleme gelmedi")

输出：

True
[    Fiyat (TL/MWh)  Talep (MWh)  Arz (MWh)
0                0     25.11370   17.70810
1                1     25.07769   17.71210
2                2     25.07767   17.72310
3               85     25.07657   17.72312
4               86     25.07605   17.74612
..             ...          ...        ...
91           10000     23.97000   17.97907
92           10001     23.91500   17.97907
93           10014     23.91500   17.97907
94           10015     23.91500   17.97907
95           10100     23.91499   17.97909

[96 rows x 3 columns], [...]]
[    Fiyat (TL/MWh)  Talep (MWh)  Arz (MWh)
0            10101     23.91499   18.04009
1            10440     23.91497   18.04015
2            10999     23.91493   18.04025
3            11000     23.89993   18.04025
4            11733     23.89988   18.04039
..             ...          ...        ...
91           23999     23.55087   19.40180
92           24000     23.55087   19.40200
93           24001     23.53867   19.40200
94           24221     23.53863   19.40200
95           24222     23.53863   19.40200

[96 rows x 3 columns], [...]]
[    Fiyat (TL/MWh)  Talep (MWh)  Arz (MWh)
0            24360     21.33871    19.8112
1            24499     21.33868    19.8112
2            24500     21.33868    19.8112
3            24574     21.33867    19.8112
4            24575     21.33867    19.8112
..             ...          ...        ...
91           29864     21.18720    20.3708
92           29899     21.18720    20.3708
93           29900     21.18720    20.3808
94           29999     21.18720    20.3808
95           30000     21.18530    20.3811

[96 rows x 3 columns], [...]]

相关问题更多 >

编程相关推荐

热门问题

热门文章