如何使用Selenium、Beautiful Soup、Pandas从网站的多个页面提取实际数据?

2024-09-29 19:34:09 发布

您现在位置:Python中文网/ 问答频道 /正文


The website url : "https://seffaflik.epias.com.tr/transparency/piyasalar/gop/arz-talep.xhtml"

我想提取每个小时的全部数据。但我遇到的问题是:即使页面已经切换,抓取到的仍然是同一张表。

# Download the page once with requests and parse it; `source` holds only this
# initial HTML and is not refreshed anywhere in this snippet.
import requests
r = requests.get('https://seffaflik.epias.com.tr/transparency/piyasalar/gop/arz-talep.xhtml')
from bs4 import BeautifulSoup
source = BeautifulSoup(r.content,"lxml")
# Text of the page <title>.
metin =source.title.get_text()
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd 
# Current value of the date input on the page
# (presumably formatted DD.MM.YYYY -- TODO confirm against the live page).
tarih = source.find("input",attrs={"id":"j_idt206:date1_input"})["value"] 
import datetime
import time
x = datetime.datetime.now()
today = datetime.date.today()
# print(today)
tomorrow = today + datetime.timedelta(days = 1) 
# str() of a date yields the ISO form YYYY-MM-DD.
tomorrow = str(tomorrow)
# Reverse the "."-separated parts of the page date and join them with "-" so
# the result can be compared with `tomorrow`.
words = tarih.split('.')  
yeni_tarih = '.'.join(reversed(words))
yeni_tarih =yeni_tarih.replace(".","-")
def tablo_cek():
    """Parse the <table> elements of the module-level ``source`` soup.

    NOTE(review): ``source`` is the soup built from the initial ``requests``
    response, so every call reads the same HTML regardless of what the
    Selenium browser is showing.

    Returns the result of ``source.find_all("table")`` (the raw tags), not
    the DataFrames parsed from them.
    """
    tablo = source.find_all("table")# tables on the page
    dfs = pd.read_html(str(tablo))# load the tables into DataFrames
    # NOTE(review): this appends the list to itself (shown as "[...]" when
    # printed); `dfs` is local and is not returned.
    dfs.append(dfs)# add the newly fetched table to the tables
    return tablo 
# Run the browser automation only when the date shown on the page equals
# tomorrow's date.
# NOTE(review): in this visible snippet `num` is never assigned before the
# loop, the driver never navigates to the URL, and tablo_cek() is never called.
if tomorrow == yeni_tarih :
    print(yeni_tarih == tomorrow)
    driver = webdriver.Chrome("C:/Users/tugba.ozkan/AppData/Local/SeleniumBasic/chromedriver.exe")
    user = driver.find_element_by_name("j_idt206:txt1")
    nextpage = driver.find_element_by_xpath("//a/span[@class ='ui-icon ui-icon-seek-next']")
    while num < 24 :
        user.send_keys(num) # type the hour into the hour field
        driver.find_element_by_id('j_idt206:goster').click() # apply the hour
        nextpage = driver.find_element_by_xpath("//a/span[@class ='ui-icon ui-icon-seek-next']")# next-page button for that hour
        nextpage.click() # go to the next page
        user = driver.find_element_by_name("j_idt206:txt1") # locate the hour field again
        num = num + 1 # increment the hour by one
        user.clear() # clear the hour field
    print("Güncelleme gelmedi")


# NOTE(review): this runs outside the `if`, so `driver` is undefined whenever
# the condition above is false.
nextpage = driver.find_element_by_xpath("//a/span[@class ='ui-icon ui-icon-seek-next']")# next-page button for that hour

当 Python 点击按钮跳转到下一页时,下一页会显示出来,此时应该抓取下一页的表格,但实际上并没有生效。在输出中,我看到追加进来的各个表格的值完全相同。我的输出如下:

[    Fiyat (TL/MWh)  Talep (MWh)  Arz (MWh)
0                0      25.0101   19.15990
1                1      24.9741   19.16390
2                2      24.9741   19.18510
3               85      24.9741   19.18512
4               86      24.9736   19.20762
5               99      24.9736   19.20763
6              100      24.6197   19.20763
7              101      24.5697   19.20763
8              300      24.5697   19.20768
9              301      24.5697   19.20768
10             363      24.5697   19.20770
11             364      24.5497   19.20770
12             400      24.5497   19.20771
13             401      24.5297   19.20771
14             498      24.5297   19.20773
15             499      24.5297   19.36473
16             500      24.5297   19.36473
17             501      24.4097   19.36473
18             563      24.4097   19.36475
19             564      24.3897   19.36475
20             999      24.3897   19.36487
21            1000      24.3097   19.36487
22            1001      24.1897   19.36487
23            1449      24.1897   19.36499, [...]]
[    Fiyat (TL/MWh)  Talep (MWh)  Arz (MWh)
0                0      25.0101   19.15990
1                1      24.9741   19.16390
2                2      24.9741   19.18510
3               85      24.9741   19.18512
4               86      24.9736   19.20762
5               99      24.9736   19.20763
6              100      24.6197   19.20763
7              101      24.5697   19.20763
8              300      24.5697   19.20768
9              301      24.5697   19.20768
10             363      24.5697   19.20770
11             364      24.5497   19.20770
12             400      24.5497   19.20771
13             401      24.5297   19.20771
14             498      24.5297   19.20773
15             499      24.5297   19.36473
16             500      24.5297   19.36473
17             501      24.4097   19.36473
18             563      24.4097   19.36475
19             564      24.3897   19.36475
20             999      24.3897   19.36487
21            1000      24.3097   19.36487
22            1001      24.1897   19.36487
23            1449      24.1897   19.36499, [...]]
[    Fiyat (TL/MWh)  Talep (MWh)  Arz (MWh)
0                0      25.0101   19.15990
1                1      24.9741   19.16390
2                2      24.9741   19.18510
3               85      24.9741   19.18512
4               86      24.9736   19.20762
5               99      24.9736   19.20763
6              100      24.6197   19.20763
7              101      24.5697   19.20763
8              300      24.5697   19.20768
9              301      24.5697   19.20768
10             363      24.5697   19.20770
11             364      24.5497   19.20770
12             400      24.5497   19.20771
13             401      24.5297   19.20771
14             498      24.5297   19.20773
15             499      24.5297   19.36473
16             500      24.5297   19.36473
17             501      24.4097   19.36473
18             563      24.4097   19.36475
19             564      24.3897   19.36475
20             999      24.3897   19.36487
21            1000      24.3097   19.36487
22            1001      24.1897   19.36487
23            1449      24.1897   19.36499, [...]]
[    Fiyat (TL/MWh)  Talep (MWh)  Arz (MWh)
0                0      25.0101   19.15990
1                1      24.9741   19.16390
2                2      24.9741   19.18510
3               85      24.9741   19.18512
4               86      24.9736   19.20762
5               99      24.9736   19.20763
6              100      24.6197   19.20763
7              101      24.5697   19.20763
8              300      24.5697   19.20768
9              301      24.5697   19.20768
10             363      24.5697   19.20770
11             364      24.5497   19.20770
12             400      24.5497   19.20771
13             401      24.5297   19.20771
14             498      24.5297   19.20773
15             499      24.5297   19.36473
16             500      24.5297   19.36473
17             501      24.4097   19.36473
18             563      24.4097   19.36475
19             564      24.3897   19.36475
20             999      24.3897   19.36487
21            1000      24.3097   19.36487
22            1001      24.1897   19.36487
23            1449      24.1897   19.36499, [...]]

Tags: import ui source datetime by driver element find


import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://seffaflik.epias.com.tr/transparency/piyasalar/gop/arz-talep.xhtml'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}

page = '0'
# Parameters of the JSF partial-AJAX request for the data table.
# Presumably `dt_first` is the first row offset and `dt_rows` the page size;
# 1000 rows is meant to return a whole hour in one response -- TODO confirm.
payload = {
    'javax.faces.partial.ajax': 'true',
    'javax.faces.source': 'j_idt206:dt',
    'javax.faces.partial.execute': 'j_idt206:dt',
    'javax.faces.partial.render': 'j_idt206:dt',
    'j_idt206:dt': 'j_idt206:dt',
    'j_idt206:dt_pagination': 'true',
    'j_idt206:dt_first': page,
    'j_idt206:dt_rows': '1000',
    'j_idt206:dt_skipChildren': 'true',
    'j_idt206:dt_encodeFeature': 'true',
    'j_idt206': 'j_idt206',
    'j_idt206:date1_input': '04.02.2021',
    'j_idt206:txt1': '0',
    'j_idt206:dt_rppDD': '1000',
}

# Column names do not depend on the hour, so they are defined once.
columns = ['Fiyat (TL/MWh)', 'Talep (MWh)', 'Arz (MWh)', 'hour']

rows = []
hours = list(range(0,24))
for hour in hours:
    # Ask for this hour's table; without this every request returns hour 0.
    payload['j_idt206:txt1'] = str(hour)
    response = requests.get(url, headers=headers, params=payload)
    # Strip the CDATA opener so the parser sees the table markup inside the
    # partial response.
    soup = BeautifulSoup(response.text.replace('![CDATA[',''), 'lxml')
    trs = soup.find_all('tr')
    for row in trs:
        cells = row.find_all('td')
        if not cells:
            # Rows without <td> cells carry no data.
            continue
        data = [x.text for x in cells] + [str(hour)]
        # Keep the parsed row; otherwise the DataFrame below stays empty.
        rows.append(data)

df = pd.DataFrame(rows, columns=columns)


    Fiyat (TL/MWh) Talep (MWh)  Arz (MWh)
0             0,00   25.113,70  17.708,10
1             0,01   25.077,69  17.712,10
2             0,02   25.077,67  17.723,10
3             0,85   25.076,57  17.723,12
4             0,86   25.076,05  17.746,12
..             ...         ...        ...
448         571,01   19.317,10  29.529,60
449         571,80   19.316,86  29.529,60
450         571,90   19.316,83  29.529,70
451         571,99   19.316,80  29.529,70
452         572,00   19.316,80  29.540,70

[453 rows x 3 columns]



enter image description here

这是因为您在 source = BeautifulSoup(r.content,"lxml") 这一行只获取了初始 HTML,之后每次解析的都是这份内容。需要在翻页之前,用 driver.page_source 重新获取浏览器当前页面的 HTML:


# Download the page once with requests and parse it; this soup is used below
# only to read the page title and the date shown on the page.
import requests
r = requests.get('https://seffaflik.epias.com.tr/transparency/piyasalar/gop/arz-talep.xhtml')
from bs4 import BeautifulSoup
source = BeautifulSoup(r.content,"lxml")
# Text of the page <title>.
metin =source.title.get_text()
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd 
# Current value of the date input on the page
# (presumably formatted DD.MM.YYYY -- TODO confirm against the live page).
tarih = source.find("input",attrs={"id":"j_idt206:date1_input"})["value"] 
import datetime
import time
x = datetime.datetime.now()
today = datetime.date.today()
# print(today)
tomorrow = today + datetime.timedelta(days = 1) 
# str() of a date yields the ISO form YYYY-MM-DD.
tomorrow = str(tomorrow)
# Reverse the "."-separated parts of the page date and join them with "-" so
# the result can be compared with `tomorrow`.
words = tarih.split('.')  
yeni_tarih = '.'.join(reversed(words))
yeni_tarih =yeni_tarih.replace(".","-")
def tablo_cek():
    """Parse the <table> elements of the page currently loaded in ``driver``.

    Re-reads ``driver.page_source`` on every call, so the tables reflect the
    browser's current HTML rather than the initial ``requests`` download.

    Returns the result of ``source.find_all("table")`` (the raw tags), not
    the DataFrames parsed from them.
    """
    source = BeautifulSoup(driver.page_source,"lxml")  #<  get the current html
    tablo = source.find_all("table")# tables on the page
    dfs = pd.read_html(str(tablo))# load the tables into DataFrames
    # NOTE(review): this appends the list to itself (shown as "[...]" when
    # printed); `dfs` is local and is not returned.
    dfs.append(dfs)# add the newly fetched table to the tables
    return tablo 
# Run the browser automation only when the date shown on the page equals
# tomorrow's date.
if tomorrow == yeni_tarih :
    print(yeni_tarih == tomorrow)
    driver = webdriver.Chrome("C:/chromedriver_win32/chromedriver.exe")
    # Load the page in the browser first; otherwise there is no document in
    # which the elements below could be located.
    driver.get('https://seffaflik.epias.com.tr/transparency/piyasalar/gop/arz-talep.xhtml')
    user = driver.find_element_by_name("j_idt206:txt1")
    num = 0 # hour counter; it was never initialised before (NameError)
    while num < 24 :
        user.clear() # empty the hour field so the new value is not appended
        user.send_keys(str(num)) # type the hour into the hour field
        driver.find_element_by_id('j_idt206:goster').click() # apply the hour
        tablo_cek() #<  need to get that data before moving to next page
        nextpage = driver.find_element_by_xpath("//a/span[@class ='ui-icon ui-icon-seek-next']")# next-page button for that hour
        nextpage.click() # go to the next page
        # The page is re-rendered after the clicks, so locate the field again.
        user = driver.find_element_by_name("j_idt206:txt1")
        num = num + 1 # increment the hour by one
    # NOTE(review): the message means "the update has not arrived"; it
    # presumably belongs in an `else` branch of this `if` -- confirm.
    print("Güncelleme gelmedi")


[    Fiyat (TL/MWh)  Talep (MWh)  Arz (MWh)
0                0     25.11370   17.70810
1                1     25.07769   17.71210
2                2     25.07767   17.72310
3               85     25.07657   17.72312
4               86     25.07605   17.74612
..             ...          ...        ...
91           10000     23.97000   17.97907
92           10001     23.91500   17.97907
93           10014     23.91500   17.97907
94           10015     23.91500   17.97907
95           10100     23.91499   17.97909

[96 rows x 3 columns], [...]]
[    Fiyat (TL/MWh)  Talep (MWh)  Arz (MWh)
0            10101     23.91499   18.04009
1            10440     23.91497   18.04015
2            10999     23.91493   18.04025
3            11000     23.89993   18.04025
4            11733     23.89988   18.04039
..             ...          ...        ...
91           23999     23.55087   19.40180
92           24000     23.55087   19.40200
93           24001     23.53867   19.40200
94           24221     23.53863   19.40200
95           24222     23.53863   19.40200

[96 rows x 3 columns], [...]]
[    Fiyat (TL/MWh)  Talep (MWh)  Arz (MWh)
0            24360     21.33871    19.8112
1            24499     21.33868    19.8112
2            24500     21.33868    19.8112
3            24574     21.33867    19.8112
4            24575     21.33867    19.8112
..             ...          ...        ...
91           29864     21.18720    20.3708
92           29899     21.18720    20.3708
93           29900     21.18720    20.3808
94           29999     21.18720    20.3808
95           30000     21.18530    20.3811

[96 rows x 3 columns], [...]]

相关问题 更多 >
