Creating a larger DataFrame from smaller DataFrames

I'm having a problem with the structure of the data I'm pulling from the PGA Tour website. I can't get the data into DataFrames and merge them in a way that lets me analyze them later. The scraped tables never come out the same size, and every time I run the code I hit an error I can't reconcile. Any ideas?

I have tried both merging and concatenating the DataFrames, but nothing seems to work. Any help is appreciated.

What I would really like is for my DataFrame to hold each player's individual statistics gathered from the different stat pages, with all of a player's stats for a given year on the same row, keyed by year and player name (a small example of what I mean follows).
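To make the goal concrete, here is a minimal sketch with made-up stat names and numbers showing the shape I'm after: one small DataFrame per statistic, all outer-merged into one row per year and player.

import functools
import pandas as pd

# Two toy per-stat frames; the column names and values are invented and
# only illustrate the shape of the real scraped tables.
driving = pd.DataFrame({'year': ['2017', '2017'],
                        'PLAYER NAME': ['Player A', 'Player B'],
                        'DRIVING AVG.': ['299.1', '314.1']})
putting = pd.DataFrame({'year': ['2017', '2017'],
                        'PLAYER NAME': ['Player A', 'Player B'],
                        'PUTTS PER ROUND': ['28.5', '29.0']})

# Outer-merge every frame on the shared keys -> one row per year/player.
combined = functools.reduce(
    lambda left, right: pd.merge(left, right, on=['year', 'PLAYER NAME'], how='outer'),
    [driving, putting])
print(combined)

The functools.reduce call at the end of my code below is meant to do exactly this, but the frames it receives never have matching columns.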

from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import functools

base = 'http://www.pgatour.com/'
inn = 'stats/stat'
end = '.html'
years = ['2017','2016']


alpha = []
# all pages with links to stat tables
urls = ['http://www.pgatour.com/stats.html',
        'http://www.pgatour.com/stats/categories.ROTT_INQ.html',
        'http://www.pgatour.com/stats/categories.RAPP_INQ.html',
        'http://www.pgatour.com/stats/categories.RARG_INQ.html',
        'http://www.pgatour.com/stats/categories.RPUT_INQ.html',
        'http://www.pgatour.com/stats/categories.RSCR_INQ.html',
        'http://www.pgatour.com/stats/categories.RSTR_INQ.html',
        'http://www.pgatour.com/stats/categories.RMNY_INQ.html',
        'http://www.pgatour.com/stats/categories.RPTS_INQ.html']
for i in urls:
    data = urlopen(i)
    soup = BeautifulSoup(data, "html.parser")
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            alpha.append(base + link['href'][17:]) #may need adjusting
# keep only the links that point at individual stat pages
beta = []
for i in alpha:
    if inn in i:
        beta.append(i)

# de-duplicate while preserving order
gamma = []
for i in beta:
    if i not in gamma:
        gamma.append(i)

# pair each stat URL with a csv-friendly file name taken from the page headings
jan = []
for i in gamma:
    try:
        data = urlopen(i)
        soup = BeautifulSoup(data, "html.parser")
        for table in soup.find_all('section', {'class': 'module-statistics-off-the-tee-details'}):
            for j in table.find_all('h3'):
                y = j.get_text()
                for ch in ' -:><)(=+':   # strip characters that break file names
                    y = y.replace(ch, '')
                jan.append([i, y + '.csv'])
                print([i, y + '.csv'])
    except Exception as e:
        print(e)

# my problem starts here
# using a short urls list so that I can find the error faster
urls = [['http://www.pgatour.com/stats/stat.02356.html', 'd'],
        ['http://www.pgatour.com/stats/stat.02568.html', 'f'],
        ['http://www.pgatour.com/stats/stat.111.html', 'r']]
frames = []        # one DataFrame per stat page and year, to be merged at the end
# jan = [['http://www.pgatour.com/stats/stat.02356.html', 'Last15EventsScoring.csv']]
# make a list with url and title name and cleaned csv name
# write to csv
row_sp = []
rows_sp = []
title1 = []
title = []
for i in urls:
    try:
        for y in years:
            data = urlopen(i[0][:-4] + y + end)    # year-specific page for the rows
            soup = BeautifulSoup(data, "html.parser")
            data1 = urlopen(i[0])                  # current page, used for the header row
            soup1 = BeautifulSoup(data1, "html.parser")
            for table in soup1.find_all('table', {'id': 'statsTable'}):
                title.append('year')
                for k in table.find_all('tr'):
                    for n in k.find_all('th'):
                        title1.append(n.get_text())
                        for l in title1:
                            if l not in title:
                                title.append(l)
                rows_sp.append(title)
            for table in soup.find_all('table', {'id': 'statsTable'}):
                for h in table.find_all('tr'):
                    row_sp = [y]
                    for j in h.find_all('td'):
                        row_sp.append(j.get_text().replace(" ", "").replace("\n", "").replace("\xa0", " "))
                    rows_sp.append(row_sp)
            df = pd.DataFrame(rows_sp)
            df.columns = title     # the error is raised here when len(title) != df.shape[1]
            df.drop(df.index[1], inplace=True)
            print(df)
            frames.append(df)
    except Exception as e:
        print(e)
df_merge = functools.reduce(lambda left, right: pd.merge(left, right, on=['year', 'PLAYER NAME'], how='outer'), frames)
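To show what I mean by the sizes never lining up: a quick way to see the mismatch is to print each collected frame's shape and columns before the final merge. This is just a diagnostic sketch using the frames list built above:

for n, frame in enumerate(frames):
    # the outer merge can only work if every frame has 'year' and 'PLAYER NAME'
    print(n, frame.shape, frame.columns.tolist())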
