加速python scrape sqllite3

2024-09-29 09:22:18 发布

您现在位置:Python中文网/ 问答频道 /正文

我想从NBA.com json feed中获取NBA boxscores。我想知道是否有任何方法来加速我目前的代码

def ParsePlayerBox(boxObj):
    statsDict = {}
    homeId, awayId = boxObj['resultSets'][1].values()[1][0][3], boxObj['resultSets'][1].values()[1][1][3]
    dateObj = parse(boxObj['resultSets'][1].values()[1][0][0].split('T')[0]).date()
    for x in  boxObj['resultSets'][4].values()[1:][0]:
         gameId, teamId, teamAbbr, teamCity, plyrId, plyrName, posStr, comment, mpStr, fgm, fga, fgPerc, fg3m, fg3a, pgPerc, ftm, fta, ftPerc, orb, drb, trb, ast, stl, blk, tov, pf, pts, plusMinus = x
         plyrGameId = gameId + '_' + str(plyrId)
         if mpStr != None: # PLAYER PLAYED IN GAME
            fg2m, fg2a =  fgm - fg3m, fga - fg3a
            minutes, seconds = [int(x) for x in mpStr.split(':')]
            mp = minutes +  seconds / 60 # CONVERT MP INTO DECIMAL
         else: # PLAYER DIDNT PLAY
            fg2m, fg2a = None, None
            mp = None
          if posStr == '':
             position = np.nan
          else: 
             position = posStr
          if teamId == homeId:
              isHome = True
              oppId = awayId
          elif teamId == awayId:
              isHome = False
              oppId = homeId
          statsDict[plyrGameId] = [gameId, dateObj, str(teamId), str(plyrId), plyrName, str(oppId), isHome, position, comment.strip(), mp, fgm, fga, fg2m, fg2a, fg3m, fg3a, ftm, fta,  orb, drb, trb, ast, stl, blk, tov, pf, pts, plusMinus]
    return statsDict



def ImportBoxScores(start, end):
    con = sqlite3.connect("nba.db")
    con.execute("CREATE TABLE IF NOT EXISTS NBAPLYRBOXTEST(PlyrGameId, GameId, DateObj, TeamId, PlyrId, PlyrName, OppId, IsHome, Position, Comment, MP, FGM, FGA, FG2M, FG2A, FG3M, FG3A, FTM, FTA, ORB, DRB, TRB, AST, STL, BLK, TOV, PF, PTS, PLUS_MINUS)")
    for gameId in range(start, end):
        urlBox = 'http://stats.nba.com/stats/boxscore?GameID=00' + str(gameId) + '&RangeType=0&StartPeriod=0&EndPeriod=0&StartRange=0&EndRange=0'
        rBox = requests.get(urlBox)
        boxObj = json.loads(rBox.content)
        playerBoxScore = ParsePlayerBox(boxObj)
        for k, v in playerBoxScore.iteritems():
            plyrGameId = k
            gameId, dateObj, teamId, plyrId, plyrName, oppId, isHome, position, comment, mp, fgm, fga, fg2m, fg2a, fg3m, fg3a, ftm, fta,  orb, drb, trb, ast, stl, blk, tov, pf, pts, plusMinus = v
            data = [(plyrGameId, gameId, dateObj, teamId, plyrId, plyrName, oppId, isHome, position, comment, mp, fgm, fga, fg2m, fg2a, fg3m, fg3a, ftm, fta,  orb, drb, trb, ast, stl, blk, tov, pf, pts, plusMinus)]
            stmt = "INSERT INTO  NBAPLYRBOXTEST VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"         
            con.executemany(stmt, data) 
            con.commit()     
    print gameId, datetime.datetime.now().time()
con.close() 

导入游戏的所有数据大约需要11秒。我还想刮的发挥,按发挥的数据,这将是大约400场左右的比赛。我不太了解SQL,所以我希望有更好的方法导入数据

谢谢


Tags: positionmpstrfgagameidfg3mteamidfgm