类sql标量函数

2024-09-30 22:15:53 发布

您现在位置:Python中文网/ 问答频道 /正文

有没有一种方法可以为pandas实现类似sql的标量函数? 输入将取决于数据帧的行和列。你知道吗

import pandas as pd
import numpy as np

#################### this is the data ####################################################
volume_curve = pd.DataFrame(
    {'SecurityName': {0: 'GOOGL', 1: 'GOOGL', 2: 'GOOGL', 3: 'GOOGL', 4: 'GOOGL', 5: 'GOOGL', 6: 'GOOGL', 
                      7: 'GOOGL', 8: 'GOOGL', 9: 'GOOGL', 10: 'GOOGL', 11: 'GOOGL', 12: 'GOOGL', 13: 'GOOGL', 
                      14: 'GOOGL', 15: 'AAPL', 16: 'AAPL', 17: 'AAPL', 18: 'AAPL', 19: 'AAPL', 20: 'AAPL', 21: 'AAPL', 
                      22: 'AAPL', 23: 'AAPL', 24: 'AAPL', 25: 'AAPL', 26: 'AAPL', 27: 'AAPL', 28: 'AAPL', 29: 'AAPL'}, 
     'RemainingVolume': {0: 100, 1: 80, 2: 75, 3: 70, 4: 65, 5: 60, 6: 58, 7: 55, 8: 53, 9: 50, 10: 40, 11: 30, 12: 25,
                         13: 20, 14: 0, 15: 100, 16: 90, 17: 80, 18: 70, 19: 60, 20: 50, 21: 45, 22: 40, 23: 35, 24: 30, 
                         25: 25, 26: 20, 27: 15, 28: 10, 29: 0}, 
     'TradingTimeEST': {0: '3/22/2016 9:00:00', 1: '3/22/2016 9:30:00', 2: '3/22/2016 10:00:00', 3: '3/22/2016 10:30:00', 
                        4: '3/22/2016 11:00:00', 5: '3/22/2016 11:30:00', 6: '3/22/2016 12:00:00', 7: '3/22/2016 12:30:00',
                        8: '3/22/2016 13:00:00', 9: '3/22/2016 13:30:00', 10: '3/22/2016 14:00:00', 11: '3/22/2016 14:30:00', 
                        12: '3/22/2016 15:00:00', 13: '3/22/2016 15:30:00', 14: '3/22/2016 16:00:00', 15: '3/22/2016 9:00:00', 
                        16: '3/22/2016 9:30:00', 17: '3/22/2016 10:00:00', 18: '3/22/2016 10:30:00', 19: '3/22/2016 11:00:00', 
                        20: '3/22/2016 11:30:00', 21: '3/22/2016 12:00:00', 22: '3/22/2016 12:30:00', 23: '3/22/2016 13:00:00',
                        24: '3/22/2016 13:30:00', 25: '3/22/2016 14:00:00', 26: '3/22/2016 14:30:00', 27: '3/22/2016 15:00:00', 
                        28: '3/22/2016 15:30:00', 29: '3/22/2016 16:00:00'}}
)

trading_time = pd.DataFrame(
    {'SecurityName': {0: 'GOOGL', 1: 'GOOGL', 2: 'APPL', 3: 'APPL'}, 
     'EndTradingTimeEST': {0: '3/22/2016 11:30:00', 1: '3/22/2016 3:30:00', 2: '3/22/2016 11:30:00', 3: '3/22/2016 3:30:00'}, 
     'StartTradingTimeEST': {0: '3/22/2016 9:15:00', 1: '3/22/2016 1:30:00', 2: '3/22/2016 9:15:00', 3: '3/22/2016 1:30:00'}}
)

volume_curve['TradingTimeEST'] = pd.to_datetime(volume_curve['TradingTimeEST'])
trading_time['StartTradingTimeEST'] = pd.to_datetime(trading_time['StartTradingTimeEST'])
trading_time['EndTradingTimeEST'] = pd.to_datetime(trading_time['EndTradingTimeEST'])
#################### this is the data ####################################################

############################### a sql like scalar function that returns a single value #################
def get_available_volume(start, end, security_name, volume_info):
    """
    interpolate the volume of start trading and end trading time based on the volume information. 
    and the difference is the available volume    
    """
    start = start[0]
    end = end[0]
    security_name = security_name[0]

    stage = volume_info[volume_info['SecurityName'] == security_name]
    start_n_time = stage[stage['TradingTimeEST'] >= start]['TradingTimeEST'].min()
    start_n_value = stage[stage['TradingTimeEST'] == start_n_time]['RemainingVolume'].values[0]
    start_p_time = stage[stage['TradingTimeEST'] <= start]['TradingTimeEST'].max()
    start_p_value = stage[stage['TradingTimeEST'] == start_p_time]['RemainingVolume'].values[0]
    if start == start_n_time:
        start_interpolate_value = start_n_value
    else:
        start_interpolate_value = start_p_value-(start_p_value-start_n_value)*(start_n_time - start)/(start_n_time-start_p_time)

    end_n_time = stage[stage['TradingTimeEST'] >= end]['TradingTimeEST'].min()
    end_n_value = stage[stage['TradingTimeEST'] == end_n_time]['RemainingVolume'].values[0]
    end_p_time = stage[stage['TradingTimeEST'] <= end]['TradingTimeEST'].max()
    end_p_value = stage[stage['TradingTimeEST'] == end_p_time]['RemainingVolume'].values[0]

    if end == end_n_time:
        end_interpolate_value = end_n_value
    else:
        end_interpolate_value = end_p_value-(end_p_value-end_n_value)*(end_n_time - end)/(end_n_time-end_p_time)

    # return the difference of start time volume and end time volume
    return start_interpolate_value - end_interpolate_value
################################################# end of this function ############


print get_available_volume(
    start = [pd.to_datetime('2016-3-22 9:50')],
    end = [pd.to_datetime('2016-3-22 11:15')],
    security_name = ['GOOGL'],
    volume_info = volume_curve
)


trading_time['AvailableVolume'] = trading_time.apply(
    lambda x: get_available_volume(
        start = trading_time['StartTradingTimeEST'],
        end = trading_time['EndTradingTimeEST'],
        security_name = trading_time['SecurityName'],
        volume_info = volume_curve
    ), 
    axis = 1
)

编辑: 修正:{当我尝试应用这个函数时,我得到了一个错误:'长度必须匹配才能比较',u'出现在索引0')}

当前错误:它的值不正确


Tags: thenametimevaluestagestartendpd