测试 DataFrame 处理管道的 Pythonic 方法是什么？

2024-09-23 06:34:05 发布

您现在位置:Python中文网/ 问答频道 /正文

测试 pandas DataFrame 处理链的最佳方法是什么？我在下面给出了脚本文件和测试文件的存根（stub），这样您就可以明白我的意思了。

我对最佳实践感到困惑。我唯一的直觉是：让测试能够以任意顺序运行，限制从磁盘加载 CSV 的次数，同时确保管道中的每一步都不会修改 fixture。流程中的每一步都依赖于前面的步骤，因此对每个节点做单元测试，实际上就是测试管道到该节点为止的累积处理结果。到目前为止我已经完成了任务，但似乎出现了大量重复代码，因为我在每个测试中都要逐步重新构建管道。

测试这类 Python 脚本的 Pythonic 方法是什么？

这是数据处理文件的存根:

#main_script.py

def calc_allocation_methodology(df_row):
    """Return the allocation methodology label for one row.

    Stub: the row is not inspected; every row is labelled ``'simple'``.
    """
    methodology = 'simple'
    print('calculating allocation methodoloyg')
    return methodology

def flag_data_for_the_allocation_methodology(df):
    """Add an ``allocation_methodology`` column computed row by row.

    Args:
        df: DataFrame whose rows are passed to
            ``calc_allocation_methodology``.

    Returns:
        A new DataFrame carrying the ``allocation_methodology`` column.
        The input frame itself is not modified.
    """
    allocation_methodology = df.apply(calc_allocation_methodology, axis=1)
    print('flagging each row for the allocation methodoloyg')
    # DataFrame.assign returns a new frame instead of mutating in place, so
    # its result must be returned for the new column to reach the caller.
    return df.assign(allocation_methodology=allocation_methodology)

def convert_repeating_values_to_nan(df):
    """Keep one value and NaN the rest of the values.

    Stub: currently only logs a message and hands ``df`` back untouched.
    """
    print('convert repeating values to nan')
    unchanged = df
    return unchanged

def melt_and_drop_accounting_columns(df):
    """Melt and drop the accounting columns (stub).

    Currently only logs progress and returns ``df`` unchanged.

    Args:
        df: Object exposing a pandas-style ``shape`` tuple ``(rows, columns)``.

    Returns:
        The same ``df`` object that was passed in.
    """
    print('melt and drop accounting columns')
    # shape is (rows, columns): index 1 is the column count the message names.
    print(f'colums remaining: {df.shape[1]}')
    return df
    
def melt_and_drop_engineering_columns(df):
    """Melt and drop the engineering columns (stub).

    Currently only logs progress and returns ``df`` unchanged.

    Args:
        df: Object exposing a pandas-style ``shape`` tuple ``(rows, columns)``.

    Returns:
        The same ``df`` object that was passed in.
    """
    print('melt and drop engineering columns')
    # shape is (rows, columns): index 1 is the column count the message names.
    print(f'colums remaining: {df.shape[1]}')
    return df
    
    
def process_csv_to_tiny_format(df):
    """Run every pipeline stage, in order, on ``df`` and return the result."""
    print('process the entire pipeline')
    flagged = df.pipe(flag_data_for_the_allocation_methodology)
    deduplicated = flagged.pipe(convert_repeating_values_to_nan)
    without_accounting = deduplicated.pipe(melt_and_drop_accounting_columns)
    return without_accounting.pipe(melt_and_drop_engineering_columns)

这是测试文件的存根：



#test_main.py


from pytest import fixture
import main_script as main
import pandas as pd

@fixture(scope='session')
def df_from_csv():
    """Load the database dump from disk once per test session.

    Session scope keeps the CSV from being re-read for every test. The
    returned frame is shared, so tests should work on a copy of it.
    """
    return pd.read_csv('database_dump.csv')

@fixture
def df_copy(df_from_csv):
    """Give each test its own copy of the session-scoped frame."""
    return df_from_csv.copy()

    
    
def test_expected_flag_data_for_the_allocation_methodology(df_copy):
    """Run the pipeline up to and including the methodology-flagging step."""
    node_to_test = df_copy.pipe(main.flag_data_for_the_allocation_methodology)
    # Placeholder assertion: replace with real checks on node_to_test.
    assert True

def test_convert_repeating_values_to_nan(df_copy):
    """Run the pipeline up to and including the repeating-values step."""
    node_to_test = (df_copy
        .pipe(main.flag_data_for_the_allocation_methodology)
        .pipe(main.convert_repeating_values_to_nan))
    # Placeholder assertion: replace with real checks on node_to_test.
    assert True
        
def test_melt_and_drop_accounting_columns(df_copy):
    """Run the pipeline up to and including the accounting-columns step."""
    node_to_test = (df_copy
        .pipe(main.flag_data_for_the_allocation_methodology)
        .pipe(main.convert_repeating_values_to_nan)
        .pipe(main.melt_and_drop_accounting_columns))
    # Placeholder assertion: replace with real checks on node_to_test.
    assert True
    
def test_melt_and_drop_engineering_columns(df_copy):
    """Run the pipeline up to and including the engineering-columns step."""
    node_to_test = (df_copy
        .pipe(main.flag_data_for_the_allocation_methodology)
        .pipe(main.convert_repeating_values_to_nan)
        .pipe(main.melt_and_drop_accounting_columns)
        .pipe(main.melt_and_drop_engineering_columns))
    # Placeholder assertion: replace with real checks on node_to_test.
    assert True

def test_process_csv_to_tiny_format(df_from_csv):
    """Run the whole pipeline on a private copy of the loaded CSV."""
    private_frame = df_from_csv.copy()
    tiny_data = main.process_csv_to_tiny_format(private_frame)
    # Placeholder assertion; no checks on tiny_data yet.
    assert True

Tags: columns, and, csv, the, to, test, df, main