并行化Dask聚合

import dask.dataframe as dd
from dask.distributed import Client
from pandas import DataFrame


def chunk(s):
    """Count the occurrences of each value; first step of the custom mode."""
    return s.value_counts()


def agg(s):
    """Add up the counts that share the same full index entry.

    ``s`` is a grouped object; its underlying series is regrouped on every
    index level so duplicate (group, value) entries are summed together.
    """
    s = s._selected_obj
    return s.groupby(level=list(range(s.index.nlevels))).sum()


def finalize(s):
    """Return, for each group, the value with the highest count (the mode)."""
    # s is a multi-index series of the form (group, value): count. First
    # manually group on the group part of the index. The function applied to
    # each group receives a sub-series with multi index. Next, drop the group
    # part from the index. Finally, determine the index with the maximum
    # value, i.e., the mode.
    level = list(range(s.index.nlevels - 1))
    return (
        s.groupby(level=level)
        # idxmax returns the index *label* (the value itself); argmax would
        # return only its integer position in current pandas.
        .apply(lambda sub: sub.reset_index(level=level, drop=True).idxmax())
    )


def main() -> DataFrame:
    """Read the sample CSV and compute the mode of every column per (a, b) group."""
    # Kept even though the name is unused below: constructing the Client
    # connects to the scheduler at this address.
    client = Client('scheduler:8786')
    ddf = dd.read_csv('/sample/data.csv')
    custom_mode = dd.Aggregation('custom mode', chunk, agg, finalize)
    result = ddf.groupby(['a', 'b']).agg(custom_mode).compute()
    return result

1条回答

网友

1楼 · 发布于 2024-09-14 18:06:17

最后，我使用 futures 将每一列的聚合并行化。由于我有很多列，把每个聚合提交给各自的 worker 去执行可以为我节省大量时间。感谢 David 的评论，以及 Dask 文档中关于并行工作负载的文章。

import dask.dataframe as dd
import pandas as pd
from dask.distributed import Client
from pandas import DataFrame

def chunk(s):
    """Count how many times each distinct value occurs in ``s``.

    Passed to ``dd.Aggregation`` as its chunk step; the result is whatever
    ``value_counts()`` returns for the given (grouped) series.
    """
    occurrences = s.value_counts()
    return occurrences

def agg(s):
    """Merge partial counts by summing entries with an identical index.

    ``s`` is a grouped object; the series it wraps is taken out and regrouped
    on all of its index levels, so repeated index entries collapse into a
    single summed row.
    """
    counts = s._selected_obj
    every_level = list(range(counts.index.nlevels))
    regrouped = counts.groupby(level=every_level)
    return regrouped.sum()

def finalize(s):
    """Pick, for every group, the value carrying the largest count.

    ``s`` is indexed by the group levels followed by one value level. All
    levels except the last identify the group; within each group the group
    levels are dropped and the remaining index label with the maximum count
    is returned.
    """
    group_levels = list(range(s.index.nlevels - 1))

    def _most_frequent(sub):
        # Leave only the value level as index so idxmax yields the value.
        by_value = sub.reset_index(level=group_levels, drop=True)
        return by_value.idxmax()

    per_group = s.groupby(level=group_levels)
    return per_group.apply(_most_frequent)

def delayed_mode(ddf, groupby, col, custom_agg):
    """Aggregate a single column of ``ddf`` per group and compute the result.

    Args:
        ddf: Object to group; it must offer ``groupby(...).agg(...).compute()``.
        groupby: Grouping key(s) handed to ``ddf.groupby``.
        col: Name of the one column to aggregate.
        custom_agg: Aggregation applied to ``col``.

    Returns:
        Whatever ``compute()`` yields for the single-column aggregation.
    """
    grouped = ddf.groupby(groupby)
    single_column = grouped.agg({col: custom_agg})
    return single_column.compute()

def main() -> DataFrame:
    """Compute the per-group mode of every value column, one task per column.

    Reads the sample CSV, builds the custom mode aggregation from ``chunk``,
    ``agg`` and ``finalize``, submits one ``delayed_mode`` call per column to
    the cluster, and joins the per-column results side by side.

    Returns:
        The per-column results concatenated along the column axis.
    """
    client = Client('scheduler:8786')

    ddf = dd.read_csv('/sample/data.csv')
    custom_mode = dd.Aggregation('custom mode', chunk, agg, finalize)

    group_keys = ["a", "b"]
    # Only the non-key columns are aggregated; the grouping columns end up in
    # the index of every result rather than being aggregated themselves.
    value_columns = [col for col in ddf.columns if col not in group_keys]

    # One future per column so the aggregations can run concurrently.
    futures = [
        client.submit(delayed_mode, ddf, group_keys, col, custom_mode)
        for col in value_columns
    ]

    frames = client.gather(futures)
    result = pd.concat(frames, axis=1)
    return result

相关问题更多 >

编程相关推荐

热门问题

热门文章