Why is my pandas UDF not being parallelized?

Posted 2024-10-03 04:33:18


I have data from many IoT sensors. For each individual sensor there are only about 100 rows in the DataFrame, so the data is not skewed. I am training a separate machine learning model for each sensor.

I am using a pandas UDF to train the different models and log their metrics with mlflow, supposedly in parallel, as taught here.

Using Databricks on Azure with a single-node cluster (Standard_DS3_v2 - 14 GB memory - 4 cores), I was able to complete all the training in about 23 minutes.

Since a pandas UDF supposedly computes each group in parallel, I thought the training would finish faster on a single-node cluster with more cores, or on a cluster with more workers. So I tried running the same notebook with:

  1. A cluster with 1 driver + 3 workers, all (Standard_DS3_v2 - 14 GB memory - 4 cores)
  2. A single-node cluster with (Standard_DS5_v2 - 56 GB memory - 16 cores)

To my surprise, the training time did not decrease: 23 minutes for option 1 and 26.5 minutes for option 2.

I also tried using the newer applyInPandas, but the result was roughly the same.

Note: following @Chris's answer, I checked the stage detail page in the web UI (for the cluster with 1 driver + 3 workers) and found that there is only one stage responsible for the pandas UDF training, and it took 20 minutes. Drilling into that stage's details, I saw it had only one task, which took the entire 20 minutes. Screenshots are below.

So @Chris has identified the problem: the training is not being parallelized.

To help understand why applyInPandas (or the pandas UDF) is not being parallelized, I have put the code below (the applyInPandas version). Note that my goal is only to log the trained models' metrics with mlflow, so the function simply returns the same DataFrame it receives.

Also note that the code works as expected: mlflow logs every training run successfully. My only question is why it is not being parallelized.

I have a feeling the problem lies in the for loop, since it differs from the tutorial.

import pyspark.sql.functions as f
import mlflow
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pmdarima as pm
from statsmodels.tsa.statespace.sarimax import SARIMAX

def train_model(df_pandas):
  '''
  Trains a model on grouped instances
  '''
  original_df = df_pandas.copy() #the original df will be returned in the end
  PA = df_pandas['Princípio_Ativo'].iloc[0]
  run_id = df_pandas['run_id'].iloc[0] # Pulls run ID to do a nested run
  
  
  observacoes_no_teste = 12
  horizonte = 1
  observacoes_total = len(df_pandas.index)
  observacoes_no_train = len(df_pandas.index) - observacoes_no_teste
  
  try:
    #train test split
    X = df_pandas[:observacoes_no_train]['Demanda']
    y = df_pandas[observacoes_no_train:]['Demanda']

    # Train the model
    model = pm.auto_arima(X, seasonal=True, m=12)

    order = model.get_params()['order']
    seasonal_order = model.get_params()['seasonal_order']


  except:
    pass
 
  # Resume the top-level training
  with mlflow.start_run(run_id=run_id, experiment_id=1333367041812290):
    # Create a nested run for the specific device
    with mlflow.start_run(run_name=str(PA), nested=True, experiment_id=1333367041812290) as run:
      
      mae_list = []
      mse_list = []
      previsoes_list = []
      teste_list = []
      predictions_list = []

    
      try:
        #the purpose of the following loop is to do backtesting: the model is trained with n observations, and the (n+1)th is predicted. n is increased by 1 on each iteration.
        for i in range(observacoes_total-observacoes_no_train-horizonte+1):
          #train test split
          X = df_pandas[:observacoes_no_train+i]['Demanda']
          y = df_pandas[observacoes_no_train+i:observacoes_no_train+i+horizonte]['Demanda']
          #train model
          model = SARIMAX(X, order=order, seasonal_order=seasonal_order)
          model = model.fit()
          #make predictions
          predictions = model.predict(start=observacoes_no_train + i, end=(observacoes_no_train + i + horizonte-1))

          predictions_list.append(predictions)

          mse = round(mean_squared_error(y, predictions),2)
          mae = round(mean_absolute_error(y, predictions),2)

          mse_list.append(mse)
          mae_list.append(mae)

        #series with predictions
        in_sample_predictions = pd.concat(predictions_list)
        in_sample_predictions.name = 'in_sample'
        #out of sample predictions
        hp = 3
        out_of_sample_predictions = model.predict(start=observacoes_total, end=(observacoes_total + hp - 1))
        out_of_sample_predictions.name = 'out_sample'
        #in sample + out of sample predictions
        df_predictions = pd.concat([df_pandas.drop('run_id',axis=1), in_sample_predictions,out_of_sample_predictions], axis=1)
        #save df with predictions to be logged as an artifact by mlflow
        df_predictions.to_csv('df_predictions.csv')

        #mlflow logging
        mlflow.log_param("Princípio_Ativo", PA)
        mlflow.log_param("mae_list", str(mae_list))
        mlflow.log_param("mse_list", str(mse_list))
        mlflow.log_param("status_sucesso", 'sim')
        mlflow.log_artifact('df_predictions.csv')
      except:
        mlflow.log_param("status_falha", 'sim')

  return original_df.drop('run_id', axis=1) 

with mlflow.start_run(run_name="SARIMA", experiment_id=1333367041812290) as run:
  run_id = run.info.run_uuid

  modelDirectoriesDF = (df
    .withColumn("run_id", f.lit(run_id)) # Add run_id
    .groupby("Princípio_Ativo")
    .applyInPandas(train_model, schema=df.schema)
  )
  
combinedDF = (df
  .join(modelDirectoriesDF, on="Princípio_Ativo", how="left")
)

display(combinedDF)

Screenshots from the Spark UI: picture1, picture2, picture3


2 answers

Correct. A stage with only one task is not parallelized, which explains why the runtime does not go down when you add more cores or more nodes to the cluster.

Your input dataset is small (69 KB), so unless you explicitly repartition it, Spark will read it into a single partition as long as the partition size is left at the default of 128 MB (set by the spark.sql.files.maxPartitionBytes parameter). It will therefore be assigned to a single task.
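
For example, a quick way to confirm this (a minimal sketch; df is assumed to be the input DataFrame from the question and spark the active SparkSession) is to check the partition count and the relevant setting:

# Number of partitions of the input DataFrame; with a 69 KB dataset and default
# settings this will typically print 1, i.e. a single task for the whole stage.
print(df.rdd.getNumPartitions())

# Default maximum partition size Spark uses when reading files (128 MB).
print(spark.conf.get("spark.sql.files.maxPartitionBytes"))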

Repartitioning the input by the device column should give you the parallel training you are looking for.
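
As a rough sketch of that suggestion applied to the question's code (assuming Princípio_Ativo is the device/grouping column, as in the original snippet), the repartition would go just before the groupby:

modelDirectoriesDF = (df
  .withColumn("run_id", f.lit(run_id))
  .repartition("Princípio_Ativo")  # spread the groups across multiple partitions, and therefore multiple tasks
  .groupby("Princípio_Ativo")
  .applyInPandas(train_model, schema=df.schema)
)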

I think you can improve parallelization by calling sort() on combinedDF before running the final display command. Something like:

import pyspark.sql.functions as F
display(combinedDF.sort(F.col("Princípio_Ativo").asc()))

The snippet above prevents the display function from evaluating only some of the rows in combinedDF. The idea is that by sorting the rows before display, Spark has to evaluate all of them to know the row order for display, so everything gets scheduled to run in parallel. When Spark runs display without sorting first, it schedules an exponentially growing number of tasks per stage until enough rows have been computed to fill the display.
