在fit_gen上使用multi_gpu_model时出现不兼容的形状错误

2024-06-17 16:28:37 发布

您现在位置:Python中文网/ 问答频道 /正文

我们正在尝试使用多个gpu模式,以便能够在4个gpu上进行训练,但是我们收到了一个不兼容的形状错误。如果删除了multi_gpu_模型行,则程序可以正常工作。在

我们试着让批处理大小是我们使用的gpu数量的倍数。在

from tensorflow._api.v1.keras.models import Model
from tensorflow._api.v1.keras.layers import Convolution2D, concatenate, Input, Lambda
from tensorflow._api.v1.keras.utils import multi_gpu_model, to_categorical
from tensorflow.python.keras.utils.data_utils import Sequence
from train import *
from utils import *
from config import *
import pickle
import sys


class Image_Generator(Sequence):

    def __init__(self, image_filenames, label_filenames, batch_size):
        self.image_filenames, self.label_filenames = image_filenames, label_filenames
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.image_filenames) / float(self.batch_size)))

    def __getitem__(self, idx):
        x_filenames = self.image_filenames[idx * self.batch_size:(idx + 1) * self.batch_size]
        y_filenames = self.label_filenames[idx * self.batch_size:(idx + 1) * self.batch_size]

        sky_images = np.array([np.asarray(imageio.imread(file_name)) for file_name in x_filenames])
        tsi = np.array([np.asarray(imageio.imread(file_name)) for file_name in y_filenames])
        masks = np.empty((self.batch_size, 480, 480))

        for i in range(len(tsi)):
            masks[i] = mask_to_index(tsi[i])

        X = [sky_images, masks]
        Y = to_categorical(masks)
        Y = Y[:, :, :, 0:4]

        return X, Y


def load_filenames(stamps, input_dir, masks):
    filenames = []
    if masks:
        for s in stamps:
            filenames.append(extract_mask_path_from_time(s, input_dir))
    else:
        for s in stamps:
            filenames.append(extract_img_path_from_time(s, input_dir))
    return filenames


def build_model():
    # Create the inputs to the network.
    sky_images = Input(shape=(480, 480, 3), name='sky_image')
    # Main body of the network
    conv1 = Convolution2D(filters=32, kernel_size=3, padding='same', data_format='channels_last', activation='relu')(sky_images)
    maxpool1 = Lambda(lambda x: tf.nn.max_pool(conv1, [1, 1, 100, 1], strides=[1, 1, 1, 1], padding='SAME'), name='maxpool1')(conv1)
    concat1 = concatenate([conv1, maxpool1], axis=3)
    conv3 = Convolution2D(filters=4, kernel_size=3, padding='same', data_format='channels_last', activation='relu')(concat1)
    always_full = tf.constant([[[0, 0, 0, 1] for i in range(480)] for j in range(480)], dtype='float32')
    masked = Lambda(lambda x: tf.add(always_full, conv3), name='masked')(conv3)
    # Build and return the model
    model = Model(inputs=sky_images, outputs=masked)
    model = multi_gpu_model(model, gpus=4)
    return model


if __name__ == '__main__':
    np.random.seed(123)  # for reproducibility
    run_name = sys.argv[0:]

    with open(TYPICAL_DATA_DIR + '/train.stamps', 'rb') as f:
        train_stamps = pickle.load(f)
    print('Training stamps loaded.')
    with open(TYPICAL_VALID_FILE, 'rb') as f:
        valid_stamps = pickle.load(f)
    print('Validation stamps loaded.')

    training_image_filenames = load_filenames(train_stamps, TYPICAL_DATA_DIR, False)
    print('Training image file paths loaded.')
    training_tsi_filenames = load_filenames(train_stamps, TYPICAL_DATA_DIR, True)
    print('Training mask file paths loaded.')
    validation_image_filenames = load_filenames(valid_stamps, TYPICAL_DATA_DIR, False)
    print('Validation image file paths loaded.')
    validation_tsi_filenames = load_filenames(valid_stamps, TYPICAL_DATA_DIR, True)
    print('Validation mask file paths loaded.')

    model = build_model()
    print('Model built.')
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    print('Model compiled.')

    training_batch_generator = Image_Generator(training_image_filenames, training_tsi_filenames, TRAINING_BATCH_SIZE)
    print('Training generator initialized.')
    validation_batch_generator = Image_Generator(validation_image_filenames, validation_tsi_filenames,
                                                 TRAINING_BATCH_SIZE)
    print('Validation generator initialized.')

    model.summary()

    model.fit_generator(generator=training_batch_generator,
                        steps_per_epoch=(len(train_stamps) // (TRAINING_BATCH_SIZE)),
                        epochs=1,
                        verbose=1,
                        validation_data=validation_batch_generator,
                        validation_steps=(len(valid_stamps) // (TRAINING_BATCH_SIZE)),
                        use_multiprocessing=False)

我们希望形状是[26480480,4]而不是[104480480,4]。 以下是完整的错误消息:

Traceback (most recent call last):

File "multi_gpu_test.py", line 127, in use_multiprocessing=False)

File "/local/cluster/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py", line 1426, in fit_generator initial_epoch=initial_epoch)

File "/local/cluster/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_generator.py", line 191, in model_iteration batch_outs = batch_function(*batch_data)

File "/local/cluster/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py", line 1191, in train_on_batch outputs = self._fit_function(ins) # pylint: disable=not-callable

File "/local/cluster/lib/python3.6/site-packages/tensorflow/python/keras/backend.py", line 3076, in call run_metadata=self.run_metadata)

File "/local/cluster/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1439, in call run_metadata_ptr)

File "/local/cluster/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 528, in exit c_api.TF_GetCode(self.status.status)) tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [26,480,480,4] vs. [104,480,480,4] [[{{node loss/masked_loss/mul}}]] [[{{node training/Adam/gradients/masked_1/concat_grad/Slice_1}}]]


Tags: nameinfromimageimportselfsizemodel