使用 tf.data.Dataset 时 TensorFlow 性能缓慢

class Trainer:
    """Bundle a model, an optimizer and a loss into a hand-written training loop.

    The model is invoked as ``model(inputs)`` and must expose
    ``trainable_variables``; the loss is invoked as ``loss(targets, predictions)``;
    the optimizer must provide ``apply_gradients``.
    """

    def __init__(self, model, optimizer, loss):
        self.model = model
        self.loss_function = loss
        self.optimizer = optimizer

    @tf.function
    def train_step(self, inputs, targets):
        """Run one optimisation step on a single batch and return its loss."""
        weights = self.model.trainable_variables
        # Only the forward pass and the loss need to be recorded on the tape.
        with tf.GradientTape() as recorder:
            outputs = self.model(inputs)
            loss = self.loss_function(targets, outputs)
        grads = recorder.gradient(loss, weights)
        self.optimizer.apply_gradients(zip(grads, weights))
        return loss

    @tf.function
    def fit0(self, dataset, epochs):
        """Fit using a dataset: every epoch iterates over all (input, target) pairs."""
        for _ in tf.range(epochs):
            for batch_inputs, batch_targets in dataset:
                self.train_step(batch_inputs, batch_targets)

    @tf.function
    def fit1(self, inputs, targets, epochs):
        """Fit using lists of tensors: ``inputs`` and ``targets`` are zipped pairwise.

        NOTE(review): the discussion on this page reports that AutoGraph fully
        unrolls this inner loop, so the graph grows with the number of batches.
        """
        for _ in tf.range(epochs):
            for batch_inputs, batch_targets in zip(inputs, targets):
                self.train_step(batch_inputs, batch_targets)

# Benchmark script from the question.
# Assumes `time` and `tensorflow as tf` are already imported and that the
# Trainer class shown above is defined.
input_size = 10000
batch_size = 100
q = input_size // batch_size  # number of batches

# create random inputs (x) and outputs (y)
x = tf.random.normal((input_size, 1), dtype=tf.float32)
y = tf.random.normal((input_size, 1), dtype=tf.float32)

splits = tf.fill([q, ], batch_size)

# create a list of tensors representing batches
x_list = tf.split(x, splits)
y_list = tf.split(y, splits)

# create datasets in the different ways
dataset0 = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size)
dataset1 = tf.data.Dataset.from_tensor_slices((tf.stack(x_list), tf.stack(y_list)))

# model definition
model = tf.keras.Sequential([
    tf.keras.layers.Dense(20, activation='tanh', input_shape=(1,)),
    tf.keras.layers.Dense(1, activation='linear')])

# trainer initialization
trainer = Trainer(model=model,
                  optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.keras.losses.MeanSquaredError())

# first run to perform initializations
time0 = time.time()
trainer.fit0(dataset=dataset0, epochs=tf.constant(1, dtype=tf.int32))
time0 = time.time() - time0

time1 = time.time()
trainer.fit0(dataset=dataset1, epochs=tf.constant(1, dtype=tf.int32))
time1 = time.time() - time1

time2 = time.time()
trainer.fit1(inputs=x_list, targets=y_list, epochs=tf.constant(1, dtype=tf.int32))
time2 = time.time() - time2

print("first fit0 with dataset0 took {} seconds".format(time0))
print("first fit0 with dataset1 took {} seconds".format(time1))
print("first fit1 with tensorlist took {} seconds".format(time2))

# measure performances
time0 = time.time()
trainer.fit0(dataset=dataset0, epochs=tf.constant(100, dtype=tf.int32))
time0 = time.time() - time0

time1 = time.time()
trainer.fit0(dataset=dataset1, epochs=tf.constant(100, dtype=tf.int32))
time1 = time.time() - time1

time2 = time.time()
trainer.fit1(inputs=x_list, targets=y_list, epochs=tf.constant(100, dtype=tf.int32))
time2 = time.time() - time2

print("fit0 with dataset0 took {} seconds".format(time0))
print("fit0 with dataset1 took {} seconds".format(time1))
print("fit1 with tensorlist took {} seconds".format(time2))

1条回答

网友
1楼 · 发布于 2024-10-01 02:29:35

我获得了良好的性能改进，代码和结果如下所示。
然而，我只能部分回答这些问题，特别是第二个问题仍然悬而未决
配置：英特尔i3 cpu、tensorflow cpu 2.1
以下是改进后的 fit0 函数代码，Trainer 类的其余部分保持不变：
# fit using dataset
@tf.function
def fit0(self, dataset, epochs, batches, unroll=1):
    """Train by pulling batches from a dataset iterator with a partially unrolled loop.

    Args:
        dataset: iterable of (input_batch, target_batch) pairs; it must be able
            to supply at least ``epochs * batches`` batches.
        epochs: number of epochs to run.
        batches: number of batches per epoch.
        unroll: how many train steps are emitted per outer-loop iteration.
            Must be a Python value (not a tensor) and a divisor of ``batches``.
    """
    tf.assert_equal(tf.is_tensor(unroll), False, "unroll must be a python variable.")
    tf.assert_equal(tf.math.floormod(batches, unroll), tf.constant(0), "unroll must be a divisor of batches.")
    # Floor division keeps the trip count an integer; `/` on integer tensors is
    # true division and would hand tf.range a floating-point limit. The value is
    # unchanged because unroll is asserted above to divide batches.
    entries = epochs * batches // unroll
    it = iter(dataset)
    for entry in tf.range(entries):
        # this loop gets unrolled if unroll is python variable, not a tensor.
        for _ in range(unroll):
            input_batch, target_batch = next(it)
            self.train_step(input_batch, target_batch)
下面是我运行测试时使用的代码：
# Benchmark script from the answer.
# Assumes `time` and `tensorflow as tf` are already imported and that Trainer
# (with the unrolling fit0 shown above) is defined.
input_size = 100000
batch_size = 100
num_epochs = 100
num_unroll = 5
num_batches = input_size // batch_size

# create random inputs (x) and outputs (y)
x = tf.random.normal((input_size, 1), dtype=tf.float32)
y = tf.random.normal((input_size, 1), dtype=tf.float32)

splits = tf.fill([num_batches, ], batch_size)
x_list, y_list = tf.split(x, splits), tf.split(y, splits)

# create dataset: the epochs are folded into the pipeline through repeat()
dataset0 = (
    tf.data.Dataset.from_tensor_slices((x, y))
    .batch(batch_size)
    .cache()
    .prefetch(1)
    .repeat(num_epochs)
)
dataset1 = (
    tf.data.Dataset.from_tensor_slices((tf.stack(x_list), tf.stack(y_list)))
    .cache()
    .prefetch(1)
    .repeat(num_epochs)
)

# model definition
model = tf.keras.Sequential([
    tf.keras.layers.Dense(20, activation='tanh', input_shape=(1,)),
    tf.keras.layers.Dense(1, activation='linear'),
])

# trainer initialization
trainer = Trainer(
    model=model,
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.MeanSquaredError(),
)

# first run to perform initializations
time0 = time.perf_counter()
trainer.fit0(
    dataset=dataset0,
    epochs=tf.constant(1, dtype=tf.int32),
    batches=tf.constant(num_batches, dtype=tf.int32),
    unroll=num_unroll)
time0 = time.perf_counter() - time0

time1 = time.perf_counter()
trainer.fit0(
    dataset=dataset1,
    epochs=tf.constant(1, dtype=tf.int32),
    batches=tf.constant(num_batches, dtype=tf.int32),
    unroll=num_unroll)
time1 = time.perf_counter() - time1

time2 = time.perf_counter()
trainer.fit1(inputs=x_list, targets=y_list, epochs=tf.constant(1, dtype=tf.int32))
time2 = time.perf_counter() - time2

print("first fit0 with dataset0 took {} seconds".format(time0))
print("first fit0 with dataset1 took {} seconds".format(time1))
print("first fit1 with tensorlist took {} seconds".format(time2))

# measure performances
time0 = time.perf_counter()
trainer.fit0(
    dataset=dataset0,
    epochs=tf.constant(num_epochs, dtype=tf.int32),
    batches=tf.constant(num_batches, dtype=tf.int32),
    unroll=num_unroll)
time0 = time.perf_counter() - time0

time1 = time.perf_counter()
trainer.fit0(
    dataset=dataset1,
    epochs=tf.constant(num_epochs, dtype=tf.int32),
    batches=tf.constant(num_batches, dtype=tf.int32),
    unroll=num_unroll)
time1 = time.perf_counter() - time1

time2 = time.perf_counter()
trainer.fit1(inputs=x_list, targets=y_list, epochs=tf.constant(num_epochs, dtype=tf.int32))
time2 = time.perf_counter() - time2

print("fit0 with dataset0 took {} seconds".format(time0))
print("fit0 with dataset1 took {} seconds".format(time1))
print("fit1 with tensorlist took {} seconds".format(time2))
Why does tf.data.Dataset have worst performances when wrapped with tf.function?
我不清楚底层到底发生了什么，但可以通过替换以下代码来解决：
# original dataset definitions (no caching, prefetching or repeat)
dataset0 = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size)
dataset1 = tf.data.Dataset.from_tensor_slices((tf.stack(x_list), tf.stack(y_list)))
替换为下面的新数据集定义：它把 epoch 的重复（repeat）也包含在数据集中，并使用了缓存（cache）和预取（prefetch）：
# dataset definitions with caching, prefetching and the epochs folded in via repeat()
dataset0 = (
    tf.data.Dataset.from_tensor_slices((x, y))
    .batch(batch_size)
    .cache()
    .prefetch(1)
    .repeat(num_epochs)
)
dataset1 = (
    tf.data.Dataset.from_tensor_slices((tf.stack(x_list), tf.stack(y_list)))
    .cache()
    .prefetch(1)
    .repeat(num_epochs)
)
有关更多信息，请参见here
我分别在使用和不使用 tf.function 的情况下测试了 fit0 和 fit1；做了上述修改之后，使用 tf.function 总能获得更好的性能，因此下面只展示使用 tf.function 的结果。
这里使用的 input_size 是原来的 10 倍。以下是测试结果：
第一次测试使用 1000 个批次，每批 100 个样本。
请注意，与 num_unroll=1 相比，num_unroll=5 提高了性能。设置 num_unroll > 5 没有带来任何进一步的改进。
input_size = 100000
batch_size = 100
num_epochs = 100
num_unroll = 5
first fit0 with dataset0 took 2.2224882999999993 seconds
first fit0 with dataset1 took 0.804360700000001 seconds
first fit1 with tensorlist took 88.2123332 seconds
fit0 with dataset0 took 35.27911590000001 seconds
fit0 with dataset1 took 20.370243099999982 seconds
fit1 with tensorlist took 23.66727979999999 seconds
第二次测试使用 1 个批次，包含 1000000 个样本。
input_size = 100000
batch_size = 100000
input_size = 1000000
batch_size = 1000000
num_epochs = 100
num_unroll = 1
first fit0 with dataset0 took 4.3616363 seconds
first fit0 with dataset1 took 0.7977632000000003 seconds
first fit1 with tensorlist took 0.7329889000000005 seconds
fit0 with dataset0 took 21.131495899999997 seconds
fit0 with dataset1 took 19.915148600000002 seconds
fit1 with tensorlist took 19.817472700000003 seconds
上述结果可以回答以下问题：
fit1 with tf.function got the best long-run performances.
Is it possible to achieve the same performance using tf.data.Dataset?
Why is it taking so much time for the initialization?
When using 100 batches the first run took 7.3524699211120605 seconds and this time increase by increasing the number of batches. I guess is because autograph is creating a bigger graph, unrolling the computation of the different batches. I do not see any opportunity for parallelization though, because each batch is dependent on the result of the previous one.
通过在 TensorBoard 中检查计算图结构很容易看出，对 fit1 函数使用 AutoGraph 会把循环完全展开，从而生成非常大的计算图。这带来了更好的性能，但构建计算图的时间很长，而且很可能占用过多内存，这使得它无法用于更复杂的问题。
但是，如上所示，使用 tf.data.Dataset 只需展开少量循环即可达到相同的性能，同时显著减小了计算图的规模。

相关问题更多 >

编程相关推荐

热门问题

热门文章