
2024-10-01 02:29:35 发布

您现在位置:Python中文网/ 问答频道 /正文



配置:Intel i3 cpu、tensorflow cpu 2.1

class Trainer:
    """Bundle a model, an optimizer and a loss into a hand-written training loop."""

    def __init__(self, model, optimizer, loss):
        """Keep references to the model, the optimizer and the loss callable."""
        self.model, self.optimizer, self.loss_function = model, optimizer, loss

    def train_step(self, inputs, targets):
        """Apply one gradient update computed on a single batch and return its loss."""
        # The forward pass runs under the tape so the loss can be differentiated.
        with tf.GradientTape() as tape:
            batch_loss = self.loss_function(targets, self.model(inputs))
        weights = self.model.trainable_variables
        grads = tape.gradient(batch_loss, weights)
        self.optimizer.apply_gradients(zip(grads, weights))
        return batch_loss

    def fit0(self, dataset, epochs):
        """Train for `epochs` passes over `dataset`, an iterable of (input, target) batches."""
        for epoch_idx in tf.range(epochs):
            for batch_inputs, batch_targets in dataset:
                self.train_step(batch_inputs, batch_targets)

    def fit1(self, inputs, targets, epochs):
        """Train for `epochs` passes over paired sequences of input and target batches."""
        for epoch_idx in tf.range(epochs):
            for batch_inputs, batch_targets in zip(inputs, targets):
                self.train_step(batch_inputs, batch_targets)




# problem size: q batches of batch_size samples each
input_size, batch_size = 10000, 100
q = input_size // batch_size

# random inputs (x) and outputs (y), one scalar per sample
x = tf.random.normal(shape=(input_size, 1), dtype=tf.float32)
y = tf.random.normal(shape=(input_size, 1), dtype=tf.float32)

# q equal split sizes, used to cut x and y into lists of batch tensors
splits = tf.fill(dims=[q], value=batch_size)
x_list, y_list = tf.split(x, splits), tf.split(y, splits)

# dataset0 batches the flat tensors; dataset1 slices the pre-stacked batches
dataset0 = tf.data.Dataset.from_tensor_slices((x, y))
dataset0 = dataset0.batch(batch_size)
dataset1 = tf.data.Dataset.from_tensor_slices(
    (tf.stack(x_list), tf.stack(y_list))
)

# model: one tanh hidden layer of 20 units followed by a linear output unit
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(units=20, activation='tanh', input_shape=(1,)))
model.add(tf.keras.layers.Dense(units=1, activation='linear'))

# trainer wiring the model to Adam and a mean-squared-error loss
trainer = Trainer(
    model=model,
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.MeanSquaredError(),
)

def _timed(fit, **kwargs):
    """Call ``fit(**kwargs)`` once and return the elapsed time in seconds."""
    # perf_counter is the monotonic, high-resolution clock intended for
    # measuring short durations; time.time() can jump when the system clock
    # is adjusted.
    start = time.perf_counter()
    fit(**kwargs)
    return time.perf_counter() - start


# first run to perform initializations (one epoch each)
time0 = _timed(trainer.fit0, dataset=dataset0, epochs=tf.constant(1, dtype=tf.int32))
time1 = _timed(trainer.fit0, dataset=dataset1, epochs=tf.constant(1, dtype=tf.int32))
time2 = _timed(trainer.fit1, inputs=x_list, targets=y_list, epochs=tf.constant(1, dtype=tf.int32))

print(f"first fit0 with dataset0 took {time0} seconds")
print(f"first fit0 with dataset1 took {time1} seconds")
print(f"first fit1 with tensorlist took {time2} seconds")

# measure performances (100 epochs each)
time0 = _timed(trainer.fit0, dataset=dataset0, epochs=tf.constant(100, dtype=tf.int32))
time1 = _timed(trainer.fit0, dataset=dataset1, epochs=tf.constant(100, dtype=tf.int32))
time2 = _timed(trainer.fit1, inputs=x_list, targets=y_list, epochs=tf.constant(100, dtype=tf.int32))

print(f"fit0 with dataset0 took {time0} seconds")
print(f"fit0 with dataset1 took {time1} seconds")
print(f"fit1 with tensorlist took {time2} seconds")



input_size = 10000
batch_size = 100

without @tf.function:
first fit0 with dataset0 took 0.9953532218933105 seconds
first fit0 with dataset1 took 0.07995295524597168 seconds
first fit1 with tensorlist took 0.05196571350097656 seconds
fit0 with dataset0 took 10.46957802772522 seconds
fit0 with dataset1 took 7.822799205780029 seconds
fit1 with tensorlist took 4.650130748748779 seconds

with @tf.function:
first fit0 with dataset0 took 1.4042332172393799 seconds
first fit0 with dataset1 took 0.46071624755859375 seconds
first fit1 with tensorlist took 7.3524699211120605 seconds
fit0 with dataset0 took 15.077088832855225 seconds
fit0 with dataset1 took 9.136569738388062 seconds
fit1 with tensorlist took 2.1366817951202393 seconds


input_size = 100000
batch_size = 100000

without @tf.function:
first fit0 with dataset0 took 1.1792669296264648 seconds
first fit0 with dataset1 took 0.027983427047729492 seconds
first fit1 with tensorlist took 0.020987749099731445 seconds
fit0 with dataset0 took 28.71895956993103 seconds
fit0 with dataset1 took 2.730872869491577 seconds
fit1 with tensorlist took 2.194814682006836 seconds

with @tf.function:
first fit0 with dataset0 took 1.5979444980621338 seconds
first fit0 with dataset1 took 0.4557182788848877 seconds
first fit1 with tensorlist took 0.3708038330078125 seconds
fit0 with dataset0 took 36.43854784965515 seconds
fit0 with dataset1 took 9.819332122802734 seconds
fit1 with tensorlist took 2.1136972904205322 seconds


  1. 为什么tf.data.Dataset在用tf.function包装时性能最差?
  2. dataset0和dataset1在功能上是等效的,这两者之间的区别是什么?为什么dataset1的性能优于dataset0?
  3. 使用tf.function的fit1长期性能最好。
    • 使用tf.data.Dataset是否可以达到相同的性能
    • 为什么初始化要花这么多时间?

Tags: self input size time tf with batch function
1楼 · 发布于 2024-10-01 02:29:35


配置:英特尔i3 cpu、tensorflow cpu 2.1

# fit using dataset
def fit0(self, dataset, epochs, batches, unroll=1):
    """Train on `dataset`, running `unroll` training steps per outer-loop iteration.

    A single iterator over `dataset` is consumed across all epochs, so the
    dataset has to yield `epochs * batches` (input, target) batches in total.

    Args:
        dataset: iterable of (input_batch, target_batch) pairs.
        epochs: number of passes over the data.
        batches: number of batches in one epoch.
        unroll: Python integer (not a tensor) that divides `batches`; number
            of consecutive training steps executed by the inner Python loop.
    """
    tf.assert_equal(tf.is_tensor(unroll), False, "unroll must be a python variable.")
    tf.assert_equal(tf.math.floormod(batches, unroll), tf.constant(0), "unroll must be a divisor of batches.")

    # Floor division keeps the iteration count an integer; the division is
    # exact because `unroll` was just asserted to divide `batches`.
    entries = epochs * batches // unroll
    it = iter(dataset)

    for entry in tf.range(entries):
        # this loop gets unrolled if unroll
        # is python variable, not a tensor.
        for _ in range(unroll):
            input_batch, target_batch = next(it)
            self.train_step(input_batch, target_batch)


# problem size and loop configuration
input_size, batch_size = 100000, 100
num_epochs, num_unroll = 100, 5

num_batches = input_size // batch_size

# random inputs (x) and outputs (y), one scalar per sample
x = tf.random.normal(shape=(input_size, 1), dtype=tf.float32)
y = tf.random.normal(shape=(input_size, 1), dtype=tf.float32)

# cut x and y into num_batches tensors of batch_size rows each
splits = tf.fill(dims=[num_batches], value=batch_size)
x_list = tf.split(x, splits)
y_list = tf.split(y, splits)

# datasets: cached, with a prefetch buffer of 1, repeated num_epochs times
dataset0 = (
    tf.data.Dataset.from_tensor_slices((x, y))
    .batch(batch_size)
    .cache()
    .prefetch(1)
    .repeat(num_epochs)
)
dataset1 = (
    tf.data.Dataset.from_tensor_slices((tf.stack(x_list), tf.stack(y_list)))
    .cache()
    .prefetch(1)
    .repeat(num_epochs)
)

# model: one tanh hidden layer of 20 units followed by a linear output unit
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(units=20, activation='tanh', input_shape=(1,)))
model.add(tf.keras.layers.Dense(units=1, activation='linear'))

# trainer wiring the model to Adam and a mean-squared-error loss
trainer = Trainer(
    model=model,
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.MeanSquaredError(),
)

# NOTE: the `trainer.fit0(dataset=..., ..., unroll=...)` call wrappers were
# missing around the keyword lines below; they are restored from the fit0
# signature (dataset, epochs, batches, unroll) and the printed labels.

# first run to perform initializations
time0 = time.perf_counter()
trainer.fit0(
    dataset=dataset0,
    epochs=tf.constant(1, dtype=tf.int32),
    batches=tf.constant(num_batches, dtype=tf.int32),
    unroll=num_unroll)
time0 = time.perf_counter() - time0

time1 = time.perf_counter()
trainer.fit0(
    dataset=dataset1,
    epochs=tf.constant(1, dtype=tf.int32),
    batches=tf.constant(num_batches, dtype=tf.int32),
    unroll=num_unroll)
time1 = time.perf_counter() - time1

time2 = time.perf_counter()
trainer.fit1(inputs=x_list, targets=y_list, epochs=tf.constant(1, dtype=tf.int32))
time2 = time.perf_counter() - time2

print("first fit0 with dataset0 took {} seconds".format(time0))
print("first fit0 with dataset1 took {} seconds".format(time1))
print("first fit1 with tensorlist took {} seconds".format(time2))

# measure performances
time0 = time.perf_counter()
trainer.fit0(
    dataset=dataset0,
    epochs=tf.constant(num_epochs, dtype=tf.int32),
    batches=tf.constant(num_batches, dtype=tf.int32),
    unroll=num_unroll)
time0 = time.perf_counter() - time0

time1 = time.perf_counter()
trainer.fit0(
    dataset=dataset1,
    epochs=tf.constant(num_epochs, dtype=tf.int32),
    batches=tf.constant(num_batches, dtype=tf.int32),
    unroll=num_unroll)
time1 = time.perf_counter() - time1

time2 = time.perf_counter()
trainer.fit1(inputs=x_list, targets=y_list, epochs=tf.constant(num_epochs, dtype=tf.int32))
time2 = time.perf_counter() - time2

print("fit0 with dataset0 took {} seconds".format(time0))
print("fit0 with dataset1 took {} seconds".format(time1))
print("fit1 with tensorlist took {} seconds".format(time2))
  1. Why does tf.data.Dataset have the worst performance when wrapped with tf.function?


# plain pipelines: dataset0 batches the flat tensors, dataset1 slices pre-stacked batches
dataset0 = (
    tf.data.Dataset.from_tensor_slices((x, y))
    .batch(batch_size)
)
dataset1 = tf.data.Dataset.from_tensor_slices(
    (tf.stack(x_list), tf.stack(y_list))
)


# same pipelines with caching, a prefetch buffer of 1 and num_epochs repetitions
dataset0 = (
    tf.data.Dataset.from_tensor_slices((x, y))
    .batch(batch_size)
    .cache()
    .prefetch(1)
    .repeat(num_epochs)
)
dataset1 = (
    tf.data.Dataset.from_tensor_slices((tf.stack(x_list), tf.stack(y_list)))
    .cache()
    .prefetch(1)
    .repeat(num_epochs)
)





input_size = 100000
batch_size = 100
num_epochs = 100
num_unroll = 5

first fit0 with dataset0 took 2.2224882999999993 seconds
first fit0 with dataset1 took 0.804360700000001 seconds
first fit1 with tensorlist took 88.2123332 seconds
fit0 with dataset0 took 35.27911590000001 seconds
fit0 with dataset1 took 20.370243099999982 seconds
fit1 with tensorlist took 23.66727979999999 seconds


input_size = 100000
batch_size = 100000

input_size = 1000000
batch_size = 1000000
num_epochs = 100
num_unroll = 1

first fit0 with dataset0 took 4.3616363 seconds
first fit0 with dataset1 took 0.7977632000000003 seconds
first fit1 with tensorlist took 0.7329889000000005 seconds
fit0 with dataset0 took 21.131495899999997 seconds
fit0 with dataset1 took 19.915148600000002 seconds
fit1 with tensorlist took 19.817472700000003 seconds


  1. fit1 with tf.function has the best long-run performance.
    • Is it possible to achieve the same performance using tf.data.Dataset?
    • Why does the initialization take so much time?
      When using 100 batches, the first run took 7.3524699211120605 seconds, and this time grows as the number of batches increases. I guess this is because autograph is creating a bigger graph by unrolling the computation over the different batches. I do not see any opportunity for parallelization though, because each batch depends on the result of the previous one.

通过在TensorBoard上检查图结构很容易看出,对fit1函数使用autograph会完全展开循环,从而创建非常大的图。这提供了更好的性能,但构建图的时间很长,并且很可能占用过多内存,这使得它无法用于更复杂的问题。

相关问题 更多 >