GradientTape 根据损失函数是否被 tf.function 修饰而给出不同的梯度

# Per-class label weights with a leading axis of size 1, i.e. shape (1, 2),
# so they broadcast against a batch of one-hot rows.
weights = tf.constant([1.0, 0.1])[tf.newaxis, ...]


def customloss1(y_true, y_pred, sample_weight=None):
    """Return the mean categorical cross-entropy against class-weighted labels.

    This is the plain-Python (non-``tf.function``) variant.

    Args:
        y_true: Class indices; cast to ``tf.uint8`` and one-hot encoded with
            depth 2.
        y_pred: Predictions passed unchanged to
            ``tf.keras.losses.categorical_crossentropy``.
        sample_weight: Accepted for signature compatibility; not used.

    Returns:
        The mean of the per-sample cross-entropy values.
    """
    one_hot_labels = tf.one_hot(tf.cast(y_true, tf.uint8), 2)
    # Scale each one-hot row by the module-level class weights.
    weighted_labels = tf.multiply(weights, one_hot_labels)
    per_sample_loss = tf.keras.losses.categorical_crossentropy(
        weighted_labels, y_pred
    )
    return tf.reduce_mean(per_sample_loss)


@tf.function
def customloss2(y_true, y_pred, sample_weight=None):
    """Return the mean categorical cross-entropy against class-weighted labels.

    Same computation as ``customloss1``, but wrapped in ``tf.function``.

    Args:
        y_true: Class indices; cast to ``tf.uint8`` and one-hot encoded with
            depth 2.
        y_pred: Predictions passed unchanged to
            ``tf.keras.losses.categorical_crossentropy``.
        sample_weight: Accepted for signature compatibility; not used.

    Returns:
        The mean of the per-sample cross-entropy values.
    """
    one_hot_labels = tf.one_hot(tf.cast(y_true, tf.uint8), 2)
    # Scale each one-hot row by the module-level class weights.
    weighted_labels = tf.multiply(weights, one_hot_labels)
    per_sample_loss = tf.keras.losses.categorical_crossentropy(
        weighted_labels, y_pred
    )
    return tf.reduce_mean(per_sample_loss)

def get_gradients1(x, y):
    """Compute gradients of both custom losses w.r.t. the model's variables.

    This is the plain-Python (non-``tf.function``) variant. Each loss gets
    its own forward pass through ``model`` under its own ``GradientTape``.

    Args:
        x: Model input.
        y: Labels handed to ``customloss1`` and ``customloss2``.

    Returns:
        A pair ``(gradients1, gradients2)``: gradients of ``customloss1`` and
        of ``customloss2`` with respect to ``model.trainable_variables``.
    """
    with tf.GradientTape() as plain_tape:
        plain_pred = model(x)
        plain_loss = customloss1(y, plain_pred)
    with tf.GradientTape() as traced_tape:
        traced_pred = model(x)
        traced_loss = customloss2(y, traced_pred)
    grads_plain = plain_tape.gradient(plain_loss, model.trainable_variables)
    grads_traced = traced_tape.gradient(traced_loss, model.trainable_variables)
    return grads_plain, grads_traced


@tf.function
def get_gradients2(x, y):
    """Compute gradients of both custom losses w.r.t. the model's variables.

    Same computation as ``get_gradients1``, but wrapped in ``tf.function``.

    Args:
        x: Model input.
        y: Labels handed to ``customloss1`` and ``customloss2``.

    Returns:
        A pair ``(gradients1, gradients2)``: gradients of ``customloss1`` and
        of ``customloss2`` with respect to ``model.trainable_variables``.
    """
    with tf.GradientTape() as plain_tape:
        plain_pred = model(x)
        plain_loss = customloss1(y, plain_pred)
    with tf.GradientTape() as traced_tape:
        traced_pred = model(x)
        traced_loss = customloss2(y, traced_pred)
    grads_plain = plain_tape.gradient(plain_loss, model.trainable_variables)
    grads_traced = traced_tape.gradient(traced_loss, model.trainable_variables)
    return grads_plain, grads_traced

([<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[ 0.11473544, -0.11473544]], dtype=float32)>], [<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[ 0.11473544, -0.11473544]], dtype=float32)>])

([<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[ 0.02213785, -0.5065186 ]], dtype=float32)>], [<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[ 0.11473544, -0.11473544]], dtype=float32)>])

@tf.function
def customloss2(y_true, y_pred, sample_weight=None):
    """Debug variant of ``customloss2`` that prints its intermediate tensors.

    Prints the Python types of the weighted labels and of ``y_pred``, then
    their values, before returning the mean categorical cross-entropy.

    Args:
        y_true: Class indices; cast to ``tf.uint8`` and one-hot encoded with
            depth 2.
        y_pred: Predictions passed unchanged to
            ``tf.keras.losses.categorical_crossentropy``.
        sample_weight: Accepted for signature compatibility; not used.

    Returns:
        The mean of the per-sample cross-entropy values.
    """
    one_hot_labels = tf.one_hot(tf.cast(y_true, tf.uint8), 2)
    weighted_labels = tf.multiply(weights, one_hot_labels)
    # Diagnostics: tensor types first, then the actual values.
    tf.print('customloss2', type(weighted_labels), type(y_pred))
    tf.print('y_true_scale', '\n', weighted_labels)
    tf.print('y_pred', '\n', y_pred)
    per_sample_loss = tf.keras.losses.categorical_crossentropy(
        weighted_labels, y_pred
    )
    return tf.reduce_mean(per_sample_loss)

customloss1 <type 'EagerTensor'> <type 'EagerTensor'> y_true_scale [[1 0] [0 0.1]] y_pred [[0.510775387 0.489224613] [0.529191136 0.470808864]] customloss2 <class 'tensorflow.python.framework.ops.Tensor'> <class 'tensorflow.python.framework.ops.Tensor'> y_true_scale [[1 0] [0 0.1]] y_pred [[0.510775387 0.489224613] [0.529191136 0.470808864]]

customloss1 <class 'tensorflow.python.framework.ops.Tensor'> <class 'tensorflow.python.framework.ops.Tensor'> y_true_scale [[1 0] [0 0.1]] y_pred [[0.510775387 0.489224613] [0.529191136 0.470808864]] customloss2 <class 'tensorflow.python.framework.ops.Tensor'> <class 'tensorflow.python.framework.ops.Tensor'> y_true_scale [[1 0] [0 0.1]] y_pred [[0.510775387 0.489224613] [0.529191136 0.470808864]]

2条回答

网友

1楼 · 编辑于 2024-09-26 18:02:32

原来这是一个bug，我提出了它

网友

2楼 · 编辑于 2024-09-26 18:02:32

这是一个有点复杂的问题，但它有一个解释。问题出在函数 `tf.keras.backend.categorical_crossentropy` 中，该函数的行为会因运行在 Eager 模式还是图（tf.function）模式下而不同。

该函数考虑三种可能的情况。第一种是传入 from_logits=True，此时它直接调用 `nn.softmax_cross_entropy_with_logits_v2`：

# Logits path: `output` is handed over as the logits and `target` as the
# labels, directly to nn.softmax_cross_entropy_with_logits_v2.
if from_logits:
  return nn.softmax_cross_entropy_with_logits_v2(
      labels=target, logits=output, axis=axis)

如果您传入 from_logits=False（这在 Keras 中最常见，因为分类模型的输出层通常是 softmax），那么它会考虑两种可能性。第一种是：如果给定的输出值来自 softmax 操作，那么它可以直接取该操作的输入（即 logits）并调用 `nn.softmax_cross_entropy_with_logits_v2`；这比用 softmax 之后的值直接计算交叉熵更可取，因为它可以避免结果“饱和”。然而，这只能在图模式下完成，因为 Eager 模式的张量不会记录生成它的操作，更不用说该操作的输入了。

# Eager tensors and variables are excluded here; the branch below inspects
# `output.op` to find out which operation produced the tensor.
if not isinstance(output, (ops.EagerTensor, variables_module.Variable)):
  # NOTE(review): presumably walks back through Identity ops to reach the
  # tensor's real producer -- confirm against the helper's definition.
  output = _backtrack_identity(output)
  if output.op.type == 'Softmax':
    # When softmax activation function is used for output operation, we
    # use logits from the softmax function directly to compute loss in order
    # to prevent collapsing zero when training.
    # See b/117284466
    assert len(output.op.inputs) == 1
    # Replace the probabilities with the Softmax op's single input.
    output = output.op.inputs[0]
    return nn.softmax_cross_entropy_with_logits_v2(
        labels=target, logits=output, axis=axis)

最后一种情况是：您传入了 from_logits=False，并且要么处于 Eager 模式，要么给定的输出张量并非直接来自 softmax 操作。在这种情况下，唯一的选择就是直接用 softmax 之后的概率值来计算交叉熵：

# Fallback path: `output` is treated as probabilities rather than logits.
# scale preds so that the class probas of each sample sum to 1
output = output / math_ops.reduce_sum(output, axis, True)
# Compute cross entropy from probabilities.
epsilon_ = _constant_to_tensor(epsilon(), output.dtype.base_dtype)
# Clip into [epsilon_, 1 - epsilon_] before the log is taken below.
output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
# Cross entropy: -sum(target * log(output)) reduced along `axis`.
return -math_ops.reduce_sum(target * math_ops.log(output), axis)

问题是，尽管这些方法在数学上是等价的，可以计算交叉熵，但它们的精度并不相同。当logit较小时，它们几乎相同，但如果它们变大，则可能会出现很大的差异。下面是一个简单的测试：

import tensorflow as tf

@tf.function
def test_keras_xent(y, p, from_logits=False, mask_op=False):
    """Evaluate ``tf.keras.backend.categorical_crossentropy`` on logits ``p``.

    Args:
        y: Targets, passed as the first argument to the Keras function.
        p: Logits. When ``from_logits`` is false they are converted to
            probabilities with ``tf.nn.softmax`` before the call.
        from_logits: Forwarded to the Keras function; also decides whether
            the softmax is applied here first.
        mask_op: When true, a zero constant is added to ``p`` so the tensor
            handed to Keras is the result of an addition instead of the
            softmax itself.

    Returns:
        Whatever ``tf.keras.backend.categorical_crossentropy`` returns for
        the prepared inputs.
    """
    # p is always logits
    if not from_logits:
        # Compute softmax if not using logits
        p = tf.nn.softmax(p)
    if mask_op:
        # A dummy addition prevents Keras from detecting that
        # the value comes from a softmax operation
        p = p + tf.constant(0, p.dtype)
    return tf.keras.backend.categorical_crossentropy(y, p, from_logits=from_logits)

# Test
# Each group below draws fresh logits from a wider range and evaluates the
# loss three ways: from logits, from softmax output, and from softmax output
# hidden behind a dummy addition (mask_op=True). The trailing comments are
# the values printed in the original run.
tf.random.set_seed(0)
y = tf.constant([1., 0., 0., 0.])

# Logits in [0, 1)
p = tf.random.uniform([4], minval=0, maxval=1)
tf.print(test_keras_xent(y, p, from_logits=True))
# 1.50469065
tf.print(test_keras_xent(y, p, from_logits=False, mask_op=False))
# 1.50469065
tf.print(test_keras_xent(y, p, from_logits=False, mask_op=True))
# 1.50469065

# Logits in [0, 10)
p = tf.random.uniform([4], minval=0, maxval=10)
tf.print(test_keras_xent(y, p, from_logits=True))
# 3.47569656
tf.print(test_keras_xent(y, p, from_logits=False, mask_op=False))
# 3.47569656
tf.print(test_keras_xent(y, p, from_logits=False, mask_op=True))
# 3.47569656

# Logits in [0, 100)
p = tf.random.uniform([4], minval=0, maxval=100)
tf.print(test_keras_xent(y, p, from_logits=True))
# 68.0106506
tf.print(test_keras_xent(y, p, from_logits=False, mask_op=False))
# 68.0106506
tf.print(test_keras_xent(y, p, from_logits=False, mask_op=True))
# 16.1180954
# Only the masked variant diverges for large logits.
# NOTE(review): 16.118... looks like -log(1e-7), i.e. the probability was
# clipped to the Keras epsilon -- confirm the epsilon value in use.

以你的例子来说：

# numpy is needed for np.random.choice below; without this import the
# snippet fails with NameError.
import numpy as np
import tensorflow as tf

# Reproduces the three loss evaluations on a tiny two-class model.
# Relies on test_keras_xent from the preceding snippet.
tf.random.set_seed(42)
x = tf.random.normal((2, 1))
# NOTE: NumPy's RNG is not seeded, so the labels (and therefore the printed
# losses) can differ from the values recorded in the comments below.
y = tf.constant(np.random.choice([0, 1], 2))
y1h = tf.one_hot(y, 2, dtype=x.dtype)
model = tf.keras.Sequential([
    # Linear activation because we want the logits for testing
    tf.keras.layers.Dense(2, use_bias=False, activation='linear', input_shape=[1,])
])
p = model(x)
# Loss computed directly from logits.
tf.print(test_keras_xent(y1h, p, from_logits=True))
# [0.603375256 0.964639068]
# Loss from softmax output.
tf.print(test_keras_xent(y1h, p, from_logits=False, mask_op=False))
# [0.603375256 0.964639068]
# Loss from softmax output hidden behind a dummy addition.
tf.print(test_keras_xent(y1h, p, from_logits=False, mask_op=True))
# [0.603375256 0.964638948]

这里的结果几乎相同，但您可以看到第二个值有一个小的差异。这反过来会对计算的梯度产生影响（可能是放大的），这当然也是“等效”数学表达式，但具有不同的精度特性

相关问题更多 >

编程相关推荐

热门问题

热门文章