I've been banging my head against this for a while now and I can't figure out what, if anything, I'm doing wrong in these RNN implementations. To spare you the forward pass: I can tell you that both implementations compute identical outputs, so the forward pass is correct. The problem is in the backward pass.
Here is my Python backward code. It follows the style of karpathy's neuraltalk fairly closely, but not exactly:
def backward(self, cache, target, c=leastsquares_cost, dc=leastsquares_dcost):
    '''
    cache is from the forward pass
    c is a cost function
    dc is a function used as dc(output, target) which gives the gradient dc/doutput
    '''
    XdotW = cache['XdotW']  # num_time_steps x hidden_size
    Hin = cache['Hin']      # num_time_steps x hidden_size
    T = Hin.shape[0]
    Hout = cache['Hout']
    Xin = cache['Xin']
    Xout = cache['Xout']
    Oin = cache['Oin']      # num_time_steps x output_size
    Oout = cache['Oout']

    dcdOin = dc(Oout, target)  # num_time_steps x num_outputs; these are dc/dO_j
    dcdWho = np.dot(Hout.transpose(), dcdOin)  # sum of outer products over all time steps
    # the bias term is added at the end with coefficient 1, hence the dot product is just the sum
    dcdbho = np.sum(dcdOin, axis=0, keepdims=True)  # sums over all time steps
    dcdHout = np.dot(dcdOin, self.Who.transpose())  # dcdHout_ij is the dot product of dcdOin and the i'th row of Who; this covers only the output path

    # now go back in time
    dcdHin = np.zeros(dcdHout.shape)
    # for the last timestep we can ignore the error coming from the next timestep.
    # self.df is the derivative of the activation function (here, tanh):
    dcdHin[T-1] = self.df(Hin[T-1]) * dcdHout[T-1]  # dcdHout is already correct for t = T-1
    for t in reversed(xrange(T-1)):
        # we need to add to dcdHout[t] the error from the next timestep
        dcdHout[t] += np.dot(dcdHin[t], self.Whh.transpose())
        # now we have the correct form for dcdHout[t]
        dcdHin[t] = self.df(Hin[t]) * dcdHout[t]

    # now we've gone through all t, and we can continue
    dcdWhh = np.zeros(self.Whh.shape)
    for t in range(T-1):  # skip the last step because dcdHin[t+1] doesn't exist for t = T-1
        dcdWhh += np.outer(Hout[t], dcdHin[t+1])
    # and we can do the bias as well
    dcdbhh = np.sum(dcdHin, axis=0, keepdims=True)
    # now we need to go back to the embeddings
    dcdWxh = np.dot(Xout.transpose(), dcdHin)

    return {'dcdOin': dcdOin, 'dcdWxh': dcdWxh, 'dcdWhh': dcdWhh,
            'dcdWho': dcdWho, 'dcdbhh': dcdbhh, 'dcdbho': dcdbho,
            'cost': c(Oout, target)}
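Whenever analytic gradients disagree with autodiff, a finite-difference check on a tiny, self-contained RNN localizes which gradient is wrong before comparing frameworks. Below is a minimal sketch of such a check for the Whh gradient of a tanh RNN with linear outputs and least-squares cost; it is independent of the class above (all shapes, seeds, and names are made up for illustration), and its BPTT loop feeds the already-computed `dcdHin[t+1]` into `dcdHout[t]`:

```python
import numpy as np

# Toy tanh RNN: Hin[t] = X[t] @ Wxh + Hout[t-1] @ Whh, Hout = tanh(Hin),
# outputs Oout = Hout @ Who, cost = 0.5 * ||Oout - target||^2.
np.random.seed(0)
T, nin, nh, nout = 4, 3, 5, 2
X = np.random.randn(T, nin)
target = np.random.randn(T, nout)
Wxh = np.random.randn(nin, nh) * 0.1
Whh = np.random.randn(nh, nh) * 0.1
Who = np.random.randn(nh, nout) * 0.1

def forward(Whh):
    Hout = np.zeros((T, nh))
    h = np.zeros(nh)
    for t in range(T):
        h = np.tanh(X[t].dot(Wxh) + h.dot(Whh))
        Hout[t] = h
    Oout = Hout.dot(Who)
    cost = 0.5 * np.sum((Oout - target) ** 2)
    return Hout, Oout, cost

# Analytic BPTT gradient of the cost w.r.t. Whh.
# tanh'(Hin[t]) is expressed as 1 - Hout[t]**2.
Hout, Oout, _ = forward(Whh)
dcdOin = Oout - target
dcdHout = dcdOin.dot(Who.T)
dcdHin = np.zeros_like(dcdHout)
dcdHin[T - 1] = (1 - Hout[T - 1] ** 2) * dcdHout[T - 1]
for t in reversed(range(T - 1)):
    dcdHout[t] += dcdHin[t + 1].dot(Whh.T)   # error from the NEXT timestep
    dcdHin[t] = (1 - Hout[t] ** 2) * dcdHout[t]
dcdWhh = np.zeros_like(Whh)
for t in range(T - 1):
    dcdWhh += np.outer(Hout[t], dcdHin[t + 1])

# Numerical gradient by central differences, entry by entry.
eps = 1e-5
num = np.zeros_like(Whh)
for i in range(nh):
    for j in range(nh):
        Wp, Wm = Whh.copy(), Whh.copy()
        Wp[i, j] += eps
        Wm[i, j] -= eps
        num[i, j] = (forward(Wp)[2] - forward(Wm)[2]) / (2 * eps)

# Relative error between analytic and numerical gradients (should be tiny).
rel_err = np.abs(dcdWhh - num).max() / (np.abs(num).max() + 1e-12)
```

Running the same kind of check against each returned gradient (dcdWho, dcdWxh, ...) tells you exactly which part of the backward pass diverges from the numerical truth, independent of what Theano says.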
Here is the Theano code (mostly copied from another implementation I found online). I initialized its weights to my pure-python RNN's random weights so that everything is the same:
[The Theano graph definition was lost here in formatting.]

Now here's the crazy thing. If I run the following:
fn = theano.function([h0, u, t, lr],
                     [error, y, h, gW, gW_in, gW_out, gb_h, gb_o],
                     updates={W: W - lr * gW,
                              W_in: W_in - lr * gW_in,
                              W_out: W_out - lr * gW_out})

er, yout, hout, gWhh, gWhx, gWho, gbh, gbo = fn(numpy.zeros((n,)), numpy.eye(5), numpy.eye(5), .01)
cache = rnn.forward(np.eye(5))
bc = rnn.backward(cache, np.eye(5))

print "sum difference between gWho (theano) and bc['dcdWho'] (pure python):"
print np.sum(gWho - bc['dcdWho'])
print "sum difference between gWhh (theano) and bc['dcdWhh'] (pure python):"
print np.sum(gWhh - bc['dcdWhh'])
print "sum difference between gWhx (theano) and bc['dcdWxh'] (pure python):"
print np.sum(gWhx - bc['dcdWxh'])
print "sum difference between the last row of gWhx (theano) and the last row of bc['dcdWxh'] (pure python):"
print np.sum(gWhx[-1] - bc['dcdWxh'][-1])
I get the following output:
sum difference between gWho (theano) and bc['dcdWho'] (pure python):
-4.59268040265e-16
sum difference between gWhh (theano) and bc['dcdWhh'] (pure python):
0.120527063611
sum difference between gWhx (theano) and bc['dcdWxh'] (pure python):
-0.332613468652
sum difference between the last row of gWhx (theano) and the last row of bc['dcdWxh'] (pure python):
4.33680868994e-18
So, I get the derivative of the weight matrix between the hidden layer and the outputs right, but not the derivatives of the hidden->hidden or input->hidden weight matrices. But the crazy thing is that I ALWAYS get the last row of the input->hidden weight matrix right. This makes no sense to me. I have no idea what's happening here. Note that the last row of the input->hidden weight matrix does not correspond to the last timestep or anything like that (this would be explained, for example, by me computing the derivatives correctly for the last timestep but failing to propagate back through time correctly). dcdWxh is the sum over all timesteps of the per-timestep dcdWxh — so how can I get one of its rows correct and none of the others???
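To see exactly which rows agree instead of guessing from one global sum, it helps to compare the matrices row by row with absolute differences. A small illustrative sketch (the stand-in matrices `G_theano` and `G_python` are made up; they just mimic the "only the last row matches" pattern described above):

```python
import numpy as np

# Two stand-in gradient matrices that agree only in their last row,
# purely for illustration of the row-wise diagnostic.
rng = np.random.default_rng(1)
G_theano = rng.standard_normal((5, 4))
G_python = G_theano + rng.standard_normal((5, 4)) * 0.1
G_python[-1] = G_theano[-1]          # pretend only the last row matches

# Per-row sum of absolute differences localizes the disagreement.
row_err = np.abs(G_theano - G_python).sum(axis=1)
matching_rows = np.where(row_err < 1e-12)[0]
```

Applied to `gWhx` vs. `bc['dcdWxh']`, this kind of per-row report tells you whether really only one row matches or whether others merely cancel in the signed sum.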
Can anyone help? I'm all out of ideas here.
You should compute the sum of pointwise absolute values of the difference between the two matrices. The plain sum could be close to zero simply because of the particular learning task (do you emulate the zero function? :), whichever it is.

The last row presumably implements the weights on a constant-one neuron, i.e. the bias, so you — seemingly — always get the bias right (but do check the sum of absolute values as well).

The row-major vs. column-major notation of the matrices is also confusing: for instance, gWhx reads like weights going from "hidden to x", the opposite of "x to hidden".

I'd rather post this as a comment, but I lack the reputation to do so. Sorry!
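The point about signed sums is worth spelling out: positive and negative entrywise errors can cancel, so a near-zero signed sum proves nothing. A tiny sketch with made-up matrices:

```python
import numpy as np

# Two matrices with the same entries in different positions:
# the signed difference sum cancels to zero, the absolute sum does not.
A = np.array([[1.0, 2.0], [3.0, 4.0]])
B = np.array([[2.0, 1.0], [4.0, 3.0]])

signed = np.sum(A - B)            # cancels to 0.0 -- looks like a perfect match
absolute = np.sum(np.abs(A - B))  # 4.0 -- reveals the mismatch

# A scale-free variant is safer still when gradient magnitudes vary:
rel = np.abs(A - B).max() / (np.abs(B).max() + 1e-12)
```

So a tiny `np.sum(gWho - bc['dcdWho'])` is only convincing together with a tiny `np.sum(np.abs(...))`.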