Why does my function get good values for the LSTM but not for the GRU?

Posted 2024-09-23 04:19:54


I am trying to write a program that compares the word-prediction performance of an LSTM against a GRU. I use the same parameters for both, yet while I get good perplexity values for the LSTM, the values I get for the GRU are absolutely terrible. I recently tried to debug the training function, since it originally only worked with the LSTM model and not with the GRU model. As I said, the two models should reach similar values, but at the moment the LSTM starts at a perplexity of around 150 and then converges to a reasonable value, while the GRU starts at random values in the thousands and does not converge at all.

I am new to everything RNN, LSTM and GRU related, so please forgive me if I am missing something obvious. Any help would be appreciated.
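For reference, the perplexity values I quote are just the exponential of the cross-entropy loss. A minimal sketch of the calculation, with made-up logits and targets rather than my training data:

import math
import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()

# Made-up logits and targets, just to show the perplexity calculation
logits = torch.randn(8, 100)           # (batch_size * seq_length, vocab_size)
targets = torch.randint(0, 100, (8,))  # flattened target word indices

loss = criterion(logits, targets)      # mean cross-entropy over the batch
perplexity = math.exp(loss.item())     # roughly vocab_size for random logits
print(perplexity)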

I am using the following two models:

class LSTM_Model(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0):
        super(LSTM_Model, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout = dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, x, hidden_state):
        x = self.embed(x)
        # hidden_state here is the (h_0, c_0) tuple expected by nn.LSTM
        out, (hidden_state, cell_state) = self.lstm(x, hidden_state)
        out = out.reshape(out.size(0)*out.size(1), out.size(2)) # Reshape output to (batch_size*sequence_length, hidden_size)
        out = self.fc(out) 
        return out, (hidden_state, cell_state)


class GRU_Model(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0):
        super(GRU_Model, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, dropout = dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, x, hidden_state):
        x = self.embed(x)
        # hidden_state here is the single h_0 tensor expected by nn.GRU (no cell state)
        out, hidden_state = self.gru(x, hidden_state)
        out = out.reshape(out.size(0)*out.size(1), out.size(2)) # Reshape output to (batch_size*sequence_length, hidden_size)
        out = self.fc(out) 
        return out, hidden_state
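Both models are called the same way. Here is a minimal sketch of how the shapes flow through them, with placeholder hyperparameters rather than my real settings:

import torch

# Placeholder hyperparameters, for illustration only (not my real settings)
vocab_size, embed_size, hidden_size, num_layers = 10000, 128, 256, 2
batch_size, seq_length = 20, 30
device = torch.device('cpu')

lstm_model = LSTM_Model(vocab_size, embed_size, hidden_size, num_layers).to(device)
gru_model = GRU_Model(vocab_size, embed_size, hidden_size, num_layers).to(device)

x = torch.randint(0, vocab_size, (batch_size, seq_length)).to(device)

# The LSTM expects a (h_0, c_0) tuple, the GRU a single h_0 tensor
h0 = torch.zeros(num_layers, batch_size, hidden_size).to(device)
c0 = torch.zeros(num_layers, batch_size, hidden_size).to(device)

lstm_out, lstm_states = lstm_model(x, (h0, c0))
gru_out, gru_states = gru_model(x, h0)

print(lstm_out.shape)  # torch.Size([600, 10000]) -> (batch_size * seq_length, vocab_size)
print(gru_out.shape)   # torch.Size([600, 10000])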

The training function:

def run_model(model, epochs=epochs, learning_rate=learning_rate, clip=clip, momentum=momentum, LSTM=True, GRU=False, Dropout=False):
  # Define loss criterion and optimizer
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
  lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=step_size, gamma=decay_rate)

  train_perplexity, test_perplexity, valid_perplexity = [], [], []

  # Train the model
  for e in range(epochs):
      # Initialize the hidden (and cell) states at the start of each epoch (see init_states below)
      train_states=init_states(LSTM, GRU, num_layers, batch_size, hidden_size)
      test_states=init_states(LSTM, GRU, num_layers, batch_size, hidden_size)
      valid_states=init_states(LSTM, GRU, num_layers, batch_size, hidden_size)

      
      # RUN TRAINING SET #
      model.train()
      for i in range(0, ids.size(1) - seq_length, seq_length):
          # Set train_inputs and train_targets
          train_inputs = ids[:, i:i+seq_length].to(device)
          train_targets = ids[:, (i+1):(i+1)+seq_length].to(device)
          
          # Forward pass
          model.zero_grad() 
          if(LSTM==True):
              # Detach the (h, c) tuple so gradients do not flow back through previous batches
              train_states = [state.detach() for state in train_states]
          if(GRU==True):
              # Detach the single hidden tensor (is .data the right way to do this for the GRU?)
              train_states = train_states.data
          train_outputs, train_states = model(train_inputs, train_states)
          train_loss = criterion(train_outputs, train_targets.reshape(-1))
          
          # Backward and optimize
          train_loss.backward()
          clip_grad_norm_(model.parameters(), clip)
          optimizer.step()

      lr_scheduler.step() 
      model.eval()
      with torch.no_grad(): 
          # test and validation loops omitted here to keep the post short

      model.train() # reset to train mode after iterating through validation data
      train_perplexity.append(math.exp(train_loss.item()))
      test_perplexity.append(np.exp(np.mean(test_losses)))
      valid_perplexity.append(np.exp(np.mean(valid_losses)))

      print('Epoch ' + str(e+1) + '/' + str(epochs) + ': ')
      print('Train Perplexity - ' + str(train_perplexity[e]))
      print('Test Perplexity - ' + str(test_perplexity[e]))
      print('Validation Perplexity - ' + str(valid_perplexity[e]))
      print("----------------------------------------------------")
  return train_perplexity, test_perplexity, valid_perplexity
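The only place the training loop treats the two models differently is the detach step, since the LSTM state is a (h, c) tuple while the GRU state is a single tensor. A minimal sketch of what I think the two branches do, using stand-alone tensors just for illustration:

import torch

# Stand-in tensors for illustration; shapes correspond to (num_layers, batch_size, hidden_size)
lstm_states = (torch.randn(2, 20, 256, requires_grad=True),
               torch.randn(2, 20, 256, requires_grad=True))
gru_states = torch.randn(2, 20, 256, requires_grad=True)

# LSTM branch: detach each tensor in the (h, c) tuple
lstm_states = [state.detach() for state in lstm_states]

# GRU branch: .data also drops the autograd history, though .detach()
# is the recommended equivalent for a single tensor
gru_states = gru_states.detach()

print(all(not s.requires_grad for s in lstm_states), gru_states.requires_grad)  # True False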

The hidden state initialization:

def init_states(LSTM, GRU, num_layers=num_layers, batch_size=batch_size, hidden_size=hidden_size):
    # States are drawn uniformly from [r1, r2] (module-level globals), not zero-initialized
    if (LSTM==True):
        return (torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device),
                torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device))
    if (GRU==True):
        return torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device)
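As a quick sanity check, this is what init_states gives back for each flag combination (this snippet assumes my globals r1, r2, num_layers, batch_size, hidden_size and device are already defined, so it is not standalone):

lstm_states = init_states(LSTM=True, GRU=False)
gru_states = init_states(LSTM=False, GRU=True)

print(type(lstm_states))     # <class 'tuple'>         -> (h_0, c_0) for nn.LSTM
print(lstm_states[0].shape)  # torch.Size([num_layers, batch_size, hidden_size])
print(type(gru_states))      # <class 'torch.Tensor'>  -> h_0 for nn.GRU
print(gru_states.shape)      # torch.Size([num_layers, batch_size, hidden_size])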
