利用深度学习从序列预测子序列

#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import print_function import sys import json import pandas as pd from keras.models import Sequential from keras.engine.training import slice_X from keras.layers.core import Activation, RepeatVector, Dense from keras.layers import recurrent, TimeDistributed import numpy as np from six.moves import range class CharacterTable(object): ''' Given a set of characters: + Encode them to a one hot integer representation + Decode the one hot integer representation to their character output + Decode a vector of probabilties to their character output ''' def __init__(self, chars, maxlen): self.chars = sorted(set(chars)) self.char_indices = dict((c, i) for i, c in enumerate(self.chars)) self.indices_char = dict((i, c) for i, c in enumerate(self.chars)) self.maxlen = maxlen def encode(self, C, maxlen=None): maxlen = maxlen if maxlen else self.maxlen X = np.zeros((maxlen, len(self.chars))) for i, c in enumerate(C): X[i, self.char_indices[c]] = 1 return X def decode(self, X, calc_argmax=True): if calc_argmax: X = X.argmax(axis=-1) return ''.join(self.indices_char[x] for x in X) class colors: ok = '\033[92m' fail = '\033[91m' close = '\033[0m' INVERT = True HIDDEN_SIZE = 128 BATCH_SIZE = 64 LAYERS = 3 # Try replacing GRU, or SimpleRNN RNN = recurrent.LSTM def main(): """ Epitope_core = answers Antigen = questions """ epi_antigen_df = pd.io.parsers.read_table("http://dpaste.com/2PZ9WH6.txt") antigens = epi_antigen_df["Antigen"].tolist() epitopes = epi_antigen_df["Epitope Core"].tolist() if INVERT: antigens = [ x[::-1] for x in antigens] allchars = "".join(antigens+epitopes) allchars = list(set(allchars)) aa_chars = "".join(allchars) sys.stderr.write(aa_chars + "\n") max_antigen_len = len(max(antigens, key=len)) max_epitope_len = len(max(epitopes, key=len)) X = np.zeros((len(antigens),max_antigen_len, len(aa_chars)),dtype=np.bool) y = np.zeros((len(epitopes),max_epitope_len, len(aa_chars)),dtype=np.bool) ctable = 
CharacterTable(aa_chars, max_antigen_len) sys.stderr.write("Begin vectorization\n") for i, antigen in enumerate(antigens): X[i] = ctable.encode(antigen, maxlen=max_antigen_len) for i, epitope in enumerate(epitopes): y[i] = ctable.encode(epitope, maxlen=max_epitope_len) # Shuffle (X, y) in unison as the later parts of X will almost all be larger digits indices = np.arange(len(y)) np.random.shuffle(indices) X = X[indices] y = y[indices] # Explicitly set apart 10% for validation data that we never train over split_at = len(X) - len(X) / 10 (X_train, X_val) = (slice_X(X, 0, split_at), slice_X(X, split_at)) (y_train, y_val) = (y[:split_at], y[split_at:]) sys.stderr.write("Build model\n") model = Sequential() # "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE # note: in a situation where your input sequences have a variable length, # use input_shape=(None, nb_feature). model.add(RNN(HIDDEN_SIZE, input_shape=(max_antigen_len, len(aa_chars)))) # For the decoder's input, we repeat the encoded input for each time step model.add(RepeatVector(max_epitope_len)) # The decoder RNN could be multiple layers stacked or a single layer for _ in range(LAYERS): model.add(RNN(HIDDEN_SIZE, return_sequences=True)) # For each of step of the output sequence, decide which character should be chosen model.add(TimeDistributed(Dense(len(aa_chars)))) model.add(Activation('softmax')) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) # Train the model each generation and show predictions against the validation dataset for iteration in range(1, 200): print() print('-' * 50) print('Iteration', iteration) model.fit(X_train, y_train, batch_size=BATCH_SIZE, nb_epoch=5, validation_data=(X_val, y_val)) ### # Select 10 samples from the validation set at random so we can visualize errors for i in range(10): ind = np.random.randint(0, len(X_val)) rowX, rowy = X_val[np.array([ind])], y_val[np.array([ind])] preds = model.predict_classes(rowX, 
verbose=0) q = ctable.decode(rowX[0]) correct = ctable.decode(rowy[0]) guess = ctable.decode(preds[0], calc_argmax=False) # print('Q', q[::-1] if INVERT else q) print('T', correct) print(colors.ok + '☑' + colors.close if correct == guess else colors.fail + '☒' + colors.close, guess) print('---') if __name__ == '__main__': main()

1条回答

网友

1楼 · 发布于 2024-09-19 23:29:07

RNN、LSTM或GRU能否用于预测上述子序列？

是的，这些都可以用。LSTM 和 GRU 都是 RNN 的具体类型；如果你所说的 RNN 是指全连接 RNN（fully-connected RNN），那么由于梯度消失问题（参考文献 1、2），它已经不再常用了。由于数据集中的样本相对较少，GRU 可能比 LSTM 更合适，因为它的结构更简单。

如何提高代码的准确性？

你提到训练误差和验证误差都很差。一般来说，这可能是由以下几个因素之一造成的：

学习率太低（这里应该不是问题，因为你使用的是 Adam，一种逐参数的自适应学习率算法）
模型对于数据来说太简单了（根本不是问题，因为您有一个非常复杂的模型和一个小的数据集）
出现了梯度消失（这很可能就是问题所在，因为你使用的是 3 层 RNN）。尝试把层数减少到 1 层（一般来说，最好先让一个简单的模型跑通，然后再逐步增加复杂度），同时考虑做超参数搜索（例如，128 维的隐藏状态可能太大，试试 30？）。

另一种选择是：由于表位是输入的子串，可以预测表位在抗原序列中的起始和结束索引（可以按抗原序列的长度归一化），而不是逐个字符地预测子串。这样就变成了一个包含两个任务的回归问题。例如，如果抗原是 FSKIAGLTVT（长度为 10 个字母），它的表位是 KIAGL（第 3 到第 7 位，从 1 开始计数），那么输入就是 FSKIAGLTVT，输出是 0.3（第一个任务）和 0.7（第二个任务）。

或者，如果你可以让所有抗原的长度相同（去掉数据集中抗原较短的样本，和/或在确定表位不在末端附近的前提下截掉长抗原的末端），就可以把它定义为一个包含两个任务（起点和终点）、类别数等于序列长度的分类问题：对抗原的每个位置，分别预测表位在该位置开始和结束的概率。

如何修改我的代码以使它运行得更快？

减少层数会显著提高代码的运行速度。此外，由于 GRU 的结构更简单，它会比 LSTM 更快。不过，这两种循环网络都比卷积网络慢。

如果你对合作感兴趣，可以给我发一封电子邮件（地址在我的个人资料中）。

相关问题更多 >

编程相关推荐

热门问题

热门文章