具有许多权重的大型数据集会导致非常慢的训练过程

import pandas as pd
import tensorflow as tf
import numpy


def _tile_factor(n_rest, n_group):
    """Return how many times the selected group is repeated column-wise.

    n_rest is the number of background columns in the split, n_group the
    number of cell lines the user entered. The result is never below 1:
    a factor of 0 would tile the positive class away to nothing, and a
    single-cell-line group would otherwise divide by zero.
    """
    if n_rest < n_group:
        return 1
    return max(1, (n_rest - n_group) // max(1, n_group // 2))


# Load the data sheet of all cell lines: the first two CSV columns form the
# row index, the header row names the columns, and '---' marks missing values.
input_data = pd.read_csv(
    'C:/Users/lalalalalalala.csv', index_col=[0, 1], header=0, na_values='---')
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
matrix_data = input_data.values

# The user names the cell lines (columns) that form the cluster of interest.
group1 = input(
    "Please enter cell lines that makes up the your cluster of interest with spaces in between(case sensitive):")
# split() without arguments ignores repeated/trailing spaces, which would
# otherwise yield empty column names and a KeyError in the lookup below.
group_split1 = group1.split()
if not group_split1:
    raise ValueError("no cell lines were entered")

# Labels: selected cluster = 1, every other column = 0.
# Each group is cut in half (first half training, second half test); with an
# odd column count the extra column goes to the training half.
split = len(group_split1)
g1_cut = (split + 1) // 2
g1_train = input_data.loc[:, group_split1[:g1_cut]]
g1_test = input_data.loc[:, group_split1[g1_cut:]]

selected = set(group_split1)  # set lookup instead of scanning the list per column
g2 = input_data.loc[:, [col for col in input_data.columns if col not in selected]]
split2 = g2.shape[1]
# The cut point depends on the parity of g2's own column count, not on the
# parity of the selected group.
g2_cut = (split2 + 1) // 2
g2_train = g2.iloc[:, :g2_cut]
g2_test = g2.iloc[:, g2_cut:]

# Amplify the selected group by repeating its columns when it is much smaller
# than the background group.
amp1 = _tile_factor(g2_train.shape[1], split)
g1_train = pd.DataFrame(numpy.tile(g1_train.values, (1, amp1)), index=g2_train.index)
amp2 = _tile_factor(g2_test.shape[1], split)
g1_test = pd.DataFrame(numpy.tile(g1_test.values, (1, amp2)), index=g2_test.index)

# Both frames of each pair carry the same row index, so a plain column-wise
# concat lines them up; the `join_axes` argument was removed in pandas 1.0.
# After the transpose each row is one sample and each column one feature.
regroup_train = numpy.transpose(pd.concat([g1_train, g2_train], axis=1).values)
regroup_test = numpy.transpose(pd.concat([g1_test, g2_test], axis=1).values)

# Create labels: the leading rows come from the (tiled) selected group.
split3 = g1_train.shape[1]
labels_train = numpy.zeros(shape=[len(regroup_train), 1])
labels_train[0:split3] = 1
split4 = g1_test.shape[1]
labels_test = numpy.zeros(shape=[len(regroup_test), 1])
labels_test[0:split4] = 1

# Replace NaN with 0 and cast once to float32 (the placeholder dtype) so the
# conversion is not repeated on every training step.
regroup_train = numpy.nan_to_num(regroup_train).astype(numpy.float32)
regroup_test = numpy.nan_to_num(regroup_test).astype(numpy.float32)
labels_train = numpy.nan_to_num(labels_train).astype(numpy.float32)
labels_test = numpy.nan_to_num(labels_test).astype(numpy.float32)

#######################################################################################################################
#####################################################NEURAL NETWORK####################################################
#######################################################################################################################
# Number of gradient-descent steps.
trainingtimes = 1000
# Feature count is taken from the data instead of a hard-coded 54781.
n_features = regroup_train.shape[1]

# Model: a single sigmoid unit over all input features.
x = tf.placeholder(tf.float32, [None, n_features])
w = tf.Variable(tf.zeros([n_features, 1]))
b = tf.Variable(tf.zeros([1]))
y = tf.nn.sigmoid(tf.matmul(x, w) + b)

# Placeholder for the true labels.
ytt = tf.placeholder(tf.float32, [None, 1])

# Mean squared error between predictions and labels.
mse = tf.reduce_mean(tf.losses.mean_squared_error(labels=ytt, predictions=y))

# Training step.
train_step = tf.train.GradientDescentOptimizer(learning_rate=0.3).minimize(mse)

sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

# The same full batch is fed on every step, so build the feed dict once.
train_feed = {x: regroup_train, ytt: labels_train}
for i in range(trainingtimes):
    sess.run(train_step, feed_dict=train_feed)
    if i % 100 == 0:
        print(sess.run(mse, feed_dict=train_feed))

1条回答

网友

1楼 · 发布于 2024-10-02 00:26:21

这里有几个关键问题。你正在尝试定义一个单层神经网络，这听起来很适合这个问题。但是你的隐藏层比它应该的要大得多。尝试使用更小的权重维度，比如128、256、512这样的数字（不一定要是2的幂）。

而且，您的输入维度相当高。我知道有人在研究一个非常类似的癌症基因表达问题，大约有60000个基因表达和10000个样本。她用主成分分析法（PCA）降低了数据的维数，同时保持了约90%的方差（她用不同的值进行了实验，发现这是最佳的）。

这改善了结果。神经网络容易过拟合，因此PCA降维是有利的。在她的实验中，单层全连接网络的表现也优于逻辑回归和XGBoost。

她正在处理这个问题的其他一些事情，可能也适用于你：

多任务学习可以提高学习效果。她最初有4个不同的神经网络（给定相同的数据，产生4个输出），当她把它们组合成一个具有4个损失函数的神经网络时，所有4个任务的结果都得到了改善。
代替PCA，你可以使用自动编码器作为另一种降维技术。完全可以将自动编码器连接到这个网络，并使用组合的损失函数对其进行联合训练。不过，我还没有实际尝试过这个方法，所以我只能说，理论上我预期它能改善结果。PCA方法测试起来更快，所以我会从那里开始。

相关问题更多 >

编程相关推荐

热门问题

热门文章