我有一个很大的 dataframe,在这里我使用 t 检验计算每一行的 p 值。我现在想要绘制一个只包含 p 值最低的前十行的箱线图。
# Build one long-format table of gene counts covering every candidate gene of
# every lead SNP, annotated with a per-gene CSF_N-vs-PB_N t-test p-value, then
# hand the data to the plotting helper.
LeadSNPs = pd.unique(candidate_genes.LeadSNP)  # rs3184504 rs531612
# Was `pd.DataFrame.empty`, which is the property object on the class rather
# than an empty frame.
gene_counts_per_snp_df = pd.DataFrame()
save_path = "../figures/SM5_gene_counts/"
per_snp_frames = []
for LeadSNP in LeadSNPs:
    print(LeadSNP)
    # Select with a boolean mask: feeding np.where() positions to label-based
    # indexing is only correct when the index happens to be 0..n-1.
    snp_mask = candidate_genes.LeadSNP == LeadSNP
    candidate_genes_per_SNP = candidate_genes.Target[snp_mask]
    region = pd.unique(candidate_genes.Region[snp_mask])

    per_gene_frames = []
    for gene_index, target_gene in candidate_genes_per_SNP.items():
        PRE = candidate_genes['sumOfWeightedWeights (PRE)'][gene_index]
        print(target_gene)
        ensembl_id = get_ensembl_id(target_gene)
        print(ensembl_id)
        if pd.isnull(ensembl_id):
            continue

        gene_counts_df = get_gene_counts_df(ensembl_id)
        if gene_counts_df.shape[0] == 0:
            print('no ensemble id found in gene counts!')
            continue

        gene_counts_df = gene_counts_df.melt(
            id_vars=["Gene"], var_name='compartment', value_name='count'
        )
        gene_counts_df = reshape_gene_counts_df(gene_counts_df)
        gene_counts_df['target_gene'] = target_gene
        gene_counts_df['PRE'] = PRE
        gene_counts_df['pval_ftest'] = np.nan

        # Two-sample t-test between the CSF_N and PB_N counts of this gene.
        # NOTE: the column keeps its historical name 'pval_ftest' although the
        # value comes from stats.ttest_ind.
        is_gene = gene_counts_df['target_gene'] == target_gene
        csf_counts = gene_counts_df.loc[
            is_gene & (gene_counts_df['compartment'] == 'CSF_N'), 'count'
        ]
        pb_counts = gene_counts_df.loc[
            is_gene & (gene_counts_df['compartment'] == 'PB_N'), 'count'
        ]
        # The original stored this in `pval1` and then wrote the undefined
        # name `pval_ftest` into the frame.
        _, pval_ftest = stats.ttest_ind(csf_counts, pb_counts)
        gene_counts_df.loc[
            is_gene & gene_counts_df['compartment'].isin(['CSF_N', 'PB_N']),
            'pval_ftest',
        ] = pval_ftest

        per_gene_frames.append(gene_counts_df)

    if not per_gene_frames:
        # No usable gene for this SNP: skip it instead of re-labelling and
        # re-appending the previous SNP's frame.
        continue

    gene_counts_per_snp_df = pd.concat(per_gene_frames)
    gene_counts_per_snp_df['LeadSNP'] = LeadSNP
    per_snp_frames.append(gene_counts_per_snp_df)

# Concatenate once at the end; fall back to an empty frame when nothing was
# collected so `all_gene_counts` is always defined.
all_gene_counts = pd.concat(per_snp_frames) if per_snp_frames else pd.DataFrame()
all_gene_counts = all_gene_counts.reset_index()
# NOTE(review): `all_gene_counts_per_comp` is not defined in this snippet --
# presumably it is derived from `all_gene_counts` elsewhere; confirm.
plot_top_genes_snps(all_gene_counts_per_comp, 'target_gene')
绘图代码如下所示:
def plot_top_genes_snps(all_gene_counts_per_comp, x_label, *, top_n=None,
                        pval_col='pval_ftest'):
    """Draw grouped box plots of gene counts, one group per ``x_label`` value.

    Boxes are split by the ``compartment`` column (hue) in the order given by
    the module-level ``comp_order``; legend entries are relabelled through the
    module-level ``comp_dict``. Outliers are not drawn (``showfliers=False``).
    The figure is shown with ``plt.show()``; nothing is returned.

    Args:
        all_gene_counts_per_comp: Long-format DataFrame with the columns
            ``count``, ``compartment`` and ``x_label`` (plus ``pval_col``
            when ``top_n`` is given).
        x_label: Name of the column whose values form the x-axis categories.
        top_n: If given, plot only the ``top_n`` categories of ``x_label``
            with the smallest p-value, ordered from lowest to highest
            p-value. ``None`` (the default) plots every category.
        pval_col: Name of the column holding the p-values used by ``top_n``.
    """
    order = None
    if top_n is not None:
        # Collapse to one p-value per category (min() skips NaN), drop
        # categories without any p-value, then keep the smallest ``top_n``.
        pval_per_group = (
            all_gene_counts_per_comp.groupby(x_label)[pval_col].min().dropna()
        )
        order = list(pval_per_group.nsmallest(top_n).index)
        all_gene_counts_per_comp = all_gene_counts_per_comp[
            all_gene_counts_per_comp[x_label].isin(order)
        ]

    sns.set(style="white")
    sns.set_context("poster")
    palette = sns.color_palette("colorblind", 10)
    fig, ax = plt.subplots(figsize=(25, 4))
    sns.boxplot(
        ax=ax,
        y='count',
        x=x_label,
        data=all_gene_counts_per_comp,
        hue='compartment',
        order=order,
        showfliers=False,
        palette=palette,
        hue_order=comp_order,
    )
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

    # Replace the raw compartment codes in the legend with display names.
    handles, _ = ax.get_legend_handles_labels()
    current_legends = [comp_dict[handle.get_label()] for handle in handles]
    ax.legend(handles, current_legends, bbox_to_anchor=(1, 1), loc=2)

    ax.yaxis.grid()
    # NOTE(review): called after the plot is drawn, so this presumably only
    # affects figures created later -- confirm that is intended.
    sns.set(font_scale=2)
    plt.xlabel('')
    plt.ylabel('Gene count')
    # plt.savefig(save_path+str(LeadSNP)+'.pdf', bbox_inches='tight')
    plt.show()
作为背景说明:我想要 p 值最低的前十个 target_gene。然而,这是我得到的图:
如何仅提取十个最低的 p 值并绘制箱线图?
更新:数据框如下所示,该表针对不同的 SNP 重复出现:
文本格式的dataframe
Gene compartment count patient_id target_gene PRE \
1 ENSG00000157870 CSF_N 0 1 FAM213B 7.5
11 ENSG00000157870 CSF_N 0 2 FAM213B 7.5
21 ENSG00000157870 CSF_N 0 3 FAM213B 7.5
31 ENSG00000157870 CSF_N 0 4 FAM213B 7.5
41 ENSG00000157870 CSF_N 0 5 FAM213B 7.5
.. ... ... ... ... ... ...
21 ENSG00000182866 CSF_N 18 3 LCK 2.0
31 ENSG00000182866 CSF_N 45 4 LCK 2.0
41 ENSG00000182866 CSF_N 0 5 LCK 2.0
51 ENSG00000182866 CSF_N 9 6 LCK 2.0
61 ENSG00000182866 CSF_N 0 7 LCK 2.0
pval_ftest LeadSNP
1 0.222523 rs6670198
11 0.222523 rs6670198
21 0.222523 rs6670198
31 0.222523 rs6670198
41 0.222523 rs6670198
按 "pval_ftest" 升序排序后取前 10 行(例如 `df.nsmallest(10, "pval_ftest")`),将为您提供具有最小 "pval_ftest" 值的前 10 行。也许这个玩具示例会让我们更清楚地了解如何对数据帧进行排序并选择其子集。
相关问题 更多 >
编程相关推荐