已编辑
你好
我想创建一个python程序,将FCV file
、window
和increment value
作为输入,并在每个窗口中为所有样本(列)返回一个图,其中包含SNP密度下面的示例图像。
我希望采取的步骤:
我可以使用R/Bioconductor pachages或Biopython来完成,但我需要一个基本的python解决方案。 请帮忙! 谢谢
以下是我尝试过的:VCFfile
#!/usr/bin/env python
# libraries
import argparse
import io
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
## Read VCF file
# Read vcf file without headers
def read_vcf(path):
with open(path, 'r') as f:
lines = [l for l in f if not l.startswith('##')]
return pd.read_csv(
io.StringIO(''.join(lines)),
dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
'QUAL': str, 'FILTER': str, 'INFO': str},
sep='\t'
).rename(columns={'#CHROM': 'CHROM'})
df = read_vcf('VCFFile.vcf')
# cleaning data
## format CHROM column
df['CHROM'] = df['CHROM'].str.replace('chr0','').astype(int)
## select useful columns: all columns except not useful ones
df = df[df.columns.difference(['ID', 'INFO', 'REF', 'ALT', 'QUAL', 'FILTER', 'FORMAT'])]
# Get alleles for each sample
def get_alleles(df):
for i in df.columns.difference(['CHROM', 'POS']):
suffix= str(i) + '_genotype'
df[suffix] = df[str(i)].astype(str).str[0:3]
#df.drop(str(i), axis=1)
#df = df[df.columns.drop(str(i))]
# apply the function
get_alleles(df)
# remove original genotype columns
filter_col = [col for col in df if col.endswith('genotype')]
filter_col.append('CHROM')
filter_col.append('POS')
df = df[filter_col]
# replace genotypes: 1/1 by 1, else by 0
list_values = ['0/0', './.', './0', '0/.', '1/0', '0/1']
df = df.replace(to_replace =list_values, value ='NaN')
df = df.replace(to_replace ='1/1', value =1)
现在我想绘制每个样本的SNP密度:
# plot SNP density for each sample ==========================================
# get data for each sample
# create a function to select columns
def select_sample(col):
x = df[['POS', str(col)]]
#remove NaN
x = x[x[str(col)] ==1]
return x
sample_1 = select_sample("A_genotype")
sample_2 = select_sample("B_genotype")
sample_3 = select_sample("C_genotype")
sample_4 = select_sample("D_genotype")
sample_5 = select_sample("E_genotype")
sample_6 = select_sample("F_genotype")
sample_7 = select_sample("I_genotype")
sample_8 = select_sample("P_genotype")
我无法添加incrementValue来获得如下图。图1–使用1000000的窗口大小和100000的增量绘制多态性密度图
def plot_windowed_variant_density(pos, window_size, incrementValue=None, title, ax):
# setup windows
bins = np.arange(0, pos.max(), window_size)
print(bins)
#incrementValue
#incrementValue = ???????????
# use window midpoints as x coordinate
x = (bins[1:] + bins[:-1])/2
# compute variant density in each window
count, _ = np.histogram(sample['POS'], bins=bins)
y= count
# plot
sns.despine(ax=ax, offset=10)
ax.plot(x, y)
ax.set_xlabel('Chromosome position (Mb)')
ax.set_ylabel('Count')
if title:
ax.set_title(title)
#====================================================
fig, ax = plt.subplots(figsize=(12, 3))
# Apply the function:
for i in [sample_1, sample_2, sample_3, sample_4, sample_5, sample_6, sample_7, sample_8]:
plot_windowed_variant_density(i.POS, 1000000,'test', ax)
如果将图形的ax添加到函数参数,则可以在同一图形上创建覆盖
相关问题 更多 >
编程相关推荐