我是生物信息学领域的新手,也是Python的初学者。你知道吗
我正在使用一个名为Ruffus的框架来运行一个管道,使文件在工具中导航以获得有趣的输出。你知道吗
在我的管道的一部分(第三步)中,我需要做一些变通,使Ruffus考虑到我的文件:使用的工具可以随意更改输出名称。你知道吗
我的问题是:我必须运行管道的第一次,使它运行到第三步,它说,每个文件都完成了。当我重新跑的时候,它从第三个一直跑到最后。 我认为问题是我使用的glob是关于一个特定模式的,glob函数应该在脚本开始时运行,但是我要查找的文件(.*avinput)还没有创建。一旦我重新运行它,它们就是,然后这个过程就可以继续了。但这不是真的。你知道吗
我怎样才能“强制”这样的var等到某个特定的时刻运行,或者在到达某个特定的步骤时重新运行它?你知道吗
最好的
from ruffus import *
import sys
import subprocess
import os
import re
import glob
import time
import termcolor
#====================================
#
#Input area
#
#
#====================================
#Create a method to get all the files from the folder
input_files = (["/genetics/ongoing_work/pipeline/78_14_TR_p_BWA_mem_PIC_conv_ord_mdup_group_recal_rawvariants_Joint_INDEL_recalibrated_extracted_filter_appl.vcf",
"/genetics/ongoing_work/pipeline/78_14_TR_p_BWA_mem_PIC_conv_ord_mdup_group_recal_rawvariants_Joint_SNP_recalibrated_extracted_filter_appl.vcf"])
# DIRECTORIES
outdir = '/genetics/ongoing_work/pipeline'
annovar_db='/genetics/apps/annovar/humandb/'
#SOFTWARES
gatk = '/genetics/apps/gatk3.8/GenomeAnalysisTK.jar'
convert2annovar='/genetics/apps/annovar/convert2annovar.pl'
annotate_variation='/genetics/apps/annovar/annotate_variation.pl'
table_annovar='/genetics/apps/annovar/table_annovar.pl'
#FILES
reference_genome = '/genetics/reference/ucsc.hg19.fasta'
# ---------- 1st step -----------
# Filter Low Quality Genotypes
# Filter Low Quality Genotypes
# GATK
#
# ---------------------------
@transform(input_files, formatter("([^/]+).vcf$"), os.path.join(outdir, "{1[0]}_flqg.vcf"))
def LowQualityGenotype(input_file, output_file):
print(termcolor.colored(' ---------- 1st step ----------- \n \
Filter Low Quality Genotypes \n \
Filter Low Quality Genotypes \n \
GATK \n \
\n \
--------------------------- \n','red'))
subprocess.run("java -jar {tool} \
-T VariantFiltration \
-R {ref} \
-V {i_file} \
-G_filter 'GQ < 20.0' \
-G_filterName 'LowQG' \
-o {o_file}".format(
o_file=output_file,
i_file=input_file,
tool=gatk,
ref=reference_genome
), shell=True)
# ---------- 2nd step -----------
# Convert to Avinput
# ANNOVAR
# UPDATE TO FIX
# ---------------------------
@subdivide(LowQualityGenotype, formatter("([^/]+).vcf$"), os.path.join(outdir, "{1[0]}.avinput"))
def ConvertAvinput(input_file, output_files):
subprocess.run("perl {tool} \
-format vcf4 -allsample\
-withzyg \
-includeinfo \
{i_file} \
-outfile {o_file}".format(
o_file=output_files,
i_file=input_file,
tool=convert2annovar,
), shell=True)
#modifying outputs because 2 gives 4 with the samples names in it
file_list_avinput = glob.glob(outdir+'/*.avinput')
print('Impression du glob')
print(file_list_avinput)
#use of glob to get all the files created
for i in file_list_avinput:
if i:
os.rename(i, re.sub('(.avinput)(?:\.)(.{5})(?:.*)',"_sample_\\2\\1",i))
time.sleep(3)
# ---------- 3rd step -----------
# Filter 1000G
# ANNOVAR
#
# ---------------------------
pre1000g_files = glob.glob(outdir + '/*.avinput')
print('LIGNES POUR PRE1000G')
print(pre1000g_files)
@follows(ConvertAvinput)
@subdivide(pre1000g_files, formatter("([^/]+).avinput"),
os.path.join(outdir, "{1[0]}_1000G_filtered"))
def Filter1000G(input_file, output_files):
print('LIGNES POUR PRE1000G')
print(pre1000g_files)
subprocess.run("perl {tool} \
-filter \
-dbtype 1000g2015aug_eur \
-buildver hg19 \
-maf {maf_freq} \
-out {o_file} \
{i_file} \
{annodb} \
-outfile {o_file}".format(
o_file=output_files,
i_file=input_file,
maf_freq=0.03,
annodb=annovar_db,
tool=annotate_variation
), shell=True)
os.rename(output_files+'.hg19_EUR.sites.2015_08_filtered',output_files[0:-8]+'filtered')
os.rename(output_files + '.hg19_EUR.sites.2015_08_dropped', output_files[0:-7]+'_dropped')
# ---------- 4th step -----------
# Filter InHouse
# ANNOVAR
#
# ---------------------------
@transform(Filter1000G, formatter("([^/]+)_filtered"),
os.path.join(outdir, "{1[0]}_Norge_filtered"))
def FilterInHouse(input_file, output_file):
subprocess.run("perl {tool} \
-filter \
-dbtype generic \
-genericdbfile nimblegen_v3_and_v2_combined_jul2015.singleallelic.avinput.AC_gt_5 \
-buildver hg19 \
-out {o_file} \
{i_file} \
{annodb} \
-outfile {o_file}".format(
o_file=output_file,
i_file=input_file,
maf_freq=0.005,
annodb=annovar_db,
tool=annotate_variation
), shell=True)
os.rename(output_file+'.hg19_generic_filtered',output_file[0:-8]+'filtered')
os.rename(output_file + '.hg19_generic_dropped', output_file[0:-7]+'_dropped')
pipeline_run(FilterInHouse)
目前没有回答
相关问题 更多 >
编程相关推荐