如何加速 Python 的 FASTA 子采样程序？

#!/usr/bin/env python3
"""Write a random selection of FASTA records to a new file.

Usage: script.py INPUT_FASTA OUTPUT_FASTA

The number of records to select is read interactively from standard input.
"""
import random
import sys

# Ask how many records to draw before any file is opened.
wanted = int(input("Enter number of random sequences to select:\n"))

# First argument is the file to read, second is the file to write.
source = open(sys.argv[1], "r")
target = open(sys.argv[2], "w")

# Lines starting with ">" go to `headers`; every other line goes to
# `sequences`. The two lists are later indexed with the same position.
headers = []
sequences = []
for raw in source:
    text = raw.rstrip()
    (headers if raw.startswith(">") else sequences).append(text)

# Report how many header lines were found.
print("There are " + str(len(headers)) + " in the input file")

# Draw without replacement: each chosen position is removed from both lists
# so it cannot be picked again on a later iteration.
for _ in range(wanted):
    pick = random.randint(0, len(headers) - 1)
    print(headers.pop(pick), file=target)
    print(sequences.pop(pick), file=target)

source.close()
target.close()
input("Done.")

1条回答

网友

1楼 · 发布于 2024-09-30 01:34:46

使用 random.sample（这是评论中建议的做法）和字典后，脚本变得更快。以下是最终的脚本：

#!/usr/bin/env python3
"""Write a random subsample of the records in a FASTA file to a new file.

Usage: script.py INPUT_FASTA OUTPUT_FASTA

The number of records to select is read interactively from standard input.
"""
import random
import sys


def read_fasta(lines):
    """Parse FASTA text into a dictionary of header -> list of sequence lines.

    Args:
        lines: An iterable of text lines (for example an open file).

    Returns:
        A dict mapping each header line (including the leading ">", trailing
        whitespace removed) to the list of its sequence lines, in file order.
        If a header occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line appears before the first header.
    """
    records = {}
    current = None
    for raw in lines:
        text = raw.rstrip()
        if not text:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # data and must not overwrite the current record's sequence.
            continue
        if text.startswith(">"):
            # Start a fresh record; later lines are appended to this list.
            current = records[text] = []
        elif current is None:
            raise ValueError("sequence data found before the first '>' header")
        else:
            # Keep every line so wrapped (multi-line) sequences stay intact.
            current.append(text)
    return records


def choose_headers(records, count):
    """Return `count` distinct headers drawn at random from `records`.

    Raises:
        ValueError: If `count` is negative or exceeds the number of records.
    """
    if not 0 <= count <= len(records):
        raise ValueError(
            "cannot select {} sequences from {} available".format(
                count, len(records)
            )
        )
    # Sampling the dictionary keys guarantees no record is emitted twice.
    return random.sample(list(records), count)


def write_records(records, headers, out):
    """Write the records named by `headers` to `out` in FASTA layout."""
    for header in headers:
        out.write(header + "\n")
        out.writelines(line + "\n" for line in records[header])


def main():
    """Prompt for a sample size, then subsample argv[1] into argv[2]."""
    # How many random sequences do you want?
    num = int(input("Enter number of random sequences to select:\n"))

    with open(sys.argv[1], "r") as infile:
        records = read_fasta(infile)

    # Print total number of sequences in the original file
    print("There are " + str(len(records)) + " in the input file")

    # Validate and sample before opening the output file, so a bad request
    # does not truncate an existing output file.
    chosen = choose_headers(records, num)

    with open(sys.argv[2], "w") as outfile:
        write_records(records, chosen, outfile)

    input("Done.")


if __name__ == "__main__":
    main()

相关问题更多 >

编程相关推荐

热门问题

热门文章