擅长:python、mysql、java
<pre><code>sequences = open('fastafile.txt').read().split('>') # Creates a list of sequences.
needle = 'CTTTGTA'
occurrences = {}
for sequence in sequences:
occ = sequence.count(needle) # Returns the number of times the substring occurs in the string sequence.
if occ: # If greater than 0, create an entry in our dictionary. The sequence being the key and the count the value.
occurrences[sequence] = occ
output = []
sorted_occurrences = sorted(occurrences.items(), key=operator.itemgetter(1)) # Sort the dictionary by length, so sequences with the highest occurrence of the needle appear at the top.
for seq, occ_count in sorted_occurrences.iteritems():
gene_name, sequence = seq.split('\n')
formatted_line = '{gene_name} - {occ_count}'.format(gene_name=gene_name, occ_count=str(occ_count)) # Format the lines the way you want.
output.append(formatted_line)
with open('occurences.txt') as o_f:
o_f.write('\n'.join(output))
</code></pre>