<p>编辑1:完全重写,以更好地匹配问题描述。</p>
<p>假设这三个序列彼此各不相同。</p>
<p>如果我理解正确的话,你在第三个序列中没有看到匹配,是因为那里实际上确实没有匹配。不过,前两个序列中是有匹配项的,运行下面的代码就能看到它们。</p>
<p>'''</p>
<pre><code>import re
import string
# Read the whole input file in text mode so that the str operations below
# work on both Python 2 and Python 3 (binary mode yields bytes on Python 3,
# and bytes.split('>') raises TypeError).
with open('dna.txt', 'r') as f:
    data = f.read()

# Split the file into records on '>' and separate each record's first line
# (the header) from the rest (the sequence body).
data = [x.split('\n', 1) for x in data.split('>')]
# Keep only records that actually have a body, and strip all whitespace
# (including line breaks) out of the body so it is one contiguous string.
data = [(x[0], ''.join(x[1].split())) for x in data if len(x) == 2]

# Compiled patterns: `start` matches the ATG codon, `end` matches any of the
# three stop codons.
start, end = [re.compile(x) for x in 'ATG TAG|TGA|TAA'.split()]

# Translation table that maps every base to its complement.
# string.maketrans only exists on Python 2; str.maketrans is the Python 3
# equivalent, so prefer it when available and fall back otherwise.
revtrans = (getattr(str, 'maketrans', None) or string.maketrans)("ATGC", "TACG")
def get_longest(starts, ends):
    """Return the longest in-frame span from a start to its nearest end.

    Simple brute force for now.  For every start position, the first entry
    of ``ends`` that lies after the start and is a multiple of three
    positions away from it closes that start's span.  With ``ends`` in
    ascending order (as the callers provide it) that is the nearest
    in-frame end.

    Args:
        starts: Start positions.
        ends: End positions, expected in ascending order.  The list is not
            modified.

    Returns:
        A ``(length, start_position)`` tuple for the longest span found,
        where ``length`` includes the three characters at the end position.
        Ties on length are resolved in favour of the larger start position.
        Returns ``(0, 0)`` when no start has a matching end.
    """
    candidates = []
    for start in starts:
        for end in ends:
            if end > start and (end - start) % 3 == 0:
                # +3 so that the span covers the end codon itself.
                candidates.append((end + 3 - start, start))
                # The first hit is the one that counts; no need to scan on.
                break
    return max(candidates) if candidates else (0, 0)
def get_orfs(dna):
    """Find the longest start-to-stop match of one record on either strand.

    ``dna`` is a ``(header, sequence)`` pair.  The sequence is searched as
    given ('forward') and as its reverse complement ('reverse'), using the
    module-level ``start`` / ``end`` patterns and ``get_longest``.

    Returns a tuple ``(length, header, direction, match)`` where ``match``
    is the matched text taken from the strand it was found on.  When both
    strands give the same ``(length, position)`` the tuple comparison falls
    through to the sequence text and then the direction label.
    """
    header, forward_seq = dna
    # Reverse the sequence and complement every base.
    reverse_seq = forward_seq[::-1].translate(revtrans)

    def codon_positions(pattern, seq):
        # Offsets of every (non-overlapping) match of pattern in seq.
        return [match.start() for match in pattern.finditer(seq)]

    candidates = []
    for strand, label in ((forward_seq, 'forward'), (reverse_seq, 'reverse')):
        starts = codon_positions(start, strand)
        stops = codon_positions(end, strand)
        candidates.append((get_longest(starts, stops), strand, label))

    (length, index), best_strand, direction = max(candidates)
    return length, header, direction, best_strand[index:index + length]
# Compute the best match for every record in the file
all_orfs = [get_orfs(record) for record in data]
# Bundle consecutive records into triples (leftover records are dropped)
all_orfs = zip(*[all_orfs[offset::3] for offset in range(3)])
# Report only the longest result of each triple, followed by a blank line
for group in all_orfs:
    print(max(group))
    print('')
</code></pre>