<p>读了你的代码,我可以看出你是一个聪明的家伙,所以我要给你一些完全未经测试的东西,让你弄清楚如何让它工作,以及是否有任何一个比你现有的更快:-)</p>
<p>(嘿,不管怎样,你的问题并不是给了一个真实的数据集…)</p>
<p>编辑使用十六进制数字来计算不匹配。在</p>
<pre><code>#! /usr/bin/env python2.7
# Text is much faster in 2.7 than 3...
def rm_gaps(seq1, seq2):
''' Given a first sequence with gaps removed,
do the same operation to a second sequence.
'''
startpos = 0
for substring in seq1:
length = len(substring)
yield seq2[startpos:length]
startpos += length + 1
def seq_divergence(infile):
# Build a character translation map with ones
# in the positions of valid bases, and
# another one with hex numbers for each base.
transmap_v = ['0'] * 256
transmap_m = ['0'] * 256
for ch, num in zip('ACGT', '1248'):
transmap_v[ord(ch)] = '1'
transmap_m[ord(ch)] = num
transmap_v = ''.join(transmap_v)
transmap_m = ''.join(transmap_m)
# No need to do everything inside open you are
# done with the file once you have read it in.
with open(infile, 'rb') as f:
alignment = f.read()
# For this case, using regular string stuff might be faster than re
h1 = '>HEADER1\n'
h2 = h1.replace('1', '2')
h1loc = alignment.find(h1)
h2loc = alignment.find(h2)
# This assumes header 1 comes before header 2. If that is
# not invariant, you will need more code here.
seq1 = alignment[h1loc + len(h1):h2loc].replace('\n','')
seq2 = alignment[h2loc + len(h2):].replace('\n','')
# Remove the gaps
seq1 = seq1.split('-')
seq2 = rm_gaps(seq1, seq2)
seq1 = ''.join(seq1)
seq2 = ''.join(seq2)
assert len(seq1) == len(seq2)
# Let's use Python's long integer capability to
# find locations where both sequences are valid.
# Convert each sequence into a binary number,
# and AND them together.
num1 = int(seq1.translate(transmap_v), 2)
num2 = int(seq2.translate(transmap_v), 2)
valid = ('{0:0%db}' % len(seq1)).format(num1 & num2)
assert len(valid) == len(seq1)
# Now for the mismatch use hexadecimal instead
# of binary here. The 4 bits per character position
# nicely matches our 4 possible bases.
num1 = int(seq1.translate(transmap_m), 16)
num2 = int(seq2.translate(transmap_m), 16)
mismatch = ('{0:0%dx}' % len(seq1)).format(num1 & num2)
assert len(match) == len(seq1)
# This could possibly use some work. For example, if
# you expect very few invalid and/or mismatches, you
# could split on '0' in both these cases, and then
# use the length of the strings between the zeros
# and concatenate ranges for valid, or use them as
# skip distances for the mismatches.
valid = [x for x, y in enumerate(valid,1) if y == '1']
mismatch = [x for x, y in enumerate(mismatch, 1) if y == '0']
return valid, mismatch
</code></pre>