<p>嗨,请检查一下:我的fasta文件名为'txt':</p>
<p>代码段:</p>
<pre><code>#!/usr/bin/python
import re
# Peptides to locate, as (protein id, peptide) pairs. Despite the name this is
# a list of tuples, not a dict. The script further down splits each peptide on
# its '*' marker before building the search pattern.
protein_dict = [
('P00001', 'KVSPT*FDTNMVGK'),
('P00001', 'SLDAGPGMCS*R'),
('P00003', 'LDS*GNFSWKMTEACMK')
]
# NOTE(review): prepare_structure_from_fasta assigns `protein_id` without a
# `global` statement, so it works on its own local name and never reads this
# module-level value.
protein_id = None
def prepare_structure_from_fasta(file):
    """Parse a FASTA file into a ``{protein_id: sequence}`` dict.

    A line containing ``>`` is treated as a header; the protein id is the
    second ``|``-separated field of that (stripped) line. Every following
    non-blank line up to the next header is appended to that protein's
    sequence, so sequences wrapped over several lines are kept whole.
    Blank lines are ignored. If the same id appears under a later header,
    the later record replaces the earlier one.

    Args:
        file: Path of the FASTA file to read.

    Returns:
        dict mapping each protein id to its full sequence string. A header
        that is not followed by any sequence line produces no entry.

    Raises:
        Exception: If sequence data appears before any header line, or the
            current header yielded an empty id.
        IndexError: If a header line contains no ``|`` separator.
    """
    pieces_by_id = {}
    # Must be initialised here: assigning to it below makes it local to this
    # function, so a module-level default would never be seen.
    protein_id = None
    current_pieces = None
    with open(file, 'r') as fh:
        for raw_line in fh:
            # Strip first so the id never carries a trailing newline.
            line = raw_line.strip()
            if not line:
                continue
            if '>' in line:
                protein_id = line.split('|')[1]
                # Start a fresh record lazily, on its first sequence line.
                current_pieces = None
            else:
                if not protein_id:
                    raise Exception("Wrong fasta file structure")
                if current_pieces is None:
                    current_pieces = pieces_by_id[protein_id] = []
                current_pieces.append(line)
    fasta_structure = dict()
    for seq_id, pieces in pieces_by_id.items():
        fasta_structure[seq_id] = ''.join(pieces)
    return fasta_structure
def match(pattern, string):
    """Return the first capture group of the first hit of *pattern* in *string*.

    The search is unanchored (``re.search``). Returns ``None`` when the
    pattern does not occur in *string*. *pattern* is expected to contain at
    least one capturing group.
    """
    found = re.search(pattern, string)
    return found.groups()[0] if found else None
# Load the protein sequences from the FASTA file named 'txt'.
fasta_struct = prepare_structure_from_fasta('txt')

# For every (protein id, peptide) pair, search the protein's sequence for the
# peptide (with the '*' marker removed) plus a few flanking residues, and
# collect (protein id, peptide, matched text or None).
final_struct = []
for pro_d in protein_dict:
    pro_id = pro_d[0]
    pep_id = pro_d[1]
    # Exactly one '*' is expected; anything else makes this unpacking fail.
    first, second = pep_id.split('*')

    # Part before the marker: keep at most its last 7 residues; if it is
    # shorter, let the pattern take up to the missing number from the left.
    if len(first) <= 6:
        f_count = 7 - len(first)
    else:
        first = first[len(first) - 7:]
        f_count = 0

    # Part after the marker.
    # NOTE(review): a tail of <= 6 residues may be extended to 7 in total,
    # while a longer tail is cut to 6 -- confirm which width is intended.
    if len(second) <= 6:
        s_count = 7 - len(second)
    else:
        second = second[0:6]
        s_count = 0

    _regex = '([A-Z]{0,%d}%s%s[A-Z]{0,%d})' % (f_count, first, second, s_count)
    final_struct.append((pro_id, pep_id, match(_regex, fasta_struct[pro_id])))

for pro in final_struct:
    # Call form prints the same text on Python 2 and also runs on Python 3.
    print(pro)
</code></pre>
<p>输出:</p>
^{pr2}$