<p>这对您的示例文本似乎有效。我不确定每个文件是否可能包含多个提取结果,而且我手头时间有限,所以如有需要,您得自行扩展它:</p>
<pre><code>#!python3
import re
# Module-level accumulator for everything the matcher functions pull out of
# the input; it starts empty and is filled in as lines are processed.
Extract = dict()
def match_notes(line):
    """Collect an indented free-text line into ``Extract['notes']``.

    Returns True when the right-stripped line begins with whitespace; the
    text after that leading whitespace is appended to the notes list.
    Otherwise the state is switched to ``match_sp`` and False is returned,
    leaving the line unconsumed.
    """
    global _State
    found = re.match(r"^\s+(.*)$", line.rstrip())
    if not found:
        # Not an indented line: the notes section is over.
        _State = match_sp
        return False
    Extract.setdefault('notes', []).append(found.group(1))
    return True
def match_pattern(line):
    """Record the indented ``Pattern: ...`` line.

    On a match the text after ``Pattern: `` is stored in
    ``Extract['pattern']``, the state advances to ``match_notes`` and True
    is returned.  Any other line returns False and leaves the state alone.
    """
    global _State
    hit = re.match(r"^\s+Pattern: (.*)$", line.rstrip())
    if hit is None:
        return False
    Extract['pattern'] = hit.group(1)
    _State = match_notes
    return True
def match_sp(line):
    """Start a new ``>sp|...|...`` entry.

    The two pipe-separated fields after ``>sp`` are stored as a new dict
    appended to ``Extract['sp']`` (created on first use), and the state
    advances to ``match_sp_note``.  Returns True on a match, else False.
    """
    global _State
    header = re.match(r">sp\|([^|]+)\|(.*)$", line.rstrip())
    if header is None:
        return False
    accession, remainder = header.groups()
    Extract.setdefault('sp', []).append({
        'accession code': accession,
        'other code': remainder,
    })
    _State = match_sp_note
    return True
def match_sp_note(line):
    """Second line of >sp paragraph: ``<note> [<species>]``.

    Splits the line at the first ``[``: the text before it becomes the
    entry's ``note`` and the bracketed text becomes its ``species``.  An
    empty ``sequence`` string is also initialised on the entry, and the
    state advances to ``match_sp_sequence``.

    Returns True when the line has the expected shape, False otherwise.
    """
    global _State
    # The bracket group must be closed by "]".  Requiring ")" here rejected
    # plain "[Species]" lines and chopped the final ")" off lines such as
    # "[Species (Common name)]".
    pattern = r"^([^[]*)\[([^]]+)\]"
    m = re.match(pattern, line.rstrip())
    if m:
        spinfo = Extract['sp'][-1]
        spinfo['note'] = m.group(1).strip()
        spinfo['species'] = m.group(2).strip()
        spinfo['sequence'] = ''
        _State = match_sp_sequence
        return True
    return False
def match_sp_range(line):
    """Last line of >sp paragraph: ``  <start> - <end>:  <flags>``.

    Stores the two numbers (kept as strings) as a tuple under ``range`` and
    the trailing text under ``flags`` on the most recent ``Extract['sp']``
    entry, then resets the state to ``match_sp``.  Returns True on a match,
    False otherwise.
    """
    global _State
    found = re.match(r"^\s+(\d+) - (\d+):\s+(.*)", line.rstrip())
    if not found:
        return False
    first, last, flags = found.groups()
    entry = Extract['sp'][-1]
    entry['range'] = (first, last)
    entry['flags'] = flags
    _State = match_sp
    return True
def match_sp_sequence(line):
    """Middle block of >sp paragraph: accumulate the sequence lines.

    Lines that do not start with whitespace are appended (right-stripped)
    to the current entry's ``sequence`` and consumed (returns True).

    A line that starts with whitespace ends the sequence: the entry's
    ``ag_4gkst`` flag is set according to whether the collected sequence
    contains the ``[AG].{4}GK[ST]`` motif, the state moves to
    ``match_sp_range`` and False is returned so that line is not consumed
    here.
    """
    global _State
    spinfo = Extract['sp'][-1]
    if re.match(r"^\s", line):
        # End of sequence. The motif may sit anywhere in the sequence, so
        # use search(); match() would only test the first residues.
        motif = re.search(r"[AG].{4}GK[ST]", spinfo['sequence'])
        spinfo['ag_4gkst'] = motif is not None
        _State = match_sp_range
        return False
    spinfo['sequence'] += line.rstrip()
    return True
def match_start(line):
    """Start of outer item: ``Hits for <ID>|<title> : [occurs <word>]``.

    Stores the identifier in ``Extract['pattern_id']``, the text between
    ``|`` and ``:`` in ``Extract['title']`` and the word after ``occurs`` in
    ``Extract['occurrence']``.  The ``|title :`` part and the
    ``[occurs ...]`` part are optional; whatever is absent is stored as
    None.  On a match the state advances to ``match_pattern``.

    Returns True when the line is a ``Hits for`` header, False otherwise.
    """
    global _State
    # The "|" separator must be escaped: unescaped it is regex alternation,
    # which left title/occurrence as None on real headers and also accepted
    # any "text : ..." line as a header.
    pattern = (
        r"^Hits for ([A-Z]+\d+)"
        r"(?:\|([^:]+?)\s*:(?:\s*\[occurs (\w+)\])?)?"
    )
    m = re.match(pattern, line.rstrip())
    if m:
        Extract['pattern_id'] = m.group(1)
        Extract['title'] = m.group(2)
        Extract['occurrence'] = m.group(3)
        _State = match_pattern
        return True
    return False
# Current state of the parser: the matcher function that process_line() will
# try on the next line.  Matchers rebind it to hand over to another matcher.
_State = match_start
def process_line(line):
    """Feed one line to the state machine.

    The current ``_State`` matcher is called with the line.  If it accepts
    the line, True is returned.  If it rejects the line but switched
    ``_State`` to a different matcher, the line is retried with the new one.
    If it rejects the line without changing state, False is returned; a
    non-empty line is reported on stdout first.
    """
    while True:
        handler = _State
        if handler(line):
            return True
        if _State is handler:
            # Rejected and nobody else took over: stop retrying.
            break
    if len(line) != 0:
        print("Unexpected line:", line)
        print("State was:", _State)
    return False
def process_file(filename):
    """Run every line of the text file *filename* through process_line().

    Each line is right-stripped before being handed over.
    """
    with open(filename, "r") as source:
        for raw_line in source:
            stripped = raw_line.rstrip()
            process_line(stripped)
# Script driver: parse the hard-coded input file, then pretty-print
# everything that was collected in Extract.
process_file("ploop.fa")
import pprint
pprint.pprint(Extract)
</code></pre>