<p>我用纯 Python 为此构建了一个也许不太优雅但功能强大的解析器,或许它至少可以作为一个基本思路:</p>
<pre><code>import re
import pprint
printer = pprint.PrettyPrinter(indent=4)

# Compiled once because every pattern is applied to every input line.
_FEATURE_RE = re.compile(r'\s*(gene|CDS)\s+[\w(\.,)]+')
_ATTR_NAME_RE = re.compile(r'/\w+=')
_OPEN_VALUE_RE = re.compile(r'="\w+$')
_QUOTED_VALUE_RE = re.compile(r'="[\w\s\.:,-]+"')
_NUMERIC_VALUE_RE = re.compile(r'=\d+$')


def _only_match(pattern, text):
    """Return the single match of pattern in text, or None unless exactly one exists."""
    found = pattern.findall(text)
    return found[0] if len(found) == 1 else None


def parse_entities(lines):
    """Parse feature-table text into a list of dicts, one per feature.

    The layout is assumed from the patterns used here (it looks like a
    GenBank-style feature table; confirm against the real input): a line
    starting with ``gene`` or ``CDS`` followed by a location opens a new
    feature, and the following ``/name=value`` lines are its qualifiers.

    Args:
        lines: Iterable of text lines (an open text file works). A trailing
            newline on each line is optional.

    Returns:
        A list of dicts. Each dict maps the feature key (``gene`` or
        ``CDS``) to its location string and every qualifier name to its
        value. A qualifier that occurs more than once maps to a flat list
        of its values in order of appearance. All values are strings.

    Raises:
        ValueError: If a qualifier or continuation line appears before any
            feature line, if a continuation line has no multi-line value to
            extend, or if a qualifier value matches none of the supported
            forms (opening ``="word``, quoted ``="..."``, or ``=digits``).
    """
    entities = []
    entity = None
    # Name of the most recent qualifier whose quoted value was left open at
    # the end of its line; continuation lines are appended to it.
    last_key = None

    for raw in lines:
        line = raw.replace('\n', '')
        if not line.strip():
            # Blank lines carry no data and cannot continue a value.
            continue

        if _FEATURE_RE.match(line):
            parts = line.split()
            entity = {parts[0]: parts[1]}
            entities.append(entity)
            continue

        if entity is None:
            raise ValueError(
                'line found before any gene/CDS feature: %r' % line)

        marker = _only_match(_ATTR_NAME_RE, line)
        if marker is None:
            # No (single) "/name=" marker: treat the line as the
            # continuation of the last multi-line value.
            if last_key is None or last_key not in entity:
                raise ValueError(
                    'continuation line without an open value: %r' % line)
            # The final continuation line carries the closing quote; drop
            # it so multi-line values look like single-line ones.
            addition = line.strip().rstrip('"')
            current = entity[last_key]
            if isinstance(current, list):
                # Repeated qualifier: extend its most recent occurrence.
                current[-1] = ''.join([current[-1], addition])
            else:
                entity[last_key] = ''.join([current, addition])
            continue

        attr_name = marker.strip('/=')

        attr_value = _only_match(_OPEN_VALUE_RE, line)
        if attr_value is not None:
            # Value continues on the following line(s).
            last_key = attr_name
        else:
            attr_value = _only_match(_QUOTED_VALUE_RE, line)
            if attr_value is None:
                attr_value = _only_match(_NUMERIC_VALUE_RE, line)
            if attr_value is None:
                raise ValueError(
                    'unsupported qualifier value: %r' % line)
        attr_value = attr_value.strip('"=')

        if attr_name not in entity:
            entity[attr_name] = attr_value
        elif isinstance(entity[attr_name], list):
            # Third and later occurrences extend the existing flat list.
            entity[attr_name].append(attr_value)
        else:
            entity[attr_name] = [entity[attr_name], attr_value]

    return entities


if __name__ == "__main__":
    with open("entities.txt", "r") as file_obj:
        entities = parse_entities(file_obj)
    printer.pprint(entities)
</code></pre>