在Python中将一个文件解析为多个字典

     gene            join(373616..374161,1..174)
                     /locus_tag="AM1_A0001"
                     /db_xref="GeneID:5685236"
     CDS             join(373616..374161,1..174)
                     /locus_tag="AM1_A0001"
                     /codon_start=1
                     /transl_table=11
                     /product="glutathione S-transferase, putative"
                     /protein_id="YP_001520660.1"
                     /db_xref="GI:158339653"
                     /db_xref="GeneID:5685236"
                     /translation="MKIVSFKICPFVQRVTALLEAKGIDYDIEYIDLSHKPQWFLDLS
                     PNAQVPILITDDDDVLFESDAIVEFLDEVVGTPLSSDNAVKKAQDRAWSYLATKHYLV
                     QCSAQRSPDAKTLEERSKKLSKAFGKIKVQLGESRYINGDDLSMVDIAWLPLLHRAAI
                     IEQYSGYDFLEEFPKVKQWQQHLLSTGIAEKSVPEDFEERFTAFYLAESTCLGQLAKS
                     KNGEACCGTAECTVDDLGCCA"
     gene            241..381
                     /locus_tag="AM1_A0002"
                     /db_xref="GeneID:5685411"
     CDS             241..381
                     /locus_tag="AM1_A0002"
                     /codon_start=1
                     /transl_table=11
                     /product="hypothetical protein"
                     /protein_id="YP_001520661.1"
                     /db_xref="GI:158339654"
                     /db_xref="GeneID:5685411"
                     /translation="MLINPEDKQVEIYRPGQDVELLQSPSTISGADVLPEFSLNLEWI
                     WR"
     gene            388..525
                     /locus_tag="AM1_A0003"
                     /db_xref="GeneID:5685412"
     CDS             388..525
                     /locus_tag="AM1_A0003"
                     /codon_start=1
                     /transl_table=11
                     /product="hypothetical protein"
                     /protein_id="YP_001520662.1"
                     /db_xref="GI:158339655"
                     /db_xref="GeneID:5685412"
                     /translation="MKEAGFSENSRSREGQPKLAKDAAIAKPYLVAMTAELQIMATET
                     L"

2条回答

网友

1楼 · 编辑于 2024-09-28 18:56:45

如果有人感兴趣，下面是我借助收到的评论整理出的一个初学者级别的解决方案：

import sys, re

# Extract one dictionary per gene from the feature table of an EMBL file.
#
# The code below deliberately sticks to syntax that is valid on both
# Python 2 and Python 3 (``open`` instead of ``file``, ``print(...)`` with a
# single argument).

# The text is cut on "FT   gen", so every gene chunk starts with the rest of
# the feature key ("e") followed by the padding up to the location column.
_GENE_MARKER = "e            "

# Filled with one dict per gene when this file is run as a script.
annotation = []


def _parse_location(location):
    """Return ``(genestart, geneend, strand)`` for a feature location string.

    Handles plain ranges (``241..381``), single positions (``467``) and the
    ``complement(...)``, ``join(...)`` and ``order(...)`` operators.  For a
    multi-segment location the start of the first segment and the end of the
    last segment are returned, so ``join(373616..374161,1..174)`` yields
    ``("373616", "174", "+")``.  Coordinates are returned as strings and keep
    any ``<`` / ``>`` partial-feature markers.
    """
    location = location.strip()
    strand = "-" if "complement" in location else "+"
    # Drop the operator wrappers so only "a..b,c..d" remains.
    bare = re.sub(r"complement\(|join\(|order\(|\)", "", location)
    segments = bare.split(",")
    genestart = segments[0].split("..")[0]
    # [-1] instead of [1] so that a single-position location does not fail.
    geneend = segments[-1].split("..")[-1]
    return genestart, geneend, strand


def _quoted_value(line):
    """Return the text between the first pair of double quotes, or None."""
    parts = line.split('"')
    return parts[1] if len(parts) > 1 else None


def _parse_gene_chunk(chunk):
    """Build the annotation dict for one chunk (a gene up to the next gene).

    Every field starts as ``None`` so that a chunk lacking e.g. ``/product``
    never inherits the value of the previously parsed gene.
    """
    record = {
        "locus": None,
        "genestart": None,
        "geneend": None,
        "product": None,
        "strand": None,
    }
    for line in chunk.splitlines():
        if line.startswith(_GENE_MARKER):
            start, end, strand = _parse_location(line[len(_GENE_MARKER):])
            record["genestart"] = start
            record["geneend"] = end
            record["strand"] = strand

        # NOTE: only the part of a qualifier value on the qualifier's own
        # line is captured; values wrapped onto following lines are not
        # joined.
        if "/locus_tag" in line:
            value = _quoted_value(line)
            if value is not None:
                record["locus"] = value

        if "/product" in line:
            value = _quoted_value(line)
            if value is not None:
                record["product"] = value
    return record


def parse_embl_genes(embl_text):
    """Return a list with one dict per ``FT   gene`` feature in *embl_text*.

    Each dict has the keys ``locus``, ``genestart``, ``geneend``, ``product``
    and ``strand``; a field that is absent from the file is ``None``.  Text
    before the first gene feature (the file header) is ignored.
    """
    return [
        _parse_gene_chunk(chunk)
        for chunk in embl_text.split("FT   gen")
        if chunk.startswith(_GENE_MARKER)
    ]


if __name__ == "__main__":
    # Read the whole file at once and make sure the handle is closed.
    with open("example.embl", "r") as annot:
        embl = annot.read()

    annotation.extend(parse_embl_genes(embl))
    print("Finished!")

网友

2楼 · 编辑于 2024-09-28 18:56:45

我用纯 Python 为这个问题写了一个也许不够优雅、但可用的解析器，或许它至少可以作为一个基本思路来参考：

import re
import pprint
printer = pprint.PrettyPrinter(indent=4)

# A feature line: optional indent, the feature key, then its location.
_FEATURE_LINE = re.compile(r'\s*(gene|CDS)\s+[\w(\.,)]+')
# A qualifier line: optional indent, "/name=", then the raw value.
_QUALIFIER_LINE = re.compile(r'\s*/(\w+)=(.*)$')


def _store_qualifier(entity, name, value):
    """Add *value* under *name*, collecting repeated qualifiers in a flat list."""
    if name not in entity:
        entity[name] = value
    elif isinstance(entity[name], list):
        # Third and later occurrences extend the list instead of nesting it.
        entity[name].append(value)
    else:
        entity[name] = [entity[name], value]


def _append_continuation(entity, name, text):
    """Append a continuation line to the most recent value stored for *name*.

    Wrapped ``/translation`` sequences are joined without a separator; any
    other wrapped value is joined with a single space.
    """
    joiner = "" if name == "translation" else " "
    current = entity[name]
    is_list = isinstance(current, list)
    previous = current[-1] if is_list else current
    if previous and text:
        merged = previous + joiner + text
    else:
        merged = previous + text
    if is_list:
        current[-1] = merged
    else:
        entity[name] = merged


def parse_entities(lines):
    """Parse feature-table lines into a list of dicts, one per gene/CDS.

    *lines* is any iterable of text lines (an open file works).  Each dict
    maps the feature key (``"gene"`` or ``"CDS"``) to its location string and
    every qualifier name to its value with the surrounding quotes removed.
    A qualifier that occurs more than once maps to a flat list of its values.
    Quoted values that wrap over several lines are joined into one string.
    Lines that appear before the first feature, and stray lines that belong
    to no open value, are ignored.
    """
    entities = []
    entity = None      # feature currently being filled
    open_key = None    # qualifier whose quoted value is still unterminated

    for line in lines:
        line = line.rstrip('\r\n')

        if _FEATURE_LINE.match(line):
            parts = line.split()
            entity = {parts[0]: parts[1]}
            entities.append(entity)
            open_key = None
            continue

        if entity is None:
            continue

        qualifier = _QUALIFIER_LINE.match(line)
        if qualifier:
            name = qualifier.group(1)
            raw = qualifier.group(2).strip()
            unterminated = raw.startswith('"') and (
                len(raw) == 1 or not raw.endswith('"')
            )
            if unterminated:
                # Value continues on the following line(s); drop the opening
                # quote now, the closing one is removed when it shows up.
                value = raw[1:]
                open_key = name
            else:
                value = raw.strip('"')
                open_key = None
            _store_qualifier(entity, name, value)
        elif open_key is not None:
            addition = line.strip()
            finished = addition.endswith('"')
            if finished:
                addition = addition[:-1]
            _append_continuation(entity, open_key, addition)
            if finished:
                open_key = None

    return entities


if __name__ == "__main__":
    # Iterate the file lazily instead of materialising it with readlines().
    with open("entities.txt", "r") as file_obj:
        entities = parse_entities(file_obj)

    printer.pprint(entities)

相关问题更多 >

编程相关推荐

热门问题

热门文章