在平均距尖端<x的树中折叠子树

2024-06-15 03:39:30 发布

您现在位置:Python中文网/ 问答频道 /正文

我有一个Newick格式的层次树,例如:

(A:0.556705,(B:0.251059,C:0.251059):0.305646):0.556705;

括号表示树拓扑(分支结构/分组),数字表示分支长度(距离),例如:

enter image description here

我需要折叠到末端(终端节点/尖端/叶)的平均距离小于给定值x的clades(子树),这样输入和折叠的输出都是Newick格式的。例如,如果x是0.29,那么上面的B和C分解成BC,我们得到如下结果:

^{pr2}$

enter image description here

对于任何Newick树(例如Python),有没有一种简单的方法可以通过编程实现这种折叠?在


Tags: 方法终端距离节点格式分支数字结构
2条回答

这里可能有一个更健壮的解析器,它将使用所有的示例here

#! /usr/bin/python3

class Tree:
    def __init__ (self):
        self.name = ''
        self.length = None
        self.children = []

    def __repr__ (self):
        return '({}){}{}'.format (
                ','.join (str (child) for child in self.children),
                self.name,
                ':{}'.format (self.length) if self.length is not None else '')

class Node:
    def __init__ (self, name = ''):
        self.name = name
        self.length = None

    def __repr__ (self):
        return '{}{}'.format (self.name, ':{}'.format (self.length) if self.length is not None else '')

class Stack (list):
    def push (self, e):
        self.append (e)

    def pop (self):
        e = self [-1]
        del self [-1]
        return e

    def peek (self):
        return self [-1]

class UnexpectedSymbolException (Exception): pass

def parseNameOrLength (stack, buff):
        if stack.peek () == ':':
            try: length = float (buff)
            except ValueError:
                raise ValueError ('Non-numeric length "{}" at position {}.'.format (buff, pos) )
            stack.pop ()
            stack.peek ().length = length
        elif buff:
            stack.peek ().name = buff

def parse (tree):
    stack = Stack ()
    stack.push (Tree () )
    buff = ''
    pos = -1
    tree = tree.strip ()
    while True:
        pos += 1
        c = tree [0]
        tree = tree [1:].strip ()
        if tree: la = tree [0]

        if c == '(':
            if buff:
                raise UnexpectedSymbolException (
                        'Unexpected symbol {} at position {}.'.format (c, pos) )
            if la == '(': stack.push (Tree () )
            else: stack.push (Node () )
            continue

        if c == ')':
            parseNameOrLength (stack, buff)
            buff = ''

            child = stack.pop ()
            stack.peek ().children.append (child)
            continue

        if c in ',;':
            parseNameOrLength (stack, buff)
            buff = ''

            if c == ';': return stack.pop ()

            child = stack.pop ()
            stack.peek ().children.append (child)
            if la == '(': stack.push (Tree () )
            else: stack.push (Node () )
            continue

        if c == ':':
            if stack.peek () == ':':
                raise UnexpectedSymbolException (
                        'Unexpected symbol {} at position {}.'.format (c, pos) )
            stack.peek ().name = buff
            stack.push (':')
            buff = ''
            continue

        buff += c

tests = '''(,,(,));
        (A,B,(C,D));
        (A,B,(C,D)E)F;
        (:0.1,:0.2,(:0.3,:0.4):0.5);
        (:0.1,:0.2,(:0.3,:0.4):0.5):0.0;
        (A:0.1,B:0.2,(C:0.3,D:0.4):0.5);
        (A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;
        ((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A;'''.split ('\n')

for test in tests: print (parse (test) )

这个快速而肮脏的代码片段似乎可以在您的最小树上运行,但是我需要更多的示例数据来用更复杂的树来检查它:

#! /usr/bin/python3

class Tree:
    def __init__ (self, tokens):
        self.children = []
        while tokens:
            self.children.append ( (tokens [1], float (tokens [0] ) ) )
            tokens = tokens [2:]

    def __repr__ (self):
        return '<{}>'.format (self.children)

    def pprint (self, indent = 0):
        prefix = '  ' * indent
        for child, dist in self.children:
            if isinstance (child, Tree):
                print (prefix, dist)
                child.pprint (indent + 1)
            else:
                print (prefix, child, dist)

    def collapse (self, limit):
        self.children = [ (child.collapse (limit) [0], dist)
            if isinstance (child, Tree)
            else (child, dist) for child, dist in self.children]
        avg = sum (dist for _, dist in self.children) / len (self.children)
        if avg > limit:
            return (self, 0)
        else:
            if any (isinstance (child, Tree) for child in self.children):
                print ('Would like to collapse, but cannot, as at least one child is a tree.')
                return (self, 0)
            return (''.join (child for child, _ in self.children), 0)


def parse (tree):
    stack = []
    buff = ''
    while True:
        c = tree [0]
        if c == ';': break
        tree = tree [1:]
        if c == '(':
            stack.insert (0, '(')
            continue
        if c == ')':
            if buff: stack.insert (0, buff)
            buff = ''
            popped = ''
            tokens = []
            while True:
                token = stack [0]
                stack = stack [1:]
                if token == '(': break
                tokens.append (token)
            stack.insert (0, Tree (tokens) )
            continue
        if c in ':,':
            if buff: stack.insert (0, buff)
            buff = ''
            continue
        buff += c
    if buff: stack.insert (0, buff)
    return Tree (stack)

t = parse ('(A:0.556705,(B:0.251059,C:0.251059):0.305646):0.556705;')
t.pprint ()
t.collapse (.3)
print ()
t.pprint ()

未折叠的树是:

^{pr2}$

3的倒塌树是:

 0.556705
   CB 0.305646
   A 0.556705

1.0版本的折叠树是:

CBA 0.556705

0.1折叠的树是:

^{pr2}$

这里是代码的另一个更简洁的版本(它产生了nerwick符号输出):

NOTA BENE:输入树必须用括号括起来。在

#! /usr/bin/python3

class Tree:
    def __init__ (self, weight, children):
        self.weight = weight
        self.children = children [:]

    def distances (self, curDistance = .0, acc = None):
        if acc is None: acc = []
        for child in self.children:
            child.distances (self.weight + curDistance, acc)
        return acc

    def collapse (self, limit):
        self.children = [child.collapse (limit) for child in self.children]
        distances = self.distances (-self.weight)
        avg = sum (distances) / len (distances)
        if avg > limit: return self
        return Node (self.weight, ''.join (self.descendants () ) )

    def descendants (self):
        descendants = []
        for child in self.children:
            descendants.extend (child.descendants () )
        return descendants

    def __repr__ (self):
        return '({}):{}'.format (','.join (str (child) for child in self.children), self.weight)

class Node:
    def __init__ (self, weight, name):
        self.weight = weight
        self.name = name

    def distances (self, curDistance, acc):
        acc.append (curDistance + self.weight)

    def collapse (self, limit):
        return self

    def descendants (self):
        return [self.name]

    def __repr__ (self):
        return '{}:{}'.format (self.name, self.weight)

class Stack (list):
    def pop (self):
        e = self [0]
        del self [0]
        return e

    def push (self, e):
        self.insert (0, e)

def parse (tree):
    buff = ''
    stack = Stack ()
    while True:
        c = tree [0]
        if c == ';': break
        tree = tree [1:]

        if c == '(':
            stack.push (c)
            continue

        if c in ':,':
            if buff: stack.push (buff)
            buff = ''
            continue

        if c == ')':
            if buff: stack.push (buff)
            buff = ''
            popped = ''
            children = []
            while True:
                weight = stack.pop ()
                if weight == '(': break
                weight = float (weight)
                child = stack.pop ()
                if isinstance (child, Tree):
                    child.weight = weight
                else:
                    child = Node (weight, child)
                children.append (child)
            stack.push (Tree (0, children) )
            continue

        buff += c

    return stack.pop ()

t = parse ('((A:0.9,(B:0.2,C:0.3):0.3,(E:0.05,F:0.08):0.1):0.6);')
print ('Input tree is {}'.format (t) )
for limit in range (1, 6):
    limit = limit / 10
    print ('Collapsed by {} is {}'.format (limit, t.collapse (limit) ) )

输出为(修改输入):

Input tree is (((F:0.08,E:0.05):0.1,(C:0.3,B:0.2):0.3,A:0.9):0.6):0
Collapsed by 0.1 is ((FE:0.1,(C:0.3,B:0.2):0.3,A:0.9):0.6):0
Collapsed by 0.2 is ((FE:0.1,(C:0.3,B:0.2):0.3,A:0.9):0.6):0
Collapsed by 0.3 is ((FE:0.1,CB:0.3,A:0.9):0.6):0
Collapsed by 0.4 is ((FE:0.1,CB:0.3,A:0.9):0.6):0
Collapsed by 0.5 is (FECBA:0.6):0

相关问题 更多 >