Source code for dendropy.calculate.treemeasure

#! /usr/bin/env python

##############################################################################
##  DendroPy Phylogenetic Computing Library.
##
##  Copyright 2010-2015 Jeet Sukumaran and Mark T. Holder.
##  All rights reserved.
##
##  See "LICENSE.rst" for terms and conditions of usage.
##
##  If you use this work or any portion thereof in published work,
##  please cite it as:
##
##     Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
##     for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################

"""
Statistics, metrics, measurements, and values calculated on (single) trees.
"""

import math
from dendropy.calculate import phylogeneticdistance

EULERS_CONSTANT = 0.5772156649015328606065120900824024310421

## legacy: will soon be deprecated
class PatrisiticDistanceMatrix(phylogeneticdistance.PhylogeneticDistanceMatrix):

    def __init__(self, tree):
        phylogeneticdistance.PhylogeneticDistanceMatrix.__init__(self)
        self.compile_from_tree(tree=tree)

[docs]def patristic_distance(tree, taxon1, taxon2, is_bipartitions_updated=False): """ Given a tree with bipartitions encoded, and two taxa on that tree, returns the patristic distance between the two. Much more inefficient than constructing a PhylogeneticDistanceMatrix object. """ mrca = tree.mrca(taxa=[taxon1, taxon2], is_bipartitions_updated=is_bipartitions_updated) dist = 0 n = tree.find_node(lambda x: x.taxon == taxon1) while n != mrca: if n.edge.length is not None: dist += n.edge.length n = n.parent_node n = tree.find_node(lambda x: x.taxon == taxon2) while n != mrca: if n.edge.length is not None: dist += n.edge.length n = n.parent_node return dist
########################################################################### ### Metrics -- Unary
[docs]def B1(tree): """ Returns the B1 statistic: the reciprocal of the sum of the maximum number of nodes between each interior node and tip over all internal nodes excluding root. """ b1 = 0.0 nd_mi = {} for nd in tree.postorder_node_iter(): if nd._parent_node is None: continue child_nodes = nd._child_nodes if len(child_nodes) == 0: nd_mi[nd] = 0.0 continue mi = max(nd_mi[ch] for ch in child_nodes) mi += 1 nd_mi[nd] = mi b1 += 1.0/mi return b1
[docs]def colless_tree_imbalance(tree, normalize="max"): """ Returns Colless' tree imbalance or I statistic: the sum of differences of numbers of children in left and right subtrees over all internal nodes. ``normalize`` specifies the normalization: * "max" or True [DEFAULT] normalized to maximum value for tree of this size * "yule" normalized to the Yule model * "pda" normalized to the PDA (Proportional to Distinguishable Arrangements) model * None or False no normalization """ colless = 0.0 num_leaves = 0 subtree_leaves = {} for nd in tree.postorder_node_iter(): if nd.is_leaf(): subtree_leaves[nd] = 1 num_leaves += 1 else: total_leaves = 0 if len(nd._child_nodes) > 2: raise TypeError("Colless' tree imbalance statistic requires strictly bifurcating trees") left = subtree_leaves[nd._child_nodes[0]] right = subtree_leaves[nd._child_nodes[1]] colless += abs(right-left) subtree_leaves[nd] = right + left if normalize == "yule": colless = float(colless - (num_leaves * math.log(num_leaves)) - (num_leaves * (EULERS_CONSTANT - 1.0 - math.log(2))))/num_leaves elif normalize == "pda": colless = colless / pow(num_leaves, 3.0/2) elif normalize is True or normalize == "max": ## note that Mooers 1995 (Evolution 49(2):379-384) ## remarks that the correct normalization factor is ## 2/((num_leaves - 1) * (num_leaves -2)) colless = colless * (2.0/(num_leaves * (num_leaves-3) + 2)) elif normalize is not None and normalize is not False: raise TypeError("``normalization`` accepts only None, True, False, 'yule' or 'pda' as argument values") return colless
[docs]def pybus_harvey_gamma(tree, prec=0.00001): """Returns the gamma statistic of Pybus and Harvey (2000). This statistic is used to test for constancy of birth and death rates over the course of a phylogeny. Under the pure-birth process, the statistic should follow a standard Normal distibution: a Normal(mean=0, variance=1). If the lengths of different paths to the node differ by more than ``prec``, then a ValueError exception will be raised indicating deviation from ultrametricty. Raises a Value Error if the tree is not ultrametric, is non-binary, or has only 2 leaves. As a side effect a ``age`` attribute is added to the nodes of the tree. Pybus and Harvey. 2000. "Testing macro-evolutionary models using incomplete molecular phylogenies." Proc. Royal Society Series B: Biological Sciences. (267). 2267-2272 """ # the equation is given by: # T = \sum_{j=2}^n (jg_j) # C = T \sqrt{\frac{1}{12(n-2)}} # C gamma = \frac{1}{n-2}\sum_{i=2}^{n-1} (\sum_{k=2}^i kg_k) - \frac{T}{2} # where n is the number of taxa, and g_2 ... g_n is the vector of waiting # times between consecutive (in time, not along a branch) speciation times. node = None speciation_ages = [] n = 0 if tree.seed_node.age is None: tree.calc_node_ages(ultrametricity_precision=prec) for node in tree.postorder_node_iter(): if len(node.child_nodes()) == 2: speciation_ages.append(node.age) else: n += 1 if node is None: raise ValueError("Empty tree encountered") speciation_ages.sort(reverse=True) g = [] older = speciation_ages[0] for age in speciation_ages[1:]: g.append(older - age) older = age g.append(older) if not g: raise ValueError("No internal nodes found (other than the root)") assert(len(g) == (n - 1)) T = 0.0 accum = 0.0 for i in range(2, n): list_index = i - 2 T += i * float(g[list_index]) accum += T list_index = n - 2 T += (n) * g[list_index] nmt = n - 2.0 numerator = accum/nmt - T/2.0 C = T*pow(1/(12*nmt), 0.5) return numerator/C
[docs]def N_bar(tree): """ Returns the $\bar{N}$ statistic: the average number of nodes above a terminal node. """ leaf_count = 0 nbar = 0 for leaf_node in tree.leaf_node_iter(): leaf_count += 1 for parent in leaf_node.ancestor_iter(inclusive=False): nbar += 1 return float(nbar) / leaf_count
[docs]def sackin_index(tree, normalize=True): """ Returns the Sackin's index: the sum of the number of ancestors for each tip of the tree. The larger the Sackin's index, the less balanced the tree. ``normalize`` specifies the normalization: * True [DEFAULT] normalized to number of leaves; this results in a value equivalent to that given by Tree.N_bar() * "yule" normalized to the Yule model * "pda" normalized to the PDA (Proportional to Distinguishable Arrangements) model * None or False no normalization """ leaf_count = 0 num_anc = 0 for leaf_node in tree.leaf_node_iter(): leaf_count += 1 for parent in leaf_node.ancestor_iter(inclusive=False): num_anc += 1 if normalize == "yule": x = sum(1.0/j for j in range(2, leaf_count+1)) s = float(num_anc - (2 * leaf_count * x))/leaf_count elif normalize == "pda": s = float(num_anc)/(pow(leaf_count, 3.0/2)) elif normalize is True: s = float(num_anc)/leaf_count elif normalize is None or normalize is False: s = float(num_anc) elif normalize is not None and normalize is not False: raise TypeError("``normalization`` accepts only None, True, False, 'yule' or 'pda' as argument values") return s
[docs]def treeness(tree): """ Returns the proportion of total tree length that is taken up by internal branches. """ internal = 0.0 external = 0.0 for nd in tree.postorder_node_iter(): if not nd._parent_node: continue if nd.is_leaf(): external += nd.edge.length else: internal += nd.edge.length return internal/(external + internal)