# Source code for dendropy.calculate.popgenstat

#! /usr/bin/env python

##############################################################################
##  DendroPy Phylogenetic Computing Library.
##
##  Copyright 2010-2015 Jeet Sukumaran and Mark T. Holder.
##
##  See "LICENSE.rst" for terms and conditions of usage.
##
##  If you use this work or any portion thereof in published work,
##
##     Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
##     for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################

"""
Population genetic statistics.
"""

import math
import dendropy
from dendropy.calculate import probability
from dendropy.calculate import combinatorics

###############################################################################
## internal functions: generally taking lower-level data, such as sequences etc.
###############################################################################

def _count_differences(char_sequences, state_alphabet, ignore_uncertain=True):
"""
Returns pair of values: total number of pairwise differences observed between
all sequences, and mean number of pairwise differences pair base.
"""
sum_diff = 0.0
mean_diff = 0.0
sq_diff = 0.0
comps = 0

#Check that all sequences are the same length
if len(set([len(seq) for seq in char_sequences])) != 1:
raise Exception("sequences of unequal length")

if ignore_uncertain:
attr = "fundamental_indexes_with_gaps_as_missing"
_states_to_ignore = [state_alphabet.gap_state, state_alphabet.no_data_state]
states_to_ignore = set([getattr(char, attr) for char in _states_to_ignore])
else:
attr = "fundamental_indexes"
states_to_ignore = set()

reduced_char_sequences = []
for sequence in char_sequences:
seq = [getattr(char, attr) for char in sequence]
reduced_char_sequences.append(seq)

for vidx, i in enumerate(reduced_char_sequences[:-1]):
for j in reduced_char_sequences[vidx+1:]:
diff = 0
counted = 0
comps += 1
for cidx, c in enumerate(i):
c1 = c
c2 = j[cidx]
if c1 in states_to_ignore or c2 in states_to_ignore:
continue
counted += 1
if c1 is not c2:
diff += 1
sum_diff += float(diff)
# If counted < 0, this means that there is sites between these sequences
# in which both are not ignored: i.e., one or the other has a gap
# or an uncertain character. We consider this to mean (maybe
# somewhat paradoxically) that there are no sites that are
# different between the sequences. Put less paradoxically: there
# are no non-ignored sites that are different between the
# sequences.
mean_diff += (float(diff) / counted) if counted > 0 else float(diff)
sq_diff += (diff ** 2)
return sum_diff, mean_diff / comps, sq_diff

def _nucleotide_diversity(char_sequences, state_alphabet, ignore_uncertain=True):
"""
Returns $\pi$, the proportional nucleotide diversity, calculated for a
list of character sequences.
"""
return _count_differences(char_sequences, state_alphabet, ignore_uncertain)[1]

def _average_number_of_pairwise_differences(char_sequences, state_alphabet, ignore_uncertain=True):
"""
Returns $k$ (Tajima 1983; Wakely 1996), calculated for a set of sequences:

k = \frac{\right(\sum \sum \k_{ij}\left)}{n \choose 2}

where $k_{ij}$ is the number of pairwise differences between the
$i$th and $j$th sequence, and $n$ is the number of DNA sequences
sampled.
"""
sum_diff, mean_diff, sq_diff = _count_differences(char_sequences, state_alphabet, ignore_uncertain)
return sum_diff / combinatorics.choose(len(char_sequences), 2)

def _num_segregating_sites(char_sequences, state_alphabet, ignore_uncertain=True):
"""
Returns the raw number of segregating sites (polymorphic sites).
"""
s = 0
if ignore_uncertain:
attr = "fundamental_indexes_with_gaps_as_missing"
_states_to_ignore = [state_alphabet.gap_state, state_alphabet.no_data_state]
states_to_ignore= set([getattr(char, attr) for char in _states_to_ignore])
else:
attr = "fundamental_indexes"
states_to_ignore = set()
for i, c1 in enumerate(char_sequences[0]):
for v in char_sequences[1:]:
c2 = v[i]
f1 = getattr(c1, attr)
f2 = getattr(c2, attr)
if f1 in states_to_ignore or f2 in states_to_ignore:
continue
if f1 is not f2:
s += 1
break
return s

def _tajimas_d(num_sequences, avg_num_pairwise_differences, num_segregating_sites):

### VERIFICATION ###
###
### Given: num_sequences = 10, num_pairwise_differences = 3.888889, num_segregating_sites = 16
###  i.e.: tajimas_d(10, 3.888889, 16)  == -1.44617198561
###  Then:    a1 == 2.82896825397
###           a2 == 1.53976773117
###           b1 == 0.407407407407
###           b2 == 0.279012345679
###           c1 == 0.0539216450284
###           c2 == 0.0472267720013
###           e1 == 0.0190605338016
###           e2 == 0.0049489277699
###           D ==  -1.44617198561

a1 = sum([1.0/i for i in range(1, num_sequences)])
a2 = sum([1.0/(i**2) for i in range(1, num_sequences)])
b1 = float(num_sequences+1)/(3*(num_sequences-1))
b2 = float(2 * ( (num_sequences**2) + num_sequences + 3 )) / (9*num_sequences*(num_sequences-1))
c1 = b1 - 1.0/a1
c2 = b2 - float(num_sequences+2)/(a1 * num_sequences) + float(a2)/(a1 ** 2)
e1 = float(c1) / a1
e2 = float(c2) / ( (a1**2) + a2 )
D = (
float(avg_num_pairwise_differences - (float(num_segregating_sites)/a1))
/ math.sqrt(
(e1 * num_segregating_sites )
+ ((e2 * num_segregating_sites) * (num_segregating_sites - 1) ))
)
return D

###############################################################################
## friendlier-functions, generally taking a CharacterMatrix
###############################################################################

[docs]def num_segregating_sites(char_matrix, ignore_uncertain=True):
"""
Returns the raw number of segregating sites (polymorphic sites).
"""
return _num_segregating_sites(
char_matrix.sequences(),
char_matrix.default_state_alphabet,
ignore_uncertain)

[docs]def average_number_of_pairwise_differences(char_matrix, ignore_uncertain=True):
"""
Returns $k$, calculated for a character block.
"""
return _average_number_of_pairwise_differences(char_matrix.sequences(), char_matrix.default_state_alphabet, ignore_uncertain)

[docs]def nucleotide_diversity(char_matrix, ignore_uncertain=True):
"""
Returns $\pi$, calculated for a character block.
"""
return _nucleotide_diversity(char_matrix.sequences(), char_matrix.default_state_alphabet, ignore_uncertain)

[docs]def tajimas_d(char_matrix, ignore_uncertain=True):
"""
Returns Tajima's D.
"""
sequences = char_matrix.sequences()
num_sequences = len(sequences)
avg_num_pairwise_differences = _average_number_of_pairwise_differences(sequences, char_matrix.default_state_alphabet, ignore_uncertain=ignore_uncertain)
num_segregating_sites = _num_segregating_sites(
sequences,
char_matrix.default_state_alphabet,
ignore_uncertain=ignore_uncertain)
return _tajimas_d(num_sequences, avg_num_pairwise_differences, num_segregating_sites)

[docs]def wattersons_theta(char_matrix, ignore_uncertain=True):
"""
Returns Watterson's Theta (per sequence)
"""
sequences = char_matrix.sequences()
num_segregating_sites = _num_segregating_sites(
sequences,
char_matrix.default_state_alphabet,
ignore_uncertain=ignore_uncertain)
a1 = sum([1.0/i for i in range(1, len(sequences))])
return float(num_segregating_sites) / a1

###############################################################################
## Classes
###############################################################################

class PopulationPairSummaryStatistics(object):

def __init__(self, pop1_seqs, pop2_seqs, ignore_uncertain=True):
self.pop1_seqs = pop1_seqs
self.pop2_seqs = pop2_seqs
self.combined_seqs = pop1_seqs + pop2_seqs
self.ignore_uncertain = ignore_uncertain
self.state_alphabet = dendropy.DNA_STATE_ALPHABET

self.average_number_of_pairwise_differences = 0
self.average_number_of_pairwise_differences_between = 0
self.average_number_of_pairwise_differences_within = 0
self.average_number_of_pairwise_differences_net = 0
self.num_segregating_sites = 0
self.wattersons_theta = 0.0
self.wakeleys_psi = 0.0
self.tajimas_d = 0.0
if self.ignore_uncertain:
self.state_attr = "fundamental_indexes_with_gaps_as_missing"
self.states_to_ignore = set([self.state_alphabet.gap_state, self.state_alphabet.no_data_state])
else:
self.state_attr = "fundamental_indexes"
self.states_to_ignore = set()
self.calc()

def calc(self):
"""
Returns a summary of a set of sequences that can be partitioned into
the list of lists of taxa given by taxon_groups.
"""
diffs_x, mean_diffs_x, sq_diff_x = _count_differences(self.pop1_seqs, self.state_alphabet, self.ignore_uncertain)
diffs_y, mean_diffs_y, sq_diff_y = _count_differences(self.pop2_seqs, self.state_alphabet, self.ignore_uncertain)
d_x = diffs_x / combinatorics.choose(len(self.pop1_seqs), 2)
d_y = diffs_y / combinatorics.choose(len(self.pop2_seqs), 2)
d_xy = self._average_number_of_pairwise_differences_between_populations()
s2_x = (float(sq_diff_x) / combinatorics.choose(len(self.pop1_seqs), 2) ) - (d_x ** 2)
s2_y = (float(sq_diff_y) / combinatorics.choose(len(self.pop2_seqs), 2) ) - (d_y ** 2)
s2_xy = self._variance_of_pairwise_differences_between_populations(d_xy)
n = len(self.combined_seqs)
n_x = float(len(self.pop1_seqs))
n_y = float(len(self.pop2_seqs))
a = float(n * (n-1))
ax = float(n_x * (n_x - 1))
ay = float(n_y * (n_y - 1))
k = _average_number_of_pairwise_differences(self.combined_seqs, self.state_alphabet, self.ignore_uncertain)
n = len(self.combined_seqs)

# Hickerson 2006: pi #
self.average_number_of_pairwise_differences = k

# Hickerson 2006: pi_b #
self.average_number_of_pairwise_differences_between = d_xy

# Hickerson 2006: pi_w #
self.average_number_of_pairwise_differences_within = d_x + d_y

# Hickerson 2006: pi_net #
self.average_number_of_pairwise_differences_net = d_xy - (d_x + d_y)

# Hickerson 2006: S #
self.num_segregating_sites = _num_segregating_sites(
self.combined_seqs,
self.state_alphabet,
self.ignore_uncertain)

# Hickerson 2006: theta #
a1 = sum([1.0/i for i in range(1, n)])
self.wattersons_theta = float(self.num_segregating_sites) / a1

# Wakeley 1996 #
self.wakeleys_psi = (float(1)/(a)) * ( ax * (math.sqrt(s2_x)/d_x) + ay * (math.sqrt(s2_y)/d_y) + (2 * n_x * n_y * math.sqrt(s2_xy)/k))

# Tajima's D #
self.tajimas_d = _tajimas_d(n, self.average_number_of_pairwise_differences, self.num_segregating_sites)

def _average_number_of_pairwise_differences_between_populations(self):
"""
Implements Eq (3) of:

Wakeley, J. 1996. Distinguishing migration from isolation using the
variance of pairwise differences. Theoretical Population Biology 49:
369-386.
"""
diffs = 0
for sx in self.pop1_seqs:
for sy in self.pop2_seqs:
for cidx, c in enumerate(sx):
c1 = c
c2 = sy[cidx]
if c1 in self.states_to_ignore or c2 in self.states_to_ignore:
continue
f1 = getattr(c1, self.state_attr)
f2 = getattr(c2, self.state_attr)
if f1 != f2:
diffs += 1
dxy = float(1)/(len(self.pop1_seqs) * len(self.pop2_seqs)) * float(diffs)
return dxy

def _variance_of_pairwise_differences_between_populations(self, mean_diff):
"""
Implements Eq (10) of:

Wakeley, J. 1996. Distinguishing migration from isolation using the
variance of pairwise differences. Theoretical Population Biology 49:
369-386.
"""
ss_diffs = 0
for sx in self.pop1_seqs:
for sy in self.pop2_seqs:
diffs = 0
for cidx, c in enumerate(sx):
c1 = c
c2 = sy[cidx]
if c1 in self.states_to_ignore or c2 in self.states_to_ignore:
continue
f1 = getattr(c1, self.state_attr)
f2 = getattr(c2, self.state_attr)
if f1 != f2:
diffs += 1
ss_diffs += (float(diffs - mean_diff) ** 2)
return float(ss_diffs)/(len(self.pop1_seqs)*len(self.pop2_seqs))

[docs]def derived_state_matrix(
char_matrix,
ancestral_sequence=None,
derived_state_alphabet=None,
ignore_uncertain=True,
):
"""
Given a list of CharDataSequence objects, and a reference ancestral sequence,
this returns a list of strings corresponding to the list of CharDataSequence
objects, where a '0' indicates the ancestral state and '1' a derived state.

e.g.

Given:
GGCTAATCTGA
GCTTTTTCTGA
GCTCTCTCTTC

with ancestral sequence:
GGTTAATCTGA

this returns:
0010000000
0000110000
0001110011
"""
if derived_state_alphabet is None:
derived_state_alphabet = dendropy.StateAlphabet(
fundamental_states="01",
polymorphic_states=None,
ambiguous_states=None,
no_data_symbol="?",
gap_symbol="-")
derived_matrix = dendropy.StandardCharacterMatrix(
taxon_namespace=char_matrix.taxon_namespace,
default_state_alphabet=derived_state_alphabet)
if ignore_uncertain:
attr = "fundamental_indexes_with_gaps_as_missing"
states_to_ignore = set([char_matrix.default_state_alphabet.gap_state, char_matrix.default_state_alphabet.no_data_state])
else:
attr = "fundamental_indexes"
states_to_ignore = set()
if ancestral_sequence is None:
ancestral_sequence = char_matrix[0]
ancestral_fundamental_ids = []
for idx, c1 in enumerate(ancestral_sequence):
if c1 in states_to_ignore:
ancestral_fundamental_ids.append(None)
else:
ancestral_fundamental_ids.append(getattr(c1, attr))
for taxon in char_matrix:
s1 =  char_matrix[taxon]
for idx, c2 in enumerate(s1):
if ancestral_fundamental_ids[idx] is None or c2 in states_to_ignore:
derived_matrix[taxon].append(derived_matrix.default_state_alphabet["?"])
continue
f2 = getattr(c2, attr)
if f2 == ancestral_fundamental_ids[idx]:
derived_matrix[taxon].append(derived_matrix.default_state_alphabet["0"])
else:
derived_matrix[taxon].append(derived_matrix.default_state_alphabet["1"])
return derived_matrix

[docs]def unfolded_site_frequency_spectrum(
char_matrix,
ancestral_sequence=None,
ignore_uncertain=False,
"""
Returns the site frequency spectrum of list of CharDataSequence objects given by char_sequences,
with reference to the ancestral sequence given by ancestral_seq. If ancestral_seq
is None, then the first sequence in char_sequences is taken to be the ancestral
sequence.
"""
dsm = derived_state_matrix(
char_matrix=char_matrix,
ancestral_sequence=ancestral_sequence,
derived_state_alphabet=None,
ignore_uncertain=ignore_uncertain,
)
sites = zip(*dsm.sequences()) # transpose
freqs = {}