Skip to content

Usage as Python library

MrTomRod edited this page Aug 29, 2022 · 2 revisions

Python bindings to the pairwise comparisons algorithm, as described in Read (1995), Maddison (2000), and Brynildsrud (2016).

Simple pair picking

from pprint import pprint
from scoary import ScoaryTree, pick_single, print_tree

# The phylogeny is given as a nested list: inner lists group isolates that
# sit together in the tree, strings are the leaf labels.
tree = [
    ['isolate1', 'isolate2'],
    [
        ['isolate3', 'isolate4'],
        ['isolate5', 'isolate6'],
    ],
]

# Trait A: odd-numbered isolates are True, even-numbered isolates are False.
label_to_trait_a = {f'isolate{n}': n % 2 == 1 for n in range(1, 7)}

# Trait B has exactly the same distribution as trait A in this example.
label_to_trait_b = dict(label_to_trait_a)

# Draw the tree; in the output each leaf label is prefixed with its two
# trait values written as 1 (True) / 0 (False).
scoary_tree = ScoaryTree.from_list(tree)
print_tree(scoary_tree, label_to_trait_a, label_to_trait_b)
#       /-11_isolate1
#    /-|
#   |   \-00_isolate2
#   |
# --|      /-11_isolate3
#   |   /-|
#   |  |   \-00_isolate4
#    \-|
#      |   /-11_isolate5
#       \-|
#          \-00_isolate6

# Pick pairs for this single trait combination; calc_pvals=True is what
# requests the p-value entries shown in the result below.
result = pick_single(
    tree,
    label_to_trait_a,
    label_to_trait_b,
    calc_pvals=True,
)
pprint(result)
# {'best_fisher_p': 0.25,
#  'max_contrasting_pairs': 3,
#  'max_opposing_pairs': 0,
#  'max_supporting_pairs': 3,
#  'worst_pval': 0.25}

Parallel pair picking

This variant compares one trait against many traits in a single call and takes advantage of Numba optimizations.

import pandas as pd
from scoary import pick

# Phylogeny as a nested list of leaf labels.
tree = [
    ['isolate1', 'isolate2'],
    ['isolate3', 'isolate4'],
]

# Trait A (e.g. a phenotype): one boolean per isolate.
label_to_trait_a = dict(
    isolate1=True,
    isolate2=False,
    isolate3=False,
    isolate4=True,
)

# Trait B candidates (e.g. presence/absence of genes): one row per gene,
# one column per isolate, in the same isolate order as trait A.
trait_b_df = pd.DataFrame(
    data=[
        [True, True, False, False],  # gene 1
        [True, False, True, False],  # gene 2
        [True, False, False, True],  # ...
        [False, True, True, False],
        [False, True, False, True],
        [False, True, False, True],
        [False, True, False, True],
        [False, True, False, True],
    ],
    columns=list(label_to_trait_a),
)

# A single call evaluates trait A against every row of trait_b_df; each
# returned array holds one value per row (see the output below).
max_contr, max_suppo, max_oppos, best, worst = pick(
    tree=tree,
    label_to_trait_a=label_to_trait_a,
    trait_b_df=trait_b_df,
    calc_pvals=True,
)

print(f'{max_contr=}\n{max_suppo=}\n{max_oppos=}\n{best=}\n{worst=}')
# max_contr=array([1, 2, 2, 2, 2, 2, 2, 2])
# max_suppo=array([1, 1, 2, 0, 1, 1, 1, 1])
# max_oppos=array([1, 1, 0, 2, 1, 1, 1, 1])
# best=array([1. , 1. , 0.5, 0.5, 1. , 1. , 1. , 1. ])
# worst=array([1. , 1. , 0.5, 0.5, 1. , 1. , 1. , 1. ])
Clone this wiki locally