-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbeta_diversity.py
68 lines (56 loc) · 1.79 KB
/
beta_diversity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""Calculate beta diversity and ordination."""
import logging
import pickle
from os import path
import pandas as pd
from skbio.diversity import beta_diversity
from skbio.stats.ordination import pcoa
from skbio.stats import subsample_counts
logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s")
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
def rarefy_counts(counts, depth=10000):
"""Normalize a count matrix by rarefaction (subsampling).
Parameters
----------
counts : pandas.DataFrame
The count matrix to be normalized. Contains variables as columns and
samples as rows.
Returns
-------
pandas.DataFrame
A new data frame with normalized samples such that each sample has
a depth of `depth` (sum of variables equals depth).
"""
log.info(
"Subsampling %dx%d count matrix to a depth of %d."
% (counts.shape[0], counts.shape[1], depth)
)
bad = counts.astype("int").sum(1) < depth
log.info("Removing %d samples due to low depth." % bad.sum())
rare = counts[~bad].apply(
lambda x: pd.Series(
subsample_counts(x.astype("int"), depth), index=counts.columns
),
axis=1,
)
return rare
log.info("Reading genus-level data.")
genera = pd.read_csv(
path.join("..", "data", "american_gut_genus.csv"), dtype={"id": str}
)
libsize = genera.groupby("id")["count"].sum()
mat = pd.pivot_table(
genera,
columns="Genus",
index="id",
values="count",
fill_value=0,
aggfunc="sum",
)
mat = rarefy_counts(mat, 1000)
log.info("Calculating beta diversity and PCoA.")
D = beta_diversity("braycurtis", mat.values, mat.index, validate=True)
red = pcoa(D, number_of_dimensions=2)
log.info("Saving results to `pcoa.csv`.")
red.samples.to_csv("pcoa.csv")