-
Notifications
You must be signed in to change notification settings - Fork 1
/
normalize_cells_fpkm.py
88 lines (70 loc) · 1.73 KB
/
normalize_cells_fpkm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
"""
Compute the geometric mean size factors to normalize the transcriptome fpkm's
"""
"""
Import python packages
"""
import HTSeq
import collections
import itertools
import os
import subprocess
import collections
import datetime
import yaml
import fnmatch
import shlex
import numpy
import scipy
import scipy.io as sio
import pyensembl
import h5py
import pandas as pd
import numpy as np
import matplotlib
import cPickle as pickle
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seq_functions
matplotlib.style.use('ggplot')
"""
Load cells
"""
direc = '/scratch/PI/mcovert/dvanva/sequencing/'
all_cell_file = 'all_cells_qc.pkl'
all_cells = pickle.load(open(os.path.join(direc,all_cell_file)))
"""
Compute geometric mean normalization for each cells - Remove genes with 0 counts prior to computing size factor
"""
cell = all_cells[0]
gene_keys = list(cell.fpkm.index)
"""
Compute reference sample for each gene
"""
reference_sample = {}
for gene in gene_keys:
running_product = 1
number_of_cells = 0
for cell in all_cells:
if cell.fpkm.loc[gene] > 0:
running_product *= cell.fpkm.loc[gene]
number_of_cells += 1
print gene
if number_of_cells > 0:
reference_sample[gene] = running_product ** (1/number_of_cells)
else:
reference_sample[gene] = 0
"""
Compute size factor for each cell
"""
for cell in all_cells:
list_of_genes = []
for gene in gene_keys:
if cell.fpkm.loc[gene] > 0:
if reference_sample[gene] > 0:
list_of_genes += [cell.fpkm.loc[gene] / reference_sample[gene]]
cell.size_factor = np.median(list_of_genes)
for cell in all_cells:
print cell.size_factor
file_name_save = os.path.join(direc, 'all_cells_norm.pkl')
pickle.dump(all_cells, open(file_name_save, 'wb'), protocol = pickle.HIGHEST_PROTOCOL)