Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

prior + BBSR workflow and example input files #45

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142,391 changes: 142,391 additions & 0 deletions data/yeast/motifs.bed

Large diffs are not rendered by default.

5,716 changes: 5,716 additions & 0 deletions data/yeast/target_genes.tsv

Large diffs are not rendered by default.

857 changes: 294 additions & 563 deletions data/yeast/tf_names.tsv

Large diffs are not rendered by default.

6,600 changes: 6,600 additions & 0 deletions data/yeast/tss.bed

Large diffs are not rendered by default.

98 changes: 98 additions & 0 deletions inferelator_ng/bbsr_tfa_chromatin_prior_workflow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""
Run BSubtilis Network Inference with TFA BBSR.
"""

import numpy as np
import os
from workflow import WorkflowBase
import design_response_R
from tfa import TFA
from results_processor import ResultsProcessor
import mi_R
import bbsr_R
import datetime
from prior import Prior
from . import utils


class BBSR_TFA_Chromatin_Prior_Workflow(WorkflowBase):

motifs_file = 'motifs.bed'
annotation_file = 'tss.bed'
target_genes_file = 'target_genes.tsv'
prior_mode = 'closest'
prior_max_distance = float('Inf')
prior_ignore_downstream = True
output_dir = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

def run(self):
"""
Execute workflow, after all configuration.
"""
np.random.seed(self.random_seed)

self.mi_clr_driver = mi_R.MIDriver()
self.regression_driver = bbsr_R.BBSR_driver()
self.design_response_driver = design_response_R.DRDriver()

self.get_data()
self.build_prior()
self.compute_common_data()
self.compute_activity()

betas = []
rescaled_betas = []

for idx, bootstrap in enumerate(self.get_bootstraps()):
print('Bootstrap {} of {}'.format((idx + 1), self.num_bootstraps))
X = self.activity.ix[:, bootstrap]
Y = self.response.ix[:, bootstrap]
print('Calculating MI, Background MI, and CLR Matrix')
(self.clr_matrix, self.mi_matrix) = self.mi_clr_driver.run(X, Y)
print('Calculating betas using BBSR')
current_betas, current_rescaled_betas = self.regression_driver.run(X, Y, self.clr_matrix, self.priors_data)
betas.append(current_betas)
rescaled_betas.append(current_rescaled_betas)
self.emit_results(betas, rescaled_betas, self.gold_standard, self.priors_data)


def get_data(self):
"""
Read data files in to data structures.
"""
self.expression_matrix = self.input_dataframe(self.expression_matrix_file)
tf_file = self.input_file(self.tf_names_file)

self.tf_names = utils.read_tf_names(tf_file)
target_genes_file = self.input_file(self.target_genes_file)
self.target_genes = utils.read_tf_names(target_genes_file)

# Read metadata, creating a default non-time series metadata file if none is provided
self.meta_data = self.input_dataframe(self.meta_data_file, has_index=False, strict=False)
if self.meta_data is None:
self.meta_data = self.create_default_meta_data(self.expression_matrix)
self.gold_standard = self.input_dataframe(self.gold_standard_file)


def build_prior(self):
print('Generating Prior Matrix from input Motifs and Annotation ... ')
prior_generator = Prior(self.input_file(self.motifs_file), self.input_file(self.annotation_file), self.target_genes, self.tf_names,
mode = self.prior_mode, max_distance = self.prior_max_distance, ignore_downstream = self.prior_ignore_downstream)
self.priors_data = prior_generator.make_prior()

def compute_activity(self):
"""
Compute Transcription Factor Activity
"""
print('Computing Transcription Factor Activity ... ')
TFA_calculator = TFA(self.priors_data, self.design, self.half_tau_response)
self.activity = TFA_calculator.compute_transcription_factor_activity()

def emit_results(self, betas, rescaled_betas, gold_standard, priors):
"""
Output result report(s) for workflow run.
"""
output_dir = os.path.join(self.input_dir, self.output_dir)
os.makedirs(output_dir)
self.results_processor = ResultsProcessor(betas, rescaled_betas)
self.results_processor.summarize_network(output_dir, gold_standard, priors)
71 changes: 28 additions & 43 deletions inferelator_ng/prior.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pybedtools
import pandas as pd
import numpy as np

class Prior:

Expand All @@ -11,12 +12,12 @@ class Prior:
genes_file: path/to/genes BED format (TSS or gene body)
targets: list of target genes
regulators: list of regulators
mode: closest (find closest target gene for each motif)
mode: closest (find closest target gene for each motif)
window (finds motifs within a window of target gene feature)
max_distance: maximum distance allowed for an assignment motif --> gene to be valid
ignore_downstream: valid for closest mode only;
ignore_downstream: valid for closest mode only;
motifs can only be assigned to target gene if upstream (true in yeast, for example)
number_of_targets: valid for closest mode only;
number_of_targets: valid for closest mode only;
allow a motif to regulate 'number_of_targets' closest genes
"""

Expand All @@ -40,60 +41,44 @@ def make_prior(self):
TFs (columns) and target genes (rows) - weights are set as the number of motifs associated with interaction
Possibility of dumping motifs associated with a particular edge (?)
"""

# Reads and sort input BED file
motifs = pybedtools.BedTool(self.motifs).sort()
genes = pybedtools.BedTool(self.genes).sort()

# Define edges in prior using method defined in self.mode
edges = {}
motifs = pybedtools.BedTool(self.motifs).sort()#.to_dataframe()

# For each motif, find closest gene within a certain window (self.max_distance) and that a prior interaction
if self.mode == 'closest':
# pybedtools wrapper around Bedtools closest function, D reports signed distance between motifs and genes
assignments = motifs.closest(genes, D = 'b', k = self.number_of_targets, id = self.ignore_downstream) # id = True, for Yeast! what about k, is it something to optimize?
assignments = motifs.closest(genes, D = 'b', k = self.number_of_targets, id = self.ignore_downstream).to_dataframe() # id = True, for Yeast! what about k, is it something to optimize?
assignments = assignments.loc[assignments.iloc[:, -1].abs() <= self.max_distance, :]
# get index to retrieve important features later
motif_start = 0
target_idx = motifs.field_count()+3
target_idx = motifs.field_count() + 3

# For each target gene, finds motifs within a certain window (self.max_distance) and consider those as interactions in prior
if self.mode == 'window':
# pybedtools wrapper around Bedtools window function, w is the window around gene feature to be used
assignments = genes.window(motifs, w = self.max_distance)
assignments = genes.window(motifs, w = self.max_distance).to_dataframe()
# get index to retrieve important features later
motif_start = genes.field_count()
target_idx = 3

motif_end = motif_start + motifs.field_count()-1

# Loop over all assignments and define edges
for assignment in assignments:
# in the closest mode, one can only allow motifs that are within the distance set in max_distance
if self.mode == 'closest':
if abs(int(assignment[-1])) > self.max_distance:
continue
# record edges as well as motifs associated with them
assignment = assignment.fields
motif = assignment[motif_start:motif_end]
regulator = assignment[motif_start+3]
target = assignment[target_idx]

if regulator in edges:
if target in edges[regulator]:
edges[regulator][target].append(motif)
else:
edges[regulator][target] = [motif]
else:
edges[regulator] = {target : [motif]}

# Make prior matrix
prior = pd.DataFrame(0, index=self.targets, columns=self.regulators)
# If there are multiple motifs for a TF assigned to the same gene, give larger weight to that interaction
for regulator in edges:
for target in edges[regulator]:
if regulator in self.regulators and target in self.targets:
#weight = pybedtools.BedTool(edges[regulator][target]).merge().count()
weight = len(edges[regulator][target])
prior.ix[target, regulator] = weight
return prior
# Find column index for the regulator and target ids
regulator = assignments.columns[motif_start+3]
target = assignments.columns[target_idx]

# Count number of motifs associated with each target gene
assignments = assignments.groupby([regulator, target]).size().reset_index()
assignments.columns = ['regulator', 'target', 'interaction']
assignments = assignments.loc[assignments.regulator.isin(self.regulators),:]
assignments = assignments.loc[assignments.target.isin(self.targets),:]

# Make prior matrix
prior = pd.pivot_table(assignments, index='target', columns='regulator', values='interaction', fill_value=0)
prior = prior.loc[:, self.regulators]
prior = prior.loc[self.targets,: ]
if len(prior.columns) > 0:
del prior.columns.name
if len(prior.index) > 0:
del prior.index.name
prior.replace(np.nan, 0, inplace=True)
return prior.astype(int)
19 changes: 10 additions & 9 deletions inferelator_ng/tests/test_prior.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pandas as pd
import numpy as np
import subprocess
import pandas.util.testing as pdt

class TestPrior(unittest.TestCase):

Expand Down Expand Up @@ -43,7 +44,7 @@ def test_prior_closest_zero_distance(self):
[0, 0, 0, 0]],
index = ['gene1', 'gene2', 'gene3'],
columns = ['TF1', 'TF2', 'TF3', 'TF4'])
self.assertTrue(prior_object.make_prior().equals(expected_prior))
pdt.assert_frame_equal(prior_object.make_prior(), expected_prior)


def test_prior_closest_zero_distance_genes_with_multiple_tss_at_different_locations(self):
Expand All @@ -59,7 +60,7 @@ def test_prior_closest_zero_distance_genes_with_multiple_tss_at_different_locati
[0, 0, 0, 0]],
index = ['gene1', 'gene2', 'gene3'],
columns = ['TF1', 'TF2', 'TF3', 'TF4'])
self.assertTrue(prior_object.make_prior().equals(expected_prior))
pdt.assert_frame_equal(prior_object.make_prior(), expected_prior)

def test_prior_closest_zero_distance_genes_with_multiple_tss_at_same_location(self):
self.setup_test_files()
Expand All @@ -74,7 +75,7 @@ def test_prior_closest_zero_distance_genes_with_multiple_tss_at_same_location(se
[0, 0, 0, 0]],
index = ['gene1', 'gene2', 'gene3'],
columns = ['TF1', 'TF2', 'TF3', 'TF4'])
self.assertTrue(prior_object.make_prior().equals(expected_prior))
pdt.assert_frame_equal(prior_object.make_prior(), expected_prior)

def test_prior_window_TSS_zero_distance(self):
self.setup_test_files()
Expand All @@ -89,7 +90,7 @@ def test_prior_window_TSS_zero_distance(self):
[0, 0, 0, 0]],
index = ['gene1', 'gene2', 'gene3'],
columns = ['TF1', 'TF2', 'TF3', 'TF4'])
self.assertTrue(prior_object.make_prior().equals(expected_prior))
pdt.assert_frame_equal(prior_object.make_prior(), expected_prior)

def test_prior_window_geneBody_zero_distance(self):
self.setup_test_files()
Expand All @@ -103,7 +104,7 @@ def test_prior_window_geneBody_zero_distance(self):
[0, 0, 0, 0]],
index = ['gene1', 'gene2', 'gene3'],
columns = ['TF1', 'TF2', 'TF3', 'TF4'])
self.assertTrue(prior_object.make_prior().equals(expected_prior))
pdt.assert_frame_equal(prior_object.make_prior(), expected_prior)


def test_prior_closestTSS_default(self):
Expand All @@ -118,7 +119,7 @@ def test_prior_closestTSS_default(self):
[0, 1, 1, 1]],
index = ['gene1', 'gene2', 'gene3'],
columns = ['TF1', 'TF2', 'TF3', 'TF4'])
self.assertTrue(prior_object.make_prior().equals(expected_prior))
pdt.assert_frame_equal(prior_object.make_prior(), expected_prior)

def test_prior_closestTSS_ignore_downstream(self):
self.setup_test_files()
Expand All @@ -132,7 +133,7 @@ def test_prior_closestTSS_ignore_downstream(self):
[0, 1, 1, 0]],
index = ['gene1', 'gene2', 'gene3'],
columns = ['TF1', 'TF2', 'TF3', 'TF4'])
self.assertTrue(prior_object.make_prior().equals(expected_prior))
pdt.assert_frame_equal(prior_object.make_prior(), expected_prior)


def test_prior_windowGeneBody_1000(self):
Expand All @@ -147,7 +148,7 @@ def test_prior_windowGeneBody_1000(self):
[2, 1, 1, 1]],
index = ['gene1', 'gene2', 'gene3'],
columns = ['TF1', 'TF2', 'TF3', 'TF4'])
self.assertTrue(prior_object.make_prior().equals(expected_prior))
pdt.assert_frame_equal(prior_object.make_prior(), expected_prior)

def test_prior_number_of_targets_2(self):
self.setup_test_files()
Expand All @@ -161,4 +162,4 @@ def test_prior_number_of_targets_2(self):
[0, 1, 1, 1]],
index = ['gene1', 'gene2', 'gene3'],
columns = ['TF1', 'TF2', 'TF3', 'TF4'])
self.assertTrue(prior_object.make_prior().equals(expected_prior))
pdt.assert_frame_equal(prior_object.make_prior(), expected_prior)
4 changes: 2 additions & 2 deletions yeast_bbsr_workflow_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@
workflow.delTmax = 110
workflow.delTmin = 0
workflow.tau = 45
workflow.random_seed = 0001
workflow.run()
workflow.random_seed = 1
workflow.run()