Skip to content

Latest commit

 

History

History
1900 lines (1576 loc) · 77.1 KB

Figure_2.md

File metadata and controls

1900 lines (1576 loc) · 77.1 KB

Code for generating the pannels of figure 2 and accompanying supplementary figures

Set up python environment & load data

## Set up environment

import os
import dill
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['svg.fonttype'] = 'none'

work_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/Multiomics_pipeline/analysis/pbmc_granulocyte_sorted_10k/scenicplus_clustered_consensus'
plot_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/Figure_pbmc/v3_clustered_consensus/plots'
ranking_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/Multiomics_pipeline/analysis/pbmc_granulocyte_sorted_10k'


color_dict = {
        'B cells': '#FF5714',
        'CD14+ Monocytes': '#6EEB83',
        'FCGR3A+ Monocytes': '#65B891',
        'Dendritic cells 1': '#575761',
        'Dendritic cells 2': '#1BE7FF',
        'CD4 T cells': '#1D3461',
        'CD8 T cells': '#3D5A6C',
        'NK cells': '#64B6AC'
}

## Load data

scplus_obj = dill.load(
	open(os.path.join(work_dir, 'scplus_obj.pkl'), 'rb')
)

region_ranking = dill.load(
        open(os.path.join(ranking_dir, 'scenicplus/region_ranking.pkl'), 'rb')
)

gene_ranking = dill.load(
        open(os.path.join(ranking_dir, 'scenicplus/gene_ranking.pkl'), 'rb')
)

Supplementary figure S9, pannel a: Filtering eRegulons

## Filter eRegulons

from scenic.preprocessing.filtering import apply_std_filtering_to_eRegulons

apply_std_filtering_to_eRegulons(scplus_obj)

score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures_filtered',
                key_added = 'eRegulon_AUC_filtered',
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = 5)
score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures_filtered',
                key_added = 'eRegulon_AUC_filtered',
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = 5)

from scenicplus.cistromes import TF_cistrome_correlation, generate_pseudobulks

generate_pseudobulks(
        scplus_obj = scplus_obj,
        variable = 'GEX_celltype',
        auc_key = 'eRegulon_AUC_filtered',
        signature_key = 'Gene_based'
)
generate_pseudobulks(
        scplus_obj = scplus_obj,
        variable = 'GEX_celltype',
        auc_key = 'eRegulon_AUC_filtered',
        signature_key = 'Region_based'
)

TF_cistrome_correlation(
            scplus_obj,
            use_pseudobulk = True,
            variable = 'GEX_celltype',
            auc_key = 'eRegulon_AUC_filtered',
            signature_key = 'Gene_based',
            out_key = 'filtered_gene_based')
TF_cistrome_correlation(
            scplus_obj,
            use_pseudobulk = True,
            variable = 'GEX_celltype',
            auc_key = 'eRegulon_AUC_filtered',
            signature_key = 'Region_based',
            out_key = 'filtered_region_based')

n_targets = [int(x.split('(')[1].replace('r)', '')) for x in scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based']['Cistrome']]
rho = scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based']['Rho'].to_list()
adj_pval = scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based']['Adjusted_p-value'].to_list()

thresholds = {
        'rho': [-0.8, 0.7],
        'n_targets': 0
}
import seaborn as sns
plot_dir = plotdir
fig, ax = plt.subplots()
sc = ax.scatter(rho, n_targets, c = -np.log10(adj_pval), s = 5)
ax.set_xlabel('Correlation coefficient')
ax.set_ylabel('nr. target regions')
#ax.hlines(y = thresholds['n_targets'], xmin = min(rho), xmax = max(rho), color = 'black', ls = 'dashed', lw = 1)
ax.vlines(x = thresholds['rho'], ymin = 0, ymax = max(n_targets), color = 'black', ls = 'dashed', lw = 1)
ax.text(x = thresholds['rho'][0], y = max(n_targets), s = str(thresholds['rho'][0]))
ax.text(x = thresholds['rho'][1], y = max(n_targets), s = str(thresholds['rho'][1]))
sns.despine(ax = ax)
fig.colorbar(sc, label = '-log10(adjusted_pvalue)', ax = ax)
fig.savefig(os.path.join(plot_dir, 'eRegulon_selection_n_targets_rho.pdf'))

selected_cistromes = scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based'].loc[
        np.logical_or(
                scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based']['Rho'] > thresholds['rho'][1],
                scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based']['Rho'] < thresholds['rho'][0]
        )]['Cistrome'].to_list()
selected_eRegulons = [x.split('_(')[0] for x in selected_cistromes]
selected_eRegulons_gene_sig = [
        x for x in scplus_obj.uns['eRegulon_signatures_filtered']['Gene_based'].keys()
        if x.split('_(')[0] in selected_eRegulons]
selected_eRegulons_region_sig = [
        x for x in scplus_obj.uns['eRegulon_signatures_filtered']['Region_based'].keys()
        if x.split('_(')[0] in selected_eRegulons]

scplus_obj.uns['selected_eRegulon'] = {'Gene_based': selected_eRegulons_gene_sig, 'Region_based': selected_eRegulons_region_sig}

## Set celltype names
scplus_obj.metadata_cell['celltype_name'] = [x.replace('_', ' ') for x in scplus_obj.metadata_cell['GEX_celltype']]

## Save changes
dill.dump(scplus_obj, open(os.path.join(work_dir, 'scplus_obj.pkl'), 'wb'))

Export to loom for visualizations in R

binarize_AUC(scplus_obj, 
             auc_key='eRegulon_AUC_filtered',
             out_key='eRegulon_AUC_filtered_thresholds',
             signature_keys=['Gene_based'],
             n_cpu=5)

def remove_second_sign(x):
	if 'extended' not in x:
		TF, first, second, n = x.split('_')
		return f'{TF}_{first}_{n}'
	else:
		TF, extended, first, second, n = x.split('_')
		return f'{TF}_{extended}_{first}_{n}'

scplus_obj.uns['eRegulon_metadata_filtered']['Gene_signature_name'] = [remove_second_sign(x) for x in scplus_obj.uns['eRegulon_metadata_filtered']['Gene_signature_name']]
scplus_obj.uns['eRegulon_metadata_filtered']['Region_signature_name'] = [remove_second_sign(x) for x in scplus_obj.uns['eRegulon_metadata_filtered']['Region_signature_name']]

del(scplus_obj.dr_cell['GEX_X_pca'])
del(scplus_obj.dr_cell['GEX_rep'])

from scenicplus.loom import *
export_to_loom(scplus_obj, 
       signature_key = 'Gene_based',
       eRegulon_metadata_key = 'eRegulon_metadata_filtered',
       auc_key = 'eRegulon_AUC_filtered',
       auc_thr_key = 'eRegulon_AUC_filtered_thresholds',
       keep_direct_and_extended_if_not_direct = True,
       tree_structure = (),
       title =  'Gene based eGRN',
       nomenclature = 'hg38',
       out_fname=os.path.join(work_dir,'SCENIC+_gene_based.loom'))

binarize_AUC(scplus_obj,
             auc_key='eRegulon_AUC_filtered',
             out_key='eRegulon_AUC_filtered_thresholds',
             signature_keys=['Region_based'],
             n_cpu=5)

export_to_loom(scplus_obj, 
       signature_key = 'Region_based',
       eRegulon_metadata_key = 'eRegulon_metadata_filtered',
       auc_key = 'eRegulon_AUC_filtered',
       auc_thr_key = 'eRegulon_AUC_filtered_thresholds',
       keep_direct_and_extended_if_not_direct = True,
       tree_structure = (),
       title =  'Region based eGRN',
       nomenclature = 'hg38',
       out_fname=os.path.join(work_dir,'SCENIC+_region_based.loom'))

with open(os.path.join(work_dir, 'selected_eRegulons.txt'), 'w') as f:
	for eRegulon_name in scplus_obj.uns['selected_eRegulon']['Gene_based']:
		f.write(eRegulon_name.rsplit('_', 1)[0])
		f.write('\n')

Main Figure 2, pannel A: eRegulons tSNE

work_dir <- '/staging/leuven/stg_00002/lcb/sdewin/PhD/Multiomics_pipeline/analysis/pbmc_granulocyte_sorted_10k/scenicplus_clustered_consensus'
plot_dir <- '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/Figure_pbmc/v3_clustered_consensus/plots'

# load data
library(SCopeLoomR)
loom <- open_loom(paste0(work_dir, '/SCENIC+_gene_based.loom'))
cell_data <- get_cell_annotation(loom)
embeddings <- get_embeddings(loom)
eRegulons_tSNE <- embeddings$eRegulons_tSNE
colnames(eRegulons_tSNE) <- c('tSNE_1', 'tSNE_2')
cell_data[] <- lapply(cell_data, sub, pattern = " ", replacement = "-")
cell_data[] <- lapply(cell_data, sub, pattern = "-(.*)", replacement = "")

# Color by cell type
cell_plot_data <- cbind(eRegulons_tSNE, cell_data[rownames(eRegulons_tSNE),])

color_dict = c(
        'B_cells' = '#FF5714',
        'CD14+_Monocytes' = '#6EEB83',
        'FCGR3A+_Monocytes' = '#65B891',
        'Dendritic_cells_1' = '#575761',
        'Dendritic_cells_2' = '#1BE7FF',
        'CD4_T_cells' = '#1D3461',
        'CD8_T_cells' = '#3D5A6C',
        'NK_cells' = '#64B6AC')

#plot
library(ggplot2)
source('/staging/leuven/stg_00002/lcb/cbravo/software/plotting_aux.R')
plot <- ggplot(cell_plot_data, aes(x=tSNE_1, y=tSNE_2, colour=ACC_celltype))
plot <- plot + geom_point(size = 0.2)
plot <- plot + theme_classic()
plot <- plot + theme(legend.position = "none")
plot <- plot + labs(x = NULL, y = NULL)
plot <- plot + scale_color_manual(values = color_dict)
plot <- plot + guides(x = "none", y = "none")  
ggsave(filename = paste0(plot_dir, '/R_eRegulons_tSNE.png'), device='png', bg = "transparent", width=5.24, height=4.5)

pdf(paste0(plot_dir, '/R_eRegulons_tSNE.pdf'), width = 5.24, height = 4.5)
LabelClusters(plot, 'ACC_celltype', split.by ='ACC_celltype', box=FALSE, repel=TRUE)  + scale_color_manual(values = color_dict)
dev.off()

Main Figure 2, Pannel b: n regions per gene

import os
import dill
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['svg.fonttype'] = 'none'

work_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/Multiomics_pipeline/analysis/pbmc_granulocyte_sorted_10k/scenicplus_clustered_consensus'
plot_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/Figure_pbmc/v3_clustered_consensus/plots'

color_dict = {
        'B cells': '#FF5714',
        'CD14+ Monocytes': '#6EEB83',
        'FCGR3A+ Monocytes': '#65B891',
        'Dendritic cells 1': '#575761',
        'Dendritic cells 2': '#1BE7FF',
        'CD4 T cells': '#1D3461',
        'CD8 T cells': '#3D5A6C',
        'NK cells': '#64B6AC'
}

## Load data

scplus_obj = dill.load(
        open(os.path.join(work_dir, 'scplus_obj.pkl'), 'rb')
)

from tqdm import tqdm
from random import sample
scplus_obj.uns['search_space'] = scplus_obj.uns['search_space'].explode('Distance')

scplus_obj.uns['search_space']['abs_distance'] = abs(scplus_obj.uns['search_space']['Distance'])

from scipy.stats import rankdata
def get_rank_region_gene(region, gene):
        search_space_region = scplus_obj.uns['search_space'].query('Name == @region')
        ranking = rankdata(search_space_region['abs_distance'], method = 'dense')
        return ranking[np.where(search_space_region['Gene'] == gene)[0][0]]

from scenicplus.utils import *
selected_regions = list(set(flatten_list([scplus_obj.uns['eRegulon_signatures_filtered']['Region_based'][x] for x in   [x for x in scplus_obj.uns['selected_eRegulon']['Region_based'] if '_+_' in x]])))
selected_genes = list(set(flatten_list([scplus_obj.uns['eRegulon_signatures_filtered']['Gene_based'][x] for x in   [x for x in scplus_obj.uns['selected_eRegulon']['Gene_based'] if '_+_' in x]])))

n_regions_per_gene = scplus_obj.uns['eRegulon_metadata_filtered'].query('Gene in @selected_genes').query('Region in @selected_regions')[['Gene', 'Region']].drop_duplicates().groupby('Gene').count()['Region'].to_numpy()
nth_gene_region = []
for region, gene in tqdm(list(scplus_obj.uns['eRegulon_metadata_filtered'].query('Gene in @selected_genes').query('Region in @selected_regions')[['Region', 'Gene']].to_numpy()), total = 161180):
        nth_gene_region.append(get_rank_region_gene(region, gene))

import seaborn as sns
fig, axs = plt.subplots(ncols = 2, figsize = (3.75, 2.5), sharey = True)
ax = axs[0]
ax.hist(
        n_regions_per_gene,
        color = 'darkgray',
        edgecolor = 'black',
        bins = np.arange(0, 30, 1))
sns.despine(ax = ax)
ax.set_title('Number of regions linked to gene.', fontsize = 6)
ax.set_ylabel('Frequency', fontsize = 6)
ax.set_xticks(np.arange(1, 30, 5))
ax.spines['left'].set_position('zero')
ax.set_yticks([500, 1000, 1500, 2000])
ax.tick_params(axis='both', which='major', labelsize=5)
ax.set_xlim(left=0)
ax = axs[1]
ax.hist(nth_gene_region, bins = np.arange(max(nth_gene_region)) - 0.5, color = 'darkgray', edgecolor = 'black')
ax.set_xticks(np.arange(1, 30, 5))
sns.despine(ax = ax)
ax.set_title('nth. gene per region', fontsize = 6)
#ax.set_ylabel('Frequency', fontsize = 5)
ax.set_xlim(left=0)
ax.tick_params(axis='both', which='major', labelsize=5)
fig.tight_layout()
fig.savefig(os.path.join(plot_dir, 'n_regions_per_gene_nth_regions_per_gene.pdf'))

Main Figure 2, Pannel d: DotPlot

work_dir <- '/staging/leuven/stg_00002/lcb/sdewin/PhD/Multiomics_pipeline/analysis/pbmc_granulocyte_sorted_10k/scenicplus_clustered_consensus'
plot_dir <- '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/Figure_pbmc/v3_clustered_consensus/plots'

# load data
library(SCopeLoomR)
loom <- open_loom(paste0(work_dir, '/SCENIC+_gene_based.loom'))
cells_AUC_gene <- AUCell::getAUC(get_regulons_AUC(loom, column.attr.name='RegulonsAUC'))
rownames(cells_AUC_gene) <- gsub('_extended', '', rownames(cells_AUC_gene) )
cell_data <- get_cell_annotation(loom)
dgem <- get_dgem(loom)

loom <- open_loom(paste0(work_dir, '/SCENIC+_region_based.loom'))
cells_AUC_region <- AUCell::getAUC(get_regulons_AUC(loom, column.attr.name='RegulonsAUC'))
rownames(cells_AUC_region) <- gsub('_extended', '', rownames(cells_AUC_region))

cells_AUC_gene <- cells_AUC_gene[rownames(cells_AUC_region), which(colnames(cells_AUC_gene) %in% colnames(cells_AUC_region))]
cells_AUC_region <- cells_AUC_region[rownames(cells_AUC_gene), colnames(cells_AUC_gene)]

selected_eRegulon_names = scan(paste0(work_dir, '/selected_eRegulons.txt'), what="", sep="\n")

# Get eRegulons
loom <- open_loom(paste0(work_dir, '/SCENIC+_gene_based.loom'))
regulons_gene <- get_regulons(loom, column.attr.name='Regulons')
regulon_gene_list <- list()
for (row in rownames(regulons_gene)){
  regulon_gene_list[[row]] <- colnames(regulons_gene)[which(regulons_gene[row,] == 1)]
}
regulon_gene_list <- regulon_gene_list[selected_eRegulon_names]
names(regulon_gene_list) <- gsub('_extended', '', names(regulon_gene_list))

regulon_gene_list <- tapply(unlist(regulon_gene_list , use.names = FALSE), rep(names(regulon_gene_list), lengths(regulon_gene_list)), FUN = c)

positive <- names(regulon_gene_list)[grep('_+', names(regulon_gene_list), fixed=TRUE)]
negative <- names(regulon_gene_list)[grep('_-', names(regulon_gene_list), fixed=TRUE)]

# calculate RSS
library(SCENIC)
rss_values <- calcRSS(cells_AUC_gene, cell_data$ACC_celltype)
rss_values <- sweep(rss_values,2,colSums(rss_values),`/`)*100
rss_values <- rss_values[,sort(colnames(rss_values))]

# Prepare data for plotting

expression_list <- list()
for (x in unique(cell_data$ACC_celltype)){
  print(x)
  expression_list[[x]] <- as.data.frame(t(log(rowSums(dgem[,grep(x, cell_data$ACC_celltype, fixed = TRUE)])/sum(rowSums(dgem[,grep(x, cell_data$ACC_celltype, fixed = TRUE)]))*10^6+1)))
}
exp_mat <- t(data.table::rbindlist(expression_list))
colnames(exp_mat) <- names(expression_list)
exp_mat <- exp_mat

sel_rel <- c(positive, negative)
order_list <- list()
for (i in 1:ncol(rss_values)){
  order_list[[i]] <- vector()
}
for (x in sel_rel){
  i <- which.max(rss_values[x,])
  order_list[[i]] <- c(order_list[[i]], x)
}
sel_rel <- unlist(order_list)

rss_values <- t(apply(rss_values, 1, function(x)(x-min(x))/(max(x)-min(x))))
rel_list <- sel_rel
rel_data <- data.frame()
for (rel in rel_list){
  for (name in colnames(rss_values)){
      tf <- strsplit(rel, split = "_")[[1]][1]
      row_to_add <- t(c(rel, name, exp_mat[tf, name], rss_values[rel,name]))
      rel_data <- rbind(rel_data, row_to_add)
   }
}
colnames(rel_data) <- c('Regulon', 'Cell_type', 'Expression', 'RSS')
sel_rel_color <- rep('grey',length(sel_rel))
sel_rel_color[which(rev(sel_rel) %in% positive)] <- 'forestgreen'
sel_rel_color[which(rev(sel_rel) %in% negative)] <- 'red'

#plot
library(ggplot2)
rel_data$Regulon <- factor(rel_data$Regulon, levels=rev(sel_rel))
rel_data$Cell_type <- gsub('_', ' ', rel_data$Cell_type)
rel_data$Cell_type <- factor(rel_data$Cell_type, levels= gsub('_', ' ', colnames(rss_values)))
rel_data$RSS <- as.numeric(rel_data$RSS)
rel_data$Expression <- as.numeric(rel_data$Expression)
rel_data$Expression_scale <- ave(rel_data$Expression, rel_data$Regulon, FUN = scale)
rel_data$Repressor_activator <- unlist(lapply(grepl("+", rel_data$Regulon, fixed=TRUE), function(x) if (x) 'Activators' else 'Repressors'))
g <- ggplot(data = rel_data, mapping = aes_string(y = 'Cell_type', x = 'Regulon')) +
    facet_grid(rows = vars(Repressor_activator), scales = "free_y", space = 'free') + 
    geom_tile(mapping = aes_string(fill = 'Expression_scale')) + 
    geom_point(mapping = aes_string(size = 'RSS'), colour="black",pch=21, fill = 'black') +
    scale_radius(range = c(0.1, 3), limits=c(0, 1)) +
    scale_fill_distiller(palette = "RdYlBu") +
    theme_bw() +
    theme(axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(angle = 90, hjust = 1)) + coord_flip() +
    theme(legend.position = "none")
ggsave(paste0(plot_dir, '/R_Dotplot_RSS_no_legend.pdf'), g, width=3.5, height=13)

#plot
library(ggplot2)
rel_data$Regulon <- factor(rel_data$Regulon, levels=rev(sel_rel))
rel_data$Cell_type <- gsub('_', ' ', rel_data$Cell_type)
rel_data$Cell_type <- factor(rel_data$Cell_type, levels= gsub('_', ' ', colnames(rss_values)))
rel_data$RSS <- as.numeric(rel_data$RSS)
rel_data$Expression <- as.numeric(rel_data$Expression)
rel_data$Expression_scale <- ave(rel_data$Expression, rel_data$Regulon, FUN = scale)
rel_data$Repressor_activator <- unlist(lapply(grepl("+", rel_data$Regulon, fixed=TRUE), function(x) if (x) 'Activators' else 'Repressors'))
g <- ggplot(data = rel_data, mapping = aes_string(y = 'Cell_type', x = 'Regulon')) +
    facet_grid(rows = vars(Repressor_activator), scales = "free_y", space = 'free') +
    geom_tile(mapping = aes_string(fill = 'Expression_scale')) +
    geom_point(mapping = aes_string(size = 'RSS'), colour="black",pch=21, fill = 'black') +
    scale_radius(range = c(0.1, 3), limits=c(0, 1)) +
    scale_fill_distiller(palette = "RdYlBu") +
    theme_bw() +
    theme(axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(angle = 90, hjust = 1)) + coord_flip()
ggsave(paste0(plot_dir, '/R_Dotplot_RSS.pdf'), g, width=3.5, height=13)

#plot
library(ggplot2)
rel_data$Regulon <- factor(rel_data$Regulon, levels=rev(sel_rel))
rel_data$Cell_type <- gsub('_', ' ', rel_data$Cell_type)
rel_data$Cell_type <- factor(rel_data$Cell_type, levels= gsub('_', ' ', colnames(rss_values)))
rel_data$RSS <- as.numeric(rel_data$RSS)
rel_data$Expression <- as.numeric(rel_data$Expression)
rel_data$Expression_scale <- ave(rel_data$Expression, rel_data$Regulon, FUN = scale)
rel_data$Repressor_activator <- unlist(lapply(grepl("+", rel_data$Regulon, fixed=TRUE), function(x) if (x) 'Activators' else 'Repressors'))
g <- ggplot(data = rel_data[rel_data$Repressor_activator == 'Activators', ], mapping = aes_string(y = 'Cell_type', x = 'Regulon')) +
    geom_tile(mapping = aes_string(fill = 'Expression_scale')) +
    geom_point(mapping = aes_string(size = 'RSS'), colour="black",pch=21, fill = 'black') +
    scale_radius(range = c(0.1, 3), limits=c(0, 1)) +
    scale_fill_distiller(palette = "RdYlBu") +
    theme_bw() +
    theme(axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(angle = 90, hjust = 1)) + coord_flip() +
    theme(legend.position = "none")
ggsave(paste0(plot_dir, '/R_Dotplot_RSS_no_legend_activators.pdf'), g, width=3.5, height=13)

#plot
library(ggplot2)
rel_data$Regulon <- factor(rel_data$Regulon, levels=rev(sel_rel))
rel_data$Cell_type <- gsub('_', ' ', rel_data$Cell_type)
rel_data$Cell_type <- factor(rel_data$Cell_type, levels= gsub('_', ' ', colnames(rss_values)))
rel_data$RSS <- as.numeric(rel_data$RSS)
rel_data$Expression <- as.numeric(rel_data$Expression)
rel_data$Expression_scale <- ave(rel_data$Expression, rel_data$Regulon, FUN = scale)
rel_data$Repressor_activator <- unlist(lapply(grepl("+", rel_data$Regulon, fixed=TRUE), function(x) if (x) 'Activators' else 'Repressors'))
g <- ggplot(data = rel_data[rel_data$Repressor_activator == 'Activators', ], mapping = aes_string(y = 'Cell_type', x = 'Regulon')) +
    geom_tile(mapping = aes_string(fill = 'Expression_scale')) +
    geom_point(mapping = aes_string(size = 'RSS'), colour="black",pch=21, fill = 'black') +
    scale_radius(range = c(0.1, 3), limits=c(0, 1)) +
    scale_fill_distiller(palette = "RdYlBu") +
    theme_bw() +
    theme(axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(angle = 90, hjust = 1)) + coord_flip()
ggsave(paste0(plot_dir, '/R_Dotplot_RSS_activators.pdf'), g, width=3.5, height=13)

Main Figure 2, pannel e: overlap of regions

import os
import dill
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['svg.fonttype'] = 'none'

work_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/Multiomics_pipeline/analysis/pbmc_granulocyte_sorted_10k/scenicplus_clustered_consensus'
plot_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/Figure_pbmc/v3_clustered_consensus/plots'

color_dict = {
        'B cells': '#FF5714',
        'CD14+ Monocytes': '#6EEB83',
        'FCGR3A+ Monocytes': '#65B891',
        'Dendritic cells 1': '#575761',
        'Dendritic cells 2': '#1BE7FF',
        'CD4 T cells': '#1D3461',
        'CD8 T cells': '#3D5A6C',
        'NK cells': '#64B6AC'
}

## Load data

scplus_obj = dill.load(
        open(os.path.join(work_dir, 'scplus_obj.pkl'), 'rb')
)

celltype_to_group = {
        'CD4_T_cells': 'T_cells',
        'B_cells': 'B_cells',
        'NK_cells': 'NK_cells',
        'CD8_T_cells': 'T_cells',
        'CD14+_Monocytes': 'Myeloid',
        'Dendritic_cells_1': 'Myeloid',
        'FCGR3A+_Monocytes': 'Myeloid',
        'Dendritic_cells_2': 'Myeloid'
}

scplus_obj.metadata_cell['cell_type_group'] = [celltype_to_group[ct] for ct in scplus_obj.metadata_cell['GEX_celltype']]

from scenicplus.RSS import *

regulon_specificity_scores(
        scplus_obj,
        variable = 'cell_type_group',
        auc_key = 'eRegulon_AUC_filtered',
        signature_keys = ['Region_based'],
        selected_regulons = [x for x in scplus_obj.uns['selected_eRegulon']['Region_based'] if '-' not in x],
        out_key_suffix = 'filtered')

flat_list = lambda t: [item for sublist in t for item in sublist]
selected_markers = list(set(flat_list([scplus_obj.uns['RSS']['cell_type_groupfiltered'].loc[celltype].sort_values(ascending = False).head(10).index.to_list() for celltype in scplus_obj.uns['RSS']['cell_type_groupfiltered'].index])))

from scenicplus.plotting.correlation_plot import *

region_intersetc_data, Z = jaccard_heatmap(
        scplus_obj,
        method = 'intersect',
        gene_or_region_based = 'Region_based',
        save = os.path.join(plot_dir, 'hm_overlap_regions_manuel_selection_w_cbar.pdf'),
        use_plotly = False,
        selected_regulons = selected_markers,
        signature_key = 'eRegulon_signatures_filtered',
        figsize = (10, 10), return_data = True, vmax = 0.5, cmap = 'plasma')

fig, ax = plt.subplots(figsize = (10, 10))
sns.heatmap(region_intersetc_data, ax = ax, cmap = 'plasma', vmax = 0.5, cbar=False, xticklabels = False)
fig.tight_layout()
fig.savefig(os.path.join(plot_dir, 'hm_overlap_regions_manuel_selection_wo_cbar.pdf'))

selected_markers = list(
	set(flat_list([scplus_obj.uns['RSS']['cell_type_groupfiltered'].loc[celltype].sort_values(ascending = False).head(5).index.to_list() for celltype in scplus_obj.uns['RSS']['cell_type_groupfiltered'].index])))
region_intersetc_data, Z = jaccard_heatmap(
        scplus_obj,
        method = 'intersect',
        gene_or_region_based = 'Region_based',
        save = os.path.join(plot_dir, 'hm_overlap_regions_manuel_selection_w_cbar_top_5.pdf'),
        use_plotly = False,
        selected_regulons = selected_markers,
        signature_key = 'eRegulon_signatures_filtered',
        figsize = (10, 10), return_data = True, vmax = 0.4, cmap = 'plasma')

fig, ax = plt.subplots(figsize = (10, 10))
sns.heatmap(region_intersetc_data, ax = ax, cmap = 'plasma', vmax = 0.4, cbar=False, xticklabels = False)
fig.tight_layout()
fig.savefig(os.path.join(plot_dir, 'hm_overlap_regions_manuel_selection_wo_cbar_top_5.pdf'))


clustered_jaccard_df,p_adj_df, Z = fisher_exact_test_heatmap(
	scplus_obj = scplus_obj,
	gene_or_region_based = 'Region_based',
	signature_key = 'eRegulon_signatures_filtered',
	selected_regulons = selected_markers,
	save = os.path.join(plot_dir, 'hm_overlap_regions_manuel_selection_w_cbar_top_5_fisher.pdf'),
	use_plotly = False,
	figsize = (10, 10), return_data = True, vmax = 3, cmap = 'plasma')

Main Figure 2, Pannel f: ChIP-seq coverage

eGRNs_of_interest = ['PAX5_+_(205r)', 'POU2AF1_+_(170r)', 'EBF1_+_(274r)', 'POU2F2_+_(489r)']

regions_POU2AF1 = set(scplus_obj.uns['eRegulon_signatures_filtered']['Region_based']['POU2AF1_+_(170r)'])
regions_POU2F2 = set(scplus_obj.uns['eRegulon_signatures_filtered']['Region_based']['POU2F2_+_(489r)'])
regions_EBF = set(scplus_obj.uns['eRegulon_signatures_filtered']['Region_based']['EBF1_+_(274r)'])
regions_PAX = set(scplus_obj.uns['eRegulon_signatures_filtered']['Region_based']['PAX5_+_(205r)'])

import pyranges as pr

summits_PAX5 = pr.read_bed('/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/data/ChIP-seq/summits/GM12878_PAX5_ENCSR000BHD.bed')
summits_EBF1 = pr.read_bed('/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/data/ChIP-seq/summits/GM12878_EBF1_ENCSR000BGU.bed')
summits_POU2F2 = pr.read_bed('/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/Figure_pbmc/v3_clustered_consensus/data/ENCFF934JFA.bed.gz')

import pyBigWig
bw_PAX5 = pyBigWig.open('/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/data/ChIP-seq/bigwig/GM12878_PAX5_ENCFF702MTT.bigWig')
bw_EBF1 = pyBigWig.open('/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/data/ChIP-seq/bigwig/GM12878_EBF1_ENCFF107LDM.bigWig')
bw_POU2F2 = pyBigWig.open('/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/Figure_pbmc/v3_clustered_consensus/data/ENCFF803HIP.bigWig')

def region_name_to_coord(r):
        return r.replace('-', ':').split(':')

def calc_coverage(regions, n_bins, bw, offset = 500):
        coverage = []
        for region in regions:
                chrom, start, end = region_name_to_coord(region)
                coverage.append(bw.stats(chrom, int(start) - offset, int(end) + offset, nBins = n_bins))
        return pd.DataFrame(coverage)

def calc_coverage_keep_region_names(regions, n_bins, bw, offset = 500):
	coverage = []
	for region in tqdm(regions, total = len(regions)):
		chrom, start, end = region_name_to_coord(region)
		coverage.append(bw.stats(chrom, int(start) - offset, int(end) + offset, nBins = n_bins))
	return pd.DataFrame(coverage, index = regions)

from scenicplus.utils import region_names_to_coordinates
from scenicplus.utils import coord_to_region_names
from pycistarget.utils import target_to_query
def intersect_w_summit(regions, summits):
        def _extend(r):
                chrom, start, end = r.replace('-', ':').split(':')
                return f'{chrom}:{int(start) - 249}-{int(end) + 250}'
        pr_regions = pr.PyRanges(region_names_to_coordinates(regions))
        target_query = target_to_query(summits, pr_regions).drop_duplicates(subset = 'Query', keep = 'first')
        overlapping_regions = [_extend(r) for r in target_query['Target']]
        non_overlapping_regions = set(regions) - set(target_query['Query'])
        return [*overlapping_regions, *non_overlapping_regions]

n_bins = 50

region_sets_of_interest = {'PAX5': regions_PAX, 'EBF1': regions_EBF, 'POU2F2': regions_POU2F2}
bw_of_interest = {'PAX5':bw_PAX5, 'EBF1':bw_EBF1 , 'POU2F2':bw_POU2F2}
summits_of_interest = {'PAX5': summits_PAX5, 'EBF1': summits_EBF1, 'POU2F2': summits_POU2F2}
eRegulons_of_interest = set(['PAX5', 'EBF1', 'POU2F2'])

coverages = {}
coverages_w_region_names = {}
from itertools import combinations
for ChIP_seq_track in eRegulons_of_interest:
	print(f'ChIP-seq: {ChIP_seq_track}')
	for n_region_sets in range(1, len(eRegulons_of_interest) + 1):
		for combination in combinations(eRegulons_of_interest, n_region_sets):
			others = eRegulons_of_interest - set(combination)
			regions = region_sets_of_interest[combination[0]]
			if n_region_sets > 1:
				for partner in combination[1:]:
					regions = regions & region_sets_of_interest[partner]
			if len(others) > 0:
				for other in others:
					regions = regions - region_sets_of_interest[other]
			print(f'c_{ChIP_seq_track}_r_{"_n_".join(combination)}_d_{"_".join(others)}: {len(regions)}')
			if len(regions) > 0:
				coverages[f'c_{ChIP_seq_track}_r_{"_n_".join(combination)}_d_{"_".join(others)}'] = calc_coverage(
					intersect_w_summit(regions, summits_of_interest[ChIP_seq_track]), n_bins, bw_of_interest[ChIP_seq_track])
			
color_dict = {
	'EBF1': 'Blue',
	'PAX5': 'Green',
	'POU2AF1': 'Red'
}

#set global min max for scaling
min_max_factor = {}
for ChIP_seq_track in eRegulons_of_interest:
        maxs = []
        mins = []
        for x in coverages.keys():
                if x.startswith(f'c_{ChIP_seq_track}') and len(x.split('_d_')[1]) > 0:
                        print(f'{ChIP_seq_track} -- {x}')
                        df = pd.DataFrame(coverages[x])
                        mins.append(df.mean(0).min())
                        maxs.append(df.mean(0).max())
        min_max_factor[ChIP_seq_track] = [min(mins), max(maxs)]

fig, axs = plt.subplots(ncols = 3, nrows = 3, figsize = (3.75, 5.75))
for j, region_set_1 in enumerate(eRegulons_of_interest):
        axs[0, j].set_title(region_set_1)
        for i, region_set_2 in enumerate(eRegulons_of_interest):
                axs[i, 0].set_ylabel(region_set_2)
                if j <= i: #this will only plot one triangle
                        for ChIP_seq_track in eRegulons_of_interest:
                                ChIP_min, ChIP_max = min_max_factor[ChIP_seq_track]
                                data_name = [
                                        x for x in coverages.keys()
                                        if x.startswith(f'c_{ChIP_seq_track}')
                                        and region_set_1 in x.split('_r_')[1].split('_d_')[0].split('_n_')
                                        and region_set_2 in x.split('_r_')[1].split('_d_')[0].split('_n_')][0]
                                print(f'{i}, {j}: {data_name} region 1: {region_set_1} region 2: {region_set_2}')
                                coverage = coverages[data_name]
                                axs[i, j].plot(
                                        np.arange(n_bins),
                                        (coverage.mean(0) - ChIP_min) / (ChIP_max - ChIP_min), #scale between global 1 and 0
                                        lw = 1,
                                        label = ChIP_seq_track,
                                        color = color_dict[ChIP_seq_track]
                                )
                                sns.despine(ax = axs[i, j])
                                axs[i,j].set_ylim(0,1)
                                axs[i, j].set_yticks([0, 0.25, 0.5, 0.75,  1])
                                axs[i,j].set_xticks(np.arange(0, 51, 16.66666))
                                axs[i, j].set_xticklabels(np.round((np.arange(0, 51, 16.66666) * 30 - 249.5)).astype(int), fontsize = 5)
                                axs[i, j].set_axisbelow(True)
                                axs[i, j].grid()
                else:
                        sns.despine(ax = axs[i, j], top = True, bottom = True, left = True, right = True)
                        axs[i, j].set_xticks([])
                        axs[i, j].set_yticks([])
handles, labels = axs[0,0].get_legend_handles_labels()
fig.legend(handles, labels, loc='center right', frameon = False)
fig.tight_layout()
fig.savefig(os.path.join(plot_dir, 'aggreation_chip_combinations_POU2AF1_EBF1_PAX5.pdf'))

Supplementary Figure S9, pannel m: Accessibility in B Cells

B_cell_bw = pyBigWig.open('/staging/leuven/stg_00002/lcb/sdewin/PhD/Multiomics_pipeline/analysis/pbmc_granulocyte_sorted_10k/pycistopic/consensus_peak_calling/pseudobulk_bw_files/B_cells.bw')

coverages = {}
for n_region_sets in range(1, len(eRegulons_of_interest) + 1):
	for combination in combinations(eRegulons_of_interest, n_region_sets):
		others = eRegulons_of_interest - set(combination)
		regions = region_sets_of_interest[combination[0]]
		if n_region_sets > 1:
			for partner in combination[1:]:
				regions = regions & region_sets_of_interest[partner]
		if len(others) > 0:
			for other in others:
				regions = regions - region_sets_of_interest[other]
		print(f'c_{ChIP_seq_track}_r_{"_n_".join(combination)}_d_{"_".join(others)}: {len(regions)}')
		if len(regions) > 5:
			coverages[f'c_B_cell_r_{"_n_".join(combination)}_d_{"_".join(others)}'] = calc_coverage(regions, n_bins, B_cell_bw)

fig, ax = plt.subplots(figsize = (3.5, 3.5))
for region_combination in coverages.keys():
	region_combination_name = region_combination.split('_r_')[1]
	region_combination_coverage = coverages[region_combination]
	ax.plot(
		np.arange(n_bins), 
		region_combination_coverage.mean(0), 
		label = region_combination_name.split('_d_')[0],
		color = color_dict[region_combination_name],
		lw = 1)
a = list(pd.concat([coverages[x] for x in coverages.keys() if '_n_' in x], axis = 0)[24])
b = list(pd.concat([coverages[x] for x in coverages.keys() if '_n_' not in x], axis = 0)[24])
pval = round(mannwhitneyu(a, b, nan_policy = 'omit', alternative = 'greater').pvalue, 4)
ax.text(1, 5.2 , f'p = {pval}', fontsize = 8)
ax.set_ylabel('B cells Accessibility')
ax.set_xticks(np.arange(0, 51, 16.66666))
ax.set_xticklabels((np.arange(0, 51, 16.66666) * 30 - 249.5).astype(int))
sns.despine(ax = ax)
plt.legend(frameon = False,  ncol = 1, loc = 'best', columnspacing = 0.5, fontsize = 5)
plt.grid(which = 'both')
ax.set_axisbelow(True)
fig.tight_layout()
fig.savefig(os.path.join(plot_dir, 'ACC_B_cell.pdf'))

fig, ax = plt.subplots(figsize = (3.5, 3.5))
ax.plot(
	np.arange(n_bins),
	pd.concat([coverages[x] for x in coverages.keys() if '_n_' in x]).mean(0),
	label = '> 1',
	color = 'blue',
	lw = 1)
ax.plot(
        np.arange(n_bins),
        pd.concat([coverages[x] for x in coverages.keys() if '_n_' not in x]).mean(0),
        label = '< 1',
        color = 'red',
        lw = 1)
ax.set_ylabel('B cells Accessibility')
ax.set_xticks(np.arange(0, 51, 16.66666))
ax.set_xticklabels((np.arange(0, 51, 16.66666) * 30 - 249.5).astype(int))
sns.despine(ax = ax)
plt.legend(frameon = False,  ncol = 1, loc = 'best', columnspacing = 0.5, fontsize = 5)
plt.grid(which = 'both')
ax.set_axisbelow(True)
fig.tight_layout()
fig.savefig(os.path.join(plot_dir, 'ACC_B_cell_1_v_2.pdf'))

Supplementary Figure S9, pannel L: Cell type specificity

expr_thr = 0
B_cells = scplus_obj.metadata_cell.query('GEX_celltype == "B_cells"').index.to_list()
X_B_cells = scplus_obj.to_df('EXP').loc[B_cells]

genes_PAX5 = set(scplus_obj.uns['eRegulon_signatures_filtered']['Gene_based']['PAX5_+_(132g)'])
genes_POU2AF1 = set(scplus_obj.uns['eRegulon_signatures_filtered']['Gene_based']['POU2AF1_+_(123g)'])
genes_EBF1 = set(scplus_obj.uns['eRegulon_signatures_filtered']['Gene_based']['EBF1_+_(161g)'])

gene_sets_of_interest = {'PAX5': genes_PAX5, 'EBF1': genes_EBF1, 'POU2AF1': genes_POU2AF1}
eRegulons_of_interest = set(['PAX5', 'EBF1', 'POU2AF1'])

from itertools import combinations
gene_sets = {}
for n_gene_sets in range(1, len(eRegulons_of_interest) + 1):
        for combination in combinations(eRegulons_of_interest, n_gene_sets):
                others = eRegulons_of_interest - set(combination)
                genes = gene_sets_of_interest[combination[0]]
                if n_gene_sets > 1:
                        for partner in combination[1:]:
                                genes = genes & gene_sets_of_interest[partner]
                if len(others) > 0:
                        for other in others:
                                genes = genes - gene_sets_of_interest[other]
                	print(f'{"_n_".join(combination)}_d_{"_".join(others)}: {len(genes)}')
                	if len(genes) > 5:
                        	gene_sets[f'{"_n_".join(combination)}_d_{"_".join(others)}'] = genes
		else:
			print(f'{"_n_".join(combination)}: {len(genes)}')
			gene_sets[f'{"_n_".join(combination)}'] = genes


import tspex
X = scplus_obj.to_df('EXP')
ct_indeces = {ct: scplus_obj.metadata_cell.query('GEX_celltype == @ct').index for ct in set(scplus_obj.metadata_cell['GEX_celltype'])}
X_bulk = {}
for ct in ct_indeces:
        X_bulk[ct] = np.log1p((X.loc[ct_indeces[ct]].T / X.loc[ct_indeces[ct]].T.sum() *1e6).mean(1))
X_bulk=pd.DataFrame(X_bulk)

flatten_list = lambda t: [item for sublist in t for item in sublist]
scale = lambda X: [(x - min(X)) / (max(X) - min(X)) for x in X]
spm = tspex.TissueSpecificity(X_bulk.loc[flatten_list(gene_sets.values())], 'tau', log = False, transform = False).tissue_specificity
spm.update(pd.Series(scale(spm.values), index = spm.index))

tau_gene_sets = {combo: spm.loc[list(gene_sets[combo])] for combo in gene_sets.keys()}
data = pd.DataFrame(
        [(np.repeat(combo, len(tau_gene_sets[combo])), tau_gene_sets[combo]) for combo in gene_sets.keys()],
        columns = ['Gene_set', 'tau']).explode(['Gene_set', 'tau']).reset_index(drop = True)
data['label'] = [x.split('_d_')[0].replace('_n_', '\n') for x in data['Gene_set']]
data['label'] = data['label'].astype('category')
data['tau'] = data['tau'].astype(float)

color_dict['EBF1_n_PAX5_n_POU2AF1'] = '#4CB963'
color_dict_genes = {x.split('_d_')[0].replace('_n_', '\n'): color_dict[x] for x in color_dict.keys()}

data['number_of_factors'] = ['<2' if '\n' not in x else '>=2' for x in data['label']]

a = data.query('number_of_factors == "<2"')['tau'].tolist()
b = data.query('number_of_factors == ">=2"')['tau'].tolist()
pval = round(mannwhitneyu(b, a, nan_policy = 'omit', alternative = 'greater').pvalue, 4)

fig, ax = plt.subplots(figsize = (3.5, 3.5))
sns.boxplot(
	ax = ax, 
	y = 'label', 
	x = 'tau', 
	data = data, 
	order = [x.split('_d_')[0].replace('_n_', '\n') for x in gene_sets.keys()], 
	palette = color_dict_genes)
ax.text(0.9, 3.5, f'p = {pval}', fontsize = 5)
sns.despine(ax = ax)
ax.set_ylabel(None)
plt.grid()
ax.set_axisbelow(True)
fig.tight_layout()
fig.savefig(os.path.join(plot_dir, 'tau.pdf'))

fig, ax = plt.subplots(figsize = (3.75, 3.5))
sns.boxplot(
        ax = ax,
	y = 'number_of_factors',
	x = 'tau',
	data = data,
	palette = {'<2': 'red', '>=2': 'blue'})
sns.despine(ax = ax)
ax.set_ylabel(None)
plt.grid()
ax.set_axisbelow(True)
fig.tight_layout()
fig.savefig(os.path.join(plot_dir, 'tau_1_2_.pdf'))

Export Region-to-gene links for visualization in R

import os
import dill

work_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/Multiomics_pipeline/analysis/pbmc_granulocyte_sorted_10k/scenicplus_clustered_consensus'

## Load data

scplus_obj = dill.load(
        open(os.path.join(work_dir, 'scplus_obj.pkl'), 'rb')
)

with open(os.path.join(work_dir, 'scplus_cell_names.txt'), 'w') as f:
	for cell_barcode in scplus_obj.cell_names:
		f.write(cell_barcode.split('-10x_pbmc')[0])
		f.write('\n')

with open(os.path.join(work_dir, 'scplus_region_names.txt'), 'w') as f:
	for region_name in scplus_obj.region_names:
		f.write(region_name.replace(':', '-'))
		f.write('\n')

with open(os.path.join(work_dir, 'scplus_gene_names.txt'), 'w') as f:
	for gene_name in scplus_obj.gene_names:
		f.write(gene_name)
		f.write('\n')

scplus_obj.metadata_cell.to_csv(os.path.join(work_dir, 'scplus_metadata_cell.tsv'), sep = '\t')

save_path = work_dir
path_bedToBigBed = '/staging/leuven/stg_00002/lcb/sdewin/PhD/De_Winter_hNTorg/COMBINED_ANALYSIS/r2g/'
biomart_host = 'http://sep2019.archive.ensembl.org'
assembly = 'hg38'
species = 'hsapiens'
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
r2g_data = export_to_UCSC_interact(scplus_obj,
		    species,
		    os.path.join(save_path,'r2g.rho.bed'),
		    path_bedToBigBed=path_bedToBigBed,
		    bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
		    region_to_gene_key='region_to_gene',
		    pbm_host=biomart_host,
		    assembly=assembly,
		    ucsc_track_name='R2G',
		    ucsc_description='SCENIC+ region to gene links',
		    cmap_neg='Reds',
		    cmap_pos='Greens',
		    key_for_color='rho',
		    scale_by_gene=False,
		    subset_for_eRegulons_regions=True,
		    eRegulons_key='eRegulons')
r2g_data = export_to_UCSC_interact(scplus_obj,
		    species,
		    os.path.join(save_path,'r2g.importance.bed'),
		    path_bedToBigBed=path_bedToBigBed,
		    bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
		    region_to_gene_key='region_to_gene',
		    pbm_host=biomart_host,
		    assembly=assembly,
		    ucsc_track_name='R2G',
		    ucsc_description='SCENIC+ region to gene links',
		    cmap_neg='Reds',
		    cmap_pos='Greens',
		    key_for_color='importance',
		    scale_by_gene=True,
		    subset_for_eRegulons_regions=True,
		    eRegulons_key='eRegulons')

from scenicplus.utils import export_eRegulons
regions = export_eRegulons(scplus_obj,
	os.path.join(save_path,'eRegulons.bed'),
	assembly,
	eRegulon_signature_key = 'eRegulon_signatures_filtered',
	bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
	path_bedToBigBed=path_bedToBigBed)

Main Figure 2, pannel g and Supplementary Figure S9, pannel k: Genome Browser plots

# create signac object
library(Signac)
library(Seurat)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg38)

set.seed(1234)

counts <- Read10X_h5('/staging/leuven/stg_00002/lcb/sdewin/PhD/Multiomics_pipeline/analysis/pbmc_combined/data/pbmc_granulocyte_sorted_10k/outs/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5')
fragpath <- '/staging/leuven/stg_00002/lcb/sdewin/PhD/Multiomics_pipeline/analysis/pbmc_combined/data/pbmc_granulocyte_sorted_10k/outs/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz'
work_dir <- '/staging/leuven/stg_00002/lcb/sdewin/PhD/Multiomics_pipeline/analysis/pbmc_granulocyte_sorted_10k/scenicplus_clustered_consensus'
plot_dir <- '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/Figure_pbmc/v3_clustered_consensus/plots'


annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
seqlevelsStyle(annotation) <- "UCSC"

# create a Seurat object containing the RNA adata
pbmc <- CreateSeuratObject(
  counts = counts$`Gene Expression`,
  assay = "RNA"
)

# create ATAC assay and add it to the object
pbmc[["ATAC"]] <- CreateChromatinAssay(
  counts = counts$Peaks,
  sep = c(":", "-"),
  fragments = fragpath,
  annotation = annotation
)

# load data
library(SCopeLoomR)
loom <- open_loom(paste0(work_dir, '/SCENIC+_gene_based.loom'))
cell_data <- get_cell_annotation(loom)

cells_to_keep <- gsub('-10x_pbmc', '', rownames(cell_data))

pbmc <- subset(pbmc, cells = cells_to_keep)

r2g_links <- read.table(
	paste0(work_dir, '/r2g.rho.bed'), sep = '\t', skip = 1,
	col.names = c('chrom', 'chromStart', 'chromEnd', 'name', 'score', 'value', 'exp',
       		      'color', 'sourceChrom', 'sourceStart', 'sourceEnd', 'sourceName',
       	              'sourceStrand', 'targetChrom', 'targetStart', 'targetEnd', 'targetName',
       	              'targetStrand'))

cell_metadata <- read.table(paste0(work_dir, '/scplus_metadata_cell.tsv'), sep = '\t', header = T)
rownames(cell_metadata) = cell_metadata$ACC_barcode
pbmc <- AddMetaData(pbmc, cell_metadata)

DefaultAssay(pbmc) <- 'ATAC'

library(GenomicRanges)
r2g_links <- transform(r2g_links, start_ = pmin(sourceStart, targetEnd))
r2g_links <- transform(r2g_links, end_ = pmax(sourceStart, targetEnd))
r2g_links <- r2g_links[c('chrom', 'start_', 'end_', 'targetName', 'value', 'chrom', 'chromStart', 'chromEnd')]
colnames(r2g_links) <- c('seqnames', 'start', 'end', 'gene', 'score', 'chrom', 'chromStart', 'chromEnd')

eRegulon_regions <- read.table(paste0(work_dir, '/eRegulons.bed'), sep = '\t')
colnames(eRegulon_regions) <- c('seqnames', 'start', 'end', 'name')


color_dict = c(
        'B_cells' = '#FF5714',
        'CD14+_Monocytes' = '#6EEB83',
        'FCGR3A+_Monocytes' = '#65B891',
        'Dendritic_cells_1' = '#575761',
        'Dendritic_cells_2' = '#1BE7FF',
        'CD4_T_cells' = '#1D3461',
        'CD8_T_cells' = '#3D5A6C',
        'NK_cells' = '#64B6AC')

selected_eRegulons <- c('POU2AF1_+', 'EBF1_+', 'PAX5_+')

color_dict_eRegulons = c(
	'POU2AF1_+' = 'Red',  
	'EBF1_+' = 'Green',  
	'PAX5_+' = 'Blue',
	'EBF1_+, POU2AF1_+' = 'goldenrod1',
	'EBF1_+, PAX5_+' =  'cyan4',
	'PAX5_+, POU2AF1_+' = 'purple4',
	'EBF1_+, PAX5_+, POU2AF1_+' = 'black'
)

bw_PAX5 <- '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/data/ChIP-seq/bigwig/GM12878_PAX5_ENCFF702MTT.bigWig'
bw_EBF1 <- '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/data/ChIP-seq/bigwig/GM12878_EBF1_ENCFF107LDM.bigWig'
bw_POU2F2 <- '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/Figure_pbmc/v3_clustered_consensus/data/ENCFF803HIP.bigWig'

library(dplyr)
eRegulon_regions <- subset(eRegulon_regions, name %in% selected_eRegulons)
eRegulon_regions <- eRegulon_regions[order(eRegulon_regions$name), ]
eRegulon_regions <- eRegulon_regions %>%
                        group_by(seqnames, start, end) %>%
                        mutate(name = paste0(name, collapse = ", "))
gr_eRegulon_regions <- makeGRangesFromDataFrame(eRegulon_regions, keep.extra.columns = TRUE)
r2g_links_gene <- subset(r2g_links, gene == "BLNK")
r2g_links_gr <- makeGRangesFromDataFrame(subset(r2g_links_gene, chromStart %in% eRegulon_regions$start)[c('seqnames', 'start', 'end', 'gene', 'score')], keep.extra.columns = TRUE)
Links(pbmc) <- r2g_links_gr

gene_to_region_of_interest = c(
	'BLNK' = 'chr10-96226082-96316945',
	'PLEKHG1' = 'chr6-150562217-150730918',
	'CD22' ='chr19-35316521-35485142'
)

library(ggplot2)
library(RColorBrewer)

for(gene in names(gene_to_region_of_interest)) {
	print(gene)
	region = gene_to_region_of_interest[[gene]]
	eRegulon_regions <- eRegulon_regions[eRegulon_regions[['name']] %in% selected_eRegulons, ]
	eRegulon_regions <- eRegulon_regions[order(eRegulon_regions$name), ]
	eRegulon_regions <- eRegulon_regions %>%
				group_by(seqnames, start, end) %>%
				mutate(name = paste0(name, collapse = ", "))
	gr_eRegulon_regions <- makeGRangesFromDataFrame(eRegulon_regions, keep.extra.columns = TRUE)
	r2g_links_gene <- r2g_links[r2g_links[['gene']] == gene, ]
	r2g_links_gr <- makeGRangesFromDataFrame(r2g_links_gene[r2g_links_gene[['chromStart']] %in% eRegulon_regions$start,][c('seqnames', 'start', 'end', 'gene', 'score')], keep.extra.columns = TRUE)
	Links(pbmc) <- r2g_links_gr
	peak_plot <- PeakPlot(
		object = pbmc,
		region = region,
		peaks = gr_eRegulon_regions,
		group.by = 'name'
	) & scale_fill_manual(values=color_dict_eRegulons)
	cov_plot <- CoveragePlot(
	  object = pbmc,
	  region = region,
	  group.by = 'GEX_celltype',
	  annotation = FALSE,
	  peaks = FALSE,
	  links=FALSE,
	  bigwig = list('PAX5 ChIP-seq' = bw_PAX5, 'EBF1 ChIP-seq' = bw_EBF1, 'POU2F2 ChIP-seq' = bw_POU2F2),
	  bigwig.scale = 'separate'
	)  & scale_fill_manual(values=color_dict)
	cov_plot_wo_bw <- CoveragePlot(
          object = pbmc,
          region = region,
          group.by = 'GEX_celltype',
          annotation = FALSE,
          peaks = FALSE,
          links=FALSE
        )  & scale_fill_manual(values=color_dict)
	expr_plot <- ExpressionPlot(
		object = pbmc,
		features = c(gene, 'PAX5', 'POU2AF1', 'EBF1'),
		assay = 'RNA',
		group.by = 'GEX_celltype'
	) & scale_fill_manual(values=color_dict)
	# Annotation
	gene_plot <- AnnotationPlot(
	  object = pbmc,
	  region = region
	)
	# Links
	link_plot <- LinkPlot(
	  object = pbmc,
	  region = region,
	  min.cutoff = -1) + scale_colour_gradientn(colours = brewer.pal(n = 9, name = "Blues"), limits=c(0, NA))
	library(patchwork)
	p <- CombineTracks(
	  plotlist = list(cov_plot, peak_plot, link_plot, gene_plot),
	  heights = c(5, 1, 1, 1),
	  widths = c(10,4)
	)
	pdf(paste0(plot_dir, paste0('/R_R2G_', gene, '.pdf')))
	print(p)
	dev.off()	
	p <- CombineTracks(
		plotlist = list(cov_plot_wo_bw, peak_plot, link_plot, gene_plot),
		expression.plot = expr_plot,
		heights = c(5, 1, 1, 1),
		widths = c(10,4))
	pdf(paste0(plot_dir, paste0('/R_R2G_', gene, '_w_expr.pdf')))
	print(p)
	dev.off()
}

saveRDS(pbmc, paste0(work_dir, 'SIGNAC_obj.RDS'))

Supplementary Figure S9, pannel c: Comparison with HiC

import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['svg.fonttype'] = 'none'

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import os

wdir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/revisions_nat_meth/figures_reviewer_questions/HiC_pbmc'
data_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/revisions_nat_meth/HIC_PBMC'

merged_data_hic = pd.read_csv(
	os.path.join(data_dir, 'merged_data_hic.tsv'), sep = '\t')

merged_data_scplus = pd.read_csv(
	os.path.join(data_dir, 'merged_data_scplus.tsv'), sep = '\t')

import dill
scplus_obj = dill.load(
        open('/staging/leuven/stg_00002/lcb/sdewin/PhD/Multiomics_pipeline/analysis/pbmc_granulocyte_sorted_10k/scenicplus_clustered_consensus/scplus_obj.pkl', 'rb'))

from scenicplus.utils import *
selected_regions = list(set(flatten_list([scplus_obj.uns['eRegulon_signatures_filtered']['Region_based'][x] for x in   [x for x in scplus_obj.uns['selected_eRegulon']['Region_based'] if '_+_' in x]])))
selected_genes = list(set(flatten_list([scplus_obj.uns['eRegulon_signatures_filtered']['Gene_based'][x] for x in   [x for x in scplus_obj.uns['selected_eRegulon']['Gene_based'] if '_+_' in x]])))

n_regions_per_gene = scplus_obj.uns['eRegulon_metadata_filtered'].query('Gene in @selected_genes').query('Region in @selected_regions')[['Gene', 'Region']].drop_duplicates().groupby('Gene').count()['Region'].to_numpy()


#histogram of nr of regions per gene and rank
fig, axs = plt.subplots(nrows = 2, ncols = 2, figsize = (8,8), sharex = 'col', sharey = 'col')
axs[0, 0].hist(
	merged_data_scplus.groupby('Gene')['Region'].apply(lambda x: len(set(x))),
	bins = np.arange(merged_data_scplus.groupby('Gene')['Region'].apply(lambda x: len(set(x))).max()), color = 'gray', edgecolor = 'black')
axs[1, 0].hist(
	merged_data_hic.groupby('Gene')['Region'].apply(lambda x: len(set(x))),
	bins = np.arange(merged_data_hic.groupby('Gene')['Region'].apply(lambda x: len(set(x))).max()), color = 'gray', edgecolor = 'black')
rank_max_importance_splus = merged_data_scplus.groupby('Region')[['rank_gene', 'R2G_importance']].apply(lambda x: np.array(x['rank_gene'])[np.argmax(x['R2G_importance'])])
axs[0, 1].hist(
	rank_max_importance_splus,
	bins = np.arange(rank_max_importance_splus.max()), color = 'gray', edgecolor = 'black')
rank_max_score = merged_data_hic.groupby('Region')[['rank_gene', 'Score']].apply(lambda x: np.array(x['rank_gene'])[np.argmax(x['Score'])])
axs[1, 1].hist(
	rank_max_score,
	bins = np.arange(rank_max_score.max()), color = 'gray', edgecolor = 'black')
for ax in axs.ravel():
	sns.despine(ax = ax)
axs[0, 0].set_ylabel('Frequency')
axs[1, 0].set_ylabel('Frequency')
axs[0, 0].set_title('SCENIC+: # regions per gene')
axs[0, 1].set_title('SCENIC+: nth. gene per region')
axs[1, 0].set_title('Hi-C: # regions per gene')
axs[1, 1].set_title('Hi-C: nth. gene per region')
fig.tight_layout()
fig.savefig(os.path.join(wdir, 'hist_n_regions_per_gene_and_rank.pdf'))
fig.savefig(os.path.join(wdir, 'hist_n_regions_per_gene_and_rank.png'))

hic_correlation_data = pd.read_csv(
	os.path.join(data_dir, 'correlation_w_hic_B_cell_marker_genes.tsv'), sep = '\t')

order = ['GBM_rnd', 'GBM', 'rho_rnd', 'rho']
hic_correlation_data['method'] = pd.Categorical(hic_correlation_data['method'], categories = order)
color_dict = {
	'GBM_rnd': '#D0CFEC', 
	'GBM': '#3454D1', 
	'rho_rnd': '#F8C0C8', 
	'rho': '#CC5A71'}
sns.set_palette([color_dict[t] for t in order])


from scipy.stats import mannwhitneyu
pairs = [('GBM_rnd', 'GBM'), ('rho_rnd', 'rho')]
p_values = [
	mannwhitneyu(hic_correlation_data.loc[hic_correlation_data['method'] == t1, 'correlation'], hic_correlation_data.loc[hic_correlation_data['method'] == t2, 'correlation']).pvalue
	for t1, t2 in pairs]

p_values_formated = [f'p={p:.1e}' for p in p_values]
from statannotations.Annotator import Annotator

fig, ax = plt.subplots()
sns.boxplot(
	data = hic_correlation_data,
	x = 'method', y = 'correlation')
ax.hlines(y = 0, xmin = -0.5, xmax = 4, ls = 'dashed', color = 'gray')
ax.set_ylim((-1, 1))
annotator = Annotator(ax, pairs, data = hic_correlation_data, x = 'method', y = 'correlation')
annotator.set_custom_annotations(p_values_formated)
annotator.annotate()
sns.despine(ax = ax)
fig.tight_layout()
fig.savefig(os.path.join(wdir, 'bxplot_corr_hic.pdf'))
fig.savefig(os.path.join(wdir, 'bxplot_corr_hic.png'))

sum(merged_data_scplus.groupby('Gene')['Region'].apply(lambda x: len(set(x))).values < 10) #5928
len(set(merged_data_scplus['Gene'])) #6605

sum(merged_data_hic.groupby('Gene')['Region'].apply(lambda x: len(set(x))).values < 10) #4047
len(set(merged_data_hic['Gene'])) #6825

sum(rank_max_importance_splus == 1) #11533
len(rank_max_importance_splus) #23470

sum(rank_max_score == 1) #17903
len(rank_max_score) #23450

Supplementary Figure S9, pannel e: TFs found by SCENIC+, Signac and ArchR

import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['svg.fonttype'] = 'none'

outdir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/revisions_nat_meth/figures_reviewer_questions/pbmc_signac_archr'

import os
import pandas as pd
import numpy as np

import dill
scplus_folder = "/staging/leuven/stg_00002/lcb/sdewin/PhD/Multiomics_pipeline/analysis/pbmc_granulocyte_sorted_10k/scenicplus_clustered_consensus"
scplus_obj = dill.load(
        open(os.path.join(scplus_folder, 'scplus_obj.pkl'), 'rb'))

data_folder = "/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/revisions_nat_meth/extra_methods/pbmc/extract_data"

archr_signac_outdir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/archr_signac_comparison/pbmc/'

tool_to_data = {
	'SCENIC+': scplus_obj.uns['eRegulon_metadata_filtered'].copy(),
	'Signac DEM': pd.read_table(os.path.join(archr_signac_outdir, 'find_motifs_dars_matrix.tsv')),
	'Signac ChromVar': pd.read_table(os.path.join(archr_signac_outdir, 'chromVAR_cistromes.tsv')),
	'ArchR motif DEM': pd.read_table(os.path.join(archr_signac_outdir, 'ArchR/ArchR_motifs_DARs_matrix.tsv')),
	'ArchR track DEM': pd.read_table(os.path.join(archr_signac_outdir, 'ArchR/ArchR_ENCODE_DARs_matrix.tsv')),
	'ArchR ChromVar': pd.read_table(os.path.join(archr_signac_outdir, 'ArchR/ArchR_chromVAR_cistromes_matrix.tsv'))}

from tqdm import tqdm
def split_dimers_into_two_columns(df: pd.DataFrame):
	for column in tqdm(df.columns, total = len(df.columns)):
		if '::' in column:
			col1, col2 = column.split('::')
			if col1 in df.columns:
				df[col1] = df[col1].to_numpy() + df[column].to_numpy()
			else:
				df[col1] = df[column].to_numpy()
			if col2 in df.columns:
				df[col2] = df[col2].to_numpy() + df[column].to_numpy()
			else:
				df[col2] = df[column].to_numpy()
			df.drop(column, axis = 1, inplace = True)

for tool in set(tool_to_data.keys()) - set(['SCENIC+']):
	print(tool)
	split_dimers_into_two_columns(tool_to_data[tool])


tool_to_data['ArchR track DEM'].columns =  [x.split('_')[0].split('-')[0] for x in tool_to_data['ArchR track DEM'].columns]

tool_to_data['ArchR track DEM'] = tool_to_data['ArchR track DEM'].groupby(lambda x: x, axis = 1).sum()

tool_to_data['SCENIC+']['Consensus_name'] = [x.split('_(')[0] for x in tool_to_data['SCENIC+']['Region_signature_name']]

tool_to_data['SCENIC+'].rename({'Region': 'region', 'Gene': 'gene'}, axis = 1, inplace = True)

TFs_all_tools = list(set(tool_to_data['SCENIC+']['TF']))

TFs_all_tools = []
for tool in set(tool_to_data.keys()) - set(['SCENIC+']):
	TFs_all_tools.extend(list(tool_to_data[tool].columns[tool_to_data[tool].sum() >= 10]))
TFs_all_tools = set(TFs_all_tools)

TFs_for_tools = pd.DataFrame(
	index = TFs_all_tools,
	columns = tool_to_data.keys()).fillna(0)

TFs_for_tools.loc[tool_to_data['SCENIC+']['TF']] = 1 

for tool in set(tool_to_data.keys() - set(['SCENIC+'])):
	TFs_for_tools.loc[list(tool_to_data[tool].columns[tool_to_data[tool].sum() >= 10]), tool] = 1

TFs_for_tools = TFs_for_tools.loc[list(set(TFs_for_tools.index) & set(scplus_obj.gene_names))]

import seaborn as sns
import matplotlib.pyplot as plt

g = sns.clustermap(
	TFs_for_tools.T,
	cmap = ['Red', 'Green'],
	yticklabels = True,
	xticklabels = False,
	figsize = (13, 4),
	dendrogram_ratio = 0.05,
	**dict(cbar_kws=dict(ticks=[0, 1], orientation='horizontal', shrink = 0.01)))
x0, _y0, _w, _h = g.cbar_pos
g.ax_heatmap.set_yticklabels(labels = g.ax_heatmap.get_yticklabels(), va = 'center', rotation = 0)
g.ax_heatmap.set_xticklabels(labels = g.ax_heatmap.get_xticklabels(), ha = 'right', rotation = 45, rotation_mode="anchor")
g.ax_cbar.set_position([0.02, 0.98, 0.01, 0.02])
g.ax_cbar.set_title('TF found')
#g.ax_cbar.set_ticks([0.25,0.75])
#g.ax_cbar.set_ticklabels(['0', '1'])
g.savefig(os.path.join(outdir, "scplus_TFs_found_by_methods.pdf"))
g.savefig(os.path.join(outdir, "scplus_TFs_found_by_methods.png"), dpi = 300)


TFs_for_tools_w_cistromes = TFs_for_tools.copy()

cistrome_TFs = list(set([x.split('_')[0] for x in scplus_obj.uns['Cistromes']['Unfiltered'].keys()]))

new_TFs = list(set(cistrome_TFs) - set(TFs_for_tools_w_cistromes.index))

TFs_for_tools_w_cistromes = pd.concat([pd.DataFrame(index = new_TFs, columns = TFs_for_tools_w_cistromes.columns), TFs_for_tools_w_cistromes]).fillna(0)

TFs_for_tools_w_cistromes.loc[cistrome_TFs, 'SCENIC+ Cistromes'] = 1
TFs_for_tools_w_cistromes = TFs_for_tools_w_cistromes.fillna(0)

g = sns.clustermap(
	TFs_for_tools_w_cistromes.T,
	cmap = ['Red', 'Green'],
	yticklabels = True,
	xticklabels = False,
	figsize = (13, 4),
	dendrogram_ratio = 0.05,
	**dict(cbar_kws=dict(ticks=[0, 1], orientation='horizontal', shrink = 0.01)))
x0, _y0, _w, _h = g.cbar_pos
g.ax_heatmap.set_yticklabels(labels = g.ax_heatmap.get_yticklabels(), va = 'center', rotation = 0)
g.ax_heatmap.set_xticklabels(labels = g.ax_heatmap.get_xticklabels(), ha = 'right', rotation = 45, rotation_mode="anchor")
g.ax_cbar.set_position([0.02, 0.98, 0.01, 0.02])
g.ax_cbar.set_title('TF found')
#g.ax_cbar.set_ticks([0.25,0.75])
#g.ax_cbar.set_ticklabels(['0', '1'])
g.savefig(os.path.join(outdir, "scplus_TFs_found_by_methods_w_cistromes.pdf"))
g.savefig(os.path.join(outdir, "scplus_TFs_found_by_methods_w_cistromes.png"), dpi = 300)

Supplementary Figure S9, pannel f: TFs found by SCENIC+, CellOracle, FigR and SCENIC

import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['svg.fonttype'] = 'none'

outdir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/revisions_nat_meth/figures_reviewer_questions/pbmc_other_eGRN_methods'
import os
os.makedirs(outdir)

import os
import pandas as pd
import numpy as np

import dill
scplus_folder = "/staging/leuven/stg_00002/lcb/sdewin/PhD/Multiomics_pipeline/analysis/pbmc_granulocyte_sorted_10k/scenicplus_clustered_consensus"
scplus_obj = dill.load(
        open(os.path.join(scplus_folder, 'scplus_obj.pkl'), 'rb'))

data_folder = "/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/revisions_nat_meth/extra_methods/pbmc/extract_data"

tool_to_regulon_metadata = {
        'SCENIC': pd.read_table(os.path.join(data_folder, 'SCENIC_regulons.tsv')),
        'CellOracle': pd.read_table(os.path.join(data_folder, 'celloracle_eRegulons.tsv')),
        'FigR': pd.read_table(os.path.join(data_folder, 'figr_eRegulons.tsv')),
        'Pando': pd.read_table(os.path.join(data_folder, 'pando_eRegulons.tsv')),
        #'SCENIC+': scplus_obj.uns['eRegulon_metadata_filtered'].copy()}

tool_to_regulon_metadata['SCENIC']['TF'] = [x.split('_')[0] for x in tool_to_regulon_metadata['SCENIC']['TF']]

tool_to_regulon_metadata['SCENIC']['Consensus_name'] = tool_to_regulon_metadata['SCENIC']['TF']

tool_to_regulon_metadata['CellOracle']['Consensus_name'] = [
	f"{TF}_+" if score >= 0 else f"{TF}_-" for TF, score in tool_to_regulon_metadata['CellOracle'][['TF', 'coef_mean']].to_numpy()]

tool_to_regulon_metadata['FigR']['Consensus_name'] = [
	f"{TF}_+" if score >= 0 else f"{TF}_-" for TF, score in tool_to_regulon_metadata['FigR'][['TF', 'Corr']].to_numpy()]

tool_to_regulon_metadata['Pando']['Consensus_name'] = [
	 f"{TF}_+" if score >= 0 else f"{TF}_-" for TF, score in tool_to_regulon_metadata['Pando'][['TF', 'estimate']].to_numpy()]

tool_to_regulon_metadata['SCENIC+']['Consensus_name'] = [x.split('_(')[0] for x in tool_to_regulon_metadata['SCENIC+']['Region_signature_name']]

tool_to_regulon_metadata['SCENIC+'].rename({'Region': 'region', 'Gene': 'gene'}, axis = 1, inplace = True)

min_target_genes = 10
for tool in tool_to_regulon_metadata.keys():
	eRegulon_and_counts = tool_to_regulon_metadata[tool].groupby('Consensus_name')['gene'].apply(lambda x: len(set(x)))
	eRegulons_to_keep = eRegulon_and_counts.index[eRegulon_and_counts >= min_target_genes]
	tool_to_regulon_metadata[tool] = tool_to_regulon_metadata[tool].query('Consensus_name in @eRegulons_to_keep')

TFs_all_tools = []
for tool in tool_to_regulon_metadata.keys():
	TFs_all_tools.extend(tool_to_regulon_metadata[tool]['TF'])
TFs_all_tools = set(TFs_all_tools)

TFs_for_tools = pd.DataFrame(
	index = TFs_all_tools,
	columns = tool_to_regulon_metadata.keys()).fillna(0)
for tool in tool_to_regulon_metadata.keys():
	TFs_found_by_tool = list(set(tool_to_regulon_metadata[tool]['TF']))
	TFs_for_tools.loc[TFs_found_by_tool, tool] = 1

import seaborn as sns
import matplotlib.pyplot as plt

g = sns.clustermap(
	TFs_for_tools.T[[x.split('_')[0] for x in scplus_obj.uns['selected_eRegulon']['Gene_based']]],
	cmap = ['Red', 'Green'],
	yticklabels = True,
	xticklabels = True,
	figsize = (13, 4),
	dendrogram_ratio = 0.05,
	**dict(cbar_kws=dict(ticks=[0, 1], orientation='horizontal', shrink = 0.01)))
x0, _y0, _w, _h = g.cbar_pos
g.ax_heatmap.set_yticklabels(labels = g.ax_heatmap.get_yticklabels(), va = 'center', rotation = 0)
g.ax_heatmap.set_xticklabels(labels = g.ax_heatmap.get_xticklabels(), ha = 'right', rotation = 45, rotation_mode="anchor")
g.ax_cbar.set_position([0.02, 0.98, 0.01, 0.02])
g.ax_cbar.set_title('TF found')
#g.ax_cbar.set_ticks([0.25,0.75])
#g.ax_cbar.set_ticklabels(['0', '1'])
g.savefig(os.path.join(outdir, "scplus_TFs_found_by_methods.pdf"))
g.savefig(os.path.join(outdir, "scplus_TFs_found_by_methods.png"), dpi = 300)


g = sns.clustermap(
	TFs_for_tools.T,
	cmap = ['Red', 'Green'],
	yticklabels = True,
	xticklabels = False,
	figsize = (13, 4),
	dendrogram_ratio = 0.05,
	**dict(cbar_kws=dict(ticks=[0, 1], orientation='horizontal', shrink = 0.01)))
x0, _y0, _w, _h = g.cbar_pos
g.ax_heatmap.set_yticklabels(labels = g.ax_heatmap.get_yticklabels(), va = 'center', rotation = 0)
g.ax_heatmap.set_xticklabels(labels = g.ax_heatmap.get_xticklabels(), ha = 'right', rotation = 45, rotation_mode="anchor")
g.ax_cbar.set_position([0.02, 0.98, 0.01, 0.02])
g.ax_cbar.set_title('TF found')
#g.ax_cbar.set_ticks([0.25,0.75])
#g.ax_cbar.set_ticklabels(['0', '1'])
g.savefig(os.path.join(outdir, "TFs_found_by_methods.pdf"))
g.savefig(os.path.join(outdir, "TFs_found_by_methods.png"), dpi = 300)

Supplementary Fig S9, pannel g: GO enrichment on TFs found by Signac, SCENIC+ and ArchR

import pandas as pd

out_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/revisions_nat_meth/figures_reviewer_questions/pbmc_signac_archr'

import os

data_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/revisions_nat_meth/extra_methods/pbmc/pbmc_signac_archr/'
geo_result_files = {
	'SCENIC+': pd.read_csv(os.path.join(data_dir, 'SCPLUS_targets.csv')),
	'Signac': pd.read_csv(os.path.join(data_dir, 'Signac_targets.csv')),
	'ArchR':  pd.read_csv(os.path.join(data_dir, 'ArchR_targets.csv'))}

top_n = 10
top_on = 'negative_log10_of_adjusted_p_value'

geo_result = []
for tool in geo_result_files.keys():
	tmp = geo_result_files[tool]
	tmp = tmp.nlargest(top_n, top_on)
	tmp['Tool'] = tool
	geo_result.append(tmp)

geo_result = pd.concat(geo_result)

import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text
import seaborn as sns

fig, axs = plt.subplots(ncols = 3, figsize = (3 * 4, 5))
for tool,ax in zip(set(geo_result['Tool']), axs.ravel()):
	data = geo_result.loc[geo_result['Tool'] == tool]
	ax.scatter(
		y = np.arange(len(data))[::-1], x = data['negative_log10_of_adjusted_p_value'], color = 'black')
	texts = [ax.text(x, y, t) for x, y, t in zip(data['negative_log10_of_adjusted_p_value'], np.arange(len(data))[::-1], data['term_name'])]
	#adjust_text(texts)
	ax.set_title(f'{tool}')
	ax.set_xlabel('log10(Adjusted p value)')
	sns.despine(ax = ax)
fig.tight_layout()
fig.savefig(os.path.join(out_dir, 'geo_scatter.pdf'))
fig.savefig(os.path.join(out_dir, 'geo_scatter.png'))

Supplementary Fig S9, pannel h: Supplementary Fig S9, pannel g: GO enrichment on TFs found by Pando, SCENIC+ and CellOracle

import pandas as pd

out_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/revisions_nat_meth/figures_reviewer_questions/pbmc_other_eGRN_methods'

import os

data_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/revisions_nat_meth/extra_methods/pbmc/extract_data'
geo_result_files = {
	'SCENIC+': pd.read_csv(os.path.join(data_dir, 'gProfiler_scenic_plus.csv')),
	'CellOracle': pd.read_csv(os.path.join(data_dir, 'gProfiler_celloracle.csv')),
	'Pando':  pd.read_csv(os.path.join(data_dir, 'gProfiler_pando.csv')),
	'SCENIC': pd.read_csv(os.path.join(data_dir, 'gProfiler_scenic.csv'))}

top_n = 10
top_on = 'negative_log10_of_adjusted_p_value'

geo_result = []
for tool in geo_result_files.keys():
	tmp = geo_result_files[tool]
	tmp = tmp.nlargest(top_n, top_on)
	tmp['Tool'] = tool
	geo_result.append(tmp)

geo_result = pd.concat(geo_result)

import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text
import seaborn as sns

fig, axs = plt.subplots(ncols = 3, figsize = (3 * 4, 5), sharex = False)
for tool,ax in zip(set(geo_result['Tool']), axs.ravel()):
	data = geo_result.loc[geo_result['Tool'] == tool]
	ax.scatter(
		y = np.arange(len(data))[::-1], x = data['negative_log10_of_adjusted_p_value'], color = 'black')
	texts = [ax.text(x, y, t) for x, y, t in zip(data['negative_log10_of_adjusted_p_value'], np.arange(len(data))[::-1], data['term_name'])]
	#adjust_text(texts)
	ax.set_title(f'{tool}')
	ax.set_xlabel('log10(Adjusted p value)')
	sns.despine(ax = ax)
fig.tight_layout()
fig.savefig(os.path.join(out_dir, 'geo_scatter.pdf'))
fig.savefig(os.path.join(out_dir, 'geo_scatter.png'))

Supplementary Fig S9, pannel i: Accuracy of target regions identified by SCENIC+, Signac and ArchR

data_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/revisions_nat_meth/extra_methods/pbmc/pbmc_signac_archr'

import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['svg.fonttype'] = 'none'

outdir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/revisions_nat_meth/figures_reviewer_questions/pbmc_signac_archr'
out_dir = outdir

import pandas as pd
import os
data_f1 = pd.read_csv(os.path.join(data_dir, 'data_f1.tsv'), sep = '\t')
data_pr = pd.read_csv(os.path.join(data_dir, 'data_pr.tsv'), sep = '\t')
data_re = pd.read_csv(os.path.join(data_dir, 'data_re.tsv'), sep = '\t')

tool_to_color = {
	'SCENIC+ Cistromes': '#f70404',
	'Signac ChromVar': '#3772FF',
	'Signac DEM': '#30BCED',
	'ArchR motif DEM': '#44AF69',
	'ArchR ChromVar': '#008148',
	'ArchR track DEM': '#C8D5B9'}

def plot(params, ax):
	sns.violinplot(
		inner = None,
		ax = ax,
		**params)
	sns.stripplot(
		color="#2b2b2b", size = 2,
		ax = ax,
		**params)

def set_y_log(ax):
	ax.yaxis.set_major_formatter(mticker.StrMethodFormatter("$10^{{{x:.0f}}}$"))
	ymin, ymax = ax.get_ylim()
	tick_range = np.arange(np.floor(ymin), ymax)
	ax.yaxis.set_ticks(tick_range)
	ax.yaxis.set_ticks([np.log10(x) for p in tick_range for x in np.linspace(10 ** p, 10 ** (p + 1), 10)], minor=True)

import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
from itertools import combinations
from scenicplus.utils import p_adjust_bh
from statannotations.Annotator import Annotator
from matplotlib import ticker as mticker
import numpy as np

fig, axs = plt.subplots(ncols = 3, nrows = 1, figsize = (12, 4))
order = data_f1.groupby('Tool')['F1'].median().sort_values(ascending = False).index
data_f1['Tool'] = pd.Categorical(data_f1['Tool'], categories = order)
sns.set_palette(sns.color_palette([tool_to_color[tool] for tool in order]))
plot(params = {'data': data_f1, 'x': 'Tool','y': 'F1'}, ax = axs[0])
axs[0].set_xticklabels(order, rotation=45, ha='right')
set_y_log(axs[0])
sns.despine(ax = axs[0])
pairs = []
for i in range(0, len(order) - 1, 1):
	pairs.append((order[i], order[i + 1]))
pairs = pairs[::-1]
p_values = [mannwhitneyu(data_f1.loc[data_f1['Tool'] == t1, 'F1'], data_f1.loc[data_f1['Tool'] == t2, 'F1']).pvalue for t1, t2 in pairs]
p_values = p_adjust_bh(p_values)
p_values_formatted = [f'p={p:.1e}' for p in p_values]
annotator = Annotator(axs[0], pairs, **{'data': data_f1, 'x': 'Tool','y': 'F1'})
annotator.set_custom_annotations(p_values_formatted)
annotator.annotate()
order = data_pr.groupby('Tool')['Precision'].median().sort_values(ascending = False).index
data_pr['Tool'] = pd.Categorical(data_pr['Tool'], categories = order)
sns.set_palette(sns.color_palette([tool_to_color[tool] for tool in order]))
plot(params = {'data': data_pr, 'x': 'Tool','y': 'Precision'}, ax = axs[1])
axs[1].set_xticklabels(order, rotation=45, ha='right')
set_y_log(axs[1])
sns.despine(ax = axs[1])
pairs = []
for i in range(0, len(order) - 1, 1):
	pairs.append((order[i], order[i + 1]))
pairs = pairs[::-1]
p_values = [mannwhitneyu(data_pr.loc[data_pr['Tool'] == t1, 'Precision'], data_pr.loc[data_pr['Tool'] == t2, 'Precision']).pvalue for t1, t2 in pairs]
p_values = p_adjust_bh(p_values)
p_values_formatted = [f'p={p:.1e}' for p in p_values]
annotator = Annotator(axs[1], pairs, **{'data': data_pr, 'x': 'Tool','y': 'Precision'})
annotator.set_custom_annotations(p_values_formatted)
annotator.annotate()
order = data_re.groupby('Tool')['Recall'].median().sort_values(ascending = False).index
data_re['Tool'] = pd.Categorical(data_re['Tool'], categories = order)
sns.set_palette(sns.color_palette([tool_to_color[tool] for tool in order]))
plot(params = {'data': data_re,'x': 'Tool','y': 'Recall'}, ax = axs[2])
axs[2].set_xticklabels(order, rotation=45, ha='right')
set_y_log(axs[2])
sns.despine(ax = axs[2])
pairs = []
for i in range(0, len(order) - 1, 1):
	pairs.append((order[i], order[i + 1]))
pairs = pairs[::-1]
p_values = [mannwhitneyu(data_re.loc[data_re['Tool'] == t1, 'Recall'], data_re.loc[data_re['Tool'] == t2, 'Recall']).pvalue for t1, t2 in pairs]
p_values = p_adjust_bh(p_values)
p_values_formatted = [f'p={p:.1e}' for p in p_values]
annotator = Annotator(axs[2], pairs, **{'data': data_re, 'x': 'Tool','y': 'Recall'})
annotator.set_custom_annotations(p_values_formatted)
annotator.annotate()
fig.tight_layout()
fig.savefig(os.path.join(out_dir, 'pr_re_f1.pdf'))
fig.savefig(os.path.join(out_dir, 'pr_re_f1.png'))

Supplementary Fig S9, pannel j: Accuracy of target regions identified by SCENIC+, CellOracle and Pando

tool_to_n_regions = {}
for tool in ['SCENIC+', 'Pando', 'CellOracle']:
	tool_to_n_regions[tool] = tool_to_regulon_metadata[tool].groupby('Consensus_name')['region'].apply(lambda x: len(set(x))).values

tool_to_color = {
          'SCENIC+': '#f70404',
          'GRaNIE': '#f9a204',
          'Pando': '#1c94fb',
          'CellOracle': '#228b21',
	  'SCENIC': '#ff1494',
	  'FigR': '#a020f0'}

from scipy.stats import mannwhitneyu

SCENIC = np.log10(list(tool_to_n_regions.values())[0])
Pando = np.log10(list(tool_to_n_regions.values())[1])
CellOracle = np.log10(list(tool_to_n_regions.values())[2])

data = pd.concat([
	pd.DataFrame({'SCENIC+': SCENIC}).melt().rename({'variable': 'tool', 'value': 'log(n_regions)'}, axis = 1),
	pd.DataFrame({'Pando': Pando}).melt().rename({'variable': 'tool', 'value': 'log(n_regions)'}, axis = 1),
	pd.DataFrame({'CellOracle': CellOracle}).melt().rename({'variable': 'tool', 'value': 'log(n_regions)'}, axis = 1)
])

from statannotations.Annotator import Annotator
pvalues = [
	mannwhitneyu(SCENIC, Pando).pvalue,
	mannwhitneyu(Pando, CellOracle).pvalue,
	mannwhitneyu(SCENIC, CellOracle).pvalue]
formatted_pvalues = [f'p={pvalue:.1e}' for pvalue in pvalues]
pairs = [('SCENIC+', 'Pando'), ('Pando', 'CellOracle'), ('SCENIC+', 'CellOracle')]

plotting_parameters = {
	'data': data,
	'x': 'tool',
	'y': 'log(n_regions)'}

from matplotlib import ticker as mticker
fig, ax = plt.subplots()
sns.violinplot(
	inner = None,
#	hue = 'tool',
	palette = tool_to_color,
	ax = ax,
	**plotting_parameters)
sns.stripplot(
	color="#2b2b2b", size = 4,
	ax =ax, alpha = 0.4,
	**plotting_parameters)
#plt.legend([],[], frameon=False)
annotator = Annotator(ax, pairs, **plotting_parameters)
annotator.set_custom_annotations(formatted_pvalues)
annotator.annotate()
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter("$10^{{{x:.0f}}}$"))
ymin, ymax = ax.get_ylim()
tick_range = np.arange(np.floor(ymin), ymax)
ax.yaxis.set_ticks(tick_range)
ax.yaxis.set_ticks([np.log10(x) for p in tick_range for x in np.linspace(10 ** p, 10 ** (p + 1), 10)], minor=True)
ax.set_xticklabels(list(tool_to_n_regions.keys()))
ax.set_ylabel('Nr. of target regions per eRegulon')
fig.tight_layout()
sns.despine(ax = ax)
fig.savefig(os.path.join(outdir, 'n_target_regions_vln.pdf'))
fig.savefig(os.path.join(outdir, 'n_target_regions_vln.png'))
ax_n_targets = ax


data_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/revisions_nat_meth/extra_methods/pbmc/ChIP_seq_comparison/'
F1 = pd.read_csv(os.path.join(data_dir, 'F1.tsv'), sep = '\t')
PREC =  pd.read_csv(os.path.join(data_dir, 'PREC.tsv'), sep = '\t')
REC =  pd.read_csv(os.path.join(data_dir, 'REC.tsv'), sep = '\t')

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import ticker as mticker
from adjustText import adjust_text
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['svg.fonttype'] = 'none'
from statannotations.Annotator import Annotator
from itertools import combinations
from scipy.stats import mannwhitneyu
from scenicplus.utils import p_adjust_bh

def plot(ax, data, x, y, TFs_to_label):
	data = data.copy()
	tools_to_median = data.groupby(x)[y].median()
	tool_order = tools_to_median.sort_values(ascending = False).index
	data[y] = np.log10(data[y])
	data[x] = pd.Categorical(data[x], categories = tool_order)
	pairs = list(combinations(tool_order, 2))
	p_values = [
		mannwhitneyu(data.loc[data[x] == tool_a, y], data.loc[data[x] == tool_b, y]).pvalue
		for tool_a, tool_b in pairs]
	p_values = p_adjust_bh(p_values)
	#pairs = [pair for pair, pval in zip(pairs, p_values) if pval < 0.05]
	#p_values = [pval for pval in p_values if pval < 0.05]
	p_values_formatted = [f'p={p:.1e}' for p in p_values]
	sns.set_palette(sns.color_palette([tool_to_color[tool] for tool in tool_order]))
	sns.violinplot(
		inner = None, ax = ax,
		data = data, x = x, y = y)
	sns.swarmplot(
		color="#2b2b2b", size = 4, ax = ax,
		data = data, x = x, y = y)
	y_TF_to_label = {TF: 
		[
			data.loc[np.logical_and(data[x] == tool, data['TF'] == TF), y].values[0]
			if TF in data.loc[data[x] == tool, 'TF'].to_numpy() else None
			for tool in tool_order]
		for TF in TFs_to_label}
	labels = []
	xs = []
	ys = []
	for tool_id in range(len(tool_order)):
		for TF in y_TF_to_label.keys():
			if not y_TF_to_label[TF][tool_id] == None:
				#print(TF)
				xy = ax.collections[tool_id + len(tool_order)].get_offsets()
				xy_TF = xy[
					np.where(
				[np.round(_xy[1], 5) == np.round(y_TF_to_label[TF][tool_id], 5) for _xy in xy])[0][0]]
				labels.append(ax.text(xy_TF[0], xy_TF[1], TF, fontweight = 'bold', fontname = "Arial", fontsize = 11))
				xs.append(xy_TF[0])
				ys.append(xy_TF[1])
	adjust_text(labels, 
		ax = ax, 
		add_objects = ax.collections[0:3], 
		x = xs, y = ys, 
		arrowprops=dict(arrowstyle='->', lw = 2, color = 'black'),
		autoalign = 'x')
		#expand_points = (2.5, 1.5))
	#ax.set_yscale('log')
	ax.yaxis.set_major_formatter(mticker.StrMethodFormatter("$10^{{{x:.0f}}}$"))
	ymin, ymax = ax.get_ylim()
	tick_range = np.arange(np.floor(ymin), ymax)
	ax.yaxis.set_ticks(tick_range)
	ax.yaxis.set_ticks([np.log10(x) for p in tick_range for x in np.linspace(10 ** p, 10 ** (p + 1), 10)], minor=True)
	#ax.set_ylim(-7, 0)
	annotator = Annotator(ax, pairs, data = data, x = x, y = y)
	annotator.set_custom_annotations(p_values_formatted)
	annotator.annotate()
	sns.despine(ax = ax)

TFs_to_label = ['PAX5', 'CEBPB', 'BCL11A', 'EBF1']
out_dir = '/staging/leuven/stg_00002/lcb/sdewin/PhD/papers/SCENICPLUS_BravoDeWinter_etal_2022/revisions_nat_meth/figures_reviewer_questions/pbmc_other_eGRN_methods'

fig, axs = plt.subplots(ncols = 3, nrows = 1, figsize = (15, 5))
plot(axs[0], F1, 'tool', 'F1', TFs_to_label)
plot(axs[1], PREC, 'tool', 'Precision', TFs_to_label)
plot(axs[2], REC, 'tool', 'Recall', TFs_to_label)
fig.tight_layout()
fig.savefig(os.path.join(out_dir, 'f1_prec_rec.pdf'))
fig.savefig(os.path.join(out_dir, 'f1_prec_rec.png'))