diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..052c273
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.idea/*
+05_coeqtl_mapping/launch_sbatch_files.sh
diff --git a/01_association_metrics/.ipynb_checkpoints/GRNBoost2-checkpoint.ipynb b/01_association_metrics/.ipynb_checkpoints/GRNBoost2-checkpoint.ipynb
new file mode 100644
index 0000000..b1b9528
--- /dev/null
+++ b/01_association_metrics/.ipynb_checkpoints/GRNBoost2-checkpoint.ipynb
@@ -0,0 +1,349 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib as mpl\n",
+ "mpl.rcParams['pdf.fonttype'] = 42\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "from pathlib import Path\n",
+ "import seaborn as sns\n",
+ "%matplotlib inline\n",
+ "%run dataset.ipynb\n",
+ "\n",
+ "def select_gene_nonzeroratio(df, ratio):\n",
+ "    \"\"\"Return the columns of df whose fraction of non-zero rows is strictly greater than ratio.\"\"\"\n",
+ "    nonzero_fraction = np.count_nonzero(df.values, axis=0) / len(df.index)\n",
+ "    return df.columns[nonzero_fraction > ratio]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "datasetname = 'onemillionv2'  # dataset key handed to DATASET on the next line\n",
+ "dataset = DATASET(datasetname)  # NOTE(review): DATASET is not defined in this notebook; presumably provided by `%run dataset.ipynb` - confirm\n",
+ "dataset.load_dataset()\n",
+ "data_sc = dataset.data_sc  # single-cell object; later cells read its .obs, .var and .X"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "395\n"
+ ]
+ }
+ ],
+ "source": [
+ "monocyte_ut = data_sc[(data_sc.obs['time']=='UT') & (data_sc.obs['cell_type_lowerres']=='monocyte')]  # cells with time == 'UT' and cell_type_lowerres == 'monocyte'\n",
+ "monocyte_ut_df = pd.DataFrame(data=monocyte_ut.X.toarray(),  # densify X: one row per obs entry, one column per var entry\n",
+ " index=monocyte_ut.obs.index,\n",
+ " columns=monocyte_ut.var.index)\n",
+ "mono_genes = select_gene_nonzeroratio(df=monocyte_ut_df, ratio=0.50)  # columns that are non-zero in more than 50% of the selected rows\n",
+ "print(len(mono_genes))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(11482, 379) (194, 379)\n"
+ ]
+ }
+ ],
+ "source": [
+ "bp_df = pd.read_csv('mono_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.1PCAsOverSamplesRemoved.txt.gz',\n",
+ " compression='gzip',\n",
+ " sep='\\t', index_col=0)  # tab-separated; the first column becomes the row index\n",
+ "name_mapping_dic = pd.read_csv('features_v3_reformated_names.tsv',\n",
+ " sep ='\\t',\n",
+ " names=['geneid', 'genename']).set_index(['geneid'])['genename'].T.to_dict()  # dict: geneid -> genename\n",
+ "\n",
+ "bp_df['geneid'] = [item.split('.')[0] for item in bp_df.index]  # text before the first '.' of each row label\n",
+ "bp_df['genename'] = [name_mapping_dic.get(geneid) for geneid in bp_df['geneid']]  # None when the id has no entry in the mapping\n",
+ "bp_df = bp_df.dropna(subset=['genename'])  # drop rows without a mapped gene name\n",
+ "bp_df = bp_df.drop('geneid', axis=1)\n",
+ "bp_df = bp_df.set_index('genename')  # NOTE(review): mapped gene names are not checked for uniqueness here\n",
+ "print(bp_df.shape)\n",
+ "\n",
+ "bp_trans_df = bp_df.T  # transpose so that gene names become the columns\n",
+ "common_genes = sorted(set(mono_genes) & set(bp_trans_df.columns))  # sorted: set iteration order of strings differs between runs, which would reorder the exported columns\n",
+ "selected_mono_df = monocyte_ut_df[common_genes]\n",
+ "selected_bp_df = bp_trans_df[common_genes]\n",
+ "print(selected_mono_df.shape, selected_bp_df.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "selected_mono_df.T.to_csv('sc_Expression.csv', sep=',')  # transposed on export, so the selected genes are written as rows\n",
+ "selected_bp_df.T.to_csv('bp_Expression.csv', sep=',')  # same layout for the bulk table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Beeline requires a pseudo-time file for each dataset even though GRNBoost2 does not use it, so write a placeholder ordering (0..n-1 in row order).\n",
+ "fake_timepoint_bp = pd.DataFrame(index=selected_bp_df.index)\n",
+ "fake_timepoint_bp['time'] = np.arange(selected_bp_df.shape[0])  # placeholder value = row position\n",
+ "fake_timepoint_bp.to_csv('bp_timepoint.fake.csv',\n",
+ " sep=',')\n",
+ "fake_timepoint_sc = pd.DataFrame(index=selected_mono_df.index)\n",
+ "fake_timepoint_sc['time'] = np.arange(selected_mono_df.shape[0])  # placeholder value = row position\n",
+ "fake_timepoint_sc.to_csv('sc_timepoint.fake.csv',\n",
+ " sep=',')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# perform GRNBoost2 with BEELINE, see the yaml files in the same directory\n",
+ "# python BLRunner.py --config config-files/config_bp_mono.yaml\n",
+ "# python BLRunner.py --config config-files/config_sc_mono.yaml"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Gene1_sc | \n",
+ " Gene2_sc | \n",
+ " EdgeWeight_sc | \n",
+ " Gene1_bp | \n",
+ " Gene2_bp | \n",
+ " EdgeWeight_bp | \n",
+ "
\n",
+ " \n",
+ " sorted_genepairs | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " CCL3;CCL4 | \n",
+ " CCL3 | \n",
+ " CCL4 | \n",
+ " 554.503642 | \n",
+ " CCL3 | \n",
+ " CCL4 | \n",
+ " 55.157748 | \n",
+ "
\n",
+ " \n",
+ " CCL4;CCL3 | \n",
+ " CCL4 | \n",
+ " CCL3 | \n",
+ " 480.484753 | \n",
+ " CCL4 | \n",
+ " CCL3 | \n",
+ " 77.414467 | \n",
+ "
\n",
+ " \n",
+ " S100A9;S100A8 | \n",
+ " S100A9 | \n",
+ " S100A8 | \n",
+ " 341.726427 | \n",
+ " S100A9 | \n",
+ " S100A8 | \n",
+ " 104.542395 | \n",
+ "
\n",
+ " \n",
+ " S100A8;S100A9 | \n",
+ " S100A8 | \n",
+ " S100A9 | \n",
+ " 284.321568 | \n",
+ " S100A8 | \n",
+ " S100A9 | \n",
+ " 65.915233 | \n",
+ "
\n",
+ " \n",
+ " S100A9;LYZ | \n",
+ " S100A9 | \n",
+ " LYZ | \n",
+ " 221.872616 | \n",
+ " S100A9 | \n",
+ " LYZ | \n",
+ " 0.149064 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Gene1_sc Gene2_sc EdgeWeight_sc Gene1_bp Gene2_bp \\\n",
+ "sorted_genepairs \n",
+ "CCL3;CCL4 CCL3 CCL4 554.503642 CCL3 CCL4 \n",
+ "CCL4;CCL3 CCL4 CCL3 480.484753 CCL4 CCL3 \n",
+ "S100A9;S100A8 S100A9 S100A8 341.726427 S100A9 S100A8 \n",
+ "S100A8;S100A9 S100A8 S100A9 284.321568 S100A8 S100A9 \n",
+ "S100A9;LYZ S100A9 LYZ 221.872616 S100A9 LYZ \n",
+ "\n",
+ " EdgeWeight_bp \n",
+ "sorted_genepairs \n",
+ "CCL3;CCL4 55.157748 \n",
+ "CCL4;CCL3 77.414467 \n",
+ "S100A9;S100A8 104.542395 \n",
+ "S100A8;S100A9 65.915233 \n",
+ "S100A9;LYZ 0.149064 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sc_edges = pd.read_csv('sc_edges.csv', sep='\\t')\n",
+ "sc_edges['sorted_genepairs'] = [';'.join(item) for item in sc_edges[['Gene1', 'Gene2']].values]  # key is 'Gene1;Gene2' in file order; despite the column name the two genes are not sorted\n",
+ "bp_edges = pd.read_csv('bp_edges.csv', sep='\\t')\n",
+ "bp_edges['sorted_genepairs'] = [';'.join(item) for item in bp_edges[['Gene1', 'Gene2']].values]  # same key construction as for sc_edges\n",
+ "\n",
+ "sc_edges = sc_edges.set_index('sorted_genepairs')\n",
+ "bp_edges = bp_edges.set_index('sorted_genepairs')\n",
+ "concated_edges = pd.concat([sc_edges.add_suffix('_sc'), bp_edges.add_suffix('_bp')], axis=1)  # align both edge lists on the pair key; columns get _sc / _bp suffixes\n",
+ "\n",
+ "concated_edges.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "SpearmanrResult(correlation=0.16937964029402044, pvalue=0.0)"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "concated_edges = concated_edges.dropna()  # drop rows with any missing value, e.g. pairs present in only one of the two edge lists\n",
+ "spearmanr(concated_edges['EdgeWeight_sc'], concated_edges['EdgeWeight_bp'])  # NOTE(review): spearmanr is not imported in this notebook's visible cells; presumably it comes from `%run dataset.ipynb` - confirm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 1.0, 'Spearman r = 0.17')"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "coef, p = spearmanr(concated_edges['EdgeWeight_sc'], concated_edges['EdgeWeight_bp'])  # coef is shown in the title; p is not used in this cell\n",
+ "plt.figure(figsize=(5, 5))\n",
+ "plt.scatter(concated_edges['EdgeWeight_sc'], concated_edges['EdgeWeight_bp'], s=1, alpha=0.5)  # one point per row of concated_edges\n",
+ "plt.xlabel('Edge weight from scRNAseq')\n",
+ "plt.ylabel('Edge weight from BLUEPRINT')\n",
+ "plt.title(f'Spearman r = {coef:.2f}')\n",
+ "# plt.savefig('grnboost2_sc_bp_comparison.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/01_association_metrics/.ipynb_checkpoints/compare_cell_classification-checkpoint.ipynb b/01_association_metrics/.ipynb_checkpoints/compare_cell_classification-checkpoint.ipynb
new file mode 100644
index 0000000..75b0224
--- /dev/null
+++ b/01_association_metrics/.ipynb_checkpoints/compare_cell_classification-checkpoint.ipynb
@@ -0,0 +1,1164 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from scipy import stats\n",
+ "import pandas as pd\n",
+ "import matplotlib as mpl\n",
+ "mpl.rcParams['pdf.fonttype'] = 42\n",
+ "import numpy as np\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%run dataset.ipynb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "onemillionv2 = DATASET('onemillionv2')  # NOTE(review): DATASET is not defined in this notebook; presumably provided by `%run dataset.ipynb` - confirm\n",
+ "onemillionv2.load_dataset()\n",
+ "\n",
+ "onemillionv3 = DATASET('onemillionv3')\n",
+ "onemillionv3.load_dataset()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_sc = onemillionv2.data_sc[onemillionv2.data_sc.obs['time']=='UT']  # keep only the cells whose 'time' value is 'UT'\n",
+ "data_obs = data_sc.obs  # per-cell metadata table of the selected cells"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " orig.ident | \n",
+ " nCount_RNA | \n",
+ " nFeature_RNA | \n",
+ " batch | \n",
+ " lane | \n",
+ " chem | \n",
+ " exp.id | \n",
+ " timepoint | \n",
+ " percent.mt | \n",
+ " nCount_SCT | \n",
+ " nFeature_SCT | \n",
+ " cell_type | \n",
+ " cell_type_lowerres | \n",
+ " assignment | \n",
+ " bare_barcode_lane | \n",
+ " time | \n",
+ "
\n",
+ " \n",
+ " index | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " AAACCTGAGAGTACAT_180925_lane1 | \n",
+ " 1M_cells | \n",
+ " 5190.0 | \n",
+ " 1518 | \n",
+ " 180925_lane1 | \n",
+ " 180925_lane1 | \n",
+ " V2 | \n",
+ " 2 | \n",
+ " UT | \n",
+ " 1.560694 | \n",
+ " 3297.0 | \n",
+ " 1438 | \n",
+ " mono 2 | \n",
+ " monocyte | \n",
+ " LLDeep_1370 | \n",
+ " AAACCTGAGAGTACAT_180925_lane1 | \n",
+ " UT | \n",
+ "
\n",
+ " \n",
+ " AAACCTGAGTGTCTCA_180925_lane1 | \n",
+ " 1M_cells | \n",
+ " 5597.0 | \n",
+ " 1652 | \n",
+ " 180925_lane1 | \n",
+ " 180925_lane1 | \n",
+ " V2 | \n",
+ " 12 | \n",
+ " UT | \n",
+ " 3.394676 | \n",
+ " 3353.0 | \n",
+ " 1507 | \n",
+ " th1 CD4T | \n",
+ " CD4T | \n",
+ " LLDeep_0434 | \n",
+ " AAACCTGAGTGTCTCA_180925_lane1 | \n",
+ " UT | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCAGTCGATT_180925_lane1 | \n",
+ " 1M_cells | \n",
+ " 3039.0 | \n",
+ " 849 | \n",
+ " 180925_lane1 | \n",
+ " 180925_lane1 | \n",
+ " V2 | \n",
+ " 11 | \n",
+ " UT | \n",
+ " 3.685423 | \n",
+ " 2786.0 | \n",
+ " 849 | \n",
+ " naive CD8T | \n",
+ " CD8T | \n",
+ " LLDeep_1319 | \n",
+ " AAACCTGCAGTCGATT_180925_lane1 | \n",
+ " UT | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCATTCGACA_180925_lane1 | \n",
+ " 1M_cells | \n",
+ " 3876.0 | \n",
+ " 1048 | \n",
+ " 180925_lane1 | \n",
+ " 180925_lane1 | \n",
+ " V2 | \n",
+ " 2 | \n",
+ " UT | \n",
+ " 3.766770 | \n",
+ " 2996.0 | \n",
+ " 1047 | \n",
+ " mono 1 | \n",
+ " monocyte | \n",
+ " LLDeep_1370 | \n",
+ " AAACCTGCATTCGACA_180925_lane1 | \n",
+ " UT | \n",
+ "
\n",
+ " \n",
+ " AAACCTGGTAATAGCA_180925_lane1 | \n",
+ " 1M_cells | \n",
+ " 4272.0 | \n",
+ " 1141 | \n",
+ " 180925_lane1 | \n",
+ " 180925_lane1 | \n",
+ " V2 | \n",
+ " 12 | \n",
+ " UT | \n",
+ " 4.564607 | \n",
+ " 3076.0 | \n",
+ " 1131 | \n",
+ " mono 1 | \n",
+ " monocyte | \n",
+ " LLDeep_0434 | \n",
+ " AAACCTGGTAATAGCA_180925_lane1 | \n",
+ " UT | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " orig.ident nCount_RNA nFeature_RNA \\\n",
+ "index \n",
+ "AAACCTGAGAGTACAT_180925_lane1 1M_cells 5190.0 1518 \n",
+ "AAACCTGAGTGTCTCA_180925_lane1 1M_cells 5597.0 1652 \n",
+ "AAACCTGCAGTCGATT_180925_lane1 1M_cells 3039.0 849 \n",
+ "AAACCTGCATTCGACA_180925_lane1 1M_cells 3876.0 1048 \n",
+ "AAACCTGGTAATAGCA_180925_lane1 1M_cells 4272.0 1141 \n",
+ "\n",
+ " batch lane chem exp.id \\\n",
+ "index \n",
+ "AAACCTGAGAGTACAT_180925_lane1 180925_lane1 180925_lane1 V2 2 \n",
+ "AAACCTGAGTGTCTCA_180925_lane1 180925_lane1 180925_lane1 V2 12 \n",
+ "AAACCTGCAGTCGATT_180925_lane1 180925_lane1 180925_lane1 V2 11 \n",
+ "AAACCTGCATTCGACA_180925_lane1 180925_lane1 180925_lane1 V2 2 \n",
+ "AAACCTGGTAATAGCA_180925_lane1 180925_lane1 180925_lane1 V2 12 \n",
+ "\n",
+ " timepoint percent.mt nCount_SCT nFeature_SCT \\\n",
+ "index \n",
+ "AAACCTGAGAGTACAT_180925_lane1 UT 1.560694 3297.0 1438 \n",
+ "AAACCTGAGTGTCTCA_180925_lane1 UT 3.394676 3353.0 1507 \n",
+ "AAACCTGCAGTCGATT_180925_lane1 UT 3.685423 2786.0 849 \n",
+ "AAACCTGCATTCGACA_180925_lane1 UT 3.766770 2996.0 1047 \n",
+ "AAACCTGGTAATAGCA_180925_lane1 UT 4.564607 3076.0 1131 \n",
+ "\n",
+ " cell_type cell_type_lowerres assignment \\\n",
+ "index \n",
+ "AAACCTGAGAGTACAT_180925_lane1 mono 2 monocyte LLDeep_1370 \n",
+ "AAACCTGAGTGTCTCA_180925_lane1 th1 CD4T CD4T LLDeep_0434 \n",
+ "AAACCTGCAGTCGATT_180925_lane1 naive CD8T CD8T LLDeep_1319 \n",
+ "AAACCTGCATTCGACA_180925_lane1 mono 1 monocyte LLDeep_1370 \n",
+ "AAACCTGGTAATAGCA_180925_lane1 mono 1 monocyte LLDeep_0434 \n",
+ "\n",
+ " bare_barcode_lane time \n",
+ "index \n",
+ "AAACCTGAGAGTACAT_180925_lane1 AAACCTGAGAGTACAT_180925_lane1 UT \n",
+ "AAACCTGAGTGTCTCA_180925_lane1 AAACCTGAGTGTCTCA_180925_lane1 UT \n",
+ "AAACCTGCAGTCGATT_180925_lane1 AAACCTGCAGTCGATT_180925_lane1 UT \n",
+ "AAACCTGCATTCGACA_180925_lane1 AAACCTGCATTCGACA_180925_lane1 UT \n",
+ "AAACCTGGTAATAGCA_180925_lane1 AAACCTGGTAATAGCA_180925_lane1 UT "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_obs.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAU2UlEQVR4nO3df5Cd1X3f8ffHwsoPFdtjs8auBJViK2XUFBy8QFxoXNyaIDOtoHZqEdc4/lENUxQP9XhqTTOT0HEzgZZ0XNvYGg2DHTKmmsRGjFpkMHUmJo1No5XBgAhyFKGWjewgsCe2G4IQfPvHfdRclivts9JerTh6v2Z29j7nx3PPkXY/e+65z703VYUkqV0vW+gBSJLGy6CXpMYZ9JLUOINekhpn0EtS4wx6SWpcr6BPcmmSXUl2J9lwhHbnJXkuybvm2leSNB6zBn2SRcBNwGpgFXBlklWHaXcDcPdc+0qSxqfPiv58YHdV7amqA8BmYM2Idr8CfAl44ij6SpLG5JQebZYCjw8dTwMXDDdIshS4AngbcN5c+g6dYx2wDmDJkiVvPuuss3oMTZIEsGPHjieramJUXZ+gz4iyme+b8AngY1X1XPKC5n36DgqrNgGbACYnJ2tqaqrH0CRJAEn+9+Hq+gT9NHDG0PEyYN+MNpPA5i7kTwPekeRgz76SpDHqE/TbgZVJVgB/DqwFfmm4QVWtOHQ7yeeB/15VdyQ5Zba+kqTxmjXoq+pgkvUMrqZZBNxSVTuTXN3Vb5xr3/kZuiSpj5yIb1PsHr0kzU2SHVU1OarOV8ZKUuMMeklqnEEvSY0z6CWpcX0ur3xJWb7hzoUeQi97r79soYcg6SThil6SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxvYI+yaVJdiXZnWTDiPo1SR5M8kCSqSQXDdXtTfLQobr5HLwkaXazvk1xkkXATcDbgWlge5KtVfXIULOvAlurqpKcDfwucNZQ/cVV9eQ8jluS1FOfFf35wO6q2lNVB4DNwJrhBlX1o/qbTxlfApx4nzguSSepPkG/FHh86Hi6K3uBJFckeRS4E/jAUFUBX0myI8m6YxmsJGnu+gR9RpS9aMVeVVuq6izgcuDjQ1UXVtW5wGrgmiQ/P/JOknXd/v7U/v37ewxLktRHn6CfBs4YOl4G7Dtc46q6F3hDktO6433d9yeALQy2gkb121RVk1U1OTEx0XP4kqTZ9An67cDKJCuSLAbWAluHGyR5Y5J0t88FFgNPJVmS5NSufAlwCfDwfE5AknRks151U1UHk6wH7gYWAbdU1c4kV3f1G4F3AlcleRZ4Gnh3dwXO6cCW7m/AKcBtVXXXmOYiSRph1qAHqKptwLYZZRuHbt8A3DCi3x7gnGMcoyTpGPjKWElqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxvYI+yaVJdiXZnWTDiPo1SR5M8kCSqSQX9e0rSRqvWYM+ySLgJmA1sAq4MsmqGc2+CpxTVW8CPgDcPIe+kqQx6rOiPx/YXVV7quoAsBlYM9ygqn5UVdUdLgGqb19J0nj1CfqlwONDx9Nd2QskuSLJo8CdDFb1vft2/dd12z5T+/fv7zN2SVIPfYI+I8rqRQVVW6rqLOBy4ONz6dv131RVk1U1OTEx0WNYkqQ++gT9NHDG0PEyYN/hGlfVvcAbkpw2176SpPnXJ+i3AyuTrEiyGFgLbB1ukOSNSdLdPhdYDDzVp68kabxOma1BVR1Msh64G1gE3FJVO5Nc3dVvBN4JXJXkWeBp4N3dk7Mj+45pLpKkEWYNeoCq2gZsm1G2cej2DcANfftKko4fXxkrSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16S
GmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxvYI+yaVJdiXZnWTDiPr3JHmw+/p6knOG6vYmeSjJA0mm5nPwkqTZzfqZsUkWATcBbwemge1JtlbVI0PNHgPeWlXfT7Ia2ARcMFR/cVU9OY/jliT11GdFfz6wu6r2VNUBYDOwZrhBVX29qr7fHd4HLJvfYUqSjlafoF8KPD50PN2VHc4HgS8PHRfwlSQ7kqw7XKck65JMJZnav39/j2FJkvqYdesGyIiyGtkwuZhB0F80VHxhVe1L8lrgniSPVtW9Lzph1SYGWz5MTk6OPP/JaPmGOxd6CL3svf6yhR6CpMPos6KfBs4YOl4G7JvZKMnZwM3Amqp66lB5Ve3rvj8BbGGwFSRJOk76BP12YGWSFUkWA2uBrcMNkpwJ3A68t6q+PVS+JMmph24DlwAPz9fgJUmzm3XrpqoOJlkP3A0sAm6pqp1Jru7qNwK/BrwG+EwSgINVNQmcDmzpyk4Bbququ8YyE0nSSH326KmqbcC2GWUbh25/CPjQiH57gHNmlkuSjh9fGStJjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXG9gj7JpUl2JdmdZMOI+vckebD7+nqSc/r2lSSN16xBn2QRcBOwGlgFXJlk1YxmjwFvraqzgY8Dm+bQV5I0Rn1W9OcDu6tqT1UdADYDa4YbVNXXq+r73eF9wLK+fSVJ49Un6JcCjw8dT3dlh/NB4Mtz7ZtkXZKpJFP79+/vMSxJUh99gj4jympkw+RiBkH/sbn2rapNVTVZVZMTExM9hiVJ6uOUHm2mgTOGjpcB+2Y2SnI2cDOwuqqemktfSdL49FnRbwdWJlmRZDGwFtg63CDJmcDtwHur6ttz6StJGq9ZV/RVdTDJeuBuYBFwS1XtTHJ1V78R+DXgNcBnkgAc7LZhRvYd01wkSSP02bqhqrYB22aUbRy6/SHgQ337SpKOH18ZK0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcb0+SjDJpcB/YfC5rzdX1fUz6s8CPgecC/xqVd04VLcX+CHwHN1nyc7P0KUTw/INdy70EHrZe/1lCz0ELZBZgz7JIuAm4O3ANLA9ydaqemSo2feADwOXH+Y0F1fVk8c4VknSUeizdXM+sLuq9lTVAWAzsGa4QVU9UVXbgWfHMEZJ0jHoE/RLgceHjqe7sr4K+EqSHUnWHa5RknVJppJM7d+/fw6nlyQdSZ+gz4iymsN9XFhV5wKrgWuS/PyoRlW1qaomq2pyYmJiDqeXJB1Jnydjp4Ezho6XAfv63kFV7eu+P5FkC4OtoHvnMki1xScvpeOrz4p+O7AyyYoki4G1wNY+J0+yJMmph24DlwAPH+1gJUlzN+uKvqoOJlkP3M3g8spbqmpnkqu7+o1JXgdMAa8Ank9yLbAKOA3YkuTQfd1WVXeNZSaSpJF6XUdfVduAbTPKNg7d/i6DLZ2ZfgCccywDlCQdG18ZK0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcb2CPsmlSXYl2Z1kw4j6s5J8I8kzST46l76SpPGaNeiTLAJuAlYz+MDvK5OsmtHse8CHgRuPoq8kaYz6rOjPB3ZX1Z6qOgBsBtYMN6iqJ6pqO/DsXPtKksarT9AvBR4fOp7uyvro3TfJuiRTSab279/f8/SSpNn0CfqMKKue5+/dt6o2VdVkVU1OTEz0PL0kaTZ9gn4aOGPoeBmwr+f5j6Wv
JGke9An67cDKJCuSLAbWAlt7nv9Y+kqS5sEpszWoqoNJ1gN3A4uAW6pqZ5Kru/qNSV4HTAGvAJ5Pci2wqqp+MKrvmOYiSRph1qAHqKptwLYZZRuHbn+XwbZMr76SpOPHV8ZKUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWpcr6BPcmmSXUl2J9kwoj5JPtnVP5jk3KG6vUkeSvJAkqn5HLwkaXazfmZskkXATcDbgWlge5KtVfXIULPVwMru6wLgs933Qy6uqifnbdSSpN76rOjPB3ZX1Z6qOgBsBtbMaLMGuLUG7gNeleT18zxWSdJR6BP0S4HHh46nu7K+bQr4SpIdSdYd7UAlSUdn1q0bICPKag5tLqyqfUleC9yT5NGquvdFdzL4I7AO4Mwzz+wxLElSH31W9NPAGUPHy4B9fdtU1aHvTwBbGGwFvUhVbaqqyaqanJiY6Dd6SdKs+gT9dmBlkhVJFgNrga0z2mwFruquvvk54C+r6jtJliQ5FSDJEuAS4OF5HL8kaRazbt1U1cEk64G7gUXALVW1M8nVXf1GYBvwDmA38FfA+7vupwNbkhy6r9uq6q55n4Uk6bD67NFTVdsYhPlw2cah2wVcM6LfHuCcYxyjJOkY+MpYSWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqXK9Xxko6uSzfcOdCD6GXvddfttBDeElwRS9JjXNFL6l5J/sjFFf0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1rlfQJ7k0ya4ku5NsGFGfJJ/s6h9Mcm7fvpKk8Zo16JMsAm4CVgOrgCuTrJrRbDWwsvtaB3x2Dn0lSWPUZ0V/PrC7qvZU1QFgM7BmRps1wK01cB/wqiSv79lXkjRGfd4CYSnw+NDxNHBBjzZLe/YFIMk6Bo8GAH6UZFePsR0vpwFPzucJc8N8nm3OWpsPtDen1uYD7c3pRJvP3zlcRZ+gz4iy6tmmT99BYdUmYFOP8Rx3SaaqanKhxzFfWpsPtDen1uYD7c3ppTSfPkE/DZwxdLwM2NezzeIefSVJY9Rnj347sDLJiiSLgbXA1hlttgJXdVff/Bzwl1X1nZ59JUljNOuKvqoOJlkP3A0sAm6pqp1Jru7qNwLbgHcAu4G/At5/pL5jmcl4nZBbSsegtflAe3NqbT7Q3pxeMvNJ1cgtc0lSI3xlrCQ1zqCXpMadlEGf5HVJNif5sySPJNmW5KeTPJ3k/iR/kuSPk7xvRN/zkjyX5F1JXpPkge7ru0n+fOh48Yk8nySvTPLfknwryc4k7z9R5nO8JfnlJH97ge67kvzW0PFHk1zX3b4uyUe72z+e5J4kv74Q4+yr+914oPuZ+laSjyR52VD9+Unu7d4W5dEkNyf5yYUc81wNzfFbSb6Z5B8s9Jhmc9J9ZmySAFuA366qtV3Zm4DTgT+rqp/tyn4KuD3Jy6rqc13ZIuAGBk8uU1VPAW/q6q4DflRVN75E5nMN8EhV/dMkE8Au4AtVtaDzWSC/DDzMwlz6+wzwz5P8ZlWNfPFN90f2S8COqvr3x3V0c/f00M/Qa4HbgFcCv57kdOD3gLVV9Y3uZ/edwKkMLuJ4qRie4y8Avwm8dUFHNIuTcUV/MfBsd7UQAFX1AC98BS9VtQf4CPDhoeJfYfAL98T4h9nb0c6ngFO7X7a/BXwPODiuQSZZPrSCezjJF5L8kyR/lORPu5Xeq5PckcEb492X5Oyu73VJbknyB0n2JPnw0Hk/0p3v4STXDpVf1Z3nW0l+J8mpSR5L8vKu/hVJ9ib5RWAS+EK3SvuJJG9O8rUkO5LcncHbeYzLQQZXb/ybw9SfwuCtQ/60ql5SbwpYVU8weLX7+u7n7BoGC5JvdPVVVV+sqr9YyHEeo1cA31/oQczmpFvRAz8D7OjZ
9pvAWQBJlgJXAG8DzhvP0I7KUc0H+DSD1zTsY7CiendVPT//w3uBNwK/yOCXfzvwS8BFwD8D/h2DP073V9XlSd4G3Er3iKkb98XdWHcl+SxwNoNLeS9g8Crs/5Xka8AB4FeBC6vqySSvrqofJvkD4DLgDgav6fhSVf1ekmuAj1bVVPeH4FPAmqran+TdwG8AHxjjv8tNwINJ/uOIun8L/I+qunaM9z82VbWn27p5LYOf1d9e4CHNh59I8gDw48DrGWTCCe1kDPq5GH4Lh08AH6uq5waLk5ek4YH/AvAAgx/SNwD3JPnDqvrBGO//sap6CCDJTuCrVVVJHgKWM3ivjncCVNXvd88ZvLLre2dVPQM8k+QJBltTFwFbqur/due8HfiHDB6tfPHQVkhVfa87x80MgvMOBn8g/tWIMf5dBoF0T/f/vAj4zrz9C4xQVT9IciuDR1tPz6j+n8Bbkvx0VX17nOMYo5fsL8xhDG/dvAW4NcnP1Al8rfrJuHWzE3hzz7Y/C/xJd3sS2JxkL/Au4DNJLp/30c3d0c7n/cDt3cPn3cBj/M1qf1yeGbr9/NDx8wwWHUd6b6Thvs8doT1d+Yt+6arqj4DlSd4KLKqqhw/Td2dVvan7+vtVdcnhJjSPPgF8EFgyo/xe4Frgywv1hPGx6J4beo7BdudcflZfErptqNOAiYUey5GcjEH/+8CPJfn/q7kk5zHjnd+SLAduZPAwnqpaUVXLq2o58EXgX1fVHcdpzEdyVPMB/g/wj7u60xmsZPcch/Eeyb3AewCS/CPgyVkeYdwLXJ7kJ5MsYbC19ofAV4F/keQ13blePdTnVuC/Ap8bKvshgy0hGDwpPdGt1Ejy8iR/7xjnNavuUcfvMgj7mXVfAv4TcFeSV417LPOle5J/I/DpbrX7aeB9SS4YavMvk7xuocZ4rJKcxeBR31MLPZYjOem2brqtgiuAT2TwiVd/DexlsGp6Q5L7Gey9/RD41KErbk5UxzCfjwOf77ZNwmBbal7fcvUoXAd8LsmDDK7CeNHlrcOq6ptJPg/8cVd0c1XdD5DkN4CvJXkOuJ/BlTUAXwD+A4OwP+TzwMYkTwNvYfCI7ZPdttEpDFbbx+OtO34LWD+qoqo2doG4NcklVfXXx2E8R+PQ/vXLGTzR/DvAfwaoqr9Isha4sbsi53kGf6xvX6CxHq1Dc4TB7877quq5BRzPrHwLBJ1UkryLwROt713osUjHy0m3otfJK8mnGHys5TsWeizS8eSKXpIadzI+GStJJxWDXpIaZ9BLUuMMeklqnEEvSY37f/cjzCi39pQ6AAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "celltypes = ['CD4T', 'CD8T', 'monocyte', 'NK', 'DC', 'B']\n",
+ "plot_df = data_obs['cell_type_lowerres'].value_counts(normalize=True).loc[celltypes]  # fractions are computed over all labels before the six types are selected, so the bars need not sum to 1\n",
+ "plt.bar(x=plot_df.index, height=plot_df.values)\n",
+ "plt.ylim([0, 0.40])\n",
+ "plt.savefig('cell_abundance_oelenv2.pdf')\n",
+ "plt.savefig('cell_abundance_oelenv2.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " predicted.celltype.l1 | \n",
+ " predicted.celltype.l1.score | \n",
+ " predicted.celltype.l2 | \n",
+ " predicted.celltype.l2.score | \n",
+ "
\n",
+ " \n",
+ " barcode | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " AAACCTGAGAAACCAT_180920_lane1 | \n",
+ " CD8 T | \n",
+ " 0.755924 | \n",
+ " CD8 TEM | \n",
+ " 0.755924 | \n",
+ "
\n",
+ " \n",
+ " AAACCTGAGTAGCCGA_180920_lane1 | \n",
+ " Mono | \n",
+ " 1.000000 | \n",
+ " CD16 Mono | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCAATCTACG_180920_lane1 | \n",
+ " NK | \n",
+ " 1.000000 | \n",
+ " NK | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCACATCCAA_180920_lane1 | \n",
+ " other | \n",
+ " 0.546320 | \n",
+ " ILC | \n",
+ " 0.546320 | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCAGCTCGAC_180920_lane1 | \n",
+ " NK | \n",
+ " 0.733094 | \n",
+ " NK | \n",
+ " 0.733094 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " predicted.celltype.l1 \\\n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 CD8 T \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 Mono \n",
+ "AAACCTGCAATCTACG_180920_lane1 NK \n",
+ "AAACCTGCACATCCAA_180920_lane1 other \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 NK \n",
+ "\n",
+ " predicted.celltype.l1.score \\\n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 0.755924 \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 1.000000 \n",
+ "AAACCTGCAATCTACG_180920_lane1 1.000000 \n",
+ "AAACCTGCACATCCAA_180920_lane1 0.546320 \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 0.733094 \n",
+ "\n",
+ " predicted.celltype.l2 \\\n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 CD8 TEM \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 CD16 Mono \n",
+ "AAACCTGCAATCTACG_180920_lane1 NK \n",
+ "AAACCTGCACATCCAA_180920_lane1 ILC \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 NK \n",
+ "\n",
+ " predicted.celltype.l2.score \n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 0.755924 \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 1.000000 \n",
+ "AAACCTGCAATCTACG_180920_lane1 1.000000 \n",
+ "AAACCTGCACATCCAA_180920_lane1 0.546320 \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 0.733094 "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "azimuith_df = pd.read_csv(  # tab-separated table; the first column becomes the row index\n",
+ " '1M_v2_20201029_azimuth.tsv',\n",
+ " sep='\\t', index_col=0\n",
+ ")\n",
+ "azimuith_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['monocyte', 'CD4T', 'CD8T', 'NK', 'megakaryocyte', 'B', 'DC', 'plasma B', 'unknown', 'hemapoietic stem']\n",
+ "Categories (10, object): ['B', 'hemapoietic stem', 'megakaryocyte', 'NK', ..., 'CD4T', 'CD8T', 'monocyte', 'DC']"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_obs['cell_type_lowerres'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['CD8 T', 'Mono', 'NK', 'other', 'CD4 T', 'DC', 'B', 'other T'],\n",
+ " dtype=object)"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "azimuith_df['predicted.celltype.l1'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " predicted.celltype.l1 | \n",
+ " predicted.celltype.l1.score | \n",
+ " predicted.celltype.l2 | \n",
+ " predicted.celltype.l2.score | \n",
+ " cell_type_mapped | \n",
+ "
\n",
+ " \n",
+ " barcode | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " AAACCTGAGAAACCAT_180920_lane1 | \n",
+ " CD8 T | \n",
+ " 0.755924 | \n",
+ " CD8 TEM | \n",
+ " 0.755924 | \n",
+ " CD8T | \n",
+ "
\n",
+ " \n",
+ " AAACCTGAGTAGCCGA_180920_lane1 | \n",
+ " Mono | \n",
+ " 1.000000 | \n",
+ " CD16 Mono | \n",
+ " 1.000000 | \n",
+ " monocyte | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCAATCTACG_180920_lane1 | \n",
+ " NK | \n",
+ " 1.000000 | \n",
+ " NK | \n",
+ " 1.000000 | \n",
+ " NK | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCACATCCAA_180920_lane1 | \n",
+ " other | \n",
+ " 0.546320 | \n",
+ " ILC | \n",
+ " 0.546320 | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCAGCTCGAC_180920_lane1 | \n",
+ " NK | \n",
+ " 0.733094 | \n",
+ " NK | \n",
+ " 0.733094 | \n",
+ " NK | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " predicted.celltype.l1 \\\n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 CD8 T \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 Mono \n",
+ "AAACCTGCAATCTACG_180920_lane1 NK \n",
+ "AAACCTGCACATCCAA_180920_lane1 other \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 NK \n",
+ "\n",
+ " predicted.celltype.l1.score \\\n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 0.755924 \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 1.000000 \n",
+ "AAACCTGCAATCTACG_180920_lane1 1.000000 \n",
+ "AAACCTGCACATCCAA_180920_lane1 0.546320 \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 0.733094 \n",
+ "\n",
+ " predicted.celltype.l2 \\\n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 CD8 TEM \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 CD16 Mono \n",
+ "AAACCTGCAATCTACG_180920_lane1 NK \n",
+ "AAACCTGCACATCCAA_180920_lane1 ILC \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 NK \n",
+ "\n",
+ " predicted.celltype.l2.score cell_type_mapped \n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 0.755924 CD8T \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 1.000000 monocyte \n",
+ "AAACCTGCAATCTACG_180920_lane1 1.000000 NK \n",
+ "AAACCTGCACATCCAA_180920_lane1 0.546320 None \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 0.733094 NK "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mapping_names = {'CD8 T': 'CD8T',  # predicted.celltype.l1 label -> label spelling used in cell_type_lowerres\n",
+ " 'CD4 T': 'CD4T',\n",
+ " 'Mono': 'monocyte',\n",
+ " 'NK': 'NK',\n",
+ " 'B': 'B',\n",
+ " 'DC': 'DC'}\n",
+ "azimuith_df['cell_type_mapped'] = [mapping_names.get(name) for name in \n",
+ " azimuith_df['predicted.celltype.l1']]  # labels without an entry in mapping_names become None\n",
+ "azimuith_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " cell_type_lowerres | \n",
+ " cell_type_mapped | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " AAACCTGAGAGTACAT_180925_lane1 | \n",
+ " monocyte | \n",
+ " monocyte | \n",
+ "
\n",
+ " \n",
+ " AAACCTGAGTGTCTCA_180925_lane1 | \n",
+ " CD4T | \n",
+ " CD4T | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCAGTCGATT_180925_lane1 | \n",
+ " CD8T | \n",
+ " CD8T | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCATTCGACA_180925_lane1 | \n",
+ " monocyte | \n",
+ " monocyte | \n",
+ "
\n",
+ " \n",
+ " AAACCTGGTAATAGCA_180925_lane1 | \n",
+ " monocyte | \n",
+ " monocyte | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " cell_type_lowerres cell_type_mapped\n",
+ "AAACCTGAGAGTACAT_180925_lane1 monocyte monocyte\n",
+ "AAACCTGAGTGTCTCA_180925_lane1 CD4T CD4T\n",
+ "AAACCTGCAGTCGATT_180925_lane1 CD8T CD8T\n",
+ "AAACCTGCATTCGACA_180925_lane1 monocyte monocyte\n",
+ "AAACCTGGTAATAGCA_180925_lane1 monocyte monocyte"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_classification_df = pd.concat([data_obs[['cell_type_lowerres']],\n",
+ " azimuith_df[['cell_type_mapped']]],\n",
+ " axis=1).dropna()  # align the two label columns on the row index; keep only rows that have both labels\n",
+ "merged_classification_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'NK', 'B']\n",
+ "# Rows: Azimuth label (cell_type_mapped); columns: cell_type_lowerres label.\n",
+ "# Each entry is the fraction of the column's cells that Azimuth assigned to the row's cell type.\n",
+ "accuracy_df = pd.DataFrame(\n",
+ "    data=np.full((len(celltypes), len(celltypes)), np.nan),\n",
+ "    index=celltypes,\n",
+ "    columns=celltypes\n",
+ ")\n",
+ "for celltype_onemillionv2 in celltypes:\n",
+ "    # Cells carrying this cell_type_lowerres label; computed once per column instead of once per pair.\n",
+ "    onemillion_cells = merged_classification_df[\n",
+ "        merged_classification_df['cell_type_lowerres'] == celltype_onemillionv2\n",
+ "    ]\n",
+ "    if onemillion_cells.empty:\n",
+ "        continue  # no cells with this label: keep the column NaN instead of dividing by zero\n",
+ "    for celltype_azimuth in celltypes:\n",
+ "        common_classification_num = (onemillion_cells['cell_type_mapped'] == celltype_azimuth).sum()\n",
+ "        # Single .loc[row, col] write; the chained accuracy_df[col].loc[row] form can assign to a temporary copy.\n",
+ "        accuracy_df.loc[celltype_azimuth, celltype_onemillionv2] = common_classification_num / len(onemillion_cells)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.figure(figsize=(6, 5))\n",
+ "sns.heatmap(accuracy_df, vmin=0, vmax=1,  # colour scale fixed to [0, 1]\n",
+ " cmap=\"YlGnBu\", \n",
+ " annot=True, annot_kws={\"size\": 10},\n",
+ " fmt='.2f')\n",
+ "plt.xlabel('Cell type in Oelen v2')  # columns of accuracy_df\n",
+ "plt.ylabel('Cell type by Azimuth Algorithm')  # rows of accuracy_df\n",
+ "plt.savefig('marker_gene_azimuth_classification.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " predicted.celltype.l1 | \n",
+ " predicted.celltype.l1.score | \n",
+ " predicted.celltype.l2 | \n",
+ " predicted.celltype.l2.score | \n",
+ "
\n",
+ " \n",
+ " barcode | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " AAACCCAAGATACCAA_190109_lane1 | \n",
+ " CD8 T | \n",
+ " 0.981818 | \n",
+ " CD8 Naive | \n",
+ " 0.955707 | \n",
+ "
\n",
+ " \n",
+ " AAACCCAAGTCCCTAA_190109_lane1 | \n",
+ " CD4 T | \n",
+ " 0.379819 | \n",
+ " CD4 CTL | \n",
+ " 0.327444 | \n",
+ "
\n",
+ " \n",
+ " AAACCCACAAGAGTGC_190109_lane1 | \n",
+ " CD4 T | \n",
+ " 0.740111 | \n",
+ " CD4 Naive | \n",
+ " 0.723822 | \n",
+ "
\n",
+ " \n",
+ " AAACCCACAATCCAGT_190109_lane1 | \n",
+ " CD4 T | \n",
+ " 0.916869 | \n",
+ " CD4 TCM | \n",
+ " 0.468549 | \n",
+ "
\n",
+ " \n",
+ " AAACCCACACTATCCC_190109_lane1 | \n",
+ " Mono | \n",
+ " 1.000000 | \n",
+ " CD14 Mono | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " predicted.celltype.l1 \\\n",
+ "barcode \n",
+ "AAACCCAAGATACCAA_190109_lane1 CD8 T \n",
+ "AAACCCAAGTCCCTAA_190109_lane1 CD4 T \n",
+ "AAACCCACAAGAGTGC_190109_lane1 CD4 T \n",
+ "AAACCCACAATCCAGT_190109_lane1 CD4 T \n",
+ "AAACCCACACTATCCC_190109_lane1 Mono \n",
+ "\n",
+ " predicted.celltype.l1.score \\\n",
+ "barcode \n",
+ "AAACCCAAGATACCAA_190109_lane1 0.981818 \n",
+ "AAACCCAAGTCCCTAA_190109_lane1 0.379819 \n",
+ "AAACCCACAAGAGTGC_190109_lane1 0.740111 \n",
+ "AAACCCACAATCCAGT_190109_lane1 0.916869 \n",
+ "AAACCCACACTATCCC_190109_lane1 1.000000 \n",
+ "\n",
+ " predicted.celltype.l2 \\\n",
+ "barcode \n",
+ "AAACCCAAGATACCAA_190109_lane1 CD8 Naive \n",
+ "AAACCCAAGTCCCTAA_190109_lane1 CD4 CTL \n",
+ "AAACCCACAAGAGTGC_190109_lane1 CD4 Naive \n",
+ "AAACCCACAATCCAGT_190109_lane1 CD4 TCM \n",
+ "AAACCCACACTATCCC_190109_lane1 CD14 Mono \n",
+ "\n",
+ " predicted.celltype.l2.score \n",
+ "barcode \n",
+ "AAACCCAAGATACCAA_190109_lane1 0.955707 \n",
+ "AAACCCAAGTCCCTAA_190109_lane1 0.327444 \n",
+ "AAACCCACAAGAGTGC_190109_lane1 0.723822 \n",
+ "AAACCCACAATCCAGT_190109_lane1 0.468549 \n",
+ "AAACCCACACTATCCC_190109_lane1 1.000000 "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "azimuith_df_v3 = pd.read_csv(\n",
+ " '1M_v3_20201106_azimuth.tsv',\n",
+ " sep='\\t', index_col=0\n",
+ ")\n",
+ "azimuith_df_v3.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mapping_names = {'CD8 T': 'CD8T', \n",
+ " 'CD4 T': 'CD4T',\n",
+ " 'Mono': 'monocyte',\n",
+ " 'NK': 'NK',\n",
+ " 'B': 'B',\n",
+ " 'DC': 'DC'}\n",
+ "azimuith_df_v3['cell_type_mapped'] = [mapping_names.get(name) for name in \n",
+ " azimuith_df_v3['predicted.celltype.l1']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_scv3 = onemillionv3.data_sc[onemillionv3.data_sc.obs['time']=='UT']\n",
+ "data_obsv3 = data_scv3.obs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " cell_type_lowerres | \n",
+ " cell_type_mapped | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " AAACCCAAGATACCAA_190109_lane1 | \n",
+ " CD4T | \n",
+ " CD8T | \n",
+ "
\n",
+ " \n",
+ " AAACCCACAAGAGTGC_190109_lane1 | \n",
+ " CD4T | \n",
+ " CD4T | \n",
+ "
\n",
+ " \n",
+ " AAACCCACAATCCAGT_190109_lane1 | \n",
+ " CD4T | \n",
+ " CD4T | \n",
+ "
\n",
+ " \n",
+ " AAACCCACAGGTACGA_190109_lane1 | \n",
+ " CD4T | \n",
+ " CD4T | \n",
+ "
\n",
+ " \n",
+ " AAACCCAGTCTACGTA_190109_lane1 | \n",
+ " CD4T | \n",
+ " CD4T | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " cell_type_lowerres cell_type_mapped\n",
+ "AAACCCAAGATACCAA_190109_lane1 CD4T CD8T\n",
+ "AAACCCACAAGAGTGC_190109_lane1 CD4T CD4T\n",
+ "AAACCCACAATCCAGT_190109_lane1 CD4T CD4T\n",
+ "AAACCCACAGGTACGA_190109_lane1 CD4T CD4T\n",
+ "AAACCCAGTCTACGTA_190109_lane1 CD4T CD4T"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_classification_df_v3 = pd.concat([data_obsv3[['cell_type_lowerres']],\n",
+ " azimuith_df_v3[['cell_type_mapped']]],\n",
+ " axis=1).dropna()\n",
+ "merged_classification_df_v3.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'NK', 'B']\n",
+ "# Rows = Azimuth label, columns = marker-gene label; each column holds the fraction\n",
+ "# of cells with that marker-gene label that received each Azimuth label.\n",
+ "accuracy_df_v3 = pd.DataFrame(\n",
+ "    data=np.zeros((len(celltypes), len(celltypes))),\n",
+ "    index=celltypes,\n",
+ "    columns=celltypes\n",
+ ")\n",
+ "for celltype_onemillionv2 in celltypes:\n",
+ "    is_onemillion_type = merged_classification_df_v3['cell_type_lowerres'] == celltype_onemillionv2\n",
+ "    # The denominator does not depend on the Azimuth label, so compute it once per column.\n",
+ "    onemillion_classification_num = int(is_onemillion_type.sum())\n",
+ "    for celltype_azimuth in celltypes:\n",
+ "        common_classification_num = int((\n",
+ "            is_onemillion_type &\n",
+ "            (merged_classification_df_v3['cell_type_mapped'] == celltype_azimuth)\n",
+ "        ).sum())\n",
+ "        # Write with a single .loc[row, col]; chained df[col].loc[row] = ... may assign to a copy.\n",
+ "        accuracy_df_v3.loc[celltype_azimuth, celltype_onemillionv2] = common_classification_num / onemillion_classification_num"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CD4T | \n",
+ " CD8T | \n",
+ " monocyte | \n",
+ " DC | \n",
+ " NK | \n",
+ " B | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " CD4T | \n",
+ " 0.952222 | \n",
+ " 0.157213 | \n",
+ " 0.003671 | \n",
+ " 0.001845 | \n",
+ " 0.016403 | \n",
+ " 0.001099 | \n",
+ "
\n",
+ " \n",
+ " CD8T | \n",
+ " 0.046923 | \n",
+ " 0.787428 | \n",
+ " 0.001606 | \n",
+ " 0.000000 | \n",
+ " 0.014666 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " monocyte | \n",
+ " 0.000513 | \n",
+ " 0.000409 | \n",
+ " 0.993346 | \n",
+ " 0.177122 | \n",
+ " 0.000386 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " DC | \n",
+ " 0.000085 | \n",
+ " 0.000000 | \n",
+ " 0.000574 | \n",
+ " 0.821033 | \n",
+ " 0.000193 | \n",
+ " 0.001099 | \n",
+ "
\n",
+ " \n",
+ " NK | \n",
+ " 0.000256 | \n",
+ " 0.054950 | \n",
+ " 0.000688 | \n",
+ " 0.000000 | \n",
+ " 0.968352 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " B | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000115 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.997802 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CD4T CD8T monocyte DC NK B\n",
+ "CD4T 0.952222 0.157213 0.003671 0.001845 0.016403 0.001099\n",
+ "CD8T 0.046923 0.787428 0.001606 0.000000 0.014666 0.000000\n",
+ "monocyte 0.000513 0.000409 0.993346 0.177122 0.000386 0.000000\n",
+ "DC 0.000085 0.000000 0.000574 0.821033 0.000193 0.001099\n",
+ "NK 0.000256 0.054950 0.000688 0.000000 0.968352 0.000000\n",
+ "B 0.000000 0.000000 0.000115 0.000000 0.000000 0.997802"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "accuracy_df_v3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.figure(figsize=(6, 5))\n",
+ "sns.heatmap(accuracy_df_v3, vmin=0, vmax=1,\n",
+ " cmap=\"YlGnBu\", \n",
+ " annot=True, annot_kws={\"size\": 10},\n",
+ " fmt='.2f')\n",
+ "plt.xlabel('Cell type in Oelen v3')\n",
+ "plt.ylabel('Cell type by Azimuth Algorithm')\n",
+ "plt.savefig('marker_gene_azimuth_classification_oelenv3.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/01_association_metrics/GRNBoost2.ipynb b/01_association_metrics/GRNBoost2.ipynb
new file mode 100644
index 0000000..b1b9528
--- /dev/null
+++ b/01_association_metrics/GRNBoost2.ipynb
@@ -0,0 +1,349 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib as mpl\n",
+ "mpl.rcParams['pdf.fonttype'] = 42\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "from pathlib import Path\n",
+ "import seaborn as sns\n",
+ "from scipy.stats import spearmanr\n",
+ "%matplotlib inline\n",
+ "%run dataset.ipynb\n",
+ "\n",
+ "def select_gene_nonzeroratio(df, ratio):\n",
+ " nonzerocounts = np.count_nonzero(df.values, axis=0) / df.shape[0]\n",
+ " selected_genes = df.columns[nonzerocounts > ratio]\n",
+ " return selected_genes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "datasetname = 'onemillionv2'\n",
+ "dataset = DATASET(datasetname)\n",
+ "dataset.load_dataset()\n",
+ "data_sc = dataset.data_sc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "395\n"
+ ]
+ }
+ ],
+ "source": [
+ "monocyte_ut = data_sc[(data_sc.obs['time']=='UT') & (data_sc.obs['cell_type_lowerres']=='monocyte')]\n",
+ "monocyte_ut_df = pd.DataFrame(data=monocyte_ut.X.toarray(),\n",
+ " index=monocyte_ut.obs.index,\n",
+ " columns=monocyte_ut.var.index)\n",
+ "mono_genes = select_gene_nonzeroratio(df=monocyte_ut_df, ratio=0.50)\n",
+ "print(len(mono_genes))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(11482, 379) (194, 379)\n"
+ ]
+ }
+ ],
+ "source": [
+ "bp_df = pd.read_csv('mono_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.1PCAsOverSamplesRemoved.txt.gz',\n",
+ " compression='gzip',\n",
+ " sep='\\t', index_col=0)\n",
+ "name_mapping_dic = pd.read_csv('features_v3_reformated_names.tsv',\n",
+ " sep ='\\t',\n",
+ " names=['geneid', 'genename']).set_index(['geneid'])['genename'].T.to_dict()\n",
+ "\n",
+ "bp_df['geneid'] = [item.split('.')[0] for item in bp_df.index]\n",
+ "bp_df['genename'] = [name_mapping_dic.get(geneid) for geneid in bp_df['geneid']]\n",
+ "bp_df = bp_df.dropna(subset=['genename'])\n",
+ "bp_df = bp_df.drop('geneid', axis=1)\n",
+ "bp_df = bp_df.set_index('genename')\n",
+ "print(bp_df.shape)\n",
+ "\n",
+ "bp_trans_df = bp_df.T\n",
+ "common_genes = list(set(mono_genes) & set(bp_trans_df.columns))\n",
+ "selected_mono_df = monocyte_ut_df[common_genes]\n",
+ "selected_bp_df = bp_trans_df[common_genes]\n",
+ "print(selected_mono_df.shape, selected_bp_df.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "selected_mono_df.T.to_csv('sc_Expression.csv', sep=',')\n",
+ "selected_bp_df.T.to_csv('bp_Expression.csv', sep=',')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create this fake pseudo time ordering because it's required to run the Beeline tool, but not used by GRNBoost2\n",
+ "fake_timepoint_bp = pd.DataFrame(index=selected_bp_df.index)\n",
+ "fake_timepoint_bp['time'] = np.arange(selected_bp_df.shape[0])\n",
+ "fake_timepoint_bp.to_csv('bp_timepoint.fake.csv',\n",
+ " sep=',')\n",
+ "fake_timepoint_sc = pd.DataFrame(index=selected_mono_df.index)\n",
+ "fake_timepoint_sc['time'] = np.arange(selected_mono_df.shape[0])\n",
+ "fake_timepoint_sc.to_csv('sc_timepoint.fake.csv',\n",
+ " sep=',')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# perform GRNBoost2 with BEELINE, see the yaml files in the same directory\n",
+ "# python BLRunner.py --config config-files/config_bp_mono.yaml\n",
+ "# python BLRunner.py --config config-files/config_sc_mono.yaml"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Gene1_sc | \n",
+ " Gene2_sc | \n",
+ " EdgeWeight_sc | \n",
+ " Gene1_bp | \n",
+ " Gene2_bp | \n",
+ " EdgeWeight_bp | \n",
+ "
\n",
+ " \n",
+ " sorted_genepairs | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " CCL3;CCL4 | \n",
+ " CCL3 | \n",
+ " CCL4 | \n",
+ " 554.503642 | \n",
+ " CCL3 | \n",
+ " CCL4 | \n",
+ " 55.157748 | \n",
+ "
\n",
+ " \n",
+ " CCL4;CCL3 | \n",
+ " CCL4 | \n",
+ " CCL3 | \n",
+ " 480.484753 | \n",
+ " CCL4 | \n",
+ " CCL3 | \n",
+ " 77.414467 | \n",
+ "
\n",
+ " \n",
+ " S100A9;S100A8 | \n",
+ " S100A9 | \n",
+ " S100A8 | \n",
+ " 341.726427 | \n",
+ " S100A9 | \n",
+ " S100A8 | \n",
+ " 104.542395 | \n",
+ "
\n",
+ " \n",
+ " S100A8;S100A9 | \n",
+ " S100A8 | \n",
+ " S100A9 | \n",
+ " 284.321568 | \n",
+ " S100A8 | \n",
+ " S100A9 | \n",
+ " 65.915233 | \n",
+ "
\n",
+ " \n",
+ " S100A9;LYZ | \n",
+ " S100A9 | \n",
+ " LYZ | \n",
+ " 221.872616 | \n",
+ " S100A9 | \n",
+ " LYZ | \n",
+ " 0.149064 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Gene1_sc Gene2_sc EdgeWeight_sc Gene1_bp Gene2_bp \\\n",
+ "sorted_genepairs \n",
+ "CCL3;CCL4 CCL3 CCL4 554.503642 CCL3 CCL4 \n",
+ "CCL4;CCL3 CCL4 CCL3 480.484753 CCL4 CCL3 \n",
+ "S100A9;S100A8 S100A9 S100A8 341.726427 S100A9 S100A8 \n",
+ "S100A8;S100A9 S100A8 S100A9 284.321568 S100A8 S100A9 \n",
+ "S100A9;LYZ S100A9 LYZ 221.872616 S100A9 LYZ \n",
+ "\n",
+ " EdgeWeight_bp \n",
+ "sorted_genepairs \n",
+ "CCL3;CCL4 55.157748 \n",
+ "CCL4;CCL3 77.414467 \n",
+ "S100A9;S100A8 104.542395 \n",
+ "S100A8;S100A9 65.915233 \n",
+ "S100A9;LYZ 0.149064 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sc_edges = pd.read_csv('sc_edges.csv', sep='\\t')\n",
+ "sc_edges['sorted_genepairs'] = [';'.join(item) for item in sc_edges[['Gene1', 'Gene2']].values]\n",
+ "bp_edges = pd.read_csv('bp_edges.csv', sep='\\t')\n",
+ "bp_edges['sorted_genepairs'] = [';'.join(item) for item in bp_edges[['Gene1', 'Gene2']].values]\n",
+ "\n",
+ "sc_edges = sc_edges.set_index('sorted_genepairs')\n",
+ "bp_edges = bp_edges.set_index('sorted_genepairs')\n",
+ "concated_edges = pd.concat([sc_edges.add_suffix('_sc'), bp_edges.add_suffix('_bp')], axis=1)\n",
+ "\n",
+ "concated_edges.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "SpearmanrResult(correlation=0.16937964029402044, pvalue=0.0)"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "concated_edges = concated_edges.dropna()\n",
+ "spearmanr(concated_edges['EdgeWeight_sc'], concated_edges['EdgeWeight_bp'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 1.0, 'Spearman r = 0.17')"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "coef, p = spearmanr(concated_edges['EdgeWeight_sc'], concated_edges['EdgeWeight_bp'])\n",
+ "plt.figure(figsize=(5, 5))\n",
+ "plt.scatter(concated_edges['EdgeWeight_sc'], concated_edges['EdgeWeight_bp'], s=1, alpha=0.5)\n",
+ "plt.xlabel('Edge weight from scRNAseq')\n",
+ "plt.ylabel('Edge weight from BLUEPRINT')\n",
+ "plt.title(f'Spearman r = {coef:.2f}')\n",
+ "# plt.savefig('grnboost2_sc_bp_comparison.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/01_association_metrics/README.md b/01_association_metrics/README.md
new file mode 100644
index 0000000..cf61992
--- /dev/null
+++ b/01_association_metrics/README.md
@@ -0,0 +1,31 @@
+# 01_association_metrics
+
+
+In *setting_files_for_grnboost2* there are setting files for running GRNBoost2 on scRNAseq and BIOS data
+
+*GRNBoost2.ipynb*: Prepare the input files for BEELINE, and examine the GRNBoost2 results for scRNAseq and for BIOS data
+
+*rho_comparison_lowlyexpressed.R*: explores the differences between Spearman correlation and Rho propensity, especially for very lowly expressed genes
+
+*scorpius_and_slingshot_clean.R*: calculates the pseudotime ordering for Oelen v2 classical monocytes, using SCORPIUS and Slingshot algorithms
+
+*scvelo_analysis_dm.py*: runs RNA velocity analysis on Oelen v3 dataset classical monocytes after creating loom files using [velocyto](http://velocyto.org/velocyto.py/tutorial/cli.html) to get both spliced and unspliced gene count matrices
+
+*compare_cell_classification.ipynb*: compares the Azimuth cell type classification with the marker gene cell type classification in the Oelen v2 and v3 datasets for untreated cells
+
+Metacell calculation and evaluation files are all in the directory *metacell*:
+
+*metacell_per_sample_original_algorithm.R*: calculates metacells based on original algorithm (implemented in the metacell R package)
+
+*metacells_from_leiden.R*: calculates metacells based on grouping from leiden clustering
+
+*create_genesets.R*: splits all genes expressed in Oelen v3 dataset, Monocytes, into different expression bins for threshold-dependent evaluation with BLUEPRINT
+
+*metacell_general_correlation_tp.R*: calculates correlation from metacells (original or leiden) for different expression thresholds from *create_genesets.R* for comparison with BLUEPRINT
+
+*single_cell_correlation_tp.R*: calculates correlation from single cell dataset for different expression thresholds from *create_genesets.R* for comparison with BLUEPRINT
+
+*eval_blueprint_genesets.R*: compares correlation from BLUEPRINT with correlation from metacells/single cell for different expression thresholds, using correlation values from *metacell_general_correlation_tp.R* and *single_cell_correlation_tp.R*
+
+*plot_overview_metacell.R*: visualizes outputs from metacell evaluation in one plot
+
diff --git a/01_association_metrics/compare_cell_classification.ipynb b/01_association_metrics/compare_cell_classification.ipynb
new file mode 100644
index 0000000..75b0224
--- /dev/null
+++ b/01_association_metrics/compare_cell_classification.ipynb
@@ -0,0 +1,1164 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from scipy import stats\n",
+ "import pandas as pd\n",
+ "import matplotlib as mpl\n",
+ "mpl.rcParams['pdf.fonttype'] = 42\n",
+ "import numpy as np\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%run dataset.ipynb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "onemillionv2 = DATASET('onemillionv2')\n",
+ "onemillionv2.load_dataset()\n",
+ "\n",
+ "onemillionv3 = DATASET('onemillionv3')\n",
+ "onemillionv3.load_dataset()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_sc = onemillionv2.data_sc[onemillionv2.data_sc.obs['time']=='UT']\n",
+ "data_obs = data_sc.obs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " orig.ident | \n",
+ " nCount_RNA | \n",
+ " nFeature_RNA | \n",
+ " batch | \n",
+ " lane | \n",
+ " chem | \n",
+ " exp.id | \n",
+ " timepoint | \n",
+ " percent.mt | \n",
+ " nCount_SCT | \n",
+ " nFeature_SCT | \n",
+ " cell_type | \n",
+ " cell_type_lowerres | \n",
+ " assignment | \n",
+ " bare_barcode_lane | \n",
+ " time | \n",
+ "
\n",
+ " \n",
+ " index | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " AAACCTGAGAGTACAT_180925_lane1 | \n",
+ " 1M_cells | \n",
+ " 5190.0 | \n",
+ " 1518 | \n",
+ " 180925_lane1 | \n",
+ " 180925_lane1 | \n",
+ " V2 | \n",
+ " 2 | \n",
+ " UT | \n",
+ " 1.560694 | \n",
+ " 3297.0 | \n",
+ " 1438 | \n",
+ " mono 2 | \n",
+ " monocyte | \n",
+ " LLDeep_1370 | \n",
+ " AAACCTGAGAGTACAT_180925_lane1 | \n",
+ " UT | \n",
+ "
\n",
+ " \n",
+ " AAACCTGAGTGTCTCA_180925_lane1 | \n",
+ " 1M_cells | \n",
+ " 5597.0 | \n",
+ " 1652 | \n",
+ " 180925_lane1 | \n",
+ " 180925_lane1 | \n",
+ " V2 | \n",
+ " 12 | \n",
+ " UT | \n",
+ " 3.394676 | \n",
+ " 3353.0 | \n",
+ " 1507 | \n",
+ " th1 CD4T | \n",
+ " CD4T | \n",
+ " LLDeep_0434 | \n",
+ " AAACCTGAGTGTCTCA_180925_lane1 | \n",
+ " UT | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCAGTCGATT_180925_lane1 | \n",
+ " 1M_cells | \n",
+ " 3039.0 | \n",
+ " 849 | \n",
+ " 180925_lane1 | \n",
+ " 180925_lane1 | \n",
+ " V2 | \n",
+ " 11 | \n",
+ " UT | \n",
+ " 3.685423 | \n",
+ " 2786.0 | \n",
+ " 849 | \n",
+ " naive CD8T | \n",
+ " CD8T | \n",
+ " LLDeep_1319 | \n",
+ " AAACCTGCAGTCGATT_180925_lane1 | \n",
+ " UT | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCATTCGACA_180925_lane1 | \n",
+ " 1M_cells | \n",
+ " 3876.0 | \n",
+ " 1048 | \n",
+ " 180925_lane1 | \n",
+ " 180925_lane1 | \n",
+ " V2 | \n",
+ " 2 | \n",
+ " UT | \n",
+ " 3.766770 | \n",
+ " 2996.0 | \n",
+ " 1047 | \n",
+ " mono 1 | \n",
+ " monocyte | \n",
+ " LLDeep_1370 | \n",
+ " AAACCTGCATTCGACA_180925_lane1 | \n",
+ " UT | \n",
+ "
\n",
+ " \n",
+ " AAACCTGGTAATAGCA_180925_lane1 | \n",
+ " 1M_cells | \n",
+ " 4272.0 | \n",
+ " 1141 | \n",
+ " 180925_lane1 | \n",
+ " 180925_lane1 | \n",
+ " V2 | \n",
+ " 12 | \n",
+ " UT | \n",
+ " 4.564607 | \n",
+ " 3076.0 | \n",
+ " 1131 | \n",
+ " mono 1 | \n",
+ " monocyte | \n",
+ " LLDeep_0434 | \n",
+ " AAACCTGGTAATAGCA_180925_lane1 | \n",
+ " UT | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " orig.ident nCount_RNA nFeature_RNA \\\n",
+ "index \n",
+ "AAACCTGAGAGTACAT_180925_lane1 1M_cells 5190.0 1518 \n",
+ "AAACCTGAGTGTCTCA_180925_lane1 1M_cells 5597.0 1652 \n",
+ "AAACCTGCAGTCGATT_180925_lane1 1M_cells 3039.0 849 \n",
+ "AAACCTGCATTCGACA_180925_lane1 1M_cells 3876.0 1048 \n",
+ "AAACCTGGTAATAGCA_180925_lane1 1M_cells 4272.0 1141 \n",
+ "\n",
+ " batch lane chem exp.id \\\n",
+ "index \n",
+ "AAACCTGAGAGTACAT_180925_lane1 180925_lane1 180925_lane1 V2 2 \n",
+ "AAACCTGAGTGTCTCA_180925_lane1 180925_lane1 180925_lane1 V2 12 \n",
+ "AAACCTGCAGTCGATT_180925_lane1 180925_lane1 180925_lane1 V2 11 \n",
+ "AAACCTGCATTCGACA_180925_lane1 180925_lane1 180925_lane1 V2 2 \n",
+ "AAACCTGGTAATAGCA_180925_lane1 180925_lane1 180925_lane1 V2 12 \n",
+ "\n",
+ " timepoint percent.mt nCount_SCT nFeature_SCT \\\n",
+ "index \n",
+ "AAACCTGAGAGTACAT_180925_lane1 UT 1.560694 3297.0 1438 \n",
+ "AAACCTGAGTGTCTCA_180925_lane1 UT 3.394676 3353.0 1507 \n",
+ "AAACCTGCAGTCGATT_180925_lane1 UT 3.685423 2786.0 849 \n",
+ "AAACCTGCATTCGACA_180925_lane1 UT 3.766770 2996.0 1047 \n",
+ "AAACCTGGTAATAGCA_180925_lane1 UT 4.564607 3076.0 1131 \n",
+ "\n",
+ " cell_type cell_type_lowerres assignment \\\n",
+ "index \n",
+ "AAACCTGAGAGTACAT_180925_lane1 mono 2 monocyte LLDeep_1370 \n",
+ "AAACCTGAGTGTCTCA_180925_lane1 th1 CD4T CD4T LLDeep_0434 \n",
+ "AAACCTGCAGTCGATT_180925_lane1 naive CD8T CD8T LLDeep_1319 \n",
+ "AAACCTGCATTCGACA_180925_lane1 mono 1 monocyte LLDeep_1370 \n",
+ "AAACCTGGTAATAGCA_180925_lane1 mono 1 monocyte LLDeep_0434 \n",
+ "\n",
+ " bare_barcode_lane time \n",
+ "index \n",
+ "AAACCTGAGAGTACAT_180925_lane1 AAACCTGAGAGTACAT_180925_lane1 UT \n",
+ "AAACCTGAGTGTCTCA_180925_lane1 AAACCTGAGTGTCTCA_180925_lane1 UT \n",
+ "AAACCTGCAGTCGATT_180925_lane1 AAACCTGCAGTCGATT_180925_lane1 UT \n",
+ "AAACCTGCATTCGACA_180925_lane1 AAACCTGCATTCGACA_180925_lane1 UT \n",
+ "AAACCTGGTAATAGCA_180925_lane1 AAACCTGGTAATAGCA_180925_lane1 UT "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_obs.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAU2UlEQVR4nO3df5Cd1X3f8ffHwsoPFdtjs8auBJViK2XUFBy8QFxoXNyaIDOtoHZqEdc4/lENUxQP9XhqTTOT0HEzgZZ0XNvYGg2DHTKmmsRGjFpkMHUmJo1No5XBgAhyFKGWjewgsCe2G4IQfPvHfdRclivts9JerTh6v2Z29j7nx3PPkXY/e+65z703VYUkqV0vW+gBSJLGy6CXpMYZ9JLUOINekhpn0EtS4wx6SWpcr6BPcmmSXUl2J9lwhHbnJXkuybvm2leSNB6zBn2SRcBNwGpgFXBlklWHaXcDcPdc+0qSxqfPiv58YHdV7amqA8BmYM2Idr8CfAl44ij6SpLG5JQebZYCjw8dTwMXDDdIshS4AngbcN5c+g6dYx2wDmDJkiVvPuuss3oMTZIEsGPHjieramJUXZ+gz4iyme+b8AngY1X1XPKC5n36DgqrNgGbACYnJ2tqaqrH0CRJAEn+9+Hq+gT9NHDG0PEyYN+MNpPA5i7kTwPekeRgz76SpDHqE/TbgZVJVgB/DqwFfmm4QVWtOHQ7yeeB/15VdyQ5Zba+kqTxmjXoq+pgkvUMrqZZBNxSVTuTXN3Vb5xr3/kZuiSpj5yIb1PsHr0kzU2SHVU1OarOV8ZKUuMMeklqnEEvSY0z6CWpcX0ur3xJWb7hzoUeQi97r79soYcg6SThil6SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxvYI+yaVJdiXZnWTDiPo1SR5M8kCSqSQXDdXtTfLQobr5HLwkaXazvk1xkkXATcDbgWlge5KtVfXIULOvAlurqpKcDfwucNZQ/cVV9eQ8jluS1FOfFf35wO6q2lNVB4DNwJrhBlX1o/qbTxlfApx4nzguSSepPkG/FHh86Hi6K3uBJFckeRS4E/jAUFUBX0myI8m6YxmsJGnu+gR9RpS9aMVeVVuq6izgcuDjQ1UXVtW5wGrgmiQ/P/JOknXd/v7U/v37ewxLktRHn6CfBs4YOl4G7Dtc46q6F3hDktO6433d9yeALQy2gkb121RVk1U1OTEx0XP4kqTZ9An67cDKJCuSLAbWAluHGyR5Y5J0t88FFgNPJVmS5NSufAlwCfDwfE5AknRks151U1UHk6wH7gYWAbdU1c4kV3f1G4F3AlcleRZ4Gnh3dwXO6cCW7m/AKcBtVXXXmOYiSRph1qAHqKptwLYZZRuHbt8A3DCi3x7gnGMcoyTpGPjKWElqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxvYI+yaVJdiXZnWTDiPo1SR5M8kCSqSQX9e0rSRqvWYM+ySLgJmA1sAq4MsmqGc2+CpxTVW8CPgDcPIe+kqQx6rOiPx/YXVV7quoAsBlYM9ygqn5UVdUdLgGqb19J0nj1CfqlwONDx9Nd2QskuSLJo8CdDFb1vft2/dd12z5T+/fv7zN2SVIPfYI+I8rqRQVVW6rqLOBy4ONz6dv131RVk1U1OTEx0WNYkqQ++gT9NHDG0PEyYN/hGlfVvcAbkpw2176SpPnXJ+i3AyuTrEiyGFgLbB1ukOSNSdLdPhdYDDzVp68kabxOma1BVR1Msh64G1gE3FJVO5Nc3dVvBN4JXJXkWeBp4N3dk7Mj+45pLpKkEWYNeoCq2gZsm1G2cej2DcANfftKko4fXxkrSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16S
GmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxvYI+yaVJdiXZnWTDiPr3JHmw+/p6knOG6vYmeSjJA0mm5nPwkqTZzfqZsUkWATcBbwemge1JtlbVI0PNHgPeWlXfT7Ia2ARcMFR/cVU9OY/jliT11GdFfz6wu6r2VNUBYDOwZrhBVX29qr7fHd4HLJvfYUqSjlafoF8KPD50PN2VHc4HgS8PHRfwlSQ7kqw7XKck65JMJZnav39/j2FJkvqYdesGyIiyGtkwuZhB0F80VHxhVe1L8lrgniSPVtW9Lzph1SYGWz5MTk6OPP/JaPmGOxd6CL3svf6yhR6CpMPos6KfBs4YOl4G7JvZKMnZwM3Amqp66lB5Ve3rvj8BbGGwFSRJOk76BP12YGWSFUkWA2uBrcMNkpwJ3A68t6q+PVS+JMmph24DlwAPz9fgJUmzm3XrpqoOJlkP3A0sAm6pqp1Jru7qNwK/BrwG+EwSgINVNQmcDmzpyk4Bbququ8YyE0nSSH326KmqbcC2GWUbh25/CPjQiH57gHNmlkuSjh9fGStJjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXG9gj7JpUl2JdmdZMOI+vckebD7+nqSc/r2lSSN16xBn2QRcBOwGlgFXJlk1YxmjwFvraqzgY8Dm+bQV5I0Rn1W9OcDu6tqT1UdADYDa4YbVNXXq+r73eF9wLK+fSVJ49Un6JcCjw8dT3dlh/NB4Mtz7ZtkXZKpJFP79+/vMSxJUh99gj4jympkw+RiBkH/sbn2rapNVTVZVZMTExM9hiVJ6uOUHm2mgTOGjpcB+2Y2SnI2cDOwuqqemktfSdL49FnRbwdWJlmRZDGwFtg63CDJmcDtwHur6ttz6StJGq9ZV/RVdTDJeuBuYBFwS1XtTHJ1V78R+DXgNcBnkgAc7LZhRvYd01wkSSP02bqhqrYB22aUbRy6/SHgQ337SpKOH18ZK0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcb0+SjDJpcB/YfC5rzdX1fUz6s8CPgecC/xqVd04VLcX+CHwHN1nyc7P0KUTw/INdy70EHrZe/1lCz0ELZBZgz7JIuAm4O3ANLA9ydaqemSo2feADwOXH+Y0F1fVk8c4VknSUeizdXM+sLuq9lTVAWAzsGa4QVU9UVXbgWfHMEZJ0jHoE/RLgceHjqe7sr4K+EqSHUnWHa5RknVJppJM7d+/fw6nlyQdSZ+gz4iymsN9XFhV5wKrgWuS/PyoRlW1qaomq2pyYmJiDqeXJB1Jnydjp4Ezho6XAfv63kFV7eu+P5FkC4OtoHvnMki1xScvpeOrz4p+O7AyyYoki4G1wNY+J0+yJMmph24DlwAPH+1gJUlzN+uKvqoOJlkP3M3g8spbqmpnkqu7+o1JXgdMAa8Ank9yLbAKOA3YkuTQfd1WVXeNZSaSpJF6XUdfVduAbTPKNg7d/i6DLZ2ZfgCccywDlCQdG18ZK0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcb2CPsmlSXYl2Z1kw4j6s5J8I8kzST46l76SpPGaNeiTLAJuAlYz+MDvK5OsmtHse8CHgRuPoq8kaYz6rOjPB3ZX1Z6qOgBsBtYMN6iqJ6pqO/DsXPtKksarT9AvBR4fOp7uyvro3TfJuiRTSab279/f8/SSpNn0CfqMKKue5+/dt6o2VdVkVU1OTEz0PL0kaTZ9gn4aOGPoeBmwr+f5j6Wv
JGke9An67cDKJCuSLAbWAlt7nv9Y+kqS5sEpszWoqoNJ1gN3A4uAW6pqZ5Kru/qNSV4HTAGvAJ5Pci2wqqp+MKrvmOYiSRph1qAHqKptwLYZZRuHbn+XwbZMr76SpOPHV8ZKUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWpcr6BPcmmSXUl2J9kwoj5JPtnVP5jk3KG6vUkeSvJAkqn5HLwkaXazfmZskkXATcDbgWlge5KtVfXIULPVwMru6wLgs933Qy6uqifnbdSSpN76rOjPB3ZX1Z6qOgBsBtbMaLMGuLUG7gNeleT18zxWSdJR6BP0S4HHh46nu7K+bQr4SpIdSdYd7UAlSUdn1q0bICPKag5tLqyqfUleC9yT5NGquvdFdzL4I7AO4Mwzz+wxLElSH31W9NPAGUPHy4B9fdtU1aHvTwBbGGwFvUhVbaqqyaqanJiY6Dd6SdKs+gT9dmBlkhVJFgNrga0z2mwFruquvvk54C+r6jtJliQ5FSDJEuAS4OF5HL8kaRazbt1U1cEk64G7gUXALVW1M8nVXf1GYBvwDmA38FfA+7vupwNbkhy6r9uq6q55n4Uk6bD67NFTVdsYhPlw2cah2wVcM6LfHuCcYxyjJOkY+MpYSWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqXK9Xxko6uSzfcOdCD6GXvddfttBDeElwRS9JjXNFL6l5J/sjFFf0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1rlfQJ7k0ya4ku5NsGFGfJJ/s6h9Mcm7fvpKk8Zo16JMsAm4CVgOrgCuTrJrRbDWwsvtaB3x2Dn0lSWPUZ0V/PrC7qvZU1QFgM7BmRps1wK01cB/wqiSv79lXkjRGfd4CYSnw+NDxNHBBjzZLe/YFIMk6Bo8GAH6UZFePsR0vpwFPzucJc8N8nm3OWpsPtDen1uYD7c3pRJvP3zlcRZ+gz4iy6tmmT99BYdUmYFOP8Rx3SaaqanKhxzFfWpsPtDen1uYD7c3ppTSfPkE/DZwxdLwM2NezzeIefSVJY9Rnj347sDLJiiSLgbXA1hlttgJXdVff/Bzwl1X1nZ59JUljNOuKvqoOJlkP3A0sAm6pqp1Jru7qNwLbgHcAu4G/At5/pL5jmcl4nZBbSsegtflAe3NqbT7Q3pxeMvNJ1cgtc0lSI3xlrCQ1zqCXpMadlEGf5HVJNif5sySPJNmW5KeTPJ3k/iR/kuSPk7xvRN/zkjyX5F1JXpPkge7ru0n+fOh48Yk8nySvTPLfknwryc4k7z9R5nO8JfnlJH97ge67kvzW0PFHk1zX3b4uyUe72z+e5J4kv74Q4+yr+914oPuZ+laSjyR52VD9+Unu7d4W5dEkNyf5yYUc81wNzfFbSb6Z5B8s9Jhmc9J9ZmySAFuA366qtV3Zm4DTgT+rqp/tyn4KuD3Jy6rqc13ZIuAGBk8uU1VPAW/q6q4DflRVN75E5nMN8EhV/dMkE8Au4AtVtaDzWSC/DDzMwlz6+wzwz5P8ZlWNfPFN90f2S8COqvr3x3V0c/f00M/Qa4HbgFcCv57kdOD3gLVV9Y3uZ/edwKkMLuJ4qRie4y8Avwm8dUFHNIuTcUV/MfBsd7UQAFX1AC98BS9VtQf4CPDhoeJfYfAL98T4h9nb0c6ngFO7X7a/BXwPODiuQSZZPrSCezjJF5L8kyR/lORPu5Xeq5PckcEb492X5Oyu73VJbknyB0n2JPnw0Hk/0p3v4STXDpVf1Z3nW0l+J8mpSR5L8vKu/hVJ9ib5RWAS+EK3SvuJJG9O8rUkO5LcncHbeYzLQQZXb/ybw9SfwuCtQ/60ql5SbwpYVU8weLX7+u7n7BoGC5JvdPVVVV+sqr9YyHEeo1cA31/oQczmpFvRAz8D7OjZ
9pvAWQBJlgJXAG8DzhvP0I7KUc0H+DSD1zTsY7CiendVPT//w3uBNwK/yOCXfzvwS8BFwD8D/h2DP073V9XlSd4G3Er3iKkb98XdWHcl+SxwNoNLeS9g8Crs/5Xka8AB4FeBC6vqySSvrqofJvkD4DLgDgav6fhSVf1ekmuAj1bVVPeH4FPAmqran+TdwG8AHxjjv8tNwINJ/uOIun8L/I+qunaM9z82VbWn27p5LYOf1d9e4CHNh59I8gDw48DrGWTCCe1kDPq5GH4Lh08AH6uq5waLk5ek4YH/AvAAgx/SNwD3JPnDqvrBGO//sap6CCDJTuCrVVVJHgKWM3ivjncCVNXvd88ZvLLre2dVPQM8k+QJBltTFwFbqur/due8HfiHDB6tfPHQVkhVfa87x80MgvMOBn8g/tWIMf5dBoF0T/f/vAj4zrz9C4xQVT9IciuDR1tPz6j+n8Bbkvx0VX17nOMYo5fsL8xhDG/dvAW4NcnP1Al8rfrJuHWzE3hzz7Y/C/xJd3sS2JxkL/Au4DNJLp/30c3d0c7n/cDt3cPn3cBj/M1qf1yeGbr9/NDx8wwWHUd6b6Thvs8doT1d+Yt+6arqj4DlSd4KLKqqhw/Td2dVvan7+vtVdcnhJjSPPgF8EFgyo/xe4Frgywv1hPGx6J4beo7BdudcflZfErptqNOAiYUey5GcjEH/+8CPJfn/q7kk5zHjnd+SLAduZPAwnqpaUVXLq2o58EXgX1fVHcdpzEdyVPMB/g/wj7u60xmsZPcch/Eeyb3AewCS/CPgyVkeYdwLXJ7kJ5MsYbC19ofAV4F/keQ13blePdTnVuC/Ap8bKvshgy0hGDwpPdGt1Ejy8iR/7xjnNavuUcfvMgj7mXVfAv4TcFeSV417LPOle5J/I/DpbrX7aeB9SS4YavMvk7xuocZ4rJKcxeBR31MLPZYjOem2brqtgiuAT2TwiVd/DexlsGp6Q5L7Gey9/RD41KErbk5UxzCfjwOf77ZNwmBbal7fcvUoXAd8LsmDDK7CeNHlrcOq6ptJPg/8cVd0c1XdD5DkN4CvJXkOuJ/BlTUAXwD+A4OwP+TzwMYkTwNvYfCI7ZPdttEpDFbbx+OtO34LWD+qoqo2doG4NcklVfXXx2E8R+PQ/vXLGTzR/DvAfwaoqr9Isha4sbsi53kGf6xvX6CxHq1Dc4TB7877quq5BRzPrHwLBJ1UkryLwROt713osUjHy0m3otfJK8mnGHys5TsWeizS8eSKXpIadzI+GStJJxWDXpIaZ9BLUuMMeklqnEEvSY37f/cjzCi39pQ6AAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "celltypes = ['CD4T', 'CD8T', 'monocyte', 'NK', 'DC', 'B']\n",
+ "plot_df = data_obs['cell_type_lowerres'].value_counts(normalize=True).loc[celltypes]\n",
+ "plt.bar(x=plot_df.index, height=plot_df.values)\n",
+ "plt.ylim([0, 0.40])\n",
+ "plt.savefig('cell_abundance_oelenv2.pdf')\n",
+ "plt.savefig('cell_abundance_oelenv2.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " predicted.celltype.l1 | \n",
+ " predicted.celltype.l1.score | \n",
+ " predicted.celltype.l2 | \n",
+ " predicted.celltype.l2.score | \n",
+ "
\n",
+ " \n",
+ " barcode | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " AAACCTGAGAAACCAT_180920_lane1 | \n",
+ " CD8 T | \n",
+ " 0.755924 | \n",
+ " CD8 TEM | \n",
+ " 0.755924 | \n",
+ "
\n",
+ " \n",
+ " AAACCTGAGTAGCCGA_180920_lane1 | \n",
+ " Mono | \n",
+ " 1.000000 | \n",
+ " CD16 Mono | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCAATCTACG_180920_lane1 | \n",
+ " NK | \n",
+ " 1.000000 | \n",
+ " NK | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCACATCCAA_180920_lane1 | \n",
+ " other | \n",
+ " 0.546320 | \n",
+ " ILC | \n",
+ " 0.546320 | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCAGCTCGAC_180920_lane1 | \n",
+ " NK | \n",
+ " 0.733094 | \n",
+ " NK | \n",
+ " 0.733094 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " predicted.celltype.l1 \\\n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 CD8 T \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 Mono \n",
+ "AAACCTGCAATCTACG_180920_lane1 NK \n",
+ "AAACCTGCACATCCAA_180920_lane1 other \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 NK \n",
+ "\n",
+ " predicted.celltype.l1.score \\\n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 0.755924 \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 1.000000 \n",
+ "AAACCTGCAATCTACG_180920_lane1 1.000000 \n",
+ "AAACCTGCACATCCAA_180920_lane1 0.546320 \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 0.733094 \n",
+ "\n",
+ " predicted.celltype.l2 \\\n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 CD8 TEM \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 CD16 Mono \n",
+ "AAACCTGCAATCTACG_180920_lane1 NK \n",
+ "AAACCTGCACATCCAA_180920_lane1 ILC \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 NK \n",
+ "\n",
+ " predicted.celltype.l2.score \n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 0.755924 \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 1.000000 \n",
+ "AAACCTGCAATCTACG_180920_lane1 1.000000 \n",
+ "AAACCTGCACATCCAA_180920_lane1 0.546320 \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 0.733094 "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "azimuith_df = pd.read_csv(\n",
+ " '1M_v2_20201029_azimuth.tsv',\n",
+ " sep='\\t', index_col=0\n",
+ ")\n",
+ "azimuith_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['monocyte', 'CD4T', 'CD8T', 'NK', 'megakaryocyte', 'B', 'DC', 'plasma B', 'unknown', 'hemapoietic stem']\n",
+ "Categories (10, object): ['B', 'hemapoietic stem', 'megakaryocyte', 'NK', ..., 'CD4T', 'CD8T', 'monocyte', 'DC']"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_obs['cell_type_lowerres'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['CD8 T', 'Mono', 'NK', 'other', 'CD4 T', 'DC', 'B', 'other T'],\n",
+ " dtype=object)"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "azimuith_df['predicted.celltype.l1'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " predicted.celltype.l1 | \n",
+ " predicted.celltype.l1.score | \n",
+ " predicted.celltype.l2 | \n",
+ " predicted.celltype.l2.score | \n",
+ " cell_type_mapped | \n",
+ "
\n",
+ " \n",
+ " barcode | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " AAACCTGAGAAACCAT_180920_lane1 | \n",
+ " CD8 T | \n",
+ " 0.755924 | \n",
+ " CD8 TEM | \n",
+ " 0.755924 | \n",
+ " CD8T | \n",
+ "
\n",
+ " \n",
+ " AAACCTGAGTAGCCGA_180920_lane1 | \n",
+ " Mono | \n",
+ " 1.000000 | \n",
+ " CD16 Mono | \n",
+ " 1.000000 | \n",
+ " monocyte | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCAATCTACG_180920_lane1 | \n",
+ " NK | \n",
+ " 1.000000 | \n",
+ " NK | \n",
+ " 1.000000 | \n",
+ " NK | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCACATCCAA_180920_lane1 | \n",
+ " other | \n",
+ " 0.546320 | \n",
+ " ILC | \n",
+ " 0.546320 | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCAGCTCGAC_180920_lane1 | \n",
+ " NK | \n",
+ " 0.733094 | \n",
+ " NK | \n",
+ " 0.733094 | \n",
+ " NK | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " predicted.celltype.l1 \\\n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 CD8 T \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 Mono \n",
+ "AAACCTGCAATCTACG_180920_lane1 NK \n",
+ "AAACCTGCACATCCAA_180920_lane1 other \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 NK \n",
+ "\n",
+ " predicted.celltype.l1.score \\\n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 0.755924 \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 1.000000 \n",
+ "AAACCTGCAATCTACG_180920_lane1 1.000000 \n",
+ "AAACCTGCACATCCAA_180920_lane1 0.546320 \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 0.733094 \n",
+ "\n",
+ " predicted.celltype.l2 \\\n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 CD8 TEM \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 CD16 Mono \n",
+ "AAACCTGCAATCTACG_180920_lane1 NK \n",
+ "AAACCTGCACATCCAA_180920_lane1 ILC \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 NK \n",
+ "\n",
+ " predicted.celltype.l2.score cell_type_mapped \n",
+ "barcode \n",
+ "AAACCTGAGAAACCAT_180920_lane1 0.755924 CD8T \n",
+ "AAACCTGAGTAGCCGA_180920_lane1 1.000000 monocyte \n",
+ "AAACCTGCAATCTACG_180920_lane1 1.000000 NK \n",
+ "AAACCTGCACATCCAA_180920_lane1 0.546320 None \n",
+ "AAACCTGCAGCTCGAC_180920_lane1 0.733094 NK "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mapping_names = {'CD8 T': 'CD8T', \n",
+ " 'CD4 T': 'CD4T',\n",
+ " 'Mono': 'monocyte',\n",
+ " 'NK': 'NK',\n",
+ " 'B': 'B',\n",
+ " 'DC': 'DC'}\n",
+ "azimuith_df['cell_type_mapped'] = [mapping_names.get(name) for name in \n",
+ " azimuith_df['predicted.celltype.l1']]\n",
+ "azimuith_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " cell_type_lowerres | \n",
+ " cell_type_mapped | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " AAACCTGAGAGTACAT_180925_lane1 | \n",
+ " monocyte | \n",
+ " monocyte | \n",
+ "
\n",
+ " \n",
+ " AAACCTGAGTGTCTCA_180925_lane1 | \n",
+ " CD4T | \n",
+ " CD4T | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCAGTCGATT_180925_lane1 | \n",
+ " CD8T | \n",
+ " CD8T | \n",
+ "
\n",
+ " \n",
+ " AAACCTGCATTCGACA_180925_lane1 | \n",
+ " monocyte | \n",
+ " monocyte | \n",
+ "
\n",
+ " \n",
+ " AAACCTGGTAATAGCA_180925_lane1 | \n",
+ " monocyte | \n",
+ " monocyte | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " cell_type_lowerres cell_type_mapped\n",
+ "AAACCTGAGAGTACAT_180925_lane1 monocyte monocyte\n",
+ "AAACCTGAGTGTCTCA_180925_lane1 CD4T CD4T\n",
+ "AAACCTGCAGTCGATT_180925_lane1 CD8T CD8T\n",
+ "AAACCTGCATTCGACA_180925_lane1 monocyte monocyte\n",
+ "AAACCTGGTAATAGCA_180925_lane1 monocyte monocyte"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_classification_df = pd.concat([data_obs[['cell_type_lowerres']],\n",
+ " azimuith_df[['cell_type_mapped']]],\n",
+ " axis=1).dropna()\n",
+ "merged_classification_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Column-normalised confusion matrix: entry [azimuth, dataset] is the fraction of cells\n",
+ "# carrying the dataset label that Azimuth assigned to the given (mapped) cell type.\n",
+ "celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'NK', 'B']\n",
+ "accuracy_df = pd.DataFrame(\n",
+ "    data=np.zeros((len(celltypes), len(celltypes))),\n",
+ "    index=celltypes,\n",
+ "    columns=celltypes\n",
+ ")\n",
+ "for celltype_onemillionv2 in celltypes:\n",
+ "    # The denominator only depends on the dataset label, so compute it once per column.\n",
+ "    is_onemillion_type = merged_classification_df['cell_type_lowerres'] == celltype_onemillionv2\n",
+ "    onemillion_classification_num = is_onemillion_type.sum()\n",
+ "    for celltype_azimuth in celltypes:\n",
+ "        is_azimuth_type = merged_classification_df['cell_type_mapped'] == celltype_azimuth\n",
+ "        common_classification_num = (is_onemillion_type & is_azimuth_type).sum()\n",
+ "        # Assign with a single .loc[row, column]; the chained form df[col].loc[row] = value\n",
+ "        # is not guaranteed to write into accuracy_df (pandas chained assignment).\n",
+ "        accuracy_df.loc[celltype_azimuth, celltype_onemillionv2] = (\n",
+ "            common_classification_num / onemillion_classification_num)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.figure(figsize=(6, 5))\n",
+ "sns.heatmap(accuracy_df, vmin=0, vmax=1,\n",
+ " cmap=\"YlGnBu\", \n",
+ " annot=True, annot_kws={\"size\": 10},\n",
+ " fmt='.2f')\n",
+ "plt.xlabel('Cell type in Oelen v2')\n",
+ "plt.ylabel('Cell type by Azimuth Algorithm')\n",
+ "plt.savefig('marker_gene_azimuth_classification.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " predicted.celltype.l1 | \n",
+ " predicted.celltype.l1.score | \n",
+ " predicted.celltype.l2 | \n",
+ " predicted.celltype.l2.score | \n",
+ "
\n",
+ " \n",
+ " barcode | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " AAACCCAAGATACCAA_190109_lane1 | \n",
+ " CD8 T | \n",
+ " 0.981818 | \n",
+ " CD8 Naive | \n",
+ " 0.955707 | \n",
+ "
\n",
+ " \n",
+ " AAACCCAAGTCCCTAA_190109_lane1 | \n",
+ " CD4 T | \n",
+ " 0.379819 | \n",
+ " CD4 CTL | \n",
+ " 0.327444 | \n",
+ "
\n",
+ " \n",
+ " AAACCCACAAGAGTGC_190109_lane1 | \n",
+ " CD4 T | \n",
+ " 0.740111 | \n",
+ " CD4 Naive | \n",
+ " 0.723822 | \n",
+ "
\n",
+ " \n",
+ " AAACCCACAATCCAGT_190109_lane1 | \n",
+ " CD4 T | \n",
+ " 0.916869 | \n",
+ " CD4 TCM | \n",
+ " 0.468549 | \n",
+ "
\n",
+ " \n",
+ " AAACCCACACTATCCC_190109_lane1 | \n",
+ " Mono | \n",
+ " 1.000000 | \n",
+ " CD14 Mono | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " predicted.celltype.l1 \\\n",
+ "barcode \n",
+ "AAACCCAAGATACCAA_190109_lane1 CD8 T \n",
+ "AAACCCAAGTCCCTAA_190109_lane1 CD4 T \n",
+ "AAACCCACAAGAGTGC_190109_lane1 CD4 T \n",
+ "AAACCCACAATCCAGT_190109_lane1 CD4 T \n",
+ "AAACCCACACTATCCC_190109_lane1 Mono \n",
+ "\n",
+ " predicted.celltype.l1.score \\\n",
+ "barcode \n",
+ "AAACCCAAGATACCAA_190109_lane1 0.981818 \n",
+ "AAACCCAAGTCCCTAA_190109_lane1 0.379819 \n",
+ "AAACCCACAAGAGTGC_190109_lane1 0.740111 \n",
+ "AAACCCACAATCCAGT_190109_lane1 0.916869 \n",
+ "AAACCCACACTATCCC_190109_lane1 1.000000 \n",
+ "\n",
+ " predicted.celltype.l2 \\\n",
+ "barcode \n",
+ "AAACCCAAGATACCAA_190109_lane1 CD8 Naive \n",
+ "AAACCCAAGTCCCTAA_190109_lane1 CD4 CTL \n",
+ "AAACCCACAAGAGTGC_190109_lane1 CD4 Naive \n",
+ "AAACCCACAATCCAGT_190109_lane1 CD4 TCM \n",
+ "AAACCCACACTATCCC_190109_lane1 CD14 Mono \n",
+ "\n",
+ " predicted.celltype.l2.score \n",
+ "barcode \n",
+ "AAACCCAAGATACCAA_190109_lane1 0.955707 \n",
+ "AAACCCAAGTCCCTAA_190109_lane1 0.327444 \n",
+ "AAACCCACAAGAGTGC_190109_lane1 0.723822 \n",
+ "AAACCCACAATCCAGT_190109_lane1 0.468549 \n",
+ "AAACCCACACTATCCC_190109_lane1 1.000000 "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "azimuith_df_v3 = pd.read_csv(\n",
+ " '1M_v3_20201106_azimuth.tsv',\n",
+ " sep='\\t', index_col=0\n",
+ ")\n",
+ "azimuith_df_v3.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mapping_names = {'CD8 T': 'CD8T', \n",
+ " 'CD4 T': 'CD4T',\n",
+ " 'Mono': 'monocyte',\n",
+ " 'NK': 'NK',\n",
+ " 'B': 'B',\n",
+ " 'DC': 'DC'}\n",
+ "azimuith_df_v3['cell_type_mapped'] = [mapping_names.get(name) for name in \n",
+ " azimuith_df_v3['predicted.celltype.l1']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_scv3 = onemillionv3.data_sc[onemillionv3.data_sc.obs['time']=='UT']\n",
+ "data_obsv3 = data_scv3.obs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " cell_type_lowerres | \n",
+ " cell_type_mapped | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " AAACCCAAGATACCAA_190109_lane1 | \n",
+ " CD4T | \n",
+ " CD8T | \n",
+ "
\n",
+ " \n",
+ " AAACCCACAAGAGTGC_190109_lane1 | \n",
+ " CD4T | \n",
+ " CD4T | \n",
+ "
\n",
+ " \n",
+ " AAACCCACAATCCAGT_190109_lane1 | \n",
+ " CD4T | \n",
+ " CD4T | \n",
+ "
\n",
+ " \n",
+ " AAACCCACAGGTACGA_190109_lane1 | \n",
+ " CD4T | \n",
+ " CD4T | \n",
+ "
\n",
+ " \n",
+ " AAACCCAGTCTACGTA_190109_lane1 | \n",
+ " CD4T | \n",
+ " CD4T | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " cell_type_lowerres cell_type_mapped\n",
+ "AAACCCAAGATACCAA_190109_lane1 CD4T CD8T\n",
+ "AAACCCACAAGAGTGC_190109_lane1 CD4T CD4T\n",
+ "AAACCCACAATCCAGT_190109_lane1 CD4T CD4T\n",
+ "AAACCCACAGGTACGA_190109_lane1 CD4T CD4T\n",
+ "AAACCCAGTCTACGTA_190109_lane1 CD4T CD4T"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_classification_df_v3 = pd.concat([data_obsv3[['cell_type_lowerres']],\n",
+ " azimuith_df_v3[['cell_type_mapped']]],\n",
+ " axis=1).dropna()\n",
+ "merged_classification_df_v3.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Column-normalised confusion matrix: entry [azimuth, dataset] is the fraction of cells\n",
+ "# carrying the dataset label that Azimuth assigned to the given (mapped) cell type.\n",
+ "celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'NK', 'B']\n",
+ "accuracy_df_v3 = pd.DataFrame(\n",
+ "    data=np.zeros((len(celltypes), len(celltypes))),\n",
+ "    index=celltypes,\n",
+ "    columns=celltypes\n",
+ ")\n",
+ "for celltype_onemillionv2 in celltypes:\n",
+ "    # The denominator only depends on the dataset label, so compute it once per column.\n",
+ "    is_onemillion_type = merged_classification_df_v3['cell_type_lowerres'] == celltype_onemillionv2\n",
+ "    onemillion_classification_num = is_onemillion_type.sum()\n",
+ "    for celltype_azimuth in celltypes:\n",
+ "        is_azimuth_type = merged_classification_df_v3['cell_type_mapped'] == celltype_azimuth\n",
+ "        common_classification_num = (is_onemillion_type & is_azimuth_type).sum()\n",
+ "        # Assign with a single .loc[row, column]; the chained form df[col].loc[row] = value\n",
+ "        # is not guaranteed to write into accuracy_df_v3 (pandas chained assignment).\n",
+ "        accuracy_df_v3.loc[celltype_azimuth, celltype_onemillionv2] = (\n",
+ "            common_classification_num / onemillion_classification_num)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CD4T | \n",
+ " CD8T | \n",
+ " monocyte | \n",
+ " DC | \n",
+ " NK | \n",
+ " B | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " CD4T | \n",
+ " 0.952222 | \n",
+ " 0.157213 | \n",
+ " 0.003671 | \n",
+ " 0.001845 | \n",
+ " 0.016403 | \n",
+ " 0.001099 | \n",
+ "
\n",
+ " \n",
+ " CD8T | \n",
+ " 0.046923 | \n",
+ " 0.787428 | \n",
+ " 0.001606 | \n",
+ " 0.000000 | \n",
+ " 0.014666 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " monocyte | \n",
+ " 0.000513 | \n",
+ " 0.000409 | \n",
+ " 0.993346 | \n",
+ " 0.177122 | \n",
+ " 0.000386 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " DC | \n",
+ " 0.000085 | \n",
+ " 0.000000 | \n",
+ " 0.000574 | \n",
+ " 0.821033 | \n",
+ " 0.000193 | \n",
+ " 0.001099 | \n",
+ "
\n",
+ " \n",
+ " NK | \n",
+ " 0.000256 | \n",
+ " 0.054950 | \n",
+ " 0.000688 | \n",
+ " 0.000000 | \n",
+ " 0.968352 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " B | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000115 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.997802 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CD4T CD8T monocyte DC NK B\n",
+ "CD4T 0.952222 0.157213 0.003671 0.001845 0.016403 0.001099\n",
+ "CD8T 0.046923 0.787428 0.001606 0.000000 0.014666 0.000000\n",
+ "monocyte 0.000513 0.000409 0.993346 0.177122 0.000386 0.000000\n",
+ "DC 0.000085 0.000000 0.000574 0.821033 0.000193 0.001099\n",
+ "NK 0.000256 0.054950 0.000688 0.000000 0.968352 0.000000\n",
+ "B 0.000000 0.000000 0.000115 0.000000 0.000000 0.997802"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "accuracy_df_v3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.figure(figsize=(6, 5))\n",
+ "sns.heatmap(accuracy_df_v3, vmin=0, vmax=1,\n",
+ " cmap=\"YlGnBu\", \n",
+ " annot=True, annot_kws={\"size\": 10},\n",
+ " fmt='.2f')\n",
+ "plt.xlabel('Cell type in Oelen v3')\n",
+ "plt.ylabel('Cell type by Azimuth Algorithm')\n",
+ "plt.savefig('marker_gene_azimuth_classification_oelenv3.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/01_association_metrics/metacell/create_genesets.R b/01_association_metrics/metacell/create_genesets.R
new file mode 100644
index 0000000..d0fe688
--- /dev/null
+++ b/01_association_metrics/metacell/create_genesets.R
@@ -0,0 +1,39 @@
+# ------------------------------------------------------------------------------
+# Gene selections for metacell evaluation: generate files for different
+# gene subsets (expressed in x% until (x+20)% of the cells with x between 20-80)
+# for Oelen v3 dataset, Monocytes
+# This allows threshold dependent evaluation for BLUEPRINT comparison.
+# See details downstream scripts metacell_general_correlation_tp.R,
+# single_cell_correlation_tp.R, eval_blueprint_genesets.R
+# ------------------------------------------------------------------------------
+
+library(Seurat)
+
+#Load complete seurat object
+seurat<-readRDS("seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.rds")
+DefaultAssay(seurat)<-"RNA"
+
+#Filter for monocytes
+seurat<-seurat[,seurat$cell_type_lowerres=="monocyte"]
+
+#Bin boundaries: each gene set covers (cutoffs[i+1], cutoffs[i]]
+cutoffs<-c(1,0.8,0.6,0.4,0.2)
+
+#Fraction of cells with a non-zero count per gene; Matrix::rowSums avoids converting the (typically sparse) count matrix to a dense one
+exprGenes.singleCell<-Matrix::rowSums(seurat@assays$RNA@counts>0)/ncol(seurat)
+
+print(paste("Number of genes expressed in at least 50% of cells:",
+            sum(exprGenes.singleCell>=0.5)))
+
+for(i in seq_len(length(cutoffs)-1)){
+  gene.subset<-rownames(seurat)[exprGenes.singleCell<=cutoffs[i] &
+                                  exprGenes.singleCell>cutoffs[i+1]]
+  print(paste("Number of genes with expression cutoff",
+              cutoffs[i+1],":",length(gene.subset)))
+
+  #One gene name per line, file named after the lower bound of the bin
+  write.table(gene.subset,
+              file=paste0("metacell_general/eval_allmethods/gene_lists/",
+                          "mono_expr_genes_cut_",cutoffs[i+1],".txt"),
+              row.names = FALSE,col.names = FALSE,quote=FALSE)
+}
+
diff --git a/01_association_metrics/metacell/eval_blueprint_genesets.R b/01_association_metrics/metacell/eval_blueprint_genesets.R
new file mode 100644
index 0000000..2320fb4
--- /dev/null
+++ b/01_association_metrics/metacell/eval_blueprint_genesets.R
@@ -0,0 +1,127 @@
+# ------------------------------------------------------------------------------
+# Compare correlation from BLUEPRINT with correlation from metacells/single cell
+# for different expression thresholds
+# Here: shown for leiden metacells, calculation done the same way for original
+# metacells and single cell
+# ------------------------------------------------------------------------------
+
+library(reticulate) # to read the single cell data (numpy)
+library(data.table)
+library(ggplot2) # only required if plotting=TRUE
+
+np <- import("numpy")
+
+#Correlation files to evaluate, one per gene set (expression cutoff)
+corr_files<-c("metacell_general/leiden_metacells/correlation_r_leiden_SCT_cutoff08.tsv",
+              "metacell_general/leiden_metacells/correlation_r_leiden_SCT_cutoff06.tsv",
+              "metacell_general/leiden_metacells/correlation_r_leiden_SCT_cutoff04.tsv",
+              "metacell_general/leiden_metacells/correlation_r_leiden_SCT_cutoff02.tsv")
+
+res_file<-"metacell_general/eval_allmethods/sc_leiden_SCT_eval.tsv"
+
+plotting<-TRUE
+
+#Write column headers (later results are appended to the same file without header)
+write.table(data.frame("Condition","Num_pairs","Corr_corr","Test","File"),
+            file=res_file,quote=FALSE,sep="\t",
+            row.names=FALSE, col.names = FALSE)
+
+#Load the large blueprint data set (values from .npy, pair names from .genePairs.txt)
+path<-"blueprint/allGenePairs_BlueprintScMonocytes_GeneGeneCorrelationComparison.pairwiseSpearman."
+corr.blue.vals <- np$load(paste0(path,"npy"))
+corr.blue.vals<-corr.blue.vals[,1] #only the first column of the loaded array is used
+
+corr.blue<-fread(paste0(path,"genePairs.txt"),header=FALSE)
+
+#Split "gene1/gene2" into two columns in one vectorised pass (no per-row strsplit)
+corr.blue[,c("gene1","gene2"):=tstrsplit(V1,"/",fixed=TRUE,keep=1:2)]
+corr.blue$V1<-NULL
+corr.blue$corr.blue<-corr.blue.vals
+rm(corr.blue.vals)
+
+for(cfile in corr_files){
+
+  print(paste("Processing:",cfile))
+
+  #Load the metacell correlation table (Gene1, Gene2, one column per condition)
+  corr.mc.r<-read.table(cfile)
+
+  #Keep only BLUEPRINT pairs for which both genes occur in the metacell table
+  corr.genes<-unique(c(corr.mc.r$Gene1,corr.mc.r$Gene2))
+  corr.blue.subset<-corr.blue[gene1 %in% corr.genes &
+                                gene2 %in% corr.genes]
+
+  #Order each pair so that gene1 is smaller than gene2 (string comparison)
+  corr.blue.subset$swap<-ifelse(corr.blue.subset$gene1 < corr.blue.subset$gene2,
+                                corr.blue.subset$gene1,corr.blue.subset$gene2)
+  corr.blue.subset$gene2<-ifelse(corr.blue.subset$gene1 < corr.blue.subset$gene2,
+                                 corr.blue.subset$gene2,corr.blue.subset$gene1)
+  corr.blue.subset$gene1<-corr.blue.subset$swap
+  corr.blue.subset$swap<-NULL
+  colnames(corr.blue.subset)<-c("Gene1","Gene2","Correlation.blue")
+
+  #Merge on the gene pair (pairs missing in either table are dropped), then reshape to long format
+  corr.mc.r<-merge(corr.mc.r,corr.blue.subset,by=c("Gene1","Gene2"))
+  corr.mc.r<-reshape2::melt(corr.mc.r,id.vars=c("Gene1","Gene2","Correlation.blue"))
+  colnames(corr.mc.r)[4:5]<-c("Condition","Correlation")
+
+  #Correlation for all genes
+  corr.corr<-sapply(unique(corr.mc.r$Condition), function(tp)
+    cor(corr.mc.r$Correlation.blue[corr.mc.r$Condition==tp],
+        corr.mc.r$Correlation[corr.mc.r$Condition==tp],
+        method="pearson",use = "pairwise.complete.obs"))
+
+  #factor() drops conditions without any pair so the counts line up with unique()
+  res<-data.frame(condition=unique(corr.mc.r$Condition),
+                  num.pairs=as.vector(table(factor(corr.mc.r$Condition))),
+                  corr.corr,
+                  test="allGenes",
+                  file=cfile)
+
+  write.table(res, file=res_file,quote=FALSE,sep="\t",
+              append=TRUE,row.names=FALSE, col.names = FALSE)
+
+  corr.mc.r<-corr.mc.r[! is.na(corr.mc.r$Correlation.blue),]
+  if(plotting){
+    corr.mc.r$class<-ifelse(corr.mc.r$Correlation.blue>0,
+                            "positive","negative")
+
+    g<-ggplot(corr.mc.r,aes(x=Correlation.blue,y=Correlation,color=class))+
+      geom_point()+facet_wrap(~Condition,ncol=3)+
+      xlab("Correlation Blueprint")+ylab("Correlation MC")
+    #Name the plot after the input file; the directory part is the same for all inputs
+    ggsave(g,file=paste0("metacell_general/eval_allmethods/plots/",
+                         "comp_corr_blue_",sub("\\.tsv$","",basename(cfile)),".png"))
+  }
+
+  #Correlation for genes with positive correlation
+  corr.mc.r.pos<-corr.mc.r[corr.mc.r$Correlation.blue>0,]
+  corr.corr<-sapply(unique(corr.mc.r.pos$Condition), function(tp)
+    cor(corr.mc.r.pos$Correlation.blue[corr.mc.r.pos$Condition==tp],
+        corr.mc.r.pos$Correlation[corr.mc.r.pos$Condition==tp],
+        method="pearson",use = "pairwise.complete.obs"))
+
+  res<-data.frame(condition=unique(corr.mc.r.pos$Condition),
+                  num.pairs=as.vector(table(factor(corr.mc.r.pos$Condition))),
+                  corr.corr,
+                  test="posGenes",
+                  file=cfile)
+
+  write.table(res, file=res_file,quote=FALSE,sep="\t",
+              append=TRUE,row.names=FALSE, col.names = FALSE)
+
+  #Correlation for genes with negative correlation
+  corr.mc.r.neg<-corr.mc.r[corr.mc.r$Correlation.blue<0,]
+  corr.corr<-sapply(unique(corr.mc.r.neg$Condition), function(tp)
+    cor(corr.mc.r.neg$Correlation.blue[corr.mc.r.neg$Condition==tp],
+        corr.mc.r.neg$Correlation[corr.mc.r.neg$Condition==tp],
+        method="pearson",use = "pairwise.complete.obs"))
+
+  res<-data.frame(condition=unique(corr.mc.r.neg$Condition),
+                  num.pairs=as.vector(table(factor(corr.mc.r.neg$Condition))),
+                  corr.corr,
+                  test="negGenes",
+                  file=cfile)
+
+  write.table(res, file=res_file,quote=FALSE,sep="\t",
+              append=TRUE,row.names=FALSE, col.names = FALSE)
+}
diff --git a/01_association_metrics/metacell/metacell_general_correlation_tp.R b/01_association_metrics/metacell/metacell_general_correlation_tp.R
new file mode 100644
index 0000000..028644e
--- /dev/null
+++ b/01_association_metrics/metacell/metacell_general_correlation_tp.R
@@ -0,0 +1,188 @@
+# ------------------------------------------------------------------------------
+# Calculate correlation per timepoint (and sample if stated) from the
+# metacells (original or leiden) for different
+# gene sets (split dependent on gene expression cutoff) for comparison with
+# BLUEPRINT (see corresponding files create_genesets.R,
+# single_cell_correlation_tp.R and eval_blueprint_genesets.R)
+# Input: Seurat object, file with selected genes
+# Output: files with correlation values (r-values and p-values)
+# ------------------------------------------------------------------------------
+
+library(Hmisc)
+library(optparse)
+
+#Command line interface
+option_list<-list(
+  make_option(c("-g","--selectedGenes"),
+              default="../../benchmark/celltypes/gene_expressed_over_hald_cells.txt",
+              help="path to list with selected genes"),
+  make_option(c("-m","--method"),
+              default="metacell",
+              help="method for metacell grouping (leiden[_SCT] or metacell)"),
+  make_option(c("-s","--perSample"),action="store_true",
+              default=FALSE,
+              help="Shall the evaluation be done for each sample separatly"),
+  make_option(c("-o","--outputFile"),
+              default="timepoint_monocytes",
+              help="Suffix of the output files")
+)
+
+opt_parser<-OptionParser(option_list=option_list)
+opt<-parse_args(opt_parser)
+
+#Unpack the parsed options into the variables used by the rest of the script
+pathSelectedGenes<-opt$selectedGenes
+type<-opt$method
+perSample<-opt$perSample
+outputSuffix<-opt$outputFile
+
+print(paste("Evaluating",type,"for gene set:"))
+print(pathSelectedGenes)
+
+print(paste("Evaluating each sample individually:", perSample))
+
+#Reject unknown grouping methods before touching the working directory
+if(!type %in% c("leiden","leiden_SCT","metacell")){
+  stop("Metacell method type not known!")
+}
+
+#Change into the method specific directory and pick the matching input files
+if(type=="metacell"){
+  setwd("metacell_general/metacell")
+
+  metacellDir<-"metacells_K20_minCells10"
+  setwd(metacellDir)
+
+  pseudobulkFile<-"pseudobulk_metacell.RDS"
+  annotationFile<-"annotations_metacell.tsv"
+} else {
+  #Both leiden variants are stored in the same directory
+  setwd("leiden_metacells/")
+  if(type=="leiden"){
+    pseudobulkFile<-"metacell_leiden.RDS"
+    annotationFile<-"annotations_mc_leiden_tp.tsv"
+  } else {
+    #Leiden clustering based on SCT counts
+    pseudobulkFile<-"metacell_leiden_SCT.RDS"
+    annotationFile<-"annotations_mc_leiden_SCT_tp.tsv"
+  }
+}
+
+#Read pseudobulk data frame
+metacell.allsamples<-readRDS(pseudobulkFile)
+
+##########################
+
+#Read annotation data frame; its second column holds the timepoint
+annotations.allsamples<-read.table(annotationFile)
+colnames(annotations.allsamples)[2]<-"timepoint"
+
+#Restrict the pseudobulk rows to the selected genes (same as for single cell)
+selected.genes<-read.table(pathSelectedGenes,header=FALSE)
+metacell.allsamples<-metacell.allsamples[selected.genes$V1,]
+
+#Result data frames (correlation and pvalues), filled step by step below
+corr.df<-NULL
+pval.df<-NULL
+
+correlationRes<-function(meta_counts,colName){
+
+ #Be careful: rcorr does not work with fewer than 5 samples
+ corr.mc<-rcorr(t(meta_counts), type="spearman")
+
+ #Create a pairwise data frame for the correlation
+ corr.pairs.mc<-as.data.frame(as.table(corr.mc$r),
+ stringsAsFactors = FALSE)
+ corr.pairs.mc<-corr.pairs.mc[corr.pairs.mc$Var14){
+
+ print(paste("Calculate correlation for timepoint",timepoint))
+
+ meta_counts<-metacell.allsamples[,mc.ids.timepoint]
+ tmp<-correlationRes(meta_counts,colName = paste0(timepoint,"-",sample))
+ corr.pairs.mc<-tmp[[1]]
+ corr.pairs.pval<-tmp[[2]]
+
+ #Concatenate the sample - timepoint pairs
+ if(is.null(corr.df)){
+ corr.df<-corr.pairs.mc
+ pval.df<-corr.pairs.pval
+ } else {
+ corr.df<-merge(corr.df,corr.pairs.mc,by=c("Gene1","Gene2"),
+ all=TRUE)
+ pval.df<-merge(pval.df,corr.pairs.pval,by=c("Gene1","Gene2"),
+ all=TRUE)
+ }
+
+ } else {
+ print(paste("Skip timepoint",timepoint,"(too less metacells)"))
+ }
+ }
+ }
+
+} else {
+
+ annot.sample<-annotations.allsamples # no per-sample split: use the annotations of all samples
+ for(timepoint in unique(annot.sample$timepoint)){
+ 
+ #Run the analysis only if at least 5 metacells exist for this timepoint
+ #TODO: probably increase this threshold later
+ mc.ids.timepoint<-annot.sample$metacell[annot.sample$timepoint==timepoint]
+ if(length(mc.ids.timepoint)>4){
+ 
+ print(paste("Calculate correlation for timepoint",timepoint))
+ 
+ meta_counts<-metacell.allsamples[,mc.ids.timepoint] # columns selected by metacell id
+ tmp<-correlationRes(meta_counts,colName = timepoint)
+ corr.pairs.mc<-tmp[[1]] # first element is stored as correlation table
+ corr.pairs.pval<-tmp[[2]] # second element is stored as p-value table
+ 
+ #Add this timepoint as a new column to the combined result tables
+ if(is.null(corr.df)){
+ corr.df<-corr.pairs.mc
+ pval.df<-corr.pairs.pval
+ } else {
+ corr.df<-merge(corr.df,corr.pairs.mc,by=c("Gene1","Gene2"),
+ all=TRUE) # all=TRUE keeps gene pairs present in only one of the tables
+ pval.df<-merge(pval.df,corr.pairs.pval,by=c("Gene1","Gene2"),
+ all=TRUE)
+ }
+ 
+ } else {
+ print(paste("Skip timepoint",timepoint,"(too less metacells)"))
+ }
+ }
+}
+
+#Save correlation values and p-values as tab-separated files named after the output suffix
+corr.file<-paste0("correlation_r_",outputSuffix,".tsv")
+pval.file<-paste0("correlation_pval_",outputSuffix,".tsv")
+write.table(corr.df,file=corr.file,sep="\t",quote=FALSE)
+write.table(pval.df,file=pval.file,sep="\t",quote=FALSE)
\ No newline at end of file
diff --git a/01_association_metrics/metacell/metacell_per_sample_original_algorithm.R b/01_association_metrics/metacell/metacell_per_sample_original_algorithm.R
new file mode 100644
index 0000000..e3d1b52
--- /dev/null
+++ b/01_association_metrics/metacell/metacell_per_sample_original_algorithm.R
@@ -0,0 +1,274 @@
+# ------------------------------------------------------------------------------
+# Metacell algorithm (original) run for each sample separately
+# with Oelen v3 dataset (Monocytes)
+# Take all 200 variable genes
+# (but removing for each sample the ones with too low coverage)
+#
+# Remarks:
+# * metacells uses a "data base", the processed files are not loaded
+# directly in the workspace
+# * in the function mcell_mc_from_coclust_balanced the parameters
+# K and min_mc_size can be used to change the "size" of meta cells,
+# but too small is not recommended
+#
+# ------------------------------------------------------------------------------
+
+library(metacell)
+library(SingleCellExperiment)
+library(ggplot2)
+library(optparse)
+
+#Parse arguments
+#The defaults are integer literals so that they match the declared option
+#type without relying on an implicit conversion of a string default.
+option_list = list(
+  make_option(c("-K","--coclustK"), default=20L,
+              help="Parameter K of mcell_mc_from_coclust_balanced",
+              type="integer"),
+  make_option(c("-m","--coclustMinMcSize"), default=10L,
+              help="Parameter min_mc_size of mcell_mc_from_coclust_balanced",
+              type="integer")
+)
+
+opt_parser = OptionParser(option_list=option_list)
+opt = parse_args(opt_parser)
+
+#Parameters to change the granularity of the samples
+mc_coclustK<-opt$coclustK
+mc_coclustMin_size<-opt$coclustMinMcSize
+
+#Stop early on values that could not be read as positive integers, otherwise
+#they would end up in the directory name and in the metacell call
+if(is.na(mc_coclustK) || mc_coclustK < 1 ||
+   is.na(mc_coclustMin_size) || mc_coclustMin_size < 1){
+  stop("coclustK and coclustMinMcSize need to be positive integers!")
+}
+
+print(paste("Running meta cells with following parameters (changing granularity):",
+            "K",mc_coclustK,"min_mc_size",mc_coclustMin_size))
+
+#Create directories to save the meta cells and the correlation results
+#(all following relative paths are resolved inside this directory)
+mainDir<-paste0("metacells_K",mc_coclustK,"_minCells",mc_coclustMin_size)
+if(!dir.exists(mainDir)){
+  dir.create(mainDir)
+}
+
+setwd(mainDir)
+
+##########################################################################
+# Important note: a lot of the parameters are managed via tgconfig
+# see: https://github.com/tanaylab/tgconfig
+
+# To check all set parameters
+#tgconfig::get_package_params('metacell')
+
+#Set number of cores (otherwise I get issues for very small data sets)
+tgconfig::set_param('mc_cores', 8, 'metacell')
+
+#Issues with downsampling matrix => set parameter for downsampling lower
+#(probably problem as matrix is too sparse)
+#tgconfig::set_param("scm_n_downsamp_gstat",300,'metacell')
+
+########################################################################
+
+#Option to create additional plots for visualization
+allPlots<-FALSE
+#Input format: "h5ad" or "seurat" (every other value stops the script below)
+fileType<-"seurat"
+
+#Load the h5ad object and convert it to a single cell object
+#(the input paths are relative to the working directory that was changed
+#with setwd(mainDir) above)
+if(fileType=="h5ad"){
+
+ library(reticulate) # to load h5ad object
+ library(zellkonverter) # to convert h5ad object to single cell object
+
+ sc<-import("scanpy")
+ adata<-sc$read("../../seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.SCT.h5ad")
+ #Filter for monocytes
+ adata<-adata[adata$obs$cell_type_lowerres=="monocyte"]$copy()
+ sce<-AnnData2SCE(adata)
+ rm(adata)
+
+ #Convert assay name to counts (before called X)
+ assayNames(sce)<-c("counts")
+
+ #Alternatively read a seurat object
+} else if (fileType=="seurat"){
+
+ library(Seurat)
+
+ seurat<-readRDS("../../seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.rds")
+
+ #Filter for monocytes
+ seurat<-seurat[,seurat$cell_type_lowerres=="monocyte"]
+
+ #Convert into a single cell object (taking the RNA assay)
+ sce <- as.SingleCellExperiment(seurat,assay="RNA")
+ rm(seurat)
+
+} else {
+ stop("File type not known!")
+}
+
+#Select all remaining samples
+samples<-as.character(unique(sce@colData$assignment))
+
+#Result objects that are filled sample by sample in the loop below:
+# * metacell.allsamples: gene x metacell matrix (normalized pseudobulk)
+# * annotations.allsamples: one row per metacell
+#   (majority timepoint, sample, number of cells)
+# * annotations.percell: one row per cell with its metacell id (0 = outlier)
+metacell.allsamples<-NULL
+annotations.allsamples<-NULL
+annotations.percell<-NULL
+
+#Calculate meta-cell algorithm for each sample separately
+#Also option to calculate correlation, but currently not done
+for(sample in samples){
+
+  print(paste("Processing sample:",sample))
+  sce.sample.full<-sce[,sce@colData$assignment == sample]
+
+  #Create data base directory (or remove the files of the previous sample)
+  if(!dir.exists("database")){
+    dir.create("database/")
+  } else {
+    file.remove(list.files("database/", full.names = TRUE))
+  }
+  scdb_init("database/", force_reinit=TRUE)
+
+  #Upload SCE object
+  #Filter for genes with at least 4 counts (preliminary before the real filtering downstream
+  #to reduce the calculation burden)
+  mat<-scm_import_sce_to_mat(sce.sample.full[rowSums(counts(sce.sample.full))>3,])
+  scdb_add_mat(sample, mat)
+
+  #Create a directory for figures
+  if(!dir.exists("figs")) dir.create("figs/")
+  scfigs_init("figs/")
+
+  #Create a gset for generating the knn graph
+  mcell_add_gene_stat(gstat_id="stat", mat_id=sample)
+  mcell_gset_filter_varmean(gset_id="sample_feat", gstat_id="stat", T_vm=0.08, force_new=TRUE)
+  #Sampled coverage of at least T_tot and threshold for the third highest UMI count > T_top3
+  mcell_gset_filter_cov(gset_id = "sample_feat", gstat_id="stat", T_tot=100, T_top3=2)
+  #Check generated gene set
+  gset<-scdb_gset("sample_feat")
+  print(paste("Number of selected genes:",length(gset@gene_set)))
+
+  #Create the knn graph based on correlation
+  mcell_add_cgraph_from_mat_bknn(mat_id=sample,
+                                 gset_id = "sample_feat",
+                                 graph_id="sample_graph",
+                                 K=50,
+                                 dsamp=TRUE)
+
+  #Resample cells from the graph to robustly define groups
+  mcell_coclust_from_graph_resamp(
+    coc_id="sample_coc500",
+    graph_id="sample_graph",
+    min_mc_size=20,
+    p_resamp=0.75,
+    n_resamp=500)
+
+  #Remark the size of the meta cells can be influenced by the parameters
+  #K and min_mc_size (for both is true: the smaller, the more cells ...)
+  mcell_mc_from_coclust_balanced(
+    coc_id="sample_coc500",
+    mat_id= sample,
+    mc_id= paste0(sample,"_mc"),
+    K=mc_coclustK,
+    min_mc_size=mc_coclustMin_size,
+    alpha=2)
+
+  #Plotting outlier (only possible for small groups)
+  if(allPlots){
+    mcell_plot_outlier_heatmap(mc_id=paste0(sample,"_mc"),
+                               mat_id = sample, T_lfc=3)
+  }
+
+  #Split and filter metacells using dbscan and outlier gene detection
+  mcell_mc_split_filt(new_mc_id=paste0(sample,"_mc_f"),
+                      mc_id=paste0(sample,"_mc"),
+                      mat_id=sample,
+                      T_lfc=3, plot_mats=FALSE)
+
+  ##Selecting marker genes automatically
+  mcell_gset_from_mc_markers(gset_id="sample_markers", mc_id=paste0(sample,"_mc_f"))
+  mc_colorize_default(paste0(sample,"_mc_f"))
+
+  #Creating a heatmap of genes and metacells
+  #(also not really well visible with too many cells)
+  if(allPlots){
+    mcell_mc_plot_marks(mc_id=paste0(sample,"_mc_f"), gset_id="sample_markers",
+                        mat_id=sample)
+  }
+
+  #Create graph layout
+  mcell_mc2d_force_knn(mc2d_id=paste0(sample,"_2dproj"),
+                       mc_id=paste0(sample,"_mc_f"), graph_id="sample_graph")
+  #Plotting also not really interesting for too large data sets
+  mcell_mc2d_plot(mc2d_id=paste0(sample,"_2dproj"))
+
+  #Save it again as a h5ad object to compare the results
+  #So far no direct exporting function found, therefore processing the object myself
+  #See https://tanaylab.github.io/metacell/reference/tgMCCov-class.html
+  sce_meta<-scdb_mc(paste0(sample,"_mc_f"))
+
+  #Meta cell annotations (outlier cells get the metacell id 0)
+  mc.annot<-data.frame(metaCell=sce_meta@mc)
+  mc.annot$cell<-rownames(mc.annot)
+  rownames(mc.annot)<-NULL
+  #rep() keeps both columns at the same length, so a sample without any
+  #outlier adds zero rows instead of stopping with a data.frame error
+  mc.annot<-rbind(mc.annot,
+                  data.frame(metaCell=rep(0,length(sce_meta@outliers)),
+                             cell=sce_meta@outliers))
+
+  annotations.percell<-rbind(annotations.percell,
+                             mc.annot)
+
+  #Check distributions between cell types and stimulation results
+  annotations<-sce.sample.full@colData
+  #annotations$cell<-rownames(annotations)
+  annotations<-merge(mc.annot,annotations,by.x="cell",by.y="bare_barcode_lane")
+
+  perMetacell<-as.data.frame(table(annotations$metaCell))
+
+  #Plot only timepoint for now
+  freqs<-as.data.frame(table(annotations$metaCell,
+                             annotations$timepoint))
+  freqs<-merge(freqs,perMetacell,by="Var1",suffixes=c(".spc",".mc"))
+  freqs$Fraction<-freqs$Freq.spc/freqs$Freq.mc
+
+  g<-ggplot(freqs,aes(x=as.factor(Var1),y=Fraction,fill=Var2))+
+    geom_bar(stat="identity")+
+    xlab("Meta cell (0=Outlier)")+
+    scale_fill_discrete(name = "Time point")+
+    ggtitle(paste("Cell number in total:",sum(freqs$Freq.spc)))
+  ggsave(g,filename=paste0("figs/barplot_time_ct_",sample,".png"))
+
+  #Create a pseudobulk object with the meta-cell annotation (without outliers)
+  mc.annot<-mc.annot[mc.annot$metaCell>0,]
+  sc.counts<-counts(sce.sample.full)[,mc.annot$cell]
+  #The grouping below is done by position, so the columns of the count
+  #matrix have to be in the same order as the annotation rows
+  stopifnot(all(colnames(sc.counts)==mc.annot$cell))
+
+  mc.annot$metaCell<-as.factor(paste0(sample,"_mc_",mc.annot$metaCell))
+  mc.pseudobulk<- t(apply(sc.counts, 1, tapply, mc.annot$metaCell,
+                          sum, na.rm=TRUE))
+
+  #Normalize to 10,000 per metacell
+  libSize<-colSums(mc.pseudobulk)
+  mc.pseudobulk<-t(t(mc.pseudobulk)/libSize*10000)
+
+  metacell.allsamples<-cbind(metacell.allsamples,mc.pseudobulk)
+
+  #Create a majority annotation for each metacell
+  annotations$metaCell<-paste0(sample,"_mc_",annotations$metaCell)
+  timepoint.mc<-sapply(colnames(mc.pseudobulk),
+                       function(id) names(which.max(table(
+                         annotations$timepoint[
+                           annotations$metaCell==id]))))
+
+  annot.mc<-data.frame(metacell=names(timepoint.mc),
+                       timepoint=timepoint.mc,
+                       sample=sample)
+
+  #Add how many cells were part of the meta-cell
+  perMetacell$Var1<-paste0(sample,"_mc_",perMetacell$Var1)
+  colnames(perMetacell)<-c("metacell","cell.count")
+  annot.mc<-merge(annot.mc,perMetacell,by="metacell")
+  annotations.allsamples<-rbind(annotations.allsamples,annot.mc)
+
+}
+
+#Store the annotation tables (per metacell and per cell) as tab-separated
+#files and the pseudobulk matrix as RDS object
+write.table(x=annotations.allsamples, sep="\t",
+            file="annotations_metacell.tsv")
+write.table(x=annotations.percell, sep="\t",
+            file="annotations_singlecell_metacell.tsv")
+saveRDS(object=metacell.allsamples, file="pseudobulk_metacell.RDS")
+
+#The metacell database is only a temporary working directory, remove it
+unlink(x="database", recursive=TRUE)
+
diff --git a/01_association_metrics/metacell/metacells_from_leiden.R b/01_association_metrics/metacell/metacells_from_leiden.R
new file mode 100644
index 0000000..bd9a811
--- /dev/null
+++ b/01_association_metrics/metacell/metacells_from_leiden.R
@@ -0,0 +1,117 @@
+# ------------------------------------------------------------------------------
+# Implement own method to generate metacells based on leiden clustering
+# Run leiden clustering separately for each donor (run on Oelen v3, Monocytes)
+# and group cells that are part of the same cluster into one metacell
+# ------------------------------------------------------------------------------
+
+library(Seurat)
+
+#Read the complete seurat object and switch to the SCT assay
+#(3000 most variable genes already identified)
+seurat <- readRDS(file="../../seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.rds")
+DefaultAssay(seurat) <- "SCT"
+
+#Keep only the monocytes
+seurat <- seurat[, seurat$cell_type_lowerres == "monocyte"]
+
+#Resolution used for the leiden clustering
+leidenRes <- 100
+print(paste("Leiden resolution:", leidenRes))
+
+#Count matrix used for the pseudobulk: choose RNA or SCT
+type <- "SCT"
+print(paste("Normalization:", type))
+
+#Objects collecting the results over all donors:
+#per cell annotation, majority annotation per metacell and metacell matrix
+annot_mc_all <- NULL
+annot_mc_major_all <- NULL
+metacellBulk_all <- NULL
+
+#Iterate over all samples
+#factor() drops donors without any remaining monocyte (unused factor levels
+#would give an empty subset) and also works if assignment is a character vector
+samples<-levels(factor(seurat$assignment))
+
+#Check the matrix type once, before the expensive clustering starts
+if(! type %in% c("RNA","SCT")){
+  stop(paste("Matrix type",type,"not known! Only RNA or SCT!"))
+}
+
+for(donor in samples){
+
+  print(paste("Processing donor:",donor))
+
+  #Filter for the donor
+  seurat_donor<-seurat[,seurat$assignment==donor]
+
+  #Calculate PCA
+  seurat_donor<-RunPCA(seurat_donor, verbose=FALSE)
+
+  #Generate kNN graph and leiden clustering
+  seurat_donor <- FindNeighbors(seurat_donor, dims = 1:20)
+  seurat_donor <- FindClusters(seurat_donor, resolution = leidenRes,
+                               algorithm = 4, #4=Leiden
+                               group.singletons=FALSE)
+  #don't assign all singletons to the nearest cluster
+
+  #Save metacell - cell annotation
+  annot_mc<-data.frame(cluster=Idents(seurat_donor),
+                       metacell=paste0("mc_",Idents(seurat_donor),"_",donor),
+                       sample=donor,
+                       cell=names(Idents(seurat_donor)),
+                       row.names=NULL)
+  annot_mc_all<-rbind(annot_mc_all,annot_mc)
+
+  #Create pseudobulk (mean count per gene and cluster)
+  #The grouping is done by position, so cells and annotation need the same order
+  stopifnot(all(colnames(seurat_donor)==annot_mc$cell))
+  donor_counts<-as.matrix(seurat_donor@assays[[type]]@counts)
+  metacellBulk <- t(apply(donor_counts, 1, tapply,
+                          as.factor(annot_mc$cluster),
+                          mean, na.rm=TRUE))
+
+  #Name the columns after the cluster labels (the column names returned by
+  #tapply), so that they always agree with annot_mc$metacell - also if the
+  #labels are not exactly 1..n (e.g. a "singleton" group)
+  colnames(metacellBulk)<-paste0("mc_",colnames(metacellBulk),"_",donor)
+  metacellBulk_all<-cbind(metacellBulk_all,metacellBulk)
+
+  #Get majority annotation
+  meta.data<-seurat_donor@meta.data
+  meta.data$cell<-rownames(meta.data)
+  meta.data<-merge(meta.data,annot_mc,
+                   by.x="cell",by.y="cell")
+
+  # Annotate each meta-cell to the most frequent condition
+  timepoint.mc<-sapply(colnames(metacellBulk),
+                       function(id) names(which.max(table(
+                         meta.data$timepoint[
+                           meta.data$metacell==id]))))
+
+  #Save majority annotation
+  annot_mc_major<-data.frame(metacell=names(timepoint.mc),
+                             condition=unlist(timepoint.mc),
+                             sample=donor,
+                             row.names=NULL)
+
+  annot_mc_major_all<-rbind(annot_mc_major_all,annot_mc_major)
+
+}
+
+
+
+#Choose the output file names depending on the matrix type
+#(the SCT files carry the leiden resolution in their name);
+#nothing is written for any other type
+outFiles<-switch(type,
+  RNA=list(perCell="annotations_metacell_leiden_perCell.tsv",
+           major="annotations_mc_leiden_tp.tsv",
+           bulk="metacell_leiden.RDS"),
+  SCT=list(perCell=paste0("annotations_metacell_leiden_SCT_perCell_",
+                          leidenRes,".tsv"),
+           major=paste0("annotations_mc_leiden_SCT_tp_",
+                        leidenRes,".tsv"),
+           bulk=paste0("metacell_leiden_SCT_",
+                       leidenRes,".RDS")),
+  NULL)
+
+if(!is.null(outFiles)){
+  #Per cell annotation and majority annotation per metacell
+  write.table(annot_mc_all,file=outFiles$perCell,sep="\t")
+  write.table(annot_mc_major_all,file=outFiles$major,sep="\t")
+
+  #Pseudobulk counts
+  saveRDS(metacellBulk_all, file=outFiles$bulk)
+}
+
+
diff --git a/01_association_metrics/metacell/plot_overview_metacell.R b/01_association_metrics/metacell/plot_overview_metacell.R
new file mode 100644
index 0000000..00c9ea1
--- /dev/null
+++ b/01_association_metrics/metacell/plot_overview_metacell.R
@@ -0,0 +1,180 @@
+# ------------------------------------------------------------------------------
+# Supplementary figure to show MetaCell overview
+# * Expression distribution of genes in a cell
+# * number of (meta)cells per sample
+# * comparison with Blueprint
+# ------------------------------------------------------------------------------
+
+library(Seurat)
+library(dplyr)
+library(ggplot2)
+library(ggpubr)
+
+theme_set(theme_bw())
+
+# ------------------------------------------------------------------------------
+# Expression distribution of genes in a cell
+# ------------------------------------------------------------------------------
+
+#Create the plotting table for one method: genes sorted by the fraction of
+#(meta)cells in which they are expressed
+exprFractionTable<-function(exprFraction,type){
+  data.frame(expr.perc=sort(exprFraction),
+             position=seq_along(exprFraction),
+             Type=type,
+             stringsAsFactors = FALSE)
+}
+
+#Select the metacell columns that are annotated with the given condition.
+#The columns are matched by metacell name instead of by position, so the
+#result does not depend on the row order of the annotation file.
+#as.character() is required as read.table can return the names as factor.
+selectMetacells<-function(mc.matrix,annot,conditionColumn,condition="UT"){
+  ids<-as.character(annot$metacell[which(annot[[conditionColumn]]==condition)])
+  unknown<-setdiff(ids,colnames(mc.matrix))
+  if(length(unknown)>0){
+    stop(paste("Annotated metacells missing in the pseudobulk matrix:",
+               paste(head(unknown),collapse=", ")))
+  }
+  mc.matrix[,ids,drop=FALSE]
+}
+
+#Load the single cell object and get expressed genes
+seurat<-readRDS("seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.rds")
+
+#Filter for monocytes
+seurat<-seurat[,seurat$cell_type_lowerres=="monocyte" &
+                 seurat$timepoint=="UT"]
+
+#Matrix::rowSums counts the non-zero entries without converting the
+#count matrix into a dense matrix first
+exprGenes.singleCell<-Matrix::rowSums(seurat@assays$SCT@counts>0)/ncol(seurat)
+
+print(paste("Number of genes expressed in at least 50% of cells:",
+            sum(exprGenes.singleCell>=0.5)))
+
+exprGene.df<-exprFractionTable(exprGenes.singleCell,"SingleCell")
+
+#Metacell
+metacell.allsamples<-readRDS("metacell_general/metacell/metacells_K20_minCells10/pseudobulk_metacell.RDS")
+#Keep only UT metacells
+meta_annot<-read.table("metacell_general/metacell/metacells_K20_minCells10/annotations_metacell.tsv")
+metacell.allsamples<-selectMetacells(metacell.allsamples,meta_annot,"timepoint")
+exprGenes.metacell<-rowSums(metacell.allsamples>0)/ncol(metacell.allsamples)
+
+print(paste("Number of genes expressed in at least 50% of cells:",
+            sum(exprGenes.metacell>=0.5)))
+
+exprGene.df<-rbind(exprGene.df,
+                   exprFractionTable(exprGenes.metacell,"MetaCell"))
+
+#Leiden
+metacell.allsamples<-readRDS("metacell_general/leiden_metacells/metacell_leiden_SCT.RDS")
+#Keep only UT metacells (the leiden annotation calls the column "condition")
+meta_annot<-read.table("metacell_general/leiden_metacells/annotations_mc_leiden_SCT_tp.tsv")
+metacell.allsamples<-selectMetacells(metacell.allsamples,meta_annot,"condition")
+exprGenes.metacell<-rowSums(metacell.allsamples>0)/ncol(metacell.allsamples)
+
+print(paste("Number of genes expressed in at least 50% of cells:",
+            sum(exprGenes.metacell>=0.5)))
+
+exprGene.df<-rbind(exprGene.df,
+                   exprFractionTable(exprGenes.metacell,"Leiden"))
+
+
+#Panel a): one point per gene and method (the legend is hidden in this panel)
+g.1<-ggplot(exprGene.df,aes(x=position,y=expr.perc,color=Type))+geom_point()+
+  xlab("Gene index")+ylab("Expressed in x% of the cells")+
+  scale_color_discrete("Method")+
+  theme(axis.title = element_text(size=14),
+        axis.text=element_text(size=13),
+        legend.position = "none")
+
+# ------------------------------------------------------------------------------
+# Number of (meta)cells per sample
+# ------------------------------------------------------------------------------
+
+#Table collecting the number of (meta)cells per sample and condition for all
+#methods; its column names are set by the first table that is added (Leiden)
+counts_all_mc<-NULL
+
+#Load metacell annotation leiden
+mc_method<-read.table("metacell_general/leiden_metacells/annotations_mc_leiden_SCT_tp.tsv",
+ stringsAsFactors = FALSE)
+
+#Count the metacells per sample and condition
+mc_method<-mc_method%>%
+ group_by(sample,condition)%>%
+ summarize(counts=n())
+
+mc_method$method<-"Leiden"
+counts_all_mc<-rbind(counts_all_mc,mc_method)
+
+#Load metacell annotation
+mc_method<-read.table("metacell_general/metacell/metacells_K20_minCells10/annotations_metacell.tsv",
+ stringsAsFactors=FALSE)
+mc_method<-mc_method%>%
+ group_by(sample,timepoint)%>%
+ summarize(counts=n())
+
+mc_method$method<-"MetaCell"
+#Rename the columns by position to the names of the Leiden table
+#(timepoint becomes condition), so that the tables can be combined
+colnames(mc_method)<-colnames(counts_all_mc)
+counts_all_mc<-rbind(counts_all_mc,mc_method)
+
+#Get number of single cells per sample and condition
+#(the seurat object was filtered for UT monocytes above)
+sc_annot<-seurat@meta.data
+sc_annot<-sc_annot%>%
+ group_by(assignment,timepoint)%>%
+ summarize(counts=n())
+sc_annot$method<-"SingleCell"
+#Same positional renaming (assignment becomes sample, timepoint becomes condition)
+colnames(sc_annot)<-colnames(counts_all_mc)
+counts_all_mc<-rbind(counts_all_mc,sc_annot)
+
+#Filter to show only UT cells
+counts_all_mc<-counts_all_mc[counts_all_mc$condition=="UT",]
+
+#Create plot (boxplot per method with log scaled y axis, legend hidden)
+g.2<-ggplot(counts_all_mc,aes(x=method,y=counts,fill=method))+
+ geom_boxplot()+
+ ylab("Number of (meta)cells per sample")+
+ xlab("Method")+
+ scale_y_log10()+
+ scale_fill_discrete("Method")+
+ theme(legend.position = "none",
+ axis.title = element_text(size=14),
+ axis.text = element_text(size=13),
+ legend.title = element_text(size=13),
+ legend.text= element_text(size=13))
+
+# ------------------------------------------------------------------------------
+# BLUEPRINT comparison
+# ------------------------------------------------------------------------------
+
+#Read the evaluation results: per condition, single cell and SCT based
+res <- read.table(file="metacell_general/eval_allmethods/perCondition_eval.tsv",
+                  header=TRUE)
+res2 <- read.table(file="metacell_general/eval_allmethods/singleCell_eval.tsv",
+                   header=TRUE)
+res3 <- read.table(file="metacell_general/eval_allmethods/sc_leiden_SCT_eval.tsv",
+                   header=TRUE)
+
+#Derive the method from the file name
+isLeiden <- grepl("leiden",res$File)
+isMetaCellaR <- grepl("MetaCellar",res$File)
+res$method <- ifelse(isLeiden,"leiden",
+                     ifelse(isMetaCellaR,"MetaCellaR","metacell"))
+res2$method <- "singleCell"
+isLeidenSCT <- grepl("leiden",res3$File)
+res3$method <- ifelse(isLeidenSCT,"leiden_SCT","singleCell_SCT")
+
+#Combine the three tables, the single ones are not needed afterwards
+res <- rbind(res,res2,res3)
+rm(res2,res3)
+
+#Extract the cutoff from the file name and format it as percentage label
+cutoffMatch <- stringi::stri_match(res$File,regex="cutoff(.*?)(_|\\.)")
+res$cutoff <- as.numeric(cutoffMatch[,2])
+res$cutoff <- paste0(as.character(res$cutoff*10),"%")
+
+#Keep only the UT results evaluated on allGenes
+keepRows <- res$Condition=="UT" & res$Test == "allGenes"
+res <- res[keepRows,]
+
+#Show only the SCT results (also used later, no MetaCellaR) and
+#replace the internal method names by the labels shown in the plot
+shownMethods <- c(leiden_SCT="Leiden",
+                  metacell="MetaCell",
+                  singleCell_SCT="SingleCell")
+res <- res[res$method %in% names(shownMethods),]
+res$method <- shownMethods[res$method]
+
+#Panel c): correlation with BLUEPRINT per expression cutoff and method
+textSmall <- element_text(size=13)
+g.3 <- ggplot(res,aes(x=cutoff,y=Corr_corr,fill=method))+
+  geom_bar(stat="identity",position="dodge")+
+  ylab("Correlation with BLUEPRINT")+
+  xlab("Genes stratified by x% expression in single cell")+
+  scale_fill_discrete("Method")+
+  theme(legend.position = "bottom",
+        axis.title = element_text(size=14),
+        axis.text = textSmall,
+        legend.title = textSmall,
+        legend.text = textSmall)
+
+#Arrange panel b) and c) next to each other below panel a)
+g.bottom <- ggarrange(g.2,g.3,ncol=2,widths=c(0.4,0.6),
+                      common.legend = TRUE,legend="bottom",
+                      labels=c("b)","c)"))
+g <- ggarrange(g.1,g.bottom,ncol=1,
+               labels=c("a)",""))
+
+#Save the figure as pdf and as png
+ggsave(filename="metacell_general/plots/metacell_overview_suppfigure.pdf",
+       plot=g,width=9,height=9)
+ggsave(filename="metacell_general/plots/metacell_overview_suppfigure.png",
+       plot=g,width=9,height=9)
+
diff --git a/01_association_metrics/metacell/single_cell_correlation_tp.R b/01_association_metrics/metacell/single_cell_correlation_tp.R
new file mode 100644
index 0000000..6d40e2c
--- /dev/null
+++ b/01_association_metrics/metacell/single_cell_correlation_tp.R
@@ -0,0 +1,157 @@
+# ------------------------------------------------------------------------------
+# Calculate correlation per timepoint (and sample if stated) from the
+# original single cell dataset (Oelen v3 dataset, Monocytes) for different
+# gene sets (split dependent on gene expression cutoff) for comparison with
+# metacells (see corresponding files create_genesets.R,
+# metacell_general_correlation_tp.R and eval_blueprint_genesets.R)
+# Input: Seurat object, file with selected genes
+# Output: files with correlation values (r-values and p-values)
+# ------------------------------------------------------------------------------
+
+library(Seurat)
+library(Hmisc) #for fast calculation of correlation
+library(optparse)
+
+#Command line options: gene list, per sample switch, matrix type, output suffix
+option_list <- list(
+  make_option(c("-g","--selectedGenes"),
+              default="benchmark/celltypes/gene_expressed_over_hald_cells.txt",
+              help="path to list with selected genes"),
+  make_option(c("-s","--perSample"),
+              action="store_true",
+              default=FALSE,
+              help="Shall the evaluation be done for each sample separatly"),
+  make_option(c("-t","--type"),
+              default="RNA",
+              help="Use either RNA count matrix (RNA) or SCT count matrix (SCT)."),
+  make_option(c("-o","--outputFile"),
+              default="timepoint_monocytes",
+              help="Suffix of the output files")
+)
+
+opt_parser <- OptionParser(option_list=option_list)
+opt <- parse_args(opt_parser)
+
+#Copy the parsed options into separate variables
+pathSelectedGenes <- opt$selectedGenes
+perSample <- opt$perSample
+matrixType <- opt$type
+outputSuffix <- opt$outputFile
+
+#Report the chosen settings
+print("Evaluating single cell data for gene set:")
+print(pathSelectedGenes)
+
+print(paste("Evaluating each sample individually:", perSample))
+
+#Read the complete seurat object and keep only the monocytes
+seurat <- readRDS(file="seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.rds")
+seurat <- seurat[, seurat$cell_type_lowerres == "monocyte"]
+
+#Restrict the object to the genes listed in the selected genes file
+#(first column of the file, no header line)
+selected.genes <- read.table(file=pathSelectedGenes, header=FALSE)
+seurat <- seurat[selected.genes$V1, ]
+
+#Result tables, filled later (correlation and pvalues)
+corr.df <- NULL
+pval.df <- NULL
+
+correlationRes<-function(meta_counts,colName){
+
+ #Be carefull: rcorr does not work with less than 5 samples
+ corr.mc<-rcorr(t(meta_counts), type="spearman")
+ #corr.mc<-cor(t(meta_counts), method="spearman")
+
+ #Create a pairwise data frame for the correlation
+ corr.pairs.mc<-as.data.frame(as.table(corr.mc$r),
+ stringsAsFactors = FALSE)
+ corr.pairs.mc<-corr.pairs.mc[corr.pairs.mc$Var10% and <5% of cells)
+# and 50 very highly expressed genes (expressed in >95% of the cells)
+# Input: Seurat object of Oelen v3 dataset (UT monocytes)
+# Output: scatterplot for comparison
+# ------------------------------------------------------------------------------
+
+library(Seurat)
+library(propr)
+library(Matrix)
+library(ggplot2)
+
+theme_set(theme_bw())
+
+#Load complete seurat object
+seurat<-readRDS("seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.rds")
+
+#Filter for monocytes and UT timepoint
+seurat<-seurat[,seurat$cell_type_lowerres == "monocyte"]
+seurat<-seurat[,seurat$timepoint == "UT"]
+
+# Get non-zero-ratio of each gene
+nozeroratio<-rowMeans(seurat$RNA@data>0)
+
+#Cells in rows and genes in columns
+full_count_matrix<-t(as.matrix(seurat$RNA@data))
+
+#Select 50 very lowly expressed genes and 50 very highly expressed genes
+#NOTE(review): the header of this script mentions >95% for the highly
+#expressed genes while the threshold used here is 0.9 - confirm which one
+#is intended
+set.seed(1)
+low_genes<-sample(names(nozeroratio)[nozeroratio > 0 & nozeroratio < 0.05],50)
+high_genes<-sample(names(nozeroratio)[nozeroratio > 0.9],50)
+
+#Calculate rho values
+res<-propr::perb(full_count_matrix,
+                 select=c(low_genes,high_genes))@matrix
+
+#Long format with one row per gene pair (each unordered pair only once)
+propr<-reshape2::melt(res)
+propr$Var1<-as.character(propr$Var1)
+propr$Var2<-as.character(propr$Var2)
+propr<-propr[propr$Var1 < propr$Var2,]
+
+#Compare with spearman values
+spearman<-cor(full_count_matrix[,c(low_genes,high_genes)],method="spearman")
+spearman<-reshape2::melt(spearman)
+spearman$Var1<-as.character(spearman$Var1)
+spearman$Var2<-as.character(spearman$Var2)
+spearman<-spearman[spearman$Var1 < spearman$Var2,]
+
+#Combine both into one plot
+#Match the values by gene pair instead of by row position, as the gene order
+#of the two matrices is not guaranteed to be the same; stop if a pair is
+#missing instead of silently combining values of different pairs
+pair_key<-function(df) paste(df$Var1,df$Var2,sep="\t")
+pair_index<-match(pair_key(propr),pair_key(spearman))
+stopifnot(!anyNA(pair_index))
+
+propr$corr<-spearman$value[pair_index]
+
+#Classify each pair: both genes lowly expressed, both highly expressed or mixed
+propr$type<-ifelse(propr$Var1 %in% low_genes,
+                   ifelse(propr$Var2 %in% high_genes,"mixed","both_low"),
+                   ifelse(propr$Var2 %in% high_genes,"both_high","mixed"))
+
+g<-ggplot(propr,aes(x=corr,y=value,color=type))+
+  geom_point(alpha=0.5)+
+  xlab("Spearman correlation")+
+  ylab("Rho proportionality")+
+  xlim(-0.2,1)+ylim(-0.2,1)+
+  scale_color_discrete("Expression gene pair")+
+  geom_abline()
+ggsave(g,file="test_rho.pdf")
diff --git a/01_association_metrics/scorpius_and_slingshot_clean.R b/01_association_metrics/scorpius_and_slingshot_clean.R
new file mode 100644
index 0000000..6ca2390
--- /dev/null
+++ b/01_association_metrics/scorpius_and_slingshot_clean.R
@@ -0,0 +1,143 @@
+require(Seurat)
+require(slingshot)
+library(tradeSeq)
+library(RColorBrewer)
+library(SingleCellExperiment)
+
+# Load the complete object and the list of DE genes (first column of the file)
+# NOTE(review): the variable name `all` masks base::all() for the rest of the script
+all <- readRDS('1M_v3_mediumQC_sct_celltyped_minimized_rnascaled.rds')
+degenes <- read.table('degenes_monocyteUTX3hCA.txt')$V1
+# Keep the 'mono 1' cells, then the timepoints UT, X3hCA and X24hCA
+mono1 <- subset(x = all, subset = cell_type == 'mono 1')
+mono1Ca <- subset(mono1, subset = (timepoint == 'UT') | (timepoint == 'X3hCA') | (timepoint == 'X24hCA'))
+library(Matrix)
+# Export the SCT 'data' slot (Matrix Market format), the cell meta data and the gene names
+writeMM(GetAssayData(mono1Ca, assay='SCT', slot='data'),
+ "mono1Ca_allgenes.mtx")
+write.table(as.matrix(mono1Ca[[]]), 'mono1Ca_allgenes.meta.csv', sep=",")
+write.table(as.matrix(rownames(mono1Ca)), 'mono1Ca_allgenes.genes.txt')
+# Same timepoint filter as above, additionally restricted to the DE genes
+mono1Ca_de3h <- subset(mono1, subset = (timepoint == 'UT') | (timepoint == 'X3hCA') | (timepoint == 'X24hCA'),
+ features = degenes)
+# also select DE genes
+
+# ggplot() is called below, but ggplot2 is not attached by any of the
+# packages loaded at the top of this script
+library(ggplot2)
+
+# plot
+# Every pdf device opened in this section is closed again with dev.off()
+# directly after its plots, otherwise later plots end up in the wrong file.
+# The plots are printed explicitly, so they are also drawn if the script
+# is run with source().
+pdf("pca_umap_sling_mono1CA_degenesUTX3h.pdf")
+mono1Ca_de3h <- RunPCA(mono1Ca_de3h, npcs=10)
+mono1Ca_de3h <- FindNeighbors(mono1Ca_de3h, verbose = FALSE, dims = 1:10)
+mono1Ca_de3h <- FindClusters(mono1Ca_de3h, pc=1:10, algorithm = 2, random.seed = 256, resolution = 0.8)
+mono1Ca_de3h <- RunUMAP(mono1Ca_de3h, dims = 1:10, reduction = "pca")
+
+print(DimPlot(mono1Ca_de3h, reduction = "pca",
+              group.by = "timepoint", pt.size = 0.5, label = TRUE, repel = TRUE))
+print(ElbowPlot(mono1Ca_de3h, ndims=10))
+print(DimPlot(mono1Ca_de3h, reduction = 'umap',
+              group.by = "lane", pt.size = 0.5, label = TRUE, repel = TRUE))
+print(DimPlot(mono1Ca_de3h, pt.size = 0.5, reduction = "umap",
+              group.by = "timepoint", label = TRUE))
+print(DimPlot(mono1Ca_de3h, pt.size = 0.5, reduction = "umap",
+              group.by = "SCT_snn_res.0.8", label = TRUE))
+dev.off()
+
+# slingshot
+mono1sling <- slingshot(Embeddings(mono1Ca_de3h, "umap"), clusterLabels = mono1Ca_de3h$SCT_snn_res.0.8,
+                        start.clus = 0, stretch = 0)
+saveRDS(mono1Ca_de3h, 'mono1Ca_degenes.Rda')
+saveRDS(mono1sling, 'mono1sling_degenes.Rda')
+# load the expression data
+mono1Ca_degenes <- readRDS('mono1Ca_degenes.Rda')
+# load the slingshot
+mono1sling <- readRDS('mono1sling_degenes.Rda')
+
+# choose the number of knots
+pdf("evaluateK_chooseKnots.pdf")
+mono1ca_matrix <- as.matrix(GetAssayData(mono1Ca_de3h, slot='data'))
+icMat <- evaluateK(counts = mono1ca_matrix,
+                   sds = mono1sling, k = 3:10,
+                   nGenes = 200, verbose = TRUE)
+dev.off()
+
+# histogram of the pseudotime per timepoint, one page for each curve
+# (loops over all pseudotime columns instead of hard-coding curve1..curve4)
+pdf("slingshot_pseudotime.pdf")
+pseudotime <- slingPseudotime(mono1sling)
+
+ggplot_frame <- data.frame(pseudotime)
+curve_names <- colnames(ggplot_frame)
+ggplot_frame$timepoint <- mono1Ca_degenes[[]]$timepoint
+for (curve in curve_names) {
+  # print() is required, ggplot objects are not drawn automatically in a loop
+  print(ggplot(ggplot_frame, aes(x=.data[[curve]], color=timepoint)) +
+          geom_histogram(fill="white", alpha=0.5, position="identity") +
+          xlab(curve))
+}
+dev.off()
+
+#cellWeights <- slingCurveWeights(mono1sling)
+#sce <- fitGAM(counts = GetAssayData(mono1Ca, slot='data'),
+#              pseudotime = pseudotime, cellWeights = cellWeights,
+library(viridis)
+# cells colored by pseudotime, one panel per lineage
+pdf('slingshot_cells_in_different_linearges.pdf')
+nc <- 2
+nms <- colnames(pseudotime)
+nr <- ceiling(length(nms)/nc)
+pal <- viridis(100, end = 0.95)
+par(mfrow = c(nr, nc))
+for (i in nms) {
+  colors <- pal[cut(pseudotime[,i], breaks = 100)]
+  plot(reducedDim(mono1sling), col = colors, pch = 16, cex = 0.5, main = i)
+  lines(mono1sling, lwd = 2, col = 'black', type = 'lineages')
+}
+
+dev.off()
+
+
+library(SCORPIUS)
+library(ggplot2) # ggplot() is used for the histogram below
+mono1Ca_degenes <- readRDS('mono1Ca_degenes.Rda')
+# cells in rows, genes in columns
+expression <- t(as.matrix(GetAssayData(mono1Ca_degenes, slot='data')))
+group_name <- factor(as.character(mono1Ca_degenes[[]]$timepoint))
+# try with PCA
+#pdf('scorpius_pca.pdf')
+#pearson_space <- reduce_dimensionality(expression, "pearson")
+#pearson_traj <- infer_trajectory(pearson_space)
+#draw_trajectory_plot(pearson_space, group_name, pearson_traj$path, contour = TRUE)
+#dev.off()
+
+space <- reduce_dimensionality(expression, "spearman")
+traj <- infer_trajectory(space)
+saveRDS(space, 'scorpius_space.rds')
+saveRDS(traj, 'scorpius_traj.rds')
+# save traj#time in tsv
+write.table(as.matrix(traj$time), file='scorpius_trajtime.tsv', sep = '\t')
+write.table(as.matrix(traj$path), file='scorpius_trajpath.tsv', sep = '\t')
+
+# load scorpius results
+space <- readRDS('scorpius_space.rds')
+traj <- readRDS('scorpius_traj.rds')
+
+# Draw the trajectory plot and the pseudotime histogram into their own file.
+# Without an explicitly opened device they end up in whatever device is
+# still open (or in the default Rplots.pdf).
+pdf('SCORPIUS_plots.pdf')
+print(draw_trajectory_plot(space, group_name, traj$path, contour = TRUE))
+histogram_data <- data.frame(time = unlist(traj$time, use.names = FALSE),
+                             row.names = names(traj$time))
+histogram_data$timepoint <- mono1Ca_degenes[[]]$timepoint
+
+print(ggplot(histogram_data, aes(x=time, color=timepoint)) +
+        geom_histogram(fill="white", alpha=0.5, position="identity"))
+dev.off()
+
+# draw_trajectory_heatmap(space, traj$time, progression_group=group_name)
+gimp <- gene_importances(
+  expression,
+  traj$time,
+  num_permutations = 10,
+  num_threads = 8,
+  ntree = 10000,
+  ntree_perm = 1000
+)
+saveRDS(gimp, 'scorpius_gimp.rds')
+# keep the genes with a BH adjusted p-value below 0.05
+gimp$qvalue <- p.adjust(gimp$pvalue, "BH", length(gimp$pvalue))
+gene_sel <- gimp$gene[gimp$qvalue < .05]
+expr_sel <- scale_quantile(expression[,gene_sel])
+
+modules <- extract_modules(scale_quantile(expr_sel), traj$time, verbose = TRUE) # needs more RAM than 50G
+
+# The device is opened only directly before drawing, so it is not left open
+# if one of the long running steps above fails
+pdf('scorpius_heatmap.pdf')
+draw_trajectory_heatmap(expr_sel, traj$time, group_name, modules)
+dev.off()
+
diff --git a/01_association_metrics/scvelo_analysis_dm.py b/01_association_metrics/scvelo_analysis_dm.py
new file mode 100644
index 0000000..4cddacd
--- /dev/null
+++ b/01_association_metrics/scvelo_analysis_dm.py
@@ -0,0 +1,100 @@
+"""
+RNA velocity analysis using the dynamic model
+run on all samples of Oelen v3 dataset for classical monocytes (mono 1, mono 2)
+and filtered for the 2000 most variable genes
+
+Input: loom files generated from velocyto
+Output: h5ad object with RNA velocity estimates
+"""
+
+import scvelo as scv
+import pandas as pd
+import os
+
+scv.logging.print_version()
+scv.settings.verbosity = 3 # show errors(0), warnings(1), info(2), hints(3)
+scv.settings.set_figure_params('scvelo') # for beautified visualization
+
+#Load annotation file with UMAP coordinates
+fpath="annotations/umap_monocytes.tsv"
+umap_coords=pd.read_csv(fpath, sep='\t')
+
+#Load data for each processed lane
+lanes=os.listdir("velocyto")
+ldata_array=[]
+for lane in lanes:
+
+ print(lane)
+
+ #Get loom file for each lane (file name unfortnuately not always the same)
+ files=os.listdir("velocyto/"+lane)
+ file=[f for f in files if f.endswith(".loom")]
+ lfile="velocyto/"+lane+"/"+file[0]
+
+ #Read file
+ ldata = scv.read(lfile, cache=True)
+
+ #Filter monocytes from file
+ filteredNames=[barcodeName.split(":")[1] for barcodeName in ldata.obs.index]
+ filteredNames=[barcodeName.replace("x","")+"_"+lane for barcodeName in filteredNames]
+ ldata.obs.index=filteredNames
+ umap_coords_filtered=umap_coords[umap_coords["Unnamed: 0"].isin(filteredNames)]
+
+ #Filter for monocyotes (barcode in umap file)
+ ldata=ldata[umap_coords_filtered["Unnamed: 0"],:].copy()
+
+ #Make variable names unique
+ ldata.var_names_make_unique()
+ #Add ldata object
+ ldata_array.append(ldata)
+
+ldata_filtered=ldata_array[0].concatenate(ldata_array[1:], batch_key='lane',
+ batch_categories=lanes,index_unique=None)
+
+#Delete variables which are not required anymore
+del ldata
+del ldata_array
+
+#Add information about cell types and time points (more annotations are available)
+annotations=pd.read_csv("seurat_object_meta.tsv", sep='\t')
+annotations=annotations.loc[ldata_filtered.obs.index.tolist(),:]
+ldata_filtered.obs["timepoint"]=annotations["timepoint"]
+ldata_filtered.obs["celltype"]=annotations["cell_type"]
+
+#Filter for only monocytes 1 and 2
+ldata_filtered=ldata_filtered[ldata_filtered.obs.celltype.isin(["mono 1","mono 2"])].copy()
+
+#RNA velocity analysis
+scv.utils.show_proportions(ldata_filtered)
+#Filter genes with less than 20 counts (spliced + unspliced) and
+#reduce to the top 2000 highly variable genes
+scv.pp.filter_and_normalize(ldata_filtered, min_shared_counts=20, n_top_genes=2000)
+scv.pp.moments(ldata_filtered)
+
+#Run dynamic model
+scv.tl.recover_dynamics(ldata_filtered)
+scv.tl.velocity(ldata_filtered, mode='dynamical')
+scv.tl.velocity_graph(ldata_filtered)
+
+#Add UMAP coordinates
+umap_coords.index=umap_coords["Unnamed: 0"]
+umap_coords=umap_coords.loc[ldata_filtered.obs.index.tolist(),:]
+ldata_filtered.obsm["X_umap"]=umap_coords[["umap_1","umap_2"]].to_numpy()
+
+#Save file
+ldata_filtered.write("h5ad_objects/scveloAnalysis_dynamic_velocity_womono34.h5ad")
+
+#Create plot with embedding
+scv.pl.velocity_embedding_stream(ldata_filtered, basis='umap', color=['timepoint', 'celltype'],
+ show=False,save="embedding_dynamic_monocytes_womono34.png")
+
+scv.pl.velocity_graph(ldata_filtered,color="timepoint",
+ show=False,save="velocityGraph_dynamic_monocytes_womono34.png")
+
+#Calculate pseudotime
+scv.tl.latent_time(ldata_filtered)
+scv.pl.scatter(ldata_filtered, color='latent_time', cmap='gnuplot',
+ show=False,save="latenttime_dynamic_monocytes_womono34.png")
+
+#Save file
+ldata_filtered.write("h5ad_objects/scveloAnalysis_dynamic_velocity_latenttime_womono34.h5ad")
diff --git a/01_association_metrics/setting_files_for_grnboost2/config_bp_mono.yaml b/01_association_metrics/setting_files_for_grnboost2/config_bp_mono.yaml
new file mode 100644
index 0000000..1fcae52
--- /dev/null
+++ b/01_association_metrics/setting_files_for_grnboost2/config_bp_mono.yaml
@@ -0,0 +1,45 @@
+# Input Settings: initialize base input folder names,
+# dataset collections, and algorithms to run over
+input_settings:
+
+ # Base input directory
+ input_dir : "inputs"
+
+ # Subdirectory of inputs that datasets are placed in
+ dataset_dir: "example"
+
+ # Denotes a list of datasets, each with the following parameters:
+ # name: Name of the dataset. May be used in logging or other
+ # messages written during execution
+ #
+ # exprData: scRNA-Seq expression data file. Cells are along the
+ # columns and genes are along the rows.
+ # cellData: a file containing pseudotime ordering, or any other
+ # information about cells.
+ # trueEdges: Name of the reference network file in the
+ # edge list format. Needed for evaluation.
+ datasets:
+ - name: "compare_grnboost2_bp_mono"
+ exprData: "bp_Expression.csv"
+ cellData: "bp_timepoint.fake.csv"
+ trueEdges: "selected_mono1data.string.csv"
+
+ # Denotes a list of algorithms to run. Each has the following parameters:
+ # name: Name of the algorithm. Must be recognized by the pipeline, see
+ # code for acceptable values
+ #
+ # should_run: whether or not to run the algorithm
+ #
+ # params: any additional, algorithm-specific parameters
+ # should be specified in the params map for a given algorithm
+ #
+ algorithms:
+ - name: "GRNBOOST2"
+ params:
+ should_run: [True]
+# Output Settings: initialize base output folder names
+output_settings:
+
+ # Base output directory
+ output_dir: "outputs"
+ output_prefix: "compare_grnboost2_bp_mono"
diff --git a/01_association_metrics/setting_files_for_grnboost2/config_sc_mono.yaml b/01_association_metrics/setting_files_for_grnboost2/config_sc_mono.yaml
new file mode 100644
index 0000000..a3f61cf
--- /dev/null
+++ b/01_association_metrics/setting_files_for_grnboost2/config_sc_mono.yaml
@@ -0,0 +1,45 @@
+# Input Settings: initialize base input folder names,
+# dataset collections, and algorithms to run over
+input_settings:
+
+ # Base input directory
+ input_dir : "inputs"
+
+ # Subdirectory of inputs that datasets are placed in
+ dataset_dir: "example"
+
+ # Denotes a list of datasets, each with the following parameters:
+ # name: Name of the dataset. May be used in logging or other
+ # messages written during execution
+ #
+ # exprData: scRNA-Seq expression data file. Cells are along the
+ # columns and genes are along the rows.
+ # cellData: a file containing pseudotime ordering, or any other
+ # information about cells.
+ # trueEdges: Name of the reference network file in the
+ # edge list format. Needed for evaluation.
+ datasets:
+ - name: "compare_grnboost2_sc_mono"
+ exprData: "sc_Expression.csv"
+ cellData: "sc_timepoint.fake.csv"
+ trueEdges: "selected_mono1data.string.csv"
+
+ # Denotes a list of algorithms to run. Each has the following parameters:
+ # name: Name of the algorithm. Must be recognized by the pipeline, see
+ # code for acceptable values
+ #
+ # should_run: whether or not to run the algorithm
+ #
+ # params: any additional, algorithm-specific parameters
+ # should be specified in the params map for a given algorithm
+ #
+ algorithms:
+ - name: "GRNBOOST2"
+ params:
+ should_run: [True]
+# Output Settings: initialize base output folder names
+output_settings:
+
+ # Base output directory
+ output_dir: "outputs"
+ output_prefix: "compare_grnboost2_sc_mono"
diff --git a/02_correlation_evaluation/README.md b/02_correlation_evaluation/README.md
new file mode 100644
index 0000000..b4706fb
--- /dev/null
+++ b/02_correlation_evaluation/README.md
@@ -0,0 +1,35 @@
+# 02_correlation_evaluation
+
+*blueprint_normalize.sh*: normalize the BLUEPRINT dataset and regress out the first PC
+
+*blueprint_correlation.py*: calculate the co-expression for gene pairs in BLUEPRINT data
+
+*compare_blueprint_cutoffs_CD4T.py* : Compare correlation between Blueprint and single cell (Oelen v3 dataset) for different expression thresholds (number of cells expressing the gene), implemented for UT and CD4+ T cells here
+
+*compare_immunexut_cutoffs_CD4T.py*: Same approach as in *compare_blueprint_cutoffs_CD4T.py*, but comparing correlation between ImmuNexUT and Oelen v3 dataset instead
+
+*correlation_between_datasets.R*: check Pearson correlation between data sets (for CD4+ T cells) for single cell vs single cell dataset comparison, single cell vs bulk dataset comparison and bulk vs bulk dataset comparison, afterwards combines all results in one large heatmap
+
+*correlation_between_datasets_extended.R*: check if the correlation values between matched cell types for single cell and bulk (ImmuNexUT) are higher than for not-matched cell types
+
+*correlation_between_datasets_othercts.R*: extension of *correlation_between_datasets.R* that includes all cell types (not only CD4+ T cells)
+
+*correlation_timepoint_combined_indivs_1mio.py*: calculate the co-expression for genes that are expressed in more than 50% cells in Oelen v2 and v3 dataset
+
+*correlation_timepoint_combined_indivs_ng.py*: calculate the co-expression for genes that are expressed in more than 50% cells in van der Wijst dataset
+
+*correlation_timepoint_combined_indivs_stemiv2.py*: calculate the co-expression for genes that are expressed in more than 50% cells in van Blokland v2 dataset
+
+*correlation_timepoint_combined_indivs_stemiv3.py*: calculate the co-expression for genes that are expressed in more than 50% cells in van Blokland v3 dataset
+
+*figure2_barplot_cutoffs.R*: create barplots from the results of *compare_blueprint_cutoffs_CD4T.py* and *compare_immunexut_cutoffs_CD4T.py*
+
+*figure2_scatterplots.R*: creates inset plots for Main Figure 2 (a,b,d), showing scatterplots of gene pair-wise Spearman correlation values between two data sets for a) Oelen v3 dataset vs van Blokland v2 dataset (both CD4+ T cells), b) ImmuNexUT - van Blokland v2 (naive CD4+ T cells and CD4+ T cells) and c) Blueprint - ImmuNexUT (both naive CD4+ T cells)
+
+*normalize_ImmuNexUT.R*: preprocessing of ImmuNexUT data (separately for each cell type with a matching single-cell cell type) following the description in the corresponding publication (filtering lowly expressed genes, TMM normalization and batch correction), followed by correlation calculation for all genes expressed in 50% of the cells of the Oelen v3 dataset (for comparison with single cell data)
+
+*wilcoxon_test_crispr.R*: Benchmark our correlation results from single cell (Oelen v3, CD4+ T cells) and bulk (ImmuNexUT, naive CD4+ T cells) with a public CRISPR perturbation dataset using Wilcoxon Rank Sum Test
+
+*wilcoxon_test_string.R*: Compare if correlated pairs from single cell (Oelen v3, CD4+ T cells) and bulk (ImmuNexUT, naive CD4+ T cells) are enriched in STRING database (Using the same strategy as in CRISPR validation with Wilcoxon Rank Sum Test)
+
+
diff --git a/02_correlation_evaluation/blueprint_correlation.py b/02_correlation_evaluation/blueprint_correlation.py
new file mode 100644
index 0000000..9e8c5e4
--- /dev/null
+++ b/02_correlation_evaluation/blueprint_correlation.py
@@ -0,0 +1,48 @@
+import pandas as pd
+import numpy as np
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+from scipy.stats import spearmanr, pearsonr
+import scanpy as sc
+import seaborn as sns
+import matplotlib.pyplot as plt
+from tqdm import tqdm
+# %matplotlib inline
+
+# %%bash
+# export HDF5_USE_FILE_LOCKING='FALSE'
+
+def read_numpy(fileprefix, rowname='rows', colname='cols'):
+ data = np.load(fileprefix+'.npy')
+ rows = [item.strip() for item in open(fileprefix+f'.{rowname}.txt', 'r').readlines()]
+ cols = [item.strip() for item in open(fileprefix+f'.{colname}.txt', 'r').readlines()]
+ return pd.DataFrame(data=data,
+ index=rows,
+ columns=cols)
+
+def get_pairwise_correlations(corr_df):
+ corrmatrix = corr_df.corr()
+ triuindices = np.triu_indices(corrmatrix.shape[0], k=1)
+ return corrmatrix.values[triuindices]
+
+blueprint_mappings = pd.read_csv('../blueprint/blueprint_mappings.txt',
+ sep='\t', index_col=0)['Gene name'].T.to_dict()
+data = pd.read_csv('../blueprint/mono_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.1PCAsOverSamplesRemoved.txt.gz',
+ sep='\t', index_col=0, compression='gzip')
+
+data.index = [item.split('.')[0] for item in data.index]
+data['genename'] = [blueprint_mappings.get(ids) for ids in data.index]
+print(data.shape)
+data = data.dropna(subset=['genename']).drop_duplicates(subset=['genename'])
+data = data.set_index('genename')
+
+data.head()
+coefs, ps = spearmanr(data, axis=1)
+print(coefs.shape)
+np.save('mono_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.1PCAsOverSamplesRemoved.spearmanr.npy',
+ coefs)
+np.save('mono_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.1PCAsOverSamplesRemoved.spearmanrPvalues.npy',
+ ps)
+with open('mono_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.1PCAsOverSamplesRemoved.spearmanr.genes.txt',
+ 'w') as f:
+ f.write('\n'.join(data.index.values))
\ No newline at end of file
diff --git a/02_correlation_evaluation/blueprint_normalize.sh b/02_correlation_evaluation/blueprint_normalize.sh
new file mode 100644
index 0000000..1c295e2
--- /dev/null
+++ b/02_correlation_evaluation/blueprint_normalize.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+#SBATCH --time=7:00:00
+#SBATCH --cpus-per-task=10
+#SBATCH --mem=30gb
+#SBATCH --nodes=1
+#SBATCH --open-mode=append
+#SBATCH --export=NONE
+#SBATCH --get-user-env=L
+
+module purge
+module load Java
+
+jar_file=eqtl-mapping-pipeline-1.4.9-SNAPSHOT/eqtl-mapping-pipeline.jar
+traitfile=./blueprint/tcel_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.txt.gz
+outdir=./blueprint
+logFile=./blueprint/blueprint_cd4t_adjustPCA.log
+java -Xmx30g -Xms30g -jar ${jar_file} \
+--mode normalize \
+--in ${traitfile} \
+--out ${outdir} \
+--adjustPCA \
+--maxnrpcaremoved 3 \
+--stepsizepcaremoval 1 | tee ${logFile}
+
+jar_file=eqtl-mapping-pipeline-1.4.9-SNAPSHOT/eqtl-mapping-pipeline.jar
+traitfile=./blueprint/mono_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.txt.gz
+outdir=./blueprint
+logFile=./blueprint/blueprint_normalize.log
+java -Xmx30g -Xms30g -jar ${jar_file} \
+--mode normalize \
+--in ${traitfile} \
+--out ${outdir} \
+--adjustPCA \
+--maxnrpcaremoved 3 \
+--stepsizepcaremoval 1 | tee ${logFile}
\ No newline at end of file
diff --git a/02_correlation_evaluation/compare_blueprint_cutoffs_CD4T.py b/02_correlation_evaluation/compare_blueprint_cutoffs_CD4T.py
new file mode 100644
index 0000000..381d0dd
--- /dev/null
+++ b/02_correlation_evaluation/compare_blueprint_cutoffs_CD4T.py
@@ -0,0 +1,108 @@
+# ---------------------------------------------------------------------------------------
+# Compare correlation between Blueprint and single cell (Oelen v3 dataset)
+# for different thresholds (number of cells expressing the gene),
+# implemented for UT and CD4T cells here
+# Input: seurat objects with Oelen v3 dataset and precalculated Blueprint correlation
+# for all possible gene pairs
+# Output: csv file with the correlation between Blueprint and Oelen v3 for each threshold
+# ---------------------------------------------------------------------------------------
+
+from scipy.stats import spearmanr, pearsonr
+import scanpy as sc
+import numpy as np
+import pandas as pd
+from pathlib import Path
+from time import time
+import os
+import re
+
+# load scanpy object (Oelen v3 dataset)
+alldata = sc.read_h5ad('seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.SCT.h5ad')
+
+# filter for CD4+ T cells and UT cells
+alldata = alldata[alldata.obs.cell_type_lowerres=='CD4T']
+alldata = alldata[alldata.obs.timepoint=='UT'].copy() #copy to not create only a view object
+
+celltype_data = pd.DataFrame(data=alldata.X.toarray(),
+ index=alldata.obs.index,
+ columns=alldata.var.index)
+
+# load Blueprint object
+bp_corr = np.load('blueprint_data/tcel_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.spearmanR.npy',mmap_mode="r")
+
+bp_corr_genes = []
+f= open('blueprint_data/tcel_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.spearmanR.cols.txt','r')
+for line in f.readlines():
+ bp_corr_genes.append(line.rstrip())
+
+# method to select genes above a certain nonzero ratio
+def select_gene_nonzeroratio(df, ratio):
+ nonzerocounts = np.count_nonzero(df.values, axis=0)/df.shape[0]
+ selected_genes = df.columns[nonzerocounts>ratio]
+ return selected_genes
+
+# generate a set of thresholds that should be tested (start with stricter thresholds)
+thresholds = [i/10 for i in range(1,10)]
+thresholds.reverse()
+
+f_out = open("co-expression_indivs_combined/blueprint_cutoff_eval_CD4T.txt", "w")
+f_out.write("threshold,ngenes,corr_pearson\n")
+
+# iterate over all thresholds
+for th in thresholds:
+
+ #select all genes within the threshold
+ selected_genes = select_gene_nonzeroratio(celltype_data, th)
+
+ # filter genes that are not in Blueprint
+ selected_genes = list(set(selected_genes) & set(bp_corr_genes))
+
+ print(f"Number of selected genes for {th}: {len(selected_genes)}")
+
+ gene_pairs = []
+ for i,gene1 in enumerate(selected_genes):
+ for j in range(i+1, len(selected_genes)):
+ if gene1 < selected_genes[j]:
+ gene_pairs.append(';'.join([gene1, selected_genes[j]]))
+ else:
+ gene_pairs.append(';'.join([selected_genes[j],gene1]))
+
+ # calculate correlation single cell
+ input_df = celltype_data[selected_genes]
+ input_data = spearmanr(input_df, axis=0)[0]
+ input_data_uppertria = input_data[np.triu_indices_from(input_data, 1)]
+
+ corrs_df = pd.DataFrame({'UT': input_data_uppertria},
+ index=gene_pairs)
+
+ # filter blueprint and order it the same way as the single cell object
+ filter_bp_genes = [gene in selected_genes for gene in bp_corr_genes]
+ bp_corr_filtered = bp_corr[filter_bp_genes][:,filter_bp_genes]
+ bp_uppertria = bp_corr_filtered[np.triu_indices_from(bp_corr_filtered, 1)]
+
+ # get genes from the blueprint object
+ bp_corr_genes_filtered = [gene for gene in bp_corr_genes if gene in selected_genes]
+ gene_pairs_bp=[]
+ for i,gene1 in enumerate(bp_corr_genes_filtered):
+ for j in range(i+1, len(bp_corr_genes_filtered)):
+ if gene1 < bp_corr_genes_filtered[j]:
+ gene_pairs_bp.append(';'.join([gene1, bp_corr_genes_filtered[j]]))
+ else:
+ gene_pairs_bp.append(';'.join([bp_corr_genes_filtered[j],gene1]))
+
+ corrs_df_bp = pd.DataFrame({'BP': bp_uppertria},
+ index=gene_pairs_bp)
+
+ # sort both and combine them
+ corrs_df = corrs_df.sort_index()
+ corrs_df_bp = corrs_df_bp.sort_index()
+ #all(corrs_df.index == corrs_df_bp.index)
+
+ # calculate correlation between datasets and save results
+ corr_data = pearsonr(corrs_df.UT, corrs_df_bp.BP)[0]
+
+ # save results
+ f_out.write(f"{th},{len(selected_genes)},{corr_data}\n")
+
+# close file
+f.close()
diff --git a/02_correlation_evaluation/compare_immunexut_cutoffs_CD4T.py b/02_correlation_evaluation/compare_immunexut_cutoffs_CD4T.py
new file mode 100644
index 0000000..6ffde6b
--- /dev/null
+++ b/02_correlation_evaluation/compare_immunexut_cutoffs_CD4T.py
@@ -0,0 +1,93 @@
+# ---------------------------------------------------------------------------------------
+# Compare correlation between ImmuNexUT and single cell (Oelen v3 dataset)
+# for different thresholds (number of cells expressing the gene),
+# implemented for UT and CD4T cells here
+# Input: seurat objects with Oelen v3 dataset and normalized ImmuNexUT counts
+# Output: csv file with the correlation between ImmuNexUT and Oelen v3 for each threshold
+# ---------------------------------------------------------------------------------------
+
+from scipy.stats import spearmanr, pearsonr
+import scanpy as sc
+import numpy as np
+import pandas as pd
+from pathlib import Path
+from time import time
+import os
+import re
+
+# load scanpy object (Oelen v3 dataset)
+alldata = sc.read_h5ad('seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.SCT.h5ad')
+
+# filter for CD4+ T cells and UT cells
+alldata = alldata[alldata.obs.cell_type_lowerres=='CD4T']
+alldata = alldata[alldata.obs.timepoint=='UT'].copy() #copy to not create only a view object
+
+celltype_data = pd.DataFrame(data=alldata.X.toarray(),
+ index=alldata.obs.index,
+ columns=alldata.var.index)
+
+# load ImmuNexuT object
+counts = pd.read_csv('imd_paper_rna_data/norm_count/Naive_CD4_norm_count.txt',sep="\t")
+immunexut_genes = counts.index.values
+counts = counts.transpose()
+
+# method to select genes above a certain nonzero ratio
+def select_gene_nonzeroratio(df, ratio):
+ nonzerocounts = np.count_nonzero(df.values, axis=0)/df.shape[0]
+ selected_genes = df.columns[nonzerocounts>ratio]
+ return selected_genes
+
+# generate a set of thresholds that should be tested (start with stricter thresholds)
+thresholds = [i/10 for i in range(1,10)]
+thresholds.reverse()
+
+f_out = open("co-expression_indivs_combined/immunexut_cutoff_eval_CD4T.txt", "w")
+f_out.write("threshold,ngenes,corr_pearson\n")
+
+# iterate over all thresholds
+for th in thresholds:
+
+ #select all genes within the threshold
+ selected_genes = select_gene_nonzeroratio(celltype_data, th)
+
+ # filter genes that are not in ImmuNexUT
+ selected_genes = list(set(selected_genes) & set(immunexut_genes))
+
+ print(f"Number of selected genes for {th}: {len(selected_genes)}")
+
+ gene_pairs = []
+ for i,gene1 in enumerate(selected_genes):
+ for j in range(i+1, len(selected_genes)):
+ if gene1 < selected_genes[j]:
+ gene_pairs.append(';'.join([gene1, selected_genes[j]]))
+ else:
+ gene_pairs.append(';'.join([selected_genes[j],gene1]))
+
+ # calculate correlation single cell
+ input_df = celltype_data[selected_genes]
+ input_data = spearmanr(input_df, axis=0)[0]
+ input_data_uppertria = input_data[np.triu_indices_from(input_data, 1)]
+
+ corrs_df = pd.DataFrame({'UT': input_data_uppertria},
+ index=gene_pairs)
+
+ # calculate correlation ImmuNexUT
+ input_df_ImmuNexUT = counts[selected_genes]
+ input_data = spearmanr(input_df_ImmuNexUT, axis=0)[0]
+ input_data_uppertria = input_data[np.triu_indices_from(input_data, 1)]
+
+ corrs_df_ImmuNexUT = pd.DataFrame({'BULK': input_data_uppertria},
+ index=gene_pairs)
+
+ # sorting both is not necessary here
+ #all(corrs_df.index == corrs_df_ImmuNexUT.index)
+
+ # calculate correlation between datasets and save results
+ corr_data = pearsonr(corrs_df.UT, corrs_df_ImmuNexUT.BULK)[0]
+
+ # save results
+ f_out.write(f"{th},{len(selected_genes)},{corr_data}\n")
+
+# close file
+f_out.close()
+
diff --git a/02_correlation_evaluation/correlation_between_datasets.R b/02_correlation_evaluation/correlation_between_datasets.R
new file mode 100644
index 0000000..72ecdfd
--- /dev/null
+++ b/02_correlation_evaluation/correlation_between_datasets.R
@@ -0,0 +1,341 @@
+# ------------------------------------------------------------------------------
+# Check Pearson correlation between data sets (for CD4+ T cells)
+# * for single cell vs single cell data set
+# * for single cell vs bulk data set
+# * for bulk vs bulk data set
+# Combine all results in one large heatmap
+# -----------------------------------------------------------------------------
+
+library(data.table)
+library(reticulate) # to read the single cell data (numpy)
+library(ggplot2)
+library(viridis)
+library(ggpubr)
+
+np <- import("numpy")
+
+theme_set(theme_bw())
+
+cell_type<-"CD4T"
+
+#Path to different single cell dataset
+datasets<-c(mio_v3="co-expression_indivs_combined/",
+ mio_v2="co-expression_indivs_combined/one_million_version2/",
+ stemi_v2="co-expression_indivs_combined/stemi/version2/",
+ stemi_v3="co-expression_indivs_combined/stemi/version3/",
+ pilot="co-expression_indivs_combined/ng_updated_version/")
+
+#File endings for different single cell datasets
+file_suffixes<-c(mio_v3="_UT_correlation.csv",
+ mio_v2="_UT_correlation.csv",
+ stemi_v2="_t8w_correlation.csv",
+ stemi_v3="_t8w_correlation.csv",
+ pilot="_correlation.csv")
+
+#Name on plots for different single cell datasets
+dataset_names<-c(mio_v3="Oelen (v3)",
+ mio_v2="Oelen (v2)",
+ stemi_v2="van Blokland (v2)",
+ stemi_v3="van Blokland (v3)",
+ pilot="van der Wijst")
+
+bulk_datasets<-c("Blueprint","BIOS","ImmuNexUT")
+
+resort<-function(corr){
+  #Normalise the "gene1;gene2" pair keys in column V1 so that the
+  #alphabetically smaller gene always comes first
+  first_gene<-gsub(";.*","",corr$V1)
+  second_gene<-gsub(".*;","",corr$V1)
+  
+  #Keys that are already ordered stay untouched, the others are swapped
+  swapped<-paste0(second_gene,";",first_gene)
+  corr$V1<-ifelse(first_gene < second_gene,corr$V1,swapped)
+  
+  #Return the table with the normalised keys
+  return(corr)
+}
+
+################################################################################
+# Compare single cell with each other
+################################################################################
+
+corr_comp<-NULL
+for(c1 in 1:(length(datasets)-1)){
+
+ #Read correlation file one
+ dataset_name1<-dataset_names[c1]
+ corr_c1<-fread(paste0(datasets[c1],cell_type,
+ "/",cell_type,file_suffixes[c1]))
+ corr_c1<-resort(corr_c1)
+
+ #Unique genes
+ num_genes<-length(union(gsub(";.*","",corr_c1$V1),
+ gsub(".*;","",corr_c1$V1)))
+
+ corr_comp<-rbind(corr_comp,
+ data.frame(c1=dataset_name1,
+ c2=dataset_name1,
+ gene_pairs=nrow(corr_c1),
+ genes_unique=num_genes,
+ corr=1))
+
+ for(c2 in (c1+1):length(datasets)){
+
+ #Read correlation file two
+ dataset_name2<-dataset_names[c2]
+ corr_c2<-fread(paste0(datasets[c2],cell_type,"/",
+ cell_type,file_suffixes[c2]))
+ corr_c2<-resort(corr_c2)
+
+ corr<-merge(corr_c1,corr_c2,by=c("V1"))
+
+ #Unique genes
+ num_genes<-length(union(gsub(";.*","",corr$V1),
+ gsub(".*;","",corr$V1)))
+
+ corr_comp<-rbind(corr_comp,
+ data.frame(c1=dataset_name1,
+ c2=dataset_name2,
+ gene_pairs=nrow(corr),
+ genes_unique=num_genes,
+ corr=cor(corr[[2]],corr[[3]],method="pearson")))
+ }
+}
+
+#Read correlation file one
+c1<-length(datasets)
+dataset_name1<-dataset_names[c1]
+corr_c1<-fread(paste0(datasets[c1],cell_type,
+ "/",cell_type,file_suffixes[c1]))
+
+#Unique genes
+num_genes<-length(union(gsub(";.*","",corr_c1$V1),
+ gsub(".*;","",corr_c1$V1)))
+
+corr_comp<-rbind(corr_comp,
+ data.frame(c1=dataset_name1,
+ c2=dataset_name1,
+ genes_unique=num_genes,
+ gene_pairs=nrow(corr_c1),
+ corr=1))
+
+corr_comp$c1<-factor(corr_comp$c1,levels=dataset_names)
+corr_comp$c2<-factor(corr_comp$c2,levels=dataset_names)
+
+# Save correlations
+write.table(corr_comp,
+ file="co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_datasets.tsv",
+ sep="\t",row.names = FALSE,quote=FALSE)
+
+################################################################################
+# Compare single cell with bulk
+################################################################################
+
+
+#Special function to read bulk data as they are not all saved in the same file type
+read_bulk_data<-function(dataset_name){
+  #Returns the gene pair correlations of one bulk data set, key "gene1;gene2" in column V1
+  if(dataset_name=="Blueprint"){
+    path<-"blueprint_data/tcel_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.spearmanR."
+    rowname_suffix<-"rows.txt"
+    colname_suffix<-"cols.txt"
+    
+    corr_c1 <- np$load(paste0(path,"npy"), mmap_mode="r")
+    row_names<-fread(paste0(path,rowname_suffix),header=FALSE)
+    rownames(corr_c1)<-row_names$V1
+    col_names<-fread(paste0(path,colname_suffix),header=FALSE)
+    colnames(corr_c1)<-col_names$V1
+    rm(row_names,col_names)
+    
+    #Filter for single cell data
+    ct_single_cell<-"CD4T"
+    corr_sc<-fread(paste0("co-expression_indivs_combined/",ct_single_cell,"/",
+                          ct_single_cell,"_UT_correlation.csv"))
+    corr_sc$gene1<-gsub(";.*","",corr_sc$V1)
+    corr_sc$gene2<-gsub(".*;","",corr_sc$V1)
+    sc_genes<-union(corr_sc$gene1,corr_sc$gene2)
+    sc_genes<-sc_genes[sc_genes %in% colnames(corr_c1)]
+    
+    corr_c1<-corr_c1[sc_genes,sc_genes]
+    corr_c1<-reshape2::melt(corr_c1)
+    corr_c1$Var1<-as.character(corr_c1$Var1)
+    corr_c1$Var2<-as.character(corr_c1$Var2)
+    colnames(corr_c1)[1:3]<-c("gene1","gene2","corr")
+    
+    #Keep each pair once, with gene1 always the one first in alphabet (this also
+    #drops the diagonal); the melted matrix contains both A;B and B;A
+    corr_c1<-corr_c1[corr_c1$gene1<corr_c1$gene2,]
+    corr_c1$V1<-paste0(corr_c1$gene1,";",corr_c1$gene2)
+    corr_c1$gene1<-NULL
+    corr_c1$gene2<-NULL
+    corr_c1<-corr_c1[,c("V1","corr")]
+    
+  } else if (dataset_name=="BIOS"){
+    corr_c1<-fread("bios/bios_correlation_tcellfiltered.tsv")
+    #all(corr_c1$gene1 < corr_c1$gene2)
+    corr_c1$V1<-paste0(corr_c1$gene1,";",corr_c1$gene2)
+    corr_c1$gene1<-NULL
+    corr_c1$gene2<-NULL
+  } else { #ImmuNexUT
+    corr_c1<-fread("imd_paper_rna_data/correlation/Naive_CD4_correlation.txt")
+    #all(corr_c1$gene1 < corr_c1$gene2)
+    corr_c1$V1<-paste0(corr_c1$gene1,";",corr_c1$gene2)
+    corr_c1$gene1<-NULL
+    corr_c1$gene2<-NULL
+  }
+  
+  return(corr_c1)
+}
+
+corr_comp<-NULL
+for(c1 in 1:length(bulk_datasets)){
+
+ dataset_name1<-bulk_datasets[c1]
+ corr_c1<-read_bulk_data(dataset_name1)
+
+ for(c2 in 1:length(datasets)){
+
+ #Read correlation file two
+ dataset_name2<-dataset_names[c2]
+ corr_c2<-fread(paste0(datasets[c2],cell_type,"/",
+ cell_type,file_suffixes[c2]))
+ corr_c2<-resort(corr_c2)
+
+ corr<-merge(corr_c1,corr_c2,by=c("V1"))
+
+ #Unique genes
+ num_genes<-length(union(gsub(";.*","",corr$V1),
+ gsub(".*;","",corr$V1)))
+
+ corr_comp<-rbind(corr_comp,
+ data.frame(c1=dataset_name1,
+ c2=dataset_name2,
+ gene_pairs=nrow(corr),
+ genes_unique=num_genes,
+ corr=cor(corr[[2]],corr[[3]],method="pearson")))
+ }
+}
+
+# Save correlations
+write.table(corr_comp,
+ file="co-expression_indivs_combined/dataset_comp_summary/correlation_singlevsbulk_datasets.tsv",
+ sep="\t",row.names = FALSE,quote=FALSE)
+
+################################################################################
+# Compare bulk with bulk
+################################################################################
+
+corr_comp<-NULL
+for(c1 in 1:(length(bulk_datasets)-1)){
+
+ #Read correlation file one
+ dataset_name1<-bulk_datasets[c1]
+ corr_c1<-read_bulk_data(dataset_name1)
+
+ #Unique genes
+ num_genes<-length(union(gsub(";.*","",corr_c1$V1),
+ gsub(".*;","",corr_c1$V1)))
+
+ corr_comp<-rbind(corr_comp,
+ data.frame(c1=dataset_name1,
+ c2=dataset_name1,
+ gene_pairs=nrow(corr_c1),
+ genes_unique=num_genes,
+ corr=1))
+
+ for(c2 in (c1+1):length(bulk_datasets)){
+
+ #Read correlation file two
+ dataset_name2<-bulk_datasets[c2]
+ corr_c2<-read_bulk_data(dataset_name2)
+
+ corr<-merge(corr_c1,corr_c2,by=c("V1"))
+
+ #Unique genes
+ num_genes<-length(union(gsub(";.*","",corr$V1),
+ gsub(".*;","",corr$V1)))
+
+ corr_comp<-rbind(corr_comp,
+ data.frame(c1=dataset_name1,
+ c2=dataset_name2,
+ gene_pairs=nrow(corr),
+ genes_unique=num_genes,
+ corr=cor(corr[[2]],corr[[3]],method="pearson")))
+ }
+}
+
+#Read correlation file one
+c1<-length(bulk_datasets)
+dataset_name1<-bulk_datasets[c1]
+corr_c1<-read_bulk_data(dataset_name1)
+
+#Unique genes
+num_genes<-length(union(gsub(";.*","",corr_c1$V1),
+ gsub(".*;","",corr_c1$V1)))
+
+corr_comp<-rbind(corr_comp,
+ data.frame(c1=dataset_name1,
+ c2=dataset_name1,
+ gene_pairs=nrow(corr_c1),
+ genes_unique=num_genes,
+ corr=1))
+
+# Save correlations
+write.table(corr_comp,
+ file="co-expression_indivs_combined/dataset_comp_summary/correlation_bulk_datasets.tsv",
+ sep="\t",row.names = FALSE,quote=FALSE)
+
+
+################################################################################
+# Combine all results in one large plot
+################################################################################
+
+corr_comp<-fread("co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_datasets.tsv")
+corr_comp$c1<-factor(corr_comp$c1,levels=dataset_names)
+corr_comp$c2<-factor(corr_comp$c2,levels=dataset_names)
+g.1<-ggplot(corr_comp,aes(x=c1,y=c2,fill=corr))+
+ geom_tile()+
+ geom_text(aes(label=paste0(round(corr,3),"\n(",genes_unique,")")),size=3)+
+ xlab("Single cell data set")+
+ ylab("Single cell data set")+
+ scale_fill_viridis("Correlation",limits=c(0,1))+
+ scale_y_discrete(labels=c("Oelen (v3)","Oelen (v2)","van Blokland\n(v2)",
+ "van Blokland\n(v3)","van der Wijst"))+
+ scale_x_discrete(labels=c("Oelen\n(v3)","Oelen\n(v2)","van\nBlokland\n(v2)",
+ "van\nBlokland\n(v3)","van der\nWijst"))
+
+corr_comp<-fread("co-expression_indivs_combined/dataset_comp_summary/correlation_singlevsbulk_datasets.tsv")
+#The file is written above with the bulk data set in c1 and the single cell data set in c2
+corr_comp$c1<-factor(corr_comp$c1,levels=bulk_datasets,
+                     labels=sub("Blueprint","BLUEPRINT",bulk_datasets,fixed=TRUE))
+corr_comp$c2<-factor(corr_comp$c2,levels=dataset_names)
+g.2<-ggplot(corr_comp,aes(x=c1,y=c2,fill=corr))+
+  geom_tile()+
+  geom_text(aes(label=paste0(round(corr,3),"\n(",genes_unique,")")),size=3,color="white")+
+  xlab("Bulk data set")+
+  ylab("Single cell data set")+
+  scale_fill_viridis("Correlation",limits=c(0,1))+
+  scale_y_discrete(labels=c("Oelen (v3)","Oelen (v2)","van Blokland\n(v2)",
+                            "van Blokland\n(v3)","van der Wijst"))
+
+corr_comp<-fread("co-expression_indivs_combined/dataset_comp_summary/correlation_bulk_datasets.tsv")
+#Show "Blueprint" as "BLUEPRINT" via the factor labels ("BLUEPRINT" is not a level of bulk_datasets)
+bulk_labels<-sub("Blueprint","BLUEPRINT",bulk_datasets,fixed=TRUE)
+corr_comp$c1<-factor(corr_comp$c1,levels=bulk_datasets,labels=bulk_labels)
+corr_comp$c2<-factor(corr_comp$c2,levels=bulk_datasets,labels=bulk_labels)
+g.3<-ggplot(corr_comp,aes(x=c1,y=c2,fill=corr))+
+  geom_tile()+
+  geom_text(aes(label=paste0(round(corr,3),"\n(",genes_unique,")"),
+                color=ifelse(corr<0.6,'white','black')),size=3)+
+  scale_color_manual(values=c("black","white"))+
+  xlab("Bulk data set")+
+  ylab("Bulk data set")+
+  scale_fill_viridis("Correlation",limits=c(0,1))+
+  coord_flip()
+
+
+g_empty<-ggplot()+theme_void()
+g<-ggarrange(g.1,g.2,g_empty,g.3,ncol=2,nrow=2,widths=c(4,3),heights=c(4,3),
+ common.legend = TRUE,legend="bottom",align="hv")
+ggsave(g,file=paste0("co-expression_indivs_combined/plots/corr_datasets_combined.pdf"),
+ width=6.5,height=6.5)
diff --git a/02_correlation_evaluation/correlation_between_datasets_extended.R b/02_correlation_evaluation/correlation_between_datasets_extended.R
new file mode 100644
index 0000000..fd52dd6
--- /dev/null
+++ b/02_correlation_evaluation/correlation_between_datasets_extended.R
@@ -0,0 +1,188 @@
+###############################################################################
+# In order to better interpret the correlation levels:
+# check if correlation between single cell and bulk (ImmuNexUT) is higher for matched
+# cell types compared to not matched cell types
+###############################################################################
+
+library(data.table)
+library(reticulate) # to read the single cell data (numpy)
+library(ggplot2)
+library(viridis)
+library(dplyr)
+
+theme_set(theme_bw())
+
+# Display name of each cell type, keyed by the short label used in file names
+ct_fullname <- c(CD8T = "CD8+ T cells", monocyte = "monocytes", NK = "NK cells",
+                 B = "B cells", DC = "DC")
+
+# Directory with the correlation files of each single cell data set
+datasets <- c(mio_v3   = "co-expression_indivs_combined/",
+              mio_v2   = "co-expression_indivs_combined/one_million_version2/",
+              stemi_v2 = "co-expression_indivs_combined/stemi/version2/",
+              stemi_v3 = "co-expression_indivs_combined/stemi/version3/",
+              pilot    = "co-expression_indivs_combined/ng_updated_version/")
+
+# File name ending of the correlation file of each single cell data set
+file_suffixes <- c(mio_v3   = "_UT_correlation.csv",
+                   mio_v2   = "_UT_correlation.csv",
+                   stemi_v2 = "_t8w_correlation.csv",
+                   stemi_v3 = "_t8w_correlation.csv",
+                   pilot    = "_correlation.csv")
+
+# Label of each single cell data set on the plots
+dataset_names <- c(mio_v3   = "Oelen (v3)",
+                   mio_v2   = "Oelen (v2)",
+                   stemi_v2 = "van Blokland (v2)",
+                   stemi_v3 = "van Blokland (v3)",
+                   pilot    = "van der Wijst")
+
+# Bulk data sets the single cell data is compared with
+bulk_datasets <- c("BLUEPRINT", "BIOS", "ImmuNexUT")
+
+# Rewrite the gene pair IDs ("geneA;geneB") in column V1 so that the gene that
+# sorts first alphabetically always comes first; all other columns stay as they are
+resort<-function(corr){
+  first<-sub(";.*","",corr$V1)   # gene in front of the separator
+  second<-sub(".*;","",corr$V1)  # gene behind the separator
+
+  # IDs that are already ordered are kept, the others are swapped
+  swapped<-paste0(second,";",first)
+  corr$V1<-ifelse(first < second,corr$V1,swapped)
+
+  # corr: the input table with the updated V1 column
+  return(corr)
+}
+
+################################################################################
+# Compare single cell vs ImmuNexUT - all cell types against all cell types
+################################################################################
+
+# Cell type matching: single cell label (sc_ct) and ImmuNexUT label (imn_ct), row-wise
+ct_mapping<-data.frame(sc_ct=c("CD4T","CD8T","B","monocyte","NK","DC"),
+ imn_ct=c("Naive_CD4","Naive_CD8","Naive_B","CL_Mono","NK","mDC"))
+
+corr_comp<-NULL
+for(i in 1:nrow(ct_mapping)){
+
+ ct <- ct_mapping$imn_ct[i]
+ #Only the bulk cell type is fixed here; the single cell type is looped over below
+
+ #Load ImmuNexUT data (gene names in column V1; remaining columns presumably samples - TODO confirm)
+ combat_tmm<-fread(paste0("imd_paper_rna_data/norm_count/",ct,"_norm_count.txt"))
+
+ #Load the different single cell data sets
+ for(c1 in 1:length(datasets)){
+
+ #Load for each single cell dataset all cell types
+ for(cell_type in ct_mapping$sc_ct){
+
+ #Load single cell data set (gene pair ID in V1, ordered alphabetically by resort)
+ corr_c2<-fread(paste0(datasets[c1],cell_type,
+ "/",cell_type,file_suffixes[c1]))
+ corr_c2<-resort(corr_c2)
+
+ #Filter the ImmuNexUT data set for genes that occur in the single cell gene pairs
+ expressed_genes<-union(gsub(";.*","",corr_c2$V1),
+ gsub(".*;","",corr_c2$V1))
+ expressed_genes<-intersect(expressed_genes,combat_tmm$V1)
+
+ combat_tmm_filtered<-combat_tmm[combat_tmm$V1 %in% expressed_genes,]
+ combat_tmm_filtered<-as.data.frame(combat_tmm_filtered)
+ rownames(combat_tmm_filtered)<-combat_tmm_filtered$V1
+ combat_tmm_filtered$V1<-NULL
+
+ #Calculate the gene-gene Spearman correlation in bulk (rows become columns via t())
+ cor_matrix<-cor(t(combat_tmm_filtered),method="spearman")
+ cor_matrix<-reshape2::melt(cor_matrix)
+ cor_matrix$Var1<-as.character(cor_matrix$Var1)
+ cor_matrix$Var2<-as.character(cor_matrix$Var2)
+ cor_matrix<-cor_matrix[cor_matrix$Var1 < cor_matrix$Var2,] # each pair once, same order as resort
+ cor_matrix$V1<-paste0(cor_matrix$Var1,";",cor_matrix$Var2)
+ cor_matrix$Var1<-NULL
+ cor_matrix$Var2<-NULL
+
+ #Compare ImmuNexUT with single cell: keep the gene pairs present in both
+ corr<-merge(corr_c2,cor_matrix,by=c("V1"))
+
+ #Unique genes
+ num_genes<-length(union(gsub(";.*","",corr$V1),
+ gsub(".*;","",corr$V1)))
+
+ corr_comp<-rbind(corr_comp,
+ data.frame(sc_ct=cell_type,
+ bulk_ct=ct,
+ c1=dataset_names[c1],
+ c2="ImmuNexUT",
+ gene_pairs=nrow(corr),
+ genes_unique=num_genes,
+ corr=cor(corr[[2]],corr[[3]],method="pearson"))) # columns 2/3 by position; assumes one value column per input - TODO confirm
+ }
+ }
+}
+
+# Save correlations (tab separated, one row per bulk cell type x data set x single cell type)
+write.table(corr_comp,
+ file="co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_immunexut_mixedcts.tsv",
+ sep="\t",row.names = FALSE,quote=FALSE)
+
+
+################################################################################
+# Plot the results: one heatmap per single cell data set (all cell type pairs)
+################################################################################
+
+corr_comp<-fread("co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_immunexut_mixedcts.tsv")
+
+# Matched cell types; their order defines the order on both heatmap axes
+ct_mapping<-data.frame(sc_ct=c("CD4T","CD8T","B","monocyte","NK","DC"),
+                       imn_ct=c("Naive_CD4","Naive_CD8","Naive_B","CL_Mono","NK","mDC"))
+corr_comp$sc_ct<-factor(corr_comp$sc_ct,levels=ct_mapping$sc_ct)
+corr_comp$bulk_ct<-factor(corr_comp$bulk_ct,levels=ct_mapping$imn_ct)
+
+# Tiles coloured by correlation; text = correlation and (number of unique genes),
+# printed in white on tiles with corr < 0.6 and in black otherwise
+g<-ggplot(corr_comp,aes(x=sc_ct,y=bulk_ct,fill=corr))+
+  geom_tile()+
+  geom_text(aes(label=paste0(round(corr,3),"\n(",genes_unique,")"),
+                color=ifelse(corr<0.6,'white','black')),
+            size=3)+
+  scale_fill_viridis("Correlation",limits=c(0,1))+
+  scale_color_manual(values=c("black","white"))+
+  guides(color="none")+
+  facet_wrap(~c1)+
+  labs(x="Cell type - single cell",y="Cell type - bulk")+
+  theme(legend.position = "bottom")
+
+print(g)
+
+ggsave(filename="correlation_mixed_cts.png",plot=g,height=7,width=9)
+
+################################################################################
+# Normalize each single cell column by its diagonal entry (matched cell types)
+################################################################################
+
+ct_mapping_list<-setNames(c("CD4T","CD8T","B","monocyte","NK","DC"),
+                          c("Naive_CD4","Naive_CD8","Naive_B","CL_Mono","NK","mDC"))
+# as.character(): indexing with the factor itself would use its integer codes, not its labels
+corr_comp$bulk_matched_ct<-ct_mapping_list[as.character(corr_comp$bulk_ct)]
+corr_diagonal<-corr_comp[corr_comp$sc_ct==corr_comp$bulk_matched_ct,c("sc_ct","c1","corr")]
+colnames(corr_diagonal)<-c("sc_ct","c1","diag_corr")
+
+corr_comp<-merge(corr_comp,corr_diagonal,by=c("sc_ct","c1"))
+corr_comp$rel_corr<-corr_comp$corr/corr_comp$diag_corr
+
+g<-ggplot(corr_comp,aes(x=sc_ct,y=bulk_ct,fill=rel_corr))+
+  geom_tile()+
+  geom_text(aes(label=paste0(round(rel_corr,3),"\n(",round(corr,3),")"),
+                color=ifelse(rel_corr<1,'white','black')),size=3)+
+  scale_color_manual(values=c("black","white"))+
+  facet_wrap(~c1)+
+  xlab("Cell type - single cell")+
+  ylab("Cell type - bulk")+
+  scale_fill_viridis("Relative corr")+
+  theme(legend.position = "bottom")+
+  guides(color="none")
+
+print(g)
+
+ggsave(g,file="correlation_mixed_cts_normalized.png",height=7,width=9)
diff --git a/02_correlation_evaluation/correlation_between_datasets_othercts.R b/02_correlation_evaluation/correlation_between_datasets_othercts.R
new file mode 100644
index 0000000..35a45b1
--- /dev/null
+++ b/02_correlation_evaluation/correlation_between_datasets_othercts.R
@@ -0,0 +1,616 @@
+# ------------------------------------------------------------------------------
+# Extension of correlation_between_datasets.R (which looks only at CD4+ T cells)
+# for other cell types: get Pearson correlation between data sets
+# * for single cell vs single cell data set
+# * for single cell vs bulk data set
+# * for bulk vs bulk data set (here only monocytes)
+# Plot one heatmap for each comparison
+# ------------------------------------------------------------------------------
+
+library(data.table)
+library(reticulate) # to read the single cell data (numpy)
+library(ggplot2)
+library(viridis)
+library(dplyr)
+
+theme_set(theme_bw())
+
+# Display name of each cell type, keyed by the short label used in file names
+ct_fullname <- c(CD8T = "CD8+ T cells", monocyte = "monocytes", NK = "NK cells",
+                 B = "B cells", DC = "DC")
+
+# Directory with the correlation files of each single cell data set
+datasets <- c(mio_v3   = "co-expression_indivs_combined/",
+              mio_v2   = "co-expression_indivs_combined/one_million_version2/",
+              stemi_v2 = "co-expression_indivs_combined/stemi/version2/",
+              stemi_v3 = "co-expression_indivs_combined/stemi/version3/",
+              pilot    = "co-expression_indivs_combined/ng_updated_version/")
+
+# File name ending of the correlation file of each single cell data set
+file_suffixes <- c(mio_v3   = "_UT_correlation.csv",
+                   mio_v2   = "_UT_correlation.csv",
+                   stemi_v2 = "_t8w_correlation.csv",
+                   stemi_v3 = "_t8w_correlation.csv",
+                   pilot    = "_correlation.csv")
+
+# Label of each single cell data set on the plots
+dataset_names <- c(mio_v3   = "Oelen (v3)",
+                   mio_v2   = "Oelen (v2)",
+                   stemi_v2 = "van Blokland (v2)",
+                   stemi_v3 = "van Blokland (v3)",
+                   pilot    = "van der Wijst")
+
+# Bulk data sets the single cell data is compared with
+bulk_datasets <- c("BLUEPRINT", "BIOS", "ImmuNexUT")
+
+# Rewrite the gene pair IDs ("geneA;geneB") in column V1 so that the gene that
+# sorts first alphabetically always comes first; all other columns stay as they are
+resort<-function(corr){
+  first<-sub(";.*","",corr$V1)   # gene in front of the separator
+  second<-sub(".*;","",corr$V1)  # gene behind the separator
+
+  # IDs that are already ordered are kept, the others are swapped
+  swapped<-paste0(second,";",first)
+  corr$V1<-ifelse(first < second,corr$V1,swapped)
+
+  # corr: the input table with the updated V1 column
+  return(corr)
+}
+
+
+################################################################################
+# 1) Compare single cell with each other
+################################################################################
+
+# Collects, per cell type, one row for every unordered pair of data sets:
+#  * data set vs itself: correlation fixed to 1, gene (pair) counts taken from
+#    the data set's own correlation file
+#  * data set c1 vs c2 (c1 < c2): Pearson correlation of the co-expression
+#    values over the gene pairs found in both files
+# The mirrored comparison (c2 vs c1) is not stored.
+# corr[[2]] and corr[[3]] are addressed by position after the merge, i.e. each
+# file is assumed to hold the pair ID in V1 plus exactly one value column
+
+corr_comp<-NULL
+for(cell_type in c("CD8T","monocyte","NK","B","DC")){
+
+  # seq_along() includes the last data set, so its diagonal entry does not need
+  # a copy of the loop body behind the loop; it is also safe for a single
+  # data set, where 1:(length(datasets)-1) would iterate over 1 and 0
+  for(c1 in seq_along(datasets)){
+
+    #Read correlation file one
+    dataset_name1<-dataset_names[c1]
+    corr_c1<-fread(paste0(datasets[c1],cell_type,
+                          "/",cell_type,file_suffixes[c1]))
+    corr_c1<-resort(corr_c1)
+
+    #Unique genes
+    num_genes<-length(union(gsub(";.*","",corr_c1$V1),
+                            gsub(".*;","",corr_c1$V1)))
+
+    #Diagonal entry
+    corr_comp<-rbind(corr_comp,
+                     data.frame(cell_type,
+                                c1=dataset_name1,
+                                c2=dataset_name1,
+                                gene_pairs=nrow(corr_c1),
+                                genes_unique=num_genes,
+                                corr=1))
+
+    #The last data set has no partner left; the guard is required because
+    #(c1+1):length(datasets) would count downwards instead of being empty
+    if(c1==length(datasets)) next
+
+    for(c2 in (c1+1):length(datasets)){
+
+      #Read correlation file two
+      dataset_name2<-dataset_names[c2]
+      corr_c2<-fread(paste0(datasets[c2],cell_type,"/",
+                            cell_type,file_suffixes[c2]))
+      corr_c2<-resort(corr_c2)
+
+      #Keep the gene pairs present in both data sets
+      corr<-merge(corr_c1,corr_c2,by=c("V1"))
+
+      #Unique genes
+      num_genes<-length(union(gsub(";.*","",corr$V1),
+                              gsub(".*;","",corr$V1)))
+
+      corr_comp<-rbind(corr_comp,
+                       data.frame(cell_type,
+                                  c1=dataset_name1,
+                                  c2=dataset_name2,
+                                  gene_pairs=nrow(corr),
+                                  genes_unique=num_genes,
+                                  corr=cor(corr[[2]],corr[[3]],method="pearson")))
+    }
+  }
+}
+
+# Save correlations
+# (tab separated, one row per cell type and data set comparison)
+write.table(corr_comp,
+            file="co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_datasets_othercts.tsv",
+            sep="\t",row.names = FALSE,quote=FALSE)
+
+################################################################################
+# Plot comparison of single cell vs single cell (Supplementary Figure)
+################################################################################
+
+corr_comp<-fread("co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_datasets_othercts.tsv")
+corr_comp$c1<-factor(corr_comp$c1,levels=dataset_names)
+corr_comp$c2<-factor(corr_comp$c2,levels=dataset_names)
+corr_comp$cell_type<-ct_fullname[corr_comp$cell_type] # short label -> display name
+
+g<-ggplot(corr_comp,aes(x=c1,y=c2,fill=corr))+
+  geom_tile()+
+  geom_text(aes(label=paste0(round(corr,3),"\n(",genes_unique,")"),
+                color=ifelse(corr<0.6,'white','black')),size=3)+
+  scale_color_manual(values=c("black","white"))+
+  xlab("Single cell data set")+
+  ylab("Single cell data set")+
+  scale_fill_viridis("Correlation",limits=c(0,1))+
+  facet_wrap(~cell_type)+
+  scale_y_discrete(labels=c("Oelen (v3)","Oelen (v2)","van Blokland\n(v2)",
+                            "van Blokland\n(v3)","van der Wijst"))+
+  scale_x_discrete(labels=c("Oelen\n(v3)","Oelen\n(v2)","van\nBlokland\n(v2)",
+                            "van\nBlokland\n(v3)","van der\nWijst"))+
+  theme(legend.position=c(0.9,0.1))+
+  guides(color="none") # guides(color=FALSE) is deprecated in ggplot2
+print(g)
+
+ggsave(g,file=paste0("co-expression_indivs_combined/plots/corr_single_cell_othercts.png"),
+       width=8.5,height=6.5)
+
+#Get also CD4 T cell results (written by the CD4+ T cell script) and put them in front
+corr_comp_ct<-fread("co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_datasets.tsv")
+corr_comp_ct$cell_type<-"CD4T"
+corr_comp_ct<-corr_comp_ct[,colnames(corr_comp),with=FALSE] # same columns, same order
+corr_comp<-rbind(corr_comp_ct,corr_comp)
+
+#Get mean, median and range of the correlation per cell type (printed only)
+corr_comp%>%
+  group_by(cell_type)%>%
+  summarise(mean(corr),median(corr),min(corr),max(corr))
+
+#Overall distribution across all cell types
+summary(corr_comp$corr)
+
+################################################################################
+# 2) Compare single cell with bulk
+################################################################################
+
+################################################################################
+# For Blueprint - Monocytes
+################################################################################
+
+#Blueprint Monocyte correlation (precomputed gene x gene matrix stored as .npy)
+path<-"blueprint_data/mono_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.1PCAsOverSamplesRemoved.spearmanr."
+rowname_suffix<-"genes.txt"
+np<-reticulate::import("numpy") # numpy handle for np$load; it was used without ever being defined
+corr_c1 <- np$load(paste0(path,"npy"), mmap_mode="r")
+row_names<-fread(paste0(path,rowname_suffix),header=FALSE)
+rownames(corr_c1)<-row_names$V1
+colnames(corr_c1)<-row_names$V1
+rm(row_names)
+
+corr_comp<-NULL
+cell_type<-"monocyte"
+for(c1 in 1:length(datasets)){
+
+  #Load single cell data set
+  corr_c2<-fread(paste0(datasets[c1],cell_type,
+                        "/",cell_type,file_suffixes[c1]))
+  corr_c2<-resort(corr_c2)
+
+  #Filter the Blueprint data set for the genes of the single cell data set
+  expressed_genes<-union(gsub(";.*","",corr_c2$V1),
+                         gsub(".*;","",corr_c2$V1))
+  expressed_genes<-intersect(expressed_genes,colnames(corr_c1))
+
+  corr_c1_filtered<-corr_c1[expressed_genes,expressed_genes]
+  corr_c1_filtered<-reshape2::melt(corr_c1_filtered)
+  corr_c1_filtered$Var1<-as.character(corr_c1_filtered$Var1)
+  corr_c1_filtered$Var2<-as.character(corr_c1_filtered$Var2)
+  corr_c1_filtered<-corr_c1_filtered[corr_c1_filtered$Var1 < corr_c1_filtered$Var2,]
+  corr_c1_filtered$V1<-paste0(corr_c1_filtered$Var1,";",corr_c1_filtered$Var2)
+  corr_c1_filtered$Var1<-NULL
+  corr_c1_filtered$Var2<-NULL
+
+  corr<-merge(corr_c1_filtered,corr_c2,by=c("V1"))
+
+  #Unique genes
+  num_genes<-length(union(gsub(";.*","",corr$V1),
+                          gsub(".*;","",corr$V1)))
+
+  corr_comp<-rbind(corr_comp,
+                   data.frame(cell_type,
+                              c1=dataset_names[c1],
+                              c2="BLUEPRINT",
+                              gene_pairs=nrow(corr),
+                              genes_unique=num_genes,
+                              corr=cor(corr[[2]],corr[[3]],method="pearson")))
+}
+
+# Save correlations
+write.table(corr_comp,
+            file="co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_blueprint_mono.tsv",
+            sep="\t",row.names = FALSE,quote=FALSE)
+
+################################################################################
+# For Blueprint - CD4T
+################################################################################
+
+#Blueprint CD4T correlation (same steps as for the monocytes, other input files)
+path<-"blueprint_data/tcel_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.spearmanR."
+rowname_suffix<-"rows.txt"
+
+corr_c1 <- np$load(paste0(path,"npy"), mmap_mode="r")
+row_names<-fread(paste0(path,rowname_suffix),header=FALSE)
+rownames(corr_c1)<-row_names$V1
+colnames(corr_c1)<-row_names$V1
+rm(row_names)
+
+corr_comp<-NULL
+cell_type<-"CD4T"
+for(c1 in 1:length(datasets)){
+
+  #Load single cell data set
+  corr_c2<-fread(paste0(datasets[c1],cell_type,
+                        "/",cell_type,file_suffixes[c1]))
+  corr_c2<-resort(corr_c2)
+
+  #Filter the Blueprint data set for the genes of the single cell data set
+  expressed_genes<-union(gsub(";.*","",corr_c2$V1),
+                         gsub(".*;","",corr_c2$V1))
+  expressed_genes<-intersect(expressed_genes,colnames(corr_c1))
+
+  corr_c1_filtered<-corr_c1[expressed_genes,expressed_genes]
+  corr_c1_filtered<-reshape2::melt(corr_c1_filtered)
+  corr_c1_filtered$Var1<-as.character(corr_c1_filtered$Var1)
+  corr_c1_filtered$Var2<-as.character(corr_c1_filtered$Var2)
+  corr_c1_filtered<-corr_c1_filtered[corr_c1_filtered$Var1 < corr_c1_filtered$Var2,]
+  corr_c1_filtered$V1<-paste0(corr_c1_filtered$Var1,";",corr_c1_filtered$Var2)
+  corr_c1_filtered$Var1<-NULL
+  corr_c1_filtered$Var2<-NULL
+
+  corr<-merge(corr_c1_filtered,corr_c2,by=c("V1"))
+
+  #Unique genes
+  num_genes<-length(union(gsub(";.*","",corr$V1),
+                          gsub(".*;","",corr$V1)))
+
+  corr_comp<-rbind(corr_comp,
+                   data.frame(cell_type,
+                              c1=dataset_names[c1],
+                              c2="BLUEPRINT",
+                              gene_pairs=nrow(corr),
+                              genes_unique=num_genes,
+                              corr=cor(corr[[2]],corr[[3]],method="pearson")))
+}
+
+# Save correlations
+write.table(corr_comp,
+            file="co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_blueprint_cd4t.tsv",
+            sep="\t",row.names = FALSE,quote=FALSE)
+
+################################################################################
+# For ImmuNexUT - all cell types
+################################################################################
+
+# Cell type matching: single cell label (sc_ct) and ImmuNexUT label (imn_ct), row-wise
+ct_mapping<-data.frame(sc_ct=c("CD4T","CD8T","B","monocyte","NK","DC"),
+ imn_ct=c("Naive_CD4","Naive_CD8","Naive_B","CL_Mono","NK","mDC"))
+
+corr_comp<-NULL
+for(i in 1:nrow(ct_mapping)){
+
+ ct <- ct_mapping$imn_ct[i]
+ cell_type<- ct_mapping$sc_ct[i]
+
+ #Load ImmuNexUT data (gene names in column V1; remaining columns presumably samples - TODO confirm)
+ combat_tmm<-fread(paste0("imd_paper_rna_data/norm_count/",ct,"_norm_count.txt"))
+
+ #Load the different single cell data sets
+ for(c1 in 1:length(datasets)){
+ #Load single cell data set (gene pair ID in V1, ordered alphabetically by resort)
+ corr_c2<-fread(paste0(datasets[c1],cell_type,
+ "/",cell_type,file_suffixes[c1]))
+ corr_c2<-resort(corr_c2)
+
+ #Filter the ImmuNexUT data set for genes that occur in the single cell gene pairs
+ expressed_genes<-union(gsub(";.*","",corr_c2$V1),
+ gsub(".*;","",corr_c2$V1))
+ expressed_genes<-intersect(expressed_genes,combat_tmm$V1)
+
+ combat_tmm_filtered<-combat_tmm[combat_tmm$V1 %in% expressed_genes,]
+ combat_tmm_filtered<-as.data.frame(combat_tmm_filtered)
+ rownames(combat_tmm_filtered)<-combat_tmm_filtered$V1
+ combat_tmm_filtered$V1<-NULL
+
+ #Calculate the gene-gene Spearman correlation in bulk (rows become columns via t())
+ cor_matrix<-cor(t(combat_tmm_filtered),method="spearman")
+ cor_matrix<-reshape2::melt(cor_matrix)
+ cor_matrix$Var1<-as.character(cor_matrix$Var1)
+ cor_matrix$Var2<-as.character(cor_matrix$Var2)
+ cor_matrix<-cor_matrix[cor_matrix$Var1 < cor_matrix$Var2,] # each pair once, same order as resort
+ cor_matrix$V1<-paste0(cor_matrix$Var1,";",cor_matrix$Var2)
+ cor_matrix$Var1<-NULL
+ cor_matrix$Var2<-NULL
+
+ #Compare ImmuNexUT with single cell: keep the gene pairs present in both
+ corr<-merge(corr_c2,cor_matrix,by=c("V1"))
+
+ #Unique genes
+ num_genes<-length(union(gsub(";.*","",corr$V1),
+ gsub(".*;","",corr$V1)))
+
+ corr_comp<-rbind(corr_comp,
+ data.frame(cell_type,
+ c1=dataset_names[c1],
+ c2="ImmuNexUT",
+ gene_pairs=nrow(corr),
+ genes_unique=num_genes,
+ corr=cor(corr[[2]],corr[[3]],method="pearson"))) # columns 2/3 by position; assumes one value column per input - TODO confirm
+ }
+}
+
+# Save correlations (tab separated, one row per cell type and single cell data set)
+write.table(corr_comp,
+ file="co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_immunexut_allcts.tsv",
+ sep="\t",row.names = FALSE,quote=FALSE)
+
+################################################################################
+# For BIOS - all cell types
+################################################################################
+
+#Load the bios expression matrix (gene names in column genename; loaded once, filtered per data set)
+bios_data<-fread("bios/gene_read_counts_BIOS_and_LLD_passQC.tsv.SampleSelection.ProbesWithZeroVarianceRemoved.TMM.SampleSelection.ProbesWithZeroVarianceRemoved.Log2Transformed.ProbesCentered.SamplesZTransformed.CovariatesRemovedOLS.noLLDeep.scGeneOnly.txt.gz")
+
+corr_comp<-NULL
+for(cell_type in c("CD4T","CD8T","monocyte","NK","B","DC")){
+ for(c1 in 1:length(datasets)){
+
+ corr_c1<-fread(paste0(datasets[c1],cell_type,
+ "/",cell_type,file_suffixes[c1]))
+ corr_c1<-resort(corr_c1) # gene pair IDs in V1, ordered alphabetically
+ expressed_genes<-union(gsub(";.*","",corr_c1$V1),
+ gsub(".*;","",corr_c1$V1))
+
+ #Filter BIOS for the genes expressed in the respective data set
+ bios_data_filtered<-bios_data[bios_data$genename %in% expressed_genes,]
+ bios_data_filtered<-as.data.frame(bios_data_filtered)
+ rownames(bios_data_filtered)<-bios_data_filtered$genename
+ bios_data_filtered$genename<-NULL
+
+ #Calculate the gene-gene Spearman correlation for BIOS (rows become columns via t())
+ cor_matrix<-cor(t(bios_data_filtered),method="spearman")
+ cor_matrix<-reshape2::melt(cor_matrix)
+ cor_matrix$Var1<-as.character(cor_matrix$Var1)
+ cor_matrix$Var2<-as.character(cor_matrix$Var2)
+ cor_matrix<-cor_matrix[cor_matrix$Var1 < cor_matrix$Var2,] # each pair once, same order as resort
+ cor_matrix$V1<-paste0(cor_matrix$Var1,";",cor_matrix$Var2)
+ cor_matrix$Var1<-NULL
+ cor_matrix$Var2<-NULL
+
+ #Compare BIOS with single cell: keep the gene pairs present in both
+ corr<-merge(corr_c1,cor_matrix,by=c("V1"))
+
+ #Unique genes
+ num_genes<-length(union(gsub(";.*","",corr$V1),
+ gsub(".*;","",corr$V1)))
+
+ corr_comp<-rbind(corr_comp,
+ data.frame(cell_type,
+ c1=dataset_names[c1],
+ c2="BIOS",
+ gene_pairs=nrow(corr),
+ genes_unique=num_genes,
+ corr=cor(corr[[2]],corr[[3]],method="pearson"))) # columns 2/3 by position; assumes one value column per input - TODO confirm
+
+ }
+}
+
+# Save correlations (tab separated, one row per cell type and single cell data set)
+write.table(corr_comp,
+ file="co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_bios_allcts.tsv",
+ sep="\t",row.names = FALSE,quote=FALSE)
+
+################################################################################
+# Plot comparison of single cell vs bulk (Supplementary Figure)
+################################################################################
+
+#Load the different data sets (result tables written by the sections above)
+corr_comp<-rbind(fread("co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_bios_allcts.tsv"),
+                 fread("co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_immunexut_allcts.tsv"),
+                 fread("co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_blueprint_mono.tsv"),
+                 fread("co-expression_indivs_combined/dataset_comp_summary/correlation_singlecell_blueprint_cd4t.tsv"))
+
+corr_comp%>%
+  group_by(cell_type,c2)%>%
+  summarise(mean(corr),median(corr),min(corr),max(corr))
+
+#Remove CD4T cells (already shown in the main figure)
+corr_comp<-corr_comp[corr_comp$cell_type != "CD4T",]
+
+corr_comp$c1<-factor(corr_comp$c1,levels=dataset_names)
+
+corr_comp$cell_type<-ct_fullname[corr_comp$cell_type] # short label -> display name
+
+g<-ggplot(corr_comp,aes(x=c2,y=c1,fill=corr))+
+  geom_tile()+
+  geom_text(aes(label=paste0(round(corr,3),"\n(",genes_unique,")"),
+                color=ifelse(corr<0.6,'white','black')),size=3)+
+  scale_color_manual(values=c("black","white"))+
+  xlab("Bulk cell data set")+
+  ylab("Single cell data set")+
+  scale_fill_viridis("Correlation",limits=c(0,1))+
+  facet_wrap(~cell_type)+
+  scale_y_discrete(labels=c("Oelen (v3)","Oelen (v2)","van Blokland\n(v2)",
+                            "van Blokland\n(v3)","van der Wijst"))+
+  theme(legend.position=c(0.9,0.1))+
+  guides(color="none") # guides(color=FALSE) is deprecated in ggplot2
+print(g)
+
+ggsave(g,file=paste0("co-expression_indivs_combined/plots/corr_singlevsbulk_othercts.png"),
+       width=8.5,height=6.5)
+
+################################################################################
+# 3) Compare bulk vs bulk for Monocytes
+################################################################################
+
+# Read the monocyte co-expression of one bulk data set (each one is stored in a
+# different format). Returns a table with the gene pair ID "geneA;geneB"
+# (geneA < geneB) in column V1 next to the correlation value.
+# dataset_name: "BLUEPRINT" or "BIOS"; any other value is read as ImmuNexUT
+read_bulk_data<-function(dataset_name){
+  if(dataset_name=="BLUEPRINT"){
+    #Blueprint Monocyte correlation (precomputed gene x gene matrix)
+    path<-"blueprint_data/mono_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.1PCAsOverSamplesRemoved.spearmanr."
+    rowname_suffix<-"genes.txt"
+    np<-reticulate::import("numpy") # local numpy handle, the script defines no global one
+    corr_c1 <- np$load(paste0(path,"npy"), mmap_mode="r")
+    row_names<-fread(paste0(path,rowname_suffix),header=FALSE)
+    rownames(corr_c1)<-row_names$V1
+    colnames(corr_c1)<-row_names$V1
+    rm(row_names)
+
+    #Filter for single cell data (genes of the Oelen v3 monocyte file)
+    ct_single_cell<-"monocyte"
+    corr_sc<-fread(paste0("co-expression_indivs_combined/",ct_single_cell,"/",
+                          ct_single_cell,"_UT_correlation.csv"))
+    corr_sc$gene1<-gsub(";.*","",corr_sc$V1)
+    corr_sc$gene2<-gsub(".*;","",corr_sc$V1)
+    sc_genes<-union(corr_sc$gene1,corr_sc$gene2)
+    sc_genes<-sc_genes[sc_genes %in% colnames(corr_c1)]
+
+    corr_c1<-corr_c1[sc_genes,sc_genes]
+    corr_c1<-reshape2::melt(corr_c1)
+    corr_c1$Var1<-as.character(corr_c1$Var1)
+    corr_c1$Var2<-as.character(corr_c1$Var2)
+    colnames(corr_c1)[1:3]<-c("gene1","gene2","corr")
+
+    #Keep each pair once with gene1 first in alphabet (!= kept both mirrored halves)
+    corr_c1<-corr_c1[corr_c1$gene1 < corr_c1$gene2,]
+    corr_c1$V1<-paste0(corr_c1$gene1,";",corr_c1$gene2)
+    corr_c1$gene1<-NULL
+    corr_c1$gene2<-NULL
+
+    corr_c1<-corr_c1[,c("V1","corr")]
+  } else if (dataset_name=="BIOS"){
+    #Load the bios expression matrix
+    bios_data<-fread("bios/gene_read_counts_BIOS_and_LLD_passQC.tsv.SampleSelection.ProbesWithZeroVarianceRemoved.TMM.SampleSelection.ProbesWithZeroVarianceRemoved.Log2Transformed.ProbesCentered.SamplesZTransformed.CovariatesRemovedOLS.noLLDeep.scGeneOnly.txt.gz")
+
+    # Read single cell data to filter for the expressed genes
+    cell_type <- "monocyte"
+    corr_sc<-fread(paste0("co-expression_indivs_combined/",cell_type,
+                          "/",cell_type,"_UT_correlation.csv"))
+    corr_sc<-resort(corr_sc)
+    expressed_genes<-union(gsub(";.*","",corr_sc$V1),
+                           gsub(".*;","",corr_sc$V1))
+
+    #Filter BIOS for the genes expressed in the respective data set
+    bios_data_filtered<-bios_data[bios_data$genename %in% expressed_genes,]
+    bios_data_filtered<-as.data.frame(bios_data_filtered)
+    rownames(bios_data_filtered)<-bios_data_filtered$genename
+    bios_data_filtered$genename<-NULL
+
+    #Calculate correlation for BIOS
+    corr_c1<-cor(t(bios_data_filtered),method="spearman")
+    corr_c1<-reshape2::melt(corr_c1)
+    corr_c1$Var1<-as.character(corr_c1$Var1)
+    corr_c1$Var2<-as.character(corr_c1$Var2)
+    corr_c1<-corr_c1[corr_c1$Var1 < corr_c1$Var2,]
+    corr_c1$V1<-paste0(corr_c1$Var1,";",corr_c1$Var2)
+    corr_c1$Var1<-NULL
+    corr_c1$Var2<-NULL
+  } else { #ImmuNexUT (precomputed pairwise correlations with columns gene1, gene2)
+    corr_c1<-fread("imd_paper_rna_data/correlation/CL_Mono_correlation.txt")
+    #Write the pair ID in alphabetical order, whatever the order in the file
+    corr_c1$V1<-ifelse(corr_c1$gene1 < corr_c1$gene2,
+                       paste0(corr_c1$gene1,";",corr_c1$gene2),
+                       paste0(corr_c1$gene2,";",corr_c1$gene1))
+    corr_c1$gene1<-NULL
+    corr_c1$gene2<-NULL
+  }
+
+  return(corr_c1)
+}
+
+#Compare each bulk dataset against all other
+# Result: one row per unordered pair of bulk data sets, including each data set
+# with itself (correlation fixed to 1)
+
+# Load every bulk data set only once: read_bulk_data() is expensive (for BIOS it
+# computes a full gene x gene correlation matrix), and the nested loop would
+# otherwise read the same data set several times
+bulk_corr<-lapply(setNames(bulk_datasets,bulk_datasets),read_bulk_data)
+
+corr_comp<-NULL
+# seq_along() includes the last data set, so its diagonal entry does not need a
+# copy of the loop body behind the loop
+for(c1 in seq_along(bulk_datasets)){
+
+  #Correlation table one
+  dataset_name1<-bulk_datasets[c1]
+  corr_c1<-bulk_corr[[dataset_name1]]
+
+  #Unique genes
+  num_genes<-length(union(gsub(";.*","",corr_c1$V1),
+                          gsub(".*;","",corr_c1$V1)))
+
+  #Diagonal entry
+  corr_comp<-rbind(corr_comp,
+                   data.frame(c1=dataset_name1,
+                              c2=dataset_name1,
+                              gene_pairs=nrow(corr_c1),
+                              genes_unique=num_genes,
+                              corr=1))
+
+  #The last data set has no partner left; the guard is required because
+  #(c1+1):length(bulk_datasets) would count downwards instead of being empty
+  if(c1==length(bulk_datasets)) next
+
+  for(c2 in (c1+1):length(bulk_datasets)){
+
+    #Correlation table two
+    dataset_name2<-bulk_datasets[c2]
+    corr_c2<-bulk_corr[[dataset_name2]]
+
+    #Keep the gene pairs present in both data sets
+    corr<-merge(corr_c1,corr_c2,by=c("V1"))
+
+    #Unique genes
+    num_genes<-length(union(gsub(";.*","",corr$V1),
+                            gsub(".*;","",corr$V1)))
+
+    corr_comp<-rbind(corr_comp,
+                     data.frame(c1=dataset_name1,
+                                c2=dataset_name2,
+                                gene_pairs=nrow(corr),
+                                genes_unique=num_genes,
+                                corr=cor(corr[[2]],corr[[3]],method="pearson")))
+  }
+}
+
+# Save correlations
+write.table(corr_comp,
+            file="co-expression_indivs_combined/dataset_comp_summary/correlation_bulk_datasets_monocytes.tsv",
+            sep="\t",row.names = FALSE,quote=FALSE)
+
+# Save plot: heatmap of the bulk vs bulk correlations, axes in the order of bulk_datasets
+corr_comp$c1<-factor(corr_comp$c1,levels=bulk_datasets)
+corr_comp$c2<-factor(corr_comp$c2,levels=bulk_datasets)
+
+g<-ggplot(corr_comp,aes(x=c1,y=c2,fill=corr))+
+  geom_tile()+
+  geom_text(aes(label=paste0(round(corr,3),"\n(",genes_unique,")"),
+                color=ifelse(corr<0.6,'white','black')),size=3)+
+  scale_color_manual(values=c("black","white"))+
+  xlab("Bulk data set")+
+  ylab("Bulk data set")+
+  scale_fill_viridis("Correlation",limits=c(0,1))+
+  guides(color="none") # guides(color=FALSE) is deprecated in ggplot2
+
+ggsave(g,file="co-expression_indivs_combined/plots/corr_bulk_mono.png",
+       width=5,height=3)
\ No newline at end of file
diff --git a/02_correlation_evaluation/correlation_timepoint_combined_indivs_1mio.py b/02_correlation_evaluation/correlation_timepoint_combined_indivs_1mio.py
new file mode 100644
index 0000000..15f0bdb
--- /dev/null
+++ b/02_correlation_evaluation/correlation_timepoint_combined_indivs_1mio.py
@@ -0,0 +1,97 @@
+###########################################################################################
+# Calculate correlation for each cell type, selecting always one timepoint (UT)
+# merging all individuals for Oelen v2 and v3 dataset
+###########################################################################################
+
+#from scipy.stats import t, norm
+from scipy.stats import spearmanr
+import scanpy as sc
+import numpy as np
+import pandas as pd
+from pathlib import Path
+from time import time
+import os
+import re
+
+# specify if Oelen v3 or v2 dataset should be used
+version2 = True
+# read the genes from the precomputed per-cell-type gene list (True) or select
+# them per condition by their nonzero ratio (False); read by the main loop below
+gene_selection_file = False
+
+# result directory and scanpy object of the chosen dataset version
+if version2:
+    prefix_results = Path('co-expression_indivs_combined/one_million_version2/')
+    alldata = sc.read_h5ad('seurat_objects/1M_v2_mediumQC_ctd_rnanormed_demuxids_20201029.sct.h5ad')
+else:
+    prefix_results = Path('co-expression_indivs_combined/')
+    alldata = sc.read_h5ad('seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.SCT.h5ad')
+
+def select_gene_nonzeroratio(df, ratio):
+    """Return the columns (genes) of df whose fraction of nonzero rows (cells) is above ratio."""
+    nonzero_fraction = (df.values != 0).sum(axis=0) / df.shape[0]
+    return df.columns[nonzero_fraction > ratio]
+
+# map a timepoint - stimulation annotation to its timepoint
+def get_time(x):
+    """Return 'UT' unchanged, otherwise the first '<digits>h' token found in x."""
+    if x == 'UT':
+        return x
+    hour_tokens = re.findall(r'\d+h', x)
+    return hour_tokens[0]  # IndexError if x holds no such token
+
+
+# annotate every cell with its timepoint and a combined "<timepoint>_<cell type>" label
+observations = alldata.obs.copy()
+observations['time_merged'] = list(map(get_time, observations['timepoint']))
+observations['timepoint_id_celltype'] = [
+    f'{tp}_{ct}' for tp, ct in zip(observations['time_merged'], observations['cell_type_lowerres'])]
+
+celltypes = ['B', 'CD4T', 'CD8T', 'monocyte', 'DC', 'NK']
+for celltype in celltypes:
+    # create the result directory including missing parents; no error if it already exists
+    os.makedirs(prefix_results/celltype, exist_ok=True)
+    starttime = time()
+    print(celltype)
+    specific = alldata[alldata.obs.cell_type_lowerres==celltype]
+    celltype_data = pd.DataFrame(data=specific.X.toarray(),
+                                 index=specific.obs.index,
+                                 columns=specific.var.index)
+
+    # annotation of the cells of this cell type (used to select the condition)
+    specific_obs = observations[observations['cell_type_lowerres']==celltype]
+
+    for condition in ['UT', '3h', '24h']:
+
+        # filter for the condition
+        celltype_condition_data = celltype_data[specific_obs.time_merged==condition]
+
+        # take either tsv file with selected genes or filter genes after a nonzero rate
+        if gene_selection_file:
+            selected_genes = pd.read_csv('co-expression_indivs_combined/coexp_tp_union/genelists_tp_union/expressed_gene_'+celltype+'.tsv')
+            selected_genes = selected_genes["genes"].tolist()
+        else:
+            selected_genes = select_gene_nonzeroratio(celltype_condition_data, 0.5)
+
+        print(f"Number of selected genes for {celltype} {condition}: {len(selected_genes)}")
+
+        # "gene1;gene2" labels in the row-wise order of the upper triangle taken below
+        gene_pairs = [';'.join([gene1, gene2])
+                      for i, gene1 in enumerate(selected_genes)
+                      for gene2 in selected_genes[i+1:]]
+
+        input_df = celltype_condition_data[selected_genes]
+        input_data = spearmanr(input_df, axis=0)[0]
+        input_data_uppertria = input_data[np.triu_indices_from(input_data, 1)]
+
+        corrs_df = pd.DataFrame(data=input_data_uppertria,
+                                columns=[f'{condition}'],
+                                index=gene_pairs)
+
+        corrs_df.to_csv(prefix_results/celltype/f'{celltype}_{condition}_correlation.csv')
+
+        #Filter for 0.3 correlation cutoff
+        corrs_df = corrs_df[corrs_df[condition]>0.3]
+        corrs_df.to_csv(prefix_results/celltype/f'{celltype}_{condition}_correlation_03filtered.csv')
+
+    print(f"Finished {celltype} with time {time() - starttime}")
diff --git a/02_correlation_evaluation/correlation_timepoint_combined_indivs_ng.py b/02_correlation_evaluation/correlation_timepoint_combined_indivs_ng.py
new file mode 100644
index 0000000..616a978
--- /dev/null
+++ b/02_correlation_evaluation/correlation_timepoint_combined_indivs_ng.py
@@ -0,0 +1,88 @@
+######################################################################
+# Calculate correlation for each cell type for van der Wijst dataset,
+# merging all individuals
+######################################################################
+
+from scipy.stats import spearmanr
+import numpy as np
+import pandas as pd
+from pathlib import Path
+from time import time
+import scanpy as sc
+import os
+
+
+def select_gene_nonzeroratio(df, ratio):
+ nonzerocounts = np.count_nonzero(df.values, axis=0)/df.shape[0]
+ selected_genes = df.columns[nonzerocounts>ratio]
+ return selected_genes
+
+
+def select_gene_variances(df, ratio):
+ variances = np.var(df.values, axis=0)/df.shape[0]
+ var_thres = np.percentile(variances, ratio)
+ # var_thres = np.nanmedian(variances)
+ selected_genes = df.columns[variances>var_thres]
+ print(selected_genes[:5])
+ return selected_genes
+
+
+def get_genename(df, mapping):
+ df['genename'] = [mapping.get(geneid) for geneid in df.index]
+ df = df.dropna(subset=['genename']).drop_duplicates(subset=['genename'])
+ df = df.set_index('genename')
+ return df
+
+
+# set working directory (to shorten path length)
+os.chdir('./')
+gene_selection_file = False
+
+# load scanpy object
+prefix_results = Path('co-expression_indivs_combined/ng_updated_version')
+# test stemi v2
+# alldata = sc.read_h5ad('seurat_objects/pilot3_subsetted_celltypes_final_ensemble_converted_samples.h5ad')
+alldata = sc.read_h5ad('seurat_objects/pilot3_seurat3_200420_sct_azimuth.h5ad')
+
+# extract timepoint from timepoint - stimulation annotation
+celltype_maping = {'CD4 T': 'CD4T', 'CD8 T': 'CD8T', 'Mono': 'monocyte', 'DC': 'DC', 'NK':'NK', 'other T': 'otherT', 'other': 'other', 'B':'B'}
+alldata.obs['cell_type_mapped_to_onemillion'] = [celltype_maping.get(name) for name in alldata.obs['predicted.celltype.l1']]
+observations = alldata.obs.copy()
+celltypes = [item for item in observations['cell_type_mapped_to_onemillion'].unique() if not pd.isnull(item)]
+print(celltypes)
+for celltype in celltypes:
+ if not os.path.isdir(prefix_results / celltype):
+ os.mkdir(prefix_results / celltype)
+ starttime = time()
+ print(celltype)
+ specific = alldata[alldata.obs['cell_type_mapped_to_onemillion'] == celltype]
+ celltype_data = pd.DataFrame(data=specific.X.toarray(),
+ index=specific.obs.index,
+ columns=specific.var.index)
+ print(celltype_data.shape)
+ # get the set of gene pairs
+ specific_obs = observations[observations['cell_type_mapped_to_onemillion'] == celltype]
+ # filter for the condition
+ celltype_condition_data = celltype_data
+ # take either tsv file with selected genes or filter genes after a nonzero rate
+ if gene_selection_file:
+ selected_genes = pd.read_csv(' /genelists_tp_union/expressed_gene_' + celltype + '.tsv')
+ selected_genes = selected_genes["genes"].tolist()
+ else:
+ selected_genes = select_gene_nonzeroratio(celltype_condition_data, 0.5)
+ print(f"Number of selected genes for {celltype} : {len(selected_genes)}")
+ gene_pairs = []
+ for i, gene1 in enumerate(selected_genes):
+ for j in range(i + 1, len(selected_genes)):
+ gene_pairs.append(';'.join([gene1, selected_genes[j]]))
+ input_df = celltype_condition_data[selected_genes]
+ input_data = spearmanr(input_df, axis=0)[0]
+ input_data_uppertria = input_data[np.triu_indices_from(input_data, 1)]
+ corrs_df = pd.DataFrame(data=input_data_uppertria,
+ columns=[f'UT'],
+ index=gene_pairs)
+ corrs_df.to_csv(prefix_results / celltype / f'{celltype}_correlation.csv')
+ # Filter for 0.3 correlation cutoff
+ corrs_df = corrs_df[corrs_df['UT'] > 0.3]
+ corrs_df.to_csv(prefix_results / celltype / f'{celltype}_correlation_03filtered.csv')
+ print(f"Finished {celltype} with time {time() - starttime}")
diff --git a/02_correlation_evaluation/correlation_timepoint_combined_indivs_stemiv2.py b/02_correlation_evaluation/correlation_timepoint_combined_indivs_stemiv2.py
new file mode 100644
index 0000000..8312a5f
--- /dev/null
+++ b/02_correlation_evaluation/correlation_timepoint_combined_indivs_stemiv2.py
@@ -0,0 +1,50 @@
+##############################################################################
+# Calculate correlation for each cell type for the van Blokland v2 dataset,
+# timepoint 6-8 weeks after admission, merging all individuals
+##############################################################################
+
+from scipy.stats import spearmanr
+import numpy as np
+import pandas as pd
+from pathlib import Path
+from time import time
+import os
+
+
+def select_gene_nonzeroratio(df, ratio):
+ nonzerocounts = np.count_nonzero(df.values, axis=0)/df.shape[0]
+ selected_genes = df.columns[nonzerocounts>ratio]
+ return selected_genes
+
+# set working directory (to shorten path length)
+os.chdir('./')
+
+# load scanpy object
+prefix_results = Path('co-expression_indivs_combined/stemi/')
+stemi_data = pd.read_csv('seurat_objects/stemi_v2_monocyte.csv.gz', compression='gzip', sep=' ', index_col=0).T
+stemi_meta = pd.read_csv('seurat_objects/stemi_v2_monocyte.meta.csv', sep=' ', index_col=0)
+
+for condition in stemi_meta['timepoint.final'].unique():
+ starttime = time()
+ # filter for the condition
+ celltype_condition_data = stemi_data[stemi_meta['timepoint.final']==condition]
+ # take either tsv file with selected genes or filter genes after a nonzero rate
+ selected_genes = select_gene_nonzeroratio(celltype_condition_data, 0.5)
+ print(f"Number of selected genes for stemi {condition}: {len(selected_genes)}")
+ # get gene pair names
+ gene_pairs = []
+ for i,gene1 in enumerate(selected_genes):
+ for j in range(i+1, len(selected_genes)):
+ gene_pairs.append(';'.join([gene1, selected_genes[j]]))
+ # get gene-gene correlations
+ input_df = celltype_condition_data[selected_genes]
+ input_data = spearmanr(input_df, axis=0)[0]
+ input_data_uppertria = input_data[np.triu_indices_from(input_data, 1)]
+ corrs_df = pd.DataFrame(data=input_data_uppertria,
+ columns=[f'{condition}'],
+ index=gene_pairs)
+ corrs_df.to_csv(prefix_results/f'monocyte_{condition}_correlation.csv')
+ #Filter for 0.3 correlation cutoff
+ corrs_df = corrs_df[corrs_df[condition]>0.3]
+ corrs_df.to_csv(prefix_results/f'monocyte_{condition}_correlation_03filtered.csv')
+ print(f"Finished {condition} with time {time() - starttime}")
diff --git a/02_correlation_evaluation/correlation_timepoint_combined_indivs_stemiv3.py b/02_correlation_evaluation/correlation_timepoint_combined_indivs_stemiv3.py
new file mode 100644
index 0000000..6892bc5
--- /dev/null
+++ b/02_correlation_evaluation/correlation_timepoint_combined_indivs_stemiv3.py
@@ -0,0 +1,91 @@
+##############################################################################
+# Calculate correlation for each cell type for the van Blokland v3 dataset,
+# timepoint 6-8 weeks after admission, merging all individuals
+##############################################################################
+
+#from scipy.stats import t, norm
+from scipy.stats import spearmanr
+import scanpy as sc
+import numpy as np
+import pandas as pd
+from pathlib import Path
+from time import time
+import os
+import re
+
+# specify if the gene selection was done before and is passed in a file
+# (False: genes are selected per cell type and condition by their non-zero ratio)
+gene_selection_file = False
+
+# set working directory (to shorten path length)
+os.chdir('./')
+
+# output directory for the correlation tables (one sub-directory per cell type)
+prefix_results = Path('co-expression_indivs_combined/stemi/version3')
+# load scanpy object (the file name refers to stemi v3)
+alldata = sc.read_h5ad('seurat_objects/cardio.integrated.20210301.stemiv3.h5ad')
+
+def select_gene_nonzeroratio(df, ratio):
+ nonzerocounts = np.count_nonzero(df.values, axis=0)/df.shape[0]
+ selected_genes = df.columns[nonzerocounts>ratio]
+ return selected_genes
+
+# extract timepoint from timepoint - stimulation annotation
+def get_time(x):
+ if x == 'UT':
+ return x
+ else:
+ pattern = re.compile(r'\d+h')
+ return re.findall(pattern, x)[0]
+
+
+# extract timepoint from timepoint - stimulation annotation
+observations = alldata.obs.copy()
+observations['timepoint_id_celltype'] = [f'{item[0]}_{item[1]}' for item
+ in observations[['timepoint.final', 'cell_type_lowerres']].values]
+
+celltypes = ['B', 'CD4T', 'CD8T', 'monocyte', 'DC', 'NK']
+for celltype in celltypes:
+ if not os.path.isdir(prefix_results/celltype):
+ os.mkdir(prefix_results/celltype)
+ starttime = time()
+ print(celltype)
+ specific = alldata[alldata.obs.cell_type_lowerres==celltype]
+ celltype_data = pd.DataFrame(data=specific.X.toarray(),
+ index=specific.obs.index,
+ columns=specific.var.index)
+
+ # get the set of gene pairs
+ specific_obs = observations[observations['cell_type_lowerres']==celltype]
+ for condition in observations['timepoint.final'].unique():
+ # filter for the condition
+ celltype_condition_data = celltype_data[specific_obs['timepoint.final']==condition]
+
+ # take either tsv file with selected genes or filter genes after a nonzero rate
+ if gene_selection_file:
+ selected_genes = pd.read_csv(' /genelists_tp_union/expressed_gene_'+celltype+'.tsv')
+ selected_genes =selected_genes["genes"].tolist()
+ else:
+ selected_genes = select_gene_nonzeroratio(celltype_condition_data, 0.5)
+
+ print(f"Number of selected genes for {celltype} {condition}: {len(selected_genes)}")
+
+ gene_pairs = []
+ for i,gene1 in enumerate(selected_genes):
+ for j in range(i+1, len(selected_genes)):
+ gene_pairs.append(';'.join([gene1, selected_genes[j]]))
+
+ input_df = celltype_condition_data[selected_genes]
+ input_data = spearmanr(input_df, axis=0)[0]
+ input_data_uppertria = input_data[np.triu_indices_from(input_data, 1)]
+
+ corrs_df = pd.DataFrame(data=input_data_uppertria,
+ columns=[f'{condition}'],
+ index=gene_pairs)
+
+ corrs_df.to_csv(prefix_results/celltype/f'{celltype}_{condition}_correlation.csv')
+
+ #Filter for 0.3 correlation cutoff
+ corrs_df = corrs_df[corrs_df[condition]>0.3]
+ corrs_df.to_csv(prefix_results/celltype/f'{celltype}_{condition}_correlation_03filtered.csv')
+
+ print(f"Finished {celltype} with time {time() - starttime}")
diff --git a/02_correlation_evaluation/figure2_barplot_cutoffs.R b/02_correlation_evaluation/figure2_barplot_cutoffs.R
new file mode 100644
index 0000000..df79e75
--- /dev/null
+++ b/02_correlation_evaluation/figure2_barplot_cutoffs.R
@@ -0,0 +1,72 @@
+# ------------------------------------------------------------------------------
+# Create barplot of correlation dependency on expression cutoff
+# for Oelen v3 and ImmuNexUT / Blueprint
+# (only plotting in R, calculation done with python script)
+# Input: correlation comparison between Blueprint and Oelen v3 dataset
+# (precalculated in compare_blueprint_cutoffs_CD4T.py) and between
+# ImmuNexUT and Oelen v3 dataset (precalculated in
+# compare_immunexut_cutoffs_CD4T.py)
+# Output: two barplots, one for Blueprint comparison and one for
+# ImmuNexUT comparison
+# ------------------------------------------------------------------------------
+
+library(ggplot2)
+library(RColorBrewer)
+
+theme_set(theme_bw())
+
+################################################################################
+# Shared plotting function
+################################################################################
+
+# Barplot of the correlation between two datasets per expression cutoff
+# (bar height: correlation, bar fill and text label: number of genes).
+#   vals_path: comma-separated file with the columns threshold,
+#              corr_pearson and ngenes
+#   ylab_text: label of the y axis
+#   out_path:  file the plot is saved to
+# The plot is printed, saved to out_path and returned invisibly.
+plot_cutoff_barplot<-function(vals_path,ylab_text,out_path){
+
+  vals<-read.table(vals_path,sep=",",header=TRUE)
+  vals$threshold<-as.factor(vals$threshold)
+
+  g<-ggplot(vals,aes(x=threshold,y=corr_pearson,fill=ngenes))+
+    geom_bar(stat="identity")+
+    #the label colour is given directly as colour name
+    #(white for ngenes < 1000, black otherwise)
+    geom_text(aes(x = threshold, y = corr_pearson / 2, label = ngenes,
+                  color=ifelse(ngenes<1000,'white','black')),size=5)+
+    #identity scale: draws exactly the colour named above (also when only one
+    #of the two colours occurs in the data) and adds no legend for it
+    scale_color_identity()+
+    xlab("Expression cutoff")+
+    ylab(ylab_text)+ylim(0,1)+
+    scale_fill_distiller("Number of\ngenes",palette="YlOrBr")+
+    theme(legend.position = "bottom",
+          legend.key.width = unit(1, "cm"),
+          axis.title = element_text(size=16),
+          axis.text = element_text(size=14),
+          legend.title=element_text(size=13),
+          legend.text=element_text(size=12))
+
+  print(g)
+  ggsave(out_path,plot=g,width=6,height=5)
+
+  invisible(g)
+}
+
+################################################################################
+# Plot for ImmuNexUT (main Figure 2c)
+################################################################################
+
+plot_cutoff_barplot("co-expression_indivs_combined/immunexut_cutoff_eval_CD4T.txt",
+                    "Correlation between Oelen (v3)\nand ImmuNexUT",
+                    "co-expression_indivs_combined/plots/eval_immunexut_cutoff.pdf")
+
+
+################################################################################
+# Plot for Blueprint (Supplement)
+################################################################################
+
+plot_cutoff_barplot("co-expression_indivs_combined/blueprint_cutoff_eval_CD4T.txt",
+                    "Correlation between Oelen (v3)\nand BLUEPRINT",
+                    "co-expression_indivs_combined/plots/eval_blueprint_cutoff.png")
+
diff --git a/02_correlation_evaluation/figure2_scatterplots.R b/02_correlation_evaluation/figure2_scatterplots.R
new file mode 100644
index 0000000..15925d6
--- /dev/null
+++ b/02_correlation_evaluation/figure2_scatterplots.R
@@ -0,0 +1,162 @@
+# ------------------------------------------------------------------------------
+# Create inset plots for Main Figure 2 (a,b,d), showing scatterplots of
+# gene pair-wise Spearman correlation values between two data sets for
+# a) Oelen v3 dataset vs van Blokland v2 dataset (both CD4+ T cells)
+# b) ImmuNexUT - van Blokland v2 (naive CD4+ T cells and CD4+ T cells)
+# c) Blueprint - ImmuNexUT (both naive CD4+ T cells)
+# ------------------------------------------------------------------------------
+
+library(data.table)
+library(reticulate) # to read the single cell data (numpy)
+library(reshape2)
+library(ggplot2)
+library(viridis)
+library(ggpubr)
+
+np <- import("numpy")
+
+theme_set(theme_bw())
+
+# Load a single cell correlation table and split the gene pair identifier.
+#   path: csv file whose first column (read as V1) holds "geneA;geneB" and
+#         whose remaining column(s) hold the correlation values
+# Returns the table without V1 and with the additional columns gene1 and
+# gene2, where gene1 is the gene of the pair that sorts first.
+load_sc_corr_data<-function(path){
+
+  corr_ct<-fread(path)
+
+  #Split the pair identifier into its two gene names
+  pair_parts<-strsplit(corr_ct$V1,";")
+  first_gene<-sapply(pair_parts,function(p) p[1])
+  second_gene<-sapply(pair_parts,function(p) p[2])
+  corr_ct$V1<-NULL
+
+  #Order so that gene1 is always the one first in alphabet
+  swap_needed<-first_gene > second_gene
+  corr_ct$gene1<-ifelse(swap_needed,second_gene,first_gene)
+  corr_ct$gene2<-ifelse(swap_needed,first_gene,second_gene)
+
+  return(corr_ct)
+}
+
+# Load a correlation matrix saved in numpy format and return it in long format.
+#   path:           common prefix of the files (the matrix is read from
+#                   paste0(path,"npy"))
+#   rowname_suffix: suffix of the file with the row names (one per line)
+#   colname_suffix: suffix of the file with the column names (one per line)
+#   corr_sc:        table with the columns gene1 and gene2; the matrix is
+#                   restricted to the genes that occur there
+# Returns a data frame with the columns gene1, gene2 and value, where gene1 is
+# the gene of the pair that sorts first and pairs of a gene with itself are
+# removed.
+# NOTE(review): the full square matrix is melted, so every gene pair is
+# returned twice (once from each triangle) - confirm that this is intended
+# before merging the result with a table that holds each pair once.
+load_numpy_data<-function(path,rowname_suffix,colname_suffix,corr_sc){
+
+  corr_bios <- np$load(paste0(path,"npy"), mmap_mode="r")
+  rownames(corr_bios)<-fread(paste0(path,rowname_suffix),header=FALSE)$V1
+  colnames(corr_bios)<-fread(paste0(path,colname_suffix),header=FALSE)$V1
+
+  #Filter for single cell data
+  genes_sc<-sort(union(corr_sc$gene1,corr_sc$gene2))
+  genes_shared<-genes_sc[genes_sc %in% colnames(corr_bios)]
+
+  #Long format: one row per matrix entry
+  corr_long<-reshape2::melt(corr_bios[genes_shared,genes_shared])
+  colnames(corr_long)[1:2]<-c("gene1","gene2")
+  corr_long$gene1<-as.character(corr_long$gene1)
+  corr_long$gene2<-as.character(corr_long$gene2)
+
+  #Order so that gene1 is always the one first in alphabet
+  swap_needed<-corr_long$gene1 > corr_long$gene2
+  larger_gene<-ifelse(swap_needed,corr_long$gene1,corr_long$gene2)
+  corr_long$gene1<-ifelse(swap_needed,corr_long$gene2,corr_long$gene1)
+  corr_long$gene2<-larger_gene
+
+  #Remove the diagonal
+  corr_long<-corr_long[corr_long$gene1!=corr_long$gene2,]
+
+  return(corr_long)
+}
+
+# Create a 2d histogram (ggplot) comparing the correlation values of two data sets.
+#   corr_d1: table with the columns gene1, gene2 and corr1 (x axis)
+#   corr_d2: table with the columns gene1, gene2 and corr2 (y axis)
+#   xlab_text, ylab_text: axis labels
+#   annot_text_size:   text size of the "r = ..." annotation
+#   annot_text_digits: digits used to format the Pearson correlation
+# The two tables are merged on gene1 and gene2; the number of genes in the
+# merged table is printed and the plot object is returned.
+create_corr_plot<-function(corr_d1,corr_d2,
+                           xlab_text,ylab_text,
+                           annot_text_size=9,annot_text_digits=3){
+
+  #Keep the gene pairs present in both data sets
+  merged<-merge(corr_d1,corr_d2,by=c("gene1","gene2"))
+
+  n_genes<-length(union(merged$gene1,merged$gene2))
+  print(paste("Overlapping genes:",n_genes))
+
+  #Pearson correlation between the two sets of correlation values
+  r_value<-cor(merged$corr1,merged$corr2,method="pearson")
+  r_label<-paste0("r = ",format(r_value,digits=annot_text_digits))
+
+  #Plot
+  g<-ggplot(merged,aes(corr1,corr2))+
+    geom_bin2d(bins=50)+
+    xlab(xlab_text)+
+    ylab(ylab_text)+
+    xlim(-1,1)+ylim(-1,1)+
+    scale_fill_distiller("Density",palette="BuPu",trans="log10",
+                         breaks = c(2, 600),
+                         labels = c("Low", "High"))+
+    annotate(geom="text", x=-0.95, y=0.95,size=annot_text_size,
+             hjust = 0,vjust=1,
+             label=r_label)+
+    ggtitle("Pairwise gene correlation")+
+    geom_smooth(method="lm",color="black")+
+    theme(legend.position="none",
+          plot.title=element_text(size=25),
+          axis.title=element_text(size=25),
+          axis.text=element_text(size=20))
+
+  return(g)
+}
+
+
+################################################################################
+# For 2a: Oelen v3 - van Blokland v2
+################################################################################
+
+main_celltype<-"CD4T"
+
+corr_oelen<-load_sc_corr_data(paste0("co-expression_indivs_combined/",
+ main_celltype,"/",main_celltype,
+ "_UT_correlation.csv"))
+
+corr_stemi<-load_sc_corr_data(paste0("co-expression_indivs_combined/stemi/version2/",
+ main_celltype,"/",main_celltype,
+ "_t8w_correlation.csv"))
+
+#Rename by position: load_sc_corr_data returns the correlation column first,
+#followed by gene1 and gene2; create_corr_plot expects corr1 / corr2
+colnames(corr_oelen)<-c("corr1","gene1","gene2")
+colnames(corr_stemi)<-c("corr2","gene1","gene2")
+
+#Create ggplot
+g<-create_corr_plot(corr_oelen,corr_stemi,
+ xlab_text="Oelen (v3)", ylab_text="van Blokland (v2)")
+
+#Save the fill legend as separate file (the plots themselves are drawn
+#without legend)
+g_leg<-get_legend(g+theme(legend.position = "bottom"))
+g_leg<-as_ggplot(g_leg)
+ggsave(g_leg,file="bios/plots/figure2_legend_inset.pdf",width=3,height=1)
+
+
+ggsave(g,file="bios/plots/figure2a_exampleplot.pdf",width=5,height=5)
+
+################################################################################
+# For 2b: ImmuNexUT - van Blokland v2
+################################################################################
+
+corr_immu<-fread("imd_paper_rna_data/correlation/Naive_CD4_correlation.txt")
+#Drop the first column V1 of the file; the remaining three columns are
+#renamed by position below
+corr_immu$V1<-NULL
+
+colnames(corr_immu)<-c("gene1","gene2","corr1")
+
+#Create ggplot
+g<-create_corr_plot(corr_immu,corr_stemi,
+ xlab_text="ImmuNexUT",ylab_text="van Blokland (v2)")
+
+ggsave(g,file="bios/plots/figure2b_exampleplot.pdf",width=5,height=5)
+
+################################################################################
+# For 2c: Blueprint - ImmuNexUT
+################################################################################
+
+#Load Blueprint data (restricted to the genes present in the ImmuNexUT table)
+corr_bp<-load_numpy_data(path="blueprint_data/tcel_gene_nor_combat_20151109.ProbesWithZeroVarianceRemoved.ProbesCentered.SamplesZTransformed.spearmanR.",
+ rowname_suffix="rows.txt",
+ colname_suffix="cols.txt",
+ corr_immu)
+
+#Rename by position: load_numpy_data returns gene1, gene2 and the value column
+colnames(corr_bp)<-c("gene1","gene2","corr2")
+
+#Create ggplot
+g<-create_corr_plot(corr_immu,corr_bp,
+ xlab_text="ImmuNexUT",ylab_text="BLUEPRINT")
+ggsave(g,file="bios/plots/figure2c_exampleplot.pdf",width=5,height=5)
diff --git a/02_correlation_evaluation/normalize_ImmuNexUT.R b/02_correlation_evaluation/normalize_ImmuNexUT.R
new file mode 100644
index 0000000..80e04ea
--- /dev/null
+++ b/02_correlation_evaluation/normalize_ImmuNexUT.R
@@ -0,0 +1,142 @@
+# ------------------------------------------------------------------------------
+# Normalize ImmuNexUT data (separately for each cell type with a matching
+# single-cell cell type) following the description in the corresponding
+# publication (filtering lowly expressed genes, TMM normalization and
+# batch correction)
+# followed by correlation calculation for all genes expressed in 50% of the cells
+# of the Oelen v3 dataset (for comparison with single cell data)
+# Input: Count matrices downloaded from
+# https://humandbs.biosciencedbc.jp/en/hum0214-v5#E-GEAD-397,
+# correlation estimates from Oelen v3 to identify the expressed genes
+# for downstream comparisons
+# Output: normalized count matrices (one per cell type), correlation matrices
+# for all genes expressed in 50% of the cells of the Oelen v3 dataset and
+# plots for comparison between ImmuNexUT and Oelen v3 dataset
+# ------------------------------------------------------------------------------
+
+library(data.table)
+library(edgeR) #for normalization
+library(sva) # for batch correction with combat
+library(corrplot) # to plot sample correlations
+library(ggplot2)
+library(viridis)
+
+theme_set(theme_bw())
+
+# Cell type matching (single cell cell type - ImmuNexUT cell type)
+ct_mapping<-data.frame(sc_ct=c("CD4T","CD8T","B","monocyte","NK","DC"),
+                       imn_ct=c("Naive_CD4","Naive_CD8","Naive_B","CL_Mono","NK","mDC"))
+
+
+for(i in seq_len(nrow(ct_mapping))){
+
+  ct <- ct_mapping$imn_ct[i]
+  ct_single_cell<- ct_mapping$sc_ct[i]
+
+  #Read single cell correlation. It defines the expressed genes and is
+  #compared with the ImmuNexUT correlation at the end of the iteration, so it
+  #has to be loaded for every cell type, also when the ImmuNexUT correlation
+  #is taken from an earlier run
+  corr_ct<-fread(paste0("co-expression_indivs_combined/",ct_single_cell,"/",
+                        ct_single_cell,"_UT_correlation.csv"))
+  corr_ct$gene1<-gsub(";.*","",corr_ct$V1)
+  corr_ct$gene2<-gsub(".*;","",corr_ct$V1)
+
+  #Order so that gene1 is always the one first in alphabet
+  corr_ct$swap<-ifelse(corr_ct$gene1 > corr_ct$gene2,corr_ct$gene1,corr_ct$gene2)
+  corr_ct$gene1<-ifelse(corr_ct$gene1 > corr_ct$gene2,corr_ct$gene2,corr_ct$gene1)
+  corr_ct$gene2<-corr_ct$swap
+  corr_ct$swap<-NULL
+  corr_ct$V1<-NULL
+
+  # Try to prevent redoing the whole normalization when the correlation
+  # is already calculated
+  corr_file_name<-paste0("imd_paper_rna_data/correlation/",ct,"_correlation.txt")
+  if(! file.exists(corr_file_name)){
+
+    counts<-fread(paste0("imd_paper_rna_data/count/",ct,"_count.txt"))
+
+    #Format to matrix
+    gene_id<-counts$Gene_id
+    gene_name<-counts$Gene_name
+    counts$Gene_name<-NULL
+    counts$Gene_id<-NULL
+    counts<-as.matrix(counts)
+    rownames(counts)<-gene_name
+
+    #Filter lowly expressed genes (remove genes with a count below 10 in
+    #more than 90% of the samples)
+    counts<-counts[!(rowSums(counts<10) > 0.9 * ncol(counts)),]
+
+    #Normalize using edgeR (TMM plus log-transformed CPM)
+    dge <- DGEList(counts=counts)
+    dge <- calcNormFactors(dge, method = "TMM")
+    tmm <- cpm(dge) #in publication it says log-transformed CPM, but log-transformation
+    #is not working in combination with combat ...
+
+    #Remove batch data using combat
+    batch_data<-fread("imd_paper_rna_data/clinical_diagnosis_age_sex_v2.txt")
+
+    #Bring the batch data into the sample order of the matrix (ComBat matches
+    #batch and samples by position) and stop if a sample has no annotation
+    sample_order<-match(colnames(tmm),batch_data$id)
+    if(anyNA(sample_order)){
+      stop("Samples without batch annotation: ",
+           paste(colnames(tmm)[is.na(sample_order)],collapse=", "))
+    }
+    batch_data<-batch_data[sample_order,]
+
+    modcombat = model.matrix(~1, data=batch_data)
+    combat_tmm = ComBat(dat=tmm, batch=batch_data$Phase,mod=modcombat, prior.plots = FALSE)
+
+    #Check that correlation between samples is high
+    cor_matrix<-cor(combat_tmm)
+
+    #Filter samples with a mean correlation coefficient less than 0.9
+    cor_coef_mean<-rowMeans(cor_matrix)
+    combat_tmm<-combat_tmm[,names(cor_coef_mean)[cor_coef_mean>=0.9]]
+
+    # #Plot remaining samples
+    # cor_matrix<-cor(combat_tmm)
+    # png("imd_paper_rna_data/plots/sample_correlation.png")
+    # corrplot(cor_matrix,method="color",order="hclust",tl.col="black",tl.cex=0.2)
+    # dev.off()
+
+    #Combine genes that appear multiple times in the matrix
+    combat_tmm<- apply(combat_tmm, 2, tapply, rownames(combat_tmm),
+                       mean, na.rm=T)
+
+    #Save normalized matrix
+    write.table(combat_tmm, file=paste0("imd_paper_rna_data/norm_count/",ct,"_norm_count.txt"),
+                quote=FALSE,sep="\t")
+
+    #Filter for genes with correlation values in the single cell data
+    expressed_genes<-union(corr_ct$gene1,corr_ct$gene2)
+    combat_tmm<-combat_tmm[rownames(combat_tmm) %in% expressed_genes,]
+
+    #Calculation correlation
+    cor_matrix<-cor(t(combat_tmm),method="spearman")
+    cor_matrix<-reshape2::melt(cor_matrix)
+    cor_matrix$Var1<-as.character(cor_matrix$Var1)
+    cor_matrix$Var2<-as.character(cor_matrix$Var2)
+    cor_matrix<-cor_matrix[cor_matrix$Var1 < cor_matrix$Var2,]
+    colnames(cor_matrix)<-c("gene1","gene2","corr")
+
+    #Save correlation
+    write.table(cor_matrix, file=corr_file_name,
+                quote=FALSE,sep="\t")
+  } else {
+
+    #Reuse the correlation saved by an earlier run; the row names written by
+    #write.table come back as an additional column, which is dropped by
+    #selecting the three named columns
+    cor_matrix<-as.data.frame(fread(corr_file_name))
+    cor_matrix<-cor_matrix[,c("gene1","gene2","corr")]
+  }
+
+  #Compare with single cell correlation
+  cor_matrix<-merge(cor_matrix,corr_ct,by=c("gene1","gene2"))
+
+  ylab_text<-paste("Correlation ImmuNexUT -",ct)
+  plot_path<-paste0("imd_paper_rna_data/plots/correlation_",ct,".png")
+  corr_corr<-cor(cor_matrix$UT,cor_matrix$corr)
+
+  g<-ggplot(cor_matrix,aes(UT,corr))+
+    geom_bin2d(bins=50)+
+    xlab("Correlation single cell")+
+    ylab(ylab_text)+
+    xlim(-1,1)+ylim(-1,1)+
+    scale_fill_viridis(trans="log10")+
+    annotate(geom="text", x=-0.95, y=0.95,size=8,
+             hjust = 0,vjust=1,
+             label=paste0("r = ",format(corr_corr,digits=2)))+
+    theme(axis.title=element_text(size=16),
+          axis.text=element_text(size=14),
+          legend.position="none")
+
+  ggsave(g,file=plot_path)
+}
\ No newline at end of file
diff --git a/02_correlation_evaluation/wilcoxon_test_crispr.R b/02_correlation_evaluation/wilcoxon_test_crispr.R
new file mode 100644
index 0000000..bf85c42
--- /dev/null
+++ b/02_correlation_evaluation/wilcoxon_test_crispr.R
@@ -0,0 +1,212 @@
+# ------------------------------------------------------------------------------
+# Benchmark our correlation results from single cell (Oelen v3, CD4+ T cells)
+# and bulk (ImmuNexUT, naive CD4+ T cells)
+# with a public CRISPR perturbation dataset using Wilcoxon Rank Sum Test
+# ------------------------------------------------------------------------------
+
+library(data.table)
+library(ggplot2)
+library(ggpubr)
+library(RColorBrewer)
+library(gtools)
+library(dplyr)
+
+theme_set(theme_bw())
+
+#Get colors (second colour of the Set2 palette and grey)
+cols_brewer <- c(brewer.pal(n = 3, "Set2")[2],"grey78")
+
+#Set MT correction for KO gene identification
+MTcorrection<-"FDR" #alternatives: "Bonf","FDR"
+print(paste("MT correction:",MTcorrection))
+
+# Load single cell data
+ct<-"CD4T" #alternative "CD8T"
+cond<-"UT"
+
+corr_sc<-fread(paste0("co-expression_indivs_combined/",ct,"/",ct,"_",cond,
+ "_correlation.csv"))
+# Split the pair identifier "geneA;geneB" of column V1 into two columns
+corr_sc$gene1<-gsub(";.*","",corr_sc$V1)
+corr_sc$gene2<-gsub(".*;","",corr_sc$V1)
+
+# Order so that gene1 is always the one first in alphabet
+corr_sc$swap<-ifelse(corr_sc$gene1 > corr_sc$gene2,corr_sc$gene1,corr_sc$gene2)
+corr_sc$gene1<-ifelse(corr_sc$gene1 > corr_sc$gene2,corr_sc$gene2,corr_sc$gene1)
+corr_sc$gene2<-corr_sc$swap
+corr_sc$swap<-NULL
+
+# Load ImmuNexUT data (already preprocessed correctly)
+# (ct is reused from here on for the ImmuNexUT cell type)
+ct<-"Naive_CD4"
+corr_imn<-fread(paste0("imd_paper_rna_data/correlation/",
+ ct,"_correlation_extended.txt"))
+corr_imn$V1<-NULL
+# Name the third column like the single cell correlation column
+colnames(corr_imn)[3]<-"UT"
+
+# Filter for genes that are expressed in both data sets
+expressed_genes_sc<-union(corr_sc$gene1,corr_sc$gene2)
+expressed_genes_bulk<-union(corr_imn$gene1,corr_imn$gene2)
+expressed_genes<-intersect(expressed_genes_sc,expressed_genes_bulk)
+
+print(paste("Number of genes expressed in both data sets:",length(expressed_genes)))
+
+# Load perturbation data
+path<-"perturbation_dataset/perturbation_data/CD4T_GATE2019_MAST_DE/WT_KO/"
+path_negControl <- "perturbation_dataset/perturbation_data/CD4T_GATE2019_MAST_DE/WT_NP/"
+
+# Get a list with all DE genes
+files<-list.files(path)
+
+# Use the setting without artificial cells
+files<-files[!startsWith(files,"artificialCells_")]
+# The part of the file name before the first dot is taken as KO gene name
+genes<-unique(sapply(files,function(fl) strsplit(fl,"\\.")[[1]][1]))
+print(paste0("Unique genes:",length(genes)))
+
+# Keep only the KO genes that occur in both correlation data sets
+genes<-genes[genes %in% expressed_genes]
+print(paste0("Unique genes expressed in 50% of cells:",length(genes)))
+
+# Iterate over both data sets and all KO genes to perform Wilcoxon test
+p_vals<-NULL
+all_comps<-NULL
+for(data_type in c("ImmuNexUT","sc")){
+
+ if(data_type == "sc"){
+ corr_ct<-corr_sc
+ expressed_genes<-expressed_genes_sc
+
+ } else if (data_type == "ImmuNexUT"){
+ corr_ct<-corr_imn
+ expressed_genes<-expressed_genes_bulk
+ }
+
+ #Bonferroni cutoff corrected for the number of expressed genes
+ if(MTcorrection == "Bonf"){
+ cutoff<-0.05/length(expressed_genes)
+ } else {
+ cutoff<-0.05
+ }
+
+ # Go over each gene
+ plot_list<-list()
+ for(gene in genes){
+
+ corr_ct_ko<-corr_ct[corr_ct$gene1==gene,c("gene2","UT")]
+ colnames(corr_ct_ko)[1]<-"gene1"
+ corr_ct_ko<-rbind(corr_ct_ko,corr_ct[corr_ct$gene2==gene,c("gene1","UT")])
+
+ #Use absolute correlation
+ corr_ct_ko$UT<-abs(corr_ct_ko$UT)
+
+ #Get all knock_out genes
+ all_measured_ko_genes<-NULL
+ ko_genes_combined<-NULL
+ for(fl in files[startsWith(files,gene)]){
+ ko_genes<-read.table(paste0(path,fl))
+ all_measured_ko_genes<-union(all_measured_ko_genes,rownames(ko_genes))
+
+ if(MTcorrection=="FDR"){
+ ko_genes<-ko_genes[rownames(ko_genes) %in% expressed_genes,]
+ ko_genes$p_val<-p.adjust(ko_genes$p_val,method="BH")
+ }
+
+ #Filter for expressed genes and significant threshold
+ ko_genes<-ko_genes[rownames(ko_genes) %in% expressed_genes &
+ ko_genes$p_val%
+ group_by(ko_gene,data_type)%>%
+ summarize(max_UT=max(UT))
+
+p_vals<-merge(p_vals,max_corr,by=c("ko_gene","data_type"))
+#y position of the p-value text: 10% above the maximal value
+p_vals$max_UT<-p_vals$max_UT*1.1
+#x position of the p-value text
+p_vals$is_ko<-1.5
+
+# Violin plus boxplot of the values in column UT per is_ko group, one facet per
+# KO gene, with the p-value text drawn on top.
+#   comps:     rows of all_comps for one data type
+#   pvals:     rows of p_vals for the same data type
+#   ylab_text: label of the y axis
+plot_ko_violins<-function(comps,pvals,ylab_text){
+  ggplot()+
+    geom_violin(data=comps,
+                aes(x=is_ko,y=UT,fill=is_ko))+
+    geom_boxplot(data=comps,
+                 aes(x=is_ko,y=UT,fill=is_ko),
+                 width = 0.15, outlier.shape = NA)+
+    geom_text(data=pvals,
+              aes(x=is_ko,y=max_UT,label=pvaltext),
+              size=6)+
+    facet_wrap(~ko_gene,scales ="free",nrow=1)+
+    xlab("")+
+    ylab(ylab_text)+
+    scale_fill_manual("DE gene\nafter KO",values=cols_brewer)+
+    theme(legend.position="none",
+          axis.title=element_text(size=15),
+          axis.text=element_text(size=14),
+          strip.text=element_text(size=15))
+}
+
+g_sc<-plot_ko_violins(all_comps[all_comps$data_type=="single cell",],
+                      p_vals[p_vals$data_type=="single cell",],
+                      "Absolute correlation (single cell)")
+
+g_bulk<-plot_ko_violins(all_comps[all_comps$data_type=="ImmuNexUT",],
+                        p_vals[p_vals$data_type=="ImmuNexUT",],
+                        "Absolute correlation (ImmuNexUT)")
+
+#Single cell panel on top, ImmuNexUT panel below
+g<-ggarrange(g_sc,g_bulk,ncol=1,align="hv")
+
+ggsave(g,file="perturbation_dataset/plots/wilcoxon_all_combined.pdf",
+       width=15,height=8)
+
\ No newline at end of file
diff --git a/02_correlation_evaluation/wilcoxon_test_string.R b/02_correlation_evaluation/wilcoxon_test_string.R
new file mode 100644
index 0000000..996825d
--- /dev/null
+++ b/02_correlation_evaluation/wilcoxon_test_string.R
@@ -0,0 +1,96 @@
+# ------------------------------------------------------------------------------
+# Compare if correlated pairs from single cell (Oelen v3, CD4+ T cells)
+# and bulk (ImmuNexUT, naive CD4+ T cells) are enriched in STRING database
+# (Using the same strategy as in CRISPR validation with
+# Wilcoxon Rank Sum Test)
+# ------------------------------------------------------------------------------
+
+library(data.table)
+library(ggplot2)
+library(ggpubr)
+library(RColorBrewer)
+
+theme_set(theme_bw())
+
+#Get colors
+cols_brewer <- c("grey78",brewer.pal(n = 3, "Set2")[2])
+
+cond<-"UT"
+
+#Read STRING data base once (the file is the same for both data types)
+string_all<-fread("additional_files/STRING-network.csv")
+
+plot_list<-NULL
+for(data_type in c("sc","ImmuNexUT")){
+
+  print(data_type)
+
+  # Load single cell correlation (cell type specific)
+  if(data_type == "sc"){
+
+    ct<-"CD4T" #alternative "CD8T"
+
+    corr_ct<-fread(paste0("co-expression_indivs_combined/",ct,"/",ct,"_",cond,
+                          "_correlation.csv"))
+    corr_ct$gene1<-gsub(";.*","",corr_ct$V1)
+    corr_ct$gene2<-gsub(".*;","",corr_ct$V1)
+
+    #Order each pair so that gene1 <= gene2 (same convention as for STRING below)
+    swap<-corr_ct$gene1 > corr_ct$gene2
+    ordered_gene1<-ifelse(swap,corr_ct$gene2,corr_ct$gene1)
+    corr_ct$gene2<-ifelse(swap,corr_ct$gene1,corr_ct$gene2)
+    corr_ct$gene1<-ordered_gene1
+  } else if (data_type == "ImmuNexUT"){
+
+    ct<-"Naive_CD4"
+
+    #Read ImmuNexUT data (already preprocessed correctly)
+    corr_ct<-fread(paste0("imd_paper_rna_data/correlation/",
+                          ct,"_correlation.txt"))
+    corr_ct$V1<-NULL
+    colnames(corr_ct)[3]<-"UT"
+  }
+
+  expressed_genes<-union(corr_ct$gene1,corr_ct$gene2)
+
+  corr_ct$UT<-abs(corr_ct$UT)
+
+  #Restrict STRING to pairs where both genes are part of the correlation data
+  string<-string_all[string_all$Gene1 %in% expressed_genes &
+                       string_all$Gene2 %in% expressed_genes,]
+
+  #Order each pair so that Gene1 <= Gene2
+  swap<-string$Gene1 > string$Gene2
+  ordered_gene1<-ifelse(swap,string$Gene2,string$Gene1)
+  string$Gene2<-ifelse(swap,string$Gene1,string$Gene2)
+  string$Gene1<-ordered_gene1
+
+  #Keep every undirected pair only once: a pair listed in both directions
+  #would otherwise duplicate the matching correlation row in the merge
+  string<-unique(string,by=c("Gene1","Gene2"))
+
+  #Combine with correlation values
+  string$is_string<-TRUE
+  corr_ct<-merge(corr_ct,string,by.x=c("gene1","gene2"),
+                 by.y=c("Gene1","Gene2"),all.x=TRUE)
+  corr_ct$is_string[is.na(corr_ct$is_string)]<-FALSE
+
+  #One-sided test: are correlations of STRING pairs larger than of other pairs?
+  wt<-wilcox.test(corr_ct$UT[corr_ct$is_string],corr_ct$UT[!corr_ct$is_string],
+                  paired=FALSE,alternative="greater")
+  print(wt$p.value)
+
+  #Dataset name shown in the plot title (loop variable is left untouched)
+  dataset_label<-data_type
+  if(data_type=="sc"){
+    dataset_label<-"Oelen (v3)"
+  }
+
+  g<-ggplot(corr_ct,aes(x=is_string,y=UT, fill=is_string))+
+    geom_violin()+
+    geom_boxplot(width = 0.15, outlier.shape = NA)+
+    xlab("Gene pair in STRING network")+
+    ylab("Absolute correlation")+
+    ylim(c(0,1))+
+    ggtitle(paste(dataset_label,"dataset"))+
+    annotate("text",x=1.5,y=0.9,label=paste0("p =",
+                                  format(wt$p.value,digits=2)),size=4.5)+
+    scale_fill_manual(values=cols_brewer)+
+    theme(legend.position = "none",
+          plot.title=element_text(size=15),
+          axis.title=element_text(size=14),
+          axis.text=element_text(size=12))
+  plot_list<-c(plot_list,list(g))
+}
+
+g<-ggarrange(plotlist=plot_list,ncol=2,labels=c("a","b"))
+ggsave(g,file=paste0("compare_with_string/plots/string_wilcoxon_combined.pdf"),
+ width=7,height=4)
\ No newline at end of file
diff --git a/03_celltype_individual_comparison/README.md b/03_celltype_individual_comparison/README.md
new file mode 100644
index 0000000..e9df023
--- /dev/null
+++ b/03_celltype_individual_comparison/README.md
@@ -0,0 +1,15 @@
+# 03_celltype_individual_comparison
+
+*compare_individuals_variance.R* : explores for all genes expressed in at least 50% of the cells the variance across individuals
+
+*correlation_between_celltypes.R* : calculates the Pearson correlation of gene pairwise Spearman correlation for all pairwise combinations of cell types within each dataset (for Oelen v2 and v3 dataset, input from *correlation_celltype.py*), taking only genes expressed in 50% of the cells in both cell types; plots results in heatmap afterwards
+
+*correlation_celltype.py* : calculates Spearman correlation for each gene pair expressed in 50% of the cells for Oelen dataset (V2) and (V3), separately per cell type, but combining all individuals; thereby provides the input csv files for *correlation_between_celltypes.R*
+
+*correlation_distribution_celltypes_and_individuals.R* : combines two basic overview plots: the correlation distribution in each cell type (input from *correlation_celltype.py*) and the concordance of donor-specific correlation (calculates Pearson correlation of gene pairwise Spearman correlation for each combination of individuals within each cell type)
+
+*correlation_subsampling.py* : calculates per donor correlation for each cell type and different numbers of cells for the sample (randomly subsampling to this number of cells), followed by a comparison between donors within each cell type and subsampling step, taking gene pairs expressed in 50% of the cells, using again Oelen v2 and v3 dataset separately
+
+*fit_logcurve_indiv_subsampling_effect.R* : takes the results from *correlation_subsampling.py* and fits logarithmic curves for the relationship between number of cells and concordance between individuals, one per cell type, to better describe this relationship
+
+*plot_indiv_subsampling_effect.R* : plots results from *correlation_subsampling.py* to show relationship between number of cells and concordance between individuals
diff --git a/03_celltype_individual_comparison/compare_individuals_variance.R b/03_celltype_individual_comparison/compare_individuals_variance.R
new file mode 100644
index 0000000..ffb661c
--- /dev/null
+++ b/03_celltype_individual_comparison/compare_individuals_variance.R
@@ -0,0 +1,81 @@
+# ------------------------------------------------------------------------------
+# Combine gene pair variance across individuals
+# for Oelen (v2) and (v3) in one plot (taking Z scores)
+# Input: correlation matrices per individual and cell type
+# (for comparison of individuals)
+# Output: plot and summary as output text
+# ------------------------------------------------------------------------------
+
+library(data.table)
+library(dplyr)
+library(ggplot2)
+library(ggpubr)
+
+theme_set(theme_bw())
+
+path<-"coeqtl_mapping/input/individual_networks/UT/"
+
+cell_types<-c("B","CD4T","CD8T","DC","monocyte","NK")
+
+#Full cell type names as reported in the paper
+cell_types_corrected<-setNames(c("CD4+ T","CD8+ T","Monocyte","NK","DC","B"),
+ c("CD4T","CD8T","monocyte","NK","DC","B"))
+
+g_list<-NULL
+#Evaluate both Oelen v2 and v3 dataset
+for(dataset in c("onemillionv2","onemillionv3")){
+
+  #The file suffix differs between the two datasets
+  if(dataset=="onemillionv2"){
+    file_suffix<-".genesnonzero0.5.zscores.tsv.gz"
+  } else {
+    file_suffix<-".genesnonzero0.5.zscores.gz"
+  }
+
+  #Summarize each cell type separately and combine the results once
+  corr_summary<-lapply(cell_types,function(ct){
+
+    #Correlation values (first column V1: gene pair, remaining columns: values)
+    corr<-fread(paste0(path,dataset,"/UT_",ct,file_suffix))
+
+    gene_pairs<-corr$V1
+    corr$V1<-NULL
+
+    #Set Inf values to NA to remove them afterwards
+    corr<-as.matrix(corr)
+    corr[is.infinite(corr)]<-NA
+
+    #Get mean and variance for each gene pair (drop NA and Inf values from calculation)
+    #stringsAsFactors=FALSE keeps ct a character vector also in R < 4.0;
+    #a factor would index cell_types_corrected by integer code below
+    data.frame(ct,
+               gene_pairs,
+               var=apply(corr,1,var,na.rm=TRUE),
+               mean=rowMeans(corr,na.rm=TRUE),
+               stringsAsFactors=FALSE)
+  })
+  corr_summary<-do.call(rbind,corr_summary)
+
+  #Check frequency of "highly variable" genes
+  print(paste("Frequency of highly variable genes for",dataset))
+  tmp<-corr_summary%>%
+    group_by(ct)%>%
+    summarise(freq_high_var=mean(var>2,na.rm=TRUE))
+  print(tmp)
+  print(median(tmp$freq_high_var))
+
+  #Replace cell type names (lookup by name)
+  corr_summary$ct<-cell_types_corrected[corr_summary$ct]
+
+  g<-ggplot(corr_summary,aes(x=var,color=ct))+
+    geom_density()+
+    xlab("Correlation variance across individuals")+
+    ylab("Density")+
+    ylim(0,2)+
+    xlim(0,15)+
+    scale_color_discrete("Cell type")+
+    theme(axis.title=element_text(size=10),
+          axis.text=element_text(size=9),
+          legend.title=element_text(size=10),
+          legend.text=element_text(size=10))
+
+  g_list<-c(g_list,list(g))
+}
+
+g<-ggarrange(plotlist=g_list,ncol=2,common.legend = TRUE,
+ legend="bottom",labels=c("a","b"))
+ggsave(g,file="co-expression_indivs_combined/plots/per_indivual_var_zscores_combined.pdf",
+ width=8,height=4)
\ No newline at end of file
diff --git a/03_celltype_individual_comparison/correlation_between_celltypes.R b/03_celltype_individual_comparison/correlation_between_celltypes.R
new file mode 100644
index 0000000..d61c2c1
--- /dev/null
+++ b/03_celltype_individual_comparison/correlation_between_celltypes.R
@@ -0,0 +1,110 @@
+# ------------------------------------------------------------------------------
+# Check Pearson correlation between cell types for Oelen v2 and v3 dataset
+# Input: correlation matrices generated with correlation_timepoint_combined_indivs.py
+# Output: heatmap plot and summary as output text
+# -----------------------------------------------------------------------------
+
+library(data.table)
+library(ggplot2)
+library(viridis)
+
+theme_set(theme_bw())
+
+cell_types<-c("CD4T","CD8T","monocyte","NK","DC","B")
+
+#Full cell type names as reported in the paper
+cell_types_corrected<-setNames(c("CD4+ T","CD8+ T","Monocyte","NK","DC","B"),
+ c("CD4T","CD8T","monocyte","NK","DC","B"))
+
+#Check for both Oelen v3 and v2 datasets (paths set dependent on that)
+path_v3<-"co-expression_indivs_combined/"
+path_v2<-"co-expression_indivs_combined/one_million_version2/"
+
+for(version in c("v2","v3")){
+
+  path<-if(version=="v2") path_v2 else path_v3
+
+  #Read the correlation file of each cell type only once
+  corr_list<-lapply(setNames(cell_types,cell_types),function(ct)
+    fread(paste0(path,ct,"/",ct,"_UT_correlation.csv")))
+
+  #Number of unique genes among gene pairs encoded as "gene1;gene2"
+  count_genes<-function(pairs){
+    length(union(gsub(";.*","",pairs),
+                 gsub(".*;","",pairs)))
+  }
+
+  #Iterate over each cell type combination to do all pairwise comparisons
+  corr_comp<-NULL
+  for(c1 in seq_along(cell_types)){
+
+    cell_type1<-cell_types[c1]
+    corr_c1<-corr_list[[cell_type1]]
+
+    #Diagonal entry (cell type compared with itself)
+    corr_comp<-rbind(corr_comp,
+                     data.frame(c1=cell_type1,
+                                c2=cell_type1,
+                                gene_pairs=nrow(corr_c1),
+                                genes_unique=count_genes(corr_c1$V1),
+                                corr=1,
+                                stringsAsFactors=FALSE))
+
+    #All cell types after c1 (empty for the last cell type)
+    for(c2 in seq_len(length(cell_types)-c1)+c1){
+
+      cell_type2<-cell_types[c2]
+      corr_c2<-corr_list[[cell_type2]]
+
+      #Keep only gene pairs present in both cell types
+      corr<-merge(corr_c1,corr_c2,by=c("V1"))
+
+      corr_comp<-rbind(corr_comp,
+                       data.frame(c1=cell_type1,
+                                  c2=cell_type2,
+                                  gene_pairs=nrow(corr),
+                                  genes_unique=count_genes(corr$V1),
+                                  corr=cor(corr$UT.x,corr$UT.y,method="pearson"),
+                                  stringsAsFactors=FALSE))
+    }
+  }
+
+  #Rename cell types to make it coherent with other part of the manuscript
+  corr_comp$c1<-cell_types_corrected[corr_comp$c1]
+  corr_comp$c2<-cell_types_corrected[corr_comp$c2]
+  corr_comp$c1<-factor(corr_comp$c1,levels=cell_types_corrected)
+  corr_comp$c2<-factor(corr_comp$c2,levels=cell_types_corrected)
+
+  #Create heatmap
+  g<-ggplot(corr_comp,aes(x=c1,y=c2,fill=corr))+
+    geom_tile()+
+    geom_text(aes(label=paste0(round(corr,3),"\n(",genes_unique,")")),size=4)+
+    xlab("Cell type")+
+    ylab("Cell type")+
+    #scale_fill_gradient2("Correlation",limits = c(-1,1),low="darkblue",mid="white",high="darkred")
+    scale_fill_viridis("Correlation",limits=c(0,1))+
+    theme(axis.title = element_text(size=14),
+          axis.text = element_text(size=12),
+          legend.title = element_text(size=14),
+          legend.text = element_text(size=12))
+  ggsave(g,file=paste0("co-expression_indivs_combined/plots/corr_celltypes_",version,".pdf"),
+         width=7,height=5)
+
+  #Check correlation distribution across cell types
+  #(explicit print, values are not auto-printed inside a for loop)
+  print(summary(corr_comp$corr[corr_comp$c1 != corr_comp$c2]))
+
+}
\ No newline at end of file
diff --git a/03_celltype_individual_comparison/correlation_celltype.py b/03_celltype_individual_comparison/correlation_celltype.py
new file mode 100644
index 0000000..86b3e78
--- /dev/null
+++ b/03_celltype_individual_comparison/correlation_celltype.py
@@ -0,0 +1,96 @@
+#############################################################################################
+# Calculate correlation for each pairwise gene combination in each cell type and the UT timepoint,
+# where both genes are expressed in at least 50% of the cells
+# merging all individuals for Oelen v2 and v3 dataset (specified in parameter version2)
+# Input: seurat objects with Oelen v2 and v3 dataset
+# Output: correlation values as csv files (one per cell type)
+############################################################################################
+
+#from scipy.stats import t, norm
+from scipy.stats import spearmanr
+import scanpy as sc
+import numpy as np
+import pandas as pd
+from pathlib import Path
+from time import time
+import os
+import re
+
+# specify if Oelen v2 (version2=True) or v3 (version2=False) dataset is used
+version2 = True
+
+# set result path
+if version2:
+ prefix_results = Path('co-expression_indivs_combined/one_million_version2/')
+else:
+ prefix_results = Path('co-expression_indivs_combined/')
+
+# load scanpy object
+if version2:
+ alldata = sc.read_h5ad('seurat_objects/1M_v2_mediumQC_ctd_rnanormed_demuxids_20201029.sct.h5ad')
+else:
+ alldata = sc.read_h5ad('seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.SCT.h5ad')
+
+def select_gene_nonzeroratio(df, ratio):
+ nonzerocounts = np.count_nonzero(df.values, axis=0)/df.shape[0]
+ selected_genes = df.columns[nonzerocounts>ratio]
+ return selected_genes
+
+# extract timepoint from timepoint - stimulation annotation
+def get_time(x):
+ if x == 'UT':
+ return x
+ else:
+ pattern = re.compile(r'\d+h')
+ return re.findall(pattern, x)[0]
+
+
+# extract timepoint from timepoint - stimulation annotation
+observations = alldata.obs.copy()
+observations['time_merged'] = [get_time(item) for item in observations['timepoint']]
+observations['timepoint_id_celltype'] = [f'{item[0]}_{item[1]}' for item
+ in observations[['time_merged', 'cell_type_lowerres']].values]
+
+# iterate over each cell type
+celltypes = ['B', 'CD4T', 'CD8T', 'monocyte', 'DC', 'NK']
+for celltype in celltypes:
+ if not os.path.isdir(prefix_results/celltype):
+ os.mkdir(prefix_results/celltype)
+ starttime = time()
+ print(celltype)
+ specific = alldata[alldata.obs.cell_type_lowerres==celltype]
+ celltype_data = pd.DataFrame(data=specific.X.toarray(),
+ index=specific.obs.index,
+ columns=specific.var.index)
+
+ # get the set of gene pairs
+ specific_obs = observations[observations['cell_type_lowerres']==celltype]
+
+ # select only UT cells
+ for condition in ['UT']:
+
+ # filter for the condition
+ celltype_condition_data = celltype_data[specific_obs.time_merged==condition]
+
+ #filter genes after a nonzero rate of at least 0.5
+ selected_genes = select_gene_nonzeroratio(celltype_condition_data, 0.5)
+
+ print(f"Number of selected genes for {celltype} {condition}: {len(selected_genes)}")
+
+ gene_pairs = []
+ for i,gene1 in enumerate(selected_genes):
+ for j in range(i+1, len(selected_genes)):
+ gene_pairs.append(';'.join([gene1, selected_genes[j]]))
+
+ input_df = celltype_condition_data[selected_genes]
+ input_data = spearmanr(input_df, axis=0)[0]
+ input_data_uppertria = input_data[np.triu_indices_from(input_data, 1)]
+
+ corrs_df = pd.DataFrame(data=input_data_uppertria,
+ columns=[f'{condition}'],
+ index=gene_pairs)
+
+ corrs_df.to_csv(prefix_results/celltype/f'{celltype}_{condition}_correlation.csv')
+
+
+ print(f"Finished {celltype} with time {time() - starttime}")
diff --git a/03_celltype_individual_comparison/correlation_distribution_celltypes_and_individuals.R b/03_celltype_individual_comparison/correlation_distribution_celltypes_and_individuals.R
new file mode 100644
index 0000000..1400f3d
--- /dev/null
+++ b/03_celltype_individual_comparison/correlation_distribution_celltypes_and_individuals.R
@@ -0,0 +1,167 @@
+# ------------------------------------------------------------------------------
+# Check Pearson correlation between individuals (per cell type)
+# and combine it with correlation levels in the cell type
+# (as both are below each other in the final figure)
+# Input:
+# 1) correlation matrices per cell type generated with
+# correlation_timepoint_combined_indivs.py
+# (for correlation levels in each cell type)
+# 2) correlation matrices per individual and cell type
+# (for comparison of individuals)
+# Output: plot and summary as output text
+# -----------------------------------------------------------------------------
+
+library(data.table)
+library(ggplot2)
+library(ggpubr)
+library(RColorBrewer)
+library(dplyr)
+
+theme_set(theme_bw())
+
+path<-"coeqtl_mapping/input/individual_networks/UT/"
+
+cell_types<-c("B","CD4T","CD8T","DC","monocyte","NK")
+
+#Full cell type names as reported in the paper
+cell_types_corrected<-setNames(c("CD4+ T","CD8+ T","Monocyte","NK","DC","B"),
+ c("CD4T","CD8T","monocyte","NK","DC","B"))
+
+#Get standard color scale
+gg_color_hue <- function(n) {
+  #n + 1 hues from 15 to 375; the last one is dropped by the selection below
+  hue_values <- seq(15, 375, length = n + 1)
+  palette <- hcl(h = hue_values, l = 65, c = 100)
+  palette[1:n]
+}
+
+#Evaluate both Oelen v2 and v3 dataset
+for(dataset in c("onemillionv2","onemillionv3")){
+
+  #Evaluate correlation distribution in the cell type
+  all_corrs<-NULL
+  tp<-"UT"
+
+  #Bins of the absolute correlation, left-closed so that every value falls into
+  #exactly one bin: [0,0.05), [0.05,0.1), [0.1,0.2), [0.2,0.3), [0.3,Inf)
+  corr_levels<-c("<0.05",">0.05",">0.1",">0.2",">0.3")
+  corr_breaks<-c(0,0.05,0.1,0.2,0.3,Inf)
+
+  for(ct in cell_types){
+
+    #Load correlation values
+    if(dataset=="onemillionv3"){
+      corr_ct<-fread(paste0("co-expression_indivs_combined/",ct,"/",
+                            ct,"_",tp,"_correlation.csv"))
+    } else {
+      corr_ct<-fread(paste0("co-expression_indivs_combined/one_million_version2/",
+                            ct,"/",ct,"_",tp,"_correlation.csv"))
+    }
+    colnames(corr_ct)[2]<-"corr"
+
+    #Get absolute correlation
+    corr_ct$corr<-abs(corr_ct$corr)
+
+    #Number of gene pairs per bin (table keeps empty bins as 0, in level order)
+    bin_counts<-as.vector(table(cut(corr_ct$corr,breaks=corr_breaks,
+                                    labels=corr_levels,right=FALSE)))
+
+    #stringsAsFactors=FALSE keeps ct a character vector also in R < 4.0;
+    #a factor would index cell_types_corrected by integer code below
+    all_corrs<-rbind(all_corrs,
+                     data.frame(level=corr_levels,
+                                values=bin_counts,
+                                freq=bin_counts/nrow(corr_ct),
+                                ct,tp,
+                                stringsAsFactors=FALSE))
+  }
+
+  #Check general distribution of "highly correlated genes"
+  high_corr<-all_corrs[all_corrs$level %in% c(">0.3",">0.2",">0.1"),]
+  high_corr<-high_corr%>%group_by(ct)%>%
+    summarize(high_freq=sum(freq))%>%
+    as.data.frame()
+  #Explicit print, values are not auto-printed inside a for loop
+  print(median(high_corr$high_freq))
+
+  #Get comparison between cell types
+  summary<-NULL
+  for(ct in cell_types){
+    #Load file with all correlation values per individual and cell type
+    #each individual one column
+    if(dataset=="onemillionv3"){
+      corr<-fread(paste0(path,dataset,"/UT_",ct,".genesnonzero0.5.coefs.gz"))
+    } else {
+      corr<-fread(paste0(path,dataset,"/UT_",ct,".genesnonzero0.5.coefs.tsv.gz"))
+    }
+
+    #Get Pearson correlation between individual-specific correlations
+    gene_pairs<-corr$V1
+    corr$V1<-NULL
+    indiv_corr<-cor(corr,method="pearson")
+
+    #Melt the matrix and keep each pair of individuals once
+    tmp<-reshape2::melt(indiv_corr)
+    tmp$Var1<-as.character(tmp$Var1)
+    tmp$Var2<-as.character(tmp$Var2)
+    tmp<-tmp[tmp$Var1 < tmp$Var2,]
+
+    tmp$ct<-ct
+
+    summary<-rbind(summary,tmp)
+  }
+
+  #Replace cell type names (lookup by name)
+  all_corrs$ct<-cell_types_corrected[all_corrs$ct]
+  summary$ct<-cell_types_corrected[summary$ct]
+
+  #Sort cell types according to their highly correlated genes
+  sorting<-all_corrs[all_corrs$level==">0.3",]
+
+  #Sort cell type colors the same way, so that each cell type keeps the color
+  #of its position in cell_types
+  colors_cts<-gg_color_hue(6)
+  colors_cts<-colors_cts[order(sorting$freq)]
+
+  sorting<-sorting[order(sorting$freq),]
+
+  #Barplot showing the general correlation distribution in the cell type
+  all_corrs$level<-factor(all_corrs$level,levels=c("<0.05",">0.05",">0.1",">0.2",">0.3"))
+  all_corrs$ct<-factor(all_corrs$ct,levels=sorting$ct)
+  colors_bars<-brewer.pal(n = 6, "YlGnBu")
+  g.1<-ggplot(all_corrs,aes(x=ct,y=freq,fill=level))+
+    geom_bar(stat="identity")+
+    xlab("Cell type")+ylab("Fraction correlated genes")+
+    scale_fill_manual("Absolute\ncorrelation",values=colors_bars[2:6])+
+    theme(legend.position="bottom",
+          axis.title.y = element_text(size=13.5),
+          axis.title.x = element_blank(),
+          axis.text = element_text(size=12),
+          legend.title=element_text(size=11),
+          legend.text=element_text(size=10.5))
+
+  #Violin plot showing differences between individuals in the cell type
+  summary$ct<-factor(summary$ct,levels=sorting$ct)
+  g.2<-ggplot(summary,aes(x=ct,fill=ct,y=value))+
+    geom_violin()+
+    geom_boxplot(width = 0.15, outlier.shape = NA)+
+    ylim(0,1)+
+    xlab("Cell type")+
+    ylab("Correlation between individuals")+
+    scale_fill_manual(values=colors_cts)+
+    theme(legend.position = "none",
+          axis.title.y = element_text(size=13.5),
+          axis.title.x = element_blank(),
+          axis.text = element_text(size=12))
+
+
+  g<-ggarrange(g.1,g.2,ncol=1,align="hv")
+  ggsave(g,file=paste0("co-expression_indivs_combined/plots/corr_ct_indiv_",
+                       dataset,".pdf"),
+         width=5,height=6)
+
+  #Get median correlation in each cell type
+  med_corr<-summary%>%
+    group_by(ct)%>%
+    summarise(median(value,na.rm=TRUE))
+
+  print(dataset)
+  print(med_corr)
+
+}
diff --git a/03_celltype_individual_comparison/correlation_subsampling.py b/03_celltype_individual_comparison/correlation_subsampling.py
new file mode 100644
index 0000000..bfd1845
--- /dev/null
+++ b/03_celltype_individual_comparison/correlation_subsampling.py
@@ -0,0 +1,121 @@
+####################################################################################
+# Calculate per sample correlation with subsampled number of cells per donor
+# to explore the relationship between number of cells and
+# concordance between donors
+# Individuals with a total number of cells below the respective subsampled value
+# are not tested, sampling range from 25 cells to the 75% quantile for the cell type
+# (so that at least 25% of the individuals can be included each time)
+# Selecting again genes expressed in at least 50% of the cells
+# Input: seurat objects with Oelen v2 and v3 dataset
+# Output: Csv file with Pearson correlation values for all individual comparison
+# per celltype and subsampled number of cells
+####################################################################################
+
+import scanpy as sc
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+from scipy.stats import pearsonr, spearmanr
+
+def select_gene_nonzeroratio(df, ratio):
+ '''
+ Select genes with non-zero ratio across all cells > specified ratio
+ '''
+ nonzerocounts = np.count_nonzero(df.values, axis=0)/df.shape[0]
+ selected_genes = df.columns[nonzerocounts>ratio]
+ return selected_genes
+
+def calculate_individual_network(individual_df, n_cells=0, random_state=8):
+ '''
+ Randomly select the n_cells from individual_df to calculate the gene-gene spearman network;
+ if n_cell not set, then use all cells from the individual
+ Return: A list of correlation coefficients
+ '''
+ if n_cells > 0:
+ specific_individual_network = individual_df.sample(n_cells, random_state=random_state).corr(method='spearman')
+ else:
+ specific_individual_network = individual_df.corr(method='spearman')
+ indices = np.tril_indices_from(np.zeros((specific_individual_network.shape[0],
+ specific_individual_network.shape[0])),
+ k=1)
+ return specific_individual_network.values[indices].flatten()
+
+def calculate_correlation(celltype_df, celltype_obs, selected_genes, select_individuals, n_cells=100):
+ '''
+ Calculate all inidividual networks for certain n_cells, and selected_indidviauls
+ Input: celltype_df: gene index in columns, cell index in rows, cell index will be used to match the index of
+ celltype_df; it needs at least one column named 'assignment', storing the individual index for
+ each cell
+ selected_genes: a list of gene index
+ select_individuals: a list of individual index
+ n_cells: number of cells to select from each individual, it should be smaller than the max number of cells
+ in all individuals
+ Output: all_individuals_correlation: a dataframe, each column of values is all the gene-gene spearman
+ correlation coefficients
+ correlation_of_individual_correlations: spearman correlation between all pairs of individuals' networks
+ '''
+ all_individuals_correlation = pd.DataFrame()
+ if select_individuals is not None:
+ for assignment in select_individuals:
+ allcells_individual = celltype_obs[celltype_obs.assignment==assignment].index.values
+ specific_individual_df = celltype_df[selected_genes].loc[allcells_individual]
+ all_individuals_correlation[assignment] = calculate_individual_network(specific_individual_df, n_cells)
+ correlation_of_individual_correlations = all_individuals_correlation.corr(method='pearson')
+ individual_indices = np.triu_indices_from(correlation_of_individual_correlations.values, k=1)
+ return all_individuals_correlation, correlation_of_individual_correlations.values[individual_indices]
+
+# define path (run for Oelen v2 and v3 dataset separately)
+version2 = False
+if version2:
+ input_path = 'seurat_objects/1M_v2_mediumQC_ctd_rnanormed_demuxids_20201029.sct.h5ad'
+ output_path ='co-expression_indivs_subsampled/correlation_individuals_subsampled_1M_v2.csv'
+else:
+ input_path = 'seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.SCT.h5ad'
+ output_path ='co-expression_indivs_subsampled/correlation_individuals_subsampled_1M_v3.csv'
+
+# load single cell data
+alldata = sc.read_h5ad(input_path)
+
+# filter to look only at UT cells
+alldata = alldata[alldata.obs.timepoint == "UT"].copy()
+
+# select common individual per celltype
+celltypes = ['CD4T','NK','monocyte','CD8T','B','DC']
+selected_individuals = {}
+selected_individuals_cell_number = {}
+for celltype in celltypes:
+ celltype_data = alldata[alldata.obs.cell_type_lowerres == celltype]
+ selected_individuals_cell_number[f'{celltype}'] = celltype_data.obs.assignment.value_counts().values
+ selected_individuals[f'{celltype}'] = celltype_data.obs.assignment.value_counts().index
+ #Check distribution of cells per individual
+ print(celltype_data.obs.assignment.value_counts().describe())
+
+# calculate for each celltype
+all_celltype_res = pd.DataFrame()
+for celltype in tqdm(celltypes):
+ celltype_data = alldata[alldata.obs.cell_type_lowerres == celltype]
+ celltype_df = pd.DataFrame(data=celltype_data.X.toarray(),
+ columns=celltype_data.var.index, # genes
+ index=celltype_data.obs.index) # cells
+
+ # select only genes expressed in at least 50% of the cells
+ selected_genes = select_gene_nonzeroratio(celltype_df, ratio=0.5)
+
+ # Run each cell type for different number of cells so that at least 25% of individuals have that many cells
+ for cell_num in range(25, int(np.quantile(selected_individuals_cell_number[f'{celltype}'],0.75)),25):
+
+ print(cell_num)
+
+ # Select all individuals that have enough cells
+ indivs = selected_individuals[f'{celltype}'][selected_individuals_cell_number[f'{celltype}']>=cell_num]
+ # Get all pairwise correlation for these pairs
+ celltype_correlations = pd.DataFrame(data=calculate_correlation(celltype_df, celltype_data.obs,
+ selected_genes=selected_genes,
+ n_cells=cell_num,
+ select_individuals=indivs)[1],
+ columns=['corr'])
+ celltype_correlations['celltype'] = celltype
+ celltype_correlations['cell_num'] = cell_num
+ all_celltype_res = pd.concat([all_celltype_res, celltype_correlations], axis=0)
+
+all_celltype_res.to_csv(output_path)
diff --git a/03_celltype_individual_comparison/fit_logcurve_indiv_subsampling_effect.R b/03_celltype_individual_comparison/fit_logcurve_indiv_subsampling_effect.R
new file mode 100644
index 0000000..09903ea
--- /dev/null
+++ b/03_celltype_individual_comparison/fit_logcurve_indiv_subsampling_effect.R
@@ -0,0 +1,102 @@
+# ------------------------------------------------------------------------------
+# Fit logarithmic curve describing mean correlation between individuals dependent
+# on the number of cells per individual and cell type, fit done separately
+# for each cell type
+# Input: pairwise comparison of all individuals (Pearson correlation) per cell type
+# and for different numbers of cells
+# (subsampling and calculation done in correlation_subsampling.py)
+# Output: logarithmic fit per cell type and curve visualizing the fit
+# ------------------------------------------------------------------------------
+
+library(ggplot2)
+library(dplyr)
+
+theme_set(theme_bw())
+
+suffix<-"v3"
+#suffix<-"v2"
+
+color_coding <- list()
+color_coding[["CD4+ T"]] <- "#2E9D33"
+color_coding[["CD8+ T"]] <- "#126725"
+color_coding[["Monocyte"]] <- "#EDBA1B"
+color_coding[["NK"]] <- "#E64B50"
+color_coding[["B"]] <- "#009DDB"
+color_coding[["DC"]] <- "#965EC8"
+
+#Full cell type names as reported in the paper
+cell_types_corrected<-setNames(c("CD4+ T","CD8+ T","Monocyte","NK","DC","B"),
+ c("CD4T","CD8T","monocyte","NK","DC","B"))
+
+#Load results
+res<-read.csv(paste0("co-expression_indivs_subsampled/",
+ "correlation_individuals_subsampled_1M_",suffix,".csv"),
+ stringsAsFactors = FALSE)
+res$X<-NULL
+
+res$celltype<-cell_types_corrected[res$celltype]
+
+res_summary<-res%>%
+ group_by(celltype,cell_num)%>%
+ summarise(mean_corr=mean(corr),
+ quantile_25=quantile(corr,0.25),
+ quantile_75=quantile(corr,0.75))%>%
+ as.data.frame()
+
+#Filter out B cells and DCs because no line can be drawn for them
+res_summary<-res_summary[! res_summary$celltype %in% c("B","DC"),]
+
+#Fit one log function for each cell type
+max_cell_num<-1500 #upper end of the range for which fitted values are calculated
+log_parameters<-NULL
+for(cell_type in unique(res_summary$celltype)){
+
+  #Fit the linear model corr ~ intercept + log_beta * log(cell_num)
+  #on all pairwise comparisons of the cell type
+  res_ct<-res[res$celltype == cell_type,]
+  model_lm<-lm(corr~log(cell_num),data=res_ct)
+
+  #Save model summary
+  summary_model<-summary(model_lm)
+  print(summary_model)
+
+  log_parameters<-rbind(log_parameters,
+                        data.frame(cell_type,
+                                   intercept=summary_model$coefficients[1,1],
+                                   log_beta=summary_model$coefficients[2,1],
+                                   adj_r_squared=summary_model$adj.r.squared,
+                                   stringsAsFactors = FALSE))
+
+  #Add rows without observed values for cell numbers above the observed range;
+  #skipped if the observed range already reaches max_cell_num
+  #(seq() stops with an error for a decreasing range and positive step)
+  first_extrapolated<-max(res_ct$cell_num)+25
+  if(first_extrapolated<=max_cell_num){
+    res_summary<-rbind(res_summary,
+                       data.frame(celltype=cell_type,
+                                  cell_num=seq(first_extrapolated,max_cell_num,by=25),
+                                  mean_corr=NA,
+                                  quantile_25=NA,
+                                  quantile_75=NA))
+  }
+}
+
+#Fitted correlation for every row, using the parameters of the respective cell type
+param_idx<-match(res_summary$celltype,log_parameters$cell_type)
+res_summary$fitted_corr<-log_parameters$intercept[param_idx] +
+  log_parameters$log_beta[param_idx] * log(res_summary$cell_num)
+
+res_summary_melt<-reshape2::melt(res_summary[,c("celltype","cell_num","mean_corr","fitted_corr")],
+ id.vars=c("celltype","cell_num"))
+
+g<-ggplot()+
+ geom_line(data=res_summary_melt,aes(x=cell_num,y=value,color=celltype,
+ linetype=variable))+
+ geom_point(data=res_summary,aes(x=cell_num,y=mean_corr,color=celltype))+
+ # annotate("text",x=850,y=0.2,hjust=0,
+ # label=paste0("y ~ -0.56 + 0.21 * log(x), R^2 = 0.98 (CD4+ T)\n",
+ # "y ~ -0.48 + 0.20 * log(x), R^2 = 0.86 (CD8+ T)\n",
+ # "y ~ -0.53 + 0.20 * log(x), R^2 = 0.94 (Monocyte)\n",
+ # "y ~ -0.41 + 0.15 * log(x), R^2 = 0.93 (NK)\n"))+
+ scale_color_manual("Cell type",values=unlist(color_coding))+
+ scale_linetype_discrete("",labels=c("Observed","Predicted"))+
+ xlab("Subsampled number of cells per individual")+
+ ylab("Correlation between individuals")
+print(g)
+
+ggsave(g,file=paste0("co-expression_indivs_subsampled/plots/subsampling_1M_",
+ suffix,"_fitted_lines.pdf"),
+ width=10,height=4)
diff --git a/03_celltype_individual_comparison/individual_networks_for_selected_genepairs.py b/03_celltype_individual_comparison/individual_networks_for_selected_genepairs.py
new file mode 100644
index 0000000..3f72781
--- /dev/null
+++ b/03_celltype_individual_comparison/individual_networks_for_selected_genepairs.py
@@ -0,0 +1,268 @@
+import argparse
+import os
+import re
+from collections import namedtuple
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import scanpy as sc
+from scipy.stats import rankdata
+from scipy.stats import t, norm
+from tqdm import tqdm
+
+
+def get_time(x):
+ if x == 'UT':
+ return x
+ else:
+ pattern = re.compile(r'\d+h')
+ return re.findall(pattern, x)[0]
+
+
+class DATASET:
+ def __init__(self, datasetname):
+ self.name = datasetname
+ self.path_prefix = Path("./")
+ self.information = self.get_information()
+
+ def get_information(self):
+ if self.name == 'onemillionv2':
+ self.path = '1M_v2_mediumQC_ctd_rnanormed_demuxids_20201029.sct.h5ad'
+ self.individual_id_col = 'assignment'
+ self.timepoint_id_col = 'time'
+ self.celltype_id = 'cell_type_lowerres'
+ self.chosen_condition = {'UT': 'UT',
+ 'stimulated': '3h'}
+ elif self.name == 'onemillionv3':
+ self.path = '1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.SCT.h5ad'
+ self.individual_id_col = 'assignment'
+ self.timepoint_id_col = 'time'
+ self.celltype_id = 'cell_type_lowerres'
+ self.chosen_condition = {'UT': 'UT',
+ 'stimulated': '3h'}
+ elif self.name == 'stemiv2':
+ self.path = 'cardio.integrated.20210301.stemiv2.h5ad'
+ self.individual_id_col = 'assignment.final'
+ self.timepoint_id_col = 'timepoint.final'
+ self.celltype_id = 'cell_type_lowerres'
+ self.chosen_condition = {'UT': 't8w',
+ 'stimulated': 'Baseline'}
+ elif self.name == 'ng':
+ self.path = 'pilot3_seurat3_200420_sct_azimuth.h5ad'
+ self.individual_id_col = 'snumber'
+ self.celltype_id = 'cell_type_mapped_to_onemillion'
+ else:
+ raise IOError("Dataset name not understood.")
+
+ def load_dataset(self):
+ self.get_information()
+ print(f'Loading dataset {self.name} from {self.path_prefix} {self.path}')
+ self.data_sc = sc.read_h5ad(self.path_prefix / self.path)
+ if self.name.startswith('onemillion'):
+ self.data_sc.obs['time'] = [get_time(item) for item in self.data_sc.obs['timepoint']]
+ elif self.name == 'ng':
+ celltype_maping = {'CD4 T': 'CD4T', 'CD8 T': 'CD8T', 'Mono': 'monocyte', 'DC': 'DC', 'NK': 'NK',
+ 'other T': 'otherT', 'other': 'other', 'B': 'B'}
+ self.data_sc.obs['cell_type_mapped_to_onemillion'] = [celltype_maping.get(name) for name in
+ self.data_sc.obs['predicted.celltype.l1']]
+
+
+def select_gene_nonzeroratio(df, ratio):
+ nonzerocounts = np.count_nonzero(df.values, axis=0) / df.shape[0]
+ selected_genes = df.columns[nonzerocounts > ratio]
+ return selected_genes
+
+
+def corr_to_z(coef, num):
+ t_statistic = coef * np.sqrt((num - 2) / (1 - coef ** 2))
+ prob = t.cdf(t_statistic, num - 2)
+ z_score = norm.ppf(prob)
+ positive_coef_probs = 1 - prob
+ positive_coef_probs[coef < 0] = 0
+ negative_coef_probs = prob
+ negative_coef_probs[coef > 0] = 0
+ probs = negative_coef_probs + positive_coef_probs
+ return z_score, probs
+
+
+def get_om_name(filename):
+ pattern = re.compile(r'LLDeep_\d\d\d\d')
+ return re.findall(pattern, filename)[0]
+
+
+def get_stemi_name(filename):
+ pattern = re.compile(r'TEST_\d.')
+ return re.findall(pattern, filename)[0]
+
+
+def save_numpy(data_df, prefix):
+ np.save(f'{prefix}.npy', data_df.values)
+ with open(f'{prefix}.cols.txt', 'w') as f:
+ f.write('\n'.join(data_df.columns))
+ with open(f'{prefix}.rows.txt', 'w') as f:
+ f.write('\n'.join(data_df.index))
+ return None
+
+
+def _contains_nan(a, nan_policy='propagate'):
+ policies = ['propagate', 'raise', 'omit']
+ if nan_policy not in policies:
+ raise ValueError("nan_policy must be one of {%s}" %
+ ', '.join("'%s'" % s for s in policies))
+ try:
+ with np.errstate(invalid='ignore'):
+ contains_nan = np.isnan(np.sum(a))
+ except TypeError:
+ try:
+ contains_nan = np.nan in set(a.ravel())
+ except TypeError:
+ contains_nan = False
+ nan_policy = 'omit'
+ if contains_nan and nan_policy == 'raise':
+ raise ValueError("The input contains nan values")
+ return contains_nan, nan_policy
+
+
+def _chk_asarray(a, axis):
+ if axis is None:
+ a = np.ravel(a)
+ outaxis = 0
+ else:
+ a = np.asarray(a)
+ outaxis = axis
+ if a.ndim == 0:
+ a = np.atleast_1d(a)
+ return a, outaxis
+
+
+def spearmanr_withnan(a, axis=0, nan_policy='propagate'):
+ SpearmanrResult = namedtuple('SpearmanrResult', ('correlation', 'pvalue'))
+ if axis is not None and axis > 1:
+ raise ValueError("spearmanr only handles 1-D or 2-D arrays, supplied axis argument {}, "
+ "please use only values 0, 1 or None for axis".format(axis))
+ a, axisout = _chk_asarray(a, axis)
+ if a.ndim > 2:
+ raise ValueError("spearmanr only handles 1-D or 2-D arrays")
+ n_vars = a.shape[1 - axisout]
+ n_obs = a.shape[axisout]
+ if n_obs <= 1:
+ # Handle empty arrays or single observations.
+ return SpearmanrResult(np.nan, np.nan)
+ a_contains_nan, nan_policy = _contains_nan(a, nan_policy)
+ variable_has_nan = np.zeros(n_vars, dtype=bool)
+ if a_contains_nan:
+ if nan_policy == 'propagate':
+ if a.ndim == 1 or n_vars <= 2:
+ return SpearmanrResult(np.nan, np.nan)
+ else:
+ variable_has_nan = np.isnan(a).sum(axis=axisout)
+ a_ranked = np.apply_along_axis(rankdata, axisout, a)
+ rs = np.corrcoef(a_ranked, rowvar=axisout)
+ dof = n_obs - 2 # degrees of freedom
+ # rs can have elements equal to 1, so avoid zero division warnings
+ with np.errstate(divide='ignore'):
+ t_ = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))
+ prob = 2 * t.sf(np.abs(t_), dof)
+ # For backwards compatibility, return scalars when comparing 2 columns
+ if rs.shape == (2, 2):
+ return SpearmanrResult(rs[1, 0], prob[1, 0])
+ else:
+ rs[variable_has_nan, :] = np.nan
+ rs[:, variable_has_nan] = np.nan
+ return SpearmanrResult(rs, prob)
+
+
+def get_individual_networks_halfratioGenes(data_sc, individual_colname, selected_genes=None):
+ data_df = pd.DataFrame(data=data_sc.X.toarray(),
+ index=data_sc.obs.index,
+ columns=data_sc.var.index)
+ selected_genes = select_gene_nonzeroratio(data_df, ratio=0.5)
+ print(f"Selected {len(selected_genes)} genes.")
+ from itertools import combinations
+ selected_genes_sorted_genepairs = [';'.join(sorted(item)) for item in combinations(selected_genes, 2)]
+ coef_df = pd.DataFrame(index=selected_genes_sorted_genepairs)
+ coef_p_df = pd.DataFrame(index=selected_genes_sorted_genepairs)
+ zscore_df = pd.DataFrame(index=selected_genes_sorted_genepairs)
+ zscore_p_df = pd.DataFrame(index=selected_genes_sorted_genepairs)
+ data_selected_df = data_df[selected_genes]
+ print(f"Begin calculating networks for {len(data_sc.obs[individual_colname].unique())} individuals.")
+ for ind_id in tqdm(data_sc.obs[individual_colname].unique()):
+ cell_num = data_sc.obs[data_sc.obs[individual_colname] == ind_id].shape[0]
+ if cell_num > 10:
+ individual_df = data_selected_df.loc[data_sc.obs[individual_colname] == ind_id]
+ individual_coefs, individual_coef_ps = spearmanr_withnan(individual_df.values, axis=0)
+ individual_coefs_flatten = pd.DataFrame(data=individual_coefs[np.triu_indices_from(individual_coefs, 1)],
+ index=selected_genes_sorted_genepairs).loc[
+ selected_genes_sorted_genepairs]
+ individual_coef_ps_flatten = \
+ pd.DataFrame(data=individual_coef_ps[np.triu_indices_from(individual_coefs, 1)],
+ index=selected_genes_sorted_genepairs).loc[selected_genes_sorted_genepairs]
+ coef_df[ind_id] = individual_coefs_flatten
+ coef_p_df[ind_id] = individual_coef_ps_flatten
+ try:
+ individual_zscores_flatten, individual_zscore_ps_flatten = corr_to_z(individual_coefs_flatten.values,
+ cell_num)
+ zscore_df[ind_id] = individual_zscores_flatten
+ zscore_p_df[ind_id] = individual_zscore_ps_flatten
+ except:
+ continue
+ else:
+ print("Deleted this individual because of low cell number", cell_num)
+ return coef_df, coef_p_df, zscore_df, zscore_p_df
+
+
+def get_individual_networks_given_celltype_condition_datasetname_for_6major_celltypes(datasetname, condition='UT',
+ genelist=None):
+ # load the data and data information
+ celltypes = ['CD4T', 'CD8T', 'monocyte', 'NK', 'B', 'DC']
+ dataset = DATASET(datasetname)
+ dataset.load_dataset()
+ print(f"{datasetname} loaded.")
+ # calculate the individual network for specific condition and celltype
+ for celltype in celltypes:
+ print(datasetname, celltype, condition)
+ if datasetname == 'ng':
+ data_selected = dataset.data_sc[(dataset.data_sc.obs[dataset.celltype_id] == celltype)]
+ else:
+ data_selected = dataset.data_sc[(dataset.data_sc.obs[dataset.celltype_id] == celltype) &
+ (dataset.data_sc.obs[dataset.timepoint_id_col] == dataset.chosen_condition[
+ condition])]
+ individual_coefs_df, individual_coefs_p_df, individual_zscores_df, individual_zscores_p_df = \
+ get_individual_networks_halfratioGenes(
+ data_selected,
+ dataset.individual_id_col,
+ genelist
+ )
+ print(individual_coefs_df.head())
+ save_prefix = Path(
+ 'coeqtl_mapping/input')
+ if not os.path.exists(save_prefix / 'individual_networks' / condition / datasetname):
+ os.mkdir(save_prefix / 'individual_networks' / condition / datasetname)
+ individual_coefs_df.to_csv(
+ save_prefix / 'individual_networks' / condition / datasetname / f'{condition}_{celltype}.genesnonzero0.5.coefs.gz',
+ sep='\t', compression='gzip')
+ individual_zscores_df.to_csv(
+ save_prefix / 'individual_networks' / condition / datasetname / f'{condition}_{celltype}.genesnonzero0.5.zscores.gz',
+ sep='\t', compression='gzip')
+ print("Saved ")
+ return None
+
+
+def argumentsparser():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--datasetname', type=str, dest='datasetname')
+ parser.add_argument('--condition', type=str, dest='condition')
+ return parser
+
+
+def run_get_individual_networks_given_celltype_condition_datasetname():
+ args = argumentsparser().parse_args()
+ print(f"Starting to calculate individual network for {args.datasetname}, {args.celltype}, {args.condition}.")
+ get_individual_networks_given_celltype_condition_datasetname_for_6major_celltypes(condition=args.condition,
+ datasetname=args.datasetname)
+ return None
+
+
+if __name__ == '__main__':
+ run_get_individual_networks_given_celltype_condition_datasetname()
diff --git a/03_celltype_individual_comparison/plot_indiv_subsampling_effect.R b/03_celltype_individual_comparison/plot_indiv_subsampling_effect.R
new file mode 100644
index 0000000..103007e
--- /dev/null
+++ b/03_celltype_individual_comparison/plot_indiv_subsampling_effect.R
@@ -0,0 +1,55 @@
+# ------------------------------------------------------------------------------
+# Plot effect of number of cells on correlation between individuals
+# Input: pairwise comparison of all individuals (Pearson correlation) per cell type
+# and for different numbers of cells
+# (subsampling and calculation done in correlation_subsampling.py)
+# Output: violin plot showing trend
+# ------------------------------------------------------------------------------
+
+library(ggplot2)
+
+theme_set(theme_bw())
+
+# Dataset version; selects the input table and the name of the output PDF
+suffix<-"v3"
+#suffix<-"v2"
+
+# Fill colour per cell type, keyed by the full cell type name
+color_coding <- list()
+color_coding[["CD4+ T"]] <- "#2E9D33"
+color_coding[["CD8+ T"]] <- "#126725"
+color_coding[["Monocyte"]] <- "#EDBA1B"
+color_coding[["NK"]] <- "#E64B50"
+color_coding[["B"]] <- "#009DDB"
+color_coding[["DC"]] <- "#965EC8"
+
+#Full cell type names as reported in the paper
+cell_types_corrected<-setNames(c("CD4+ T","CD8+ T","Monocyte","NK","DC","B"),
+                               c("CD4T","CD8T","monocyte","NK","DC","B"))
+
+#Load results
+res<-read.csv(paste0("co-expression_indivs_subsampled/",
+                     "correlation_individuals_subsampled_1M_",suffix,".csv"))
+res$X<-NULL
+
+# Look the names up by character value: if read.csv returned a factor
+# (default before R 4.0), indexing would use the integer level codes and
+# assign the wrong cell type names
+res$celltype<-cell_types_corrected[as.character(res$celltype)]
+
+#Filter out some values to make it more visible
+res<-res[res$cell_num %in% seq(25,500,50),]
+
+# Discrete x axis: one group of violins per subsampling size
+res$cell_num<-as.factor(res$cell_num)
+
+g<-ggplot(res,aes(x=cell_num,y=corr,fill=celltype))+
+  geom_violin(position = position_dodge(0.9)) +
+  xlab("Subsampled number of cells per individual")+
+  ylab("Correlation between individuals")+
+  ylim(0,1)+
+  # named character vector, as in the companion fitted-lines plot
+  scale_fill_manual("Cell type",values=unlist(color_coding))+
+  theme(axis.title = element_text(size=16),
+        axis.text = element_text(size=14),
+        legend.title = element_text(size=13),
+        legend.text = element_text(size=13),
+        legend.position=c(0.9,0.2))+
+  guides(fill=guide_legend(nrow=3,byrow=FALSE))
+print(g)
+# Spell out the argument names instead of relying on partial matching of
+# `file` to `filename`
+ggsave(filename=paste0("co-expression_indivs_subsampled/plots/subsampling_1M_",
+                       suffix,"_filtered.pdf"),
+       plot=g,
+       width=14,height=4)
diff --git a/04_coeqtl_mapping/.ipynb_checkpoints/examine_bios_replication-checkpoint.ipynb b/04_coeqtl_mapping/.ipynb_checkpoints/examine_bios_replication-checkpoint.ipynb
new file mode 100644
index 0000000..8c5b906
--- /dev/null
+++ b/04_coeqtl_mapping/.ipynb_checkpoints/examine_bios_replication-checkpoint.ipynb
@@ -0,0 +1,1013 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "from scipy.stats import spearmanr\n",
+ "from pathlib import Path\n",
+ "from scipy.stats import t, norm\n",
+ "import seaborn as sns\n",
+ "%matplotlib inline\n",
+ "\n",
+ "def flip_zscore(zscore, coeqtlallele, altaf, altallele):\n",
+ " if not pd.isnull(zscore):\n",
+ " if coeqtlallele == altallele:\n",
+ " coeqtlaf = altaf\n",
+ " else:\n",
+ " coeqtlaf = 1 - altaf\n",
+ " if coeqtlaf > 0.5:\n",
+ " return -zscore\n",
+ " else:\n",
+ " return zscore\n",
+ " else:\n",
+ " return np.nan\n",
+ " \n",
+ "def flip_allele(altaf, altallele, refallele):\n",
+ " if altaf > 0.5:\n",
+ " return refallele\n",
+ " else:\n",
+ " return altallele"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "coeqtl_withbios_prefix = Path(\n",
+ " \"./coeqtl_mapping/output\"\n",
+ ")\n",
+ "filter_type = 'filtered_results'\n",
+ "\n",
+ "def flip_direction(allele1, allele2, zscore2):\n",
+ " if allele1 == allele2:\n",
+ " return zscore2\n",
+ " else:\n",
+ " return -1*zscore2\n",
+ "\n",
+ "\n",
+ "def get_z_score(t_statistic, num):\n",
+ " prob = t.cdf(t_statistic, num - 2)\n",
+ " z_score = norm.ppf(prob)\n",
+ " return z_score"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import seaborn as sns\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib.patches as mpatches"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "color_dict = {'CD4T': '#2E9D33',\n",
+ " 'CD8T': 'darkgreen',\n",
+ " 'monocyte': '#EDBA1B',\n",
+ " 'NK': '#E64B50',\n",
+ " 'DC': '#965EC8',\n",
+ " 'B': '#009DDB',\n",
+ " 'cMono': 'peru',\n",
+ " 'ncMono': 'y',\n",
+ " 'CD4T_individual_100': '#2E9D33',\n",
+ " 'CD4T_individual_50': '#2E9D33',\n",
+ " 'CD4T_50': '#2E9D33',\n",
+ " 'CD4T_150': '#2E9D33',\n",
+ " 'CD4T_250': '#2E9D33'}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "workdir = Path(\"./coeqtl_mapping/\")\n",
+ "bios_replication_filtered_df = pd.read_csv(\n",
+ " workdir/'bios/onlyRNAAlignMetrics_rmLLD/filtered_results/replication_summary.csv', \n",
+ " index_col=0\n",
+ ").set_index('celltype')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "celltype = 'CD4T'\n",
+ "eqtldf = pd.read_csv(\n",
+ " workdir/f'input/snp_selection/eqtl/UT_{celltype}_eQTLProbesFDR0.05-ProbeLevel_withAF.tsv',\n",
+ " sep='\\t'\n",
+ " )\n",
+ "eqtldf['snp_eqtlgene'] = ['_'.join(item) for item in eqtldf[['SNPName', 'genename']].values]\n",
+ "eqtl_allele_af_df = eqtldf.drop_duplicates(subset=['snp_eqtlgene', 'AlleleAssessed', 'AF'])\n",
+ "eqtl_allele_af_dict = eqtl_allele_af_df.set_index('snp_eqtlgene')[['AlleleAssessed', 'AF', 'alt_allele', 'ref_allele']].T.to_dict()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "biostype = 'onlyRNAAlignMetrics_rmLLD'\n",
+ "celltype = 'CD4T'\n",
+ "filter_type = 'filtered_results'\n",
+ "\n",
+ "coeqtl_df = pd.read_csv(\n",
+ " coeqtl_withbios_prefix/filter_type/f'UT_{celltype}/coeqtls_fullresults_fixed.sig.withbios{biostype}.tsv.gz',\n",
+ " compression='gzip', \n",
+ " index_col=0, \n",
+ " sep='\\t')\n",
+ "coeqtl_df = coeqtl_df.dropna(subset=['t_bios'])\n",
+ "coeqtl_df['zscore_bios'] = [get_z_score(item[0], item[1]) for item in \n",
+ " coeqtl_df[['t_bios', \n",
+ " 'num_individuals_bios']].values]\n",
+ "coeqtl_df['flipped_zscore_bios'] = [flip_direction(item[0], item[1], item[2]) for item in \n",
+ " coeqtl_df[['SNPEffectAllele', \n",
+ " 'assessed_allele_bios',\n",
+ " 'zscore_bios']].values]\n",
+ "\n",
+ "isConcordant = lambda x:True if x[0]*x[1] > 0 else False\n",
+ "coeqtl_df['is_concordant'] = [isConcordant(item) for item in \n",
+ " coeqtl_df[['MetaPZ', 'flipped_zscore_bios']].values]\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " snp_genepair | \n",
+ " Gene | \n",
+ " GeneChr | \n",
+ " GenePos | \n",
+ " GeneStrand | \n",
+ " GeneSymbol | \n",
+ " SNP | \n",
+ " SNPChr | \n",
+ " SNPPos | \n",
+ " SNPAlleles | \n",
+ " ... | \n",
+ " gene1_bios | \n",
+ " gene2_bios | \n",
+ " assessed_allele_bios | \n",
+ " num_individuals_bios | \n",
+ " isinteractionterm_bios | \n",
+ " snp_genepair_bios | \n",
+ " corrected_p_bios | \n",
+ " zscore_bios | \n",
+ " flipped_zscore_bios | \n",
+ " is_concordant | \n",
+ "
\n",
+ " \n",
+ " snp_gene1_gene2 | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " rs7605824_SH3YL1_NPM1 | \n",
+ " rs7605824_NPM1;SH3YL1 | \n",
+ " NPM1;SH3YL1 | \n",
+ " 2 | \n",
+ " 217730 | \n",
+ " NaN | \n",
+ " NPM1;SH3YL1 | \n",
+ " rs7605824 | \n",
+ " 2 | \n",
+ " 280819 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SH3YL1 | \n",
+ " NPM1 | \n",
+ " A | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs7605824_NPM1;SH3YL1 | \n",
+ " 0.000000 | \n",
+ " -3.617874 | \n",
+ " -3.617874 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs7605824_SH3YL1_CD48 | \n",
+ " rs7605824_CD48;SH3YL1 | \n",
+ " CD48;SH3YL1 | \n",
+ " 2 | \n",
+ " 217730 | \n",
+ " NaN | \n",
+ " CD48;SH3YL1 | \n",
+ " rs7605824 | \n",
+ " 2 | \n",
+ " 280819 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SH3YL1 | \n",
+ " CD48 | \n",
+ " A | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs7605824_CD48;SH3YL1 | \n",
+ " 0.784422 | \n",
+ " -0.446946 | \n",
+ " -0.446946 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs7605824_SH3YL1_RPS13 | \n",
+ " rs7605824_RPS13;SH3YL1 | \n",
+ " RPS13;SH3YL1 | \n",
+ " 2 | \n",
+ " 217730 | \n",
+ " NaN | \n",
+ " RPS13;SH3YL1 | \n",
+ " rs7605824 | \n",
+ " 2 | \n",
+ " 280819 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SH3YL1 | \n",
+ " RPS13 | \n",
+ " A | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs7605824_RPS13;SH3YL1 | \n",
+ " 0.000000 | \n",
+ " -3.489377 | \n",
+ " -3.489377 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs7605824_SH3YL1_RPL31 | \n",
+ " rs7605824_RPL31;SH3YL1 | \n",
+ " RPL31;SH3YL1 | \n",
+ " 2 | \n",
+ " 217730 | \n",
+ " NaN | \n",
+ " RPL31;SH3YL1 | \n",
+ " rs7605824 | \n",
+ " 2 | \n",
+ " 280819 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SH3YL1 | \n",
+ " RPL31 | \n",
+ " A | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs7605824_RPL31;SH3YL1 | \n",
+ " 0.349601 | \n",
+ " -1.325633 | \n",
+ " -1.325633 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs7605824_SH3YL1_RPL3 | \n",
+ " rs7605824_RPL3;SH3YL1 | \n",
+ " RPL3;SH3YL1 | \n",
+ " 2 | \n",
+ " 217730 | \n",
+ " NaN | \n",
+ " RPL3;SH3YL1 | \n",
+ " rs7605824 | \n",
+ " 2 | \n",
+ " 280819 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SH3YL1 | \n",
+ " RPL3 | \n",
+ " A | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs7605824_RPL3;SH3YL1 | \n",
+ " 0.000000 | \n",
+ " -3.854851 | \n",
+ " -3.854851 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " rs4147638_SMDT1_ACTB | \n",
+ " rs4147638_ACTB;SMDT1 | \n",
+ " ACTB;SMDT1 | \n",
+ " 22 | \n",
+ " 42475695 | \n",
+ " NaN | \n",
+ " ACTB;SMDT1 | \n",
+ " rs4147638 | \n",
+ " 22 | \n",
+ " 42487900 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SMDT1 | \n",
+ " ACTB | \n",
+ " G | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs4147638_ACTB;SMDT1 | \n",
+ " 0.000000 | \n",
+ " -3.748326 | \n",
+ " 3.748326 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs4147638_SMDT1_RPS25 | \n",
+ " rs4147638_RPS25;SMDT1 | \n",
+ " RPS25;SMDT1 | \n",
+ " 22 | \n",
+ " 42475695 | \n",
+ " NaN | \n",
+ " RPS25;SMDT1 | \n",
+ " rs4147638 | \n",
+ " 22 | \n",
+ " 42487900 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SMDT1 | \n",
+ " RPS25 | \n",
+ " G | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs4147638_RPS25;SMDT1 | \n",
+ " 0.000000 | \n",
+ " 5.773036 | \n",
+ " -5.773036 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs4147638_SMDT1_RPS3A | \n",
+ " rs4147638_RPS3A;SMDT1 | \n",
+ " RPS3A;SMDT1 | \n",
+ " 22 | \n",
+ " 42475695 | \n",
+ " NaN | \n",
+ " RPS3A;SMDT1 | \n",
+ " rs4147638 | \n",
+ " 22 | \n",
+ " 42487900 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SMDT1 | \n",
+ " RPS3A | \n",
+ " G | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs4147638_RPS3A;SMDT1 | \n",
+ " 0.000000 | \n",
+ " 4.434777 | \n",
+ " -4.434777 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs4147638_SMDT1_RPS18 | \n",
+ " rs4147638_RPS18;SMDT1 | \n",
+ " RPS18;SMDT1 | \n",
+ " 22 | \n",
+ " 42475695 | \n",
+ " NaN | \n",
+ " RPS18;SMDT1 | \n",
+ " rs4147638 | \n",
+ " 22 | \n",
+ " 42487900 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SMDT1 | \n",
+ " RPS18 | \n",
+ " G | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs4147638_RPS18;SMDT1 | \n",
+ " 0.000000 | \n",
+ " 7.128733 | \n",
+ " -7.128733 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs4147638_SMDT1_RPL11 | \n",
+ " rs4147638_RPL11;SMDT1 | \n",
+ " RPL11;SMDT1 | \n",
+ " 22 | \n",
+ " 42475695 | \n",
+ " NaN | \n",
+ " RPL11;SMDT1 | \n",
+ " rs4147638 | \n",
+ " 22 | \n",
+ " 42487900 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SMDT1 | \n",
+ " RPL11 | \n",
+ " G | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs4147638_RPL11;SMDT1 | \n",
+ " 0.000000 | \n",
+ " 5.896748 | \n",
+ " -5.896748 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
497 rows × 55 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " snp_genepair Gene GeneChr \\\n",
+ "snp_gene1_gene2 \n",
+ "rs7605824_SH3YL1_NPM1 rs7605824_NPM1;SH3YL1 NPM1;SH3YL1 2 \n",
+ "rs7605824_SH3YL1_CD48 rs7605824_CD48;SH3YL1 CD48;SH3YL1 2 \n",
+ "rs7605824_SH3YL1_RPS13 rs7605824_RPS13;SH3YL1 RPS13;SH3YL1 2 \n",
+ "rs7605824_SH3YL1_RPL31 rs7605824_RPL31;SH3YL1 RPL31;SH3YL1 2 \n",
+ "rs7605824_SH3YL1_RPL3 rs7605824_RPL3;SH3YL1 RPL3;SH3YL1 2 \n",
+ "... ... ... ... \n",
+ "rs4147638_SMDT1_ACTB rs4147638_ACTB;SMDT1 ACTB;SMDT1 22 \n",
+ "rs4147638_SMDT1_RPS25 rs4147638_RPS25;SMDT1 RPS25;SMDT1 22 \n",
+ "rs4147638_SMDT1_RPS3A rs4147638_RPS3A;SMDT1 RPS3A;SMDT1 22 \n",
+ "rs4147638_SMDT1_RPS18 rs4147638_RPS18;SMDT1 RPS18;SMDT1 22 \n",
+ "rs4147638_SMDT1_RPL11 rs4147638_RPL11;SMDT1 RPL11;SMDT1 22 \n",
+ "\n",
+ " GenePos GeneStrand GeneSymbol SNP SNPChr \\\n",
+ "snp_gene1_gene2 \n",
+ "rs7605824_SH3YL1_NPM1 217730 NaN NPM1;SH3YL1 rs7605824 2 \n",
+ "rs7605824_SH3YL1_CD48 217730 NaN CD48;SH3YL1 rs7605824 2 \n",
+ "rs7605824_SH3YL1_RPS13 217730 NaN RPS13;SH3YL1 rs7605824 2 \n",
+ "rs7605824_SH3YL1_RPL31 217730 NaN RPL31;SH3YL1 rs7605824 2 \n",
+ "rs7605824_SH3YL1_RPL3 217730 NaN RPL3;SH3YL1 rs7605824 2 \n",
+ "... ... ... ... ... ... \n",
+ "rs4147638_SMDT1_ACTB 42475695 NaN ACTB;SMDT1 rs4147638 22 \n",
+ "rs4147638_SMDT1_RPS25 42475695 NaN RPS25;SMDT1 rs4147638 22 \n",
+ "rs4147638_SMDT1_RPS3A 42475695 NaN RPS3A;SMDT1 rs4147638 22 \n",
+ "rs4147638_SMDT1_RPS18 42475695 NaN RPS18;SMDT1 rs4147638 22 \n",
+ "rs4147638_SMDT1_RPL11 42475695 NaN RPL11;SMDT1 rs4147638 22 \n",
+ "\n",
+ " SNPPos SNPAlleles ... gene1_bios gene2_bios \\\n",
+ "snp_gene1_gene2 ... \n",
+ "rs7605824_SH3YL1_NPM1 280819 G/A ... SH3YL1 NPM1 \n",
+ "rs7605824_SH3YL1_CD48 280819 G/A ... SH3YL1 CD48 \n",
+ "rs7605824_SH3YL1_RPS13 280819 G/A ... SH3YL1 RPS13 \n",
+ "rs7605824_SH3YL1_RPL31 280819 G/A ... SH3YL1 RPL31 \n",
+ "rs7605824_SH3YL1_RPL3 280819 G/A ... SH3YL1 RPL3 \n",
+ "... ... ... ... ... ... \n",
+ "rs4147638_SMDT1_ACTB 42487900 G/A ... SMDT1 ACTB \n",
+ "rs4147638_SMDT1_RPS25 42487900 G/A ... SMDT1 RPS25 \n",
+ "rs4147638_SMDT1_RPS3A 42487900 G/A ... SMDT1 RPS3A \n",
+ "rs4147638_SMDT1_RPS18 42487900 G/A ... SMDT1 RPS18 \n",
+ "rs4147638_SMDT1_RPL11 42487900 G/A ... SMDT1 RPL11 \n",
+ "\n",
+ " assessed_allele_bios num_individuals_bios \\\n",
+ "snp_gene1_gene2 \n",
+ "rs7605824_SH3YL1_NPM1 A 2491.0 \n",
+ "rs7605824_SH3YL1_CD48 A 2491.0 \n",
+ "rs7605824_SH3YL1_RPS13 A 2491.0 \n",
+ "rs7605824_SH3YL1_RPL31 A 2491.0 \n",
+ "rs7605824_SH3YL1_RPL3 A 2491.0 \n",
+ "... ... ... \n",
+ "rs4147638_SMDT1_ACTB G 2491.0 \n",
+ "rs4147638_SMDT1_RPS25 G 2491.0 \n",
+ "rs4147638_SMDT1_RPS3A G 2491.0 \n",
+ "rs4147638_SMDT1_RPS18 G 2491.0 \n",
+ "rs4147638_SMDT1_RPL11 G 2491.0 \n",
+ "\n",
+ " isinteractionterm_bios snp_genepair_bios \\\n",
+ "snp_gene1_gene2 \n",
+ "rs7605824_SH3YL1_NPM1 True rs7605824_NPM1;SH3YL1 \n",
+ "rs7605824_SH3YL1_CD48 True rs7605824_CD48;SH3YL1 \n",
+ "rs7605824_SH3YL1_RPS13 True rs7605824_RPS13;SH3YL1 \n",
+ "rs7605824_SH3YL1_RPL31 True rs7605824_RPL31;SH3YL1 \n",
+ "rs7605824_SH3YL1_RPL3 True rs7605824_RPL3;SH3YL1 \n",
+ "... ... ... \n",
+ "rs4147638_SMDT1_ACTB True rs4147638_ACTB;SMDT1 \n",
+ "rs4147638_SMDT1_RPS25 True rs4147638_RPS25;SMDT1 \n",
+ "rs4147638_SMDT1_RPS3A True rs4147638_RPS3A;SMDT1 \n",
+ "rs4147638_SMDT1_RPS18 True rs4147638_RPS18;SMDT1 \n",
+ "rs4147638_SMDT1_RPL11 True rs4147638_RPL11;SMDT1 \n",
+ "\n",
+ " corrected_p_bios zscore_bios flipped_zscore_bios \\\n",
+ "snp_gene1_gene2 \n",
+ "rs7605824_SH3YL1_NPM1 0.000000 -3.617874 -3.617874 \n",
+ "rs7605824_SH3YL1_CD48 0.784422 -0.446946 -0.446946 \n",
+ "rs7605824_SH3YL1_RPS13 0.000000 -3.489377 -3.489377 \n",
+ "rs7605824_SH3YL1_RPL31 0.349601 -1.325633 -1.325633 \n",
+ "rs7605824_SH3YL1_RPL3 0.000000 -3.854851 -3.854851 \n",
+ "... ... ... ... \n",
+ "rs4147638_SMDT1_ACTB 0.000000 -3.748326 3.748326 \n",
+ "rs4147638_SMDT1_RPS25 0.000000 5.773036 -5.773036 \n",
+ "rs4147638_SMDT1_RPS3A 0.000000 4.434777 -4.434777 \n",
+ "rs4147638_SMDT1_RPS18 0.000000 7.128733 -7.128733 \n",
+ "rs4147638_SMDT1_RPL11 0.000000 5.896748 -5.896748 \n",
+ "\n",
+ " is_concordant \n",
+ "snp_gene1_gene2 \n",
+ "rs7605824_SH3YL1_NPM1 True \n",
+ "rs7605824_SH3YL1_CD48 True \n",
+ "rs7605824_SH3YL1_RPS13 True \n",
+ "rs7605824_SH3YL1_RPL31 True \n",
+ "rs7605824_SH3YL1_RPL3 True \n",
+ "... ... \n",
+ "rs4147638_SMDT1_ACTB True \n",
+ "rs4147638_SMDT1_RPS25 True \n",
+ "rs4147638_SMDT1_RPS3A True \n",
+ "rs4147638_SMDT1_RPS18 True \n",
+ "rs4147638_SMDT1_RPL11 True \n",
+ "\n",
+ "[497 rows x 55 columns]"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "coeqtl_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# flip direction according to AF\n",
+ "coeqtl_df['eqtl_effect_allele'] = [eqtl_allele_af_dict.get(eqtl)['AlleleAssessed'] for eqtl in \n",
+ " coeqtl_df['snp_eqtlgene']]\n",
+ "coeqtl_df['eqtl_alt_af'] = [eqtl_allele_af_dict.get(eqtl)['AF'] for eqtl in coeqtl_df['snp_eqtlgene']]\n",
+ "coeqtl_df['eqtl_alt_allele'] = [eqtl_allele_af_dict.get(eqtl)['alt_allele'] for eqtl in \n",
+ " coeqtl_df['snp_eqtlgene']]\n",
+ "coeqtl_df['eqtl_ref_allele'] = [eqtl_allele_af_dict.get(eqtl)['ref_allele'] for eqtl in \n",
+ " coeqtl_df['snp_eqtlgene']]\n",
+ "coeqtl_df[f'MetaPZ_flippedforAF'] = [flip_zscore(zscore, coeqtlallele, altaf, altallele)\n",
+ " for zscore, coeqtlallele, altaf, altallele in\n",
+ " coeqtl_df[[f'MetaPZ',\n",
+ " f'SNPEffectAllele',\n",
+ " 'eqtl_alt_af',\n",
+ " 'eqtl_alt_allele']].values]\n",
+ "coeqtl_df[f'flipped_zscore_bios_flippedforAF'] = [flip_zscore(zscore, coeqtlallele, altaf, altallele)\n",
+ " for zscore, coeqtlallele, altaf, altallele in\n",
+ " coeqtl_df[[f'flipped_zscore_bios',\n",
+ " f'SNPEffectAllele',\n",
+ " 'eqtl_alt_af',\n",
+ " 'eqtl_alt_allele']].values]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9637681159420289\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Text(3, -5, 'Concordance = 0.96\\nrb = 0.61')"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "coeqtl_sig = coeqtl_df[coeqtl_df['corrected_p_bios']<=0.05]\n",
+ "coeqtl_nonsig = coeqtl_df[coeqtl_df['corrected_p_bios']>0.05]\n",
+ "plt.figure(figsize=(5, 5))\n",
+ "plt.scatter(coeqtl_nonsig['MetaPZ_flippedforAF'], \n",
+ " coeqtl_nonsig['flipped_zscore_bios_flippedforAF'], \n",
+ " label='Insignificant',\n",
+ " edgecolor='gray',\n",
+ " facecolor='white', alpha=1)\n",
+ "plt.scatter(coeqtl_sig['MetaPZ_flippedforAF'],\n",
+ " coeqtl_sig['flipped_zscore_bios_flippedforAF'], \n",
+ " label='Significant',\n",
+ " edgecolor=color_dict[celltype],\n",
+ " facecolor=color_dict[celltype], alpha=1)\n",
+ "plt.plot([-15, 12], [0, 0], linestyle='--', color='lightgray')\n",
+ "plt.plot([0, 0], [-6.5, 4], linestyle='--', color='lightgray')\n",
+ "plt.legend()\n",
+ "\n",
+ "concordance_rate = coeqtl_sig[coeqtl_sig['is_concordant']].shape[0] / coeqtl_sig.shape[0]\n",
+ "print(concordance_rate)\n",
+ "\n",
+ "celltype_rb = bios_replication_filtered_df.loc[celltype]['r']\n",
+ "plt.text(3, -5, f'Concordance = {concordance_rate:.2f}\\nrb = {celltype_rb:.2f}')\n",
+ "\n",
+ "# plt.savefig('bios_replication.cd4t.filtered_results.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "def plot_ci_manual(t, s_err, n, x, x2, y2, ax=None):\n",
+ "    r\"\"\"Return an axes of confidence bands using a simple approach.\n",
+ "\n",
+ "    The band ``y2 +/- ci`` is shaded in gray on ``ax`` (current axes when\n",
+ "    ``ax`` is None).  ``t`` is the critical t value, ``s_err`` the standard\n",
+ "    deviation of the residuals, ``n`` the number of observations, ``x`` the\n",
+ "    observed x values and ``x2`` / ``y2`` the points of the fitted line.\n",
+ "\n",
+ "    Notes\n",
+ "    -----\n",
+ "    .. math:: \\left| \\: \\hat{\\mu}_{y|x0} - \\mu_{y|x0} \\: \\right| \\; \\leq \\; T_{n-2}^{.975} \\; \\hat{\\sigma} \\; \\sqrt{\\frac{1}{n}+\\frac{(x_0-\\bar{x})^2}{\\sum_{i=1}^n{(x_i-\\bar{x})^2}}}\n",
+ "    .. math:: \\hat{\\sigma} = \\sqrt{\\sum_{i=1}^n{\\frac{(y_i-\\hat{y})^2}{n-2}}}\n",
+ "\n",
+ "    References\n",
+ "    ----------\n",
+ "    .. [1] M. Duarte. \"Curve fitting,\" Jupyter Notebook.\n",
+ "       http://nbviewer.ipython.org/github/demotu/BMC/blob/master/notebooks/CurveFitting.ipynb\n",
+ "\n",
+ "    \"\"\"\n",
+ "    if ax is None:\n",
+ "        ax = plt.gca()\n",
+ "\n",
+ "    ci = t * s_err * np.sqrt(1/n + (x2 - np.mean(x))**2 / np.sum((x - np.mean(x))**2))\n",
+ "    ax.fill_between(x2, y2 + ci, y2 - ci, alpha=0.1, color='gray')\n",
+ "    return ax\n",
+ "\n",
+ "from scipy import stats\n",
+ "def equation(a, b):\n",
+ "    \"\"\"Return a 1D polynomial.\"\"\"\n",
+ "    return np.polyval(a, b)\n",
+ "\n",
+ "x=coeqtl_df['MetaPZ_flippedforAF']\n",
+ "y=coeqtl_df['flipped_zscore_bios_flippedforAF']\n",
+ "\n",
+ "p, cov = np.polyfit(x, y, 1, cov=True)                     # parameters and covariance from of the fit of 1-D polynom.\n",
+ "y_model = equation(p, x)\n",
+ "# Statistics\n",
+ "n = y.size                                                 # number of observations\n",
+ "m = p.size                                                 # number of parameters\n",
+ "dof = n - m                                                # degrees of freedom\n",
+ "# Named t_crit, not t: the notebook imports the t distribution as `t` from\n",
+ "# scipy.stats and get_z_score() calls t.cdf, so rebinding `t` to a float here\n",
+ "# breaks every get_z_score() call made after this cell has run.\n",
+ "t_crit = stats.t.ppf(0.975, n - m)                         # used for CI and PI bands\n",
+ "# Estimates of Error in Data/Model\n",
+ "resid = y - y_model\n",
+ "chi2 = np.sum((resid / y_model)**2)                        # chi-squared; estimates error in data\n",
+ "chi2_red = chi2 / dof                                      # reduced chi-squared; measures goodness of fit\n",
+ "s_err = np.sqrt(np.sum(resid**2) / dof)                    # standard deviation of the error\n",
+ "\n",
+ "# Plotting --------------------------------------------------------------------\n",
+ "fig, ax = plt.subplots(figsize=(5, 5))\n",
+ "# Data\n",
+ "ax.scatter(\n",
+ "    x, y\n",
+ ")\n",
+ "\n",
+ "\n",
+ "# Fit\n",
+ "ax.plot(x, y_model, \"-\", color=\"0.1\", linewidth=1.5, alpha=0.5, label=\"Fit\")\n",
+ "\n",
+ "x2 = np.linspace(np.min(x), np.max(x), 100)\n",
+ "y2 = equation(p, x2)\n",
+ "\n",
+ "# Confidence Interval (select one)\n",
+ "plot_ci_manual(t_crit, s_err, n, x, x2, y2, ax=ax)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":19: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " coeqtl_sig['celltype'] = celltype\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Replication in BIOS of the significant co-eQTLs: one scatter panel per cell type.\n",
+ "# NOTE(review): this cell was labelled \"withbiostechnicalandcelltypePICs\", but the file read\n",
+ "# below is the \"withbiosonlyRNAAlignMetrics_rmLLD\" one - confirm which is intended.\n",
+ "# Needs coeqtl_withbios_prefix, filter_type, eqtl_allele_af_dict, get_z_score, flip_direction\n",
+ "# and flip_zscore from earlier cells.\n",
+ "fig, axes = plt.subplots(2, 3, figsize=(15, 10), sharex=True, sharey=True)\n",
+ "celltypes = ['CD4T', 'CD8T', 'monocyte', 'NK', 'B', 'DC']\n",
+ "sig_frames = []  # BIOS-significant rows per cell type; concatenated once after the loop\n",
+ "for i in range(2):\n",
+ "    for j in range(3):\n",
+ "        celltype = celltypes[i*3+j]\n",
+ "        coeqtl_df = pd.read_csv(\n",
+ "            coeqtl_withbios_prefix/filter_type/f'UT_{celltype}/coeqtls_fullresults_fixed.sig.withbiosonlyRNAAlignMetrics_rmLLD.tsv.gz',\n",
+ "            compression='gzip', index_col=0, sep='\\t')\n",
+ "        coeqtl_df['zscore_bios'] = [get_z_score(item[0], item[1]) for item in \n",
+ "                                    coeqtl_df[['t_bios', \n",
+ "                                               'num_individuals_bios']].values]\n",
+ "        coeqtl_df['flipped_zscore_bios'] = [flip_direction(item[0], item[1], item[2]) for item in \n",
+ "                                            coeqtl_df[['SNPEffectAllele', \n",
+ "                                                       'assessed_allele_bios',\n",
+ "                                                       'zscore_bios']].values]\n",
+ "        # flip the direction according to AF\n",
+ "        coeqtl_df['eqtl_effect_allele'] = [eqtl_allele_af_dict.get(eqtl)['AlleleAssessed'] for eqtl in \n",
+ "                                           coeqtl_df['snp_eqtlgene']]\n",
+ "        coeqtl_df['eqtl_alt_af'] = [eqtl_allele_af_dict.get(eqtl)['AF'] for eqtl in coeqtl_df['snp_eqtlgene']]\n",
+ "        coeqtl_df['eqtl_alt_allele'] = [eqtl_allele_af_dict.get(eqtl)['alt_allele'] for eqtl in \n",
+ "                                        coeqtl_df['snp_eqtlgene']]\n",
+ "        coeqtl_df['eqtl_ref_allele'] = [eqtl_allele_af_dict.get(eqtl)['ref_allele'] for eqtl in \n",
+ "                                        coeqtl_df['snp_eqtlgene']]\n",
+ "        coeqtl_df['MetaPZ_flippedforAF'] = [flip_zscore(zscore, coeqtlallele, altaf, altallele)\n",
+ "                                            for zscore, coeqtlallele, altaf, altallele in\n",
+ "                                            coeqtl_df[['MetaPZ',\n",
+ "                                                       'SNPEffectAllele',\n",
+ "                                                       'eqtl_alt_af',\n",
+ "                                                       'eqtl_alt_allele']].values]\n",
+ "        coeqtl_df['flipped_zscore_bios_flippedforAF'] = [flip_zscore(zscore, coeqtlallele, altaf, altallele)\n",
+ "                                                         for zscore, coeqtlallele, altaf, altallele in\n",
+ "                                                         coeqtl_df[['flipped_zscore_bios',\n",
+ "                                                                    'SNPEffectAllele',\n",
+ "                                                                    'eqtl_alt_af',\n",
+ "                                                                    'eqtl_alt_allele']].values]\n",
+ "        ## end flip\n",
+ "        # Two explicit masks: rows with a missing corrected p-value fall in neither group.\n",
+ "        is_sig = coeqtl_df['corrected_p_bios']<=0.05\n",
+ "        not_sig = coeqtl_df['corrected_p_bios']>0.05\n",
+ "        # .copy() makes an independent frame, so adding the column below no longer writes to a\n",
+ "        # slice of coeqtl_df (that raised SettingWithCopyWarning).\n",
+ "        coeqtl_sig = coeqtl_df[is_sig].copy()\n",
+ "        coeqtl_sig['celltype'] = celltype\n",
+ "        sig_frames.append(coeqtl_sig)\n",
+ "        n_tested, n_sig = coeqtl_df.shape[0], coeqtl_sig.shape[0]\n",
+ "        # Report nan instead of failing with ZeroDivisionError when a cell type has no rows.\n",
+ "        significant_ratio = n_sig / n_tested if n_tested else float('nan')\n",
+ "        coeqtl_sig_samedirection = coeqtl_sig[((coeqtl_sig['MetaPZ']>0) & (coeqtl_sig['flipped_zscore_bios']>0)) | \n",
+ "                                              ((coeqtl_sig['MetaPZ']<0) & (coeqtl_sig['flipped_zscore_bios']<0))]\n",
+ "        consistent_ratio = coeqtl_sig_samedirection.shape[0] / n_sig if n_sig else float('nan')\n",
+ "        # draw\n",
+ "        ax = axes[i][j]\n",
+ "        ax.scatter(coeqtl_df['MetaPZ'][not_sig], \n",
+ "                   coeqtl_df['flipped_zscore_bios'][not_sig], alpha=0.5,\n",
+ "                   label='Non-sig')\n",
+ "        ax.scatter(coeqtl_df['MetaPZ'][is_sig],\n",
+ "                   coeqtl_df['flipped_zscore_bios'][is_sig], alpha=0.5,\n",
+ "                   label='Sig')\n",
+ "        ax.set_xlabel('single cell')\n",
+ "        ax.set_ylabel('BIOS')\n",
+ "        ax.set_title(celltype)\n",
+ "        ax.text(-2, -8, \n",
+ "                f'Significant ratio: {significant_ratio:.2f}\\nConcordance ratio: {consistent_ratio:.2f}')\n",
+ "# Reversed so the row order matches the former per-iteration prepend (last cell type first).\n",
+ "sig_df = pd.concat(sig_frames[::-1], axis=0)\n",
+ "# Only the last panel drawn receives the legend.\n",
+ "ax.legend(loc='upper left')\n",
+ "\n",
+ "# plt.savefig('bios_replication.filtered_results.scatterplots.pdf')\n",
+ "# plt.savefig('bios_replication.filtered_results.scatterplots.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":19: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " coeqtl_sig['celltype'] = celltype\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# unfiltered results \n",
+ "# withbiosonlyRNAAlignMetrics_rmLLD\n",
+ "# One scatter panel per cell type. Needs coeqtl_withbios_prefix, get_z_score and\n",
+ "# flip_direction from earlier cells.\n",
+ "fig, axes = plt.subplots(2, 3, figsize=(15, 10), sharex=True, sharey=True)\n",
+ "celltypes = ['CD4T', 'CD8T', 'monocyte', 'NK', 'B', 'DC']\n",
+ "sig_frames = []  # BIOS-significant rows per cell type; concatenated once after the loop\n",
+ "for i in range(2):\n",
+ "    for j in range(3):\n",
+ "        celltype = celltypes[i*3+j]\n",
+ "        coeqtl_df = pd.read_csv(\n",
+ "            coeqtl_withbios_prefix/'unfiltered_results'/f'UT_{celltype}/coeqtls_fullresults_fixed.sig.withbiosonlyRNAAlignMetrics_rmLLD.tsv.gz',\n",
+ "            compression='gzip', index_col=0, sep='\\t')\n",
+ "        coeqtl_df['zscore_bios'] = [get_z_score(item[0], item[1]) for item in \n",
+ "                                    coeqtl_df[['t_bios', \n",
+ "                                               'num_individuals_bios']].values]\n",
+ "        coeqtl_df['flipped_zscore_bios'] = [flip_direction(item[0], item[1], item[2]) for item in \n",
+ "                                            coeqtl_df[['SNPEffectAllele', \n",
+ "                                                       'assessed_allele_bios',\n",
+ "                                                       'zscore_bios']].values]\n",
+ "        # Two explicit masks: rows with a missing corrected p-value fall in neither group.\n",
+ "        is_sig = coeqtl_df['corrected_p_bios']<=0.05\n",
+ "        not_sig = coeqtl_df['corrected_p_bios']>0.05\n",
+ "        # .copy() makes an independent frame, so adding the column below no longer writes to a\n",
+ "        # slice of coeqtl_df (that raised SettingWithCopyWarning).\n",
+ "        coeqtl_sig = coeqtl_df[is_sig].copy()\n",
+ "        coeqtl_sig['celltype'] = celltype\n",
+ "        sig_frames.append(coeqtl_sig)\n",
+ "        # draw\n",
+ "        ax = axes[i][j]\n",
+ "        ax.scatter(coeqtl_df['MetaPZ'][not_sig], \n",
+ "                   coeqtl_df['flipped_zscore_bios'][not_sig], alpha=0.5,\n",
+ "                   label='Non-sig')\n",
+ "        ax.scatter(coeqtl_df['MetaPZ'][is_sig],\n",
+ "                   coeqtl_df['flipped_zscore_bios'][is_sig], alpha=0.5,\n",
+ "                   label='Sig')\n",
+ "        ax.set_xlabel('single cell')\n",
+ "        ax.set_ylabel('BIOS')\n",
+ "        ax.set_title(celltype)\n",
+ "# Reversed so the row order matches the former per-iteration prepend (last cell type first).\n",
+ "sig_df = pd.concat(sig_frames[::-1], axis=0)\n",
+ "# Only the last panel drawn receives the legend.\n",
+ "ax.legend(loc='upper left')\n",
+ "# plt.savefig('bios_replication.unfiltered_results.scatterplots.pdf')\n",
+ "# plt.savefig('bios_replication.unfiltered_results.scatterplots.png', dpi=300)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/04_coeqtl_mapping/.ipynb_checkpoints/plot_example_imputed_zero-checkpoint.ipynb b/04_coeqtl_mapping/.ipynb_checkpoints/plot_example_imputed_zero-checkpoint.ipynb
new file mode 100644
index 0000000..fc28e77
--- /dev/null
+++ b/04_coeqtl_mapping/.ipynb_checkpoints/plot_example_imputed_zero-checkpoint.ipynb
@@ -0,0 +1,571 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import os\n",
+ "import re\n",
+ "from pathlib import Path\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import scanpy as sc\n",
+ "from scipy.stats import spearmanr, pearsonr\n",
+ "from scipy.stats import t, norm\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "\n",
+ "def get_time(x):\n",
+ "    \"\"\"Return 'UT' unchanged; for any other label return its first '<digits>h' token.\n",
+ "\n",
+ "    Raises IndexError when a non-'UT' label contains no such token.\n",
+ "    \"\"\"\n",
+ "    if x != 'UT':\n",
+ "        hour_tokens = re.findall(r'\\d+h', x)\n",
+ "        return hour_tokens[0]\n",
+ "    return x\n",
+ "\n",
+ "\n",
+ "class DATASET:\n",
+ "    \"\"\"File name and ``obs`` column names of one of the supported single-cell datasets.\n",
+ "\n",
+ "    Supported names: 'onemillionv2', 'onemillionv3', 'stemiv2' and 'ng'. Any other name\n",
+ "    raises IOError on construction.\n",
+ "\n",
+ "    Attributes set by ``get_information``: ``path``, ``individual_id_col``, ``celltype_id`` and,\n",
+ "    for every dataset except 'ng', ``timepoint_id_col`` and ``chosen_condition``.\n",
+ "    ``information`` holds the same values as a dict.\n",
+ "    \"\"\"\n",
+ "    def __init__(self, datasetname):\n",
+ "        self.name = datasetname\n",
+ "        self.path_prefix = Path(\"./seurat_objects\")\n",
+ "        self.information = self.get_information()\n",
+ "\n",
+ "    def get_information(self):\n",
+ "        \"\"\"Set the dataset-specific attributes on ``self`` and return them as a dict.\n",
+ "\n",
+ "        Returning the dict is what gives ``self.information`` a value; without a return\n",
+ "        statement it was always None.\n",
+ "\n",
+ "        Raises\n",
+ "        ------\n",
+ "        IOError\n",
+ "            If ``self.name`` is not one of the supported dataset names.\n",
+ "        \"\"\"\n",
+ "        if self.name == 'onemillionv2':\n",
+ "            info = {'path': '1M_v2_mediumQC_ctd_rnanormed_demuxids_20201029.sct.h5ad',\n",
+ "                    'individual_id_col': 'assignment',\n",
+ "                    'timepoint_id_col': 'time',\n",
+ "                    'celltype_id': 'cell_type_lowerres',\n",
+ "                    'chosen_condition': {'UT': 'UT',\n",
+ "                                         'stimulated': '3h'}}\n",
+ "        elif self.name == 'onemillionv3':\n",
+ "            info = {'path': '1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.SCT.h5ad',\n",
+ "                    'individual_id_col': 'assignment',\n",
+ "                    'timepoint_id_col': 'time',\n",
+ "                    'celltype_id': 'cell_type_lowerres',\n",
+ "                    'chosen_condition': {'UT': 'UT',\n",
+ "                                         'stimulated': '3h'}}\n",
+ "        elif self.name == 'stemiv2':\n",
+ "            info = {'path': 'cardio.integrated.20210301.stemiv2.h5ad',\n",
+ "                    'individual_id_col': 'assignment.final',\n",
+ "                    'timepoint_id_col': 'timepoint.final',\n",
+ "                    'celltype_id': 'cell_type_lowerres',\n",
+ "                    'chosen_condition': {'UT': 't8w',\n",
+ "                                         'stimulated': 'Baseline'}}\n",
+ "        elif self.name == 'ng':\n",
+ "            # No timepoint column / condition mapping is defined for this dataset.\n",
+ "            info = {'path': 'pilot3_seurat3_200420_sct_azimuth.h5ad',\n",
+ "                    'individual_id_col': 'snumber',\n",
+ "                    'celltype_id': 'cell_type_mapped_to_onemillion'}\n",
+ "        else:\n",
+ "            raise IOError(\"Dataset name not understood.\")\n",
+ "        for attr_name, attr_value in info.items():\n",
+ "            setattr(self, attr_name, attr_value)\n",
+ "        return info\n",
+ "\n",
+ "    def load_dataset(self):\n",
+ "        \"\"\"Read the h5ad file into ``self.data_sc`` and add the derived ``obs`` columns.\n",
+ "\n",
+ "        'onemillion*' datasets get a 'time' column computed with ``get_time`` from 'timepoint';\n",
+ "        'ng' gets 'cell_type_mapped_to_onemillion' mapped from 'predicted.celltype.l1'\n",
+ "        (labels missing from the mapping become None).\n",
+ "        \"\"\"\n",
+ "        self.get_information()\n",
+ "        print(f'Loading dataset {self.name} from {self.path_prefix} {self.path}')\n",
+ "        self.data_sc = sc.read_h5ad(self.path_prefix / self.path)\n",
+ "        if self.name.startswith('onemillion'):\n",
+ "            self.data_sc.obs['time'] = [get_time(item) for item in self.data_sc.obs['timepoint']]\n",
+ "        elif self.name == 'ng':\n",
+ "            celltype_mapping = {'CD4 T': 'CD4T', 'CD8 T': 'CD8T', 'Mono': 'monocyte', 'DC': 'DC', 'NK': 'NK',\n",
+ "                                'other T': 'otherT', 'other': 'other', 'B': 'B'}\n",
+ "            self.data_sc.obs['cell_type_mapped_to_onemillion'] = [celltype_mapping.get(name) for name in\n",
+ "                                                                  self.data_sc.obs['predicted.celltype.l1']]\n",
+ "\n",
+ "    def get_cMono_ncMono(self):\n",
+ "        \"\"\"Store the classical / non-classical monocyte subsets as ``self.cmono`` / ``self.ncmono``.\n",
+ "\n",
+ "        Requires ``load_dataset`` to have been called. Raises IOError for a dataset name\n",
+ "        that none of the branches handles.\n",
+ "        \"\"\"\n",
+ "        def tell_cmono_foronemillion(x):\n",
+ "            # 'mono 1', 'mono 3', 'mono 4' -> cMono; 'mono 2' -> ncMono; anything else -> None.\n",
+ "            if x in ('mono 1', 'mono 3', 'mono 4'):\n",
+ "                return 'cMono'\n",
+ "            elif x == 'mono 2':\n",
+ "                return 'ncMono'\n",
+ "        if self.name.startswith('onemillion'):\n",
+ "            self.data_sc.obs['sub_monocytes'] = [tell_cmono_foronemillion(x) for x in\n",
+ "                                                 self.data_sc.obs['cell_type']]\n",
+ "            self.cmono = self.data_sc[self.data_sc.obs['sub_monocytes'] == 'cMono']\n",
+ "            self.ncmono = self.data_sc[self.data_sc.obs['sub_monocytes'] == 'ncMono']\n",
+ "        elif self.name.startswith('stemi'):\n",
+ "            self.cmono = self.data_sc[self.data_sc.obs['cell_type'] == 'cMono']\n",
+ "            self.ncmono = self.data_sc[self.data_sc.obs['cell_type'] == 'ncMono']\n",
+ "        elif self.name == 'ng':\n",
+ "            self.cmono = self.data_sc[self.data_sc.obs['predicted.celltype.l2'] == 'CD14 Mono']\n",
+ "            self.ncmono = self.data_sc[self.data_sc.obs['predicted.celltype.l2'] == 'CD16 Mono']\n",
+ "        else:\n",
+ "            raise IOError(\"Dataset name not understood.\")\n",
+ "\n",
+ "# Output directory for the example files and figures written by this notebook.\n",
+ "# NOTE(review): absolute cluster path - the notebook only runs where this location exists.\n",
+ "example_savedir = Path(\n",
+ "    \"/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/examples\"\n",
+ ")\n",
+ "\n",
+ "import subprocess\n",
+ "# Shell script passed as the executable to get_snps_from_vcffile (absolute cluster path as well).\n",
+ "bashfile_path = '/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/bios/select_snps_from_vcf.sh'\n",
+ "def get_snps_from_vcffile(bashfile_path, vcf_path, snps_path, savepath):\n",
+ "    \"\"\"Run ``bashfile_path`` with the three paths as arguments and print the CompletedProcess.\n",
+ "\n",
+ "    The return code is only printed, not checked, so a failing script does not raise.\n",
+ "    Always returns None.\n",
+ "    \"\"\"\n",
+ "    command = [bashfile_path, vcf_path, snps_path, savepath]\n",
+ "    print(subprocess.run(command))\n",
+ "\n",
+ "# sample id mapping: expression sample id -> genotype sample id, read from a tab-separated file\n",
+ "# (absolute cluster path).\n",
+ "gtefile = pd.read_csv(\n",
+ "    '/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/summary/gte-fix.tsv',\n",
+ "    sep='\\t'\n",
+ ")\n",
+ "# Dict keyed by the 'expressionsampleID' column with 'genotypesampleID' as value.\n",
+ "# NOTE(review): the .T is presumably a no-op here (single selected column) - confirm and drop.\n",
+ "gte_dict = gtefile.set_index(\"expressionsampleID\")[\"genotypesampleID\"].T.to_dict()\n",
+ "\n",
+ "\n",
+ "def corr_to_z(coef, num):\n",
+ "    \"\"\"Convert correlation coefficients to z-scores and one-sided p-values.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    coef : float or array-like\n",
+ "        Correlation coefficient(s). Scalars and lists are accepted as well as arrays;\n",
+ "        the input is never modified.\n",
+ "    num : int\n",
+ "        Number of observations behind each coefficient (``num - 2`` degrees of freedom).\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    z_score : numpy.ndarray\n",
+ "        ``norm.ppf`` of the t-distribution CDF of the t statistic. Coefficients of exactly\n",
+ "        +/-1 give an infinite t statistic and therefore an infinite z-score.\n",
+ "    probs : numpy.ndarray\n",
+ "        Upper-tail probability for positive coefficients, lower-tail probability for\n",
+ "        negative ones, and the sum of both tails (1) for a coefficient of exactly 0.\n",
+ "    \"\"\"\n",
+ "    coef = np.asarray(coef, dtype=float)\n",
+ "    t_statistic = coef * np.sqrt((num - 2) / (1 - coef ** 2))\n",
+ "    prob = t.cdf(t_statistic, num - 2)\n",
+ "    z_score = norm.ppf(prob)\n",
+ "    # np.where builds the result without item assignment, so 0-d input works and ``prob``\n",
+ "    # is not overwritten in place. The last branch is the sum of both tails, which is what\n",
+ "    # the coefficient-equals-zero (and NaN) case evaluated to before.\n",
+ "    probs = np.where(coef > 0, 1 - prob,\n",
+ "                     np.where(coef < 0, prob, prob + (1 - prob)))\n",
+ "    return z_score, probs\n",
+ "\n",
+ "\n",
+ "def get_individual_networks_selected_genepairs(data_df, data_sc, individual_colname, genepair, fillna=False):\n",
+ "    \"\"\"Per-individual Spearman correlation of one gene pair, converted to z-scores.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    data_df : pandas.DataFrame\n",
+ "        Expression values with genes as columns; its rows are selected with boolean masks\n",
+ "        built from ``data_sc.obs``, so both must share the same row index.\n",
+ "    data_sc : object with an ``obs`` data frame\n",
+ "        Supplies the individual id of every row of ``data_df``.\n",
+ "    individual_colname : str\n",
+ "        Column of ``data_sc.obs`` holding the individual id.\n",
+ "    genepair : str\n",
+ "        Two gene names joined by ';'.\n",
+ "    fillna : bool, optional\n",
+ "        Replace missing z-scores by 0 before returning.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    data_selected_df : pandas.DataFrame\n",
+ "        The two expression columns of the pair.\n",
+ "    zscore_df, zscore_p_df : pandas.DataFrame\n",
+ "        One row, labelled with the alphabetically sorted pair, and one column per individual\n",
+ "        that has more than 10 cells.\n",
+ "\n",
+ "    The correlation frames (``coef_df``, ``coef_p_df``) are filled but not returned.\n",
+ "    \"\"\"\n",
+ "    gene1, gene2 = genepair.split(';')\n",
+ "    sorted_genepair = [';'.join(sorted([gene1, gene2]))]\n",
+ "    coef_df = pd.DataFrame(index=sorted_genepair)\n",
+ "    coef_p_df = pd.DataFrame(index=sorted_genepair)\n",
+ "    zscore_df = pd.DataFrame(index=sorted_genepair)\n",
+ "    zscore_p_df = pd.DataFrame(index=sorted_genepair)\n",
+ "    data_selected_df = data_df[[gene1, gene2]]\n",
+ "    print(\n",
+ "        f\"Calculating networks for {len(data_sc.obs[individual_colname].unique())} individuals and;\\n{genepair}\"\n",
+ "    )\n",
+ "    for ind_id in tqdm(data_sc.obs[individual_colname].unique()):\n",
+ "        cell_num = data_sc.obs[data_sc.obs[individual_colname] == ind_id].shape[0]\n",
+ "        if cell_num > 10:\n",
+ "            individual_df = data_selected_df.loc[data_sc.obs[individual_colname] == ind_id]\n",
+ "            individual_coefs, individual_coef_ps = spearmanr(individual_df.values, axis=0)\n",
+ "            if data_selected_df.shape[1] == 2:\n",
+ "                # Two columns: spearmanr returns a single coefficient and p-value.\n",
+ "                individual_coefs_flatten = pd.DataFrame(data = [individual_coefs],\n",
+ "                                                        index = sorted_genepair)\n",
+ "                individual_coef_ps_flatten = \\\n",
+ "                    pd.DataFrame(data=[individual_coef_ps],\n",
+ "                                 index=sorted_genepair)\n",
+ "            else:\n",
+ "                # More than two columns: spearmanr returns matrices; keep the upper triangle.\n",
+ "                individual_coefs_flatten = pd.DataFrame(\n",
+ "                    data=individual_coefs[np.triu_indices_from(individual_coefs, 1)],\n",
+ "                    index=sorted_genepair).loc[sorted_genepair]\n",
+ "                individual_coef_ps_flatten = \\\n",
+ "                    pd.DataFrame(data=individual_coef_ps[np.triu_indices_from(individual_coefs, 1)],\n",
+ "                                 index=sorted_genepair).loc[sorted_genepair]\n",
+ "            coef_df[ind_id] = individual_coefs_flatten\n",
+ "            coef_p_df[ind_id] = individual_coef_ps_flatten\n",
+ "            try:\n",
+ "                individual_zscores_flatten, individual_zscore_ps_flatten = corr_to_z(\n",
+ "                    individual_coefs_flatten.values,\n",
+ "                    cell_num\n",
+ "                )\n",
+ "                zscore_df[ind_id] = individual_zscores_flatten\n",
+ "                zscore_p_df[ind_id] = individual_zscore_ps_flatten\n",
+ "            except Exception:\n",
+ "                # Best effort: an individual whose z-score cannot be computed is left out.\n",
+ "                # Exception (not a bare except) so Ctrl-C / SystemExit still stop the loop.\n",
+ "                continue\n",
+ "        else:\n",
+ "            print(\"Deleted this individual because of low cell number\", cell_num)\n",
+ "    if fillna:\n",
+ "        zscore_df = zscore_df.fillna(0)\n",
+ "    return data_selected_df, zscore_df, zscore_p_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loading dataset onemillionv2 from /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/seurat_objects 1M_v2_mediumQC_ctd_rnanormed_demuxids_20201029.sct.h5ad\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load the dataset; data_sc is the object DATASET.load_dataset read with sc.read_h5ad.\n",
+ "datasetname = 'onemillionv2'\n",
+ "dataset = DATASET(datasetname)\n",
+ "dataset.load_dataset()\n",
+ "data_sc = dataset.data_sc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CompletedProcess(args=['/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/bios/select_snps_from_vcf.sh', '/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/genotypevcfs/chr1/GenotypeData.vcf.gz', PosixPath('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/examples/snplist.rs221045'), PosixPath('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/examples/rs221045.vcf')], returncode=0)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " #CHROM | \n",
+ " POS | \n",
+ " ID | \n",
+ " REF | \n",
+ " ALT | \n",
+ " QUAL | \n",
+ " FILTER | \n",
+ " INFO | \n",
+ " FORMAT | \n",
+ " 1_LLDeep_1191 | \n",
+ " ... | \n",
+ " s21 | \n",
+ " s43 | \n",
+ " s24 | \n",
+ " s23 | \n",
+ " s45 | \n",
+ " s26 | \n",
+ " s25 | \n",
+ " s28 | \n",
+ " s27 | \n",
+ " s29 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 16530049 | \n",
+ " rs221045 | \n",
+ " T | \n",
+ " C | \n",
+ " . | \n",
+ " . | \n",
+ " . | \n",
+ " GT:DS | \n",
+ " 0/0:0.03 | \n",
+ " ... | \n",
+ " 0/1:1.0 | \n",
+ " 0/0:0.010000000000000009 | \n",
+ " 0/1:1.0 | \n",
+ " 0/0:0.0 | \n",
+ " 0/0:0.0 | \n",
+ " 1/1:2.0 | \n",
+ " 0/0:0.0 | \n",
+ " 0/1:1.0 | \n",
+ " 0/0:0.0 | \n",
+ " 0/1:1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1 rows × 182 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1_LLDeep_1191 \\\n",
+ "0 1 16530049 rs221045 T C . . . GT:DS 0/0:0.03 \n",
+ "\n",
+ " ... s21 s43 s24 s23 s45 s26 \\\n",
+ "0 ... 0/1:1.0 0/0:0.010000000000000009 0/1:1.0 0/0:0.0 0/0:0.0 1/1:2.0 \n",
+ "\n",
+ " s25 s28 s27 s29 \n",
+ "0 0/0:0.0 0/1:1.0 0/0:0.0 0/1:1.0 \n",
+ "\n",
+ "[1 rows x 182 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Extract the genotypes of one SNP from the per-chromosome VCF and load them as a data frame.\n",
+ "celltype = 'monocyte'\n",
+ "snp_id = 'rs221045'\n",
+ "chromosome = '1'\n",
+ "snp_vcf_path = example_savedir/f'{snp_id}.vcf'\n",
+ "snplist_path = example_savedir/f'snplist.{snp_id}'\n",
+ "# Write the single SNP id to the list file handed to the selection script.\n",
+ "with open(snplist_path, 'w') as f:\n",
+ "    f.write(f'{snp_id}\\n')\n",
+ "vcf_path = f'/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/genotypevcfs/chr{chromosome}/GenotypeData.vcf.gz'\n",
+ "get_snps_from_vcffile(bashfile_path, vcf_path, snplist_path, snp_vcf_path)\n",
+ "# Skip however many '##' meta-information lines precede the '#CHROM' header row\n",
+ "# (this used to be hard-coded as skiprows=6).\n",
+ "n_meta_lines = 0\n",
+ "with open(snp_vcf_path) as vcf_handle:\n",
+ "    for vcf_line in vcf_handle:\n",
+ "        if not vcf_line.startswith('##'):\n",
+ "            break\n",
+ "        n_meta_lines += 1\n",
+ "gt = pd.read_csv(snp_vcf_path, sep='\\t', skiprows=n_meta_lines)\n",
+ "gt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Calculating networks for 72 individuals and;\n",
+ "AC005076.5;ARHGEF19\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 0%| | 0/72 [00:00, ?it/s]/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/envs/scpy3.8/lib/python3.8/site-packages/scipy/stats/stats.py:4264: SpearmanRConstantInputWarning: An input array is constant; the correlation coefficent is not defined.\n",
+ " warnings.warn(SpearmanRConstantInputWarning())\n",
+ "100%|██████████| 72/72 [00:00<00:00, 210.51it/s]\n",
+ "/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/envs/scpy3.8/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 42.5% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.\n",
+ " warnings.warn(msg, UserWarning)\n",
+ "/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/envs/scpy3.8/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 7.1% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.\n",
+ " warnings.warn(msg, UserWarning)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Not Imputed SpearmanrResult(correlation=-0.028018282506059713, pvalue=0.8942369051146191)\n",
+ "Imputed SpearmanrResult(correlation=-0.24638574744096847, pvalue=0.03833253459364005)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Correlate the per-individual co-expression z-score of one gene pair with the genotype of snp_id,\n",
+ "# once dropping individuals with a missing z-score (\"Not Imputed\") and once with missing\n",
+ "# z-scores set to 0 (\"Imputed\"), then draw both as violin + swarm plots.\n",
+ "# Alternative pairs (commented out in earlier versions): 'RP1-29C18.10;ZNF501', 'CCDC15;UNC5B',\n",
+ "# 'GSTM3;RP1-29C18.10', 'MMEL1;SARS2'.\n",
+ "# The z-score frames are labelled with the alphabetically sorted pair, so genepair must already\n",
+ "# be in sorted order for the concat_df[genepair] lookups below to find the column.\n",
+ "genepair = 'AC005076.5;ARHGEF19'\n",
+ "gene1, gene2 = genepair.split(';')\n",
+ "\n",
+ "# Cells of the chosen cell type; for every dataset except 'ng' also restricted to the 'UT' condition.\n",
+ "if datasetname == 'ng':\n",
+ "    ut_celltype = data_sc[data_sc.obs[dataset.celltype_id]==celltype]\n",
+ "else:\n",
+ "    ut_celltype = data_sc[(data_sc.obs[dataset.celltype_id]==celltype) &\n",
+ "                          (data_sc.obs[dataset.timepoint_id_col]==dataset.chosen_condition['UT'])]\n",
+ "\n",
+ "# Dense expression frame: rows labelled by obs index, columns by var index.\n",
+ "ut_celltype_df = pd.DataFrame(data=ut_celltype.X.toarray(),\n",
+ "                              columns=ut_celltype.var.index,\n",
+ "                              index=ut_celltype.obs.index)\n",
+ "selected_expression_df, ut_zscore_df, ut_zscore_p_df = get_individual_networks_selected_genepairs(\n",
+ "    data_df = ut_celltype_df,\n",
+ "    data_sc = ut_celltype,\n",
+ "    individual_colname = dataset.individual_id_col,\n",
+ "    genepair = genepair,\n",
+ "    fillna=False\n",
+ ")\n",
+ "\n",
+ "# --- Not imputed ---\n",
+ "# Re-index the z-scores by genotype sample id (ids missing from gte_dict become None).\n",
+ "ut_t = ut_zscore_df.T\n",
+ "ut_t['gt_sampleid'] = [gte_dict.get(name) for name in ut_t.index]\n",
+ "ut_t = ut_t.set_index('gt_sampleid')\n",
+ "common_individuals = list(set(gt.columns) & set(ut_t.index))\n",
+ "gt_t = gt[common_individuals].T\n",
+ "# Genotype = number of '1' characters in the part before ':' of each entry (e.g. '0/1:1.0' -> 1).\n",
+ "gt_t['genotype'] = [item.split(':')[0].count('1') for item in gt_t[0]]\n",
+ "# Infinite z-scores are turned into NaN and every row containing a NaN is dropped.\n",
+ "concat_df = pd.concat([gt_t, ut_t], axis=1).replace([np.inf, -np.inf], np.nan).dropna()\n",
+ "print('Not Imputed', spearmanr(concat_df['genotype'], concat_df[genepair]))\n",
+ "\n",
+ "# --- Imputed: same steps, with missing z-scores replaced by 0 first ---\n",
+ "ut_t_imputed = ut_zscore_df.fillna(0).T\n",
+ "ut_t_imputed['gt_sampleid'] = [gte_dict.get(name) for name in ut_t_imputed.index]\n",
+ "ut_t_imputed = ut_t_imputed.set_index('gt_sampleid')\n",
+ "common_individuals_imputed = list(set(gt.columns) & set(ut_t_imputed.index))\n",
+ "gt_t_imputed = gt[common_individuals_imputed].T\n",
+ "gt_t_imputed['genotype'] = [item.split(':')[0].count('1') for item in gt_t_imputed[0]]\n",
+ "concat_imputed_df = pd.concat([gt_t_imputed, ut_t_imputed], axis=1).replace([np.inf, -np.inf], np.nan).dropna()\n",
+ "print('Imputed', spearmanr(concat_imputed_df['genotype'], concat_imputed_df[genepair]))\n",
+ "\n",
+ "sns.set_style('white')\n",
+ "refallele = gt['REF'].values[0]\n",
+ "altallele = gt['ALT'].values[0]\n",
+ "snp_name = f'{snp_id}_{altallele}'\n",
+ "\n",
+ "_, axes = plt.subplots(1, 2, figsize=(10, 5), sharey=True)\n",
+ "ax1, ax2 = axes\n",
+ "\n",
+ "# Left panel: imputed z-scores.\n",
+ "im_coef, im_p = spearmanr(concat_imputed_df['genotype'], concat_imputed_df[genepair])\n",
+ "sns.violinplot(x=concat_imputed_df['genotype'], \n",
+ "               y=concat_imputed_df[genepair], \n",
+ "               ax=ax1,\n",
+ "               inner=None)\n",
+ "sns.swarmplot(x=concat_imputed_df['genotype'], \n",
+ "              y=concat_imputed_df[genepair], \n",
+ "              ax=ax1,\n",
+ "              color='black')\n",
+ "ax1.set_title(f'Imputed r={im_coef:.2f}; pvalue {im_p:.4f}')\n",
+ "ax1.set_xlabel(snp_id)\n",
+ "\n",
+ "# Right panel: individuals with a missing z-score removed.\n",
+ "coef, p = spearmanr(concat_df['genotype'], concat_df[genepair])\n",
+ "sns.violinplot(x=concat_df['genotype'], \n",
+ "               y=concat_df[genepair], \n",
+ "               ax=ax2,\n",
+ "               inner=None)\n",
+ "sns.swarmplot(x=concat_df['genotype'], \n",
+ "              y=concat_df[genepair], \n",
+ "              ax=ax2,\n",
+ "              color='black')\n",
+ "ax2.set_xlabel('')\n",
+ "ax2.set_title(f'Not Imputed r={coef:.2f}; pvalue {p:.4f}')\n",
+ "# This overrides the empty label set two lines above.\n",
+ "ax2.set_xlabel(snp_id)\n",
+ "# NOTE(review): the next cell saves its figure under this same file name, so running both\n",
+ "# leaves only the later figure on disk - confirm which one is meant to be kept.\n",
+ "plt.savefig(example_savedir/f'{snp_name}_ref{refallele}_alt{altallele}_{gene1}_{gene2}.{celltype}_{datasetname}.full.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/envs/scpy3.8/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 42.5% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.\n",
+ " warnings.warn(msg, UserWarning)\n",
+ "/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/envs/scpy3.8/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 7.1% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.\n",
+ " warnings.warn(msg, UserWarning)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Swarm plot with a regression line per panel: imputed z-scores on the left,\n",
+ "# not-imputed on the right.\n",
+ "_, axes = plt.subplots(1, 2, figsize=(10, 5), sharey=True)\n",
+ "ax1, ax2 = axes\n",
+ "genotype_ticklabels = [f'{refallele}{refallele}',\n",
+ "                       f'{refallele}{altallele}',\n",
+ "                       f'{altallele}{altallele}']\n",
+ "\n",
+ "im_coef, im_p = spearmanr(concat_imputed_df['genotype'], concat_imputed_df[genepair])\n",
+ "coef, p = spearmanr(concat_df['genotype'], concat_df[genepair])\n",
+ "panels = (\n",
+ "    (ax1, concat_imputed_df, f'Imputed r={im_coef:.2f}; pvalue {im_p:.4f}'),\n",
+ "    (ax2, concat_df, f'Not Imputed r={coef:.2f}; pvalue {p:.4f}'),\n",
+ ")\n",
+ "for panel_ax, panel_df, panel_title in panels:\n",
+ "    sns.swarmplot(x=panel_df['genotype'],\n",
+ "                  y=panel_df[genepair],\n",
+ "                  ax=panel_ax,\n",
+ "                  color='black')\n",
+ "    # scatter=False: only the regression line and its band are added on top of the swarm.\n",
+ "    sns.regplot(x=panel_df['genotype'],\n",
+ "                y=panel_df[genepair],\n",
+ "                ax=panel_ax, scatter=False)\n",
+ "    panel_ax.set_title(panel_title)\n",
+ "    panel_ax.set_xticklabels(genotype_ticklabels)\n",
+ "    panel_ax.set_xlabel(snp_id)\n",
+ "plt.savefig(example_savedir/f'{snp_name}_ref{refallele}_alt{altallele}_{gene1}_{gene2}.{celltype}_{datasetname}.full.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PosixPath('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/examples/rs221045_C_refT_altC_AC005076.5_ARHGEF19.monocyte_onemillionv2.full.pdf')"
+ ]
+ },
+ "execution_count": 112,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the path built from the same pieces as the savefig file name.\n",
+ "example_savedir/f'{snp_name}_ref{refallele}_alt{altallele}_{gene1}_{gene2}.{celltype}_{datasetname}.full.pdf'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/04_coeqtl_mapping/.ipynb_checkpoints/rb_celltypes-checkpoint.ipynb b/04_coeqtl_mapping/.ipynb_checkpoints/rb_celltypes-checkpoint.ipynb
new file mode 100644
index 0000000..834ede3
--- /dev/null
+++ b/04_coeqtl_mapping/.ipynb_checkpoints/rb_celltypes-checkpoint.ipynb
@@ -0,0 +1,2026 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib as mpl\n",
+ "mpl.rcParams['pdf.fonttype'] = 42\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "\n",
+ "from pathlib import Path\n",
+ "workdir = Path(\"./coeqtl_mapping/\")\n",
+ "\n",
+ "celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'NK', 'B']\n",
+ "import matplotlib\n",
+    "def heatmap(data, row_labels, col_labels, ax=None,\n",
+    "            cbar_kw=None, cbarlabel=\"\", **kwargs):\n",
+    "    \"\"\"\n",
+    "    Draw a labelled heatmap of a 2D array with `pcolormesh` and attach a colorbar.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    data\n",
+    "        A 2D numpy array of shape (M, N).\n",
+    "    row_labels\n",
+    "        A list or array of length M with the labels for the rows.\n",
+    "    col_labels\n",
+    "        A list or array of length N with the labels for the columns.\n",
+    "    ax\n",
+    "        Axes to draw on; the current axes are used when omitted. Optional.\n",
+    "    cbar_kw\n",
+    "        A dictionary with arguments to `matplotlib.Figure.colorbar`. Optional.\n",
+    "    cbarlabel\n",
+    "        The label for the colorbar. Optional.\n",
+    "    **kwargs\n",
+    "        All other arguments are forwarded to `pcolormesh`.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    (im, cbar)\n",
+    "        The mesh created by `pcolormesh` and the colorbar attached to it.\n",
+    "    \"\"\"\n",
+    "    if ax is None:\n",
+    "        ax = plt.gca()\n",
+    "    # Build a fresh dict per call instead of sharing one mutable default.\n",
+    "    if cbar_kw is None:\n",
+    "        cbar_kw = {}\n",
+    "\n",
+    "    # Plot the heatmap\n",
+    "    im = ax.pcolormesh(data, **kwargs)\n",
+    "\n",
+    "    # Create colorbar\n",
+    "    cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)\n",
+    "    cbar.ax.set_ylabel(cbarlabel, rotation=-90, va=\"bottom\")\n",
+    "\n",
+    "    # Let the horizontal axes labeling appear on top.\n",
+    "    ax.tick_params(top=True, bottom=False,\n",
+    "                   labeltop=True, labelbottom=False)\n",
+    "\n",
+    "    # pcolormesh cell (i, j) spans [j, j+1] x [i, i+1]: pin one tick to each\n",
+    "    # cell centre before labelling, rather than padding the label list to fit\n",
+    "    # whatever ticks the default locator happens to pick.\n",
+    "    ax.set_xticks(np.arange(len(col_labels)) + 0.5)\n",
+    "    ax.set_yticks(np.arange(len(row_labels)) + 0.5)\n",
+    "    ax.set_xticklabels(list(col_labels), rotation=-30, ha=\"right\",\n",
+    "                       rotation_mode=\"anchor\")\n",
+    "    ax.set_yticklabels(list(row_labels))\n",
+    "    return im, cbar\n",
+ "\n",
+ "\n",
+    "def annotate_heatmap(im, data=None, valfmt=\"{x:.2f}\",\n",
+    "                     textcolors=(\"black\", \"white\"),\n",
+    "                     threshold=None, **textkw):\n",
+    "    \"\"\"\n",
+    "    Write one text label into every cell of a heatmap drawn by `heatmap`.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    im\n",
+    "        The mesh/image returned by `heatmap`; labels are added to `im.axes`.\n",
+    "    data\n",
+    "        2D array of values to write, indexed as data[row, column]. If None,\n",
+    "        the array stored on `im` is used, provided it is 2D. Optional.\n",
+    "    valfmt\n",
+    "        The format of the annotations inside the heatmap. This should either\n",
+    "        use the string format method, e.g. \"$ {x:.2f}\", or be a\n",
+    "        `matplotlib.ticker.Formatter`. Optional.\n",
+    "    textcolors, threshold\n",
+    "        Accepted for backward compatibility only: threshold-based text\n",
+    "        colouring is switched off, so both are ignored.\n",
+    "    **textkw\n",
+    "        All other arguments are forwarded to each call to `text` used to create\n",
+    "        the text labels.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    list\n",
+    "        The created `Text` objects, in row-major order.\n",
+    "\n",
+    "    Raises\n",
+    "    ------\n",
+    "    ValueError\n",
+    "        If the annotation data (given or taken from `im`) is not 2D.\n",
+    "    \"\"\"\n",
+    "    data = np.asarray(im.get_array() if data is None else data)\n",
+    "    if data.ndim != 2:\n",
+    "        raise ValueError(\"annotate_heatmap needs a 2D `data` array\")\n",
+    "\n",
+    "    # Set default alignment to center, but allow it to be\n",
+    "    # overwritten by textkw.\n",
+    "    kw = dict(horizontalalignment=\"center\",\n",
+    "              verticalalignment=\"center\")\n",
+    "    kw.update(textkw)\n",
+    "\n",
+    "    # Get the formatter in case a string is supplied\n",
+    "    if isinstance(valfmt, str):\n",
+    "        valfmt = matplotlib.ticker.StrMethodFormatter(valfmt)\n",
+    "\n",
+    "    # Cell (i, j) of a mesh drawn on integer edges is centred at (j+0.5, i+0.5).\n",
+    "    texts = []\n",
+    "    for i in range(data.shape[0]):\n",
+    "        for j in range(data.shape[1]):\n",
+    "            texts.append(im.axes.text(j + 0.5, i + 0.5, valfmt(data[i, j], None), **kw))\n",
+    "    return texts"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## celltypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "filtered_res_df = pd.read_csv(workdir.joinpath('output/filtered_results/rb_calculations/summary.csv'), index_col=0)\n",
+    "unfiltered_res_df = pd.read_csv(workdir.joinpath('output/unfiltered_results/rb_calculations/summary.csv'), index_col=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Keep complete rows whose discovery cell type is not 'B', then export each table to Excel.\n",
+    "keep_filtered = filtered_res_df['celltype_discovery'].ne('B')\n",
+    "filtered_res_df_clean = filtered_res_df.loc[keep_filtered].dropna()\n",
+    "filtered_res_df_clean.to_excel(workdir.joinpath('output/summary/rb_values_replication_in_other_celltypes_filtered_results.xlsx'))\n",
+    "keep_unfiltered = unfiltered_res_df['celltype_discovery'].ne('B')\n",
+    "unfiltered_res_df_clean = unfiltered_res_df.loc[keep_unfiltered].dropna()\n",
+    "unfiltered_res_df_clean.to_excel(workdir.joinpath('output/summary/rb_values_replication_in_other_celltypes_unfiltered_results.xlsx'))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### filtered results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+    "# filtered results\n",
+    "#\n",
+    "# Builds one table per statistic, each indexed\n",
+    "# [replication cell type, discovery cell type]:\n",
+    "#   rb_df, rbse_df, rbpvalue_df    'r', 'se_r' and 'p' from the rb summary\n",
+    "#   numcoeqtl_df                   number of rows in the matching results file\n",
+    "#   num_anno_df, rbse_anno_df      text labels for annotating heatmaps\n",
+    "#   replicated_ratio_df            counts relative to the discovery cell\n",
+    "#                                  type's own (diagonal) count\n",
+    "\n",
+    "n_celltypes = len(celltypes)\n",
+    "rb_df = pd.DataFrame(data=np.zeros((n_celltypes, n_celltypes)),\n",
+    "                     columns=celltypes, index=celltypes)\n",
+    "rbse_df = pd.DataFrame(data=np.zeros((n_celltypes, n_celltypes)),\n",
+    "                       columns=celltypes, index=celltypes)\n",
+    "rbpvalue_df = pd.DataFrame(data=np.zeros((n_celltypes, n_celltypes)),\n",
+    "                           columns=celltypes, index=celltypes)\n",
+    "numcoeqtl_df = pd.DataFrame(data=np.zeros((n_celltypes, n_celltypes)),\n",
+    "                            columns=celltypes, index=celltypes)\n",
+    "\n",
+    "# The annotation tables hold text, so create them with object dtype: writing\n",
+    "# strings into float64 columns relies on an implicit upcast that recent\n",
+    "# pandas versions deprecate.\n",
+    "num_anno_df = pd.DataFrame(data='', columns=celltypes,\n",
+    "                           index=celltypes, dtype=object)\n",
+    "rbse_anno_df = pd.DataFrame(data='', columns=celltypes,\n",
+    "                            index=celltypes, dtype=object)\n",
+    "\n",
+    "for discovery_celltype in celltypes:\n",
+    "    for replication_celltype in celltypes:\n",
+    "        if discovery_celltype != replication_celltype:\n",
+    "            # replication in other celltypes: summary row for this pair\n",
+    "            rb_results = filtered_res_df[(filtered_res_df['celltype_discovery'] == discovery_celltype) &\n",
+    "                                         (filtered_res_df['celltype_replication'] == replication_celltype)]\n",
+    "            # N = number of rows in the per-pair replication table\n",
+    "            replicated_coeqtls_num = pd.read_csv(\n",
+    "                workdir/f'output/filtered_results/rb_calculations/discovery_{discovery_celltype}_replication_{replication_celltype}.tsv.gz',\n",
+    "                compression='gzip',\n",
+    "                sep='\\t',\n",
+    "                index_col=0\n",
+    "            ).shape[0]\n",
+    "            # .values[0] raises IndexError when the summary has no row for the pair\n",
+    "            rbvalue = rb_results['r'].values[0]\n",
+    "            rbsevalue = rb_results['se_r'].values[0]\n",
+    "            if rbvalue < 10 and discovery_celltype != 'B':\n",
+    "                # Keep the estimate and show it in the annotation.\n",
+    "                rb_cell, rbse_cell = rbvalue, rbsevalue\n",
+    "                rbpvalue_cell = rb_results['p'].values[0]\n",
+    "                rb_annotation = f\"{rbvalue:.2f}\\nN={replicated_coeqtls_num}\"\n",
+    "            else:\n",
+    "                # B as discovery cell type, or an rb that is NaN or >= 10:\n",
+    "                # blank out the estimate and annotate with the count only.\n",
+    "                rb_cell, rbse_cell, rbpvalue_cell = np.nan, np.nan, 0\n",
+    "                rb_annotation = f\"N={replicated_coeqtls_num}\"\n",
+    "        else:\n",
+    "            # Diagonal: rb is fixed to 1 and N is the number of rows in the\n",
+    "            # cell type's own `*.sig.tsv.gz` results table.\n",
+    "            rb_cell, rbse_cell, rbpvalue_cell = 1, 0, 0\n",
+    "            replicated_coeqtls_num = pd.read_csv(\n",
+    "                workdir/f'output/filtered_results/UT_{discovery_celltype}/coeqtls_fullresults_fixed.sig.tsv.gz',\n",
+    "                compression='gzip',\n",
+    "                sep='\\t'\n",
+    "            ).shape[0]\n",
+    "            rb_annotation = f\"N={replicated_coeqtls_num}\"\n",
+    "\n",
+    "        # Write this pair's results into every table.\n",
+    "        rb_df.loc[replication_celltype, discovery_celltype] = rb_cell\n",
+    "        rbse_df.loc[replication_celltype, discovery_celltype] = rbse_cell\n",
+    "        rbpvalue_df.loc[replication_celltype, discovery_celltype] = rbpvalue_cell\n",
+    "        numcoeqtl_df.loc[replication_celltype, discovery_celltype] = replicated_coeqtls_num\n",
+    "        num_anno_df.loc[replication_celltype, discovery_celltype] = f\"N={replicated_coeqtls_num}\"\n",
+    "        rbse_anno_df.loc[replication_celltype, discovery_celltype] = rb_annotation\n",
+    "\n",
+    "# Divide every column by its diagonal entry, i.e. each count by the count of\n",
+    "# the discovery cell type itself.\n",
+    "own_counts = np.diag(numcoeqtl_df.values)\n",
+    "replicated_ratio_df = numcoeqtl_df.div(own_counts, axis='columns')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CD4T | \n",
+ " CD8T | \n",
+ " monocyte | \n",
+ " DC | \n",
+ " NK | \n",
+ " B | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " CD4T | \n",
+ " 1.000000 | \n",
+ " 0.971596 | \n",
+ " 0.759425 | \n",
+ " 0.773429 | \n",
+ " 0.953264 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " CD8T | \n",
+ " 0.988285 | \n",
+ " 1.000000 | \n",
+ " 0.847118 | \n",
+ " 1.002450 | \n",
+ " 0.966100 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " monocyte | \n",
+ " 0.792142 | \n",
+ " 0.779688 | \n",
+ " 1.000000 | \n",
+ " 0.797139 | \n",
+ " 0.960618 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " DC | \n",
+ " 0.794745 | \n",
+ " 0.815816 | \n",
+ " 0.935905 | \n",
+ " 1.000000 | \n",
+ " 0.853924 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " NK | \n",
+ " 0.925802 | \n",
+ " 0.967842 | \n",
+ " 0.868747 | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " B | \n",
+ " 0.918479 | \n",
+ " 0.952496 | \n",
+ " 0.948709 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CD4T CD8T monocyte DC NK B\n",
+ "CD4T 1.000000 0.971596 0.759425 0.773429 0.953264 NaN\n",
+ "CD8T 0.988285 1.000000 0.847118 1.002450 0.966100 NaN\n",
+ "monocyte 0.792142 0.779688 1.000000 0.797139 0.960618 NaN\n",
+ "DC 0.794745 0.815816 0.935905 1.000000 0.853924 NaN\n",
+ "NK 0.925802 0.967842 0.868747 NaN 1.000000 NaN\n",
+ "B 0.918479 0.952496 0.948709 NaN NaN 1.0"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rb_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CD4T | \n",
+ " CD8T | \n",
+ " monocyte | \n",
+ " DC | \n",
+ " NK | \n",
+ " B | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " CD4T | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 1.126679e-35 | \n",
+ " 2.425843e-03 | \n",
+ " 0.000000e+00 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " CD8T | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 7.557685e-59 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " monocyte | \n",
+ " 1.052643e-121 | \n",
+ " 5.216640e-92 | \n",
+ " 0.000000e+00 | \n",
+ " 1.774726e-21 | \n",
+ " 1.393096e-317 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " DC | \n",
+ " 3.609987e-25 | \n",
+ " 4.217830e-39 | \n",
+ " 5.947381e-316 | \n",
+ " 0.000000e+00 | \n",
+ " 4.322965e-05 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " NK | \n",
+ " 2.552726e-264 | \n",
+ " 0.000000e+00 | \n",
+ " 8.365584e-06 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " B | \n",
+ " 2.320757e-144 | \n",
+ " 1.610287e-212 | \n",
+ " 1.074123e-78 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CD4T CD8T monocyte DC \\\n",
+ "CD4T 0.000000e+00 0.000000e+00 1.126679e-35 2.425843e-03 \n",
+ "CD8T 0.000000e+00 0.000000e+00 7.557685e-59 0.000000e+00 \n",
+ "monocyte 1.052643e-121 5.216640e-92 0.000000e+00 1.774726e-21 \n",
+ "DC 3.609987e-25 4.217830e-39 5.947381e-316 0.000000e+00 \n",
+ "NK 2.552726e-264 0.000000e+00 8.365584e-06 0.000000e+00 \n",
+ "B 2.320757e-144 1.610287e-212 1.074123e-78 0.000000e+00 \n",
+ "\n",
+ " NK B \n",
+ "CD4T 0.000000e+00 0.0 \n",
+ "CD8T 0.000000e+00 0.0 \n",
+ "monocyte 1.393096e-317 0.0 \n",
+ "DC 4.322965e-05 0.0 \n",
+ "NK 0.000000e+00 0.0 \n",
+ "B 0.000000e+00 0.0 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rbpvalue_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "from matplotlib import cm\n",
+ "from matplotlib.colors import ListedColormap, LinearSegmentedColormap"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# One colour per cell type; every suffixed CD4T_* key reuses the CD4T colour.\n",
+    "color_dict = {\n",
+    "    'CD4T': '#2E9D33', 'CD8T': '#126725', 'monocyte': '#EDBA1B',\n",
+    "    'NK': '#965EC8', 'DC': '#E64B50', 'B': '#009DDB',\n",
+    "    'cMono': 'peru', 'ncMono': 'y',\n",
+    "}\n",
+    "color_dict.update(dict.fromkeys(\n",
+    "    ['CD4T_individual_100',\n",
+    "     'CD4T_individual_50',\n",
+    "     'CD4T_50',\n",
+    "     'CD4T_150',\n",
+    "     'CD4T_250'],\n",
+    "    color_dict['CD4T']))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":60: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_xticklabels([\"\"]+col_labels)\n",
+ ":61: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels([\"\"]+row_labels)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+    "# One single-column heatmap per discovery cell type, shaded in that cell type's colour.\n",
+    "matplotlib.rcParams.update({'font.size': 16})\n",
+    "n_celltypes = len(celltypes)\n",
+    "fig, axes = plt.subplots(1, n_celltypes, figsize=(7, 7), sharey=True)\n",
+    "for i, discovery_celltype in enumerate(celltypes):\n",
+    "    # Colour ramp from white to the cell type's own colour.\n",
+    "    cmap1 = LinearSegmentedColormap.from_list(\"mycmap\", [\"white\", color_dict[discovery_celltype]])\n",
+    "    im1, bar = heatmap(rb_df[[discovery_celltype]].values,\n",
+    "                       list(rb_df.index),\n",
+    "                       [discovery_celltype],\n",
+    "                       cmap=cmap1, ax=axes[i], vmin=0.7, vmax=1)\n",
+    "    bar.remove()  # drop the colorbar heatmap() always creates\n",
+    "    annotate_heatmap(im1, data=rbse_anno_df[[discovery_celltype]].values,\n",
+    "                     valfmt=\"{x:^}\", textcolors=(\"white\", \"white\"), threshold=1)\n",
+    "    if i > 0:\n",
+    "        # Only the first panel keeps its axis decorations.\n",
+    "        axes[i].axis('off')\n",
+    "\n",
+    "plt.subplots_adjust(wspace=0, hspace=0)\n",
+    "plt.savefig('rb_values.filtered_results.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Build a two-colour colormap that fades from fully transparent white to\n",
+    "# fully opaque black and register it under the name 'alpha', so later\n",
+    "# plotting calls can select it with cmap='alpha'.\n",
+    "\n",
+    "c_white = matplotlib.colors.to_rgba(\n",
+    "    'white',\n",
+    "    alpha=0,  # fully transparent\n",
+    ")\n",
+    "c_black = matplotlib.colors.to_rgba(\n",
+    "    'black',\n",
+    "    alpha=1,  # fully opaque\n",
+    ")\n",
+    "\n",
+    "# 512 is passed as N, the number of colour steps in the lookup table.\n",
+    "cmap_rb = matplotlib.colors.LinearSegmentedColormap.from_list(\n",
+    "    'rb_cmap',\n",
+    "    [c_white, c_black],\n",
+    "    N=512,\n",
+    ")\n",
+    "\n",
+    "mpl.cm.register_cmap(\n",
+    "    name='alpha',\n",
+    "    cmap=cmap_rb,\n",
+    ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":62: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_xticklabels([\"\"]+col_labels)\n",
+ ":63: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels([\"\"]+row_labels)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+    "# Replication ratio for every (replication, discovery) pair, printed as percentages.\n",
+    "ratio_values = replicated_ratio_df.values\n",
+    "im, bar = heatmap(ratio_values,\n",
+    "                  list(rb_df.index),\n",
+    "                  celltypes,\n",
+    "                  cmap='alpha', vmin=0, vmax=1)\n",
+    "_ = annotate_heatmap(im,\n",
+    "                     data=ratio_values,\n",
+    "                     valfmt=\"{x:.0%}\",\n",
+    "                     textcolors=(\"white\", \"white\"), threshold=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":62: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_xticklabels([\"\"]+col_labels)\n",
+ ":63: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels([\"\"]+row_labels)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+    "# Full rb matrix drawn with the 'alpha' colormap; rows are plotted in reverse order.\n",
+    "matplotlib.rcParams.update({'font.size': 16})\n",
+    "fig, ax = plt.subplots(figsize=(8, 7))\n",
+    "rb_values_reversed = rb_df.values[::-1]\n",
+    "rb_labels_reversed = rbse_anno_df.values[::-1]\n",
+    "im, bar = heatmap(rb_values_reversed,\n",
+    "                  list(reversed(rb_df.index)),\n",
+    "                  celltypes,\n",
+    "                  cmap='alpha', vmin=0.7, vmax=1)\n",
+    "_ = annotate_heatmap(im,\n",
+    "                     data=rb_labels_reversed,\n",
+    "                     valfmt=\"{x:^}\",\n",
+    "                     textcolors=(\"white\", \"white\"), threshold=1)\n",
+    "plt.savefig('rb_values.filtered_results.varyingalpha.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":62: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_xticklabels([\"\"]+col_labels)\n",
+ ":63: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels([\"\"]+row_labels)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+    "# Replication ratio per discovery cell type, one single-column panel each,\n",
+    "# drawn with the row order reversed.\n",
+    "n_celltypes = len(celltypes)\n",
+    "fig, axes = plt.subplots(1, n_celltypes, figsize=(7, 7), sharey=True)\n",
+    "for i, discovery_celltype in enumerate(celltypes):\n",
+    "    cmap1 = LinearSegmentedColormap.from_list(\"mycmap\", [\"white\", color_dict[discovery_celltype]])\n",
+    "    # Colours, row labels and annotations must all use the same (reversed)\n",
+    "    # row order, otherwise the printed percentages land on the wrong cells.\n",
+    "    ratio_column = np.flip(replicated_ratio_df[[discovery_celltype]].values, axis=0)\n",
+    "    im1, bar = heatmap(ratio_column,\n",
+    "                       list(replicated_ratio_df.index)[::-1],\n",
+    "                       [discovery_celltype],\n",
+    "                       cmap=cmap1, ax=axes[i], vmin=0, vmax=1)\n",
+    "    bar.remove()  # drop the colorbar heatmap() always creates\n",
+    "    annotate_heatmap(im1, data=ratio_column, valfmt=\"{x:.0%}\",\n",
+    "                     textcolors=(\"white\", \"white\"), threshold=1)\n",
+    "    if i > 0:\n",
+    "        axes[i].axis('off')\n",
+    "\n",
+    "plt.subplots_adjust(wspace=0, hspace=0)\n",
+    "plt.savefig('replicated_ratio.filtered_results.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAEoAAADBCAYAAABopyZqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAHyUlEQVR4nO2dfYxcZRXGf8/2w5XyhwWiiUFasIiW+JGgtQaj0oSASVMU8SOauiFKDIkiMfqXsVFSTYwfmJhIrUpAg6FBiDSmsQRpEInGsAroQoB+kIJWU6WAdm2x9PjHe7cZhpl7n51hd+fG80sms/POPffeOX2ec9+Z5txXEUHSzNhCn0BbyESZZKJMMlEmmSiTTJRJ6xIl6XRJ35X0W0nTkkLSSjN2XNI3JB2Q9J9qH+9yYluXKGAV8CHgEHDPLGN/BFwBbALWAweAnZLe0hgZEa16AGMdf38SCGClEffmatvLO8YWA48A25viW6eoiDg+YOgG4L/Ato59HQNuBi6S9LK64NYlagjOBfZFxHTX+BSwlGLpvvw/JeoUSl3r5qmO9/uyuO5NSfP9jXkKONLxemtEbH2J9i1Kjeo13khtogDGxuZPdMePHz8SEW+do90/BZzRY3x5x/t9GbVEzeXup4D3Szqpq06tBp4DdtcF12ZBEmNjY/P2mGO2A0uAD3Z8vsXAh4E7IuJoXXCjoiTLwvOKpMuqP8+rnt8r6SBwMCLulrQC2ANcExHXAETE/ZK2Ad+RtATYB1wJnAl8rOmYI2W9WXBL1+vvVc93A++hFOhFvNgxlwNfBTYDrwAeAC6OiD80HVB1PwUvWrQoxsfHjfN+aZienp6cw2I+FG1V1LzTyhq1EDRNOFNRFakok6xRJmk9k7SeSVrPJBVlkjXKJBVlkjXKpNF6qahCKsokE2WS1jNJRZnk9MAkJ5wmqSiTrFEmaT2TtJ5JWs8kFWWSNcokFWWSNcokvxSbpKJMMlEmaT2TVJRJTg9McsJpkooyyRplkooyyRplktYzSeuZpPVMGrMw8zVmPh4Okl4j6WeSnpH0rKTbJPXqw+sVe2YV+7Skw5J2SbJaSlpVoySdBNwFHAUmKB2dm4Fdkt4UEYdrYk8FfgP8C/gUMA18ropdExEP1x27bV+KrwDOAs6JiN0Akh4EHqN8+G/XxF4JvAp4d0fsXcBe4CuUeyn0pVWKorTk/27mgwJExD5J9wKXUJ+otcBjXbGHJd0DrJe0uGrv70nbatS5wJ97jE9RWl7reJ7SEtvNUeDlwGvrgkftqneapPs6Xnd3qde15C/vMd7JI8CFkk6NiH8CSBoD1nTsuy+jZr1/GI2Ng7bkbwGuAn4s6SpKMf8ipVUWoLbzu23WO0Tvf/nl9FbaCSJiL6V3+DxKR/pfgXcA11abHKiLHzXrNTFFqVPdrAYeagqOiFsl/Rx4HfBcROyRdB3wRETsr4ttm6K2A2slndVxfiuB86v3GomI5yPi4SpJr6a081/XFNeYqBG778EPgMeB2yVdImkDcDvwBPD9mY0krZB0TNKmjrElkq6V9D5J6yR9BriPotJvNR24VRPOat6zjlJXfkIp4r8Cro6If3ds2qudP4CzgY9SWvmfBK4HvhYRvaYNL2DUrnqNVLXkAw3bPE7XlbCaTK4f9LitS9RC0SrrLSSpKJP8hdOkbRPOBSMVZZI1yiQVZZI1yiStZ5LWM0nrmaSiTLJGmeSXYpNUlEnWKJO86pmk9UzSeiZpPZNUlEnWKJNUlEnWKJO0nklazyStZ5KKMskaZZK/R5mkokyyRpnkVc8krWeS1jNJ65mkokyyRpmkokyyRpmkokxat1iqhmvnP0PSjZL2S5qW9KikzZKWNcW2qphruHb+ZcCdlJVlvwTsB95G6VA/m9KO1pe2WW+Ydv7zKQm5KCLuqMZ2SToF+LxevH7xC2hbMR+mnX9p9fxs1/jTlBJUq4i2dYAO085/J0V5X5e0WtLJVe/f
Z4EtdbaF0atRc9bOHxFHJL0TuJWS2Bl+CHy66cRGrUbNWTu/pHFgG/BKYCOlmK8BNgHHKLck6UvbatTA7fzAJyirYq+KiD3V2K8lPQNslbQlIh7oFzxqimpimHb+NwKHOpI0w++r5zdQlgzvSdu61Idp5/8bsFzSqq7xt1fPf6nNQ92bIzgzH7idH7iBcu+oHZImJF0g6QvAN4FJ4N6BEzWTrFGZHlSX8HXAo5R2/puAfcC6pnb+qnN9LXA/ZTa/gzKB3QpcGBG192YZtelBI4O281fjD9FwQ61+5H+AmrROUQtF26YHC0bbJpwLRlrPJK1nktYzSUWZZI0ySUWZZI0ySUWZZI0ySeuZpPVM0nom+XuUSSrKJGuUSV71TFJRJlmjTNJ6Jmk9k7SeSSrKJGuUSSrKJGuUSVrPJK1nktYzyd+jTFJRJlmjTPKqZ5KKMskaZZLWM0nrmaT1TBp7YUapxaM6p4G61CV9WVL0eRxpim+VojRElzql0/OXXWPLqrHGhVbbVqMG7lKPiCcp636eQNJGSg5ubDpw2656w3Sp92IC+Duws2nDVrWhMVyXevfnOh24ALipWki1llbVKIZbdL6bjRShNNoOGhI1OTm5U9JpszyBYRhXfTs/DL7ofDcfB/4YEQ86G9cmKiIuHuAE5pJhutRPIGkN8HrgajdmpHxlMNSi8x1MUO518FM3oG2JGnrReUlLgY8AOyLioH3kiGjNgzJB3A38iTId2EC5V8Fe4OSO7VZQFLOpxz4updS5S2dz7FYpapgu9Q4mKFfJX8zm2KqynDTQKkUtJJkok0yUSSbKJBNlkokyyUSZZKJMMlEm/wPf4KV4Rxo9gAAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib as mpl\n",
+ "\n",
+    "# Stand-alone vertical colorbar running from white to black over 0.7-1.\n",
+    "fig, ax = plt.subplots(figsize=(0.5, 6))\n",
+    "fig.subplots_adjust(bottom=0.5)\n",
+    "colors = [\"white\", 'black']\n",
+    "cmap = LinearSegmentedColormap.from_list(\"mycmap\", colors)\n",
+    "norm = mpl.colors.Normalize(vmin=0.7, vmax=1)\n",
+    "colorbar_source = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)\n",
+    "fig.colorbar(colorbar_source, cax=ax, orientation='vertical')\n",
+    "\n",
+    "plt.savefig('colorbar.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAEoAAADBCAYAAABopyZqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAJsElEQVR4nO2dbYxdVRWGn3f6wUD7gxLiD5XSNhjSNkqDVasgKTWmNZg2RBGC0YKC34ImYCCVRmuNEogSIApFowRJaABj+wOlAm1VAprWFLUoFWgtGgiFFqqUQpsuf+xz6+Vy7zlr7unMPVvWk9ycmX3OOvvOmrXW2XP3O3vLzAiqGRr0G8iFcJSTcJSTcJSTcJSTcJST7Bwl6a2SbpD0kKR9kkzSNKftsKRrJD0t6eXiHmd4bLNzFHAS8DFgD/DbEdr+GLgYWA58GHgauFfSnEpLM8vqBQy1fX0RYMA0h90pxbUXtrWNBx4D1lbZZxdRZnaoT9PFwAFgddu9DgJ3AAslHVVmnJ2jajAb2G5m+zratwITSSndkzeSo44j1bVOdred78n4spOSxvov5q3A/rbvV5nZqiN0b5FqVLf2SkodBTA0NHZBd+jQof1mNneUbr8bmNqlfUrb+Z40zVGjefutwNmSjumoU7OAV4HHy4xLvSCJoaGhMXuNMmuBCcA5bT/feOBcYJ2ZvVJmXBlRkiuFxxRJHy2+fGdx/JCkXcAuM9so6UTgCWCFma0AMLMtklYD10maAGwHPg9MBz5e1WejUm8E3Nnx/Q+K40ZgPqlAj+P1GXMh8G1gJXAs8AiwyMz+WNWhyj4KHjdunA0PDzve95Fh3759m0exmNci14gac7KsUYOgasAZEVUQEeUkapSTSD0nkXpOIvWcREQ5iRrlJCLKSdQoJ5WpFxGViIhyEo5yEqnnJCLKSQwPnMSA00lElJPKcGnavJ6kEyTdJelFSXsl/VxStxngbrZTJd0qaWchQtsmaaWkSVW2WaWepGOAB4BXgKUkLcFKYL2kd5jZSyW2k4D7SJOgVwE7gXcB3wTeRpoI7UluqXcxMAM42cweB5D0J+DvwGeB75XYnkZyyEIzW1e0rZd0HHBZl6n215Db8GAx8HDLSQBmtl3Sg8ASyh01sTju7Wh/gVSCSiOi0gutQedYvBzMBv7SpX0rSWxRxn2kyLta0ixJkyUtAC4FbipLW2hejTpe0qa27zv1UWVisCld2g9jZvslnQ7cTXJsix8BX6p6Y02rUc85ptT7EoNJGibpN98EfIJUzN9NUggfJAk2epJbjdpDdwnhFLpHWjufJgk4TjKzJ4q230h6EVgl6SYze6SXcW5/FG8l1alOZgGPVti+HdjT5qQWfyiOM0nqlq7kNuBcC8yTNKPVUPzXwmnFuTKeAaZI6lT/vqc4/qvUD1XvrGGOugXYAayRtETSYmAN8BRwc+siSSdKOihpeZvtT4F/A/dIWirpTEmXA9cCm4EH+3bUWA4NPClePMIXANuA24DbScq5BWb2n/a3ToeQzMx2APOALaTR/D2kAewq4INVQv/cijlmthP4SMU1O+jyJDSzR0n/RzNimjY8aCxNG3A2logoJ9nVqEEREeUkapSTSD0nkXpOIvWcREQ5iRrlJLfPowZGRJSTqFFO4qnnJFLPSaSek0g9JxFRTqJGOclqFqZ4T30LyQr7mZLulPSc0qpkj0m6tMouq4hSDSFZYT+3sN9AWqTrRZJmanJV31k5ihpCMklDwK3A/WZ2dtup9Z6Oc0u9rkIy0izvkgrb+SSNQpnYrCe5TanXEZKdXhyHJT0s6YCkZyVdL+noqo6bNjwYNSEZ8ObiuBq4EbgCmAusAE4Azu5hBzRvwDlqQjL+lz0/M7OWeGODpHHAdyXNKqbcS4170rAaVUdI9nxx/HVHe0shPKfMOLenXh0hWUu32RmRrd9QqZolt4iqIyT7JWn8taijfWFx3EQJTatRVdxCUvCukfR1
UnR8iy5CMl6/Itnzkr4DXCVpL2ngOZckdr21fcjRjaxSz8xeUtKGf58kJBNwP/CVKiFZwQqS6u4LwGWktYKvITm7lKYNDyqpKSQz0oBzxIPO3FJvYGQXUYMiqxo1SGIC1ElElJOoUU7iqeckUs9JpJ6TSD0nEVFOokY5iYhyEjXKSaSek0g9J5F6TiKinESNchKfRznJTaRRW0jWdp8rlbbF/J3n+qxqlGoKydruMwNYBjzr7Tu3p16dFcna+SFpcYmTcQQL5Jd6dYRkAEg6HzgVuNJz/WE/OG7cJO1BHSEZkqaQZpm/Zmal++l10rTUG00hGaTp822kBW1GRNOK+WiuSPZ+4JPAqVa2s1kPchtw1hGS3Uza0Pmfko4t2sYD44rvXy7bjLBpEVVFHSHZzOL1uS7n9gBfBa7rZdy0GlXFWuBaSTPM7El4jZDsigrbM7u0XUeSB32Zij1Ac4uoOkKyDZ03k/QCML7buU6yiqgjICTrm9yKeS0hWZfr5nv7zS31BkZWqTdIIqKcZFejBkVElJOoUU4i9ZxE6jmJ1HMSEeUkapSTmAB1EhHlJGqUk3jqOYnUcxKp5yRSz0lElJOoUU5yE2n0LSSTNFfSKkl/U9rWcqek2yVN9/SbVY2qKSQ7jzTLfD1pxvktpC0uN0maY2ZPlfWdW42qIyS72sx2tTco7fS4vbjv8q5WBbnVqL63tux0UtH2D0m7SNFVSlapR0qdNV3atwLnjPRmkmaSNib8a9W1TUu90RaSHUbSeOAmYBdJDlRK01JvNFck6+RG4H3AWWZWpa3K7vOoOkKywygtj/QZYGnbnsWlNC2iqqgjJANA0jKSluoSM7vN23FuA846K5Ih6RLSuGuZmd3g6bBFbk+9voVkks4jKex+BTwgaV7bffeWrZgIzXvqlVJTSLaoaF/E69e520ha+bUnudWovoVkZnYBcEG//eaWegMjq9QbJNml3qCIiHISNcpJRJSTqFFOIvWcROo5idRzktvnUQMjIspJ1Cgn8dRzEhHlJGqUk0g9J5F6TiL1nOS2bVzf+qjCdljSNZKeVtqt8SFJZ3hss4qomvooSBqDs4DLgSeBLwL3SnqvmW0pM8ytRtXZsfEU4HzgU2b2k6JtI2n2eQVJUtSTytT7P1poazFwgLS/Xsv2IHAHsFDSUWXGuU2p11loazaw3cz2dbGdCJxUZpxVjaKePqrMtnW+J6WO2rx5872Sjq94A0eSYZULyaB/fZRq2JY7ysw65+gHTR191G6g2zBiStv5njQqrxzU3bFxejHE6LR9lYr1o3JzVB191FpgAm2iWCUd57nAurJl2wAws2xewCTSb/7PpOHAYuAR0uBxctt1JwIHgeUd9neQUvQi4APAXcB+0gKB5X0P+ofvw1lTgbuBvaTdF38BTOu4ZhqpcH+jo/1o0qD0mcJBvwfme/qVjXylxTckudWogRGOchKOchKOchKOchKOchKOchKOchKOcvJfcsY5cEDXPTUAAAAASUVORK5CYII=\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+    "# Stand-alone vertical colorbar running from white to black over 0-1.\n",
+    "fig, ax = plt.subplots(figsize=(0.5, 6))\n",
+    "fig.subplots_adjust(bottom=0.5)\n",
+    "colors = [\"white\", 'black']\n",
+    "cmap = LinearSegmentedColormap.from_list(\"mycmap\", colors)\n",
+    "norm = mpl.colors.Normalize(vmin=0, vmax=1)\n",
+    "colorbar_source = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)\n",
+    "fig.colorbar(colorbar_source, cax=ax, orientation='vertical')\n",
+    "\n",
+    "plt.savefig('colorbar.replication_ratio.pdf')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### celltype comparison for unfiltered results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# filtered results\n",
+ "unrb_df = pd.DataFrame(data=np.zeros((len(celltypes), len(celltypes))), \n",
+ " columns=celltypes, index=celltypes)\n",
+ "unrbse_df = pd.DataFrame(data=np.zeros((len(celltypes), len(celltypes))), \n",
+ " columns=celltypes, index=celltypes)\n",
+ "unrbpvalue_df = pd.DataFrame(data=np.zeros((len(celltypes), len(celltypes))), \n",
+ " columns=celltypes, index=celltypes)\n",
+ "unnumcoeqtl_df = pd.DataFrame(data=np.zeros((len(celltypes), len(celltypes))), \n",
+ " columns=celltypes, index=celltypes)\n",
+ "unanno_df = pd.DataFrame(data=np.zeros((len(celltypes), len(celltypes))), \n",
+ " columns=celltypes, index=celltypes)\n",
+ "unnum_anno_df = pd.DataFrame(data=np.zeros((len(celltypes), len(celltypes))), \n",
+ " columns=celltypes, index=celltypes)\n",
+ "\n",
+ "# Populate the unfiltered rb / SE / p-value / count / annotation matrices.\n",
+ "# Rows are the replication cell type, columns the discovery cell type.\n",
+ "for discovery_celltype in celltypes:\n",
+ "    for replication_celltype in celltypes:\n",
+ "        if discovery_celltype == replication_celltype:\n",
+ "            # Diagonal: rb is fixed to 1 and N is the number of rows in the\n",
+ "            # discovery cell type's own significant co-eQTL table.\n",
+ "            unreplicated_coeqtls_num = pd.read_csv(\n",
+ "                workdir/f'output/unfiltered_results/UT_{discovery_celltype}/coeqtls_fullresults_fixed.sig.tsv.gz',\n",
+ "                compression='gzip',\n",
+ "                sep='\\t'\n",
+ "            ).shape[0]\n",
+ "            rb_entry, se_entry, p_entry = 1, 0, 0\n",
+ "            annotation = f\"N={unreplicated_coeqtls_num}\"\n",
+ "        else:\n",
+ "            unrb_results = unfiltered_res_df[\n",
+ "                (unfiltered_res_df['celltype_discovery'] == discovery_celltype) &\n",
+ "                (unfiltered_res_df['celltype_replication'] == replication_celltype)\n",
+ "            ]\n",
+ "            unreplicated_coeqtls_num = pd.read_csv(\n",
+ "                workdir/f'output/unfiltered_results/rb_calculations/discovery_{discovery_celltype}_replication_{replication_celltype}.tsv.gz',\n",
+ "                compression='gzip',\n",
+ "                sep='\\t',\n",
+ "                index_col=0\n",
+ "            ).shape[0]\n",
+ "            # FIX: test the unfiltered estimate (unrb_results). The original condition\n",
+ "            # read `rb_results`, a name this loop never assigns (presumably left over\n",
+ "            # from the filtered-results cell), so every pair was judged on a stale value.\n",
+ "            # rb is only reported when r < 10 and the discovery cell type is not B.\n",
+ "            if discovery_celltype != 'B' and unrb_results['r'].values[0] < 10:\n",
+ "                unrbvalue = unrb_results['r'].values[0]\n",
+ "                unrbsevalue = unrb_results['se_r'].values[0]\n",
+ "                rb_entry, se_entry, p_entry = unrbvalue, unrbsevalue, unrb_results['p'].values[0]\n",
+ "                annotation = f\"{unrbvalue:.2f}\nN={unreplicated_coeqtls_num}\"\n",
+ "            else:\n",
+ "                # No usable rb: blank the estimate but still report the count.\n",
+ "                rb_entry, se_entry, p_entry = np.nan, np.nan, 0\n",
+ "                annotation = f\"N={unreplicated_coeqtls_num}\"\n",
+ "\n",
+ "        unrb_df.loc[replication_celltype, discovery_celltype] = rb_entry\n",
+ "        unrbse_df.loc[replication_celltype, discovery_celltype] = se_entry\n",
+ "        unrbpvalue_df.loc[replication_celltype, discovery_celltype] = p_entry\n",
+ "        unnumcoeqtl_df.loc[replication_celltype, discovery_celltype] = unreplicated_coeqtls_num\n",
+ "        unnum_anno_df.loc[replication_celltype, discovery_celltype] = \\\n",
+ "            f\"N={unreplicated_coeqtls_num}\"\n",
+ "        unanno_df.loc[replication_celltype, discovery_celltype] = annotation\n",
+ " \n",
+ "# Each count divided by the diagonal count of its discovery column.\n",
+ "unreplicated_ratio_df = pd.DataFrame(0.0, index=celltypes, columns=celltypes)\n",
+ "for discovery_celltype in unnumcoeqtl_df.columns:\n",
+ "    discovery_total = unnumcoeqtl_df.loc[discovery_celltype, discovery_celltype]\n",
+ "    for replication_celltype in unnumcoeqtl_df.index:\n",
+ "        unreplicated_ratio_df.loc[replication_celltype, discovery_celltype] = (\n",
+ "            unnumcoeqtl_df.loc[replication_celltype, discovery_celltype] / discovery_total\n",
+ "        )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":62: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_xticklabels([\"\"]+col_labels)\n",
+ ":63: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels([\"\"]+row_labels)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# One single-column heatmap per discovery cell type, coloured with that cell type's colour.\n",
+ "matplotlib.rcParams.update({'font.size': 14})\n",
+ "fig, axes = plt.subplots(1, 6, figsize=(7, 7), sharey=True)\n",
+ "for i, discovery_celltype in enumerate(['CD4T', 'CD8T', 'monocyte', 'DC', 'NK', 'B']):\n",
+ "    ratio_column = unreplicated_ratio_df[discovery_celltype].values.reshape((6, 1))\n",
+ "    cmap1 = LinearSegmentedColormap.from_list('mycmap', ['white', color_dict[discovery_celltype]])\n",
+ "    # Rows are drawn bottom-up: both the values and the row labels are reversed.\n",
+ "    im1, bar = heatmap(np.flip(ratio_column, axis=0),\n",
+ "                       list(rb_df.index)[::-1],\n",
+ "                       [discovery_celltype],\n",
+ "                       cmap=cmap1, ax=axes[i], vmin=0, vmax=1)\n",
+ "    bar.remove()\n",
+ "    # NOTE(review): the annotation data is NOT flipped while the heatmap data is;\n",
+ "    # confirm against annotate_heatmap that the text lands on the matching rows.\n",
+ "    _ = annotate_heatmap(im1,\n",
+ "                         data=ratio_column,\n",
+ "                         valfmt='{x:.0%}',\n",
+ "                         textcolors=('white', 'white'),\n",
+ "                         threshold=1)\n",
+ "    # Only the first panel keeps its axis (and therefore the row labels).\n",
+ "    if i > 0:\n",
+ "        axes[i].axis('off')\n",
+ "\n",
+ "plt.subplots_adjust(wspace=0, hspace=0)\n",
+ "plt.savefig('replication_ratio.unfiltered_results.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":62: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_xticklabels([\"\"]+col_labels)\n",
+ ":63: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels([\"\"]+row_labels)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Same layout as the replication-ratio figure, coloured by rb and annotated with unanno_df.\n",
+ "matplotlib.rcParams.update({'font.size': 14})\n",
+ "fig, axes = plt.subplots(1, 6, figsize=(7, 7), sharey=True)\n",
+ "for i, discovery_celltype in enumerate(['CD4T', 'CD8T', 'monocyte', 'DC', 'NK', 'B']):\n",
+ "    rb_column = unrb_df[discovery_celltype].values.reshape((6, 1))\n",
+ "    annotation_column = unanno_df[discovery_celltype].values.reshape((6, 1))\n",
+ "    cmap1 = LinearSegmentedColormap.from_list('mycmap', ['white', color_dict[discovery_celltype]])\n",
+ "    # Rows are drawn bottom-up: both the values and the row labels are reversed.\n",
+ "    im1, bar = heatmap(np.flip(rb_column, axis=0),\n",
+ "                       list(rb_df.index)[::-1],\n",
+ "                       [discovery_celltype],\n",
+ "                       cmap=cmap1, ax=axes[i], vmin=0, vmax=1)\n",
+ "    bar.remove()\n",
+ "    # NOTE(review): the annotation data is NOT flipped while the heatmap data is;\n",
+ "    # confirm against annotate_heatmap that the text lands on the matching rows.\n",
+ "    _ = annotate_heatmap(im1,\n",
+ "                         data=annotation_column,\n",
+ "                         valfmt='{x:^}',\n",
+ "                         textcolors=('white', 'white'),\n",
+ "                         threshold=1)\n",
+ "    # Only the first panel keeps its axis (and therefore the row labels).\n",
+ "    if i > 0:\n",
+ "        axes[i].axis('off')\n",
+ "\n",
+ "plt.subplots_adjust(wspace=0, hspace=0)\n",
+ "plt.savefig('rb_values.unfiltered_results.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## BIOS replication"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# BIOS replication summaries (filtered / unfiltered), indexed by cell type.\n",
+ "bios_filtered_summary_path = workdir/'bios/onlyRNAAlignMetrics_rmLLD/filtered_results/replication_summary.csv'\n",
+ "bios_unfiltered_summary_path = workdir/'bios/onlyRNAAlignMetrics_rmLLD/unfiltered_results/replication_summary.csv'\n",
+ "bios_replication_filtered_df = pd.read_csv(bios_filtered_summary_path, index_col=0).set_index('celltype')\n",
+ "bios_replication_unfiltered_df = pd.read_csv(bios_unfiltered_summary_path, index_col=0).set_index('celltype')\n",
+ "\n",
+ "# Plot colour per cell type; the CD4T sub-sampling variants share the CD4T colour.\n",
+ "color_dict = {\n",
+ "    'CD4T': '#2E9D33',\n",
+ "    'CD8T': 'darkgreen',\n",
+ "    'monocyte': '#EDBA1B',\n",
+ "    'NK': '#E64B50',\n",
+ "    'DC': '#965EC8',\n",
+ "    'B': '#009DDB',\n",
+ "    'cMono': 'peru',\n",
+ "    'ncMono': 'y',\n",
+ "    'CD4T_individual_100': '#2E9D33',\n",
+ "    'CD4T_individual_50': '#2E9D33',\n",
+ "    'CD4T_50': '#2E9D33',\n",
+ "    'CD4T_150': '#2E9D33',\n",
+ "    'CD4T_250': '#2E9D33',\n",
+ "}\n",
+ "\n",
+ "# Cell types missing from color_dict get None.\n",
+ "bios_replication_filtered_df['color'] = list(map(color_dict.get, bios_replication_filtered_df.index))\n",
+ "bios_replication_unfiltered_df['color'] = list(map(color_dict.get, bios_replication_unfiltered_df.index))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Export both BIOS summaries without the B row and without the plotting-only 'color' column.\n",
+ "bios_replication_filtered_df_clean = bios_replication_filtered_df.drop(index=['B'], columns=['color'])\n",
+ "bios_replication_filtered_df_clean.to_excel(workdir/'output/summary/rb_values_bios_replication_filtered_results.xlsx')\n",
+ "\n",
+ "bios_replication_unfiltered_df_clean = bios_replication_unfiltered_df.drop(index=['B'], columns=['color'])\n",
+ "bios_replication_unfiltered_df_clean.to_excel(workdir/'output/summary/rb_values_bios_replication_unfiltered_results.xlsx')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":3: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string \".\" (-> marker='.'). The keyword argument will take precedence.\n",
+ " ax2.errorbar(y=bios_replication_filtered_df.loc[sorted_celltypes]['r'].values,\n",
+ ":8: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string \".\" (-> marker='.'). The keyword argument will take precedence.\n",
+ " ax2.errorbar(y=bios_replication_unfiltered_df.loc[sorted_celltypes]['r'].values,\n",
+ ":12: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax2.set_xticklabels(['', 'CD4T', '', 'CD8T', '', 'monocyte', '', 'DC', '', 'NK'])\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# rb (with SE error bars) of the BIOS replication per cell type, filtered vs. unfiltered.\n",
+ "sorted_celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'NK']\n",
+ "x_positions = np.arange(len(sorted_celltypes))\n",
+ "fig, ax2 = plt.subplots()\n",
+ "# fmt='o' alone selects the marker; the original passed fmt='.' together with marker='o',\n",
+ "# which triggered matplotlib's 'marker is redundantly defined' warning.\n",
+ "ax2.errorbar(x=x_positions,\n",
+ "             y=bios_replication_filtered_df.loc[sorted_celltypes]['r'].values,\n",
+ "             yerr=bios_replication_filtered_df.loc[sorted_celltypes]['se_r'].values,\n",
+ "             fmt='o', markersize=6, color='black', label='Filtered results')\n",
+ "# Blank the whole unfiltered DC row so no point is drawn for it. This mutates the frame\n",
+ "# in place, so cells executed afterwards also see NaN for DC.\n",
+ "bios_replication_unfiltered_df.loc['DC'] = np.nan\n",
+ "# Small x offset so the two series do not overlap.\n",
+ "ax2.errorbar(x=x_positions + 0.05,\n",
+ "             y=bios_replication_unfiltered_df.loc[sorted_celltypes]['r'].values,\n",
+ "             yerr=bios_replication_unfiltered_df.loc[sorted_celltypes]['se_r'].values,\n",
+ "             fmt='o', markersize=6, markerfacecolor='white', color='black', label='Unfiltered results')\n",
+ "# Pin the tick positions before labelling them; labelling the automatic ticks relied on\n",
+ "# matplotlib's default locator and raised the FixedFormatter/FixedLocator warning.\n",
+ "ax2.set_xticks(x_positions)\n",
+ "ax2.set_xticklabels(sorted_celltypes)\n",
+ "ax2.legend()\n",
+ "ax2.set_ylabel('rb (SE)')\n",
+ "fig.savefig('sf20.comparison_rb_values_bios_replication.pdf')\n",
+ "fig.savefig('sf20.comparison_rb_values_bios_replication.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":14: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string \".\" (-> marker='.'). The keyword argument will take precedence.\n",
+ " ax.errorbar(y=bios_replication_filtered_df.loc[celltype]['r'],\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# BIOS replication rb (filtered results): one narrow panel per cell type,\n",
+ "# framed in that cell type's colour.\n",
+ "sorted_celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'NK']\n",
+ "fig, axes = plt.subplots(1, len(sorted_celltypes), figsize=(4, 5), sharey=True)\n",
+ "for ax, celltype in zip(axes, sorted_celltypes):\n",
+ "    filtered_row = bios_replication_filtered_df.loc[celltype]\n",
+ "    # fmt='o' alone selects the marker (fmt='.' plus marker='o' raised a redundancy warning).\n",
+ "    ax.errorbar(x=[0.4],\n",
+ "                y=filtered_row['r'],\n",
+ "                yerr=filtered_row['se_r'],\n",
+ "                fmt='o', markersize=6, ecolor='black',\n",
+ "                markeredgecolor='black', markerfacecolor='black')\n",
+ "    ax.set_xlim([0, 1])\n",
+ "    for side in ('bottom', 'top', 'right', 'left'):\n",
+ "        ax.spines[side].set_color(filtered_row['color'])\n",
+ "    ax.set_xticklabels([])\n",
+ "    ax.set_xlabel(celltype)\n",
+ "\n",
+ "plt.savefig('bios_replication.filtered_results.pdf')\n",
+ "plt.savefig('bios_replication.filtered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":14: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string \".\" (-> marker='.'). The keyword argument will take precedence.\n",
+ " ax.errorbar(y=bios_replication_filtered_df.loc[celltype]['r'],\n",
+ ":20: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string \".\" (-> marker='.'). The keyword argument will take precedence.\n",
+ " ax.errorbar(y=bios_replication_unfiltered_df.loc[celltype]['r'],\n",
+ "/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/envs/scpy3.8/lib/python3.8/site-packages/numpy/core/_asarray.py:102: UserWarning: Warning: converting a masked element to nan.\n",
+ " return array(a, dtype, copy=False, order=order)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# BIOS replication rb, filtered (filled marker) vs. unfiltered (open marker):\n",
+ "# one panel per cell type, framed in that cell type's colour.\n",
+ "sorted_celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'NK', 'B']\n",
+ "fig, axes = plt.subplots(1, len(sorted_celltypes), figsize=(12, 6), sharey=True)\n",
+ "for ax, celltype in zip(axes, sorted_celltypes):\n",
+ "    filtered_row = bios_replication_filtered_df.loc[celltype]\n",
+ "    unfiltered_row = bios_replication_unfiltered_df.loc[celltype]\n",
+ "    # fmt='o' alone selects the marker (fmt='.' plus marker='o' raised a redundancy warning).\n",
+ "    ax.errorbar(x=[0.4],\n",
+ "                y=filtered_row['r'],\n",
+ "                yerr=filtered_row['se_r'],\n",
+ "                fmt='o', markersize=6, ecolor='black',\n",
+ "                markeredgecolor='black', markerfacecolor='black')\n",
+ "    ax.errorbar(x=[0.6],\n",
+ "                y=unfiltered_row['r'],\n",
+ "                yerr=unfiltered_row['se_r'],\n",
+ "                fmt='o', markersize=6, ecolor='black',\n",
+ "                markeredgecolor='black', markerfacecolor='white')\n",
+ "    ax.set_xlim([0, 1])\n",
+ "    # The frame colour is taken from the filtered summary's 'color' column.\n",
+ "    for side in ('bottom', 'top', 'right', 'left'):\n",
+ "        ax.spines[side].set_color(filtered_row['color'])\n",
+ "    ax.set_xticklabels([])\n",
+ "    ax.set_xlabel(celltype)\n",
+ "\n",
+ "plt.savefig('bios_replication_comparison.filter_and_unfilter.pdf')\n",
+ "plt.savefig('bios_replication_comparison.filter_and_unfilter.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Per discovery cell type (one row of panels): counts on the left, rb with SE on the\n",
+ "# right, each drawn for the filtered and the unfiltered results.\n",
+ "celltypes = ['CD4T', 'CD8T', 'monocyte', 'B', 'NK', 'DC']\n",
+ "fig, axes = plt.subplots(6, 2, figsize=(12, 12), sharex=True)\n",
+ "for (ax1, ax2), celltype in zip(axes, celltypes):\n",
+ "    replication_celltypes = list(celltypes)\n",
+ "    for count_df in (numcoeqtl_df, unnumcoeqtl_df):\n",
+ "        ax1.scatter(x=replication_celltypes,\n",
+ "                    y=count_df[celltype].loc[replication_celltypes])\n",
+ "    for estimate_df, se_df, series_label in ((rb_df, rbse_df, 'filtered'),\n",
+ "                                             (unrb_df, unrbse_df, 'Unfiltered')):\n",
+ "        ax2.errorbar(x=replication_celltypes, fmt='.', markersize=12,\n",
+ "                     y=estimate_df[celltype].loc[replication_celltypes],\n",
+ "                     yerr=se_df[celltype].loc[replication_celltypes], label=series_label)\n",
+ "    ax1.set_ylabel(celltype)\n",
+ "# The legend is added once, on the last right-hand panel.\n",
+ "ax2.legend()\n",
+ "\n",
+ "plt.savefig('celltype_rb.comparison_filtered_unfiltered_results.pdf')\n",
+ "plt.savefig('celltype_rb.comparison_filtered_unfiltered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Sub celltypes in monocytes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " r | \n",
+ " se_r | \n",
+ " p | \n",
+ " celltype_discovery | \n",
+ " celltype_replication | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 0.971431 | \n",
+ " 0.048402 | \n",
+ " 1.351820e-89 | \n",
+ " ncMono | \n",
+ " cMono | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.929081 | \n",
+ " 0.088678 | \n",
+ " 1.101982e-25 | \n",
+ " ncMono | \n",
+ " monocyte | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.936797 | \n",
+ " 0.025409 | \n",
+ " 1.468276e-297 | \n",
+ " cMono | \n",
+ " ncMono | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.999726 | \n",
+ " 0.000613 | \n",
+ " 0.000000e+00 | \n",
+ " cMono | \n",
+ " monocyte | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 0.896203 | \n",
+ " 0.036240 | \n",
+ " 5.115902e-135 | \n",
+ " monocyte | \n",
+ " ncMono | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 0.949824 | \n",
+ " 0.008640 | \n",
+ " 0.000000e+00 | \n",
+ " monocyte | \n",
+ " cMono | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " r se_r p celltype_discovery celltype_replication\n",
+ "1 0.971431 0.048402 1.351820e-89 ncMono cMono\n",
+ "2 0.929081 0.088678 1.101982e-25 ncMono monocyte\n",
+ "3 0.936797 0.025409 1.468276e-297 cMono ncMono\n",
+ "4 0.999726 0.000613 0.000000e+00 cMono monocyte\n",
+ "5 0.896203 0.036240 5.115902e-135 monocyte ncMono\n",
+ "6 0.949824 0.008640 0.000000e+00 monocyte cMono"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# rb summary for the monocyte / cMono / ncMono comparison (filtered results).\n",
+ "mono_summary_path = workdir/'output/filtered_results/rb_calculations/monocyte_subcelltypes/summary.csv'\n",
+ "filtered_mono_res_df = pd.read_csv(mono_summary_path, index_col=0)\n",
+ "filtered_mono_res_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# filtered results\n",
+ "mono_subcelltypes = ['monocyte', 'cMono', 'ncMono']\n",
+ "# Six independent zero-filled square frames (rows: replication, columns: discovery).\n",
+ "(monorb_df, monorbse_df, monorbpvalue_df,\n",
+ " mononumcoeqtl_df, monoanno_df, mononum_anno_df) = (\n",
+ "    pd.DataFrame(0.0, index=mono_subcelltypes, columns=mono_subcelltypes) for _ in range(6)\n",
+ ")\n",
+ "\n",
+ "for discovery_celltype in mono_subcelltypes:\n",
+ "    for replication_celltype in mono_subcelltypes:\n",
+ "        if discovery_celltype == replication_celltype:\n",
+ "            # Diagonal: rb is fixed to 1 and N is the number of rows in the\n",
+ "            # discovery cell type's own significant co-eQTL table.\n",
+ "            monoreplicated_coeqtls_num = pd.read_csv(\n",
+ "                workdir/f'output/filtered_results/UT_{discovery_celltype}/coeqtls_fullresults_fixed.sig.tsv.gz',\n",
+ "                compression='gzip',\n",
+ "                sep='\\t'\n",
+ "            ).shape[0]\n",
+ "            rb_entry, se_entry, p_entry = 1, 0, 0\n",
+ "            annotation = f\"N={monoreplicated_coeqtls_num}\"\n",
+ "        else:\n",
+ "            # replication in other celltypes\n",
+ "            monorb_results = filtered_mono_res_df[\n",
+ "                (filtered_mono_res_df['celltype_discovery'] == discovery_celltype) &\n",
+ "                (filtered_mono_res_df['celltype_replication'] == replication_celltype)\n",
+ "            ]\n",
+ "            monoreplicated_coeqtls_num = pd.read_csv(\n",
+ "                workdir/f'output/filtered_results/rb_calculations/monocyte_subcelltypes/discovery_{discovery_celltype}_replication_{replication_celltype}.tsv.gz',\n",
+ "                compression='gzip',\n",
+ "                sep='\\t',\n",
+ "                index_col=0\n",
+ "            ).shape[0]\n",
+ "            # rb is only reported when r < 10; otherwise it is blanked and shown as NA.\n",
+ "            if monorb_results['r'].values[0] < 10:\n",
+ "                monorbvalue = monorb_results['r'].values[0]\n",
+ "                monorbsevalue = monorb_results['se_r'].values[0]\n",
+ "                rb_entry, se_entry, p_entry = monorbvalue, monorbsevalue, monorb_results['p'].values[0]\n",
+ "                annotation = f\"rb={monorbvalue:.2f}\nN={monoreplicated_coeqtls_num}\"\n",
+ "            else:\n",
+ "                rb_entry, se_entry, p_entry = np.nan, np.nan, 0\n",
+ "                annotation = f\"rb=NA\nN={monoreplicated_coeqtls_num}\"\n",
+ "\n",
+ "        monorb_df.loc[replication_celltype, discovery_celltype] = rb_entry\n",
+ "        monorbse_df.loc[replication_celltype, discovery_celltype] = se_entry\n",
+ "        monorbpvalue_df.loc[replication_celltype, discovery_celltype] = p_entry\n",
+ "        mononumcoeqtl_df.loc[replication_celltype, discovery_celltype] = monoreplicated_coeqtls_num\n",
+ "        monoanno_df.loc[replication_celltype, discovery_celltype] = annotation\n",
+ "        mononum_anno_df.loc[replication_celltype, discovery_celltype] = \\\n",
+ "            f\"N={monoreplicated_coeqtls_num}\"\n",
+ "\n",
+ "# Each count divided by the diagonal count of its discovery column.\n",
+ "monoreplicated_ratio_df = pd.DataFrame(0.0, index=mono_subcelltypes, columns=mono_subcelltypes)\n",
+ "for discovery_celltype in mononumcoeqtl_df.columns:\n",
+ "    discovery_total = mononumcoeqtl_df.loc[discovery_celltype, discovery_celltype]\n",
+ "    for replication_celltype in mononumcoeqtl_df.index:\n",
+ "        monoreplicated_ratio_df.loc[replication_celltype, discovery_celltype] = (\n",
+ "            mononumcoeqtl_df.loc[replication_celltype, discovery_celltype] / discovery_total\n",
+ "        )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " monocyte | \n",
+ " cMono | \n",
+ " ncMono | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " monocyte | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0.826087 | \n",
+ "
\n",
+ " \n",
+ " cMono | \n",
+ " 0.996441 | \n",
+ " 1.000000 | \n",
+ " 0.826087 | \n",
+ "
\n",
+ " \n",
+ " ncMono | \n",
+ " 0.985765 | \n",
+ " 0.980645 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " monocyte cMono ncMono\n",
+ "monocyte 1.000000 1.000000 0.826087\n",
+ "cMono 0.996441 1.000000 0.826087\n",
+ "ncMono 0.985765 0.980645 1.000000"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "monoreplicated_ratio_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":60: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_xticklabels([\"\"]+col_labels)\n",
+ ":61: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels([\"\"]+row_labels)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Left: replication ratio; right: rb annotated with the rb / N text from monoanno_df.\n",
+ "fig, axes = plt.subplots(1, 2, figsize=(10, 5))\n",
+ "ax1, ax2 = axes\n",
+ "mono_row_labels = list(monorb_df.index)\n",
+ "mono_col_labels = list(monorb_df.columns)\n",
+ "annotation_style = dict(color='black', threshold=1)\n",
+ "\n",
+ "im1, bar = heatmap(monoreplicated_ratio_df.values, mono_row_labels, mono_col_labels,\n",
+ "                   cmap='viridis', ax=ax1)\n",
+ "_ = annotate_heatmap(im1, data=monoreplicated_ratio_df.values, valfmt='{x:.0%}',\n",
+ "                     **annotation_style)\n",
+ "\n",
+ "im2, bar = heatmap(monorb_df.values, mono_row_labels, mono_col_labels,\n",
+ "                   cmap='viridis', ax=ax2)\n",
+ "_ = annotate_heatmap(im2, data=monoanno_df.values, valfmt='{x:^}',\n",
+ "                     **annotation_style)\n",
+ "\n",
+ "plt.savefig('cmono_ncmono_mono.filtered_results.pdf')\n",
+ "plt.savefig('cmono_ncmono_mono.filtered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Non-zero ratio and co-expression mean and variances"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Stack the per-cell-type annotated co-eQTL tables (filtered results) into one frame.\n",
+ "annotated_columns = ['mean_onemillionv2', 'var_onemillionv2',\n",
+ "                     'gene2_nonzeroratio_onemillionv2',\n",
+ "                     'eqtlgene_nonzeroratio_onemillionv2',\n",
+ "                     'gene2_isSig']\n",
+ "celltype_frames = []\n",
+ "for celltype in celltypes:\n",
+ "    celltype_annotated_coeqtl_df = pd.read_csv(\n",
+ "        workdir/f'output/filtered_results/UT_{celltype}/coeqtls_fullresults_fixed.all.annotated.tsv.gz',\n",
+ "        compression='gzip',\n",
+ "        sep='\\t',\n",
+ "        usecols=annotated_columns  # parse only the columns that are kept\n",
+ "    )[annotated_columns]\n",
+ "    celltype_annotated_coeqtl_df['celltype'] = celltype\n",
+ "    celltype_frames.append(celltype_annotated_coeqtl_df)\n",
+ "\n",
+ "# Concatenate once; growing the frame with pd.concat inside the loop re-copied all\n",
+ "# previously loaded rows on every iteration.\n",
+ "annotated_coeqtl_df = pd.concat(celltype_frames, axis=0) if celltype_frames else pd.DataFrame()\n",
+ "\n",
+ "# Infinite values become NaN in the cleaned copy; annotated_coeqtl_df itself is unchanged.\n",
+ "annotated_coeqtl_df_clean = annotated_coeqtl_df.replace([np.inf, -np.inf], np.nan)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Absolute mean_onemillionv2 per cell type, split by gene2_isSig (outliers hidden).\n",
+ "boxplot_style = dict(fliersize=1, palette='viridis', showfliers=False)\n",
+ "sns.boxplot(x=annotated_coeqtl_df_clean['celltype'],\n",
+ "            y=annotated_coeqtl_df_clean['mean_onemillionv2'].abs(),\n",
+ "            hue=annotated_coeqtl_df_clean['gene2_isSig'],\n",
+ "            **boxplot_style)\n",
+ "# plt.savefig('mean_onemillionv2.filtered_results.pdf')\n",
+ "# plt.savefig('mean_onemillionv2.filtered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# var_onemillionv2 per cell type, split by gene2_isSig (outliers hidden).\n",
+ "boxplot_style = dict(palette='viridis', fliersize=1, showfliers=False)\n",
+ "sns.boxplot(x=annotated_coeqtl_df_clean['celltype'],\n",
+ "            y=annotated_coeqtl_df_clean['var_onemillionv2'],\n",
+ "            hue=annotated_coeqtl_df_clean['gene2_isSig'],\n",
+ "            **boxplot_style)\n",
+ "# plt.savefig('var_onemillionv2.filtered_results.pdf')\n",
+ "# plt.savefig('var_onemillionv2.filtered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# gene2_nonzeroratio_onemillionv2 per cell type, split by gene2_isSig (outliers hidden).\n",
+ "boxplot_style = dict(palette='viridis', fliersize=1, showfliers=False)\n",
+ "sns.boxplot(x=annotated_coeqtl_df_clean['celltype'],\n",
+ "            y=annotated_coeqtl_df_clean['gene2_nonzeroratio_onemillionv2'],\n",
+ "            hue=annotated_coeqtl_df_clean['gene2_isSig'],\n",
+ "            **boxplot_style)\n",
+ "# plt.savefig('gene2_nonzeroratio_onemillionv2.filtered_results.pdf')\n",
+ "# plt.savefig('gene2_nonzeroratio_onemillionv2.filtered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# eqtlgene_nonzeroratio_onemillionv2 per cell type, split by gene2_isSig (outliers hidden).\n",
+ "boxplot_style = dict(palette='viridis', fliersize=1, showfliers=False)\n",
+ "sns.boxplot(x=annotated_coeqtl_df_clean['celltype'],\n",
+ "            y=annotated_coeqtl_df_clean['eqtlgene_nonzeroratio_onemillionv2'],\n",
+ "            hue=annotated_coeqtl_df_clean['gene2_isSig'],\n",
+ "            **boxplot_style)\n",
+ "# plt.savefig('eqtlgene_nonzeroratio_onemillionv2.filtered_results.pdf')\n",
+ "# plt.savefig('eqtlgene_nonzeroratio_onemillionv2.filtered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### unfiltered results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CD4T\n",
+ "CD8T\n",
+ "monocyte\n",
+ "DC\n",
+ "NK\n",
+ "B\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Stack the per-cell-type annotated co-eQTL tables (unfiltered results) into one frame.\n",
+ "annotated_columns = ['mean_onemillionv2', 'var_onemillionv2',\n",
+ "                     'gene2_nonzeroratio_onemillionv2',\n",
+ "                     'eqtlgene_nonzeroratio_onemillionv2',\n",
+ "                     'gene2_isSig']\n",
+ "celltype_frames = []\n",
+ "for celltype in celltypes:\n",
+ "    print(celltype)  # progress indicator\n",
+ "    celltype_annotated_coeqtl_df = pd.read_csv(\n",
+ "        workdir/f'output/unfiltered_results/UT_{celltype}/coeqtls_fullresults_fixed.all.annotated.tsv.gz',\n",
+ "        compression='gzip',\n",
+ "        sep='\\t',\n",
+ "        usecols=annotated_columns  # parse only the columns that are kept\n",
+ "    )[annotated_columns]\n",
+ "    celltype_annotated_coeqtl_df['celltype'] = celltype\n",
+ "    celltype_frames.append(celltype_annotated_coeqtl_df)\n",
+ "\n",
+ "# Concatenate once; growing the frame with pd.concat inside the loop re-copied all\n",
+ "# previously loaded rows on every iteration.\n",
+ "annotated_coeqtl_df = pd.concat(celltype_frames, axis=0) if celltype_frames else pd.DataFrame()\n",
+ "\n",
+ "# Infinite values become NaN in the cleaned copy; annotated_coeqtl_df itself is unchanged.\n",
+ "annotated_coeqtl_df_clean = annotated_coeqtl_df.replace([np.inf, -np.inf], np.nan)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Absolute mean_onemillionv2 per cell type, split by gene2_isSig (outliers hidden).\n",
+ "boxplot_style = dict(fliersize=1, palette='Paired', showfliers=False)\n",
+ "sns.boxplot(x=annotated_coeqtl_df_clean['celltype'],\n",
+ "            y=annotated_coeqtl_df_clean['mean_onemillionv2'].abs(),\n",
+ "            hue=annotated_coeqtl_df_clean['gene2_isSig'],\n",
+ "            **boxplot_style)\n",
+ "plt.savefig('mean_onemillionv2.unfiltered_results.pdf')\n",
+ "plt.savefig('mean_onemillionv2.unfiltered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXkAAAEGCAYAAACAd+UpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAiN0lEQVR4nO3deXgV9dn/8fedEE00uAJqgxAttUhRI4ZaXGiLItbdWqv+rHvFx1qhWttHrVfVlqe2dcWtLbVWUVxRtC611gVRu1gSUFEUU1dSkE2QSBIj3L8/ZpIGOEnOSWbOMvm8ritXklnvyZnc53u+8517zN0REZFkKsp1ACIiEh8leRGRBFOSFxFJMCV5EZEEU5IXEUmwPrkOoL1+/fp5ZWVlrsMQESkoNTU1y9y9f6p5eZXkKysrmT17dq7DEBEpKGb2Xkfz1F0jIpJgSvIiIgmmJC8ikmB51ScvIrKhlpYWFi5cSFNTU65DybnS0lIGDhxISUlJ2usoyYtIXlu4cCF9+/alsrISM8t1ODnj7ixfvpyFCxey0047pb2eumtEJK81NTWx7bbb9uoED2BmbLvtthl/olGSF5G819sTfKvu/B3UXSOJcsMNN1BXV7fetPr6egAqKio2Wn7IkCGce+65WYlNJBeU5CXxGhsbcx2CSM4oyUuipGqVT5w4EYDJkydnOxwpUB988AEnn3wyixcvpqioiPHjx7edR6n89Kc/ZfTo0Rx44IEp57/55pucddZZrFy5kubmZvbff3+mTJnC7NmzmTp1Ktdff31ch6IkLyKyoT59+nD11VczYsQIVq9ezV577cXYsWMZNmxYyuV/9rOfdbq9CRMmcN5553HkkUcC8OqrrwJQXV1NdXV1tMFvQBdeRaQg/fznP2fo0KGMHTuWE044gauuuop///vfHHzwwey1117sv//+vPHGGwCceuqpTJgwgX322Yedd96Z6dOnt23nyiuvZOTIkey+++5ceumlAOywww6MGDECgL59+7Lrrru2XdtJ5dRTT23b5oUXXsiwYcPYfffdueCCCwBYtGgRAwcObFt+t912A2DmzJkcdthhACxdupSxY8cyYsQIzjrrLAYPHsyyZct6/HdSkheRgjN79mweeOAB5syZw4MPPthW2HD8+PHccMMN1NTUcNVVV/G9732vbZ1Fixbxwgsv8Oijj3LhhRcC8OSTT/LWW2/x0ksvMXfuXGpqapg1a9Z6+3r33XeZM2cOe++9d5dxrVixghkzZvDaa6/xyiuvcMkllwBw3nnnMWbMGL7xjW9w7bXXsnLlyo3WvfzyyxkzZgy1tbUcffTRvP/++93986xH3TUiUnBeeOEFjjzySMrKygA4/PDDaWpq4m9/+xvHHnts23LNzc1tPx911FEUFRUxbNgwPvzwQyBI8k8++SR77rknAA0NDbz11luMHj267fdjjjmG6667ji222KLLuLbYYgtKS0v57ne/y6GHHtrWSj/ttNMYN24cTzzxBA8//DC/+93vePnllzc6phkzZgBw8MEHs/XWW3f3z7MeJXkRKTjuvtG0devWsdVWWzF37tyU62y66aYbre/uXHTRRZx11lkbLd/S0sIxxxzDiSeeyDe/+c204urTpw8vvfQSTz/9NPfccw833ngjzzzzDACf+9znOP300zn99NMZPnw48+bN6/KYoqDuGhEpOPvttx+PPPIITU1NNDQ08Nhjj7HZZpux0047cf/99wNB0tywtbyhcePGceutt9LQ0AAE91QsWbIEd+eMM85g11135fzzz087roaGBlatWsUhhxzCdddd1/aG88QTT9DS0gLA4sWLWb58+Ub3bey3337cd999QPAJ46OPPkp7v51RS15ECs7IkSM54ogj2GOPPRg8eDDV1dVsueWWTJs2jbPPPptJkybR0tLC8ccfzx577NHhdg466CDmz5/PqFGjACgvL+fOO+9kwYIF3HHHHey2225UVVUB8Itf
/IJDDjmk07hWr17NkUceSVNTE+7OtddeCwRJe+LEiZSWlgLBxd7tt9++7cIwwKWXXsoJJ5zAvffey1e/+lV22GEH+vbt25M/EwAW10eE7qiurnY9GUqipnHyhW3+/PnsuuuuG01vaGigvLycNWvWMHr0aKZMmdI2IqYQNTc3U1xcTJ8+ffj73//O2WefnbLrKdXfw8xq3D3lWEy15EWkII0fP57XX3+dpqYmTjnllIJO8ADvv/8+3/72t1m3bh2bbLIJv//97yPZrpK8iBSku+66K+v7POecc3jxxRfXmzZx4kROO+20Hm/7C1/4AnPmzOnxdjakJC8ikqabbrop1yFkTKNrREQSTEleRCTBlORFRBIs9j55M3sXWA2sBT7raJiPiEh3/Piii/lo5arItrf1Vlvy6yt+0ekyxcXFbUXGAB566CEqKytTLlteXt52s1UuZOvC69fdvefl1ERENvDRylUc8t3zItve47dc2+UyZWVlHZZPyDfqrhER6aGGhgYOOOAARowYwW677cbDDz+80TKLFi1i9OjRVFVVMXz4cJ5//nkguBt21KhRjBgxgmOPPTbyVn82krwDT5pZjZmN33CmmY03s9lmNnvp0qVZCEdEpGcaGxupqqqiqqqKo48+mtLSUmbMmEFtbS3PPvssP/zhDzcqOHbXXXcxbtw45s6dy8svv0xVVRXLli1j0qRJPPXUU9TW1lJdXc0111wTaazZ6K7Z193/Y2YDgL+a2Rvu3law2d2nAFMgKGuQhXhERHpkw+6alpYWLr74YmbNmkVRURH19fV8+OGHbL/99m3LjBw5ktNPP52WlhaOOuooqqqqeO6553j99dfZd999Afj000/b6uhEJfYk7+7/Cb8vMbMZwJeBWZ2vJSJSOKZNm8bSpUupqamhpKSEyspKmpqa1ltm9OjRzJo1i8cee4yTTjqJH/3oR2y99daMHTuWu+++O7bYYu2uMbPNzaxv68/AQcC8ztcSESksq1atYsCAAZSUlPDss8/y3nvvbbTMe++9x4ABAzjzzDM544wzqK2t5Stf+QovvvgidXV1AKxZs4YFCxZEGlvcLfntgBlm1rqvu9z9iZj3KSK9yNZbbZnWiJhMtpepE088kcMPP5zq6mqqqqoYOnToRsvMnDmTK6+8kpKSEsrLy5k6dSr9+/fntttu44QTTmh7itWkSZPYZZddenwcrVRqWBJPpYYLW0elhnurTEsNawiliEiCKcmLiCSYkryISIIpyYuIJJiSvIhIginJi4gkmB7/JyIF7fvn/4gly1ZEtr0B/bbhxmuu7HD+8uXLOeCAAwBYvHgxxcXF9O/fH4CXXnqJTTbZJLJYoqAkLyIFbcmyFfx7u69Gt8EPn+t09rbbbttWt+ayyy6jvLycCy64oG3+Z599Rp8++ZNa8ycSEZECdeqpp7LNNtswZ84cRowYQd++fddL/sOHD+fRRx+lsrKSO++8k+uvv55PP/2Uvffem5tvvpni4uLYYlOfvIhIBBYsWMBTTz3F1Vdf3eEy8+fP59577+XFF19k7ty5FBcXM23atFjjUkteRCQCxx57bJct8qeffpqamhpGjhwJBHXpBwwYEGtcSvIiIhHYfPPN237u06cP69ata/u9teywu3PKKadwxRVXZC0uddeIiESssrKS2tpaAGpra3nnnXcAOOCAA5g+fTpLliwBYMWKFSnLEkdJLXkRKWgD+m3T5YiYjLfXQ8cccwxTp06lqqqKkSNHtpUOHjZsGJMmTeKggw5i3bp1lJSUcNNNNzF48OAe77MjSvIiUtA6G9Met8suuyzl9LKyMp588smU84477jiOO+64GKNan7prREQSTEleRCTBlORFJO/l0xPscqk7fwcleRHJa6WlpSxfvrzXJ3p3Z/ny5ZSWlma0ni68ikheGzhwIAsXLmTp0qW5DiXnSktLGThwYEbrKMmLSF4rKSlhp512ynUYBUvdNSIiCaYkLyKSYEryIiIJpiQv
IpJgSvIiIgmmJC8ikmBZSfJmVmxmc8zs0WzsT0REAtlqyU8E5mdpXyIiEoo9yZvZQOBQ4Ja49yUiIuvLRkv+OuDHwLpUM81svJnNNrPZum1ZRCRasSZ5MzsMWOLuNR0t4+5T3L3a3av79+8fZzgiIr1O3C35fYEjzOxd4B5gjJndGfM+RUQkFGuSd/eL3H2gu1cCxwPPuPt34tyniIj8l8bJi4gkWJdJ3syGmtkBZla+wfSDM9mRu89098MyDVBERLqv0yRvZhOAh4FzgXlmdmS72b+IMzAREem5rh4aciawl7s3mFklMN3MKt19MmCxRyciIj3SVZIvdvcGAHd/18y+RpDoB6MkLyKS97rqk19sZlWtv4QJ/zCgH7BbjHGJiEgEukryJwOL209w98/c/WRgdGxRiYhIJDpN8u6+0N0Xm9lkM9tng3kvxhuaiIj0VLrj5GuBS8yszsyuNLPqOIMSEZFopJXk3f12dz8E+DKwAPiVmb0Va2QiItJjmd7xOgQYClQCb0QejYiIRCqtJG9mrS33nwGvEYydPzzWyEREpMe6Giff6h1glLsvizMYERGJVlpJ3t1/a2YV4QibPu2mz4otMhER6bG0kryZ/ZKgVPDrwNpwsgNK8iIieSzd7pqjgS+6e3OcwYiISLTSHV3zNlASZyAiIhK9dFvya4C5ZvY00Naad/cJsUQlIiKRSDfJ/yn8EhGRApLu6JrbzWwTYJdw0pvu3hJfWCIiEoV0R9d8DbgdeJegjvyOZnaKhlCKiOS3dLtrrgYOcvc3AcxsF+BuYK+4ApN43HDDDdTV1a03rb6+HoCKioqNlh8yZAjnnntuVmITkeilm+RLWhM8gLsvMDONtkmIxsbGXIcgIjFJN8nPNrM/AHeEv58I1MQTksQpVat84sSJAEyePDnb4YhIzNJN8mcD5wATCPrkZwE3xxWUiIhEI93RNc3ANeGXiIgUiE6TvJnd5+7fNrNXCWrVrMfdd48tMhER6bGuWvITw++HxR2IiIhEr9Mk7+6Lwu/vdWfjZlZK0H+/abiv6e5+aXe2JSIimeuqu2Y1KbppCC6+urtv0cX2m4Ex7t4QDrl8wcz+7O7/6F64IpJUqe7hgI7v49A9HOnpqiXftycbd3cHGsJfS8KvVG8aIiIp6T6OnumqJb9NZ/PdfUVXOzCzYoIx9UOAm9z9nxlFKCK9Qketct3H0TNdXXitIWh5W4p5Duzc1Q7cfS1QZWZbATPMbLi7z2udb2bjgfEAgwYNSjNsERFJR1fdNTtFtSN3X2lmM4GDgXntpk8BpgBUV1erK0dEJEJdddcMdfc3zGxEqvnuXtvF+v2BljDBlwEHAr/qdrQiIpKRrrprzifoSrk6xTwHxnSx/g7A7WG/fBFwn7s/mnGUIiLSLV1114wPv3+9Oxt391eAPbuzroiI9Fy6Dw0pBg4FKtuv4+6qZSMiksfSrUL5CNAEvAqsiy8cERGJUrpJfqCKkYmIFJ6iNJf7s5kdFGskIiISuXRb8v8guJGpCGgh/do1IiKSQ5k8yHsU8GpYj0ZERApAut01bwHzlOBFRApLui35RcBMM/szQflgQEMoRUTyXbpJ/p3wa5PwS0RECkC6D/K+HMDMNnf3T+INSUREopLuHa+jgD8A5cAgM9sDOMvdvxdncCId6egpQqm0Ltdal7wreuKQJEm63TXXAeOAPwG4+8tmNjquoES6UldXx9x581m7WafPtQGg6NNgvEDN2x92uWzxmi6fgyNSUNJN8rj7B2brPTtkbfThiKRv7Wbb0Dj0kEi3WfbG45FuTyTX0k3yH5jZPoCb2SbABGB+fGGJiEgU0h0n/z/AOUAFsBCoCn8XEZE8lu7ommXAiTHHIiIiEUt3dE1/4Ew2rid/ejxhiYhIFNLtk38YeB54Cl1wFREpGOkm+c3c/X9jjURERCKX7oXXR80s2rFqIiISu3ST/ESCRN9kZqvDr4/jDExERHou3dE1feMOREREopf2
Ha9mdgTQWspgprs/Gk9IIiISlXSHUP4SGAlMCydNNLP93P3C2CLLkY4KX9XX1wNQUVGx3nQVsxKRfJZuS/4QoMrd1wGY2e3AHCBxSb4jjY2NuQ5BRCRjaXfXAFsBrSX6tow+lPzQUau8tUzt5MmTsxmOiEiPpJvkrwDmmNmzgBH0zV8UW1QiIhKJtIZQuvvdwFeAB8OvUe5+T+t8M/tSqvXMbEcze9bM5pvZa2aW3lMbREQkEpnUk19E+NCQFO4ARqSY/hnwQ3evNbO+QI2Z/dXdX888VBERyVQmffKdsVQTwzeGReHPq81sPkG5YiX5XkwjmESyJ6ok710tYGaVwJ7APzeYPh4YDzBo0KCIwpFCpBFMItGLKsl3yszKgQeAH7j7euUQ3H0KMAWgurq6yzcLKXwawSS9Ua4+wXZ54dUCO3ax2KedrF9CkOCnufuDGcYnIpJojY2NsX6K7bIl7+5uZg8Be3WyzFdSTbfgyd9/AOa7+zXdDbI3SvWurz5ryQeZnJug87NVrj7Bpttd8w8zG+nu/8pw+/sCJwGvmtnccNrF7v54htvZSG880dRnLflK52b+SjfJfx04y8zeAz4hGE3j7r57Zyu5+wt0MPImDkk60VK9IanPWvKBzs3Ckm6S/0asUXSDTjQRka6lW0/+PQAzGwCUxhqRiIhEJq2yBmZ2hJm9BbwDPAe8C/w5xrhERCQC6XbX/Jygds1T7r6nmX0dOCG+sEQKl+7olXyS7jNeW9x9OVBkZkXu/ixQFV9YIskT93hokVTSbcmvDO9afR6YZmZLCIqPicgGdEev5JN0k/wsgoeGTAS+Q/DQkJ/FFJOISM4k7R6cdLtrDPgLMBMoB+4Nu29ERBKvkLva0h1CeTlwuZntDhwHPGdmC939wFijExHJsqTdg5NuS77VEmAxsBwYEH04IiISpXTHyZ9tZjOBp4F+wJldlTQQEZHcS/fC62CCWvBzY4xFREQilm6f/IVxByIiItHLypOhRER6k47uek6ldbnWi7ud6c5wTSV5EZGI1dXVMXfefNZutk2XyxZ9Gjz1tObtDztdrnjNim7FoiQvBam+vp7iNasoe6PHz59ZT/Ga5dTX62Zu6bm1m21D49BDItted8/1TIdQiohIAVFLPqHi6hOE/LiNu6KigsXNfSJtKUHQWqqo2C7SbUZF1S2lO5TkEyqOPkHofr+gxKdQb7eX7FCST7Co+wQh837BuD5R1NXVQVHfjGIpdKpuWTjiuGbU3etFSvISq7q6Oua/uYD+FTt2vXCfEgCWNXTdMl2zphHKe1eST5KkdyfmEyV5iV3/ih351vd+HOk2f/OTc2mJdIuSTUnvTozjmlF3rxcpyeeYujMkX8XZ2q6vr8+L7sTeIO+TfNI/1qk7Q/JVXOfm0voPKCkuUiMkS/I+ycd5ouULdWcUrqQ3QuI4N6ff/GtWLq6PdJvSsbxP8hDfiSbSU72hEZJUcXdH5Ut6zY8oRAqYGiGFqbd0R8Wa5M3sVuAwYIm7D49zXyJJsnLZElYubkn/IjuF1Q2UL+LsjipesyKtC8FFTR8DsK50i06XC0YO5d/omtuAG4GpMe9H8lR9fT0fN3wSecu0pbmZorUfR7rNfNLS3Exzy9q0hg0W4hDDpCsrK2PIkCFpLVtXtxqAITt3lcC3S3ub7cWa5N19lplVxrkPkaTSEMPCVVFRkfYdyHHfsZzzPnkzGw+MBxg0aFCOo8m+pLd0Kyoq2LShMZ7RQyWdf7yVnonr3Fxa/wHrPmuBsvzos066nCd5d58CTAGorq72DefHeaI1l28e6Tal94nr/Py0qZHi5mbVy5cey3mS7+3U0pV8Fde52XphsinSrWautzQg8z7Jx3mi9Ssvi3Sb0vvE+Sa9pmTLXlUvX+IR9xDKu4GvAf3MbCFwqbv/Ic59ZqK+vj7tYWdxP2xXRLKrtzQg4x5dc0Kc2++pxsbGvHnYrohIHPK+uyZu+fKwXRGROPT6JC/xW1r/
QVoXt1YuWwLAVv0GdLlsS3MzlPQ4NJGs6ahWTkddwVF1+yrJ54EkJ8FM7tBbuTiom5lOf+bKzcpY1e2oohXX61fcEu1t8dD9W+MlPmVl8fbfK8nnWNKTYCYtkUzu/Js4cWJat/HHLa7Xr3nb4DpROiNh0r8tHjK9NT6ON7C2Al55IK7j6/fFXTaanqvBGAWR5JN8oiU9CSZdXK9fJuLablxvYP2+uAv19fWsau52aJGI8/i6U2MmLnmf5JN+oonkqzjfwCZOnMjiHDdC8uENOhvyPskn/UQTEYlT7vsrREQkNnnfkpfuqa+vp3jNKhW4EunllORFJOvUCMkeJfmEqqioYHFzHxW4EunllOSlYEX9DM3WbepmofipEZI9SvJSkDIZWhvnzUIi+U5JXgpSbxnjLNJTGkIpIpJgasmLFIhcVTGUwqYkL1Lg4q5iKIVNSV6kQKhVLt2hPnkRkQRTkhcRSTAleRGRBOvVffJx1M9Q7QwRySe9OsmLiGwo1VDVjoapQv4PVe3VST6O+hmqnSEaz548hTxMtVcneZFsKuRE0Zsk7Q1XSV4kYklLElLYNLpGRCTBYm/Jm9nBwGSgGLjF3X8Z9z6TIJOLP4XWp6s+awE9DyBbYk3yZlYM3ASMBRYC/zKzP7n76z3ddtKugKcj0z7dQvsnUp91YYjif0/PA8ieuFvyXwbq3P1tADO7BzgS6HGST6U7SWLDRFjU9DG2riXt9b2oZL3kGFUS7OkbUkcnen19PY2NjetNa1zbBEDZOtto+bKyMioqKtpN6fk/UZxvtr3xzT8fZPq/l+pv3tEnvI7otUtP3Em+Avig3e8Lgb3bL2Bm44HxAIMGDUp7w1G8uKmSVX39Zxslwc4ESbB9Us+PlkRHf59U/0j19fUAGyTzQBL+kfQJIVrZPh/0+vWMuXt8Gzc7Fhjn7t8Nfz8J+LK7pzxLqqurffbs2bHFIyKSRGZW4+7VqebFPbpmIbBju98HAv+JeZ8iIhKKO8n/C/iCme1kZpsAxwN/inmfIiISirVP3t0/M7PvA38hGEJ5q7u/Fuc+RUTkv2IfJ+/ujwPRlXkUEZG06Y5XEZEEU5IXEUkwJXkRkQRTkhcRSbBYb4bKlJktBd7L4i77AcuyuL9s0/EVNh1f4cr2sQ129/6pZuRVks82M5vd0V1iSaDjK2w6vsKVT8em7hoRkQRTkhcRSbDenuSn5DqAmOn4CpuOr3DlzbH16j55EZGk6+0teRGRRFOSFxFJsEQleTPb3szuMbN/m9nrZva4me1iZo1mNsfM5pvZS2Z2Sop1R5rZWjP7lplta2Zzw6/FZlbf7vdNcnFsYYwZH5+ZbWlmj5jZy2b2mpmdlq/Hl01mdqqZfS4P4lgb/t1fC1+j882sqN38L5vZLDN708zeMLNbzGyzXMacKTNzM7u63e8XmNll4c+XmdkF4c+lZvZXM7s0R6H2WLvX82UzqzWzfXIdU+xVKLPFzAyYAdzu7seH06oIHrj6b3ffM5y2M/CgmRW5+x/DacXArwhKIuPuy4GqcN5lQIO7X5XN49lQD47vHOB1dz/czPoDbwLT3L0qXP4y8uD4cuBUYB65f4hNY7vXYgBwF7AlcKmZbQfcDxzv7n8Pz4FjgL7AmhzF2x3NwDfN7Ap3T3mDUNi4eACocffLsxpdtNq/nuOAK4Cv5jKgJLXkvw60uPtvWye4+1zWf8Ys4UPFzwcmtJt8LsEJtiT+MLutu8fnQN8wQZQDK4DPshGwmVW2a33OM7NpZnagmb1oZm+FrdRtzOwhM3vFzP5hZruH615mZrea2Uwze9vMJrTb7vnh9uaZ2Q/aTT853M7LZnaHmfU1s3fMrCScv4WZvWvBYymrgWlhq6vMzPYys+fMrMbM/mJmO2Tjb9Seuy8heN7x98PX6xyCN/W/h/Pd3ae7+4fZjq2HPiMYbXJeB/P7APcAb7n7hVmLKn5bAB/lOojEtOSB4UBNmsvWAkMBzKwCOBoYA4yM
J7RIdOv4gBsJnsb1H4IW4HHuvi768Do0BDiWIHn9C/h/wH7AEcDFBG9Sc9z9KDMbA0wl/BRFcAxfD+N+08x+A+wOnEbwQHgD/mlmzwGfAj8B9nX3ZWa2jbuvNrOZwKHAQwRPJnvA3e83s3OAC9x9dvgmcANwpLsvNbPjgP8DTo/x75KSu78ddtcMIHjNb892DDG5CXjFzH6dYt6Pgafc/QfZDSkWZWY2FygFdiDIKzmVpCSfCWv383XA/7r72qDxlAjtD2QcMJfgZPs88Fcze97dP85SLO+4+6sAZvYa8LS7u5m9ClQCgwm6IHD3Z8LrBVuG6z7m7s1As5ktIeia2g+Y4e6fhNt8ENif4BPL9NbuAHdfEW7jFoIk8hDBm8OZKWL8IkFC/Wt4DhQDiyL7C2QuMSdiK3f/2MymEnzCbNxg9gvAKDPbxd0XZD+6SLXvrhkFTDWz4Z7DsepJ6q55DdgrzWX3BOaHP1cD95jZu8C3gJvN7KjIo+u57h7facCD4Uf9OuAd/tvKz4bmdj+va/f7OoJGRqqE1voP0X7dtZ0sTzh9o38kd38RqDSzrwLF7j6vg3Vfc/eq8Gs3dz+oowOKU3hNZS1B12Emr3khuA44A9h8g+mzgB8Af86Hi+FRCbvZ+gEpC4dlS5KS/DPApmbW1lIzs5EELUXaTasEriL4eI677+Tule5eCUwHvufuD2Up5kx06/iA94EDwnnbEbRa385CvOmaBZwIYGZfA5Z18SljFnCUmW1mZpsTdLU9DzwNfNvMtg23tU27daYCdwN/bDdtNUE3EAQXo/uHLS/MrMTMvtTD48pYeGH8t8CNYcvvRuAUM9u73TLfMbPtsx1bFMJPV/cRJPoN5z0AXAk8YWZbZTm0WJjZUIJPhctzGUdiumvCLoCjgevM7EKgCXiXoIXweTObQ9BPthq4oXVkTaHowfH9HLgt7B4xgq6pfCrvehnwRzN7hWDEyEbDW9tz91ozuw14KZx0i7vPATCz/wOeM7O1wByCETQA04BJBIm+1W3Ab82sERhF8Cnu+rCrqA9BqzMbD51v7cMtIbhAeQdwDYC7f2hmxwNXhSNv1hG8yT2YhbjicjXw/VQz3P234RvYn8zsIHdvym5okWh9PSH4fzvF3dfmMB6VNZDkM7NvEVxUPSnXsYhkW2Ja8iKpmNkNwDeAQ3Idi0guqCUvIpJgSbrwKiIiG1CSFxFJMCV5EZEEU5IXCVlQmfLG8Of21RHzomKlSHcoyYt07VRASV4KkpK8JF6K6pT9zewBM/tX+LVvJ+t+i/UrVh5qZjPazR8b1s/BzBrM7GoL6og/Hd7Bipl93syeCCtcPh/eCSmSFUrykmhheYKfAGPcfQ9gIjAZuNbdRxIUR7ulo/XdfTowGzgxLDz1OLBrawInqA3Uenfx5kCtu48AngNaH34xBTjX3fcCLgBuju4IRTqnm6Ek6cawQXVKMzsQGNau6ugWZta3ow20F5aXuAP4jpn9kaAkwsnh7HXAveHPdxI8vKUc2Ae4v93+Nu3hMYmkTUleki5VdcoiYJS7r1fyNoNS038EHiGoH3S/u3f0EBYP97WytfysSLapu0aSLlV1yidpVyTLgscodqZ9xUrc/T8ED2G5hKDQWasigkJnEDwc5YWwouY7FjyNCgvs0YPjEcmIWvKSaO7+WorqlBOAm8LKl30IKjv+TyebuY12FSvDTwDTgP7u/nq75T4BvmRmNcAq4Lhw+onAb8zsEoJqk/cAL0d1jCKdUe0akW4Ix9PPcfc/tJvW4O7lOQxLZCNK8iIZClvqnwBjw8cTtk5Xkpe8oyQvIpJguvAqIpJgSvIiIgmmJC8ikmBK8iIiCaYkLyKSYP8fsn0+EcPGrYUAAAAASUVORK5CYII=\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sns.boxplot(x=annotated_coeqtl_df_clean['celltype'], \n",
+ " y=annotated_coeqtl_df_clean['var_onemillionv2'],\n",
+ " hue=annotated_coeqtl_df_clean['gene2_isSig'],\n",
+ " palette='Paired', fliersize=1,\n",
+ " showfliers = False)\n",
+ "plt.savefig('var_onemillionv2.unfiltered_results.pdf')\n",
+ "plt.savefig('var_onemillionv2.unfiltered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sns.boxplot(x=annotated_coeqtl_df_clean['celltype'],\n",
+ " y=annotated_coeqtl_df_clean['gene2_nonzeroratio_onemillionv2'],\n",
+ " hue=annotated_coeqtl_df_clean['gene2_isSig'],\n",
+ " palette='Paired', fliersize=1, showfliers = False)\n",
+ "plt.savefig('gene2_nonzeroratio_onemillionv2.unfiltered_results.pdf')\n",
+ "plt.savefig('gene2_nonzeroratio_onemillionv2.unfiltered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sns.boxplot(x=annotated_coeqtl_df_clean['celltype'], \n",
+ " y=annotated_coeqtl_df_clean['eqtlgene_nonzeroratio_onemillionv2'],\n",
+ " hue=annotated_coeqtl_df_clean['gene2_isSig'],\n",
+ " palette='Paired', fliersize=1, showfliers = False)\n",
+ "plt.savefig('eqtlgene_nonzeroratio_onemillionv2.unfiltered_results.pdf')\n",
+ "plt.savefig('eqtlgene_nonzeroratio_onemillionv2.unfiltered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/04_coeqtl_mapping/README.md b/04_coeqtl_mapping/README.md
new file mode 100644
index 0000000..0b4821e
--- /dev/null
+++ b/04_coeqtl_mapping/README.md
@@ -0,0 +1,44 @@
+# 04_coeqtl_mapping
+
+*plot_effect_concordance_across_cohorts.R*: compares effect sizes (Z-scores) calculated in each individual dataset (before the meta-analysis)
+
+*plot_celltype_overlap_upset.R* : upset plot overlap of significant co-eQTLs between cell types
+
+*power_analyis_coeqtls.R* : explores how the number of tests reduces the power to detect co-eQTLs, estimating the number of tests from how many genes are expressed above different cutoffs in the Oelen v3 dataset
+
+The Rb calculations use these files:
+*prepare_for_rb_calculation.py* : prepare the input files for rb calculation
+*Rb.R* : rb function
+*calculate_rb_for_sc_and_bios.R* : execute rb functions
+*rb_celltypes.ipynb*: examines the Rb values for each cell type; also includes code for comparing the characteristics of co-eQTLs with those of non-co-eQTLs
+
+The co-eQTL pipeline consists of these files:
+all files in the *betaqtl_scripts* directory (incl. templates)
+*individual_networks.py*: make co-expression files for each individual
+*prepare_genelist_and_annotation_for_betaqtl.py*: prepare input files for the qtl mapping pipeline
+*createBatches.sh*: create batches for qtl mapping pipeline
+*submit_process_betaqtl_results.sh*: submit the jobs that concatenate the QTL mapping results and perform the multiple testing correction
+*concat_betaqtl_results.fixed.py*: concat qtl mapping results
+*screen_permutation_p_values.py*: concat permutation files
+*multipletesting_correction.fixed.py*: perform multiple testing correction
+
+Other co-eQTL analysis:
+*filtering_strategy.py*: filter for gene pairs
+*individual_networks_cmono_ncmono.py*: create co-expression files for each individual for sub cell types in monocytes
+*individual_networks_maxcell.py*: create co-expression files for each individual with a limit of cell number
+*merge_coexpression_for_betaeqtl_maxcell.py*: merge the co-expression files for each individual with a limit of cell number
+*merge_coexpression_for_betaqtl.subsampleindividuals.py*: create co-expression files for each individual with a limit of sample number
+
+The BIOS replication uses these files:
+*replication_in_bios.py*: perform bios replication
+*select_snps_from_vcf.sh*: select SNPs from a VCF file
+*examine_bios_replication.ipynb*: examine the bios replication results
+
+Annotating coeQTL results:
+*annotate_coeqtl_files.py*: annotate the coeqtl results for nonzero ratio, mean and var of gene pair
+*collect_nonzeroratio.py*: collect non zero ratio annotation for all genes in all datasets
+
+
+
+
+
diff --git a/04_coeqtl_mapping/Rb.R b/04_coeqtl_mapping/Rb.R
new file mode 100644
index 0000000..eb22df3
--- /dev/null
+++ b/04_coeqtl_mapping/Rb.R
@@ -0,0 +1,70 @@
+
+#' Estimate the correlation of true effect sizes (Rb) between two datasets
+#'
+#' The estimate is the covariance of the observed effects, minus the expected
+#' covariance of their estimation errors (mean theta times the square root of
+#' the product of the mean squared standard errors), divided by the square
+#' root of the product of the two error-corrected effect variances. When an
+#' error-corrected variance is negative, the uncorrected variance is used
+#' instead. The standard error comes from a leave-one-out jackknife and the
+#' P-value from a 1-df chi-square test on (r / se_r)^2.
+#'
+#' @param b1 Beta from first dataset.
+#' @param se1 Standard error of beta from first dataset.
+#' @param b2 Beta from second dataset.
+#' @param se2 Standard error of beta from second dataset.
+#' @param theta Variable representing sample overlap between two datasets. Should be set 0 if no sample overlap.
+#'   Either a single value or one value per effect.
+#'
+#' @return One-row numeric matrix with columns `r` (Rb), `se_r` (its jackknife
+#'   standard error) and `p` (the corresponding P-value).
+#' @export
+#'
+#' @note This function is slightly adapted from the script shared by Ting Qi.
+#'
+#' @examples
+#' b1 <- c(0.5, -0.3, 0.8, 0.1, -0.6, 0.4)
+#' b2 <- c(0.4, -0.2, 0.9, 0.0, -0.5, 0.3)
+#' calcu_cor_true(b1, rep(0.1, 6), b2, rep(0.1, 6), theta = 0)
+calcu_cor_true <- function(b1, se1, b2, se2, theta) {
+  # A scalar theta is recycled to one value per effect. Otherwise negative
+  # indexing below (theta[-idx], theta[-k]) would empty it whenever index 1 is
+  # dropped, turning that estimate into NaN.
+  if (length(theta) == 1) {
+    theta <- rep(theta, length(b1))
+  }
+
+  # Drop effects with a non-finite beta or standard error in either dataset.
+  idx <- which(is.infinite(b1) | is.infinite(b2) | is.infinite(se1) | is.infinite(se2))
+  if (length(idx) > 0) {
+    b1 <- b1[-idx]
+    se1 <- se1[-idx]
+    b2 <- b2[-idx]
+    se2 <- se2[-idx]
+    theta <- theta[-idx]
+  }
+
+  # Point estimate of Rb for one set of effects; shared by the full-data
+  # estimate and every jackknife replicate so both use the same formula.
+  estimate_rb <- function(x1, s1, x2, s2, th) {
+    var_x1 <- var(x1, na.rm = TRUE) - mean(s1^2, na.rm = TRUE)
+    var_x2 <- var(x2, na.rm = TRUE) - mean(s2^2, na.rm = TRUE)
+    # Fall back to the uncorrected variance when the correction overshoots.
+    if (var_x1 < 0) {
+      var_x1 <- var(x1, na.rm = TRUE)
+    }
+    if (var_x2 < 0) {
+      var_x2 <- var(x2, na.rm = TRUE)
+    }
+    cov_errors <- mean(th, na.rm = TRUE) * sqrt(mean(s1^2, na.rm = TRUE) * mean(s2^2, na.rm = TRUE))
+    cov_x1_x2 <- cov(x1, x2, use = "complete.obs") - cov_errors
+    cov_x1_x2 / sqrt(var_x1 * var_x2)
+  }
+
+  r <- estimate_rb(b1, se1, b2, se2, theta)
+
+  # Leave-one-out jackknife replicates.
+  n <- length(b1)
+  r_jack <- vapply(
+    seq_len(n),
+    function(k) estimate_rb(b1[-k], se1[-k], b2[-k], se2[-k], theta[-k]),
+    numeric(1)
+  )
+
+  # Replicates that could not be estimated (NA/NaN) are left out of the sum,
+  # while n stays the total number of effects.
+  r_mean <- mean(r_jack, na.rm = TRUE)
+  se_r <- sqrt((n - 1) / n * sum((r_jack - r_mean)^2, na.rm = TRUE))
+
+  p <- pchisq((r / se_r)**2, df = 1, lower.tail = FALSE)
+
+  res <- cbind(r, se_r, p)
+  return(res)
+}
diff --git a/04_coeqtl_mapping/annotate_coeqtl_files.py b/04_coeqtl_mapping/annotate_coeqtl_files.py
new file mode 100644
index 0000000..5d6e71e
--- /dev/null
+++ b/04_coeqtl_mapping/annotate_coeqtl_files.py
@@ -0,0 +1,84 @@
+import pandas as pd
+from pathlib import Path
+import numpy as np
+import argparse
+
+
+def parse():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--celltype', dest='celltype')
+ parser.add_argument('--networkcelltype', dest='networkcelltype')
+ parser.add_argument('--filtertype', dest='filtertype')
+ return parser
+
+args = parse().parse_args()
+celltype = args.celltype
+filtertype = args.filtertype
+networkcelltype = args.networkcelltype
+# filtertype = 'filtered_results'
+workdir = Path("./coeqtl_mapping/")
+coeqtl_filepath = workdir/f'output/{filtertype}/UT_{celltype}/coeqtls_fullresults_fixed.all.tsv.gz'
+
+def find_gene2(genepair, eqtlgene):
+ gene1, gene2 = genepair.split(';')
+ if gene1 == eqtlgene:
+ return gene2
+ else:
+ return gene1
+
+coeqtl_df = pd.read_csv(coeqtl_filepath, sep='\t', compression='gzip')
+coeqtl_df['gene2'] = [find_gene2(item[0], item[1]) for item in coeqtl_df[['Gene', 'eqtlgene']].values]
+unique_genepairs = list(set(coeqtl_df['Gene']))
+
+
+network_prefix = Path("./coeqtl_mapping/input/individual_networks/UT/")
+def annotate_by_datasets(datasetname, coeqtl_df, unique_genepairs):
+ def read_numpy(prefix):
+ data = np.load(f'{prefix}.npy')
+ columns = [item.strip() for item in open(f'{prefix}.cols.txt', 'r').readlines()]
+ rows = [item.strip() for item in open(f'{prefix}.rows.txt', 'r').readlines()]
+ return pd.DataFrame(data=data, columns=columns, index=rows)
+ print(f"Loading {datasetname}.")
+ network_df = read_numpy(network_prefix / datasetname / f'UT_{networkcelltype}.zscores')
+ individual_ids = network_df.columns.copy()
+ common_genepairs = list(set(unique_genepairs) & set(network_df.index))
+ selected_network_df = network_df.loc[common_genepairs]
+ selected_network_df[f'var_{datasetname}'] = np.nanvar(selected_network_df[individual_ids].values, axis=1)
+ selected_network_df[f'mean_{datasetname}'] = np.nanmean(selected_network_df[individual_ids].values, axis=1)
+ var_mean_dic = selected_network_df[[f'var_{datasetname}', f'mean_{datasetname}']].T.to_dict()
+ get_var = lambda x:var_mean_dic.get(x)[f'var_{datasetname}'] if x in var_mean_dic else np.nan
+ get_mean = lambda x:var_mean_dic.get(x)[f'mean_{datasetname}'] if x in var_mean_dic else np.nan
+ coeqtl_df[f'var_{datasetname}'] = [get_var(genepair) for genepair in coeqtl_df['Gene']]
+ coeqtl_df[f'mean_{datasetname}'] = [get_mean(genepair) for genepair in coeqtl_df['Gene']]
+ return coeqtl_df
+
+for datasetname in ['onemillionv2', 'onemillionv3', 'stemiv2', 'ng']:
+ coeqtl_df = annotate_by_datasets(datasetname, coeqtl_df, unique_genepairs)
+
+
+def annotate_with_nonzero(df, celltype, datasetname, condition='UT'):
+ nonzeroratio_prefix = Path(
+ "./coeqtl_mapping/input/gene_pair_selection/annotations/")
+ nonzeroratio_path = nonzeroratio_prefix/f'{datasetname}.genes_nonzeroratio.tsv'
+ nonzero_df = pd.read_csv(nonzeroratio_path, sep='\t', index_col=0)
+ if condition == 'UT' and datasetname == 'stemiv2':
+ colname = f'{datasetname}_t8w_{celltype}'
+ elif condition == 'UT' and datasetname.startswith('onemillion'):
+ colname = f'{datasetname}_UT_{celltype}'
+ elif condition == 'UT' and datasetname.startswith('ng'):
+ colname = f'{datasetname}_{celltype}'
+ else:
+ raise NotImplementedError(f"{datasetname} {celltype} not understood")
+ nonzero_dict = nonzero_df[colname].T.to_dict()
+ df[f'eqtlgene_nonzeroratio_{datasetname}'] = [nonzero_dict.get(genename) for genename in df['eqtlgene']]
+ df[f'gene2_nonzeroratio_{datasetname}'] = [nonzero_dict.get(genename) for genename in df['gene2']]
+ return df
+
+for datasetname in ['onemillionv2', 'onemillionv3', 'stemiv2', 'ng']:
+ print(datasetname)
+ coeqtl_df = annotate_with_nonzero(coeqtl_df, networkcelltype, datasetname)
+
+
+coeqtl_df.to_csv(workdir/f'output/{filtertype}/UT_{celltype}/coeqtls_fullresults_fixed.all.annotated.tsv.gz',
+ compression='gzip', sep='\t', index=False)
+
diff --git a/04_coeqtl_mapping/betaqtl_scripts/createBatches.py b/04_coeqtl_mapping/betaqtl_scripts/createBatches.py
new file mode 100644
index 0000000..76763b6
--- /dev/null
+++ b/04_coeqtl_mapping/betaqtl_scripts/createBatches.py
@@ -0,0 +1,166 @@
+import gzip
+import sys
+import os
+import glob
+
+if len(sys.argv) < 6:
+ print("Usage: createbatches.py expfile.txt.gz gte.txt genotype.vcf.gz genelist.txt.gz annotation.txt.gz template.sh nrmaxgenesperbatch outdir")
+ sys.exit(0)
+
+expfile = sys.argv[1]
+gte = sys.argv[2]
+genotype = sys.argv[3]
+genelist = sys.argv[4]
+annotation = sys.argv[5]
+template = sys.argv[6]
+nrgenes = int(sys.argv[7])
+out = sys.argv[8]
+condition = sys.argv[9]
+celltype = sys.argv[10]
+
+if not out.endswith("/"):
+ out = out + "/"
+
+def writeJob(exp, gte, gt, template, batchfile, jobfile, outprefix, logprefix, chr, condition, celltype):
+ print("Writing job: "+jobfile)
+ fh = open(template,'r')
+ lines = fh.readlines()
+ fh.close()
+ fho = open(jobfile,'w')
+ for line in lines:
+ line = line.replace("GENOTYPE",gt)
+ line = line.replace("GTE",gte)
+ line = line.replace("EXPRESSION",exp)
+ line = line.replace("CHROM",str(chr))
+ line = line.replace("BATCHFILE",batchfile)
+ line = line.replace("OUTPREFIX",outprefix)
+ line = line.replace("LOGPREFIX",logprefix)
+ line = line.replace("CONDITION", condition)
+ line = line.replace("CELLTYPE", celltype)
+ fho.write(line)
+ fho.close()
+
+def checkDir(path):
+ if os.path.exists(path):
+ # delete contents
+ files = glob.glob(path+"*")
+ for file in files:
+ print("Removing: "+file)
+ os.remove(file)
+ else:
+ print("Creating dir: "+path)
+ os.mkdir(path)
+
+abspath = os.path.abspath(out)
+checkDir(abspath+"/batches/")
+checkDir(abspath+"/output/")
+checkDir(abspath+"/jobs/")
+checkDir(abspath+"/logs/")
+
+# read expression file
+fh = None
+genesinfile=genelist
+print("Reading: "+genesinfile)
+if genesinfile.endswith(".txt.gz"):
+ fh = gzip.open(genesinfile,'rt')
+else:
+ fh = open(genesinfile,'r')
+genesInExp = set()
+fh.readline()
+for line in fh:
+# gene = line.split("\t", maxsplit=1)[0]
+ gene = line.strip()
+ genesInExp.add(gene)
+# print(gene)
+
+fh.close()
+print("{} genes in {}".format(len(genesInExp),expfile))
+
+# read gene set
+geneset = set()
+fh = None
+print("Genelist: "+genelist)
+if genelist.endswith(".txt.gz"):
+ fh = gzip.open(genelist,'rt')
+else:
+ fh = open(genelist,'r')
+for line in fh:
+ gene = line.strip()
+ if gene in genesInExp:
+ geneset.add(line.strip())
+fh.close()
+print("Genes in genelist: {}".format(len(geneset)))
+
+# read annotation
+print("Annotation: "+annotation)
+fh = None
+if annotation.endswith(".txt.gz"):
+ fh = gzip.open(annotation,'rt')
+else:
+ fh = open(annotation,'r')
+fh.readline()
+genesPerChr = {}
+annotread = 0
+for line in fh:
+ elems = line.strip().split("\t")
+ gene = elems[1]
+ if gene in geneset:
+ chr = -1
+ try:
+ chr = int(elems[3])
+ except:
+ print(gene+" has non-numeric chromosome: "+elems[3])
+ if chr < 23 and chr > 0:
+ pos = int(elems[4])
+ chrgenes = genesPerChr.get(chr)
+ if chrgenes is None:
+ chrgenes = []
+ chrgenes.append(gene)
+ genesPerChr[chr] = chrgenes
+ annotread = annotread + 1
+fh.close()
+print("Annotation read for {} genes".format(annotread))
+
+# create batches
+for chr in genesPerChr.keys():
+ bctr = 1
+ chrgenes = genesPerChr.get(chr)
+ gctr = 0
+ bgctr = 0
+ batchname = "chr"+str(chr)+"-batch-"+str(bctr)
+ # write job script for first batch
+ batchfile = abspath+"/batches/"+batchname+".txt"
+ print("Writing batch: "+batchfile)
+ jobfile = abspath+"/jobs/"+batchname+".sh"
+ outprefix = abspath+"/output/"+batchname
+ logprefix = abspath+"/logs/"+batchname
+ print()
+ print("Writing job: "+template+"\n"+batchfile+"\n"+jobfile+"\n"+outprefix+"\n"+str(chr))
+ # exp, gte, gt, template, batchfile, jobfile, outprefix, chr
+ chrgenotype = genotype.replace("CHR",str(chr))
+ writeJob(expfile, gte, chrgenotype, template, batchfile, jobfile, outprefix, logprefix, chr, condition, celltype)
+ bgout = open(batchfile,'w')
+ while gctr < len(chrgenes):
+ bgout.write(chrgenes[gctr]+"\n")
+ bgctr = bgctr + 1
+ if bgctr == nrgenes:
+ # start new batch
+ bgout.close()
+ bctr = bctr + 1
+ # write job script for new batch
+ batchname = "chr"+str(chr)+"-batch-"+str(bctr)
+ batchfile = abspath+"/batches/"+batchname+".txt"
+ print("Writing batch: "+batchfile)
+ jobfile = abspath+"/jobs/"+batchname+".sh"
+ outprefix = abspath+"/output/"+batchname
+ logprefix = abspath+"/logs/"+batchname
+# writeJob(template, batchfile, jobfile, outprefix, chr)
+ writeJob(expfile, gte, chrgenotype, template, batchfile, jobfile, outprefix, logprefix, chr, condition, celltype)
+
+ bgout = open(batchfile,'w')
+ bgctr = 0
+ gctr = gctr + 1
+ # if there are any genes left, close batch
+ if bgctr > 0:
+ bgout.close()
+
diff --git a/04_coeqtl_mapping/betaqtl_scripts/createBatches.sh b/04_coeqtl_mapping/betaqtl_scripts/createBatches.sh
new file mode 100644
index 0000000..ce5ca12
--- /dev/null
+++ b/04_coeqtl_mapping/betaqtl_scripts/createBatches.sh
@@ -0,0 +1,58 @@
+condition=$1
+celltype=$2
+workdir="/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/"
+coexpressionfile=${workdir}/"input/individual_networks/${condition}/${condition}_${celltype}.onemillionv23stemiv2ng.zscores.tsv.gz"
+gtefile=${workdir}/"input/summary/gte-fix.tsv"
+gtfile=${workdir}/"output/genotypevcfs/chrCHR/GenotypeData.bgz.vcf.gz"
+batchsize=100000
+
+genelist=${workdir}/"output/${condition}_${celltype}/genelist.noduplicated.txt"
+geneannotation=${workdir}/"input/summary/${condition}_${celltype}.genepairs.annotation.gene1position.noduplicated.tsv"
+jobtemplatefile=${workdir}/"output/betaqtl_scripts/jobtemplate.noduplicated.sh"
+outputfile=${workdir}/"output/${condition}_${celltype}/noduplicated/"
+mkdir -p ${outputfile}
+python createBatches.py \
+ ${coexpressionfile} \
+ ${gtefile} \
+ ${gtfile} \
+ ${genelist} \
+ ${geneannotation} \
+ ${jobtemplatefile} \
+ ${batchsize} \
+ ${outputfile} \
+ ${condition} \
+ ${celltype}
+
+genelist=${workdir}/"output/${condition}_${celltype}/genelist.duplicatedversion1.txt"
+geneannotation=${workdir}/"input/summary/${condition}_${celltype}.genepairs.annotation.gene1position.duplicatedversion1.tsv"
+jobtemplatefile=${workdir}/"output/betaqtl_scripts/jobtemplate.duplicatedversion1.sh"
+outputfile=${workdir}/"output/${condition}_${celltype}/duplicatedversion1/"
+mkdir -p ${outputfile}
+python createBatches.py \
+ ${coexpressionfile} \
+ ${gtefile} \
+ ${gtfile} \
+ ${genelist} \
+ ${geneannotation} \
+ ${jobtemplatefile} \
+ ${batchsize} \
+ ${outputfile} \
+ ${condition} \
+ ${celltype}
+
+genelist=${workdir}/"output/${condition}_${celltype}/genelist.duplicatedversion2.txt"
+geneannotation=${workdir}/"input/summary/${condition}_${celltype}.genepairs.annotation.gene1position.duplicatedversion2.tsv"
+jobtemplatefile=${workdir}/"output/betaqtl_scripts/jobtemplate.duplicatedversion2.sh"
+outputfile=${workdir}/"output/${condition}_${celltype}/duplicatedversion2"
+mkdir -p ${outputfile}
+python createBatches.py \
+ ${coexpressionfile} \
+ ${gtefile} \
+ ${gtfile} \
+ ${genelist} \
+ ${geneannotation} \
+ ${jobtemplatefile} \
+ ${batchsize} \
+ ${outputfile} \
+ ${condition} \
+ ${celltype}
diff --git a/04_coeqtl_mapping/betaqtl_scripts/jobtemplate.duplicatedversion1.sh b/04_coeqtl_mapping/betaqtl_scripts/jobtemplate.duplicatedversion1.sh
new file mode 100644
index 0000000..cd8864f
--- /dev/null
+++ b/04_coeqtl_mapping/betaqtl_scripts/jobtemplate.duplicatedversion1.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --ntasks=1
+#SBATCH --time=1:00:00
+#SBATCH --mem=24g
+#SBATCH --cpus-per-task=11
+#SBATCH -o LOGPREFIX.log
+#SBATCH -e LOGPREFIX.err
+
+set -e
+set -u
+
+
+
+
+ml Java/11-LTS
+# ml Java/11.0.2
+
+# CHROM, BATCHFILE, OUTPREFIX
+# EXP, GTE, GENOTYPE
+# CONDITION CELLTYPE
+threads=11
+java -Xmx17g \
+ -Djava.util.concurrent.ForkJoinPool.common.parallelism=$threads \
+ -Dmaximum.threads=$threads -Dthread.pool.size=$threads \
+ -jar /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/BetaQTL-1.0-SNAPSHOT-jar-with-dependencies.jar \
+ -m betaqtl \
+ --maf 0.1\
+ -a /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/summary/CONDITION_CELLTYPE.genepairs.annotation.gene1position.duplicatedversion1.tsv \
+ -e /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/individual_networks/CONDITION/CONDITION_CELLTYPE.onemillionv23stemiv2ng.zscores.tsv.gz \
+ -sgl /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/snp_genepair_selection/CONDITION_CELLTYPE.baseline.duplicatedversion1.tsv \
+ -gl BATCHFILE \
+ -g /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/summary/gte-fix.tsv \
+ -v GENOTYPE \
+ --chr CHROM \
+ -o OUTPREFIX \
+ --perm 100 \
+ --outputall \
+ --snplog \
+ --outputallpermutations
diff --git a/04_coeqtl_mapping/betaqtl_scripts/jobtemplate.duplicatedversion2.sh b/04_coeqtl_mapping/betaqtl_scripts/jobtemplate.duplicatedversion2.sh
new file mode 100644
index 0000000..9792bc9
--- /dev/null
+++ b/04_coeqtl_mapping/betaqtl_scripts/jobtemplate.duplicatedversion2.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --ntasks=1
+#SBATCH --time=1:00:00
+#SBATCH --mem=24g
+#SBATCH --cpus-per-task=11
+#SBATCH -o LOGPREFIX.log
+#SBATCH -e LOGPREFIX.err
+
+set -e
+set -u
+
+
+
+
+ml Java/11-LTS
+# ml Java/11.0.2
+
+# CHROM, BATCHFILE, OUTPREFIX
+# EXP, GTE, GENOTYPE
+# CONDITION CELLTYPE
+threads=11
+java -Xmx17g \
+ -Djava.util.concurrent.ForkJoinPool.common.parallelism=$threads \
+ -Dmaximum.threads=$threads -Dthread.pool.size=$threads \
+ -jar /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/BetaQTL-1.0-SNAPSHOT-jar-with-dependencies.jar \
+ -m betaqtl \
+ --maf 0.1\
+ -a /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/summary/CONDITION_CELLTYPE.genepairs.annotation.gene1position.duplicatedversion2.tsv \
+ -e /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/individual_networks/CONDITION/CONDITION_CELLTYPE.onemillionv23stemiv2ng.zscores.tsv.gz \
+ -sgl /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/snp_genepair_selection/CONDITION_CELLTYPE.baseline.duplicatedversion2.tsv \
+ -gl BATCHFILE \
+ -g /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/summary/gte-fix.tsv \
+ -v GENOTYPE \
+ --chr CHROM \
+ -o OUTPREFIX \
+ --perm 100 \
+ --outputall \
+ --snplog \
+ --outputallpermutations
diff --git a/04_coeqtl_mapping/betaqtl_scripts/jobtemplate.noduplicated.sh b/04_coeqtl_mapping/betaqtl_scripts/jobtemplate.noduplicated.sh
new file mode 100644
index 0000000..6373cd7
--- /dev/null
+++ b/04_coeqtl_mapping/betaqtl_scripts/jobtemplate.noduplicated.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#SBATCH --ntasks=1
+#SBATCH --time=1:00:00
+#SBATCH --mem=24g
+#SBATCH --cpus-per-task=11
+#SBATCH -o LOGPREFIX.log
+#SBATCH -e LOGPREFIX.err
+
+# SLURM job template: one BetaQTL run on the "noduplicated" annotation and
+# SNP/gene-pair selection files.
+# Placeholder tokens appearing in this file:
+#   LOGPREFIX, CHROM, BATCHFILE, OUTPREFIX, GENOTYPE, CONDITION, CELLTYPE
+
+# Exit on any command failure (-e) and on use of an unset variable (-u).
+set -eu
+
+ml Java/11-LTS
+
+# Thread count handed to the JVM; equals --cpus-per-task above.
+n_threads=11
+
+java -Xmx17g \
+    -Djava.util.concurrent.ForkJoinPool.common.parallelism=${n_threads} \
+    -Dmaximum.threads=${n_threads} \
+    -Dthread.pool.size=${n_threads} \
+    -jar /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/BetaQTL-1.0-SNAPSHOT-jar-with-dependencies.jar \
+    -m betaqtl \
+    --maf 0.1 \
+    -a /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/summary/CONDITION_CELLTYPE.genepairs.annotation.gene1position.noduplicated.tsv \
+    -e /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/individual_networks/CONDITION/CONDITION_CELLTYPE.onemillionv23stemiv2ng.zscores.tsv.gz \
+    -sgl /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/snp_genepair_selection/CONDITION_CELLTYPE.baseline.noduplicated.tsv \
+    -gl BATCHFILE \
+    -g /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/summary/gte-fix.tsv \
+    -v GENOTYPE \
+    --chr CHROM \
+    -o OUTPREFIX \
+    --perm 100 \
+    --outputall \
+    --snplog \
+    --outputallpermutations
diff --git a/04_coeqtl_mapping/calculate_rb_for_sc_and_bios.R b/04_coeqtl_mapping/calculate_rb_for_sc_and_bios.R
new file mode 100644
index 0000000..40415e6
--- /dev/null
+++ b/04_coeqtl_mapping/calculate_rb_for_sc_and_bios.R
@@ -0,0 +1,89 @@
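+# Title : Rb summaries for BIOS replication, coeQTLs and eQTLs
+# Objective : Run calcu_cor_true (from Rb.R) on each discovery/replication table and write one summary CSV per analysis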
+# Created by: Shuang
+# Created on: 1/19/2022
+
+source("Rb.R")
+library(glue)
+library(data.table)
+
+calculate_rb_bios_replication_summary <- function(biostype, filtertype){
+    # For each major cell type, read its BIOS replication_parameters.csv, pass the
+    # beta / standard-error / theta columns to calcu_cor_true, and write all
+    # per-cell-type results to replication_summary.csv in the same directory tree.
+    #
+    # biostype, filtertype: path components under ./coeqtl_mapping/bios/.
+    #
+    # NOTE: the names `res`, `resdf` and `celltype` are kept on purpose; cbind/rbind
+    # may derive column/row labels from the argument names.
+    print(biostype)
+    print(filtertype)
+    bios_dir <- glue('./coeqtl_mapping/bios/{biostype}/{filtertype}')
+    major_celltypes <- c('CD4T', 'CD8T', 'monocyte', 'NK', 'B', 'DC')
+    resdf <- NULL
+    for ( celltype in major_celltypes ){
+        parameters_path <- glue('{bios_dir}/UT_{celltype}/replication_parameters.csv')
+        df <- read.csv(parameters_path)
+        res <- calcu_cor_true(df$flipped_bios_beta, df$std.err_bios, df$MetaBeta, df$MetaSE, df$theta)
+        res <- cbind(res, celltype)
+        # Newest result goes on top, so the final table is in reverse loop order.
+        resdf <- rbind(res, resdf)
+    }
+    summary_path <- glue('{bios_dir}/replication_summary.csv')
+    write.csv(resdf, summary_path)
+}
+
+# BIOS replication
+# Runs unconditionally: the first two trailing command-line arguments are used as
+# biostype and filtertype.
+args = commandArgs(trailingOnly=TRUE)
+calculate_rb_bios_replication_summary(args[1], args[2])
+
+
+# coeQTLs
+# For every ordered pair of distinct major cell types, read the discovery/replication
+# table and collect the calcu_cor_true result into one summary CSV.
+filtertype = 'filtered_results'
+workdir = './coeqtl_mapping/'
+resdf <- c()
+for ( celltype_discovery in c('CD4T', 'CD8T', 'monocyte', 'NK', 'B', 'DC') ){
+ for ( celltype_replication in c('CD4T', 'CD8T', 'monocyte', 'NK', 'B', 'DC') ){
+ if ( celltype_discovery != celltype_replication ){
+ df <- fread(glue('{workdir}/output/{filtertype}/rb_calculations/discovery_{celltype_discovery}_replication_{celltype_replication}.tsv.gz'))
+ print(c(celltype_discovery, celltype_replication, nrow(df)))
+ # Tables with fewer than 5 rows are not passed to calcu_cor_true; a placeholder
+ # row (NA, NA, 0, discovery, replication) is recorded instead.
+ if ( nrow(df) < 5 ){
+ # Placeholder rows are appended at the bottom ...
+ resdf <- rbind(resdf, c(NA, NA, 0, celltype_discovery, celltype_replication))
+ }else{
+ res <- calcu_cor_true(df$MetaBeta, df$MetaSE, df$MetaBeta_replication, df$MetaSE_replication, df$theta)
+ res <- cbind(res, celltype_discovery, celltype_replication)
+ # ... while computed rows are prepended at the top, so the summary is not in loop order.
+ resdf <- rbind(res, resdf)
+ }
+ }
+ }
+}
+write.csv(resdf, glue('{workdir}/output/{filtertype}/rb_calculations/summary.csv'))
+
+# coeQTLs monocyte sub celltypes
+# Same procedure as the block above, restricted to monocyte / cMono / ncMono and the
+# monocyte_subcelltypes sub-directory.
+filtertype = 'filtered_results'
+workdir = './coeqtl_mapping/'
+resdf <- c()
+for ( celltype_discovery in c('monocyte', 'cMono', 'ncMono') ){
+ for ( celltype_replication in c('monocyte', 'cMono', 'ncMono') ){
+ if ( celltype_discovery != celltype_replication ){
+ df <- fread(glue('{workdir}/output/{filtertype}/rb_calculations/monocyte_subcelltypes/discovery_{celltype_discovery}_replication_{celltype_replication}.tsv.gz'))
+ print(c(celltype_discovery, celltype_replication, nrow(df)))
+ # Fewer than 5 rows: record a placeholder row instead of calling calcu_cor_true.
+ if ( nrow(df) < 5 ){
+ resdf <- rbind(resdf, c(NA, NA, 0, celltype_discovery, celltype_replication))
+ }else{
+ res <- calcu_cor_true(df$MetaBeta, df$MetaSE, df$MetaBeta_replication, df$MetaSE_replication, df$theta)
+ res <- cbind(res, celltype_discovery, celltype_replication)
+ resdf <- rbind(res, resdf)
+ }
+ }
+ }
+}
+write.csv(resdf, glue('{workdir}/output/{filtertype}/rb_calculations/monocyte_subcelltypes/summary.csv'))
+
+
+# eQTLs
+# Same procedure on the eQTL tables under input/snp_selection; note the input columns
+# are named metabeta / SE here rather than MetaBeta / MetaSE.
+workdir = './coeqtl_mapping/'
+resdf <- c()
+for ( celltype_discovery in c('CD4T', 'CD8T', 'monocyte', 'NK', 'B', 'DC') ){
+ for ( celltype_replication in c('CD4T', 'CD8T', 'monocyte', 'NK', 'B', 'DC') ){
+ if ( celltype_discovery != celltype_replication ){
+ df <- fread(glue('{workdir}/input/snp_selection/rb_calculations/discovery_{celltype_discovery}_replication_{celltype_replication}.tsv.gz'))
+ print(c(celltype_discovery, celltype_replication, nrow(df)))
+ # Fewer than 5 rows: record a placeholder row instead of calling calcu_cor_true.
+ if ( nrow(df) < 5 ){
+ resdf <- rbind(resdf, c(NA, NA, 0, celltype_discovery, celltype_replication))
+ }else{
+ res <- calcu_cor_true(df$metabeta, df$SE, df$metabeta_replication, df$SE_replication, df$theta)
+ res <- cbind(res, celltype_discovery, celltype_replication)
+ resdf <- rbind(res, resdf)
+ }
+ }
+ }
+}
+write.csv(resdf, glue('{workdir}/input/snp_selection/rb_calculations/summary.csv'))
\ No newline at end of file
diff --git a/04_coeqtl_mapping/cell-type_specific_eQTLmapping/template_config.xml b/04_coeqtl_mapping/cell-type_specific_eQTLmapping/template_config.xml
new file mode 100644
index 0000000..6684866
--- /dev/null
+++ b/04_coeqtl_mapping/cell-type_specific_eQTLmapping/template_config.xml
@@ -0,0 +1,94 @@
+
+
+
+
+
+
+
+ 0.95
+ 0.0001
+ 0.1
+
+
+
+ cis
+ 100000
+ nonparametric
+
+ 10
+ false
+ false
+
+
+
+ fdr
+ 0.05
+ probe-level
+ 10
+
+
+
+
+
+
+ /path/to/SNPconfinement/file.tsv
+ false
+ false
+ false
+
+
+
+
+
+
+ van_der_Wijst
+ /path/to/van_der_Wijst/genome/trityper/
+ /path/to/van_der_Wijst/cell_type_specific_donor_aggregated_matrix/expression.tsv
+ /path_to_snp_annotation_file/singleCell-annotation-stripped.tsv
+ false
+ false
+
+
+ van_Blockland_v2
+ /path/to/van_Blockland_v2/genome/trityper/
+ /path/to/van_Blockland_v2/cell_type_specific_donor_aggregated_matrix/expression.tsv
+ /path_to_snp_annotation_file/singleCell-annotation-stripped.tsv
+ false
+ false
+
+
+ van_Blockland_v3
+ /path/to/van_Blockland_v3/genome/trityper/
+ /path/to/van_Blockland_v3/cell_type_specific_donor_aggregated_matrix/expression.tsv
+ /path_to_snp_annotation_file/singleCell-annotation-stripped.tsv
+ false
+ false
+
+
+ Oelen_v2
+ /path/to/Oelen_v2/genome/trityper/
+ /path/to/Oelen_v2/cell_type_specific_donor_aggregated_matrix/expression.tsv
+ /path_to_snp_annotation_file/singleCell-annotation-stripped.tsv
+ false
+ false
+
+
+ Oelen_v3
+ /path/to/Oelen_v3/genome/trityper/
+ /path/to/Oelen_v3/cell_type_specific_donor_aggregated_matrix/expression.tsv
+ /path_to_snp_annotation_file/singleCell-annotation-stripped.tsv
+ false
+ false
+
+
+
+
diff --git a/04_coeqtl_mapping/cell-type_specific_eQTLmapping/template_job_file.sh b/04_coeqtl_mapping/cell-type_specific_eQTLmapping/template_job_file.sh
new file mode 100644
index 0000000..3c99f30
--- /dev/null
+++ b/04_coeqtl_mapping/cell-type_specific_eQTLmapping/template_job_file.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+#SBATCH --job-name=B_1m_v2
+#SBATCH --output=/groups/umcg-bios/tmp01/projects/1M_cells_scRNAseq/ongoing/GRN_reconstruction/EMP_mapping/B/err/B_1m_v2.out
+#SBATCH --error=/groups/umcg-bios/tmp01/projects/1M_cells_scRNAseq/ongoing/GRN_reconstruction/EMP_mapping/B/err/B_1m_v2.err
+#SBATCH --time=05:59:00
+#SBATCH --cpus-per-task=10
+#SBATCH --mem=64gb
+#SBATCH --nodes=1
+#SBATCH --open-mode=append
+#SBATCH --export=NONE
+#SBATCH --get-user-env=L
+
+# Example job: run eqtl-mapping-pipeline in metaqtl mode with one settings XML.
+# Abort as soon as any command fails.
+set -e
+ml Java/1.8.0_144
+
+pipeline_jar=/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/cis_eqtl_single_cell/EMP_mapping_30_11_2021/eqtl-mapping-pipeline-1.4.9a-SNAPSHOT/eqtl-mapping-pipeline.jar
+settings_xml=/groups/umcg-bios/tmp01/projects/1M_cells_scRNAseq/ongoing/GRN_reconstruction/EMP_mapping/B/config/1m_v2.xml
+
+java -jar -Xmx40g -Xms20g -XX:StringTableSize=10000019 -XX:MaxPermSize=512m \
+    "$pipeline_jar" \
+    --mode metaqtl \
+    --settings "$settings_xml"
diff --git a/04_coeqtl_mapping/collect_nonzeroratio.py b/04_coeqtl_mapping/collect_nonzeroratio.py
new file mode 100644
index 0000000..b86dc08
--- /dev/null
+++ b/04_coeqtl_mapping/collect_nonzeroratio.py
@@ -0,0 +1,95 @@
+from pathlib import Path
+import numpy as np
+import scanpy as sc
+import re
+import pandas as pd
+
+
+prefix = Path('./seurat_objects')
+# Dataset name -> h5ad file under `prefix`; insertion order matches the original listing.
+data_path_dic = {
+    dataset: prefix / filename
+    for dataset, filename in (
+        ('onemillionv2', '1M_v2_mediumQC_ctd_rnanormed_demuxids_20201029.sct.h5ad'),
+        ('stemiv2', 'cardio.integrated.20210301.stemiv2.h5ad'),
+        ('onemillionv3', "1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.SCT.h5ad"),
+        ('ng', 'pilot3_seurat3_200420_sct_azimuth.h5ad'),
+    )
+}
+
+
+def get_time(x):
+    """Reduce a timepoint/stimulation label to its time component.
+
+    'UT' is returned unchanged; any other label yields its first ``<digits>h``
+    substring. Raises IndexError when the label contains no such substring.
+    """
+    if x != 'UT':
+        hour_tokens = re.findall(r'\d+h', x)
+        return hour_tokens[0]
+    return x
+
+
+def count_nonzeroratio(data_sc):
+    """Return, per column of ``data_sc.X``, the fraction of rows with a non-zero value.
+
+    The count is taken directly on the matrix instead of first copying it into a
+    dense DataFrame, so sparse input stays sparse. Both sparse matrices (anything
+    exposing ``toarray``) and plain dense arrays are accepted.
+
+    Returns a 1-D float array with one entry per column. When ``X`` has no rows
+    the ratio is undefined and every entry is NaN.
+    """
+    matrix = data_sc.X
+    n_rows, n_cols = matrix.shape
+    if n_rows == 0:
+        # Same values the 0/0 division used to give, without the runtime warning.
+        return np.full(n_cols, np.nan)
+    if hasattr(matrix, 'toarray'):
+        # `!= 0` also ignores explicitly stored zeros, matching np.count_nonzero.
+        nonzero_per_col = np.asarray((matrix != 0).sum(axis=0)).ravel()
+    else:
+        nonzero_per_col = np.count_nonzero(np.asarray(matrix), axis=0)
+    return nonzero_per_col / n_rows
+
+
+def load_onemillion(data_name, data_sc):
+    """Build a gene x (condition, cell type) table of non-zero ratios.
+
+    Adds a 'time' column to ``data_sc.obs`` (derived from 'timepoint' via
+    get_time), then fills one column named ``{data_name}_{condition}_{celltype}``
+    for every combination of 'time' and 'cell_type_lowerres' values.
+    """
+    ratio_table = pd.DataFrame(index=data_sc.var.index.values)
+    data_sc.obs['time'] = [get_time(label) for label in data_sc.obs['timepoint']]
+    conditions = data_sc.obs['time'].unique()
+    celltypes = data_sc.obs['cell_type_lowerres'].unique()
+    for condition in conditions:
+        in_condition = data_sc.obs['time'] == condition
+        for celltype in celltypes:
+            print(condition, celltype)
+            in_celltype = data_sc.obs['cell_type_lowerres'] == celltype
+            column = f'{data_name}_{condition}_{celltype}'
+            ratio_table[column] = count_nonzeroratio(data_sc[in_condition & in_celltype])
+    return ratio_table
+
+
+def load_ng(data_sc):
+    """Build a gene x cell-type table of non-zero ratios for the 'ng' dataset.
+
+    Adds 'cell_type_mapped_to_onemillion' to ``data_sc.obs`` by translating the
+    'predicted.celltype.l1' labels (labels missing from the mapping become None),
+    then fills one ``ng_{celltype}`` column per mapped value.
+    """
+    label_map = {'CD4 T': 'CD4T', 'CD8 T': 'CD8T', 'Mono': 'monocyte', 'DC': 'DC', 'NK': 'NK',
+                 'other T': 'otherT', 'other': 'other', 'B': 'B'}
+    ratio_table = pd.DataFrame(index=data_sc.var.index.values)
+    data_sc.obs['cell_type_mapped_to_onemillion'] = [
+        label_map.get(label) for label in data_sc.obs['predicted.celltype.l1']
+    ]
+    mapped_celltypes = data_sc.obs['cell_type_mapped_to_onemillion'].unique()
+    for celltype in mapped_celltypes:
+        print(celltype)
+        in_celltype = data_sc.obs['cell_type_mapped_to_onemillion'] == celltype
+        ratio_table[f'ng_{celltype}'] = count_nonzeroratio(data_sc[in_celltype])
+    return ratio_table
+
+
+def load_stemi(dataname, data_sc):
+    """Build a gene x (condition, cell type) table of non-zero ratios.
+
+    Conditions come from ``obs['timepoint.final']`` and cell types from
+    ``obs['cell_type_lowerres']``; each combination fills the column
+    ``{dataname}_{condition}_{celltype}``.
+    """
+    ratio_table = pd.DataFrame(index=data_sc.var.index.values)
+    conditions = data_sc.obs['timepoint.final'].unique()
+    celltypes = data_sc.obs['cell_type_lowerres'].unique()
+    for condition in conditions:
+        in_condition = data_sc.obs['timepoint.final'] == condition
+        for celltype in celltypes:
+            print(condition, celltype)
+            in_celltype = data_sc.obs['cell_type_lowerres'] == celltype
+            column = f'{dataname}_{condition}_{celltype}'
+            ratio_table[column] = count_nonzeroratio(data_sc[in_condition & in_celltype])
+    return ratio_table
+
+
+def get_expressed_ratio(datasetname):
+    """Read the h5ad file registered for ``datasetname`` and return its non-zero-ratio table.
+
+    Names starting with 'onemillion' go to load_onemillion, names starting with
+    'stemi' go to load_stemi, and every other name is handled by load_ng.
+    """
+    data_sc = sc.read_h5ad(data_path_dic[datasetname])
+    if datasetname.startswith('onemillion'):
+        return load_onemillion(datasetname, data_sc)
+    if datasetname.startswith('stemi'):
+        return load_stemi(datasetname, data_sc)
+    return load_ng(data_sc)
+
+
+def calculate_genes_withnonzeroratio(datasetname, savepath):
+    """Compute the non-zero-ratio table for ``datasetname``, save it as TSV, and return it."""
+    print("Processing ", datasetname)
+    ratio_table = get_expressed_ratio(datasetname)
+    ratio_table.to_csv(savepath, sep='\t')
+    return ratio_table
+
+
+# Driver: write one <dataset>.genes_nonzeroratio.tsv per dataset into the annotations directory.
+work_dir = Path('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/')
+nonzero_savepath = work_dir/'coeqtl_mapping/input/gene_pair_selection/annotations/'
+for datasetname in ('stemiv2', 'ng', 'onemillionv2', 'onemillionv3'):
+    print('Processing ', datasetname)
+    savepath = nonzero_savepath/f'{datasetname}.genes_nonzeroratio.tsv'
+    var_df = calculate_genes_withnonzeroratio(datasetname, savepath)
+
+
+
+
diff --git a/04_coeqtl_mapping/concat_all6majorcelltypes_coeqtls.py b/04_coeqtl_mapping/concat_all6majorcelltypes_coeqtls.py
new file mode 100644
index 0000000..a2de1f7
--- /dev/null
+++ b/04_coeqtl_mapping/concat_all6majorcelltypes_coeqtls.py
@@ -0,0 +1,38 @@
+import pandas as pd
+from pathlib import Path
+
+def find_eqtlsnp_gene(snp_genepair, eqtl_snp_gene_set):
+    """Pick which gene of a ``<snp>_<gene1>;<gene2>`` identifier is the eQTL gene.
+
+    Returns gene1 when ``<snp>_<gene1>`` is in ``eqtl_snp_gene_set``; otherwise
+    gene2 is returned without being checked against the set.
+    """
+    fields = snp_genepair.split('_')
+    snp, genepair = fields[0], fields[1]
+    gene1, gene2 = genepair.split(';')
+    return gene1 if f'{snp}_{gene1}' in eqtl_snp_gene_set else gene2
+
+
+workdir = Path("/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output")
+eqtl_prefix = Path("/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/snp_selection/eqtl")
+major_celltypes = ['CD4T', 'CD8T', 'monocyte', 'B', 'DC', 'NK']
+
+# The eQTL SNP-gene sets depend only on the cell type, so load each of them once
+# and reuse them for both the unfiltered and the filtered workbook.
+eqtl_sets_by_celltype = {}
+for celltype in major_celltypes:
+    eqtls_path = eqtl_prefix/f'UT_{celltype}_eQTLProbesFDR0.05-ProbeLevel.tsv'
+    eqtl_df = pd.read_csv(eqtls_path, sep='\t')
+    eqtl_df['snp_gene'] = ['_'.join(item) for item in eqtl_df[['SNPName', 'genename']].values]
+    eqtl_sets_by_celltype[celltype] = set(eqtl_df['snp_gene'])
+
+# One workbook per result type, one sheet per cell type.
+for result_type in ('unfiltered', 'filtered'):
+    workbook_path = workdir/f'summary/coeQTLs_6majorcelltypes.{result_type}.xlsx'
+    # The context manager writes and closes the workbook even if a sheet fails;
+    # ExcelWriter.save() no longer exists in pandas >= 2.0.
+    with pd.ExcelWriter(workbook_path, engine='xlsxwriter') as writer:
+        for celltype in major_celltypes:
+            eqtl_snp_gene_set = eqtl_sets_by_celltype[celltype]
+            df = pd.read_csv(workdir/f'{result_type}_results/UT_{celltype}/coeqtls_fullresults.sig.tsv.gz',
+                             sep='\t', compression='gzip')
+            df['eqtlgene'] = [find_eqtlsnp_gene(item, eqtl_snp_gene_set) for item in df['snp_genepair']]
+            # Progress line: cell type, number of coeQTL rows, number of distinct eQTL genes.
+            print(celltype, df.shape[0], len(df['eqtlgene'].unique()))
+            df.to_excel(writer, sheet_name=celltype)
\ No newline at end of file
diff --git a/04_coeqtl_mapping/concat_betaqtl_results.fixed.py b/04_coeqtl_mapping/concat_betaqtl_results.fixed.py
new file mode 100644
index 0000000..7d698f7
--- /dev/null
+++ b/04_coeqtl_mapping/concat_betaqtl_results.fixed.py
@@ -0,0 +1,65 @@
+import pandas as pd
+from pathlib import Path
+import os
+import argparse
+from tqdm import tqdm
+from statsmodels.stats.multitest import multipletests
+
+
+def _load_topeffects(output_dir, annotation_path):
+    """Read every ``*-TopEffects.txt`` file in ``output_dir`` into one DataFrame.
+
+    Adds a 'chr_pos' column (looked up per 'Gene' value in the annotation table,
+    None when absent) and a 'snp_genepair' column (``<SNP>_<Gene>``).
+    Raises FileNotFoundError when the directory holds no matching file.
+    """
+    annotation_df = pd.read_csv(annotation_path, sep='\t')
+    annotation_df['chr_pos'] = ['_'.join([str(ele) for ele in item]) for item in
+                                annotation_df[['Chr', 'ChrStart', 'ChrEnd']].values]
+    chr_pos_by_genepair = annotation_df.set_index('ArrayAddress')['chr_pos'].to_dict()
+    frames = []
+    # Sorted so the concatenation order does not depend on the filesystem listing order.
+    for filename in tqdm(sorted(os.listdir(output_dir))):
+        if filename.endswith("-TopEffects.txt"):
+            df = pd.read_csv(output_dir/filename, sep='\t')
+            df['chr_pos'] = [chr_pos_by_genepair.get(gene) for gene in df['Gene']]
+            frames.append(df)
+    if not frames:
+        raise FileNotFoundError(f'No *-TopEffects.txt files found in {output_dir}')
+    # Single concat instead of growing a DataFrame inside the loop.
+    merged = pd.concat(frames, axis=0)
+    merged['snp_genepair'] = ['_'.join(item) for item in merged[['SNP', 'Gene']].values]
+    return merged
+
+
+def concat_results(prefix, savepath, annotation_prefix=None):
+    """Merge the BetaQTL top effects of all three mapping runs and add FDR-adjusted p-values.
+
+    Parameters
+    ----------
+    prefix : pathlib.Path
+        Directory holding ``noduplicated/output``, ``duplicatedversion1/output``
+        and ``duplicatedversion2/output``.
+    savepath : str or path-like
+        Destination of the tab-separated result table.
+    annotation_prefix : str, optional
+        Prefix of the ``.genepairs.annotation.gene1position.<version>.tsv`` files.
+        When omitted, the module-level ``args.annotation_prefix`` is used, which
+        is what this function read before the parameter existed.
+
+    Returns
+    -------
+    pandas.DataFrame
+        All rows sorted by 'GeneChr' and 'GenePos', indexed by 'snp_genepair',
+        with a 'multipletestP' column (Benjamini-Hochberg on 'BetaAdjustedMetaP').
+    """
+    if annotation_prefix is None:
+        annotation_prefix = args.annotation_prefix
+    version_frames = [
+        _load_topeffects(
+            prefix/version/'output',
+            f'{annotation_prefix}.genepairs.annotation.gene1position.{version}.tsv',
+        )
+        for version in ('noduplicated', 'duplicatedversion1', 'duplicatedversion2')
+    ]
+    concated_versions = pd.concat(version_frames, axis=0)
+    concated_versions = concated_versions.sort_values(by=['GeneChr', 'GenePos'])
+    concated_versions = concated_versions.set_index('snp_genepair')
+    # add multiple test significance
+    concated_versions['multipletestP'] = multipletests(concated_versions['BetaAdjustedMetaP'],
+                                                       alpha=0.05, method='fdr_bh',
+                                                       is_sorted=False, returnsorted=False)[1]
+    concated_versions.to_csv(savepath, sep='\t')
+    return concated_versions
+
+def argumentsparser():
+    """Build the CLI parser with the string options --prefix, --savepath and --annotation_prefix."""
+    parser = argparse.ArgumentParser()
+    for option in ('prefix', 'savepath', 'annotation_prefix'):
+        parser.add_argument(f'--{option}', type=str, dest=option)
+    return parser
+
+# Script entry point: parse the CLI options and merge the BetaQTL results.
+if __name__ == '__main__':
+ # Keep the name `args` at module level: concat_results() can read
+ # args.annotation_prefix from this global rather than from a parameter.
+ args = argumentsparser().parse_args()
+ concat_results(Path(args.prefix), args.savepath)
\ No newline at end of file
diff --git a/04_coeqtl_mapping/examine_bios_replication.ipynb b/04_coeqtl_mapping/examine_bios_replication.ipynb
new file mode 100644
index 0000000..8c5b906
--- /dev/null
+++ b/04_coeqtl_mapping/examine_bios_replication.ipynb
@@ -0,0 +1,1013 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "from scipy.stats import spearmanr\n",
+ "from pathlib import Path\n",
+ "from scipy.stats import t, norm\n",
+ "import seaborn as sns\n",
+ "%matplotlib inline\n",
+ "\n",
+ "def flip_zscore(zscore, coeqtlallele, altaf, altallele):\n",
+ "    # Negate the z-score when the frequency of `coeqtlallele` exceeds 0.5.\n",
+ "    # That frequency is `altaf` if `coeqtlallele` is the alt allele, else 1 - altaf.\n",
+ "    # Null z-scores come back as np.nan.\n",
+ "    if pd.isnull(zscore):\n",
+ "        return np.nan\n",
+ "    coeqtl_allele_freq = altaf if coeqtlallele == altallele else 1 - altaf\n",
+ "    return -zscore if coeqtl_allele_freq > 0.5 else zscore\n",
+ " \n",
+ "def flip_allele(altaf, altallele, refallele):\n",
+ "    # Return the ref allele when the alt-allele frequency exceeds 0.5, else the alt allele.\n",
+ "    return refallele if altaf > 0.5 else altallele"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "coeqtl_withbios_prefix = Path(\n",
+ " \"./coeqtl_mapping/output\"\n",
+ ")\n",
+ "filter_type = 'filtered_results'\n",
+ "\n",
+ "def flip_direction(allele1, allele2, zscore2):\n",
+ "    # Keep zscore2 when both alleles are equal; otherwise return it with the sign reversed.\n",
+ "    return zscore2 if allele1 == allele2 else -1*zscore2\n",
+ "\n",
+ "\n",
+ "def get_z_score(t_statistic, num):\n",
+ "    # Convert a t-statistic with (num - 2) degrees of freedom into the z-score\n",
+ "    # that has the same tail probability.\n",
+ "    # Uses the upper-tail probability of |t| and restores the sign afterwards:\n",
+ "    # going through t.cdf directly rounds to 1.0 for large positive t and then\n",
+ "    # norm.ppf returns inf. Accepts scalars or arrays; NaN stays NaN.\n",
+ "    tail_prob = t.sf(np.abs(t_statistic), num - 2)\n",
+ "    return np.sign(t_statistic) * norm.isf(tail_prob)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import seaborn as sns\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib.patches as mpatches"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "color_dict = {'CD4T': '#2E9D33',\n",
+ " 'CD8T': 'darkgreen',\n",
+ " 'monocyte': '#EDBA1B',\n",
+ " 'NK': '#E64B50',\n",
+ " 'DC': '#965EC8',\n",
+ " 'B': '#009DDB',\n",
+ " 'cMono': 'peru',\n",
+ " 'ncMono': 'y',\n",
+ " 'CD4T_individual_100': '#2E9D33',\n",
+ " 'CD4T_individual_50': '#2E9D33',\n",
+ " 'CD4T_50': '#2E9D33',\n",
+ " 'CD4T_150': '#2E9D33',\n",
+ " 'CD4T_250': '#2E9D33'}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "workdir = Path(\"./coeqtl_mapping/\")\n",
+ "bios_replication_filtered_df = pd.read_csv(\n",
+ " workdir/'bios/onlyRNAAlignMetrics_rmLLD/filtered_results/replication_summary.csv', \n",
+ " index_col=0\n",
+ ").set_index('celltype')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "celltype = 'CD4T'\n",
+ "eqtldf = pd.read_csv(\n",
+ " workdir/f'input/snp_selection/eqtl/UT_{celltype}_eQTLProbesFDR0.05-ProbeLevel_withAF.tsv',\n",
+ " sep='\\t'\n",
+ " )\n",
+ "eqtldf['snp_eqtlgene'] = ['_'.join(item) for item in eqtldf[['SNPName', 'genename']].values]\n",
+ "eqtl_allele_af_df = eqtldf.drop_duplicates(subset=['snp_eqtlgene', 'AlleleAssessed', 'AF'])\n",
+ "eqtl_allele_af_dict = eqtl_allele_af_df.set_index('snp_eqtlgene')[['AlleleAssessed', 'AF', 'alt_allele', 'ref_allele']].T.to_dict()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "biostype = 'onlyRNAAlignMetrics_rmLLD'\n",
+ "celltype = 'CD4T'\n",
+ "filter_type = 'filtered_results'\n",
+ "\n",
+ "coeqtl_df = pd.read_csv(\n",
+ " coeqtl_withbios_prefix/filter_type/f'UT_{celltype}/coeqtls_fullresults_fixed.sig.withbios{biostype}.tsv.gz',\n",
+ " compression='gzip', \n",
+ " index_col=0, \n",
+ " sep='\\t')\n",
+ "coeqtl_df = coeqtl_df.dropna(subset=['t_bios'])\n",
+ "coeqtl_df['zscore_bios'] = [get_z_score(item[0], item[1]) for item in \n",
+ " coeqtl_df[['t_bios', \n",
+ " 'num_individuals_bios']].values]\n",
+ "coeqtl_df['flipped_zscore_bios'] = [flip_direction(item[0], item[1], item[2]) for item in \n",
+ " coeqtl_df[['SNPEffectAllele', \n",
+ " 'assessed_allele_bios',\n",
+ " 'zscore_bios']].values]\n",
+ "\n",
+ "isConcordant = lambda x:True if x[0]*x[1] > 0 else False\n",
+ "coeqtl_df['is_concordant'] = [isConcordant(item) for item in \n",
+ " coeqtl_df[['MetaPZ', 'flipped_zscore_bios']].values]\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " snp_genepair | \n",
+ " Gene | \n",
+ " GeneChr | \n",
+ " GenePos | \n",
+ " GeneStrand | \n",
+ " GeneSymbol | \n",
+ " SNP | \n",
+ " SNPChr | \n",
+ " SNPPos | \n",
+ " SNPAlleles | \n",
+ " ... | \n",
+ " gene1_bios | \n",
+ " gene2_bios | \n",
+ " assessed_allele_bios | \n",
+ " num_individuals_bios | \n",
+ " isinteractionterm_bios | \n",
+ " snp_genepair_bios | \n",
+ " corrected_p_bios | \n",
+ " zscore_bios | \n",
+ " flipped_zscore_bios | \n",
+ " is_concordant | \n",
+ "
\n",
+ " \n",
+ " snp_gene1_gene2 | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " rs7605824_SH3YL1_NPM1 | \n",
+ " rs7605824_NPM1;SH3YL1 | \n",
+ " NPM1;SH3YL1 | \n",
+ " 2 | \n",
+ " 217730 | \n",
+ " NaN | \n",
+ " NPM1;SH3YL1 | \n",
+ " rs7605824 | \n",
+ " 2 | \n",
+ " 280819 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SH3YL1 | \n",
+ " NPM1 | \n",
+ " A | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs7605824_NPM1;SH3YL1 | \n",
+ " 0.000000 | \n",
+ " -3.617874 | \n",
+ " -3.617874 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs7605824_SH3YL1_CD48 | \n",
+ " rs7605824_CD48;SH3YL1 | \n",
+ " CD48;SH3YL1 | \n",
+ " 2 | \n",
+ " 217730 | \n",
+ " NaN | \n",
+ " CD48;SH3YL1 | \n",
+ " rs7605824 | \n",
+ " 2 | \n",
+ " 280819 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SH3YL1 | \n",
+ " CD48 | \n",
+ " A | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs7605824_CD48;SH3YL1 | \n",
+ " 0.784422 | \n",
+ " -0.446946 | \n",
+ " -0.446946 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs7605824_SH3YL1_RPS13 | \n",
+ " rs7605824_RPS13;SH3YL1 | \n",
+ " RPS13;SH3YL1 | \n",
+ " 2 | \n",
+ " 217730 | \n",
+ " NaN | \n",
+ " RPS13;SH3YL1 | \n",
+ " rs7605824 | \n",
+ " 2 | \n",
+ " 280819 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SH3YL1 | \n",
+ " RPS13 | \n",
+ " A | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs7605824_RPS13;SH3YL1 | \n",
+ " 0.000000 | \n",
+ " -3.489377 | \n",
+ " -3.489377 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs7605824_SH3YL1_RPL31 | \n",
+ " rs7605824_RPL31;SH3YL1 | \n",
+ " RPL31;SH3YL1 | \n",
+ " 2 | \n",
+ " 217730 | \n",
+ " NaN | \n",
+ " RPL31;SH3YL1 | \n",
+ " rs7605824 | \n",
+ " 2 | \n",
+ " 280819 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SH3YL1 | \n",
+ " RPL31 | \n",
+ " A | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs7605824_RPL31;SH3YL1 | \n",
+ " 0.349601 | \n",
+ " -1.325633 | \n",
+ " -1.325633 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs7605824_SH3YL1_RPL3 | \n",
+ " rs7605824_RPL3;SH3YL1 | \n",
+ " RPL3;SH3YL1 | \n",
+ " 2 | \n",
+ " 217730 | \n",
+ " NaN | \n",
+ " RPL3;SH3YL1 | \n",
+ " rs7605824 | \n",
+ " 2 | \n",
+ " 280819 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SH3YL1 | \n",
+ " RPL3 | \n",
+ " A | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs7605824_RPL3;SH3YL1 | \n",
+ " 0.000000 | \n",
+ " -3.854851 | \n",
+ " -3.854851 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " rs4147638_SMDT1_ACTB | \n",
+ " rs4147638_ACTB;SMDT1 | \n",
+ " ACTB;SMDT1 | \n",
+ " 22 | \n",
+ " 42475695 | \n",
+ " NaN | \n",
+ " ACTB;SMDT1 | \n",
+ " rs4147638 | \n",
+ " 22 | \n",
+ " 42487900 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SMDT1 | \n",
+ " ACTB | \n",
+ " G | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs4147638_ACTB;SMDT1 | \n",
+ " 0.000000 | \n",
+ " -3.748326 | \n",
+ " 3.748326 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs4147638_SMDT1_RPS25 | \n",
+ " rs4147638_RPS25;SMDT1 | \n",
+ " RPS25;SMDT1 | \n",
+ " 22 | \n",
+ " 42475695 | \n",
+ " NaN | \n",
+ " RPS25;SMDT1 | \n",
+ " rs4147638 | \n",
+ " 22 | \n",
+ " 42487900 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SMDT1 | \n",
+ " RPS25 | \n",
+ " G | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs4147638_RPS25;SMDT1 | \n",
+ " 0.000000 | \n",
+ " 5.773036 | \n",
+ " -5.773036 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs4147638_SMDT1_RPS3A | \n",
+ " rs4147638_RPS3A;SMDT1 | \n",
+ " RPS3A;SMDT1 | \n",
+ " 22 | \n",
+ " 42475695 | \n",
+ " NaN | \n",
+ " RPS3A;SMDT1 | \n",
+ " rs4147638 | \n",
+ " 22 | \n",
+ " 42487900 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SMDT1 | \n",
+ " RPS3A | \n",
+ " G | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs4147638_RPS3A;SMDT1 | \n",
+ " 0.000000 | \n",
+ " 4.434777 | \n",
+ " -4.434777 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs4147638_SMDT1_RPS18 | \n",
+ " rs4147638_RPS18;SMDT1 | \n",
+ " RPS18;SMDT1 | \n",
+ " 22 | \n",
+ " 42475695 | \n",
+ " NaN | \n",
+ " RPS18;SMDT1 | \n",
+ " rs4147638 | \n",
+ " 22 | \n",
+ " 42487900 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SMDT1 | \n",
+ " RPS18 | \n",
+ " G | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs4147638_RPS18;SMDT1 | \n",
+ " 0.000000 | \n",
+ " 7.128733 | \n",
+ " -7.128733 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " rs4147638_SMDT1_RPL11 | \n",
+ " rs4147638_RPL11;SMDT1 | \n",
+ " RPL11;SMDT1 | \n",
+ " 22 | \n",
+ " 42475695 | \n",
+ " NaN | \n",
+ " RPL11;SMDT1 | \n",
+ " rs4147638 | \n",
+ " 22 | \n",
+ " 42487900 | \n",
+ " G/A | \n",
+ " ... | \n",
+ " SMDT1 | \n",
+ " RPL11 | \n",
+ " G | \n",
+ " 2491.0 | \n",
+ " True | \n",
+ " rs4147638_RPL11;SMDT1 | \n",
+ " 0.000000 | \n",
+ " 5.896748 | \n",
+ " -5.896748 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
497 rows × 55 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " snp_genepair Gene GeneChr \\\n",
+ "snp_gene1_gene2 \n",
+ "rs7605824_SH3YL1_NPM1 rs7605824_NPM1;SH3YL1 NPM1;SH3YL1 2 \n",
+ "rs7605824_SH3YL1_CD48 rs7605824_CD48;SH3YL1 CD48;SH3YL1 2 \n",
+ "rs7605824_SH3YL1_RPS13 rs7605824_RPS13;SH3YL1 RPS13;SH3YL1 2 \n",
+ "rs7605824_SH3YL1_RPL31 rs7605824_RPL31;SH3YL1 RPL31;SH3YL1 2 \n",
+ "rs7605824_SH3YL1_RPL3 rs7605824_RPL3;SH3YL1 RPL3;SH3YL1 2 \n",
+ "... ... ... ... \n",
+ "rs4147638_SMDT1_ACTB rs4147638_ACTB;SMDT1 ACTB;SMDT1 22 \n",
+ "rs4147638_SMDT1_RPS25 rs4147638_RPS25;SMDT1 RPS25;SMDT1 22 \n",
+ "rs4147638_SMDT1_RPS3A rs4147638_RPS3A;SMDT1 RPS3A;SMDT1 22 \n",
+ "rs4147638_SMDT1_RPS18 rs4147638_RPS18;SMDT1 RPS18;SMDT1 22 \n",
+ "rs4147638_SMDT1_RPL11 rs4147638_RPL11;SMDT1 RPL11;SMDT1 22 \n",
+ "\n",
+ " GenePos GeneStrand GeneSymbol SNP SNPChr \\\n",
+ "snp_gene1_gene2 \n",
+ "rs7605824_SH3YL1_NPM1 217730 NaN NPM1;SH3YL1 rs7605824 2 \n",
+ "rs7605824_SH3YL1_CD48 217730 NaN CD48;SH3YL1 rs7605824 2 \n",
+ "rs7605824_SH3YL1_RPS13 217730 NaN RPS13;SH3YL1 rs7605824 2 \n",
+ "rs7605824_SH3YL1_RPL31 217730 NaN RPL31;SH3YL1 rs7605824 2 \n",
+ "rs7605824_SH3YL1_RPL3 217730 NaN RPL3;SH3YL1 rs7605824 2 \n",
+ "... ... ... ... ... ... \n",
+ "rs4147638_SMDT1_ACTB 42475695 NaN ACTB;SMDT1 rs4147638 22 \n",
+ "rs4147638_SMDT1_RPS25 42475695 NaN RPS25;SMDT1 rs4147638 22 \n",
+ "rs4147638_SMDT1_RPS3A 42475695 NaN RPS3A;SMDT1 rs4147638 22 \n",
+ "rs4147638_SMDT1_RPS18 42475695 NaN RPS18;SMDT1 rs4147638 22 \n",
+ "rs4147638_SMDT1_RPL11 42475695 NaN RPL11;SMDT1 rs4147638 22 \n",
+ "\n",
+ " SNPPos SNPAlleles ... gene1_bios gene2_bios \\\n",
+ "snp_gene1_gene2 ... \n",
+ "rs7605824_SH3YL1_NPM1 280819 G/A ... SH3YL1 NPM1 \n",
+ "rs7605824_SH3YL1_CD48 280819 G/A ... SH3YL1 CD48 \n",
+ "rs7605824_SH3YL1_RPS13 280819 G/A ... SH3YL1 RPS13 \n",
+ "rs7605824_SH3YL1_RPL31 280819 G/A ... SH3YL1 RPL31 \n",
+ "rs7605824_SH3YL1_RPL3 280819 G/A ... SH3YL1 RPL3 \n",
+ "... ... ... ... ... ... \n",
+ "rs4147638_SMDT1_ACTB 42487900 G/A ... SMDT1 ACTB \n",
+ "rs4147638_SMDT1_RPS25 42487900 G/A ... SMDT1 RPS25 \n",
+ "rs4147638_SMDT1_RPS3A 42487900 G/A ... SMDT1 RPS3A \n",
+ "rs4147638_SMDT1_RPS18 42487900 G/A ... SMDT1 RPS18 \n",
+ "rs4147638_SMDT1_RPL11 42487900 G/A ... SMDT1 RPL11 \n",
+ "\n",
+ " assessed_allele_bios num_individuals_bios \\\n",
+ "snp_gene1_gene2 \n",
+ "rs7605824_SH3YL1_NPM1 A 2491.0 \n",
+ "rs7605824_SH3YL1_CD48 A 2491.0 \n",
+ "rs7605824_SH3YL1_RPS13 A 2491.0 \n",
+ "rs7605824_SH3YL1_RPL31 A 2491.0 \n",
+ "rs7605824_SH3YL1_RPL3 A 2491.0 \n",
+ "... ... ... \n",
+ "rs4147638_SMDT1_ACTB G 2491.0 \n",
+ "rs4147638_SMDT1_RPS25 G 2491.0 \n",
+ "rs4147638_SMDT1_RPS3A G 2491.0 \n",
+ "rs4147638_SMDT1_RPS18 G 2491.0 \n",
+ "rs4147638_SMDT1_RPL11 G 2491.0 \n",
+ "\n",
+ " isinteractionterm_bios snp_genepair_bios \\\n",
+ "snp_gene1_gene2 \n",
+ "rs7605824_SH3YL1_NPM1 True rs7605824_NPM1;SH3YL1 \n",
+ "rs7605824_SH3YL1_CD48 True rs7605824_CD48;SH3YL1 \n",
+ "rs7605824_SH3YL1_RPS13 True rs7605824_RPS13;SH3YL1 \n",
+ "rs7605824_SH3YL1_RPL31 True rs7605824_RPL31;SH3YL1 \n",
+ "rs7605824_SH3YL1_RPL3 True rs7605824_RPL3;SH3YL1 \n",
+ "... ... ... \n",
+ "rs4147638_SMDT1_ACTB True rs4147638_ACTB;SMDT1 \n",
+ "rs4147638_SMDT1_RPS25 True rs4147638_RPS25;SMDT1 \n",
+ "rs4147638_SMDT1_RPS3A True rs4147638_RPS3A;SMDT1 \n",
+ "rs4147638_SMDT1_RPS18 True rs4147638_RPS18;SMDT1 \n",
+ "rs4147638_SMDT1_RPL11 True rs4147638_RPL11;SMDT1 \n",
+ "\n",
+ " corrected_p_bios zscore_bios flipped_zscore_bios \\\n",
+ "snp_gene1_gene2 \n",
+ "rs7605824_SH3YL1_NPM1 0.000000 -3.617874 -3.617874 \n",
+ "rs7605824_SH3YL1_CD48 0.784422 -0.446946 -0.446946 \n",
+ "rs7605824_SH3YL1_RPS13 0.000000 -3.489377 -3.489377 \n",
+ "rs7605824_SH3YL1_RPL31 0.349601 -1.325633 -1.325633 \n",
+ "rs7605824_SH3YL1_RPL3 0.000000 -3.854851 -3.854851 \n",
+ "... ... ... ... \n",
+ "rs4147638_SMDT1_ACTB 0.000000 -3.748326 3.748326 \n",
+ "rs4147638_SMDT1_RPS25 0.000000 5.773036 -5.773036 \n",
+ "rs4147638_SMDT1_RPS3A 0.000000 4.434777 -4.434777 \n",
+ "rs4147638_SMDT1_RPS18 0.000000 7.128733 -7.128733 \n",
+ "rs4147638_SMDT1_RPL11 0.000000 5.896748 -5.896748 \n",
+ "\n",
+ " is_concordant \n",
+ "snp_gene1_gene2 \n",
+ "rs7605824_SH3YL1_NPM1 True \n",
+ "rs7605824_SH3YL1_CD48 True \n",
+ "rs7605824_SH3YL1_RPS13 True \n",
+ "rs7605824_SH3YL1_RPL31 True \n",
+ "rs7605824_SH3YL1_RPL3 True \n",
+ "... ... \n",
+ "rs4147638_SMDT1_ACTB True \n",
+ "rs4147638_SMDT1_RPS25 True \n",
+ "rs4147638_SMDT1_RPS3A True \n",
+ "rs4147638_SMDT1_RPS18 True \n",
+ "rs4147638_SMDT1_RPL11 True \n",
+ "\n",
+ "[497 rows x 55 columns]"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "coeqtl_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Flip z-score direction according to the eQTL allele frequency (AF).\n",
+ "# The annotation dict is indexed directly (not via .get) so that an eQTL missing from\n",
+ "# eqtl_allele_af_dict raises a KeyError naming it instead of an opaque NoneType TypeError.\n",
+ "coeqtl_df['eqtl_effect_allele'] = [eqtl_allele_af_dict[eqtl]['AlleleAssessed']\n",
+ "                                   for eqtl in coeqtl_df['snp_eqtlgene']]\n",
+ "coeqtl_df['eqtl_alt_af'] = [eqtl_allele_af_dict[eqtl]['AF'] for eqtl in coeqtl_df['snp_eqtlgene']]\n",
+ "coeqtl_df['eqtl_alt_allele'] = [eqtl_allele_af_dict[eqtl]['alt_allele']\n",
+ "                                for eqtl in coeqtl_df['snp_eqtlgene']]\n",
+ "coeqtl_df['eqtl_ref_allele'] = [eqtl_allele_af_dict[eqtl]['ref_allele']\n",
+ "                                for eqtl in coeqtl_df['snp_eqtlgene']]\n",
+ "# NOTE(review): flip_zscore is defined outside this cell; presumably it flips the sign of a z-score\n",
+ "# based on the effect allele, the alt-allele AF and the alt allele -- confirm against its definition.\n",
+ "coeqtl_df['MetaPZ_flippedforAF'] = [\n",
+ "    flip_zscore(zscore, coeqtlallele, altaf, altallele)\n",
+ "    for zscore, coeqtlallele, altaf, altallele in\n",
+ "    coeqtl_df[['MetaPZ', 'SNPEffectAllele', 'eqtl_alt_af', 'eqtl_alt_allele']].values]\n",
+ "coeqtl_df['flipped_zscore_bios_flippedforAF'] = [\n",
+ "    flip_zscore(zscore, coeqtlallele, altaf, altallele)\n",
+ "    for zscore, coeqtlallele, altaf, altallele in\n",
+ "    coeqtl_df[['flipped_zscore_bios', 'SNPEffectAllele', 'eqtl_alt_af', 'eqtl_alt_allele']].values]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9637681159420289\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Text(3, -5, 'Concordance = 0.96\\nrb = 0.61')"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAATsAAAEvCAYAAAA6m2ZKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAABYiklEQVR4nO29eXwUVb73/zmdpLvJHkhYAyQgstkJW3JBCUGQJSwhBBS8uA13xgWHuc9vrs44g4PiyB0d/enoDLnzOMuD+qgkkpUlKBiIwY1EWVpiFDGZAAImkK0D6U53n+ePUGUvVdXV3dVbct6vly9Jd1fV6arT3/M935VQSsFgMBj9HVWgB8BgMBj+gAk7BoMxIGDCjsFgDAiYsGMwGAMCJuwYDMaAgAk7BoMxIAgPxEUTExNpSkpKIC7NYDD6MZ9//nkrpTRJ6L2ACLuUlBTU1dUF4tIMBqMfQwj5l9h7bBvLYDAGBEzYMRiMAQETdgwGY0DAhB2DwRgQMGHHYDAGBEzYMRiMAQETdgwGY0AQkDg7BoPhHvvOVuLPdTtwqfsyhkcNw+ZZj2L5+JxADyukYMKOwQhy9p2txDNHt6PH0gMAuNh9Cc8c3Q4ATOC5AdvGMhhBzp/rdvCCjqPH0oM/1+0I0IhCEybsGIwg51L3ZbdeZwjDhB2DEeQMjxrm1usMYZiwYzCCnM2zHoU2TGv3mjZMi82zHg3QiEIT5qBgMIIczgnBvLHewYQdgxECLB+fw4Sbl7BtLIPBGBAwYcdgMAYETNgxGIwBARN2DAZjQMCEHYPBGBAwb+wAgCWR9w/Yc/QOJuz6OSyJvH/AnqP3KLKNJYTEE0J2E0IaCCFfEULmKHFehvewJPL+gdznuO9sJZYWrsC0f2ZgaeEK7Dtb6c9hBjVKaXavADhAKV1LCFEDiFTovAwvYUnk/QM5z5Fpf9J4rdkRQmIBzAPwDwCglJoope3enpehDCyJvH8g5zkyLV4aJbax4wC0APg/hJDjhJC/E0KiFDgvQwFYEnn/QM5zZFq8NEoIu3AAMwD8D6V0OoBuAE84fogQ8iAhpI4QUtfS0qLAZRlyWD4+B1vnbsGIqOEgIBgRNRxb525h25oQQ85zZFq8NIRS6t0JCBkO4FNKacqNv7MAPEEpXS52zKxZs2hdXZ1X12UwGPY42uyAPu1vIC1uhJDPKaWzhN7z2kFBKb1ECDlHCJlIKf0awEIA9d6el8FguAcrBSWNUt7YzQDeuuGJ/Q7ATxQ6L4PRb/FFkDArBSWOIsKOUnoCgKDqyGAwnNl3thJbP9wGMzUD6AsT2frhNgCBCRMZCNkZLDeWwQgAz3/yAi/oOMzUjOc/ecHvY+FsfRe7L4GC8vF5SgUkB0ugMxN2DEYA6DB1ir7ub2Hgy/g8XwtSd2DCjsEIMvwtDHwZnxdMgc5M2DEYASBeEyf6nr+FgS/j84Ip0JkJOwYjAPxq9mOIUEWIvu9PYeDLLJtgCnRmwo7BCADLx+dgW9ZWqIjwT9CfwsCXWTbBlK7I6tkxGAGCEyZCWQ/+Fga+is8LpkBnJuwYjAASTMLAVwRLoDMTdgxGgAkWYdDfYTY7BoMxIGCaHYPhA/p7+lUofj8m7BgMhenv5dF98f38ITzZNpbBUBh3sgYClTfqzXWVzorwV0oZE3YMhsLIzRoIVN6ot9dVOivCXyllTNgxGAojN2sgUHmj3l5X6ayIi92X3HrdU5iwYzAURm7WQKDyRr29rtJZEWJZJAAU1XKZg4LBUBi5gcLDo4YJai/epIrJMfR7e12lA6Gt1Cr6npKOHa8b7ngCa7jDYCjfIEfu+YKtMc/SwhWSW9YRUcNxYN1eWeeSarjDtrEMRgDgNLAeSw+/jfM2AV+uLc5Xif+eeniFtsW2KLWtZ9tYBsPPOGpWVmrlbV7eCBxXtjhfxrJ5GntnK/TFUKoCDNPsGAw/4ysvrJSX1NdhLp58J9sxiaFkBRgm7BgMP+Mr
L6yUl9TXYS5iY7/YfclOoNpudZ/88ClBjU5FVIrX1QPYNpbB8Du+8MIC0l7SLdVbBY9xR8BKbYPFvhPwo0eV+zcn4MSco1ZqxYio4YqnjDFhx2D4mc2zHvW6YKeY4BErF+WtgHVlkxP6Thw9lh48+eFTiImIlrTN2eKLfGK2jWUw/Iy33lBP7G/eBgK72gZz30kMK7WKto8UQ+lsEqbZMRgBwJuCnVKCR+qcmjA1f1y8Jg6LUhfhz3U7sKV6q0vvrBw74/LxOfhz3Q6307xURCUaWKxkNgnT7BiMEMNdBwenCdpqVgZTN8q+KZetHcrNh3UVM+eINkyLZ+dtw4io4W5d1xMUE3aEkDBCyHFCiLxQZwaD4RHuJuILaYJmakavtdfuNalto9xtMLedlcp35YhTx/Lbd390IVNSs/tPAF8peD4GY0AjlpHgrmBwZyso9llXdkbbsf65bgfWTMx3qeFFRkTyx/uynSOHIjY7QkgygOUAtgP4pRLnZDAGMnIyEoS8sUJeWqmwEEekto1idkahse45sxcrJ6zA0XNHRa/tKFh93XhIKQfFnwD8CkCM2AcIIQ8CeBAAxowZo9BlGQOV7777DgAwbty4AI/EN7hyQggJBjEBuXLCCuw5s1dW2Mfc0XMVG+vRc0dxYN1e0UR/fzYCBxTYxhJCVgD4gVL6udTnKKWvUUpnUUpnJSUleXtZBqNf40mWhZTQ4baIrthzZq/bKWSuxiq27Z47eq5fS9IrodndBiCXELIMgBZALCHk/1JK71Hg3AyGIMnJyYEegtd4kpEgpQ1JCR1OE5z2zwxQiJd1kxPC4vgdCCGC2RDcWLlz/fHTF9Fu7AAAEBCUfl0GMzUDsN+mA75pGu61Zkcp/Q2lNJlSmgJgPYAqJugYvkatVkOtVgd6GB7jKjDYE++kmCCkoLzmJGfrKNehwX0HoRg5obH2mI38v69brvOCjn/f0oPnP3nBZwULWJwdA0Dgulx5SkdHBzo6OgI9DI+Rm5Eg1zu572wlrpuvi16PExpzR8916SWN08R6/B2AviBhx7G6KuPE0WHq9FnBAkUzKCilRwAcUfKcDN8Tin1Or1y5AgCIi4sL8EjkY7ttFdtKOmYkyLn/QpWHhbC130llOhhM3dh3ttLltcU0QEqp07HeZkIokUnBNDtGwLpcDSQct61ieOKhlKs1AT/a7w6s24uTG+sQp3bW4szULOvZuxPcLOd7hZNwxGuEFy8lPLdM2DEC1uVqICFHIHmaMeDOc4pV20eHdZq6PD6nWGrYxe5LTqaQzbMeRTiR3khGq6Pwq9mP+SyTghUCYPisvhqjj31nKyWDegmIV15Hd4KGr5mv221RvXn2tsHNjufgTCHHL5+UDCy2pcPYqXjnMluYsGMoUl+NIQy3fRXDnc5ZYkjVknOk19prF1riy2ffY+nBuw27ZX/eNlTFF7ZiJuwYPl1NfUWoZOFIbV+VEircc3ryw6cke7ByODpBuHG6++zlOkbkEE7Cfb64sr6xDIYPkQri/e/s3yu6oLgKGOZQQpsEXPd7dYd4TRyqN3zg9XlY31hGv6OtrQ1tbW2BHoZLxGxfI6KGK645y7GzKWmeUNKB1WH8sdaer2I+mbBjhCShIuz8UadN6loRqgjEqWN9UjZJSQcWdy5ftnxkNjsGw4e4sokp2bja37ZXdxwjUtgK/+c/ecGjkvNyYDY7RkgSKiWepISZkIFfG6blG9coLbSUFKxi53TV8NpRkBEQUFDEqWPRazXjmvma4LEEBCc21rocj5TNjml2DIaPcJWGJ5a58vwnL8BoMUmm77kruHyVEugYJiLmtOD6wHJjjlXH4Jr5Ol8a3lXnMZZBwWAEMa7S8MQM/K6S4T2xa7kai1JOASkbJZemdmJjLSIjIp16YLg6r7cwYccISVJSUpCSkhLoYUjiKg3PXW2FO86TXGapsSjpFJBbrcUdTy4BYRkUjIGLShX867SrVCyx7AVtuIYvcil0nJxc
Zsdtbqw6RnCrODxqmKjw/G317/Db6t8hXhOHX81+TLbAcdzaclqjp30x5MQOyiH4ZwyDIcCVK1f4Mk/BiquwEzEtyFUyvKtqI0Ka2jXzdcFE/Ovm6y6FTruxA0/VPOORpic0lt9W/w7tPR2IUEXIOoeccvJyYJrdAMYX3jl/wRXuHDJkSIBHIg53L5//5AVeq9KGa5w+I3bPxZ6Nq3xWIU2t19qLeE0cKKV2Gp6QBimEY06tXMTS5a5brvMlnaTGoGRMIhN2A5RQLNgZqhgtJv7f7cYOWfdZSgi6iqcTdXwYOzE8aphLz6cYnH3PnQVSyjZnpmYMCh8kKeyUDIJmcXYDFKkQASXyJn1NqMTZucof5UIylFxgpJ6tVJXkEVHDJccap461C4kBfowLFBu/q+9PQEAIESxgoCIqHP/JMdFjBc/HcmMZjshtXMzwDlf3U8l0KA4pW6FUru6BdXvx39m/F7TthZEwEELc9gKLFfjkGB41TLRSi5wKLu7AhN0AROqHxQp2Kouc+6l0CXyp8A85TpPVE/OczklARLebUgKdG4tQ+XfuumIOCKUcExzMZjcAEf1hUSBvaK5/B+Mhwb595ZCbP6q0Ri1m85OTP3v03FGn48zUDBVRCWpbrgQ6NxYpe58/iscyYTcAEf1hEaD7iy5s+3AbkpKSkJWVBZ1O59/B9TO4H/Nvq38n+Tl/atSuKgGLzQ8rtTrlt7ojlLwRwErAhF0/Ra/Xo6amBq2trUhMTLQTXGIBnSOihiP/9nxUVlYiJycHZWVlABCUAq+1tRUAkJiYGOCRCON4/6OiotBt6Rb8rDZMi7yhuSgoKBB8Xv5Gan7Y5rfKEUpS89AWX5Vit4UJu36IXq9HVVUVcnNzMWbMGDQ3N6OiogJAn+CSitMaM2YMWltbkZqairy8PBQVFfHHBROdnX3hE8Eo7Bzvf3V1NdJOTcWn0bWgKntPaJw6Fv8+aj2Mp66LPi9/IzY/8obm4l/vNeLW1sw+wZWRBd148fG5mof+hgm7foLtCqrRaHDXXXchNTUVAJCamorc3FxUVlZCp9PxK+h/H34O3aprdit0Y2MjL0DGjBkDo9GIqqoqAMEn8IKVmpoa5ObmIjU1FXq9Hnq9Hr9e9Ti+NNfjlWN/RktPK4ZohuC/5vwvLB+fg5deegnh4eF48803ee3H9nn5G6FtZd7QXLcFsu19AJznob9hwq4f4LiCbt++3akhDaexcSwfn4Mx15L7jru977jGxkZUVFRgwYIFAIDm5mYkJiYiJycnYBM0FGltbeXvv+0PPhWpWDlhORobG1FUVIQx15Kh1+thtVqxcuVKXojs3r0bERER6OjoQEFBQUC2tI7byoKCAtmCi1t4W1paXM5DudtcJfBa2BFCRgN4A8BwAFYAr1FKX/H2vAz5OK6giYmJaG5u5v8GfhRctnCTqrKyEi0tLdBoNAgPD0dpaSkOHz4Mo9GIpUuXOk1QhjS2999W8HE4asxr1qzhn5XBYEBERARWrVrFC7/i4mIcPHgQkyZNQlNTk6Bg8LXQEPsejvPCduGtrKyUnIf+3uYqodmZAfwXpfQLQkgMgM8JIQcppfUKnJshA8eJmJWVhYqKCqdJxGlstuh0Ouh0Ouzfvx/19fVYs2YNfwznoBASlIEmGKue2AqcoqIiZGRkSC48OTk5eOONN+yeXU1NDdLS0lBZWckLrhkzZuDLL7/EV199henTpyM7O9tOMACQFBpKCEK5C6jtwutqHvp7m+u1sKOUXgRw8ca/uwghXwEYBYAJOz/hOBF1Oh1aWlpQVFQEo9GIxMRELFiwQHICNTU12WkYnINiz549oJQKCspAEmy17IS0lJKSEhgMBpSUlCA/P9/pBz9mzBio1Wq7Z9fS0oLe3l4nAdHe3o777rsPRUVFOHr0KBITE6HT6VBTUwMAokIDkBaEcpG7gNouvI47h6SkJLt5KFdbVApFbXaEkBQA0wF8puR5GT8itEoLTUS9Xo9l
y5bJntBiE6+trQ35+fnMXucCIS0lP78vjCcrK0tw4WlsbIRGo7F7dlqtVlBwFRUV8dvfLVu22AlBQoio0BDTnkpLS93S9mwFF3fMhAkTUFNTg9LSUv4c3MJrMBj488fHxyMmJgabNm2yO6dcbVEpFBN2hJBoAMUA/hel1KmsAiHkQQAPAqHTzT3YELNxLFiwAAsWLLCbiK40OUfEJl5SUlJQCroffvgBADB06NAAj6QPKS2Fu39VVVXIycmxcwYtWrQIwI9ChFIqauPjBEFYWJidEIyJiREVGkLj6uzshNVq5cciV9vjTB6A+FycMGEC72CxtTuWlZVBr9fbnd8dc4sSKCLsCCER6BN0b1FKS4Q+Qyl9DcBrQF/VEyWuO9CQsnFs2rTJK6Hk74nnLQaDAUDwCDuhxaK6uhoajQbPPPMMrwmJLUjc/1955RVBwRUfH+/0PDghuGzZMtFnV1NTIzguR5OFu7YyqbmoVqud3svLy3M6v5C26O4i7Q5KeGMJgH8A+IpS+pL3Q2KI4Usbh78nXn/DcbGorq7G8ePHcddddzkJILF7qtfrYTKZUF5e7uSNNRqNmDNnjt2xnPbm6tk5CsK2tjav55HUXBTTToU8t/4KOwGU0exuA3AvAD0h5MSN135LKd2vwLkZNvjSxuHvidcfsVgs2LVrF0wmE7RarWRgtxA1NTVYu3YtDAYDL7g0Gg16enqg0WhQV1eH1NRUdHZ2orq6Gm1tbYiPj+e3h7ae19LSUlRVVcFisaCrq8vOZhgfH+/VPNLr9dBoNILn0Gg0GDRokMvzC22Dy8rKcPDgQRgMBp/MQSW8sUcBEAXGwnCBr7aawZbWE2pw92/16tX8/XMMKQFca0+cthQWFgbA2YtaXFyMN954A1FRUXYhQlIhKOXl5cjLy0NsbCwqKiqQlZUFwFnbc2ce1dTUICMjw+kcxcXFyMjIQFJSksvzC22DOe+/rQMGUG4OsgyKEMJXW81gS+uRQ3h48ExdofuXkJAgqN1ERESIZkXYau5C51yzZg2KiopE7W2AcwjKqlWreJuurX0XAO/NVavVUKlUqKqqsvOsij371tZWZGdnIykpyW4udnd3Izs7mxfW3HsRERFYsWKF3fnEtsHt7e12Dhgl52DwzBiGLGw9Yo54uhX1d7yTEgSTR1/o/mVnZ6O4uNhOAysvL8eyZct4LQuAqHdS7Jn09PRIPitX77W2tvKaaHt7OxISEjB+/HicOXNGtmbPCWXbucilwHHCmnuvsbFRUGDJMckoPQeZsOsn6PV6HDp0CHl5ebyBfN++fbJWan/HO/U3hO5fbGwsLBYLioqK0NPTA61WC51Oh/T0dAAQ1FpsHQoRERGCz0Sr1Qq+HhcXh56eHsnn2NzcjOjoaKetbmFhIdatWydbsxczp+h0OtnbY6FzlJeXY+HChYJjVwIm7EIMMe2tqqoKeXl5dpU21q1b53Kl1uv16O3txRtvvIGEhARkZ2fzmkewhp0AwKVLffXWhg9XtnS3Jwj9cHfv3g21Ws0vPtwz4JwJYloLlxWRmpoqmHkhJlAsFougHa2kpAR33HEHH9cXFhbmtNU1mUxuafZS5hS9Xi/LzOJ4jri4OPT29iI6OhoWi8UnoU9M2AUAT7ebUo6E9vZ2wUobgPhKLXS+4uJimEwmmM1mPhUpGO12165dC/QQeIR+/EKxZrbPgNOyhAp2tra24qGHHsJ7773Ha4YajQaZmZnIzs6GVqt1ysgoLS0VtKMZDAaUlZVBrVZDo9HAYDA4CTZPNHsxc4qUmcXVZ+UKSk9hws7PeOP5dBXI6arShuNKLWYE37NnDx599FHmlXUD7v5wixgXa2a7sEVHR8NoNGLbtm1Qq9WwWq28o8VsNuPAgQM4d+4cNBoNtm/fjsTERD7lr6qqCrW1tXxerGMqIBc8LGRHs431s7WrcUyaNEk0f9e2XJNWq+UFrJwFWq/X4+DBgzAajTCZTIiMjIRarUZHR4fgOdwR
lJ7AhJ2f8cbzKSXEoqOj+WBUsZXaUZMQqzfmS49Yf8VxEduxYweqq6uh1+uRm5uLzs5OVFVVYf369XZBx7Z17MrLy3HixAncfffddkKnpaXFZa5zVlYWdu/eDY1Gg/b2dsTHx+PatWvIzMy0m2sZGRlOgk2v12Py5Ml2sX3Lli0D0BfKotPpBIsTAOKLoF6vx4EDBxAREYH169ejs7MThw8fDmh4U/DVyenneOP55ISYLdx2Y9GiRbh+/TqKiorQ0tKCwsJCPqi0sbERu3fvBiEEOTk52LJlC3JychAdHY3q6mqn88XFxbk9toGO7SIWFhaGm266CceOHeNf++ijj3ibalhYGBoaGpCfn8//zYWJEELsXsvNzUVtba2sLV14eDhWrlyJLVu2YOXKlQgPD8eQIUPsPpOdnc0HLW/fvh1FRUXQ6XRYsmQJcnJyEBcXxwtV7js1NDTYfTduXJyZQ+x+aDQarFq1iv/+3L9tz8HV9PMHTLPzM3LsI2I2PVdBxVqt1s4gXlJSgpqaGiQlJQnakPLz8/H222/jxIkT6OrqglqtBqUUlFJs29bXYWzSpElB6ZWNiIgI9BDssF3E9Ho9zpw5Y2f4d1zkxBY9k8nk9JrRaJRl4uCEKWBvkuA8wMCPxR24WDturnHbY6ESTJ4s0I6hMFJxdY4FAnwFE3Z+xpXAkmPTEzLiFhQUOE12rsTQpk2b8MwzzwhWv9BoNHaR/+Xl5TCZTBg0aBBycnJQUlKCyZMn+/EOyWP06NGBHoIdjgHBOp0OtbW1fLkjWzucbSkkx0UvJibG7rxywy+kSnQ1NjaKhoJI2cm4MXriwEhMTITZbOaPEztHQkICf798DRN2fsZVFoQrm57Y5HS1+opV5XCMxl+1ahX27NmD7u5uO4HJkMZ2EeMKcGZkZGDXrl1QqVS8YX/SpEmoqqrCsGHDnGxnJSUlsFqtksJJDDFhEh8fLxoi4ioigPtO7sTP2R574MAB3o582223ORU4qKiowPz581FeXu75jXcDJuwCgNRq6u6WgZu0YkGo3OorpFGKVb9ob28HpdTltQPJxYsXAQAjRowI8Ej6sF3ENBoNcnNzYTAYoFarBWPljh07BovFgj179vAOBavViqlTp3oUfpGSkiIoPCdPnsw7GzjkRgTYLsDt7e1uVb7m3jt48CBfHIHrb8Il+i9YsADR0dF+M5MwYRdkuLNlsJ20nZ2dgisnt/oKaZRi0fjx8fHo7e2VvHaguX79eqCH4AS3iHEmg//9v/8374QA7LV0o9GI++67z+7ec6lVjhV95dDU1ITp06fbPd/p06ejoaHB6bPuRAR4Ew4iFEdXVVWFe++9NyA1E5mwCzLcqWziOGlVKhX27NmDtrY2JCUlCZbNtv0hbdu2zUkb4Gx24eHhTq0VGfJITExEdXW1ZCtBtVqtaD4yl5xv+6wsFguOHj3K/20bM8eVi+eEkT80+EDXTGTCLsiQMyHE+nLqdDpMmTIF27dvR1ZWlsutSlJSElpaWlBeXo6Ojg4+yn7JkiUoKytDZWUlK+DpASkpKTh+/Lho5RONRoOwsDDR9/R6PQC4lWXjakcgtXUFgMOHD4NS6nGfWrlZQb4OHJaCCbsgxFVlEzl9OeVsVbKysrBv3z4+/omDawSTkpLCBJ0HnDlzBvn5+TAYDIK5qjqdDqNHjxY0O2RkZODQoUMwm81Yu3atrABcOfnNUo13VCqV0zjEriWEJ1lBgSgWy4RdiCG3L2dpaanLbZJO19cv1rEUUUVFBTIzM1FbW4vRo0cHpcDTaDSBHoIoXJ6ybV03rpXg4sWL+ftptVqxa9cu9Pb22mnwqamp2LNnjyybmlh+s0qlwqJFi5xi5mzhYvjWr1/vs34UYsI5EMVimbALMeT25RRqtCLkbFi2bBlKSkqcts1TpkzB0aNH/RYD5S6jRo0K9BBEsc1T5rT0V199FTk5OU5loABgy5YtvGAEfvSI2yJkU9Pr9di/f79T+fc1
a9Y4CRqxba67FU+EcDeCIFDFYpmwCzJcqfdCDbGjo6OdvHgpKSl2oQKTJk2CXq93cjZwSeaOP8TGxka+FR/DPSwWC79Fte0XUVhYyFcu4bQZjUaD6upqNDQ08M980qRJiI+PtzunUJZNVVUVjEajLEEjtgvwth8F4H7VFDHh2NLSwndi88W2lgm7IEKOei/HW8ulK9lWu+Birrjz2Do5NBoN3njjDT56n4v4Hzt2rJ/vgHwuXLgAIPg0PL1eD4vFgttvvx2lpaWwWq12JoLi4mLU1NQgISEB8+fPx1dffYXjx487xceZTCbJ4GJOO5Ky29oi5vgCvOtHAbjfG0Uqm8KX1XaYsAsi5Kj3cry1Ut3pAWD//v3Q6/Xo6elBdHS0U+jJqlWrcOXKFRw7dgwmk8ljD50vMRqNgR6CIDU1NYiJiUFsbCzCwsKwevVqpy1maWkpwsPDUV5eDrVa7VQlOD8/HxUVFZLPmNOO3BE0Uo4vb8JB3A0pkapS7MtqO0zYBRFybR+u3PdS59Hr9fjqq69w1113obKy0m77yqWLlZaWIiwsTFalY4Y9ra2tWLVqFd59911cv35d8DkYDAb88pe/BADBnOUxY8ago6MDW7duFb2ObR8I4EdBw5VncldYeftcuXPYtnI8ePAgwsLCnOrXOY45IiLCacy+iPtjwi6IEFPv4+LiBCvaunuexMREVFVV8VH9vvTQDVQSExNx5coVWK1W0Tg72y2mp/0/bLWjKVOmIDo62mUTbl/jmNEjVb/OVugVFBTwzhoOX2TuMGEXRAip92VlZTCbzU6T5ty5c2hqahIUgEJ5ktwPoaSkhBdwvvTQDVS4gGKj0Qi1Wi3YYcy2qYxUlWApAp2NIISt+aSgoMAuflNqwfRVP2RHmLALIoQmMKUUa9eutZs0Op3OyahtGw1/5swZuzxJjUbDr6Tl5eXYsWMH2tvbER0djbKyMrsaeO+++65ot/dgypEdNGhQoIcgSFNTE0aNGoXz589j9erV6Ozs5FP4IiMjAcCuqYxer8fQoUPtek0YjUbs37/fZWc4udtPfwXw2u4U3AlH8ZfgZsIuyHCcwJxNx3bCajQap9gqoUbJSUlJ/DFcCtKgQYPsSoG/++67fFWKmJgYWK1WZGZmCkb+L1682P83RIRgqXbiSGtrKzo7O+2cDunp6WhsbOTvMyfYuPxlIc/59OnT7UJUAM/spf4M4LXdKbi7PfdHGpkiZdkJIUsJIV8TQr4lhDyhxDkZfXBJ5Vws3JYtWyRjq7gVlZvk3DF33XUX9Hq9UynwO++8E4QQEEJw7do1GI1GPqGcK91dWVkJg8HA7HUySExMFH0+JpMJEydOhFarRX5+PjZt2oSmpiankuf5+floaGiQXQJdCsdy8d6eTwpuO9rY2MjXr2tsbORbA1RUVCArK0vx68rFa82OEBIGYAeARQDOA6glhFRQSuu9PTejbwI5Rsm7WjWbm5sFw0+kfoSrVq3C4cOHoVKpeC8fJ9y4LlX+Kp8th3PnzgEIvorFWVlZ2Lt3r2gc2fnz5zFs2DCXaVy22z1vq6H4y/7quB3lnCacNzbQNkUltrGZAL6llH4HAISQXQBWAWDCTgF0Op1Tnqur7AhugjlOcikhyTVEEUpeLy4uRkZGBt8cJRgEHldvL9jgqvoKZVDEx8djxowZqK2t5T8vp1qJNxVJPPX2ekogq5q4gnAVaT0+ASFrASyllP70xt/3Avg3SunPxY6ZNWsWraurc+s63333ndNrcXFxGDJkCKxWK5qampzeT0hIQEJCAl8L35EhQ4YgLi4OJpMJ58+fd3o/MTERsbGxMBqNfMS+LUOHDkV0dDSuX7/OV861Zfjw4YiMjMS1a9f4Dva2jBgxAoMGDYLBYMAPP/xg9153dzfq6upw7tw5jB49Gunp6YiPj0d3dzfa29sxZMgQjBs3DhcvXsTHH3+MiRMnYvDgwfyxV69exejRozFu3Di0tbWhra0N7e3tfJVY
rVYLANizZw8WLFiA1tZWPmOiu7sbHR0dqKqqQkREBG6++WZMnz4dPT09uHr1KkaOHAmVSoWUlBQAwA8//ACDwWA3/vDwcF7YXrp0yampdUREBK+VXbx40akYp0aj4bMjLly44BREbDKZoFarMW7cOJw7d85J+EVGRmL48OEA+n7cZrPZ7v3o6GgMHToUQJ9TwWq12r0fGxvLCwR3597Vq1fR0NCApqYmxMTEYNGiRdBqtejp6cGVK1dgNptBCEF9fT0MBgNmz57NP5Oenh60trZCpVIhJiYGn332GebOnWt3fHx8PFJTU2XNPb1ez88X23P09PTgzJkzIIRAp9MhLi4OUVFR/PGjRo2CRqNBZ2enoBaYnJzM94C9cuWK0/tjxoxBeHg4P/e6u7v94iQhhHxOKZ0l9J4Smh0ReM1JghJCHgTwIAAnjYNhDyfQ5s2bh5SUFHz33Xf8hL527Ro/cVUqFVJTU2E2m3H58mX+eG7SHjhwADk5OYiNjUVPTw+6u7sRGRnJ/+Dq6uowc+ZMjBw5Eh0dHejp6YFWq0VUVBSioqJw7733oqioCDfffDOAvu5lwapRBQvd3d3o6emB2WxGeHg40tLS+IVFq9ViyJAhaGlpwfnz55GTk4MRI0agvr4eLS0tsFqtIKTv59TQ0ICrV69izpw5TsdfvXpV9njGjx9vd36VSsUXKsjJycGQIUNw/vx5XmDZCjyl6O7uDkiVE0eU0OzmAHiaUrrkxt+/AQBK6R/EjvFEsxtIFBQUCCbmFxYWwmg04sknn7SrkmGxWLB9+3aniHuxkIOCggJ+69ve3o6EhASMHz8ep0+fhlartWuyHBcXB0opWltbER8fD7PZzEf/BxJO2xo3blyAR/IjYp5PW1uVxWLBs88+K1iS/Z133sGgQYPsQoGEjhd61u6MqaioyM4GzF3f05LwUtfm8q8TEhJw++2329mBlb4e4HvNrhbABEJIKoALANYD+HcFzttvcRX3JJXZIFX99qWXXuK3qY6pOdx1CwoK0NLSguvXr9vF6RUWFvJNlm1tda2trbjnnnvsgpyDwVHBxawFE3Jym5ubm0VLshNCnNphCh3vjr3NHUeVq6ZOcragjiFSGRkZeOihh5y0ObGSVb7c6not7CilZkLIzwG8ByAMwD8ppae9Hlk/RU7ck5hRWa1WIzs7WzDaPCMjA6dOncKqVav4KrW253SscOyYExsZGYmVK1c6Ja0XFhbavZaXlxcUaWOcPS6YkPJ8ckHEXFkndzNXuHRBLkDcmzF52tTJ1RZU7LNJSUnQ6XR2glusZJUvt7qKBBVTSvcD2K/EuUIRd1YksdWfa7EHiKfPUEoRGxuLBQsW8N7YuLg4vrFKfHw8PvjgAyxatAiEEJSUlKCqqgoWi8Uu31XoB8BV17WF0yYdX2NpY8KICRGNRoNnn30WCQkJuOmmm/D11187lVAvLS1FTEyM6CKXk5Njp103NDQ4afFCxMXFOZ3TnRQ1dwptuvosN3caGxudmq/7o6Any6Bwga3dQavV8uEe3ARzd0USW/3b29v57aFY+sy5c+f4SWo0GrFq1SocOXIEq1evttt6HjhwwK5/QXl5OXp7eyVzYsWKODqWPw+WtDHOux5Mzi6xRYrr23rw4EF89dVXTvXtAMBsNsNkMjkJoeLiYmRmZjpp13v27MGWLVtczjfbQqLcOU+dOgWz2SwrPctxvtr+HhxDYVzF9DU3NyMiIgKVlZUYNWoU9Ho96urqkJiYKNmJTSmYsJOAE2Q6nQ69vb38JK6uruZzFzm7hNwVSapwoW0JdKF4Je7voqIiREREoLq62mk1XLNmjVP/glWrVqGoqIi/rtCPsqenxylPtri4GL29vR51qPc1jqEkwYDQIsW1s+TaJzoKLq6Eek5ODgoLC516v3Z3dyM7O9vuOtziKKf2m8FgwKpVq+zOefvtt6O8vBy//vWvXX4n2/nqamGX2h5zGRQrVqwAAFRVVTmlyFVXV9vN
LaUXVibsJLCtBssJFb1eD71eb/egbO0SgPOKZLvNjY6OFmxwM3/+fJSVlTmVpRbaIi9btgx6vd6uggkH90OwLQl12223oaenh9capkyZgpaWFhQWFsJkMvH1xFQqldMPTaPRuNUJfqBju0i5smEBP84VzmTg2Pu1oKDApX1NSgPiYkVtvZ5cyX052C6MrraaQotoSUkJDAYDH8/JRQMIFZctKipCamqqUyC2Ug4xJuwk4CahrXou1+Mm1a/znXfewZ49e9De3s4LkOjoaLuy1GVlZdi/fz/CwsKcBCOHmKFbo9HY2XjKy8sRERGBUaNG4Z133gEhBCaTCYMGDcKgQYOgVqsRGxvLV1QBfgx1ycnJQXl5uexQB8aPuDNXuOfW3NwMg8HAL3CRkZF8YQCuT4Wjk0JKA/K2fJKttupqqymk2XLFI6qqqvhqL1LRBkKl7JVyVDBhJwE3CW3Vc7keN6l+nbNnz8bx48dx7733OtU547YmeXl5KCoqwpo1a5x+LFwlYaHqJFxql+M2dteuXWhqakJkZKSd/aa0tBQGg8HJYF5SUoK0tDS7TAKGfPR6vaRwsJ0rOp0OJSUlGDx4MHbv3o2IiAi7Z/TOO+84lfQqLy/HqFGjnPrDOqJE+SROW5WjZcop/e6qhJhjpIBSjgom7CTgVkUu3zE3N1fS47Z9+3anySQkHLOzs1FTUyNYltp22xoREYHOzk6nrey1a9ewYcMGvowTt83UaDTo6emxs/FwxwIQDC9ZvXo19uzZw2uUxcXFMBqNMJvN0Ov1OHnyJG9nCSaio6MDPQRR9Ho9Dhw4AK1WK+pdffbZZ/nadbW1tRg5ciS6urr45ti2zyg6Ohq33HKLncBKS0vD/v37ERcX51J4KZWv6o2W6Li996bfsacwYScB93BqamrQ3t7OZzAIue3F6v6LCcekpCRkZWXxnq3q6mpcuHABZ86csZsE7777LiIiIpwcB52dnfwYuSKPOTk5dt2mbLfQb775pmh4ia2xm3NwcMKvpKQE586dCzo7HZfXGowcPHgQERERSEtLE9S8w8PDYbVasXz5cqf7KtSToq2tDXq93kk4mEwmtLa28ouZr5+RUkU2pc4jt9+xJzBh5wJbFZ5Tr/V6vWiDE0ctLCUlRXAVmzBhgpMtj7PN2K7qWq1WMNh3z549SE9PBwC+v+uYMWNEDcqJiYl8QQRXxm5b4Zefn4/CwkI0NTUFXYexYMU2pjEpKclurixduhSxsbF8yIkjQoujVqsVtP0VFRXhscce82uuqVJaoth5fFminQk7mdhuR7kHxeUpuvK+TZgwwWkVk2p3aPtgOW3Mtv0hV72CCwmZNGkSjh07huZm+25TtjajrKwsHDhwwCnmyrEngpDwMxqNyMnJCaoOY1ylEa7ySjBhmwnhOFe4BYoLOXG8l2JhQWIGfV+2HgwEvizRzoSdCzhNjVKKHTt22CUzOwoGKe+bY8KzXNtEfHw83n33XVy4cMEpLsk2JGTUqFF2oSXR0dF2sXXcmN977z2+PLharUZYWJhdTwQh4afVaoPuB+VYkilY0Ov1snp4iNmhhH7sYgHfcsNPQg1f1cRjwk4CIU2tvLwcVqtV0AvmTlVYKUeHbRCv0WhEU1OTYCPloqIiu5AQ2+11YmIiRo4caWdf5Iz6FosFYWFhUKlUsFqt2LVrF3p7exEXF4fe3l474VdcXMy3uetPPyhfwM2XzMxMpwBtVxq0LUIFHMSaSss5X7DjuGvR6XR81omSMGHngKvGNlwYh5AXzJ0EazHbhE6nsxNYS5cuFQ0e7unpsXuN+5FwP7rvvvsOUVFRKC8vR2dnJ9RqNSil2LBhA3/Nt99+G+Hh4bj33nud7JFqtRopKSm4cOEC9Ho9oqOjQ/YH5Q84zd5gMMBkMvEOLY1GA0qp0yKyZMkSWXnVjtqe0KLkuPAqWUHEl9VI9u/fzzdtt921AFBc4DFhZ4OjJrd9+3YnIcN5QYW8YO4YV92xTezf
v19UC3SMLhezG65evRrR0dFOqWQWiwUrVqxwarhcXFyM5ORkXL16Ffn5+dizZw8opUGRKhascJ3F3n//fWg0GjvbaElJCR/TGB8fD5Wqr9eV3LxqIW1PbO4oWUHE19VIuGwkoV0LE3Y+xNHm5qipcf0A1q9fL/jg3TWuyrVNcEGntuEuJSUlSElJceoLIWU3fOihh9De3m53bi6diOsmxhXpBIDLly/zDpK2tjbk5+cHhb0OgFMH+WCA6wSn0WicPOj5+fkoLy/Hvffei927d0OtVqOkpAQJCQkwGAxuOxqk5o6cyjpysT0Xp+F1dHRg//79/Di8Qcz54rhrUQIm7GxwtLk5amqHDx922eVcjgBzd1vArXDctkir1WLkyJG4evWq08QTsxu2tLRgx44dTo1bBg8ezOfIJiYmYu7cudDr9ViyZImdk4OzpQQLwbidzsrKQklJCQghgs+go6MDpaWlCA8PF9WUlLCLCs2Bzs5OXLt2zSn3Wu65fKXhiQVec6XolYQJOxscNTmdToeWlhbe60kp9Tq629NJs2zZMtTV1eHJJ59EfX296Dmio6OxY8cOPu82JSUF9fX1iIqKcqpCXFJSgqioKKxbt46v5nLs2DGYTCbU1NTgtttug9FoREVFhU9W2v6GTqdDVVUVCCGimRNarVYyHUoJR4O7OxI55/JVvTmxXQvzxvoYIZubXq/ng4bl5Aa6wnHSGAwGvtBmTU2N5IrrauKVlpaCEMILterqahw/fhxqtRorV67EV199xXej5xr2AMCbb76J6OhoEEJ4wcd5/MLDw6HT6dDQ0ODpbfUJwdiDAgAWLFiAffv2OcUzlpWVQaPRSHrsuTJI3tpFs7Ky7CrryNmRSJ1LrDWnEloot2uxnZfMG+sHXNnclIjutp3s7mp5riaebeQ+0NehKj8/H2+++Sbq6+vR0NBg5/UqLCzk68L19vYKFhAoLS2160nLcA2lFNeuXePjGWNiYmA2m5GTkyOaDsUVtVQigFan0+HgwYN8ZR1vdiQ6nQ7nzp3DqVOnfJbGtWzZMp8IN0eYsHNAyuamRHS37RbD3a0B95qYd9axhwEnWBMTE/Hll1/aeb0MBgO0Wq2d9iFUa62rqwvx8fEoLS11qXky+jT39evX25VpCg//8WfW09MjWM9wxYoVit7XRYsWoaqqCvfee69dvjSHO4KqqalJsMJOSUkJX8IpFGDCzk28je621Q5tjb+2DouWlhbJ6wMQ1DAdI/c5wcoZzm0FYU1NjcutDRfeEuh+n6EE90zDwsL4e8S1T6yqqsLq1avR2dmJPXv28MUpfVEQ1TFtUG7PCbHvlJ2dbZfnm5iYCIPBEFLzgAk7P8NNjtLSUqjValRXVztVtCgpKZGsziqmYZaUlNgJQa6xyvTp050EoZy6fCUlJU5lxIMpZSwYEQssd0zmT09PR2NjI68xc5VrXGnO7njyHcsqeboj4b6T7fm4vq+hBBN2CuLOROSKb9bW1goGVboSKEIaZk1NDSZNmmQ3qUeNGoVjx44hJSXFbnUXy7dUq9V8XT6DwSDY/yAYUsbi4uICPQRBxOy6XK9W2zkSGxsLk8lk1zBJSnP2JvzDmx2JLyuR+BMm7BTCnYloa6s7evSo28ZjMaGalZUlOAaTyYQ777wT7733Hu/1ioiIELQdGY1GPPXUUwCAV1991WdGaW8ZMmRIoIcgiJjWXVNTI6jFl5eXyw4q9ke7QXe+k2NGhy8bXCsBE3YK4c5EtN1CupNPC4gL1XPnzqGpqQkdHR12bvyMjAw0NDSgubnZzuvV2NiI4uJiuwms0+nw5Zdf8ttYo9HolNAeLCs6V/WEC58JJsS0qP379wvmWtvOEamFzp1CE0ojpRn6o8G1EjBhpxCeVjxxd4sgJFR1Op1gj4Kbb74ZDQ0NggbqsrIyUEqdmi9zxvT4+HgsXboUgG9qi3kLV88u2OLsxNDpdLLKekktdO4ujP4iUBqnuzBhpxCeVjzh2hrK
bVcoJFS5eDohjYGro3fy5Em7tLA77rgDAHivINcPIT4+HjExMejo6OBDTRxr8TE8Q2yOxMfHi1YvsUWqVaFjw2p/EkiN0x28EnaEkBcArARgAnAWwE8ope0KjCtgyLE9CH3G24onYj0sHBH6wciZbLGxsbBarYiJibHT5qxWK/Ly8hAbG4vS0lJ0d3fDbDYjMTERkyZNcio0wPAcoTnCadhCzZoccZw3XIP27OzsgG4dg1XjdMRbze4ggN9QSs2EkOcB/AaA6zbjQYoc24PYZxYsWGBXOUSpiieOCP1gxCrjchpDdXU1amtr0dvbi7CwMLzxxhtQq9WIiIjAlClT8NFHH/HVTtRqNX7961/b1derqalhwk4BhBa5O+64w617y80b254oQGC3jqHirfVK2FFK37f581MAa70bTmCRY3twVXrd1xPNNqi4vb0darUaKpXKyZHAbW/+8Ic/YNCgQXZpYhUVFRg2bBiamppQW1uLhIQErFq1im8EU19fD51Ox3+vYNuOhDLeBqVzBNPWUYnMIn+gpM1uI4BCBc/nd+RMIDmf8Zcb/r777uMF2O7du/m8WdvtzSuvvILVq1cLOjRsk/65ldi2EQz3vYJtOwIACQkJgR5CQBHbOsbFxaGgoMDvISBKCXFf4lLYEUIOARgu8NYWSmn5jc9sAWAG8JbEeR4E8CAAJ2ERLMixPbj6jD/c8AcPHkR4eDjefPNNfkLPnDkTtbW1oJRi0KBBSEpKQn19Pbq6upzu96lTp5wcGrYFPjnBzW2Rs7KyFBm3kgx0YSdm/zObzUEfAhIoXAo7SukdUu8TQu4HsALAQkoplTjPawBeA4BZs2aJfi6QCE0grqosV/RQrA8sZ5/wtRter9fDarXa1abjJrnjVtVisSAhIcGptplUuZ7m5mbExsaisbGRrysWjD8UrlqLbZL9QEJo60gpxdq1a4PCjheMeOuNXYo+h0Q2pfSaMkMKHI4TKDo6WrCqrFAfWO5YJW0p3Ha4paUFWq2Wb97iGJial5fn1FsiNzcXu3btwsKFC52qLTsKQOBHh0ZJSQm6u7tRWFiItLQ0v5Te8YTm5mYAoRNn5wscF6JnnnkmaOx4wYi3y+JfAGgAHCSEAMCnlNKHvR5VALGdQFIeL7HYM6Xc8Nx2WKfTobe3V7IJ0JgxY5x6S3CNgcrLyxEXF8c3e6GUIi8vz0k7LS4uhkqlwvTp0/HZZ59h3bp1qKiokCxIwJDG3ylUoRICEii89cbepNRAghFPtDSl3PDcdriystJuW+xYdj0rKwvR0dF8kxxAvAz33LlzUVtbK9hgR6VS4Ze//CUsFgtqampgMBjYFsgLApFCFSohIIFiYBo8ZNI2tANLdi1Hq/EKhkcNw+ZZj2KKapLgSmm7ikdHR/OeUU/d8JygdaxsbFt2nUsLM5lMIITwzbXFynAXFRUBAF8A4KGHHuJ/EIsWLQLQpwnExMSgpqbGzlnBcI9ApFCFSghIoGDCToR9ZytxVPsJTEYTAOBi9yVsq/k9Zndn4Gfzfmr3Walera4m2r6zlfhz3Q5c6r7MC9Tl43P4LYljZeO8vDyntDBu9eYmuVgZbqPRiHvvvRdvvfUWni9/ASci9bgedh0J8fEYp70J0Y3RKCsrw8KFC1FeXs62QF4QqDi4YHUoBQNM2Inw57odMFlNdq8ZrSZ8GdcgWbIJkL+K7ztbiWeObkePpa9z18XuS3jm6HYAP25JdDqdU2VjW8aMGQO96TQq6w/ikvYyhk8chptbx6O5uRn11gZekEaHR8M40ojiqgpEj45Gj7UHZtrn0WyztOOp6meQbZ6LidMm4r++/DVaRrXiwAcf4L7JG7y7kT4iWEs8cTD7WfDBhJ0Il7ovC75+1XTV6TVPV/E/1+3gBR1Hj6UHf67bgQPr9gLoE6Tt7e0oKipCRESE0w/ozWNv4fjgkzB3WwD0CcwrUVdx+b3folH7LxhvCOwucxdwoxqSwWJwGotFZcHn0cfx
0b8+6TuGAN2kG/+n+Q2MPjsay8fnSH4XfxOsxTs5mP0s+GDCToThUcNwsfuS4Osc3Bb04qhL+HDXx/j/Zv8nLxTkrOJiAvVi9yXsO1uJ5bocpwKJjj+g107/A2aVxe54k9WEb9Rn+ZpvcukwdgLE/jVO+AabsDOZ+oS4Wq0O8EiEYfaz4IMJOxE2z3rUbosJANowLTbPehSAwxaUAC3GVmyr+T2sVituCZ8iuorb2ugIIRCLw+a2s7ZCRugH1K3tFjzeCvcEHQAnQcchJpQDyfnz5wEEd5wds58FF0zYQdxJAEDydcctqNFqwvbDz+E+892Cq7ijjU4i4YTXqITGYBvjV1G4X1ADdRdtmBbacA3ajR1O79lqswxGqDLghZ2Uk2D5+BzR7ZuYttMT3oNNDwoHHAsJSCm4sYiNDRDWQOUSp45Fp6kLw6OGYe7ouTjYeNDpM7baLIMRygRfAX8/I+UkkEJM25HSgtzVwFRE5XJsy8fnYOvcLRgRNRwEBCOihGo2CGMr6Pac2euk1cWpY7F17pags9cxGJ4w4IWdmIbmyk61edaj0IZp7V6T0oL2nRXvsTkobJDTa9owLaxU2O7mOLbl43NwYN1enNhYiwPr9soWeBQUF7sv4d2G3YKaYWREJBN0jH7DgBd2nmhogLBGJaUFSWmKvdZep9dWTlghKrRcjU1IEHtCMDomOBITE1nMGsMtBrzNzpXXVQopm54jUoKDC+615ei5ox6PzdG5EquOASEEHcZOUMivrhXMjonY2NhAD4FHysHFCB4GvLBz5XVVCrG4PTEudV/2amxCgnjf2Ur8tvp3sq4f7I4Jo9EIANBoNAEdhysHFyN4IFLhD75i1qxZtK6uzu/X9RZvVnDHHwXQJ1A0YWp0mDqdPj8iajifRaHU2F15beM1cegwdoaEdvLdd98BCHyc3dLCFYKLmNLPjyEPQsjnlNJZQu8NeM1OLt6u4GJaGgCPt9HuICfsZVD4IFRv+EDR6/Z3PHVwMfwPE3YykQpRkasBSdn4fL2NlvPjYz9Q95GTVsgIDpiwk4kvV3B3HB2eIsdmyH6g7uONg4vhXwZ86IlcPA1RCRY2z3oUEaoI0ffZD9Qz3A1BYgQO5qCQiZiDIZQm9rz/u0DQGaIiKjw7b1vIfA8AMBj6ylRFR0cHeCSMYII5KBTAXyEqrvDGIywk6ADASq0hJegAJuQY7sOEnRv4w7Ymxb6zldj64TY+CPli9yVs/XAbPzZXqIhKMAVNRULPmnH9+nUAwKBBzql2DIYQoTfLBzDPf/KCU7aFmZrx/CcvyDpeLNfWSq1YWrgC0/6ZgaWFKyTzeIOFixcv4uLFi4EeBiOEYJpdCCG2DRV73RZXAozz1LIMAEZ/hWl2AwRXJatskVPiisEINZiwCyGESkFJvW6Lu/GALMCY0d9gwi6E0IQLN5cRe90Wd+MBQyV+kMGQCxN2IUSHUcRmJ/K6Le7WuJs7eq7szwaC4cOHY/hw+VWZGQxFhB0h5DFCCCWEsGqKPsSbLA4u0j9eI6/f6tFzR90am7+JjIxEZGRkoIfBCCG8FnaEkNEAFgFo9n44DCncLQXvyPLxOaje8AH+O/v3Lu18XO/aYOXatWu4du1aoIfBCCGU0OxeBvArwI0SuAyPUCoPc/n4HHx6fw1ObqyT7FfxVM0zQSvwLl26hEuXvG8h6e41169fj/Hjx2PKlClYtmwZvvnmG7+OQYj58+cjFNIvX3/9dUyYMAETJkzA66+/LviZf/3rX1i4cCHS0tIwf/58vj8w0Nd4fvHixZg8eTKmTJmCpqYmt67vVZwdISQXwAVK6UlCRDos//jZBwE8CABjxozx5rIDGqWzOKRaMfZae/HHT19k8Xbo6/G7evVq3H///di1axcA4MSJE7h8+TJuvvlmv43DbDYjPDz0wmOvXr2Kbdu2oa6uDoQQzJw5E7m5uUhISLD73GOPPYb77rsP999/P6qqqvCb3/wGb775JgDgvvvu
w5YtW7Bo0SIYDAaoVO7pai4/TQg5RAj5UuC/VQC2ANgq50KU0tcopbMopbOSkpLcGiTDdywfn4OVE1aIvi/UNHsgcvjwYURERODhhx/mX5s2bRqysrJAKcXjjz+OW265BTqdDoWFhQCAI0eOYP78+Vi7di0mTZqEDRs28I3Ra2trceuttyI9PR2ZmZno6upCT08PfvKTn0Cn02H69Ok4fPgwAGDnzp248847sXLlSixevBjXr1/H+vXrkZaWhnXr1vGpcwDwyCOPYNasWZg6dSqeeuop/vWUlBQ89dRTmDFjBnQ6HRoaGgD0FVTgrpmWlobi4mIAwPvvv485c+ZgxowZuPPOO/nCC57y3nvvYdGiRRg8eDASEhKwaNEiHDhwwOlz9fX1WLhwIQDg9ttvR3l5Of+62WzGokWLAPTlRrtrs3W5RFBK7xB6nRCiA5AKgNPqkgF8QQjJpJT6d3/B8Jh9Zyux5wwrH+6KL7/8EjNnzhR8r6SkBCdOnMDJkyfR2tqKjIwMzJs3DwBw/PhxnD59GiNHjsRtt92Gjz76CJmZmVi3bh0KCwuRkZGBzs5ODBo0CK+88goAQK/Xo6GhAYsXL+a3yZ988glOnTqFwYMH46WXXkJkZCROnTqFU6dOYcaMGfxYtm/fjsGDB8NisWDhwoU4deoU0tLSAPR1ZPviiy9QUFCAF198EX//+9/x+9//HnFxcdDr9QCAtrY2tLa24tlnn8WhQ4cQFRWF559/Hi+99BK2brXXa1544QW89dZbTvdj3rx5ePXVV+1eu3DhAkaPHs3/nZycjAsXLjgdm56ejuLiYvznf/4nSktL0dXVhStXruCbb75BfHw88vPz0djYiDvuuAPPPfccwsLCpB+cDR7rw5RSPYCh3N+EkCYAsyilrZ6ek+F/XJVrj1MHTxevYOXo0aO4++67ERYWhmHDhiE7Oxu1tbWIjY1FZmYmkpOTAfRpgk1NTYiLi8OIESOQkZEB4MdOaUePHsXmzZsBAJMmTcLYsWN5YcdpRQDw4Ycf4he/+AUAIC0tjRdmAFBUVITXXnsNZrMZFy9eRH19Pf9+fn4+AGDmzJkoKSkBABw6dIjflgNAQkIC9u7di/r6etx2220AAJPJhDlz5jh978cffxyPP/64rHskVEpOyPT14osv4uc//zl27tyJefPmYdSoUQgPD4fZbEZNTQ2OHz+OMWPGYN26ddi5cyf+4z/+Q9b1AZYbO6DZd7ZSsnpxOAnHr+fIm8z+ZsSIEX693tSpU7F7927B96RqQtp2PwsLC4PZbAalVPCHLnWeqKgou7+Fjm9sbMSLL76I2tpaJCQk4IEHHkBPz48LGTcWbhzcNR3PRSnFokWL8M4774iOB3BPs0tOTsaRI0f4v8+fP4/58+c7HTty5EheEBsMBhQXFyMuLg7JycmYPn0632ApLy8Pn376qVvCTrGgYkppCtPqQgeuGKkYKqLCM/OeClrnxKBBg/xa3mnBggUwGo3429/+xr9WW1uL6upqzJs3D4WFhbBYLGhpacGHH36IzMxM0XNNmjQJ33//PWprawEAXV1dMJvNmDdvHi88vvnmGzQ3N2PixIlOx9t+7ssvv8SpU6cAAJ2dnYiKikJcXBwuX76MykrXnvTFixfjL3/5C/93W1sbZs+ejY8++gjffvstgL4wHyGv8+OPP44TJ044/eco6ABgyZIleP/999HW1oa2tja8//77WLJkidPnWltbYbX2Vef5wx/+gI0bNwIAMjIy0NbWhpaWFgBAVVUVpkyZ4vL72cIyKAYoUttXFYK/crHBYPDaaO4OhBCUlpbi4MGDGD9+PKZOnYqnn34aI0eOxOrVq5GWlob09HQsWLAAf/zjHyWzO9RqNQoLC7F582akp6dj0aJF6OnpwaZNm2CxWKDT6fhtmlBf3EceeQQGgwFpaWn44x//yAvW9PR0TJ8+HVOnTsXGjRv5bagUTz75JNra2nDLLbcgPT0dhw8fRlJSEnbu
3Im7774baWlpmD17Nu/Q8JTBgwfjd7/7HTIyMpCRkYGtW7fy2/KtW7eioqICQJ9TZ+LEibj55ptx+fJlbNmyBUCfNvriiy9i4cKF0Ol0oJTiZz/7mVtjYGXZByjT/pkBKhIaSUBwYmOtn0fkHsHSN5YRXEiVZWea3QBFKsWMggZtMDGD4SlM2A1QXKWYPXN0OxN4jH4FE3YDlOXjc5A5QtyIzgp4usYXTX8OHDiAiRMn4qabbsJzzz0n+rkjR45g2rRpmDp1KrKzs/nXN27ciKFDh+KWW25RfGyhDhN2A5i/5RTgzklrRd9nBTyFoZTyHkMlsVgsePTRR1FZWYn6+nq88847qK+vd/pce3s7Nm3ahIqKCpw+fRrvvvsu/94DDzwgmJnAYMJuwPPkrU+IFgMI5gKeo0aNwqhRo/x2vaamJkyePBmbNm3CjBkzcO7cOQDAf/3Xf2HGjBlYuHAhHxbhKceOHcNNN92EcePGQa1WY/369Xy6lC1vv/028vPz+RzzoUP52H7MmzeP93Iy7GHCjuF16ahAoNFoBMMyfMnXX3+N++67D8ePH8fYsWPR3d2NGTNm4IsvvkB2dja2bdvmdMxbb72FadOmOf23dq2zRi03peqbb75BW1sb5s+fj5kzZ+KNN95Q9ov2U1gGBSNoGoC7Q2dnX3VmLtXKH4wdOxazZ8/m/1apVFi3bh0A4J577uHTsWzZsGEDNmzYIOv8clOqzGYzPv/8c3zwwQe4fv065syZg9mzZ/u1+koowoQdA0DgG4C7S2trX7KOP4WdY8qWI0KC6a233sILLzj39b3pppuc0s+Sk5P57THQl1I1cuRIp2OTk5ORmJiIqKgoREVFYd68eTh58iQTdi5g21gGw0OsVisvsN5++23Mnevct2PDhg2CKVVCebYZGRk4c+YMGhsbYTKZsGvXLuTm5jp9btWqVaipqYHZbMa1a9fw2WefYfLkycp/wX4G0+wYDA+JiorC6dOnMXPmTMTFxfF17DwlPDwcf/nLX7BkyRJYLBZs3LgRU6dOBQD89a9/BQA8/PDDmDx5MpYuXYq0tDSoVCr89Kc/5UNN7r77bhw5cgStra1ITk7Gtm3b3EqW78+wdDFGSMLSxRhCsHQxBoMx4GHbWEZIwhXEZDDkwoTdAGff2cqQCjnhUKvVgR4CI8Rgwm4AwxXw5OraXey+xBf0DHaB19HR1wgoLk5e028Gg9nsBjBCBTxDpQDAlStXcOXKlUAPgxFCMM2uHyJ3ayqW6M8KADD6I0yz62dwW9OL3ZdAQfmtqVBtOrFE/2AuAMBgeAoTdv0Md7amoVgAgMHwFLaN7We4szUNxQIADIanMGHXzxgeNUywF6zY1jTUCgBwcLXcGAy5sG1sP8Pdrem+s5VYWrgC0/6ZgaWFK0Km70R4eDjCw9lazZAPmy39DHe2pqEcZ9fW1gYASEhICMj1e3t7cf78efT0CPfeZfgWrVaL5ORkREREyD6GFQIYoOw7W4knP3wKVurcS2FE1HAcWLc3AKOST6ALATQ2NiImJgZDhgwRrGPH8B2UUly5cgVdXV1ITU21e8+nhQAIIZsJIV8TQk4TQv7o7fkYvofT6IQEHcDi7OTQ09PDBF2AIIRgyJAhbmvVXm1jCSG3A1gFII1SaiSEDHV1DCPwCIWn2MLi7OTBBF3g8OTee6vZPQLgOUqpEQAopT94eT6GHxDy1nKwOLvQwdO+tX/961+9atKzbNkytLe3AwBeffVVTJ48GRs2bEBFRYVkr1tPKCsrE2wn6QneOihuBpBFCNkOoAfAY5TSWu+HxfAlBAQUwrZa2wDkYHdSMDzj4Ycf9ur4/fv38/8uKChAZWUlbzsTKiPvDWVlZVixYgWmTJni9blcanaEkEOEkC8F/luFPmGZAGA2gMcBFBER/ZIQ8iAhpI4QUudtf02Gd4gJOg6pFLNgISUlBSkpKYEehmz0ej0KCgrwzDPPoKCgAHq9XrFzHzlyBPPnz8fa
tWsxadIkbNiwge9U9sQTT2DKlClIS0vDY489BgB4+umn8eKLLwIAamtrkZaWhjlz5uDxxx/ny7vv3LkT+fn5WLp0KSZMmIBf/epX/PVSUlLQ2tqKhx9+GN999x1yc3Px8ssvY+fOnfj5z38OALh8+TJWr16N9PR0pKen4+OPPwYA5OXlYebMmZg6dSpee+01/pzR0dHYsmUL0tPTMXv2bFy+fBkff/wxKioq8Pjjj2PatGk4e/asV/fJpWZHKb1D7D1CyCMASmjfnT1GCLECSATgJM0opa8BeA3o88Z6PGKGX+A0vGDV7lSq0AkR1ev1qKqqQm5uLsaMGYPm5mZUVFQAAHQ6nSLXOH78OE6fPo2RI0fitttuw0cffYQpU6agtLQUDQ0NIITwW09bfvKTn+C1117DrbfeiieeeMLuvRMnTuD48ePQaDSYOHEiNm/ebNfX9q9//SsOHDiAw4cPIzExETt37uTf+8UvfoHs7GyUlpbCYrHAYDAAAP75z39i8ODBuH79OjIyMrBmzRoMGTIE3d3dmD17NrZv345f/epX+Nvf/oYnn3wSubm5WLFihWCfXXfxdsaUAVgAAISQmwGoAbR6eU6GwjgGDg9yCDoWI5i9sqFU4qmmpga5ublITU1FWFgYUlNTkZubi5qaGsWukZmZieTkZKhUKkybNg1NTU2IjY2FVqvFT3/6U5SUlCAyMtLumPb2dnR1deHWW28FAPz7v/+73fsLFy5EXFwctFotpkyZgn/961+yx1NVVYVHHnkEABAWFsbXHXz11Vd57e3cuXM4c+YMgL5irCtWrAAAzJw5E01NTR7dBym8FXb/BDCOEPIlgF0A7qeBCNxjiCJUBcVMLSBw7c0KZq9sR0cHX8Az2GltbXVKbxszZgzf+1YJNBoN/++wsDCYzWaEh4fj2LFjWLNmDcrKyrB06VK7Y1z9VIXO6Q1HjhzBoUOH8Mknn+DkyZOYPn06Hz4SERHBe1iVuJYQXgk7SqmJUnoPpfQWSukMSmmVUgNjKINQmEmvtRdxmliMiBoOAoI4dSwiVPaR6MwrqxyJiYlobm62e625uRmJiYk+va7BYEBHRweWLVuGP/3pTzhx4oTd+wkJCYiJicGnn34KANi1a5di1164cCH+53/+BwBgsVjQ2dmJjo4OJCQkIDIyEg0NDfx1pYiJiUFXV5ciYwodwwfDI8S2oh3GThxYtxcnNtbiw3uqsC1rKy/8RkQNx9a5W4LWXhdqZGVloaKiAo2NjbBYLGhsbERFRQWysrJ8et2uri6sWLECaWlpyM7Oxssvv+z0mX/84x948MEHMWfOHFBKFStz/8orr+Dw4cPQ6XSYOXMmTp8+jaVLl8JsNiMtLQ2/+93vMHv2bJfnWb9+PV544QVMnz7dawcFSxfr5ywtXCEYVxcKKWFSBDpd7KuvvsLkyZNlf16v16Ompgatra1ITExEVlaWYs4JbzAYDHy83nPPPYeLFy/ilVdeCfCo5CH0DKTSxVghgH7O5lmP2iX7A2yLGgh0Ol1QCDdH9u3bhz/84Q8wm80YO3asnUe1v8GEXT+nvxboDJRG199Yt24d1q1bF+hh+AUm7AYAoVqgk8FQEuagYDAYAwIm7BgMxoCACTsGgzEgYMKOwQhRtm/fjqlTpyItLQ3Tpk3DZ599hp/+9Kcel0T6/vvv7XJQ7777bqSlpeHll1/G1q1bcejQIaWGDgD405/+hGvXril6TimYg4LBCEE++eQT7N27F1988QU0Gg1aW1thMpnw97//3eNzjhw5Ert37wYAXLp0CR9//LFb+bDu8qc//Qn33HOPU86ur2CaHYPhB5Tu4nbx4kUkJiby+auJiYkYOXIk5s+fDy5g/x//+AduvvlmzJ8/Hz/72c/48ksPPPAAfvGLX+DWW2/FuHHjeAHX1NTEl3havHgxfvjhB0ybNg01NTV44IEH+M/V1tbi1ltvRXp6OjIzM9HV1YWmpiZkZWVhxowZmDFjBl/S
Saz81Kuvvorvv/8et99+O26//Xav7oVsKKV+/2/mzJmUEVzs/XY/XbJrOU3/xyy6ZNdyuvfb/YEeUlBTX18v+7N7v91PM3feRtP+MZP/L3PnbV7d466uLpqenk4nTJhAH3nkEXrkyBFKKaXZ2dm0traWXrhwgY4dO5ZeuXKFmkwmOnfuXProo49SSim9//776dq1a6nFYqGnT5+m48ePp5RS2tjYSKdOner0b+6Yd999lxqNRpqamkqPHTtGKaW0o6OD9vb20u7ubnr9+nVKKaXffPMN5X7jhw8fprGxsfTcuXPUYrHQ2bNn05qaGkoppWPHjqUtLS0e3wOhZwCgjorIHabZMQQrowR78c5QQqgYg21FaE+Ijo7G559/jtdeew1JSUlYt26dXfbDsWPHkJ2djcGDByMiIgJ33nmn3fF5eXlQqVSYMmUKLl+WX8rr66+/xogRI5CRkQEAiI2NRXh4OHp7e/Gzn/0MOp0Od955p53dUKj8VCBgNjuG5I+RBSN7j1gxBm/rBYaFhWH+/PmYP38+dDodXn/9df496kb5JleftYVSKtjs5uWXX8awYcNw8uRJWK1WaLU/1kxUulSUpzDNjuGzHyOjD7G6gN7UC/z666/5wpdAX1XhsWPH8n9nZmaiuroabW1tMJvNKC4u9vhatkyaNAnff/89amv7Ws10dXXBbDajo6MDI0aMgEqlwptvvgmLxeLyXEqWb5IDE3YMn/wYGT+yedaj0DpUh/a2GIPBYMD999/P95eor6/H008/zb8/atQo/Pa3v8W//du/4Y477sCUKVMUKd+kVqtRWFiIzZs3Iz09HYsWLUJPTw82bdqE119/HbNnz8Y333yDqKgol+d68MEHkZOT4zcHBSvxxOBtdo6VUVhNO3HcLfG072yl34sxcOWbzGYzVq9ejY0bN2L16tU+vaY/YSWeGG7TXyujBBOBKMbw9NNP49ChQ+jp6cHixYuRl5fn1+sHG0zYMQCwyij9Ea5dIqMPZrNjMBgDAibsGAwPCYS9m9GHJ/eeCTsGwwO0Wi2uXLnCBF4AoJTiypUrdrF8cmA2OwbDA5KTk3H+/Hm0tLQEeigDEq1Wi+TkZLeOYcKOwfCAiIgIpKamBnoYDDdg21gGgzEgYMKOwWAMCJiwYzAYA4KApIsRQloAuFsCNRFAqw+G4ytCbbxA6I051MYLhN6YQ228YymlSUJvBETYeQIhpE4s5y0YCbXxAqE35lAbLxB6Yw618UrBtrEMBmNAwIQdg8EYEISSsHst0ANwk1AbLxB6Yw618QKhN+ZQG68oIWOzYzAYDG8IJc2OwWAwPCaohR0h5E5CyGlCiJUQMsvm9RRCyHVCyIkb//01kOO0RWzMN977DSHkW0LI14SQJYEaoxiEkKcJIRds7uuyQI9JDELI0hv38VtCyBOBHo8rCCFNhBD9jfsalGW6CSH/JIT8QAj50ua1wYSQg4SQMzf+nxDIMXpDUAs7AF8CyAfwocB7Zyml027897CfxyWF4JgJIVMArAcwFcBSAAWEkDD/D88lL9vc1/2BHowQN+7bDgA5AKYAuPvG/Q12br9xX4M1lGMn+uamLU8A+IBSOgHABzf+DkmCWthRSr+ilH4d6HG4g8SYVwHYRSk1UkobAXwLINO/o+s3ZAL4llL6HaXUBGAX+u4vwwsopR8CuOrw8ioAXI/G1wHk+XNMShLUws4FqYSQ44SQakJIVqAHI4NRAM7Z/H3+xmvBxs8JIadubGmCdcsSKvfSFgrgfULI54SQBwM9GDcYRim9CAA3/j80wOPxmICXeCKEHAIwXOCtLZTScpHDLgIYQym9QgiZCaCMEDKVUtrps4Ha4OGYnTsL9/0A/IrU2AH8D4Dfo29cvwfw/wPY6L/RySYo7qWb3EYp/Z4QMhTAQUJIww1NiuEnAi7sKKV3eHCMEYDxxr8/J4ScBXAzAL8Yfj0ZM/q0j9E2fycD+F6ZEclH7tgJIX8DsNfHw/GUoLiX
7kAp/f7G/38ghJSibyseCsLuMiFkBKX0IiFkBIAfAj0gTwnJbSwhJIkz7hNCxgGYAOC7wI7KJRUA1hNCNISQVPSN+ViAx2THjcnMsRp9zpZgpBbABEJIKiFEjT7HT0WAxyQKISSKEBLD/RvAYgTvvXWkAsD9N/59PwCxnUvwQykN2v/Q94M7jz4t7jKA9268vgbAaQAnAXwBYGWgx+pqzDfe2wLgLICvAeQEeqwCY38TgB7AKfRN8hGBHpPEWJcB+ObG/dwS6PG4GOu4G3P15I15G5TjBfAO+kxEvTfm8H8AGII+L+yZG/8fHOhxevofy6BgMBgDgpDcxjIYDIa7MGHHYDAGBEzYMRiMAQETdgwGY0DAhB2DwRgQMGHHYDAGBEzYMRiMAQETdgwGY0Dw/wDCOqoT17PBEwAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "coeqtl_sig = coeqtl_df[coeqtl_df['corrected_p_bios']<=0.05]\n",
+ "coeqtl_nonsig = coeqtl_df[coeqtl_df['corrected_p_bios']>0.05]\n",
+ "plt.figure(figsize=(5, 5))\n",
+ "plt.scatter(coeqtl_nonsig['MetaPZ_flippedforAF'], \n",
+ " coeqtl_nonsig['flipped_zscore_bios_flippedforAF'], \n",
+ " label='Insignificant',\n",
+ " edgecolor='gray',\n",
+ " facecolor='white', alpha=1)\n",
+ "plt.scatter(coeqtl_sig['MetaPZ_flippedforAF'],\n",
+ " coeqtl_sig['flipped_zscore_bios_flippedforAF'], \n",
+ " label='Significant',\n",
+ " edgecolor=color_dict[celltype],\n",
+ " facecolor=color_dict[celltype], alpha=1)\n",
+ "plt.plot([-15, 12], [0, 0], linestyle='--', color='lightgray')\n",
+ "plt.plot([0, 0], [-6.5, 4], linestyle='--', color='lightgray')\n",
+ "plt.legend()\n",
+ "\n",
+ "concordance_rate = coeqtl_sig[coeqtl_sig['is_concordant']].shape[0] / coeqtl_sig.shape[0]\n",
+ "print(concordance_rate)\n",
+ "\n",
+ "celltype_rb = bios_replication_filtered_df.loc[celltype]['r']\n",
+ "plt.text(3, -5, f'Concordance = {concordance_rate:.2f}\\nrb = {celltype_rb:.2f}')\n",
+ "\n",
+ "# plt.savefig('bios_replication.cd4t.filtered_results.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "def plot_ci_manual(t, s_err, n, x, x2, y2, ax=None):\n",
+ " \"\"\"Return an axes of confidence bands using a simple approach.\n",
+ " \n",
+ " Notes\n",
+ " -----\n",
+ " .. math:: \\left| \\: \\hat{\\mu}_{y|x0} - \\mu_{y|x0} \\: \\right| \\; \\leq \\; T_{n-2}^{.975} \\; \\hat{\\sigma} \\; \\sqrt{\\frac{1}{n}+\\frac{(x_0-\\bar{x})^2}{\\sum_{i=1}^n{(x_i-\\bar{x})^2}}}\n",
+ " .. math:: \\hat{\\sigma} = \\sqrt{\\sum_{i=1}^n{\\frac{(y_i-\\hat{y})^2}{n-2}}}\n",
+ " \n",
+ " References\n",
+ " ----------\n",
+ " .. [1] M. Duarte. \"Curve fitting,\" Jupyter Notebook.\n",
+ " http://nbviewer.ipython.org/github/demotu/BMC/blob/master/notebooks/CurveFitting.ipynb\n",
+ " \n",
+ " \"\"\"\n",
+ " if ax is None:\n",
+ " ax = plt.gca()\n",
+ " \n",
+ " ci = t * s_err * np.sqrt(1/n + (x2 - np.mean(x))**2 / np.sum((x - np.mean(x))**2))\n",
+ " ax.fill_between(x2, y2 + ci, y2 - ci, alpha=0.1, color='gray')\n",
+ " return ax\n",
+ "\n",
+ "from scipy import stats\n",
+ "def equation(a, b):\n",
+ " \"\"\"Return a 1D polynomial.\"\"\"\n",
+ " return np.polyval(a, b) \n",
+ "\n",
+ "x=coeqtl_df['MetaPZ_flippedforAF']\n",
+ "y=coeqtl_df['flipped_zscore_bios_flippedforAF']\n",
+ "\n",
+ "p, cov = np.polyfit(x, y, 1, cov=True) # parameters and covariance from of the fit of 1-D polynom.\n",
+ "y_model = equation(p, x) \n",
+ "# Statistics\n",
+ "n = y.size # number of observations\n",
+ "m = p.size # number of parameters\n",
+ "dof = n - m # degrees of freedom\n",
+ "t = stats.t.ppf(0.975, n - m) # used for CI and PI bands\n",
+ "# Estimates of Error in Data/Model\n",
+ "resid = y - y_model \n",
+ "chi2 = np.sum((resid / y_model)**2) # chi-squared; estimates error in data\n",
+ "chi2_red = chi2 / dof # reduced chi-squared; measures goodness of fit\n",
+ "s_err = np.sqrt(np.sum(resid**2) / dof) # standard deviation of the error\n",
+ "\n",
+ "# Plotting --------------------------------------------------------------------\n",
+ "fig, ax = plt.subplots(figsize=(5, 5))\n",
+ "# Data\n",
+ "ax.scatter(\n",
+ " x, y\n",
+ ")\n",
+ "\n",
+ "\n",
+ "# Fit\n",
+ "ax.plot(x, y_model, \"-\", color=\"0.1\", linewidth=1.5, alpha=0.5, label=\"Fit\") \n",
+ "\n",
+ "x2 = np.linspace(np.min(x), np.max(x), 100)\n",
+ "y2 = equation(p, x2)\n",
+ "\n",
+ "# Confidence Interval (select one)\n",
+ "plot_ci_manual(t, s_err, n, x, x2, y2, ax=ax)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":19: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " coeqtl_sig['celltype'] = celltype\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# withbiostechnicalandcelltypePICs\n",
+ "sig_df = pd.DataFrame()\n",
+ "fig, axes = plt.subplots(2, 3, figsize=(15, 10), sharex=True, sharey=True)\n",
+ "celltypes = ['CD4T', 'CD8T', 'monocyte', 'NK', 'B', 'DC']\n",
+ "for i in range(2):\n",
+ " for j in range(3):\n",
+ " celltype = celltypes[i*3+j]\n",
+ " coeqtl_df = pd.read_csv(\n",
+ " coeqtl_withbios_prefix/filter_type/f'UT_{celltype}/coeqtls_fullresults_fixed.sig.withbiosonlyRNAAlignMetrics_rmLLD.tsv.gz',\n",
+ " compression='gzip', index_col=0, sep='\\t')\n",
+ " coeqtl_df['zscore_bios'] = [get_z_score(item[0], item[1]) for item in \n",
+ " coeqtl_df[['t_bios', \n",
+ " 'num_individuals_bios']].values]\n",
+ " coeqtl_df['flipped_zscore_bios'] = [flip_direction(item[0], item[1], item[2]) for item in \n",
+ " coeqtl_df[['SNPEffectAllele', \n",
+ " 'assessed_allele_bios',\n",
+ " 'zscore_bios']].values]\n",
+ " # flip the direction according to AF\n",
+ " coeqtl_df['eqtl_effect_allele'] = [eqtl_allele_af_dict.get(eqtl)['AlleleAssessed'] for eqtl in \n",
+ " coeqtl_df['snp_eqtlgene']]\n",
+ " coeqtl_df['eqtl_alt_af'] = [eqtl_allele_af_dict.get(eqtl)['AF'] for eqtl in coeqtl_df['snp_eqtlgene']]\n",
+ " coeqtl_df['eqtl_alt_allele'] = [eqtl_allele_af_dict.get(eqtl)['alt_allele'] for eqtl in \n",
+ " coeqtl_df['snp_eqtlgene']]\n",
+ " coeqtl_df['eqtl_ref_allele'] = [eqtl_allele_af_dict.get(eqtl)['ref_allele'] for eqtl in \n",
+ " coeqtl_df['snp_eqtlgene']]\n",
+ " coeqtl_df[f'MetaPZ_flippedforAF'] = [flip_zscore(zscore, coeqtlallele, altaf, altallele)\n",
+ " for zscore, coeqtlallele, altaf, altallele in\n",
+ " coeqtl_df[[f'MetaPZ',\n",
+ " f'SNPEffectAllele',\n",
+ " 'eqtl_alt_af',\n",
+ " 'eqtl_alt_allele']].values]\n",
+ " coeqtl_df[f'flipped_zscore_bios_flippedforAF'] = [flip_zscore(zscore, coeqtlallele, altaf, altallele)\n",
+ " for zscore, coeqtlallele, altaf, altallele in\n",
+ " coeqtl_df[[f'flipped_zscore_bios',\n",
+ " f'SNPEffectAllele',\n",
+ " 'eqtl_alt_af',\n",
+ " 'eqtl_alt_allele']].values]\n",
+ " ## end flip\n",
+ " coeqtl_sig = coeqtl_df[coeqtl_df['corrected_p_bios']<=0.05]\n",
+ " coeqtl_sig['celltype'] = celltype\n",
+ " sig_df = pd.concat([coeqtl_sig, sig_df], axis=0)\n",
+ " significant_ratio = coeqtl_sig.shape[0] / coeqtl_df.shape[0]\n",
+ " coeqtl_sig_samedirection = coeqtl_sig[((coeqtl_sig['MetaPZ']>0) & (coeqtl_sig['flipped_zscore_bios']>0)) | \n",
+ " ((coeqtl_sig['MetaPZ']<0) & (coeqtl_sig['flipped_zscore_bios']<0))]\n",
+ " consistent_ratio = coeqtl_sig_samedirection.shape[0] / coeqtl_sig.shape[0]\n",
+ " # draw\n",
+ " ax = axes[i][j]\n",
+ " ax.scatter(coeqtl_df['MetaPZ'][coeqtl_df['corrected_p_bios']>0.05], \n",
+ " coeqtl_df['flipped_zscore_bios'][coeqtl_df['corrected_p_bios']>0.05], alpha=0.5,\n",
+ " label='Non-sig')\n",
+ " ax.scatter(coeqtl_df['MetaPZ'][coeqtl_df['corrected_p_bios']<=0.05],\n",
+ " coeqtl_df['flipped_zscore_bios'][coeqtl_df['corrected_p_bios']<=0.05], alpha=0.5,\n",
+ " label='Sig')\n",
+ " ax.set_xlabel('single cell')\n",
+ " ax.set_ylabel('BIOS')\n",
+ " ax.set_title(celltype)\n",
+ " ax.text(-2, -8, \n",
+ " f'Significant ratio: {significant_ratio:.2f}\\nConcordance ratio: {consistent_ratio:.2f}')\n",
+ "ax.legend(loc='upper left')\n",
+ " \n",
+ "# plt.savefig('bios_replication.filtered_results.scatterplots.pdf')\n",
+ "# plt.savefig('bios_replication.filtered_results.scatterplots.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":19: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " coeqtl_sig['celltype'] = celltype\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# unfiltered results \n",
+ "# withbiosonlyRNAAlignMetrics_rmLLD\n",
+ "sig_df = pd.DataFrame()\n",
+ "fig, axes = plt.subplots(2, 3, figsize=(15, 10), sharex=True, sharey=True)\n",
+ "celltypes = ['CD4T', 'CD8T', 'monocyte', 'NK', 'B', 'DC']\n",
+ "for i in range(2):\n",
+ " for j in range(3):\n",
+ " celltype = celltypes[i*3+j]\n",
+ " coeqtl_df = pd.read_csv(\n",
+ " coeqtl_withbios_prefix/'unfiltered_results'/f'UT_{celltype}/coeqtls_fullresults_fixed.sig.withbiosonlyRNAAlignMetrics_rmLLD.tsv.gz',\n",
+ " compression='gzip', index_col=0, sep='\\t')\n",
+ " coeqtl_df['zscore_bios'] = [get_z_score(item[0], item[1]) for item in \n",
+ " coeqtl_df[['t_bios', \n",
+ " 'num_individuals_bios']].values]\n",
+ " coeqtl_df['flipped_zscore_bios'] = [flip_direction(item[0], item[1], item[2]) for item in \n",
+ " coeqtl_df[['SNPEffectAllele', \n",
+ " 'assessed_allele_bios',\n",
+ " 'zscore_bios']].values]\n",
+ " coeqtl_sig = coeqtl_df[coeqtl_df['corrected_p_bios']<=0.05]\n",
+ " coeqtl_sig['celltype'] = celltype\n",
+ " sig_df = pd.concat([coeqtl_sig, sig_df], axis=0)\n",
+ " # draw\n",
+ " ax = axes[i][j]\n",
+ " ax.scatter(coeqtl_df['MetaPZ'][coeqtl_df['corrected_p_bios']>0.05], \n",
+ " coeqtl_df['flipped_zscore_bios'][coeqtl_df['corrected_p_bios']>0.05], alpha=0.5,\n",
+ " label='Non-sig')\n",
+ " ax.scatter(coeqtl_df['MetaPZ'][coeqtl_df['corrected_p_bios']<=0.05],\n",
+ " coeqtl_df['flipped_zscore_bios'][coeqtl_df['corrected_p_bios']<=0.05], alpha=0.5,\n",
+ " label='Sig')\n",
+ " ax.set_xlabel('single cell')\n",
+ " ax.set_ylabel('BIOS')\n",
+ " ax.set_title(celltype)\n",
+ "ax.legend(loc='upper left')\n",
+ "# plt.savefig('bios_replication.unfiltered_results.scatterplots.pdf')\n",
+ "# plt.savefig('bios_replication.unfiltered_results.scatterplots.png', dpi=300)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/04_coeqtl_mapping/filtering_strategy.py b/04_coeqtl_mapping/filtering_strategy.py
new file mode 100644
index 0000000..f901ab4
--- /dev/null
+++ b/04_coeqtl_mapping/filtering_strategy.py
@@ -0,0 +1,56 @@
+import pandas as pd
+import numpy as np
+from pathlib import Path
+import argparse
+from scipy.stats import norm
+from time import time
+
+
+# |z| threshold for calling a z-score significant: the 97.5th percentile of the
+# standard normal distribution, i.e. a two-sided 5% level.
+sig_thres_zscore = norm.ppf(1-0.025)
+# Directory from which merge_datasets() reads '<condition>/<dataset>/<condition>_<celltype>.zscores'.
+individual_network_prefix = Path("./input/individual_networks")
+# Directory into which main() writes '<condition>_<celltype>.significance_frequency.tsv'.
+saveprefix = Path("./input/gene_pair_selection/annotations/")
+def read_numpy(prefix):
+ data = np.load(f'{prefix}.npy')
+ rows = [item.strip() for item in open(f'{prefix}.rows.txt', 'r').readlines()]
+ cols = [item.strip() for item in open(f'{prefix}.cols.txt', 'r').readlines()]
+ return pd.DataFrame(data=data, columns=cols, index=rows)
+
+
+def merge_datasets(celltype, condition):
+ res_df = pd.DataFrame()
+ for datasetname in ['stemiv2', 'onemillionv2', 'onemillionv3', 'ng']:
+ data_path = individual_network_prefix / condition / datasetname / f'{condition}_{celltype}.zscores'
+ startime = time()
+ df = read_numpy(data_path)
+ res_df = pd.concat([res_df, df], axis=1)
+ print(f'Merged {datasetname}, it took', time() - startime)
+ return res_df
+
+
+def calculate_significance_freq(zscore_df, thres=sig_thres_zscore):
+ freqs = (abs(zscore_df.values) > thres).sum(axis=1)
+ assert len(freqs) == zscore_df.shape[0]
+ return freqs
+
+
+def parse():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--celltype', dest='celltype')
+ parser.add_argument('--condition', dest='condition')
+ return parser
+
+
+def main():
+ args = parse().parse_args()
+ celltype, condition = args.celltype, args.condition
+ celltype_condition_df = merge_datasets(celltype, condition)
+ celltype_condition_df['sig_count'] = calculate_significance_freq(celltype_condition_df)
+ celltype_condition_df['sig_freq'] = [item/celltype_condition_df.shape[1] for item in celltype_condition_df['sig_count']]
+ print(celltype, celltype_condition_df[celltype_condition_df['sig_freq']>=0.1].shape)
+ celltype_condition_df[['sig_count', 'sig_freq']].to_csv(saveprefix/f'{condition}_{celltype}.significance_frequency.tsv',
+ sep='\t')
+ return celltype_condition_df
+
+
+if __name__ == '__main__':
+ _ = main()
\ No newline at end of file
diff --git a/04_coeqtl_mapping/individual_networks.py b/04_coeqtl_mapping/individual_networks.py
new file mode 100644
index 0000000..82dd60a
--- /dev/null
+++ b/04_coeqtl_mapping/individual_networks.py
@@ -0,0 +1,270 @@
+import os
+import re
+from itertools import combinations
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import scanpy as sc
+from scipy.stats import spearmanr
+from scipy.stats import t, norm
+from tqdm import tqdm
+import argparse
+from scipy.stats import rankdata
+from collections import namedtuple
+
+
+def get_time(x):
+ if x == 'UT':
+ return x
+ else:
+ pattern = re.compile(r'\d+h')
+ return re.findall(pattern, x)[0]
+
+class DATASET:
+ def __init__(self, datasetname):
+ self.name = datasetname
+ self.path_prefix = Path("./seurat_objects")
+ self.information = self.get_information()
+ def get_information(self):
+ if self.name == 'onemillionv2':
+ self.path = '1M_v2_mediumQC_ctd_rnanormed_demuxids_20201029.sct.h5ad'
+ self.individual_id_col = 'assignment'
+ self.timepoint_id_col = 'time'
+ self.celltype_id = 'cell_type_lowerres'
+ self.chosen_condition = {'UT': 'UT',
+ 'stimulated': '3h'}
+ elif self.name == 'onemillionv3':
+ self.path = '1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.SCT.h5ad'
+ self.individual_id_col = 'assignment'
+ self.timepoint_id_col = 'time'
+ self.celltype_id = 'cell_type_lowerres'
+ self.chosen_condition = {'UT': 'UT',
+ 'stimulated': '3h'}
+ elif self.name == 'stemiv2':
+ self.path = 'cardio.integrated.20210301.stemiv2.h5ad'
+ self.individual_id_col = 'assignment.final'
+ self.timepoint_id_col = 'timepoint.final'
+ self.celltype_id = 'cell_type_lowerres'
+ self.chosen_condition = {'UT': 't8w',
+ 'stimulated': 'Baseline'}
+ elif self.name == 'ng':
+ self.path = 'pilot3_seurat3_200420_sct_azimuth.h5ad'
+ self.individual_id_col = 'snumber'
+ self.celltype_id = 'cell_type_mapped_to_onemillion'
+ else:
+ raise IOError("Dataset name not understood.")
+
+ def load_dataset(self):
+ self.get_information()
+ print(f'Loading dataset {self.name} from {self.path_prefix} {self.path}')
+ self.data_sc = sc.read_h5ad(self.path_prefix / self.path)
+ if self.name.startswith('onemillion'):
+ self.data_sc.obs['time'] = [get_time(item) for item in self.data_sc.obs['timepoint']]
+ elif self.name == 'ng':
+ celltype_maping = {'CD4 T': 'CD4T', 'CD8 T': 'CD8T', 'Mono': 'monocyte', 'DC': 'DC', 'NK': 'NK',
+ 'other T': 'otherT', 'other': 'other', 'B': 'B'}
+ self.data_sc.obs['cell_type_mapped_to_onemillion'] = [celltype_maping.get(name) for name in
+ self.data_sc.obs['predicted.celltype.l1']]
+
+
+def corr_to_z(coef, num):
+ t_statistic = coef * np.sqrt((num - 2) / (1 - coef ** 2))
+ prob = t.cdf(t_statistic, num - 2)
+ z_score = norm.ppf(prob)
+ positive_coef_probs = 1 - prob
+ positive_coef_probs[coef < 0] = 0
+ negative_coef_probs = prob
+ negative_coef_probs[coef > 0] = 0
+ probs = negative_coef_probs + positive_coef_probs
+ return z_score, probs
+
+
+# def z_to_corr(z, num):
+# prob = norm.cdf(z)
+# t_statistic = t.ppf(prob, num - 2)
+# corr = t_statistic / np.sqrt(num - 2 + t_statistic ** 2)
+# return corr
+
+
+def save_numpy(data_df, prefix):
+ np.save(f'{prefix}.npy', data_df.values)
+ with open(f'{prefix}.cols.txt', 'w') as f:
+ f.write('\n'.join(data_df.columns))
+ with open(f'{prefix}.rows.txt', 'w') as f:
+ f.write('\n'.join(data_df.index))
+ return None
+
+def _contains_nan(a, nan_policy='propagate'):
+ '''
+ From scipy: https://github.com/scipy/scipy/blob/v1.7.1/scipy/stats/stats.py#L4343-L4525
+ '''
+ policies = ['propagate', 'raise', 'omit']
+ if nan_policy not in policies:
+ raise ValueError("nan_policy must be one of {%s}" %
+ ', '.join("'%s'" % s for s in policies))
+ try:
+ with np.errstate(invalid='ignore'):
+ contains_nan = np.isnan(np.sum(a))
+ except TypeError:
+ try:
+ contains_nan = np.nan in set(a.ravel())
+ except TypeError:
+ contains_nan = False
+ nan_policy = 'omit'
+
+ if contains_nan and nan_policy == 'raise':
+ raise ValueError("The input contains nan values")
+
+ return contains_nan, nan_policy
+
+
+def _chk_asarray(a, axis):
+ '''
+ From scipy: https://github.com/scipy/scipy/blob/v1.7.1/scipy/stats/stats.py#L4343-L4525
+ '''
+ if axis is None:
+ a = np.ravel(a)
+ outaxis = 0
+ else:
+ a = np.asarray(a)
+ outaxis = axis
+
+ if a.ndim == 0:
+ a = np.atleast_1d(a)
+
+ return a, outaxis
+
+
+def spearmanr_withnan(a, axis=0, nan_policy='propagate'):
+ '''
+ Modified from scipy: https://github.com/scipy/scipy/blob/v1.7.1/scipy/stats/stats.py#L4343-L4525
+ '''
+ SpearmanrResult = namedtuple('SpearmanrResult', ('correlation', 'pvalue'))
+ if axis is not None and axis > 1:
+ raise ValueError("spearmanr only handles 1-D or 2-D arrays, supplied axis argument {}, "
+ "please use only values 0, 1 or None for axis".format(axis))
+ a, axisout = _chk_asarray(a, axis)
+ if a.ndim > 2:
+ raise ValueError("spearmanr only handles 1-D or 2-D arrays")
+ n_vars = a.shape[1 - axisout]
+ n_obs = a.shape[axisout]
+ if n_obs <= 1:
+ # Handle empty arrays or single observations.
+ return SpearmanrResult(np.nan, np.nan)
+ a_contains_nan, nan_policy = _contains_nan(a, nan_policy)
+ variable_has_nan = np.zeros(n_vars, dtype=bool)
+ if a_contains_nan:
+ if nan_policy == 'propagate':
+ if a.ndim == 1 or n_vars <= 2:
+ return SpearmanrResult(np.nan, np.nan)
+ else:
+ variable_has_nan = np.isnan(a).sum(axis=axisout)
+ a_ranked = np.apply_along_axis(rankdata, axisout, a)
+ rs = np.corrcoef(a_ranked, rowvar=axisout)
+ dof = n_obs - 2 # degrees of freedom
+ # rs can have elements equal to 1, so avoid zero division warnings
+ with np.errstate(divide='ignore'):
+ t_ = rs * np.sqrt((dof/((rs+1.0)*(1.0-rs))).clip(0))
+ prob = 2 * t.sf(np.abs(t_), dof)
+ # For backwards compatibility, return scalars when comparing 2 columns
+ if rs.shape == (2, 2):
+ return SpearmanrResult(rs[1, 0], prob[1, 0])
+ else:
+ rs[variable_has_nan, :] = np.nan
+ rs[:, variable_has_nan] = np.nan
+ return SpearmanrResult(rs, prob)
+
+
+def get_individual_networks_selected_genepairs(data_sc, individual_colname, selected_genepairs):
+ data_df = pd.DataFrame(data=data_sc.X.toarray(),
+ index=data_sc.obs.index,
+ columns=data_sc.var.index)
+ selected_genes = list(set([ele for item in selected_genepairs for ele in item.split(';')]) & set(data_sc.var.index))
+ selected_genes_sorted_genepairs = [';'.join(sorted(item)) for item in combinations(selected_genes, 2)]
+ common_genepairs = list(set(selected_genes_sorted_genepairs) & set(selected_genepairs))
+ coef_df = pd.DataFrame(index=common_genepairs)
+ coef_p_df = pd.DataFrame(index=common_genepairs)
+ zscore_df = pd.DataFrame(index=common_genepairs)
+ zscore_p_df = pd.DataFrame(index=common_genepairs)
+ data_selected_df = data_df[selected_genes]
+ print(f"Begin calculating networks for {len(data_sc.obs[individual_colname].unique())} individuals")
+ for ind_id in tqdm(data_sc.obs[individual_colname].unique()):
+ cell_num = data_sc.obs[data_sc.obs[individual_colname] == ind_id].shape[0]
+ if cell_num > 10:
+ individual_df = data_selected_df.loc[data_sc.obs[individual_colname] == ind_id]
+ individual_coefs, individual_coef_ps = spearmanr_withnan(individual_df.values, axis=0)
+ try:
+ individual_coefs_flatten = pd.DataFrame(data=individual_coefs[np.triu_indices_from(individual_coefs, 1)],
+ index=selected_genes_sorted_genepairs).loc[common_genepairs]
+ individual_coef_ps_flatten = pd.DataFrame(data=individual_coef_ps[np.triu_indices_from(individual_coefs, 1)],
+ index=selected_genes_sorted_genepairs).loc[common_genepairs]
+ individual_zscores_flatten, individual_zscore_ps_flatten = corr_to_z(individual_coefs_flatten, cell_num)
+ coef_df[ind_id] = individual_coefs_flatten
+ coef_p_df[ind_id] = individual_coef_ps_flatten
+ zscore_df[ind_id] = individual_zscores_flatten
+ zscore_p_df[ind_id] = individual_zscore_ps_flatten
+ except:
+ continue
+ else:
+ print("Deleted this individual because of low cell number", cell_num)
+ return coef_df, coef_p_df, zscore_df, zscore_p_df
+
+
+def get_individual_networks_given_celltype_condition_datasetname(celltype, datasetname, condition='UT'):
+ # load the data and data information
+ dataset = DATASET(datasetname)
+ dataset.load_dataset()
+ print(f"{datasetname} loaded.")
+ # calculate the individual network for specific condition and celltype
+ print(datasetname, celltype, condition)
+ work_prefix = Path('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing')
+ selected_genepairs_path = work_prefix / 'coeqtl_mapping/input/snp_genepair_selection' / f'{condition}_{celltype}.baseline.tsv'
+ selected_genepairs = pd.read_csv(selected_genepairs_path, sep='\t')['genepair_sorted'].values
+ if datasetname == 'ng':
+ data_selected = dataset.data_sc[(dataset.data_sc.obs[dataset.celltype_id] == celltype)]
+ else:
+ data_selected = dataset.data_sc[(dataset.data_sc.obs[dataset.celltype_id] == celltype) &
+ (dataset.data_sc.obs[dataset.timepoint_id_col] == dataset.chosen_condition[condition])]
+ individual_coefs_df, individual_coefs_p_df, individual_zscores_df, individual_zscores_p_df = get_individual_networks_selected_genepairs(
+ data_selected,
+ dataset.individual_id_col,
+ selected_genepairs
+ )
+ print(individual_coefs_df.head())
+ save_prefix = Path('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input')
+ if not os.path.exists(save_prefix / 'individual_networks' / condition / datasetname):
+ os.mkdir(save_prefix / 'individual_networks' / condition / datasetname)
+ ## not saving the coefficients
+ # save_numpy(individual_coefs_df,
+ # save_prefix / 'individual_networks' / condition / datasetname / f'{condition}_{celltype}.coefs')
+ # save_numpy(individual_coefs_p_df,
+ # save_prefix / 'individual_networks' / condition / datasetname / f'{condition}_{celltype}.coef_ps')
+ save_numpy(individual_zscores_df,
+ save_prefix / 'individual_networks' / condition / datasetname / f'{condition}_{celltype}.zscores')
+ save_numpy(individual_zscores_p_df,
+ save_prefix / 'individual_networks' / condition / datasetname / f'{condition}_{celltype}.zscore_ps')
+ print("Saved ")
+ return individual_coefs_df, individual_coefs_p_df, individual_zscores_df, individual_zscores_p_df
+
+
+def argumentsparser():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--datasetname', type=str, dest='datasetname')
+ parser.add_argument('--celltype', type=str, dest='celltype')
+ parser.add_argument('--condition', type=str, dest='condition')
+ parser.add_argument('--nonzeroratio', type=float, dest='nonzeroratio')
+ return parser
+
+def run_get_individual_networks_given_celltype_condition_datasetname():
+ args = argumentsparser().parse_args()
+ print(f"Starting to calculate individual network for {args.datasetname}, {args.celltype}, {args.condition}, "
+ f"for genes {args.nonzeroratio}.")
+ _ = get_individual_networks_given_celltype_condition_datasetname(celltype=args.celltype,
+ condition=args.condition,
+ datasetname=args.datasetname)
+ return None
+
+
+if __name__ == '__main__':
+ run_get_individual_networks_given_celltype_condition_datasetname()
diff --git a/04_coeqtl_mapping/individual_networks_cmono_ncmono.py b/04_coeqtl_mapping/individual_networks_cmono_ncmono.py
new file mode 100644
index 0000000..e01e855
--- /dev/null
+++ b/04_coeqtl_mapping/individual_networks_cmono_ncmono.py
@@ -0,0 +1,309 @@
+import argparse
+import os
+import re
+from collections import namedtuple
+from itertools import combinations
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import scanpy as sc
+from scipy.stats import rankdata
+from scipy.stats import t, norm
+from tqdm import tqdm
+
+
+def get_time(x):
+ if x == 'UT':
+ return x
+ else:
+ pattern = re.compile(r'\d+h')
+ return re.findall(pattern, x)[0]
+
+
+class DATASET:
+ def __init__(self, datasetname):
+ self.name = datasetname
+ self.path_prefix = Path(
+ "./seurat_objects")
+ self.information = self.get_information()
+ def get_information(self):
+ if self.name == 'onemillionv2':
+ self.path = '1M_v2_mediumQC_ctd_rnanormed_demuxids_20201029.sct.h5ad'
+ self.individual_id_col = 'assignment'
+ self.timepoint_id_col = 'time'
+ self.celltype_id = 'cell_type_lowerres'
+ self.chosen_condition = {'UT': 'UT',
+ 'stimulated': '3h'}
+ elif self.name == 'onemillionv3':
+ self.path = '1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.SCT.h5ad'
+ self.individual_id_col = 'assignment'
+ self.timepoint_id_col = 'time'
+ self.celltype_id = 'cell_type_lowerres'
+ self.chosen_condition = {'UT': 'UT',
+ 'stimulated': '3h'}
+ elif self.name == 'stemiv2':
+ self.path = 'cardio.integrated.20210301.stemiv2.h5ad'
+ self.individual_id_col = 'assignment.final'
+ self.timepoint_id_col = 'timepoint.final'
+ self.celltype_id = 'cell_type_lowerres'
+ self.chosen_condition = {'UT': 't8w',
+ 'stimulated': 'Baseline'}
+ elif self.name == 'ng':
+ self.path = 'pilot3_seurat3_200420_sct_azimuth.h5ad'
+ self.individual_id_col = 'snumber'
+ self.celltype_id = 'cell_type_mapped_to_onemillion'
+ else:
+ raise IOError("Dataset name not understood.")
+ def load_dataset(self):
+ self.get_information()
+ print(f'Loading dataset {self.name} from {self.path_prefix} {self.path}')
+ self.data_sc = sc.read_h5ad(self.path_prefix / self.path)
+ if self.name.startswith('onemillion'):
+ self.data_sc.obs['time'] = [get_time(item) for item in self.data_sc.obs['timepoint']]
+ elif self.name == 'ng':
+ celltype_maping = {'CD4 T': 'CD4T', 'CD8 T': 'CD8T', 'Mono': 'monocyte', 'DC': 'DC', 'NK': 'NK',
+ 'other T': 'otherT', 'other': 'other', 'B': 'B'}
+ self.data_sc.obs['cell_type_mapped_to_onemillion'] = [celltype_maping.get(name) for name in
+ self.data_sc.obs['predicted.celltype.l1']]
+ def get_cMono_ncMono(self):
+ def tell_cmono_foronemillion(x):
+ if x == 'mono 1' or x == 'mono 3' or x == 'mono 4':
+ return 'cMono'
+ elif x == 'mono 2':
+ return 'ncMono'
+ if self.name.startswith('onemillion'):
+ self.data_sc.obs['sub_monocytes'] = [tell_cmono_foronemillion(x) for x in
+ self.data_sc.obs['cell_type']]
+ self.cmono = self.data_sc[self.data_sc.obs['sub_monocytes'] == 'cMono']
+ self.ncmono = self.data_sc[self.data_sc.obs['sub_monocytes'] == 'ncMono']
+ elif self.name.startswith('stemi'):
+ self.cmono = self.data_sc[self.data_sc.obs['cell_type'] == 'cMono']
+ self.ncmono = self.data_sc[self.data_sc.obs['cell_type'] == 'ncMono']
+ elif self.name == 'ng':
+ self.cmono = self.data_sc[self.data_sc.obs['predicted.celltype.l2'] == 'CD14 Mono']
+ self.ncmono = self.data_sc[self.data_sc.obs['predicted.celltype.l2'] == 'CD16 Mono']
+ else:
+ raise IOError("Dataset name not understood.")
+
+
+
+def save_numpy(data_df, prefix):
+ np.save(f'{prefix}.npy', data_df.values)
+ with open(f'{prefix}.cols.txt', 'w') as f:
+ f.write('\n'.join(data_df.columns))
+ with open(f'{prefix}.rows.txt', 'w') as f:
+ f.write('\n'.join(data_df.index))
+ return None
+
+
+def corr_to_z(coef, num):
+ t_statistic = coef * np.sqrt((num - 2) / (1 - coef ** 2))
+ prob = t.cdf(t_statistic, num - 2)
+ z_score = norm.ppf(prob)
+ positive_coef_probs = 1 - prob
+ positive_coef_probs[coef < 0] = 0
+ negative_coef_probs = prob
+ negative_coef_probs[coef > 0] = 0
+ probs = negative_coef_probs + positive_coef_probs
+ return z_score, probs
+
+
+def _contains_nan(a, nan_policy='propagate'):
+ policies = ['propagate', 'raise', 'omit']
+ if nan_policy not in policies:
+ raise ValueError("nan_policy must be one of {%s}" %
+ ', '.join("'%s'" % s for s in policies))
+ try:
+ with np.errstate(invalid='ignore'):
+ contains_nan = np.isnan(np.sum(a))
+ except TypeError:
+ try:
+ contains_nan = np.nan in set(a.ravel())
+ except TypeError:
+ contains_nan = False
+ nan_policy = 'omit'
+ if contains_nan and nan_policy == 'raise':
+ raise ValueError("The input contains nan values")
+ return contains_nan, nan_policy
+
+
+def _chk_asarray(a, axis):
+ if axis is None:
+ a = np.ravel(a)
+ outaxis = 0
+ else:
+ a = np.asarray(a)
+ outaxis = axis
+ if a.ndim == 0:
+ a = np.atleast_1d(a)
+ return a, outaxis
+
+
+def spearmanr_withnan(a, axis=0, nan_policy='propagate'):
+ SpearmanrResult = namedtuple('SpearmanrResult', ('correlation', 'pvalue'))
+ if axis is not None and axis > 1:
+ raise ValueError("spearmanr only handles 1-D or 2-D arrays, supplied axis argument {}, "
+ "please use only values 0, 1 or None for axis".format(axis))
+ a, axisout = _chk_asarray(a, axis)
+ if a.ndim > 2:
+ raise ValueError("spearmanr only handles 1-D or 2-D arrays")
+ n_vars = a.shape[1 - axisout]
+ n_obs = a.shape[axisout]
+ if n_obs <= 1:
+ # Handle empty arrays or single observations.
+ return SpearmanrResult(np.nan, np.nan)
+ a_contains_nan, nan_policy = _contains_nan(a, nan_policy)
+ variable_has_nan = np.zeros(n_vars, dtype=bool)
+ if a_contains_nan:
+ if nan_policy == 'propagate':
+ if a.ndim == 1 or n_vars <= 2:
+ return SpearmanrResult(np.nan, np.nan)
+ else:
+ variable_has_nan = np.isnan(a).sum(axis=axisout)
+ a_ranked = np.apply_along_axis(rankdata, axisout, a)
+ rs = np.corrcoef(a_ranked, rowvar=axisout)
+ dof = n_obs - 2 # degrees of freedom
+ # rs can have elements equal to 1, so avoid zero division warnings
+ with np.errstate(divide='ignore'):
+ t_ = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))
+ prob = 2 * t.sf(np.abs(t_), dof)
+ # For backwards compatibility, return scalars when comparing 2 columns
+ if rs.shape == (2, 2):
+ return SpearmanrResult(rs[1, 0], prob[1, 0])
+ else:
+ rs[variable_has_nan, :] = np.nan
+ rs[:, variable_has_nan] = np.nan
+ return SpearmanrResult(rs, prob)
+
+
+def get_individual_networks_selected_genepairs(data_sc, individual_colname, selected_genepairs):
+    """Compute one co-expression network per individual for selected gene pairs.
+
+    Parameters
+    ----------
+    data_sc : object exposing ``.X.toarray()``, ``.obs`` and ``.var``
+        Cells-by-genes expression container (presumably an AnnData - confirm).
+    individual_colname : str
+        Column of ``data_sc.obs`` holding the individual identifier.
+    selected_genepairs : iterable of str
+        Gene pairs written as ``'geneA;geneB'``.
+
+    Returns
+    -------
+    tuple of four DataFrames ``(coef_df, coef_p_df, zscore_df, zscore_p_df)``
+        Indexed by the selected gene pairs whose genes are both present in
+        ``data_sc.var.index``, one column per individual with more than 10
+        cells whose network could be computed.
+    """
+    data_df = pd.DataFrame(data=data_sc.X.toarray(),
+                           index=data_sc.obs.index,
+                           columns=data_sc.var.index)
+    selected_genes = list(set([ele for item in selected_genepairs for ele in item.split(';')]) & set(data_sc.var.index))
+    # Same order as the upper triangle (k=1) of the gene-by-gene matrix below.
+    selected_genes_sorted_genepairs = [';'.join(sorted(item)) for item in combinations(selected_genes, 2)]
+    common_genepairs = list(set(selected_genes_sorted_genepairs) & set(selected_genepairs))
+    coef_df = pd.DataFrame(index=common_genepairs)
+    coef_p_df = pd.DataFrame(index=common_genepairs)
+    zscore_df = pd.DataFrame(index=common_genepairs)
+    zscore_p_df = pd.DataFrame(index=common_genepairs)
+    data_selected_df = data_df[selected_genes]
+    individual_ids = data_sc.obs[individual_colname].unique()
+    print(f"Begin calculating networks for {len(individual_ids)} individuals")
+    for ind_id in tqdm(individual_ids):
+        is_individual = data_sc.obs[individual_colname] == ind_id
+        cell_num = data_sc.obs[is_individual].shape[0]
+        if cell_num > 10:
+            individual_df = data_selected_df.loc[is_individual]
+            individual_coefs, individual_coef_ps = spearmanr_withnan(individual_df.values, axis=0)
+            try:
+                individual_coefs_flatten = \
+                    pd.DataFrame(data=individual_coefs[np.triu_indices_from(individual_coefs, 1)],
+                                 index=selected_genes_sorted_genepairs).loc[common_genepairs]
+                individual_coef_ps_flatten = \
+                    pd.DataFrame(data=individual_coef_ps[np.triu_indices_from(individual_coefs, 1)],
+                                 index=selected_genes_sorted_genepairs).loc[common_genepairs]
+                individual_zscores_flatten, individual_zscore_ps_flatten = corr_to_z(individual_coefs_flatten, cell_num)
+                coef_df[ind_id] = individual_coefs_flatten
+                coef_p_df[ind_id] = individual_coef_ps_flatten
+                zscore_df[ind_id] = individual_zscores_flatten
+                zscore_p_df[ind_id] = individual_zscore_ps_flatten
+            except Exception as err:
+                # Best effort: skip this individual, but say so instead of
+                # silently dropping it (and let KeyboardInterrupt through).
+                print(f"Skipped individual {ind_id}: {err!r}")
+                continue
+        else:
+            print("Deleted this individual because of low cell number", cell_num)
+    return coef_df, coef_p_df, zscore_df, zscore_p_df
+
+
+def get_individual_networks_UT_subcelltypesMonocytes(celltype, datasetname, condition='UT'):
+    """Build and save per-individual networks for a monocyte sub-cell-type.
+
+    Parameters
+    ----------
+    celltype : {'cMono', 'ncMono'}
+    datasetname : str
+        Passed to ``DATASET``; for 'ng' no condition filter is applied.
+    condition : str
+        Key into ``dataset.chosen_condition``; also used in the input and
+        output file names.
+
+    Returns
+    -------
+    tuple of four DataFrames (coefficients, their p-values, z-scores, z-score
+    p-values). Only the z-score tables are written to disk.
+
+    Raises
+    ------
+    IOError
+        When ``celltype`` is neither 'cMono' nor 'ncMono'.
+    """
+    # load the data and data information
+    dataset = DATASET(datasetname)
+    dataset.load_dataset()
+    dataset.get_cMono_ncMono()
+    print(f"{datasetname} loaded.")
+    # calculate the individual network for specific condition and celltype
+    print(datasetname, celltype, condition)
+    work_prefix = Path('./')
+    selected_genepairs_path = work_prefix / 'coeqtl_mapping/input/snp_genepair_selection' / f'{condition}_monocyte_{datasetname}.baseline.tsv'
+    selected_genepairs = pd.read_csv(selected_genepairs_path, sep='\t')['genepair_sorted'].values
+    if celltype == 'cMono':
+        data_celltype = dataset.cmono
+    elif celltype == 'ncMono':
+        data_celltype = dataset.ncmono
+    else:
+        raise IOError("Celltype not understood. select from cMono or ncMono.")
+    if datasetname == 'ng':
+        data_selected = data_celltype
+    else:
+        data_selected = data_celltype[
+            data_celltype.obs[dataset.timepoint_id_col] == dataset.chosen_condition[condition]]
+    individual_coefs_df, individual_coefs_p_df, individual_zscores_df, individual_zscores_p_df = get_individual_networks_selected_genepairs(
+        data_selected,
+        dataset.individual_id_col,
+        selected_genepairs
+    )
+    print(individual_coefs_df.head())
+    save_prefix = Path('./coeqtl_mapping/input')
+    output_dir = save_prefix / 'individual_networks' / condition / datasetname
+    # makedirs also creates missing parents and tolerates an existing directory
+    # (os.mkdir fails when 'individual_networks/<condition>' does not exist yet).
+    os.makedirs(output_dir, exist_ok=True)
+    # save_numpy(individual_coefs_df,
+    #            save_prefix / 'individual_networks' / condition / datasetname / f'{condition}_{celltype}.coefs')
+    # save_numpy(individual_coefs_p_df,
+    #            save_prefix / 'individual_networks' / condition / datasetname / f'{condition}_{celltype}.coef_ps')
+    save_numpy(individual_zscores_df,
+               output_dir / f'{condition}_{celltype}.zscores')
+    save_numpy(individual_zscores_p_df,
+               output_dir / f'{condition}_{celltype}.zscore_ps')
+    print("Saved.")
+    return individual_coefs_df, individual_coefs_p_df, individual_zscores_df, individual_zscores_p_df
+
+
+def argumentsparser():
+    """Build the CLI parser with string options --datasetname, --celltype and --condition."""
+    parser = argparse.ArgumentParser()
+    for option in ('datasetname', 'celltype', 'condition'):
+        parser.add_argument(f'--{option}', type=str, dest=option)
+    return parser
+
+
+def run_get_individual_networks_given_celltype_condition_datasetname():
+    """Command-line entry point: parse the options and build the individual networks.
+
+    Always returns None; the computed tables are discarded after saving.
+    """
+    cli = argumentsparser().parse_args()
+    print(f"Starting to calculate individual network for {cli.datasetname}, {cli.celltype}, {cli.condition}.")
+    get_individual_networks_UT_subcelltypesMonocytes(
+        celltype=cli.celltype,
+        condition=cli.condition,
+        datasetname=cli.datasetname,
+    )
+    return None
+
+
+# Run the command-line entry point only when executed as a script, not on import.
+if __name__ == '__main__':
+    run_get_individual_networks_given_celltype_condition_datasetname()
+
+
+# dataset = DATASET('stemiv2')
+# dataset.load_dataset()
+# dataset.get_cMono_ncMono()
+# celldf = dataset.data_sc.obs
+# cellnum = celldf[(celldf[dataset.celltype_id]=='CD4T') & (celldf[dataset.timepoint_id_col]==dataset.chosen_condition['UT'])][dataset.individual_id_col].value_counts()
+# print('CD4T, ', cellnum[cellnum>10].mean())
+# cellnum = celldf[(celldf[dataset.celltype_id]=='CD8T') & (celldf[dataset.timepoint_id_col]==dataset.chosen_condition['UT'])][dataset.individual_id_col].value_counts()
+# print('CD8T, ', cellnum[cellnum>10].mean())
+# cellnum = celldf[(celldf[dataset.celltype_id]=='monocyte') & (celldf[dataset.timepoint_id_col]==dataset.chosen_condition['UT'])][dataset.individual_id_col].value_counts()
+# print('Monocyte, ', cellnum[cellnum>10].mean())
+# cellnum = dataset.cmono.obs[dataset.cmono.obs[dataset.timepoint_id_col]==dataset.chosen_condition['UT']][dataset.individual_id_col].value_counts()
+# print('cMono, ', cellnum[cellnum>10].mean())
+# cellnum = dataset.ncmono.obs[dataset.ncmono.obs[dataset.timepoint_id_col]==dataset.chosen_condition['UT']][dataset.individual_id_col].value_counts()
+# print('ncMono, ', cellnum[cellnum>10].mean())
+#
+# dataset = DATASET('ng')
+# dataset.load_dataset()
+# dataset.get_cMono_ncMono()
+# celldf = dataset.data_sc.obs
+# cellnum = celldf[(celldf[dataset.celltype_id]=='CD4T')][dataset.individual_id_col].value_counts()
+# print('CD4T, ', cellnum[cellnum>10].mean())
+# cellnum = celldf[(celldf[dataset.celltype_id]=='CD8T')][dataset.individual_id_col].value_counts()
+# print('CD8T, ', cellnum[cellnum>10].mean())
+# cellnum = celldf[(celldf[dataset.celltype_id]=='monocyte') ][dataset.individual_id_col].value_counts()
+# print('Monocyte, ', cellnum[cellnum>10].mean())
+# cellnum = dataset.cmono.obs[dataset.individual_id_col].value_counts()
+# print('cMono, ', cellnum[cellnum>10].mean())
+# cellnum = dataset.ncmono.obs[dataset.individual_id_col].value_counts()
+# print('ncMono, ', cellnum[cellnum>10].mean())
\ No newline at end of file
diff --git a/04_coeqtl_mapping/individual_networks_maxcell.py b/04_coeqtl_mapping/individual_networks_maxcell.py
new file mode 100644
index 0000000..72566f6
--- /dev/null
+++ b/04_coeqtl_mapping/individual_networks_maxcell.py
@@ -0,0 +1,315 @@
+import os
+import re
+from itertools import combinations
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import scanpy as sc
+from scipy.stats import spearmanr
+from scipy.stats import t, norm
+from tqdm import tqdm
+import argparse
+from scipy.stats import rankdata
+from collections import namedtuple
+
+
+def get_time(x):
+    """Return 'UT' unchanged; otherwise the first '<digits>h' token found in ``x``.
+
+    Raises IndexError when ``x`` is not 'UT' and contains no such token.
+    """
+    if x != 'UT':
+        hour_pattern = re.compile(r'\d+h')
+        return re.findall(hour_pattern, x)[0]
+    return x
+
+
+class DATASET:
+    """Configuration and loader for one of the supported single-cell datasets.
+
+    Recognised names: 'onemillionv2', 'onemillionv3', 'stemiv2' and 'ng'.
+    """
+    def __init__(self, datasetname):
+        self.name = datasetname
+        self.path_prefix = Path("./seurat_objects")
+        # get_information() has no return statement, so this attribute is always
+        # None; the call matters for the attributes it sets and for rejecting
+        # unknown dataset names.
+        self.information = self.get_information()
+    def get_information(self):
+        """Set the h5ad file name and the ``.obs`` column names for ``self.name``.
+
+        Raises IOError when the dataset name is not recognised.
+        """
+        if self.name == 'onemillionv2':
+            self.path = '1M_v2_mediumQC_ctd_rnanormed_demuxids_20201029.sct.h5ad'
+            self.individual_id_col = 'assignment'
+            self.timepoint_id_col = 'time'
+            self.celltype_id = 'cell_type_lowerres'
+            self.chosen_condition = {'UT': 'UT',
+                                     'stimulated': '3h'}
+        elif self.name == 'onemillionv3':
+            self.path = '1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.SCT.h5ad'
+            self.individual_id_col = 'assignment'
+            self.timepoint_id_col = 'time'
+            self.celltype_id = 'cell_type_lowerres'
+            self.chosen_condition = {'UT': 'UT',
+                                     'stimulated': '3h'}
+        elif self.name == 'stemiv2':
+            self.path = 'cardio.integrated.20210301.stemiv2.h5ad'
+            self.individual_id_col = 'assignment.final'
+            self.timepoint_id_col = 'timepoint.final'
+            self.celltype_id = 'cell_type_lowerres'
+            self.chosen_condition = {'UT': 't8w',
+                                     'stimulated': 'Baseline'}
+        elif self.name == 'ng':
+            # No timepoint_id_col / chosen_condition here: reading either
+            # attribute on an 'ng' dataset raises AttributeError.
+            self.path = 'pilot3_seurat3_200420_sct_azimuth.h5ad'
+            self.individual_id_col = 'snumber'
+            self.celltype_id = 'cell_type_mapped_to_onemillion'
+        else:
+            raise IOError("Dataset name not understood.")
+    def load_dataset(self):
+        """Read the h5ad file into ``self.data_sc`` and add derived ``.obs`` columns."""
+        self.get_information()
+        print(f'Loading dataset {self.name} from {self.path_prefix} {self.path}')
+        self.data_sc = sc.read_h5ad(self.path_prefix / self.path)
+        if self.name.startswith('onemillion'):
+            # 'time' is derived from 'timepoint' through get_time().
+            self.data_sc.obs['time'] = [get_time(item) for item in self.data_sc.obs['timepoint']]
+        elif self.name == 'ng':
+            # Labels missing from the mapping become None because of dict.get.
+            celltype_maping = {'CD4 T': 'CD4T', 'CD8 T': 'CD8T', 'Mono': 'monocyte', 'DC': 'DC', 'NK': 'NK',
+                               'other T': 'otherT', 'other': 'other', 'B': 'B'}
+            self.data_sc.obs['cell_type_mapped_to_onemillion'] = [celltype_maping.get(name) for name in
+                                                                  self.data_sc.obs['predicted.celltype.l1']]
+
+
+
+def select_gene_nonzeroratio(df, ratio):
+    """Return the columns of ``df`` whose fraction of non-zero rows is strictly above ``ratio``."""
+    n_rows = df.shape[0]
+    nonzero_fraction = np.count_nonzero(df.values, axis=0) / n_rows
+    keep = nonzero_fraction > ratio
+    return df.columns[keep]
+
+
+def corr_to_z(coef, num):
+    """Convert correlation coefficients to z-scores and one-sided p-values.
+
+    The coefficient is turned into a t statistic with ``num - 2`` degrees of
+    freedom; the z-score is the normal quantile of that statistic's CDF value.
+    The p-value is the upper tail for positive coefficients, the lower tail
+    for negative ones, and the sum of both (1.0) for a coefficient of zero.
+
+    ``coef`` is expected to be a NumPy array; returns ``(z_score, probs)``.
+    """
+    dof = num - 2
+    t_statistic = coef * np.sqrt(dof / (1 - coef ** 2))
+    lower_tail = t.cdf(t_statistic, dof)
+    z_score = norm.ppf(lower_tail)
+    upper_part = np.where(coef < 0, 0.0, 1 - lower_tail)
+    lower_part = np.where(coef > 0, 0.0, lower_tail)
+    probs = lower_part + upper_part
+    return z_score, probs
+
+
+def z_to_corr(z, num):
+    """Map a z-score back to a correlation coefficient for sample size ``num``.
+
+    The z-score becomes a probability, then a t quantile with ``num - 2``
+    degrees of freedom, then a coefficient.
+    """
+    dof = num - 2
+    t_statistic = t.ppf(norm.cdf(z), dof)
+    return t_statistic / np.sqrt(dof + t_statistic ** 2)
+
+
+def get_om_name(filename):
+    """Return the first 'LLDeep_' + four-digit token in ``filename`` (IndexError if absent)."""
+    matches = re.findall(re.compile(r'LLDeep_\d\d\d\d'), filename)
+    return matches[0]
+
+
+def get_stemi_name(filename):
+    """Return the first 'TEST_<digit><any char>' token in ``filename`` (IndexError if absent).
+
+    NOTE(review): the trailing '.' matches any single character, not only a
+    digit - confirm this is the intended identifier format.
+    """
+    matches = re.findall(re.compile(r'TEST_\d.'), filename)
+    return matches[0]
+
+
+def save_numpy(data_df, prefix):
+    """Persist a DataFrame as ``<prefix>.npy`` plus two label files.
+
+    The values go to ``<prefix>.npy``; column and row labels are written one
+    per line to ``<prefix>.cols.txt`` and ``<prefix>.rows.txt`` (no trailing
+    newline). Labels are converted with ``str`` so non-string labels such as
+    integer identifiers can be saved as well. Returns None.
+    """
+    np.save(f'{prefix}.npy', data_df.values)
+    with open(f'{prefix}.cols.txt', 'w') as f:
+        f.write('\n'.join(map(str, data_df.columns)))
+    with open(f'{prefix}.rows.txt', 'w') as f:
+        f.write('\n'.join(map(str, data_df.index)))
+    return None
+
+def _contains_nan(a, nan_policy='propagate'):
+    """Report whether ``a`` contains NaN and validate ``nan_policy``.
+
+    Returns ``(contains_nan, nan_policy)``. When the data can be neither
+    summed nor placed in a set, the result is ``(False, 'omit')``.
+
+    Raises ValueError for an unknown policy, or when NaNs are present and the
+    policy is 'raise'.
+    """
+    allowed = ('propagate', 'raise', 'omit')
+    if nan_policy not in allowed:
+        quoted = ', '.join("'%s'" % s for s in allowed)
+        raise ValueError("nan_policy must be one of {%s}" % quoted)
+    try:
+        # Summing first avoids building a full boolean mask.
+        with np.errstate(invalid='ignore'):
+            has_nan = np.isnan(np.sum(a))
+    except TypeError:
+        # Non-numeric data: fall back to a membership test.
+        try:
+            has_nan = np.nan in set(a.ravel())
+        except TypeError:
+            has_nan = False
+            nan_policy = 'omit'
+    if has_nan and nan_policy == 'raise':
+        raise ValueError("The input contains nan values")
+    return has_nan, nan_policy
+
+
+def _chk_asarray(a, axis):
+    """Return ``(array, outaxis)`` for ``a``.
+
+    ``axis=None`` flattens the input and reports axis 0; any other value keeps
+    the shape and is passed through. Scalars are promoted to one dimension.
+    """
+    if axis is None:
+        arr, outaxis = np.ravel(a), 0
+    else:
+        arr, outaxis = np.asarray(a), axis
+    return (np.atleast_1d(arr) if arr.ndim == 0 else arr), outaxis
+
+
+def spearmanr_withnan(a, axis=0, nan_policy='propagate'):
+    """Spearman rank correlations and two-sided p-values between variables.
+
+    Single-array variant of ``scipy.stats.spearmanr``. With
+    ``nan_policy='propagate'`` and more than two variables, only the rows and
+    columns of the correlation matrix that belong to NaN-containing variables
+    are set to NaN.
+
+    Parameters
+    ----------
+    a : array_like
+        1-D or 2-D data. With ``axis=0`` rows are observations and columns
+        are variables; ``axis=1`` swaps the roles; ``axis=None`` flattens.
+    axis : {0, 1, None}
+    nan_policy : {'propagate', 'raise', 'omit'}
+        Validated by ``_contains_nan``; 'omit' gets no dedicated handling here.
+
+    Returns
+    -------
+    SpearmanrResult
+        ``(correlation, pvalue)``: scalars when exactly two variables are
+        compared, otherwise square matrices; ``(nan, nan)`` when there is at
+        most one observation, or when NaNs are present in 1-D / two-variable
+        input under 'propagate'.
+
+    Raises
+    ------
+    ValueError
+        For ``axis > 1`` or input with more than two dimensions.
+    """
+    SpearmanrResult = namedtuple('SpearmanrResult', ('correlation', 'pvalue'))
+    if axis is not None and axis > 1:
+        raise ValueError("spearmanr only handles 1-D or 2-D arrays, supplied axis argument {}, "
+                         "please use only values 0, 1 or None for axis".format(axis))
+    a, axisout = _chk_asarray(a, axis)
+    if a.ndim > 2:
+        raise ValueError("spearmanr only handles 1-D or 2-D arrays")
+    n_vars = a.shape[1 - axisout]
+    n_obs = a.shape[axisout]
+    if n_obs <= 1:
+        # Handle empty arrays or single observations.
+        return SpearmanrResult(np.nan, np.nan)
+    a_contains_nan, nan_policy = _contains_nan(a, nan_policy)
+    variable_has_nan = np.zeros(n_vars, dtype=bool)
+    if a_contains_nan:
+        if nan_policy == 'propagate':
+            if a.ndim == 1 or n_vars <= 2:
+                return SpearmanrResult(np.nan, np.nan)
+            else:
+                # Boolean mask, one entry per variable. A per-variable NaN
+                # *count* (``.sum``) would be interpreted as integer indices
+                # below and blank out the wrong rows/columns.
+                variable_has_nan = np.isnan(a).any(axis=axisout)
+    a_ranked = np.apply_along_axis(rankdata, axisout, a)
+    rs = np.corrcoef(a_ranked, rowvar=axisout)
+    dof = n_obs - 2  # degrees of freedom
+    # rs can have elements equal to 1, so avoid zero division warnings
+    with np.errstate(divide='ignore'):
+        t_ = rs * np.sqrt((dof/((rs+1.0)*(1.0-rs))).clip(0))
+    prob = 2 * t.sf(np.abs(t_), dof)
+    # For backwards compatibility, return scalars when comparing 2 columns
+    if rs.shape == (2, 2):
+        return SpearmanrResult(rs[1, 0], prob[1, 0])
+    else:
+        # Only the correlations are masked; p-values are returned as computed.
+        rs[variable_has_nan, :] = np.nan
+        rs[:, variable_has_nan] = np.nan
+        return SpearmanrResult(rs, prob)
+
+def read_numpy(prefix):
+    """Load ``<prefix>.npy`` as a DataFrame labelled on both axes by ``<prefix>.rows.txt``.
+
+    The same label list is used for rows and columns, so the stored array is
+    presumably a square matrix - confirm against the files this is used on.
+    """
+    data = np.load(f'{prefix}.npy')
+    # Context manager so the label file is closed deterministically.
+    with open(f'{prefix}.rows.txt', 'r') as handle:
+        columns = [item.strip() for item in handle.readlines()]
+    return pd.DataFrame(data=data, columns=columns, index=columns)
+
+
+def read_all_files(prefix, genepairs):
+    """Collect the upper triangle of every ``*_coefs.npy`` matrix found in ``prefix``.
+
+    Returns a DataFrame indexed by ``genepairs`` with one column per file,
+    named by ``get_stemi_name``. Arrays with fewer than two dimensions are
+    skipped.
+    """
+    res_df = pd.DataFrame(index=genepairs)
+    for filename in os.listdir(prefix):
+        if not filename.endswith('_coefs.npy'):
+            continue
+        matrix = np.load(f'{prefix}/{filename}')
+        if len(matrix.shape) <= 1:
+            continue
+        res_df[get_stemi_name(filename)] = matrix[np.triu_indices_from(matrix, 1)]
+    return res_df
+
+
+def get_unique_genepairs(genepair_list, sep=';'):
+    """Return the set of gene pairs, treating 'A;B' and 'B;A' as the same pair.
+
+    The first orientation encountered is the one that is kept.
+
+    Parameters
+    ----------
+    genepair_list : iterable of str
+    sep : str
+        Separator between the genes of a pair.
+    """
+    unique_pairs = set()
+    for genepair in genepair_list:
+        # Re-joining the split parts unchanged would just rebuild the same
+        # string; reverse them to get the mirrored pair.
+        reverse_genepair = sep.join(reversed(genepair.split(sep)))
+        if genepair in unique_pairs or reverse_genepair in unique_pairs:
+            continue
+        unique_pairs.add(genepair)
+    return unique_pairs
+
+
+def get_genes(genepair_list, sep=';'):
+    """Return the distinct genes appearing in ``genepair_list`` as a list (arbitrary order)."""
+    distinct = set()
+    for genepair in genepair_list:
+        distinct.update(genepair.split(sep))
+    return list(distinct)
+
+
+def get_genepairs(genelist_path):
+    """Read one gene per line and build every unordered gene pair.
+
+    Returns ``(genelist, genepairs)``; each pair is ``'geneA;geneB'`` with the
+    two names sorted alphabetically.
+    """
+    # Context manager so the file is closed deterministically.
+    with open(genelist_path, 'r') as handle:
+        genelist = [item.strip() for item in handle.readlines()]
+    genepairs = [';'.join(sorted(item)) for item in combinations(genelist, 2)]
+    return genelist, genepairs
+
+
+def get_individual_networks_selected_genepairs(data_sc, individual_colname, selected_genepairs, maxcell):
+    """Compute one co-expression network per individual for selected gene pairs.
+
+    Parameters
+    ----------
+    data_sc : object exposing ``.X.toarray()``, ``.obs`` and ``.var``
+        Cells-by-genes expression container (presumably an AnnData - confirm).
+    individual_colname : str
+        Column of ``data_sc.obs`` holding the individual identifier.
+    selected_genepairs : iterable of str
+        Gene pairs written as ``'geneA;geneB'``.
+    maxcell : int
+        When positive, individuals with at least this many cells are
+        subsampled to exactly ``maxcell`` cells (fixed ``random_state=5``).
+
+    Returns
+    -------
+    tuple of four DataFrames ``(coef_df, coef_p_df, zscore_df, zscore_p_df)``
+        Indexed by the selected gene pairs whose genes are both present in
+        ``data_sc.var.index``, one column per individual with more than 10
+        cells whose network could be computed.
+    """
+    data_df = pd.DataFrame(data=data_sc.X.toarray(),
+                           index=data_sc.obs.index,
+                           columns=data_sc.var.index)
+    selected_genes = list(set([ele for item in selected_genepairs
+                               for ele in item.split(';')]) & set(data_sc.var.index))
+    # Same order as the upper triangle (k=1) of the gene-by-gene matrix below.
+    selected_genes_sorted_genepairs = [';'.join(sorted(item)) for item in combinations(selected_genes, 2)]
+    common_genepairs = list(set(selected_genes_sorted_genepairs) & set(selected_genepairs))
+    coef_df = pd.DataFrame(index=common_genepairs)
+    coef_p_df = pd.DataFrame(index=common_genepairs)
+    zscore_df = pd.DataFrame(index=common_genepairs)
+    zscore_p_df = pd.DataFrame(index=common_genepairs)
+    data_selected_df = data_df[selected_genes]
+    individual_ids = data_sc.obs[individual_colname].unique()
+    print(f"Begin calculating networks for {len(individual_ids)} individuals.")
+    for ind_id in tqdm(individual_ids):
+        is_individual = data_sc.obs[individual_colname] == ind_id
+        cell_num = data_sc.obs[is_individual].shape[0]
+        if cell_num > 10:
+            if maxcell>0 and cell_num >= maxcell:
+                individual_df = data_selected_df.loc[is_individual].sample(maxcell, random_state=5)
+                cell_num = maxcell
+            else:
+                individual_df = data_selected_df.loc[is_individual]
+            individual_coefs, individual_coef_ps = spearmanr_withnan(individual_df.values, axis=0)
+            try:
+                individual_coefs_flatten = pd.DataFrame(data=individual_coefs[np.triu_indices_from(individual_coefs, 1)],
+                                                        index=selected_genes_sorted_genepairs).loc[common_genepairs]
+                individual_coef_ps_flatten = pd.DataFrame(data=individual_coef_ps[np.triu_indices_from(individual_coefs, 1)],
+                                                          index=selected_genes_sorted_genepairs).loc[common_genepairs]
+                individual_zscores_flatten, individual_zscore_ps_flatten = corr_to_z(individual_coefs_flatten.values,
+                                                                                     cell_num)
+                coef_df[ind_id] = individual_coefs_flatten
+                coef_p_df[ind_id] = individual_coef_ps_flatten
+                zscore_df[ind_id] = individual_zscores_flatten
+                zscore_p_df[ind_id] = individual_zscore_ps_flatten
+            except Exception as err:
+                # Best effort: skip this individual, but say so instead of
+                # silently dropping it (and let KeyboardInterrupt through).
+                print(f"Skipped individual {ind_id}: {err!r}")
+                continue
+        else:
+            print("Deleted this individual because of low cell number", cell_num)
+    return coef_df, coef_p_df, zscore_df, zscore_p_df
+
+
+def get_individual_networks_given_celltype_condition_datasetname(celltype, datasetname, condition='UT', maxcell=-1):
+    """Build and save per-individual z-score networks for one cell type.
+
+    Parameters
+    ----------
+    celltype : str
+        Value matched against ``data_sc.obs[dataset.celltype_id]``.
+    datasetname : str
+        Passed to ``DATASET``; for 'ng' no condition filter is applied.
+    condition : str
+        Key into ``dataset.chosen_condition``; also used in file names.
+    maxcell : int
+        Forwarded to ``get_individual_networks_selected_genepairs`` and
+        embedded in the output file name.
+
+    Returns
+    -------
+    tuple of four DataFrames (coefficients, their p-values, z-scores, z-score
+    p-values). Only the z-score table is written to disk.
+    """
+    # load the data and data information
+    dataset = DATASET(datasetname)
+    dataset.load_dataset()
+    print(f"{datasetname} loaded.")
+    # calculate the individual network for specific condition and celltype
+    print(datasetname, celltype, condition)
+    work_prefix = Path('./')
+    selected_genepairs_path = work_prefix / f'coeqtl_mapping/input/snp_genepair_selection/{condition}_{celltype}_{datasetname}.baseline.tsv'
+    selected_genepairs = pd.read_csv(selected_genepairs_path, sep='\t')['genepair_sorted'].values
+    if datasetname == 'ng':
+        data_selected = dataset.data_sc[(dataset.data_sc.obs[dataset.celltype_id] == celltype)]
+    else:
+        data_selected = dataset.data_sc[(dataset.data_sc.obs[dataset.celltype_id] == celltype) &
+                                        (dataset.data_sc.obs[dataset.timepoint_id_col] == dataset.chosen_condition[condition])]
+    individual_coefs_df, individual_coefs_p_df, individual_zscores_df, individual_zscores_p_df = \
+        get_individual_networks_selected_genepairs(
+            data_selected,
+            dataset.individual_id_col,
+            selected_genepairs,
+            maxcell
+        )
+    print(individual_coefs_df.head())
+    save_prefix = Path('./coeqtl_mapping/input')
+    output_dir = save_prefix / 'individual_networks' / condition / datasetname
+    # makedirs also creates missing parents and tolerates an existing directory
+    # (os.mkdir fails when 'individual_networks/<condition>' does not exist yet).
+    os.makedirs(output_dir, exist_ok=True)
+    save_numpy(individual_zscores_df,
+               output_dir / f'{condition}_{celltype}.max{maxcell}cells.zscores')
+    print("Saved ")
+    return individual_coefs_df, individual_coefs_p_df, individual_zscores_df, individual_zscores_p_df
+
+
+def argumentsparser():
+    """Build the CLI parser.
+
+    Options: --datasetname, --celltype, --condition (strings) and --maxcell
+    (parsed as float). --maxcell defaults to -1, the same "no subsampling"
+    default as the network function, so that omitting it no longer hands
+    ``None`` to the caller's ``int(...)`` conversion.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--datasetname', type=str, dest='datasetname')
+    parser.add_argument('--celltype', type=str, dest='celltype')
+    parser.add_argument('--condition', type=str, dest='condition')
+    parser.add_argument('--maxcell', type=float, dest='maxcell', default=-1)
+    return parser
+
+def run_get_individual_networks_given_celltype_condition_datasetname():
+    """Command-line entry point: parse the options and build the individual networks.
+
+    ``--maxcell`` is converted to int before being passed on. Returns None.
+    """
+    cli = argumentsparser().parse_args()
+    print(f"Starting to calculate individual network for {cli.datasetname}, {cli.celltype}, {cli.condition}, "
+          f"for max cell number {cli.maxcell}.")
+    get_individual_networks_given_celltype_condition_datasetname(
+        celltype=cli.celltype,
+        condition=cli.condition,
+        datasetname=cli.datasetname,
+        maxcell=int(cli.maxcell),
+    )
+    return None
+
+# Run the command-line entry point only when executed as a script, not on import.
+if __name__ == '__main__':
+    run_get_individual_networks_given_celltype_condition_datasetname()
\ No newline at end of file
diff --git a/04_coeqtl_mapping/launch_sbatch_files.sh b/04_coeqtl_mapping/launch_sbatch_files.sh
new file mode 100644
index 0000000..27633da
--- /dev/null
+++ b/04_coeqtl_mapping/launch_sbatch_files.sh
@@ -0,0 +1,48 @@
+# Launcher for the co-eQTL mapping pipeline: four stages, each looping over
+# the same five cell types for a single condition.
+# NOTE(review): no --dependency is passed to sbatch and nothing waits between
+# stages, so the stages are presumably meant to be run by hand one at a time
+# once the previous jobs have finished - confirm.
+# Calculate individual networks
+working_dir=/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping
+condition='UT'
+for celltype in 'CD4T' 'CD8T' 'B' 'NK' 'DC'
+do
+for dataset in 'stemiv2' 'onemillionv2' 'onemillionv3' 'ng'
+do
+    echo ${dataset}_${condition}_${celltype}
+    # One job per dataset/cell type; stdout and stderr go to the logs directory.
+    sbatch --parsable --job-name ${dataset}_${condition}_${celltype} \
+    --output ${working_dir}/input/individual_networks/logs/${dataset}_${condition}_${celltype}.out \
+    --error ${working_dir}/input/individual_networks/logs/${dataset}_${condition}_${celltype}.err \
+    ${working_dir}/input/individual_networks/submit_individual_networks.sh ${dataset} ${celltype} ${condition}
+done
+done # decided not to save into tsv after saving in numpy
+
+# merge individual networks and create gene list and annotation file for betaqtl
+for celltype in 'CD4T' 'CD8T' 'B' 'NK' 'DC'
+do
+    echo ${condition}_${celltype}
+    sbatch --parsable --job-name merge_${condition}_${celltype} \
+    --output ${working_dir}/input/individual_networks/logs/merge_${condition}_${celltype}.out \
+    --error ${working_dir}/input/individual_networks/logs/merge_${condition}_${celltype}.err \
+    ${working_dir}/input/individual_networks/submit_merge_coexpression.sh ${celltype} ${condition}
+done
+
+
+# rsync the betaqtl_scripts to gearshift: /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/${condition}_${celltype}
+# make batches for betaqtl
+for celltype in 'CD4T' 'CD8T' 'B' 'NK' 'DC'
+do
+# Abort the whole script if the per-cell-type output directory is missing;
+# the helper scripts below are invoked relative to that directory.
+cd ${working_dir}/output/${condition}_${celltype} ||exit
+./createBatches.sh ${condition} ${celltype}
+# submit betaqtl jobs
+./suball.sh ${working_dir}/output/${condition}_${celltype}/noduplicated/jobs
+./suball.sh ${working_dir}/output/${condition}_${celltype}/duplicatedversion1/jobs
+./suball.sh ${working_dir}/output/${condition}_${celltype}/duplicatedversion2/jobs
+done
+
+# concatenate and process output from betaqtl
+for celltype in 'CD4T' 'CD8T' 'B' 'NK' 'DC'
+do
+    cd ${working_dir}/output/${condition}_${celltype} ||exit
+    echo ${condition}_${celltype}
+    sbatch --parsable --job-name process_betaqtl_results_${condition}_${celltype} \
+    --output ${working_dir}/input/individual_networks/logs/process_betaqtl_results_${condition}_${celltype}.out \
+    --error ${working_dir}/input/individual_networks/logs/process_betaqtl_results_${condition}_${celltype}.err \
+    ${working_dir}/output/submit_process_betaqtl_results.sh ${condition} ${celltype}
+done
\ No newline at end of file
diff --git a/04_coeqtl_mapping/merge_coexpression_for_betaeqtl.py b/04_coeqtl_mapping/merge_coexpression_for_betaeqtl.py
new file mode 100644
index 0000000..e1bb6fd
--- /dev/null
+++ b/04_coeqtl_mapping/merge_coexpression_for_betaeqtl.py
@@ -0,0 +1,40 @@
+import pandas as pd
+from pathlib import Path
+import numpy as np
+import argparse
+
+
+def read_numpy(prefix):
+    """Rebuild a DataFrame from ``<prefix>.npy``, ``<prefix>.cols.txt`` and ``<prefix>.rows.txt``.
+
+    The label files hold one label per line; surrounding whitespace is stripped.
+    """
+    data = np.load(f'{prefix}.npy')
+    # Context managers so both label files are closed deterministically.
+    with open(f'{prefix}.cols.txt', 'r') as handle:
+        columns = [item.strip() for item in handle.readlines()]
+    with open(f'{prefix}.rows.txt', 'r') as handle:
+        rows = [item.strip() for item in handle.readlines()]
+    return pd.DataFrame(data=data, columns=columns, index=rows)
+
+
+def concat_numpy_files(celltype, condition, res_prefix):
+    """Merge the per-dataset z-score tables column-wise and write them as one TSV.
+
+    Each newly read dataset is placed to the left of what has been merged so
+    far. The 'ng' dataset is skipped when ``condition`` is 'stimulated'.
+    Returns the merged DataFrame.
+    """
+    merged = pd.DataFrame()
+    for dataset in ['onemillionv2', 'onemillionv3', 'stemiv2', 'ng']:
+        if condition =='stimulated' and dataset == 'ng':
+            continue
+        zscores_prefix = res_prefix/condition/dataset/f'{condition}_{celltype}.zscores'
+        merged = pd.concat([read_numpy(zscores_prefix), merged], axis=1)
+        print(f'Adding {dataset}, it has shape:', merged.shape)
+    out_path = res_prefix/condition/f'{condition}_{celltype}.onemillionv23stemiv2ng.zscores.tsv'
+    merged.to_csv(out_path, sep='\t')
+    return merged
+
+
+def argumentsparser():
+    """Build the CLI parser with string options --celltype and --condition."""
+    parser = argparse.ArgumentParser()
+    for option in ('celltype', 'condition'):
+        parser.add_argument(f'--{option}', type=str, dest=option)
+    return parser
+
+
+# Script body: there is no __main__ guard, so importing this module also parses
+# sys.argv and runs the merge.
+# NOTE(review): workdir is an absolute cluster path here, while the sibling
+# merge scripts use the relative "./coeqtl_mapping" - confirm which is intended.
+workdir = Path("/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping")
+res_prefix = workdir/'input/individual_networks/'
+
+args = argumentsparser().parse_args()
+celltype, condition = args.celltype, args.condition
+_ = concat_numpy_files(celltype, condition, res_prefix)
\ No newline at end of file
diff --git a/04_coeqtl_mapping/merge_coexpression_for_betaeqtl_maxcell.py b/04_coeqtl_mapping/merge_coexpression_for_betaeqtl_maxcell.py
new file mode 100644
index 0000000..319f6a2
--- /dev/null
+++ b/04_coeqtl_mapping/merge_coexpression_for_betaeqtl_maxcell.py
@@ -0,0 +1,42 @@
+import pandas as pd
+from pathlib import Path
+import numpy as np
+import argparse
+
+
+def read_numpy(prefix):
+    """Rebuild a DataFrame from ``<prefix>.npy``, ``<prefix>.cols.txt`` and ``<prefix>.rows.txt``.
+
+    The label files hold one label per line; surrounding whitespace is stripped.
+    """
+    data = np.load(f'{prefix}.npy')
+    # Context managers so both label files are closed deterministically.
+    with open(f'{prefix}.cols.txt', 'r') as handle:
+        columns = [item.strip() for item in handle.readlines()]
+    with open(f'{prefix}.rows.txt', 'r') as handle:
+        rows = [item.strip() for item in handle.readlines()]
+    return pd.DataFrame(data=data, columns=columns, index=rows)
+
+
+def concat_numpy_files(celltype, condition, res_prefix, maxcell):
+    """Merge the per-dataset ``max<maxcell>cells`` z-score tables and write a gzipped TSV.
+
+    Tables are joined column-wise with an outer join on the row labels; each
+    newly read dataset is placed to the left of what has been merged so far.
+    The 'ng' dataset is skipped when ``condition`` is 'stimulated'.
+    Returns the merged DataFrame.
+    """
+    merged = pd.DataFrame()
+    for dataset in ['onemillionv2', 'onemillionv3', 'stemiv2', 'ng']:
+        if condition =='stimulated' and dataset == 'ng':
+            continue
+        zscores_prefix = res_prefix/condition/dataset/f'{condition}_{celltype}.max{maxcell}cells.zscores'
+        merged = pd.concat([read_numpy(zscores_prefix), merged], axis=1, join='outer')
+        print(f'Adding {dataset}, it has shape:', merged.shape)
+    out_path = res_prefix/condition/f'{condition}_{celltype}.max{maxcell}cells.onemillionv23stemiv2ng.zscores.tsv.gz'
+    merged.to_csv(out_path, compression='gzip', sep='\t')
+    return merged
+
+
+def argumentsparser():
+    """Build the CLI parser with string options --celltype, --condition and --maxcell."""
+    parser = argparse.ArgumentParser()
+    for option in ('celltype', 'condition', 'maxcell'):
+        parser.add_argument(f'--{option}', type=str, dest=option)
+    return parser
+
+
+# Script body: there is no __main__ guard, so importing this module also parses
+# sys.argv and runs the merge. Paths are relative to the current working directory.
+workdir = Path("./coeqtl_mapping")
+res_prefix = workdir/'input/individual_networks/'
+
+args = argumentsparser().parse_args()
+# --maxcell arrives as a string; int() raises TypeError when the option is omitted.
+celltype, condition, maxcell = args.celltype, args.condition, int(args.maxcell)
+_ = concat_numpy_files(celltype, condition, res_prefix, maxcell)
diff --git a/04_coeqtl_mapping/merge_coexpression_for_betaqtl.subsampleindividuals.py b/04_coeqtl_mapping/merge_coexpression_for_betaqtl.subsampleindividuals.py
new file mode 100644
index 0000000..d0e2dc0
--- /dev/null
+++ b/04_coeqtl_mapping/merge_coexpression_for_betaqtl.subsampleindividuals.py
@@ -0,0 +1,44 @@
+import pandas as pd
+from pathlib import Path
+import numpy as np
+import argparse
+
+
+def read_numpy(prefix):
+    """Rebuild a DataFrame from ``<prefix>.npy``, ``<prefix>.cols.txt`` and ``<prefix>.rows.txt``.
+
+    The label files hold one label per line; surrounding whitespace is stripped.
+    """
+    data = np.load(f'{prefix}.npy')
+    # Context managers so both label files are closed deterministically.
+    with open(f'{prefix}.cols.txt', 'r') as handle:
+        columns = [item.strip() for item in handle.readlines()]
+    with open(f'{prefix}.rows.txt', 'r') as handle:
+        rows = [item.strip() for item in handle.readlines()]
+    return pd.DataFrame(data=data, columns=columns, index=rows)
+
+
+def concat_numpy_files(celltype, condition, res_prefix, num):
+    """Merge the per-dataset z-score tables and save a random subset of ``num`` columns.
+
+    Tables are joined column-wise with an outer join on the row labels; the
+    'ng' dataset is skipped when ``condition`` is 'stimulated'. The sampled
+    columns are written as a gzipped TSV; the sampling is unseeded, so the
+    saved subset differs between runs. The returned DataFrame is the full
+    merged table, not the sampled one.
+    """
+    merged = pd.DataFrame()
+    for dataset in ['onemillionv2', 'onemillionv3', 'stemiv2', 'ng']:
+        if condition =='stimulated' and dataset == 'ng':
+            continue
+        zscores_prefix = res_prefix/condition/dataset/f'{condition}_{celltype}.zscores'
+        merged = pd.concat([read_numpy(zscores_prefix), merged], axis=1, join='outer')
+        print(f'Adding {dataset}, it has shape:', merged.shape)
+    out_path = res_prefix/condition/f'{condition}_{celltype}.onemillionv23stemiv2ng.{num}randompeople.zscores.tsv.gz'
+    subset = merged.sample(num, axis=1)
+    subset.to_csv(out_path, sep='\t', compression='gzip')
+    return merged
+
+
+def argumentsparser():
+    """Build the CLI parser with string options --celltype, --condition and --num."""
+    parser = argparse.ArgumentParser()
+    for option in ('celltype', 'condition', 'num'):
+        parser.add_argument(f'--{option}', type=str, dest=option)
+    return parser
+
+
+# Script body: there is no __main__ guard, so importing this module also parses
+# sys.argv and runs the merge. Paths are relative to the current working directory.
+workdir = Path("./coeqtl_mapping")
+res_prefix = workdir/'input/individual_networks/'
+
+args = argumentsparser().parse_args()
+# --num arrives as a string; int() raises TypeError when the option is omitted.
+celltype, condition, number = args.celltype, args.condition, int(args.num)
+_ = concat_numpy_files(celltype, condition, res_prefix, number)
\ No newline at end of file
diff --git a/04_coeqtl_mapping/multipletesting_correction.fixed.py b/04_coeqtl_mapping/multipletesting_correction.fixed.py
new file mode 100644
index 0000000..bf7a9c4
--- /dev/null
+++ b/04_coeqtl_mapping/multipletesting_correction.fixed.py
@@ -0,0 +1,129 @@
+import pandas as pd
+from statsmodels.stats.multitest import multipletests
+import numpy as np
+import argparse
+from scipy.optimize import minimize
+from scipy.stats import beta
+from scipy import special
+from pathlib import Path
+
+
+def read_numpy(prefix):
+    """Rebuild a DataFrame from ``<prefix>.npy``, ``<prefix>.cols.txt`` and ``<prefix>.rows.txt``.
+
+    Column labels are prefixed with 'perm'; row labels are used as read.
+    Surrounding whitespace is stripped from every label.
+    """
+    data = np.load(f'{prefix}.npy')
+    # Context managers so both label files are closed deterministically.
+    with open(f'{prefix}.cols.txt', 'r') as handle:
+        columns = [f'perm{item.strip()}' for item in handle.readlines()]
+    with open(f'{prefix}.rows.txt', 'r') as handle:
+        rows = [item.strip() for item in handle.readlines()]
+    return pd.DataFrame(data=data, columns=columns, index=rows)
+
+
+def beta_distribution_mle_function(x, p):
+    """Negative log-likelihood of the values ``p`` under a Beta distribution.
+
+    ``x`` holds the two shape parameters; ``p`` is an array of values in (0, 1).
+    """
+    shape1, shape2 = x
+    log_likelihood = ((shape1 - 1) * np.sum(np.log(p))
+                      + (shape2 - 1) * np.sum(np.log(1 - p))
+                      - np.size(p) * special.betaln(shape1, shape2))
+    return -1 * log_likelihood
+
+
+def beta_distribution_initial_guess(x):
+    """
+    Method-of-moments estimate of the two Beta shape parameters from the
+    sample mean and variance of ``x``; returns ``(a, b)``.
+
+    https://stats.stackexchange.com/questions/13245/which-is-a-good-tool-to-compute-parameters-for-a-beta-distribution
+    """
+    mean = np.mean(x)
+    var = np.var(x)
+    scale = (mean * (1 - mean) / var) - 1
+    return mean * scale, (1 - mean) * scale
+
+
+def fit_beta_distribution(p, a_bnd=(0.1, 10), b_bnd=(1, 1000000)):
+    """Fit Beta shape parameters to ``p`` by bounded Nelder-Mead minimisation.
+
+    The search starts from the moment-based guess clamped into ``a_bnd`` and
+    ``b_bnd``. Returns ``(shape parameters, function evaluations, iterations)``.
+    """
+    guess_a, guess_b = beta_distribution_initial_guess(p)
+    start = np.array([
+        min(max(guess_a, a_bnd[0]), a_bnd[1]),
+        min(max(guess_b, b_bnd[0]), b_bnd[1]),
+    ])
+    fit = minimize(beta_distribution_mle_function,
+                   x0=start,
+                   args=(p, ),
+                   method='nelder-mead',
+                   bounds=(a_bnd, b_bnd),
+                   options={"maxiter": 10000, "disp": True})
+    return fit.x, fit.nfev, fit.nit
+
+
+def arguments():
+    """Build the CLI parser for the multiple-testing correction script.
+
+    ``--save_prefix`` is stored under the attribute name ``saveprefix``; the
+    other options keep their own names.
+    """
+    parser = argparse.ArgumentParser()
+    option_to_dest = (
+        ('--permutation_pvalue_path', 'permutation_pvalue_path'),
+        ('--coeqtl_path', 'coeqtl_path'),
+        ('--eqtl_path', 'eqtl_path'),
+        ('--save_prefix', 'saveprefix'),
+    )
+    for flag, dest in option_to_dest:
+        parser.add_argument(flag, dest=dest)
+    return parser
+
+
+def find_eqtlsnp_gene(snp, genepair, eqtl_snp_gene_set):
+    """Return '<snp>_<gene>' for the gene of ``genepair`` that forms a known eQTL with ``snp``.
+
+    The first gene wins when its key is in ``eqtl_snp_gene_set``; otherwise
+    the key of the second gene is returned without checking membership.
+    ``genepair`` must contain exactly two genes separated by ';'.
+    """
+    gene1, gene2 = genepair.split(';')
+    first_key = '_'.join([snp, gene1])
+    if first_key in eqtl_snp_gene_set:
+        return first_key
+    return '_'.join([snp, gene2])
+
+
+def find_eqtl_gene(coeqtl_chrpos, annotation_dict):
+    """Look up the gene annotated at ``coeqtl_chrpos``; None when the key is absent."""
+    return annotation_dict.get(coeqtl_chrpos)
+
+def main():
+    """Permutation-based multiple-testing correction of co-eQTL results.
+
+    Steps, in order: read the permutation p-values, eQTL table, probe
+    annotation and co-eQTL results; pick the lowest nominal ``MetaP`` per
+    SNP/eQTL-gene key; fit a Beta distribution to each key's permutation
+    p-values; turn the nominal p-value into a Beta-adjusted p-value; apply
+    Benjamini-Hochberg across keys; derive a per-key nominal threshold; flag
+    and save all and significant co-eQTLs. Returns the annotated co-eQTL table.
+    """
+    args = arguments().parse_args()
+    coeqtl_path = args.coeqtl_path
+    eqtls_path = args.eqtl_path
+    saveprefix = args.saveprefix
+    permutation_pvalue_path = args.permutation_pvalue_path
+    # Column names Perm0 .. Perm99 are expected in the permutation table.
+    permutation_cols = [f'Perm{ind}' for ind in range(0, 100)]
+    permutation_pvalues_df = pd.read_csv(permutation_pvalue_path, sep='\t',
+                                         compression='gzip', index_col=0)
+    eqtl_df = pd.read_csv(eqtls_path, sep='\t')
+    # NOTE(review): eqtl_df['chr_pos'] and eqtl_snp_gene_set are built here but
+    # not read again in this function - confirm whether they are still needed.
+    eqtl_df['chr_pos'] = ['_'.join([str(ele) for ele in item]) for item in eqtl_df[['ProbeChr', 'ProbeCenterChrPos']].values]
+    eqtl_snp_gene_set = set(['_'.join(item) for item in eqtl_df[['SNPName', 'genename']].values])
+    # Absolute cluster paths: the script only runs where these files exist.
+    annotation_path = '/groups/umcg-bios/tmp01/projects/1M_cells_scRNAseq/ongoing/eQTL_mapping/probeannotation/singleCell-annotation-stripped.tsv'
+    mappingdic = pd.read_csv('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/resources/features_v3_reformated_names.tsv',
+                             sep='\t', names=['geneid', 'genename']).set_index('geneid')['genename'].T.to_dict()
+    annotation_df = pd.read_csv(annotation_path, sep='\t')
+    # Annotation key is 'Chr_ChrStart_ChrEnd'.
+    annotation_df['chr_pos'] = ['_'.join([str(ele) for ele in item]) for item in annotation_df[['Chr', 'ChrStart', 'ChrEnd']].values]
+    annotation_df['genename'] = [mappingdic.get(ensemblid) for ensemblid in annotation_df['Ensembl']]
+    annotation_dict = annotation_df.set_index('chr_pos')['genename'].T.to_dict()
+    # eqtl_df['snp_gene'] = ['_'.join(item) for item in eqtl_df[['SNPName', 'genename']].values]
+    # eqtl_snp_gene_set = set(eqtl_df['snp_gene'])
+    coeqtls = pd.read_csv(coeqtl_path, sep='\t', index_col=0, compression='gzip')
+    coeqtls['eqtlgene'] = [find_eqtl_gene(chr_pos, annotation_dict) for (chr_pos) in coeqtls['chr_pos']]
+    # NOTE(review): find_eqtl_gene returns None for an unknown chr_pos, which
+    # would make the join below raise TypeError - confirm all keys are annotated.
+    coeqtls['snp_eqtlgene'] = ['_'.join(item) for item in coeqtls[['SNP', 'eqtlgene']].values]
+    # Keep the row with the smallest MetaP for every SNP/eQTL-gene key.
+    coeqtls_lowest_nominalP = coeqtls.sort_values(by='MetaP', ascending=True).drop_duplicates(subset=['snp_eqtlgene'])
+    coeqtls_lowest_nominalP_dict = coeqtls_lowest_nominalP.set_index('snp_eqtlgene')['MetaP'].T.to_dict()
+    permutation_pvalues_df['SNP'] = [item.split('_')[0] for item in permutation_pvalues_df.index]
+    # The permutation table index is used as the SNP/eQTL-gene key for the lookup.
+    permutation_pvalues_df['nominalP'] = [coeqtls_lowest_nominalP_dict.get(snp) for snp in
+                                          permutation_pvalues_df.index]
+    permutation_pvalues_df = permutation_pvalues_df.dropna(subset=['nominalP'])
+    permutation_pvalues_df['beta_shape1'], permutation_pvalues_df['beta_shape2'] = \
+        zip(*[fit_beta_distribution(x)[0] for x in permutation_pvalues_df[permutation_cols].values])
+    # 1 - sf is the Beta CDF evaluated at the nominal p-value.
+    permutation_pvalues_df['pval_beta'] = [1-beta.sf(x[0], x[1], x[2]) for x in
+                                           permutation_pvalues_df[['nominalP', 'beta_shape1', 'beta_shape2']].values]
+    assert permutation_pvalues_df['pval_beta'].isnull().sum() == 0
+    # over all eqtls, perform BH-FDR
+    permutation_pvalues_df['qval'] = multipletests(permutation_pvalues_df['pval_beta'].values, method='fdr_bh')[1]
+    permutation_pvalues_df.to_csv(f'{saveprefix}.eqtls_betaadjustedPs.tsv.gz', sep='\t', compression='gzip')
+    # Threshold = midpoint between the smallest adjusted p-value with qval >= 0.05
+    # and the largest with qval <= 0.05; '.values[0]' raises IndexError when
+    # either side is empty.
+    ub = permutation_pvalues_df[permutation_pvalues_df['qval']>=0.05].sort_values(by=['pval_beta'], ascending=True)['pval_beta'].values[0]
+    lb = permutation_pvalues_df[permutation_pvalues_df['qval']<=0.05].sort_values(by=['pval_beta'], ascending=False)['pval_beta'].values[0]
+    pthreshold = (ub + lb) / 2
+    print('Minimum p-value threshold', pthreshold)
+    # Map the threshold back to a nominal p-value through each key's fitted Beta.
+    permutation_pvalues_df['threshold_per_betadistribution'] = [beta.ppf(pthreshold, x[0], x[1]) for x in
+                                                                permutation_pvalues_df[['beta_shape1', 'beta_shape2']].values]
+    permutation_pvalue_threshold_dict = permutation_pvalues_df.T.to_dict()
+    # The lookups below raise TypeError for a key that is absent from the
+    # permutation table, because dict.get then returns None.
+    coeqtls['snp_beta_shape1'] = [permutation_pvalue_threshold_dict.get(snp)['beta_shape1'] for snp in coeqtls['snp_eqtlgene'].values]
+    coeqtls['snp_beta_shape2'] = [permutation_pvalue_threshold_dict.get(snp)['beta_shape2'] for snp in coeqtls['snp_eqtlgene']]
+    coeqtls['snp_pvalbeta'] = [permutation_pvalue_threshold_dict.get(snp)['pval_beta'] for snp in coeqtls['snp_eqtlgene']]
+    coeqtls['snp_qval'] = [permutation_pvalue_threshold_dict.get(snp)['qval'] for snp in coeqtls['snp_eqtlgene']]
+    coeqtls['gene2_pthreshold'] = [permutation_pvalue_threshold_dict.get(snp)['threshold_per_betadistribution']
+                                   for snp in coeqtls['snp_eqtlgene']]
+    issig = lambda x:True if x[0] <= x[1] else False
+    coeqtls['gene2_isSig'] = [issig(item) for item in coeqtls[['MetaP', 'gene2_pthreshold']].values]
+    # Significant = key passes FDR and the row's MetaP is within the key's threshold.
+    significant_coeqtls = coeqtls[(coeqtls['snp_qval']<=0.05) & (coeqtls['gene2_isSig'])]
+    print('Significant results:', significant_coeqtls.shape[0])
+    coeqtls.to_csv(f'{saveprefix}.all.tsv.gz', sep='\t', compression='gzip')
+    significant_coeqtls.to_csv(f'{saveprefix}.sig.tsv.gz', sep='\t', compression='gzip')
+    return coeqtls
+
+
+# Run the correction only when executed as a script; the returned table is discarded.
+if __name__ == '__main__':
+    _ = main()
diff --git a/04_coeqtl_mapping/plot_celltype_overlap_upset.R b/04_coeqtl_mapping/plot_celltype_overlap_upset.R
new file mode 100644
index 0000000..8e5cd48
--- /dev/null
+++ b/04_coeqtl_mapping/plot_celltype_overlap_upset.R
@@ -0,0 +1,46 @@
+# ------------------------------------------------------------------------------
+# Generate an upset plot of overlap between cell types
+# Input: significant co-eQTL results per cell type
+# Output: upset plot
+# ------------------------------------------------------------------------------
+
+library(data.table)
+library(UpSetR)
+
+# Output directory for the plot. `outdir` was never defined in this script, so
+# fall back to the working directory unless the caller has already set it.
+if (!exists("outdir")) outdir <- "./"
+
+# Cell types to compare: display label -> suffix of the per-cell-type result directory
+cell_types <- c(Monocyte = "monocyte", `CD4+ T` = "CD4T", `CD8+ T` = "CD8T",
+                NK = "NK", DC = "DC", B = "B")
+
+# Read the significant co-eQTLs of each cell type and keep only the
+# SNP-gene-gene identifiers (column snp_genepair); list names are the display labels
+coeqtl_sets <- lapply(cell_types, function(cell_type) {
+  fread(paste0("coeqtl_mapping/output/filtered_results/UT_", cell_type,
+               "/coeqtls_fullresults_fixed.sig.tsv.gz"))$snp_genepair
+})
+
+pdf(paste0(outdir, "grn_plot_snp_gene_gene.pdf"))
+
+upset(fromList(coeqtl_sets),
+      set_size.show = TRUE, set_size.scale_max = 600,
+      mainbar.y.label = "SNP-Gene-Gene",
+      nintersects = 40, nsets = 10,
+      text.scale = 1.5)
+
+dev.off()
+
+#Count in how many of the six cell types each co-eQTL is significant
+#(identifiers are de-duplicated within each cell type first)
+all_coeqtls <- unlist(lapply(coeqtl_sets, unique), use.names = FALSE)
+
+occurrence <- data.frame(table(all_coeqtls))
+
+#Keep all coeQTLs that are part of at least three different cell types:
+most_occ <- occurrence[occurrence$Freq > 2, ]
+#Fraction of these frequent coeQTLs whose identifier starts with rs1131017 (RPS26 locus):
+mean(startsWith(as.character(most_occ$all_coeqtls), "rs1131017"))
+
diff --git a/04_coeqtl_mapping/plot_co-eQTL.R b/04_coeqtl_mapping/plot_co-eQTL.R
new file mode 100644
index 0000000..0357de4
--- /dev/null
+++ b/04_coeqtl_mapping/plot_co-eQTL.R
@@ -0,0 +1,152 @@
+############################################################################################################################
+# Code Author: Dylan de Vries
+# Name: plot_co-eQTL.R
+# Function: Plot co-eQTLs
+############################################################################################################################
+#
+# Libraries
+#
+############################################################################################################################
+library(data.table)
+library(ggplot2)
+library(ggbeeswarm)
+library(gridExtra)
+
+############################################################################################################################
+#
+# Functions
+#
+############################################################################################################################
+# Name: get.expression.data
+# Function: Collect the expression of two genes in the cells of one sample and cell type,
+#           and compute their Spearman co-expression for plotting purposes
+# Input:
+#   Name        Type        Description
+#   sample      character   sample name (matched against the `assignment` metadata column)
+#   cell.type   character   cell type (matched against the `cell_type_lowerres` metadata column)
+#   gene1       character   first gene to get data for
+#   gene2       character   second gene to get data for
+#   genotype    character   the genotype of the co-eQTL for this sample
+#
+# Output:
+#   A list with two elements for one sample: [[1]] the Spearman correlation between both
+#   genes over the selected cells (boxplot input), [[2]] a data frame with one row per cell
+#   (gene1.expression, gene2.expression, sample, genotype) for the expression regression plot
+# Note: reads the global object `data` (metadata and the `data` slot of the SCT assay)
+get.expression.data <- function(sample, cell.type, gene1, gene2, genotype){
+  # Names of the cells that belong to this sample and cell type
+  meta <- data@meta.data
+  in.selection <- meta$cell_type_lowerres == cell.type & meta$assignment == sample
+  cells <- rownames(meta[in.selection,])
+
+  sct.expression <- data@assays$SCT@data
+  expr1 <- sct.expression[gene1, cells]
+  expr2 <- sct.expression[gene2, cells]
+
+  list(
+    cor(expr1, expr2, method="spearman"),
+    data.frame(gene1.expression=expr1, gene2.expression=expr2, sample=sample, genotype=genotype)
+  )
+}
+
+# Name: prepare.plot.data
+# Function: Run get.expression.data for every sample and combine the results into data.frames
+# Input:
+#   Name        Type        Description
+#   gene1       character   first gene to get data for
+#   gene2       character   second gene to get data for
+#   SNP.name    character   the rs-ID for the co-eQTL SNP (row name in `genotypes_all`)
+#   cell.type   character   cell type to get the data for
+#
+# Output:
+#   A list with two data frames. The first has one row per sample (co.expression, sample,
+#   genotype) for the boxplots, the second one row per cell for the personalized
+#   expression regression plot
+# Note: reads the globals `samples` and `genotypes_all`
+prepare.plot.data <- function(gene1, gene2, SNP.name, cell.type){
+  # First pass: look up the genotype and collect the per-sample result
+  per.sample <- list()
+  genotypes <- c()
+  for (sample in samples){
+    sample.genotype <- genotypes_all[SNP.name, sample]
+    genotypes <- c(genotypes, sample.genotype)
+    per.sample[[length(per.sample) + 1]] <- get.expression.data(sample, cell.type, gene1, gene2, sample.genotype)
+  }
+
+  # Second pass: one correlation per sample, and all per-cell rows stacked in sample order
+  co.expressions <- unlist(lapply(per.sample, function(result) result[[1]]))
+  empty.expr.data <- data.frame(gene1.expression=numeric(0), gene2.expression=numeric(0), sample=character(0), genotype=character(0))
+  expr.plot.data <- Reduce(function(stacked, result) rbind(stacked, result[[2]]), per.sample, empty.expr.data)
+
+  plot.data <- data.frame(co.expression=co.expressions, sample=samples, genotype=genotypes)
+  list(plot.data, expr.plot.data)
+}
+
+# Name: plot.co.eQTL.boxplot
+# Function: Plot one co-eQTL as a two-panel PDF: per-cell expression of both genes with one
+#           regression line per sample (left) and per-sample co-expression by genotype (right)
+# Input:
+#   Name            Type        Description
+#   plot.data       data.frame  the data for the boxplot (co.expression, sample, genotype)
+#   expr.plot.data  data.frame  the data for the expression regression plot
+#   gene1           character   first gene to get data for
+#   gene2           character   second gene to get data for
+#   SNP.name        character   the rs-ID for the co-eQTL SNP
+#   cell.type       character   cell type to get the data for
+#   meta.z          numeric     meta z-score (accepted for compatibility, not used in the plot)
+#   QTL.type        character   indicates whether it's amongst the strongest, middle or weakest co-eQTLs
+#   QTL.type.index  character   the index of the co-eQTL within its type
+#   out.dir         character   root output directory (with trailing slash); defaults to the
+#                               location that was previously hard-coded
+#
+# Output:
+#   Called for its side effect: writes
+#   <out.dir><cell.type>/<cell.type>_co-eQTL_<SNP.name>_<gene1>-<gene2>.pdf
+plot.co.eQTL.boxplot <- function(plot.data, expr.plot.data, gene1, gene2, SNP.name, cell.type, meta.z, QTL.type, QTL.type.index,
+                                 out.dir="/groups/umcg-bios/tmp01/projects/1M_cells_scRNAseq/ongoing/co-eQTLs/plots/"){
+  # Colour per genotype; the extra "white" entry serves the constant fill="white"
+  # that is mapped inside aes() for the points of the boxplot panel
+  genotype.colors <- c("0/0"="#57a350", "0/1"="#fd7600", "1/1"="#383bfe", "white"="white")
+
+  # One colour per sample: a light-to-dark gradient within each genotype group
+  sample.color <- c(colorRampPalette(c("#9efc95", "#57a350"))(length(which(plot.data$genotype=="0/0"))),
+    colorRampPalette(c("#fabb84", "#fd7600"))(length(which(plot.data$genotype=="0/1"))),
+    colorRampPalette(c("#acadfc", "#383bfe"))(length(which(plot.data$genotype=="1/1"))))
+  names(sample.color) <- c(as.character(plot.data$sample[plot.data$genotype == "0/0"]), as.character(plot.data$sample[plot.data$genotype == "0/1"]), as.character(plot.data$sample[plot.data$genotype == "1/1"]))
+
+  # Theme shared by both panels
+  plot.theme <- theme(axis.text.x = element_text(angle = 90, hjust = 1, size=7), panel.border = element_rect(color="black", fill=NA, size=1.1), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), strip.background = element_rect(colour="white", fill="white"))
+
+  expr.plot <- ggplot(expr.plot.data, aes(x=gene1.expression, y=gene2.expression, fill=sample, color=sample)) + geom_point(size=0.5) +
+    geom_smooth(method = "lm", fullrange = TRUE, se=FALSE) +
+    scale_fill_manual(values=sample.color) +
+    scale_color_manual(values=sample.color) +
+    xlab(paste0(gene1, " expression")) +
+    ylab(paste0(gene2, " expression")) +
+    ggtitle(paste0(SNP.name, " effect on ", gene1, " - ", gene2, "\nco-expression")) +
+    guides(fill="none", color="none") +
+    plot.theme
+
+  # The original call passed `alpha` twice (alpha=1 and alpha=0.6); only one value can
+  # apply, so the ambiguous duplicate is removed and the first value (1) is kept.
+  box.plot <- ggplot(plot.data) + geom_boxplot(aes(x=genotype, y=co.expression, fill=genotype), outlier.shape=NA, alpha=0.6) +
+    geom_quasirandom(aes(x=genotype, y=co.expression, color=genotype, fill="white"), pch=21, size=2, alpha=1, dodge.width=0.4) +
+    scale_fill_manual(values=genotype.colors) +
+    scale_color_manual(values=genotype.colors) +
+    xlab("Genotype") +
+    ylab(paste0(gene1, " - ", gene2, " co-expression")) +
+    ggtitle(paste0(SNP.name, " co-eQTL\n", QTL.type, " ", QTL.type.index)) +
+    guides(fill="none", color="none") +
+    plot.theme
+
+  pdf(paste0(out.dir, cell.type, "/", cell.type, "_co-eQTL_", SNP.name, "_", gene1, "-", gene2, ".pdf"))
+  grid.arrange(expr.plot, box.plot, ncol=2)
+  dev.off()
+}
+
+############################################################################################################################
+#
+# Main code
+#
+############################################################################################################################
+data <- readRDS("/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/seurat_objects/1M_v2_mediumQC_ctd_rnanormed_demuxids_20201029.rds")
+vcf <- fread('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/genotypes/LL_trityper_plink_converted.vcf.gz')
+target.QTLs <- read.table("/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/filtered_results/UT_monocyte/coeqtls_fullresults.sig.tsv.gz", header=TRUE, sep="\t", stringsAsFactors=FALSE)
+# Rank the co-eQTLs by absolute meta z-score, strongest first
+target.QTLs <- target.QTLs[order(abs(target.QTLs$MetaPZ), decreasing=TRUE),]
+# Keep the VCF columns from the 10th onwards as genotype table, with the variant ID as row name
+genotypes_all <- as.data.frame(vcf[, 10:ncol(vcf)])
+rownames(genotypes_all) <- vcf$ID
+
+#Get the 10 strongest, 10 middling and 10 weakest of the input co-eQTLs
+# Each range must contain exactly n.per.type rows: the labelling in the loop below relies on
+# it (a:(a+10) and (n-10):n would select 11 rows each and shift the "medium"/"poor" labels)
+n.per.type <- 10
+n.QTLs <- nrow(target.QTLs)
+middle.start <- floor(n.QTLs/2)
+QTL.selection <- target.QTLs[c(1:n.per.type,
+                               middle.start:(middle.start + n.per.type - 1),
+                               (n.QTLs - n.per.type + 1):n.QTLs),]
+samples <- unique(data@meta.data$assignment)
+
+for (QTL.index in seq_len(nrow(QTL.selection))){
+  print(QTL.index)
+  # Translate the position in QTL.selection into a strength class and an index within that class
+  if (QTL.index <= n.per.type){
+    type <- "strong"
+    QTL.type.index <- QTL.index
+  } else if (QTL.index <= 2 * n.per.type){
+    type <- "medium"
+    QTL.type.index <- QTL.index - n.per.type
+  } else {
+    type <- "poor"
+    QTL.type.index <- QTL.index - 2 * n.per.type
+  }
+  # The Gene column holds both genes of the pair separated by ";"
+  genes <- unlist(strsplit(QTL.selection$Gene[QTL.index], ";"))
+  combined.plot.data <- prepare.plot.data(genes[1], genes[2], QTL.selection$SNP[QTL.index], "monocyte")
+  plot.data <- combined.plot.data[[1]]
+  expr.plot.data <- combined.plot.data[[2]]
+
+  plot.co.eQTL.boxplot(plot.data, expr.plot.data, genes[1], genes[2], QTL.selection$SNP[QTL.index], "monocyte", QTL.selection$MetaPZ[QTL.index], type, QTL.type.index)
+}
diff --git a/04_coeqtl_mapping/plot_effect_concordance_across_cohorts.R b/04_coeqtl_mapping/plot_effect_concordance_across_cohorts.R
new file mode 100644
index 0000000..b876047
--- /dev/null
+++ b/04_coeqtl_mapping/plot_effect_concordance_across_cohorts.R
@@ -0,0 +1,73 @@
+################################################################################
+# Compare effect sizes (Z-scores) calculated in each individual dataset
+# (before the meta-analysis)
+# Input: coeqtls results of the respective cell type
+# Output: pairwise plot showing the differences for each combination of cohorts
+################################################################################
+
+library(data.table) #fread is used below to load the coeQTL results
+library(GGally) #to generate pairwise comparison plots
+library(viridis)
+
+coeqtl_dir<-"coeqtl_mapping/output/filtered_results/"
+plot_dir<-"coeqtl_interpretation/plots_filtered/"
+
+cell_type<-"CD4T"
+
+# Load current set of coeQTL
+coeqtls<-fread(paste0(coeqtl_dir,"UT_",
+                      cell_type,"/coeqtls_fullresults.all.tsv.gz"))
+# The Gene column holds both genes separated by ";"
+coeqtls$gene1<-gsub(";.*","",coeqtls$Gene)
+coeqtls$gene2<-gsub(".*;","",coeqtls$Gene)
+
+# Gene 1 and 2 should be ordered alphabetically, but there is an issue regarding
+# small and capital letters (so order them again!)
+coeqtls$swap<-ifelse(coeqtls$gene1 > coeqtls$gene2,coeqtls$gene1,coeqtls$gene2)
+coeqtls$gene1<-ifelse(coeqtls$gene1 > coeqtls$gene2,coeqtls$gene2,coeqtls$gene1)
+coeqtls$gene2<-coeqtls$swap
+coeqtls$swap<-NULL
+
+# Filter for significant coeQTLs
+sign_coeqtls<-coeqtls[coeqtls$gene2_isSig == "TRUE" &
+                        coeqtls$snp_qval <= 0.05,]
+
+print(paste(nrow(sign_coeqtls),"significant coeQTLs from",
+            nrow(coeqtls),"pairs"))
+print(paste("CoeQTLs consisting of:",
+            length(unique(sign_coeqtls$GeneSymbol)), "unique gene pairs from",
+            length(unique(c(sign_coeqtls$gene1,sign_coeqtls$gene2))),"unique genes",
+            "and",length(unique(sign_coeqtls$SNP)),"unique SNPs"))
+
+# Check Z score distribution
+# The per-dataset Z-scores are stored as one ";"-separated string per coeQTL,
+# in the dataset order given in the column name
+z_scores<-strsplit(sign_coeqtls$`DatasetZScores(ng;onemillionv2;onemillionv3;stemiv2)`,
+                   split=";")
+z_scores<-matrix(as.numeric(unlist(z_scores)),ncol=4,byrow=TRUE)
+z_scores<-as.data.frame(z_scores)
+colnames(z_scores)<-c("ng","onemillionv2","onemillionv3","stemiv2")
+z_scores$coeqtl<-sign_coeqtls$snp_genepair
+z_scores$meta_z<-sign_coeqtls$MetaPZ
+
+#Flip the sign of the Z-scores when the effect allele frequency is >= 0.5,
+#so that the direction is always expressed relative to the minor allele
+z_scores$AF<-sign_coeqtls$SNPEffectAlleleFreq
+for(colN in c("ng","onemillionv2","onemillionv3","stemiv2","meta_z")){
+  z_scores[,colN]<-ifelse(z_scores$AF>=0.5,z_scores[,colN]*(-1),z_scores[,colN])
+}
+
+#Rename Z score columns
+colnames(z_scores)[1:4]<-c("van der Wijst","Oelen (v2)","Oelen (v3)", "van Blokland (v2)")
+z_scores<-z_scores[,c("Oelen (v2)","Oelen (v3)", "van Blokland (v2)",
+                      "van der Wijst","coeqtl","meta_z","AF")]
+#Plot comparison of Z scores between cohorts
+#Lower-triangle panel: 2D-binned density of the Z-scores of two cohorts with zero lines
+lowerfun <- function(data,mapping){
+  ggplot(data = data, mapping = mapping)+
+    geom_bin2d()+
+    scale_fill_viridis("Density",breaks=c(2,7),labels = c("Low", "High"))+
+    geom_hline(yintercept=0)+geom_vline(xintercept=0)
+}
+
+#Only the first four columns (the per-cohort Z-scores) are compared
+g<-ggpairs(z_scores[1:4],
+           lower=list(continuous=wrap(lowerfun)),
+           legend=c(2,1))+
+  theme(legend.position = "bottom")
+ggsave(filename=paste0(plot_dir,cell_type,
+                       "_zscore_dist_cohorts.pdf"),
+       plot=g,
+       height=6,width=6)
\ No newline at end of file
diff --git a/04_coeqtl_mapping/plot_example_imputed_zero.ipynb b/04_coeqtl_mapping/plot_example_imputed_zero.ipynb
new file mode 100644
index 0000000..fc28e77
--- /dev/null
+++ b/04_coeqtl_mapping/plot_example_imputed_zero.ipynb
@@ -0,0 +1,571 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import os\n",
+ "import re\n",
+ "from pathlib import Path\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import scanpy as sc\n",
+ "from scipy.stats import spearmanr, pearsonr\n",
+ "from scipy.stats import t, norm\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "\n",
+ "def get_time(x):\n",
+ " if x == 'UT':\n",
+ " return x\n",
+ " else:\n",
+ " pattern = re.compile(r'\\d+h')\n",
+ " return re.findall(pattern, x)[0]\n",
+ "\n",
+ "\n",
+ "class DATASET:\n",
+ " def __init__(self, datasetname):\n",
+ " self.name = datasetname\n",
+ " self.path_prefix = Path(\"./seurat_objects\")\n",
+ " self.information = self.get_information()\n",
+ " def get_information(self):\n",
+ " if self.name == 'onemillionv2':\n",
+ " self.path = '1M_v2_mediumQC_ctd_rnanormed_demuxids_20201029.sct.h5ad'\n",
+ " self.individual_id_col = 'assignment'\n",
+ " self.timepoint_id_col = 'time'\n",
+ " self.celltype_id = 'cell_type_lowerres'\n",
+ " self.chosen_condition = {'UT': 'UT',\n",
+ " 'stimulated': '3h'}\n",
+ " elif self.name == 'onemillionv3':\n",
+ " self.path = '1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.SCT.h5ad'\n",
+ " self.individual_id_col = 'assignment'\n",
+ " self.timepoint_id_col = 'time'\n",
+ " self.celltype_id = 'cell_type_lowerres'\n",
+ " self.chosen_condition = {'UT': 'UT',\n",
+ " 'stimulated': '3h'}\n",
+ " elif self.name == 'stemiv2':\n",
+ " self.path = 'cardio.integrated.20210301.stemiv2.h5ad'\n",
+ " self.individual_id_col = 'assignment.final'\n",
+ " self.timepoint_id_col = 'timepoint.final'\n",
+ " self.celltype_id = 'cell_type_lowerres'\n",
+ " self.chosen_condition = {'UT': 't8w',\n",
+ " 'stimulated': 'Baseline'}\n",
+ " elif self.name == 'ng':\n",
+ " self.path = 'pilot3_seurat3_200420_sct_azimuth.h5ad'\n",
+ " self.individual_id_col = 'snumber'\n",
+ " self.celltype_id = 'cell_type_mapped_to_onemillion'\n",
+ " else:\n",
+ " raise IOError(\"Dataset name not understood.\")\n",
+ " def load_dataset(self):\n",
+ " self.get_information()\n",
+ " print(f'Loading dataset {self.name} from {self.path_prefix} {self.path}')\n",
+ " self.data_sc = sc.read_h5ad(self.path_prefix / self.path)\n",
+ " if self.name.startswith('onemillion'):\n",
+ " self.data_sc.obs['time'] = [get_time(item) for item in self.data_sc.obs['timepoint']]\n",
+ " elif self.name == 'ng':\n",
+ " celltype_maping = {'CD4 T': 'CD4T', 'CD8 T': 'CD8T', 'Mono': 'monocyte', 'DC': 'DC', 'NK': 'NK',\n",
+ " 'other T': 'otherT', 'other': 'other', 'B': 'B'}\n",
+ " self.data_sc.obs['cell_type_mapped_to_onemillion'] = [celltype_maping.get(name) for name in\n",
+ " self.data_sc.obs['predicted.celltype.l1']]\n",
+ " def get_cMono_ncMono(self):\n",
+ " def tell_cmono_foronemillion(x):\n",
+ " if x == 'mono 1' or x == 'mono 3' or x == 'mono 4':\n",
+ " return 'cMono'\n",
+ " elif x == 'mono 2':\n",
+ " return 'ncMono'\n",
+ " if self.name.startswith('onemillion'):\n",
+ " self.data_sc.obs['sub_monocytes'] = [tell_cmono_foronemillion(x) for x in\n",
+ " self.data_sc.obs['cell_type']]\n",
+ " self.cmono = self.data_sc[self.data_sc.obs['sub_monocytes'] == 'cMono']\n",
+ " self.ncmono = self.data_sc[self.data_sc.obs['sub_monocytes'] == 'ncMono']\n",
+ " elif self.name.startswith('stemi'):\n",
+ " self.cmono = self.data_sc[self.data_sc.obs['cell_type'] == 'cMono']\n",
+ " self.ncmono = self.data_sc[self.data_sc.obs['cell_type'] == 'ncMono']\n",
+ " elif self.name == 'ng':\n",
+ " self.cmono = self.data_sc[self.data_sc.obs['predicted.celltype.l2'] == 'CD14 Mono']\n",
+ " self.ncmono = self.data_sc[self.data_sc.obs['predicted.celltype.l2'] == 'CD16 Mono']\n",
+ " else:\n",
+ " raise IOError(\"Dataset name not understood.\")\n",
+ "\n",
+ "example_savedir = Path(\n",
+ " \"/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/examples\"\n",
+ ")\n",
+ "\n",
+ "import subprocess\n",
+ "bashfile_path = '/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/bios/select_snps_from_vcf.sh'\n",
+ "def get_snps_from_vcffile(bashfile_path, vcf_path, snps_path, savepath):\n",
+ " response = subprocess.run([bashfile_path, vcf_path, snps_path, savepath])\n",
+ " print(response)\n",
+ " return None\n",
+ "\n",
+ "# sample id mapping\n",
+ "gtefile = pd.read_csv(\n",
+ " '/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/summary/gte-fix.tsv',\n",
+ " sep='\\t'\n",
+ ")\n",
+ "gte_dict = gtefile.set_index(\"expressionsampleID\")[\"genotypesampleID\"].T.to_dict()\n",
+ "\n",
+ "\n",
+ "def corr_to_z(coef, num):\n",
+ " t_statistic = coef * np.sqrt((num - 2) / (1 - coef ** 2))\n",
+ " prob = t.cdf(t_statistic, num - 2)\n",
+ " z_score = norm.ppf(prob)\n",
+ " positive_coef_probs = 1 - prob\n",
+ " positive_coef_probs[coef < 0] = 0\n",
+ " negative_coef_probs = prob\n",
+ " negative_coef_probs[coef > 0] = 0\n",
+ " probs = negative_coef_probs + positive_coef_probs\n",
+ " return z_score, probs\n",
+ "\n",
+ "\n",
+ "def get_individual_networks_selected_genepairs(data_df, data_sc, individual_colname, genepair, fillna=False):\n",
+ "# data_df = pd.DataFrame(data=data_sc.X.toarray(),\n",
+ "# index=data_sc.obs.index,\n",
+ "# columns=data_sc.var.index)\n",
+ " gene1, gene2 = genepair.split(';')\n",
+ " sorted_genepair = [';'.join(sorted([gene1, gene2]))]\n",
+ " coef_df = pd.DataFrame(index=sorted_genepair)\n",
+ " coef_p_df = pd.DataFrame(index=sorted_genepair)\n",
+ " zscore_df = pd.DataFrame(index=sorted_genepair)\n",
+ " zscore_p_df = pd.DataFrame(index=sorted_genepair)\n",
+ " data_selected_df = data_df[[gene1, gene2]]\n",
+ " print(\n",
+ " f\"Calculating networks for {len(data_sc.obs[individual_colname].unique())} individuals and;\\n{genepair}\"\n",
+ " )\n",
+ " for ind_id in tqdm(data_sc.obs[individual_colname].unique()):\n",
+ " cell_num = data_sc.obs[data_sc.obs[individual_colname] == ind_id].shape[0]\n",
+ " if cell_num > 10:\n",
+ " individual_df = data_selected_df.loc[data_sc.obs[individual_colname] == ind_id]\n",
+ " individual_coefs, individual_coef_ps = spearmanr(individual_df.values, axis=0)\n",
+ " if data_selected_df.shape[1] == 2:\n",
+ " individual_coefs_flatten = pd.DataFrame(data = [individual_coefs],\n",
+ " index = sorted_genepair)\n",
+ " individual_coef_ps_flatten = \\\n",
+ " pd.DataFrame(data=[individual_coef_ps],\n",
+ " index=sorted_genepair)\n",
+ " else:\n",
+ " individual_coefs_flatten = pd.DataFrame(\n",
+ " data=individual_coefs[np.triu_indices_from(individual_coefs, 1)],\n",
+ " index=sorted_genepair).loc[sorted_genepair]\n",
+ " individual_coef_ps_flatten = \\\n",
+ " pd.DataFrame(data=individual_coef_ps[np.triu_indices_from(individual_coefs, 1)],\n",
+ " index=sorted_genepair).loc[sorted_genepair]\n",
+ " coef_df[ind_id] = individual_coefs_flatten\n",
+ " coef_p_df[ind_id] = individual_coef_ps_flatten\n",
+ " try:\n",
+ " individual_zscores_flatten, individual_zscore_ps_flatten = corr_to_z(\n",
+ " individual_coefs_flatten.values,\n",
+ " cell_num\n",
+ " )\n",
+ " zscore_df[ind_id] = individual_zscores_flatten\n",
+ " zscore_p_df[ind_id] = individual_zscore_ps_flatten\n",
+ " except:\n",
+ " continue\n",
+ " else:\n",
+ " print(\"Deleted this individual because of low cell number\", cell_num)\n",
+ " if fillna:\n",
+ " zscore_df = zscore_df.fillna(0)\n",
+ " return data_selected_df, zscore_df, zscore_p_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loading dataset onemillionv2 from /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/seurat_objects 1M_v2_mediumQC_ctd_rnanormed_demuxids_20201029.sct.h5ad\n"
+ ]
+ }
+ ],
+ "source": [
+ "datasetname = 'onemillionv2'\n",
+ "dataset = DATASET(datasetname)\n",
+ "dataset.load_dataset()\n",
+ "data_sc = dataset.data_sc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CompletedProcess(args=['/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/bios/select_snps_from_vcf.sh', '/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/genotypevcfs/chr1/GenotypeData.vcf.gz', PosixPath('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/examples/snplist.rs221045'), PosixPath('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/examples/rs221045.vcf')], returncode=0)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " #CHROM | \n",
+ " POS | \n",
+ " ID | \n",
+ " REF | \n",
+ " ALT | \n",
+ " QUAL | \n",
+ " FILTER | \n",
+ " INFO | \n",
+ " FORMAT | \n",
+ " 1_LLDeep_1191 | \n",
+ " ... | \n",
+ " s21 | \n",
+ " s43 | \n",
+ " s24 | \n",
+ " s23 | \n",
+ " s45 | \n",
+ " s26 | \n",
+ " s25 | \n",
+ " s28 | \n",
+ " s27 | \n",
+ " s29 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 16530049 | \n",
+ " rs221045 | \n",
+ " T | \n",
+ " C | \n",
+ " . | \n",
+ " . | \n",
+ " . | \n",
+ " GT:DS | \n",
+ " 0/0:0.03 | \n",
+ " ... | \n",
+ " 0/1:1.0 | \n",
+ " 0/0:0.010000000000000009 | \n",
+ " 0/1:1.0 | \n",
+ " 0/0:0.0 | \n",
+ " 0/0:0.0 | \n",
+ " 1/1:2.0 | \n",
+ " 0/0:0.0 | \n",
+ " 0/1:1.0 | \n",
+ " 0/0:0.0 | \n",
+ " 0/1:1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1 rows × 182 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1_LLDeep_1191 \\\n",
+ "0 1 16530049 rs221045 T C . . . GT:DS 0/0:0.03 \n",
+ "\n",
+ " ... s21 s43 s24 s23 s45 s26 \\\n",
+ "0 ... 0/1:1.0 0/0:0.010000000000000009 0/1:1.0 0/0:0.0 0/0:0.0 1/1:2.0 \n",
+ "\n",
+ " s25 s28 s27 s29 \n",
+ "0 0/0:0.0 0/1:1.0 0/0:0.0 0/1:1.0 \n",
+ "\n",
+ "[1 rows x 182 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "celltype = 'monocyte'\n",
+ "snp_id = 'rs221045'\n",
+ "chromosome = '1'\n",
+ "snp_vcf_path = example_savedir/f'{snp_id}.vcf'\n",
+ "with open(example_savedir/f'snplist.{snp_id}', 'w') as f:\n",
+ " f.write(f'{snp_id}\\n')\n",
+ "vcf_path = f'/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/genotypevcfs/chr{chromosome}/GenotypeData.vcf.gz'\n",
+ "get_snps_from_vcffile(bashfile_path, vcf_path, example_savedir/f'snplist.{snp_id}', snp_vcf_path)\n",
+ "gt = pd.read_csv(snp_vcf_path, sep='\\t', skiprows=6)\n",
+ "gt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Calculating networks for 72 individuals and;\n",
+ "AC005076.5;ARHGEF19\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 0%| | 0/72 [00:00, ?it/s]/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/envs/scpy3.8/lib/python3.8/site-packages/scipy/stats/stats.py:4264: SpearmanRConstantInputWarning: An input array is constant; the correlation coefficent is not defined.\n",
+ " warnings.warn(SpearmanRConstantInputWarning())\n",
+ "100%|██████████| 72/72 [00:00<00:00, 210.51it/s]\n",
+ "/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/envs/scpy3.8/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 42.5% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.\n",
+ " warnings.warn(msg, UserWarning)\n",
+ "/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/envs/scpy3.8/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 7.1% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.\n",
+ " warnings.warn(msg, UserWarning)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Not Imputed SpearmanrResult(correlation=-0.028018282506059713, pvalue=0.8942369051146191)\n",
+ "Imputed SpearmanrResult(correlation=-0.24638574744096847, pvalue=0.03833253459364005)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# genepair = 'RP1-29C18.10;ZNF501'\n",
+ "# genepair = 'CCDC15;UNC5B'\n",
+ "# genepair = 'GSTM3;RP1-29C18.10'\n",
+ "# genepair = 'MMEL1;SARS2'\n",
+ "genepair = 'AC005076.5;ARHGEF19'\n",
+ "gene1, gene2 = genepair.split(';')\n",
+ "\n",
+ "if datasetname == 'ng':\n",
+ " ut_celltype = data_sc[data_sc.obs[dataset.celltype_id]==celltype]\n",
+ "else:\n",
+ " ut_celltype = data_sc[(data_sc.obs[dataset.celltype_id]==celltype) &\n",
+ " (data_sc.obs[dataset.timepoint_id_col]==dataset.chosen_condition['UT'])]\n",
+ "\n",
+ "ut_celltype_df = pd.DataFrame(data=ut_celltype.X.toarray(),\n",
+ " columns=ut_celltype.var.index,\n",
+ " index=ut_celltype.obs.index)\n",
+ "selected_expression_df, ut_zscore_df, ut_zscore_p_df = get_individual_networks_selected_genepairs(\n",
+ " data_df = ut_celltype_df,\n",
+ " data_sc = ut_celltype,\n",
+ " individual_colname = dataset.individual_id_col,\n",
+ " genepair = genepair,\n",
+ " fillna=False\n",
+ ")\n",
+ "\n",
+ "ut_t = ut_zscore_df.T\n",
+ "ut_t['gt_sampleid'] = [gte_dict.get(name) for name in ut_t.index]\n",
+ "ut_t = ut_t.set_index('gt_sampleid')\n",
+ "common_individuals = list(set(gt.columns) & set(ut_t.index))\n",
+ "gt_t = gt[common_individuals].T\n",
+ "gt_t['genotype'] = [item.split(':')[0].count('1') for item in gt_t[0]]\n",
+ "concat_df = pd.concat([gt_t, ut_t], axis=1).replace([np.inf, -np.inf], np.nan).dropna()\n",
+ "print('Not Imputed', spearmanr(concat_df['genotype'], concat_df[genepair]))\n",
+ "\n",
+ "ut_t_imputed = ut_zscore_df.fillna(0).T\n",
+ "ut_t_imputed['gt_sampleid'] = [gte_dict.get(name) for name in ut_t_imputed.index]\n",
+ "ut_t_imputed = ut_t_imputed.set_index('gt_sampleid')\n",
+ "common_individuals_imputed = list(set(gt.columns) & set(ut_t_imputed.index))\n",
+ "gt_t_imputed = gt[common_individuals_imputed].T\n",
+ "gt_t_imputed['genotype'] = [item.split(':')[0].count('1') for item in gt_t_imputed[0]]\n",
+ "concat_imputed_df = pd.concat([gt_t_imputed, ut_t_imputed], axis=1).replace([np.inf, -np.inf], np.nan).dropna()\n",
+ "print('Imputed', spearmanr(concat_imputed_df['genotype'], concat_imputed_df[genepair]))\n",
+ "\n",
+ "# dosage_dict = gt_t['genotype'].T.to_dict()\n",
+ "# selected_expression_df_withsample = pd.concat([selected_expression_df,\n",
+ "# ut_celltype.obs[[dataset.individual_id_col]]],\n",
+ "# axis=1)\n",
+ "# selected_expression_df_withsample['gt_sampleid'] = [gte_dict.get(name) for name in\n",
+ "# selected_expression_df_withsample[dataset.individual_id_col]]\n",
+ "# selected_expression_df_withsample['genotype'] = [dosage_dict.get(gt_sampleid) for gt_sampleid in\n",
+ "# selected_expression_df_withsample['gt_sampleid']]\n",
+ "\n",
+ "sns.set_style('white')\n",
+ "refallele = gt['REF'].values[0]\n",
+ "altallele = gt['ALT'].values[0]\n",
+ "snp_name = f'{snp_id}_{altallele}'\n",
+ "\n",
+ "_, axes = plt.subplots(1, 2, figsize=(10, 5), sharey=True)\n",
+ "ax1, ax2 = axes\n",
+ "\n",
+ "im_coef, im_p = spearmanr(concat_imputed_df['genotype'], concat_imputed_df[genepair])\n",
+ "sns.violinplot(x=concat_imputed_df['genotype'], \n",
+ " y=concat_imputed_df[genepair], \n",
+ " ax=ax1,\n",
+ " inner=None)\n",
+ "sns.swarmplot(x=concat_imputed_df['genotype'], \n",
+ " y=concat_imputed_df[genepair], \n",
+ " ax=ax1,\n",
+ " color='black')\n",
+ "ax1.set_title(f'Imputed r={im_coef:.2f}; pvalue {im_p:.4f}')\n",
+ "# ax1.set_xticklabels([f'{refallele}{refallele}', \n",
+ "# f'{refallele}{altallele}',\n",
+ "# f'{altallele}{altallele}'])\n",
+ "ax1.set_xlabel(snp_id)\n",
+ "\n",
+ "coef, p = spearmanr(concat_df['genotype'], concat_df[genepair])\n",
+ "sns.violinplot(x=concat_df['genotype'], \n",
+ " y=concat_df[genepair], \n",
+ " ax=ax2,\n",
+ " inner=None)\n",
+ "sns.swarmplot(x=concat_df['genotype'], \n",
+ " y=concat_df[genepair], \n",
+ " ax=ax2,\n",
+ " color='black')\n",
+ "ax2.set_xlabel('')\n",
+ "ax2.set_title(f'Not Imputed r={coef:.2f}; pvalue {p:.4f}')\n",
+ "# ax2.set_xticklabels([f'{refallele}{refallele}', \n",
+ "# f'{refallele}{altallele}',\n",
+ "# f'{altallele}{altallele}'])\n",
+ "ax2.set_xlabel(snp_id)\n",
+ "plt.savefig(example_savedir/f'{snp_name}_ref{refallele}_alt{altallele}_{gene1}_{gene2}.{celltype}_{datasetname}.full.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/envs/scpy3.8/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 42.5% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.\n",
+ " warnings.warn(msg, UserWarning)\n",
+ "/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/envs/scpy3.8/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 7.1% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.\n",
+ " warnings.warn(msg, UserWarning)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "_, axes = plt.subplots(1, 2, figsize=(10, 5), sharey=True)\n",
+ "ax1, ax2 = axes\n",
+ "\n",
+ "im_coef, im_p = spearmanr(concat_imputed_df['genotype'], concat_imputed_df[genepair])\n",
+ "# sns.violinplot(x=concat_imputed_df['genotype'], \n",
+ "# y=concat_imputed_df[genepair], \n",
+ "# ax=ax1,\n",
+ "# inner=None)\n",
+ "sns.swarmplot(x=concat_imputed_df['genotype'], \n",
+ " y=concat_imputed_df[genepair], \n",
+ " ax=ax1,\n",
+ " color='black')\n",
+ "sns.regplot(x=concat_imputed_df['genotype'], \n",
+ " y=concat_imputed_df[genepair], \n",
+ " ax=ax1, scatter=False)\n",
+ "ax1.set_title(f'Imputed r={im_coef:.2f}; pvalue {im_p:.4f}')\n",
+ "ax1.set_xticklabels([f'{refallele}{refallele}', \n",
+ " f'{refallele}{altallele}',\n",
+ " f'{altallele}{altallele}'])\n",
+ "ax1.set_xlabel(snp_id)\n",
+ "\n",
+ "coef, p = spearmanr(concat_df['genotype'], concat_df[genepair])\n",
+ "# sns.violinplot(x=concat_df['genotype'], \n",
+ "# y=concat_df[genepair], \n",
+ "# ax=ax2,\n",
+ "# inner=None)\n",
+ "sns.swarmplot(x=concat_df['genotype'], \n",
+ " y=concat_df[genepair], \n",
+ " ax=ax2,\n",
+ " color='black')\n",
+ "sns.regplot(x=concat_df['genotype'], \n",
+ " y=concat_df[genepair], \n",
+ " ax=ax2, scatter=False)\n",
+ "ax2.set_xlabel('')\n",
+ "ax2.set_title(f'Not Imputed r={coef:.2f}; pvalue {p:.4f}')\n",
+ "ax2.set_xticklabels([f'{refallele}{refallele}', \n",
+ " f'{refallele}{altallele}',\n",
+ " f'{altallele}{altallele}'])\n",
+ "ax2.set_xlabel(snp_id)\n",
+ "plt.savefig(example_savedir/f'{snp_name}_ref{refallele}_alt{altallele}_{gene1}_{gene2}.{celltype}_{datasetname}.full.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PosixPath('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/examples/rs221045_C_refT_altC_AC005076.5_ARHGEF19.monocyte_onemillionv2.full.pdf')"
+ ]
+ },
+ "execution_count": 112,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "example_savedir/f'{snp_name}_ref{refallele}_alt{altallele}_{gene1}_{gene2}.{celltype}_{datasetname}.full.pdf'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/04_coeqtl_mapping/power_analysis_coeqtls.R b/04_coeqtl_mapping/power_analysis_coeqtls.R
new file mode 100644
index 0000000..ff2e515
--- /dev/null
+++ b/04_coeqtl_mapping/power_analysis_coeqtls.R
@@ -0,0 +1,79 @@
+################################################################################
+# Evaluate how number of triplets decreases the power to detect
+# co-expression QTLs by calculating the power dependent on the sample size
+# (N=173), heritability (Rsq: 0.1-0.3), Bonferroni multiple testing correction,
+# and different number of tests
+# The number of tests is estimated based on different expression cutoffs for
+# the Oelen v3 dataset, assuming that all pairwise combinations are tested for
+# all genes above the respective cutoff and one SNP per pair
+# Input: Seurat object with data from Oelen v3
+# Output: line plot visualizing power for different number of tests
+################################################################################
+
+library(Seurat)
+library(scPower)
+library(ggplot2)
+
+theme_set(theme_bw())
+
+################################################################################
+# Getting expression distribution for Oelen v3 dataset
+################################################################################
+
+#Load complete seurat object
+seurat<-readRDS("seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.rds")
+
+#Filter for monocytes and UT timepoint
+seurat<-seurat[,seurat$cell_type_lowerres == "monocyte"]
+seurat<-seurat[,seurat$timepoint == "UT"]
+
+#Calculate for each gene the non-zero ratio (fraction of cells with a count > 0).
+#The comparison and the row means are taken on the count matrix as stored;
+#as.matrix() would first allocate a dense genes x cells copy of it.
+nonzero_ratio<-Matrix::rowMeans(seurat@assays$SCT@counts>0)
+
+#Number of genes whose non-zero ratio exceeds each cutoff
+thresholds<-seq(0,1,0.05)
+num_genes<-sapply(thresholds,function(i)sum(nonzero_ratio>i))
+
+#Collect cutoffs and gene counts in a data frame (kept in memory, not written to disk)
+nonzero_count<-data.frame(nonzero_ratio=thresholds,num_genes)
+
+################################################################################
+# Performing power calculation
+################################################################################
+
+#Number of samples in the meta-analysis
+nSamples<-173
+
+#Bonferroni-adjusted significance level for nTests tests at alpha = 0.05
+bonfLevel<-function(nTests){
+  0.05/nTests
+}
+
+#Heritabilities (Rsq) to evaluate
+Rsq<-seq(0.1,0.3,0.05)
+
+#Number of tests per cutoff: all unordered pairs of the genes above the cutoff
+nonzero_count$genepairs<-nonzero_count$num_genes*(nonzero_count$num_genes-1)/2
+
+#One result row per (heritability, cutoff) combination; the last cutoff row
+#is left out of the calculation
+cutoff_rows<-1:(nrow(nonzero_count)-1)
+res<-do.call(rbind,lapply(Rsq,function(her){
+  do.call(rbind,lapply(cutoff_rows,function(i){
+    nTests<-nonzero_count$genepairs[i]
+    data.frame(her,
+               numTests=nTests,
+               cutoff=nonzero_count$nonzero_ratio[i],
+               power=scPower:::power.eqtl.ftest(her,
+                                                bonfLevel(nTests),
+                                                nSamples))
+  }))
+}))
+
+#Line plot of power against the number of tests (log10 x axis),
+#one line per heritability
+g<-ggplot(data=res,mapping=aes(x=numTests,y=power,color=as.factor(her)))+
+  geom_line()+
+  scale_x_log10()+
+  scale_color_discrete(name="Heritability")+
+  labs(x="Number tests",y="Power")
+print(g)
+#Write the figure to disk
+ggsave(filename="power_calculation/power_effect_nonzeroratio.png",plot=g,
+       height=5,width=6)
+
diff --git a/04_coeqtl_mapping/prepare_for_rb_calculation.py b/04_coeqtl_mapping/prepare_for_rb_calculation.py
new file mode 100644
index 0000000..9eaf191
--- /dev/null
+++ b/04_coeqtl_mapping/prepare_for_rb_calculation.py
@@ -0,0 +1,307 @@
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from scipy.stats import pearsonr
+import argparse
+
+
+def argumentsparser():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--filtertype', type=str, dest='filtertype')
+ return parser
+
+def prepare_for_rb_BIOS_replication(celltype, filtertype, bios_replication_type='onlyRNAAlignMetrics'):
+ '''
+ Rb Calculation preparation for BIOS replication
+ '''
+ workdir = Path("./coeqtl_mapping")
+ coeqtl_path = workdir/f'output/{filtertype}/UT_{celltype}/coeqtls_fullresults_fixed.sig.withbios{bios_replication_type}.tsv.gz'
+ coeqtl_df = pd.read_csv(coeqtl_path, sep='\t', compression='gzip')
+ coeqtl_df['theta'] = 0
+ def flip_direction(allele1, allele2, coef2):
+ if allele1 == allele2:
+ return coef2
+ else:
+ return -1*coef2
+ coeqtl_df['flipped_bios_beta'] = [flip_direction(item[0],
+ item[1],
+ item[2]) for item in
+ coeqtl_df[['SNPEffectAllele',
+ 'assessed_allele_bios',
+ 'coef_bios']].values]
+ coeqtl_df[['snp_genepair', 'snp_eqtlgene',
+ 'flipped_bios_beta', 'std err_bios',
+ 'MetaBeta', 'MetaSE', 'theta']].dropna().to_csv(
+ workdir/f'bios/{bios_replication_type}/{filtertype}/UT_{celltype}/replication_parameters.csv'
+ )
+ return coeqtl_df
+
+
+def find_gene2(genepair, eqtlgene):
+ gene1, gene2 = genepair.split(';')
+ if gene1 == eqtlgene:
+ return gene2
+ else:
+ return gene1
+
+
+def flip_direction(df, flipcol, allele1_col, allele2_col):
+ df = df.rename({flipcol: f'{flipcol}_ori'}, axis=1)
+ def flip(x1, x2, x3):
+ if not pd.isnull(x1):
+ if x2 == x3:
+ return x1
+ else:
+ return -x1
+ else:
+ return x1
+ df[f'{flipcol}'] = [flip(score, allele1, allele2) for (score, allele1,allele2)
+ in df[[f'{flipcol}_ori', allele1_col, allele2_col]].values]
+ return df
+
+
+# coeQTLs: for every ordered pair of distinct cell types, join the '.all' results of one cell type with the '.sig' results of the other and write betas, SEs and theta for the Rb calculation
+args = argumentsparser().parse_args()
+filtertype = args.filtertype
+workdir = Path("/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping")  # absolute cluster path; later sections of this file rebind workdir to relative paths
+celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'B', 'NK']
+for celltype_replication in celltypes:
+ print(f"Discovery: {celltype_replication}")  # NOTE: the printed label says "Discovery" although the variable is the replication cell type
+ replication_coeqtl_path = workdir / f'output/{filtertype}/UT_{celltype_replication}/coeqtls_fullresults_fixed.all.tsv.gz'
+ replication_coeqtl_df = pd.read_csv(replication_coeqtl_path, sep='\t', compression='gzip')
+ replication_coeqtl_df['gene2'] = [find_gene2(x[0], x[1]) for x in  # partner of the eQTL gene within the pair stored in 'Gene'
+ replication_coeqtl_df[['Gene',
+ 'eqtlgene']].values]
+ replication_coeqtl_df['snp_eqtlgene_gene2'] = ['_'.join([item[0], item[1]]) for item in  # row key '<snp_eqtlgene>_<gene2>'
+ replication_coeqtl_df[['snp_eqtlgene',
+ 'gene2']].values]
+ replication_coeqtl_df = replication_coeqtl_df.set_index('snp_eqtlgene_gene2')
+ replication_coexpression_df = pd.read_csv(workdir/f'input/individual_networks/UT/UT_{celltype_replication}.sigcoeQTLs.tsv.gz',  # first column becomes the row index; rows are selected below by the values of the 'Gene' column
+ compression='gzip', sep='\t', index_col=0)
+ for celltype_discovery in celltypes:
+ if celltype_replication != celltype_discovery:
+ print(f"Replication: {celltype_discovery}")  # NOTE: label and variable are swapped here as well
+ discovery_coeqtl_path = workdir / f'output/{filtertype}/UT_{celltype_discovery}/coeqtls_fullresults_fixed.sig.tsv.gz'
+ discovery_coeqtl_df = pd.read_csv(discovery_coeqtl_path, sep='\t', compression='gzip')
+ discovery_coeqtl_df['gene2'] = [find_gene2(x[0], x[1]) for x in
+ discovery_coeqtl_df[['Gene',
+ 'eqtlgene']].values]
+ discovery_coeqtl_df['snp_eqtlgene_gene2'] = ['_'.join([item[0], item[1]]) for item in
+ discovery_coeqtl_df[['snp_eqtlgene',
+ 'gene2']].values]
+ discovery_coeqtl_df = discovery_coeqtl_df.set_index('snp_eqtlgene_gene2')
+ tested_coeqtls = list(set(replication_coeqtl_df.index) & set(discovery_coeqtl_df.index))  # keys present in both tables
+ merged_coeqtl_df = pd.concat([replication_coeqtl_df.loc[tested_coeqtls],
+ discovery_coeqtl_df.loc[tested_coeqtls].add_suffix('_replication')], # TODO: the suffix is wrong - these are the discovery ('.sig') columns; the last section of this file swaps the column names back and writes '*.fixed.tsv.gz' copies
+ axis=1)
+ merged_coeqtl_df = flip_direction(merged_coeqtl_df,
+ 'MetaBeta_replication',
+ 'SNPEffectAllele',
+ 'SNPEffectAllele_replication') # negates MetaBeta_replication where the two effect alleles differ; columns used afterwards: MetaBeta, MetaSE, MetaBeta_replication, MetaSE_replication
+ disovery_coexpression_df = pd.read_csv(
+ workdir / f'input/individual_networks/UT/UT_{celltype_discovery}.sigcoeQTLs.tsv.gz',
+ compression='gzip', sep='\t', index_col=0)
+ # find overlapping individuals (the columns of the co-expression tables) per gene pair
+ tested_genepairs = list(merged_coeqtl_df['Gene'].unique())
+ tested_coexpression_discovery_df = disovery_coexpression_df.loc[tested_genepairs]
+ tested_coexpression_discovery_df.replace([np.inf, -np.inf], np.nan, inplace=True)  # treat +/-inf as missing
+ tested_coexpression_replication_df = replication_coexpression_df.loc[tested_genepairs]
+ tested_coexpression_replication_df.replace([np.inf, -np.inf], np.nan, inplace=True)
+ other_col_dict = {genepair:np.nan for genepair in tested_genepairs}  # gene pair -> theta
+ for genepair in tested_genepairs:
+ tested_coexpression_discovery_genepair_nonan = tested_coexpression_discovery_df.loc[genepair].dropna()
+ tested_coexpression_replication_genepair_nonan = tested_coexpression_replication_df.loc[genepair].dropna()
+ common_individuals = list(set(tested_coexpression_discovery_genepair_nonan.index) & set(tested_coexpression_replication_genepair_nonan.index))
+ num_common = len(common_individuals)
+ num_discovery = tested_coexpression_discovery_genepair_nonan.shape[0]
+ num_replication = tested_coexpression_replication_genepair_nonan.shape[0]
+ rho = pearsonr(tested_coexpression_discovery_genepair_nonan[common_individuals],  # NOTE(review): pearsonr needs at least 2 common individuals - confirm this always holds
+ tested_coexpression_replication_genepair_nonan[common_individuals])[0]
+ other_col_dict[genepair] = rho * num_common / np.sqrt(num_discovery * num_replication)  # theta: correlation over shared individuals scaled by n_common / sqrt(n_discovery * n_replication)
+ merged_coeqtl_df['theta'] = [other_col_dict.get(genepair) for genepair in merged_coeqtl_df['Gene']]
+ merged_coeqtl_df[['MetaBeta',
+ 'MetaBeta_replication',
+ 'MetaSE',
+ 'MetaSE_replication',
+ 'theta']].to_csv(workdir/f'output/{filtertype}/rb_calculations/discovery_{celltype_discovery}_replication_{celltype_replication}.tsv.gz',
+ sep='\t',
+ compression='gzip')
+ else:
+ continue
+
+
+# cmono ncmono and monocyte: same pairwise preparation as the coeQTL section, restricted to the monocyte subtypes and fixed to the 'filtered_results' directory
+filtertype = 'filtered_results'
+workdir = Path("./coeqtl_mapping")
+celltypes = ['monocyte', 'cMono', 'ncMono']
+for celltype_replication in celltypes:
+ print(f"Discovery: {celltype_replication}")  # NOTE: the printed label says "Discovery" although the variable is the replication cell type
+ replication_coeqtl_path = workdir / f'output/{filtertype}/UT_{celltype_replication}/coeqtls_fullresults_fixed.all.tsv.gz'
+ replication_coeqtl_df = pd.read_csv(replication_coeqtl_path, sep='\t', compression='gzip')
+ replication_coeqtl_df['gene2'] = [find_gene2(x[0], x[1]) for x in  # partner of the eQTL gene within the pair stored in 'Gene'
+ replication_coeqtl_df[['Gene',
+ 'eqtlgene']].values]
+ replication_coeqtl_df['snp_eqtlgene_gene2'] = ['_'.join([item[0], item[1]]) for item in  # row key '<snp_eqtlgene>_<gene2>'
+ replication_coeqtl_df[['snp_eqtlgene',
+ 'gene2']].values]
+ replication_coeqtl_df = replication_coeqtl_df.set_index('snp_eqtlgene_gene2')
+ replication_coexpression_df = pd.read_csv(workdir/f'input/individual_networks/UT/monocyte_subcelltypes/UT_{celltype_replication}.sigcoeQTLs.tsv.gz',  # first column becomes the row index; rows are selected below by the values of the 'Gene' column
+ compression='gzip', sep='\t', index_col=0)
+ for celltype_discovery in celltypes:
+ if celltype_replication != celltype_discovery:
+ print(f"Replication: {celltype_discovery}")  # NOTE: label and variable are swapped here as well
+ discovery_coeqtl_path = workdir / f'output/{filtertype}/UT_{celltype_discovery}/coeqtls_fullresults_fixed.sig.tsv.gz'
+ discovery_coeqtl_df = pd.read_csv(discovery_coeqtl_path, sep='\t', compression='gzip')
+ discovery_coeqtl_df['gene2'] = [find_gene2(x[0], x[1]) for x in
+ discovery_coeqtl_df[['Gene',
+ 'eqtlgene']].values]
+ discovery_coeqtl_df['snp_eqtlgene_gene2'] = ['_'.join([item[0], item[1]]) for item in
+ discovery_coeqtl_df[['snp_eqtlgene',
+ 'gene2']].values]
+ discovery_coeqtl_df = discovery_coeqtl_df.set_index('snp_eqtlgene_gene2')
+ tested_coeqtls = list(set(replication_coeqtl_df.index) & set(discovery_coeqtl_df.index))  # keys present in both tables
+ merged_coeqtl_df = pd.concat([replication_coeqtl_df.loc[tested_coeqtls],
+ discovery_coeqtl_df.loc[tested_coeqtls].add_suffix('_replication')], # TODO: the suffix is wrong here too - these are the discovery ('.sig') columns
+ axis=1)
+ merged_coeqtl_df = flip_direction(merged_coeqtl_df,
+ 'MetaBeta_replication',
+ 'SNPEffectAllele',
+ 'SNPEffectAllele_replication') # negates MetaBeta_replication where the two effect alleles differ; columns used afterwards: MetaBeta, MetaSE, MetaBeta_replication, MetaSE_replication
+ disovery_coexpression_df = pd.read_csv(
+ workdir / f'input/individual_networks/UT/monocyte_subcelltypes/UT_{celltype_discovery}.sigcoeQTLs.tsv.gz',
+ compression='gzip', sep='\t', index_col=0)
+ # find overlapping individuals (the columns of the co-expression tables) per gene pair
+ tested_genepairs = list(merged_coeqtl_df['Gene'].unique())
+ tested_coexpression_discovery_df = disovery_coexpression_df.loc[tested_genepairs]
+ tested_coexpression_discovery_df.replace([np.inf, -np.inf], np.nan, inplace=True)  # treat +/-inf as missing
+ tested_coexpression_replication_df = replication_coexpression_df.loc[tested_genepairs]
+ tested_coexpression_replication_df.replace([np.inf, -np.inf], np.nan, inplace=True)
+ other_col_dict = {genepair:np.nan for genepair in tested_genepairs}  # gene pair -> theta
+ for genepair in tested_genepairs:
+ tested_coexpression_discovery_genepair_nonan = tested_coexpression_discovery_df.loc[genepair].dropna()
+ tested_coexpression_replication_genepair_nonan = tested_coexpression_replication_df.loc[genepair].dropna()
+ common_individuals = list(set(tested_coexpression_discovery_genepair_nonan.index) & set(tested_coexpression_replication_genepair_nonan.index))
+ num_common = len(common_individuals)
+ num_discovery = tested_coexpression_discovery_genepair_nonan.shape[0]
+ num_replication = tested_coexpression_replication_genepair_nonan.shape[0]
+ rho = pearsonr(tested_coexpression_discovery_genepair_nonan[common_individuals],  # NOTE(review): pearsonr needs at least 2 common individuals - confirm this always holds
+ tested_coexpression_replication_genepair_nonan[common_individuals])[0]
+ other_col_dict[genepair] = rho * num_common / np.sqrt(num_discovery * num_replication)  # theta: correlation over shared individuals scaled by n_common / sqrt(n_discovery * n_replication)
+ merged_coeqtl_df['theta'] = [other_col_dict.get(genepair) for genepair in merged_coeqtl_df['Gene']]
+ merged_coeqtl_df[['MetaBeta',
+ 'MetaBeta_replication',
+ 'MetaSE',
+ 'MetaSE_replication',
+ 'theta']].to_csv(workdir/f'output/{filtertype}/rb_calculations/monocyte_subcelltypes/discovery_{celltype_discovery}_replication_{celltype_replication}.tsv.gz',
+ sep='\t',
+ compression='gzip')
+ else:
+ continue
+
+
+# eQTLs: settings for the cis-eQTL variant of the preparation; workdir and celltypes are rebound for the code below
+workdir = Path("./cis_eqtl_single_cell/EMP_mapping_1_12_2021_perm1000/output/")
+celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'B', 'NK']
+genename_dict = pd.read_csv('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/resources/features_v3_reformated_names.tsv',  # read with explicit column names: ensemblid, genename
+ sep='\t', names=['ensemblid', 'genename']).set_index('ensemblid')['genename'].T.to_dict()  # dict: ensemblid -> genename
+
+def read_alldataset_celltype_expression_df(celltype):
+ expression_prefix = Path('./expression_files/sources_for_coeqtl')
+ df = pd.DataFrame()
+ for datasetname in ['1m_v2', '1m_v3', 'NG', 't8w']:
+ dataset_df = pd.read_csv(expression_prefix/f'{datasetname}/{celltype}_expression.tsv', sep='\t', index_col=0)
+ dataset_df['genename'] = [genename_dict.get(geneid) for geneid in dataset_df.index]
+ dataset_df = dataset_df.dropna(subset=['genename']).set_index('genename')
+ df = pd.concat([df, dataset_df], axis=1)
+ return df
+
+for celltype_replication in celltypes:  # pairwise preparation for the cis-eQTL results; theta is computed from expression values
+ print(f"Discovery: {celltype_replication}")  # NOTE: the printed label says "Discovery" although the variable is the replication cell type
+ replication_coeqtl_path = workdir / f'{celltype_replication}/eQTLsFDR-ProbeLevel.txt.gz'
+ replication_coeqtl_df = pd.read_csv(replication_coeqtl_path, sep='\t', compression='gzip')
+ replication_coeqtl_df['genename'] = [genename_dict.get(ensemblid) for ensemblid in replication_coeqtl_df['ProbeName']]  # None for ids missing from genename_dict
+ replication_coeqtl_df['snp_gene'] = ['_'.join(item) for item in replication_coeqtl_df[['SNPName', 'genename']].values]  # NOTE(review): str.join raises TypeError on a None gene name - confirm every ProbeName is in genename_dict
+ replication_coeqtl_df = replication_coeqtl_df.set_index('snp_gene')
+ replication_coeqtl_df['metabeta'] = [float(item.split(' (')[0]) for item in replication_coeqtl_df['Meta-Beta (SE)']]  # part before ' ('
+ replication_coeqtl_df['SE'] = [float(item.split(' (')[1][:-2]) for item in replication_coeqtl_df['Meta-Beta (SE)']]  # NOTE(review): [:-2] removes the last two characters - confirm both are non-numeric, otherwise a digit of the SE is lost
+ replication_coexpression_df = read_alldataset_celltype_expression_df(celltype_replication)
+ for celltype_discovery in celltypes:
+ if celltype_replication != celltype_discovery:
+ print(f"Replication: {celltype_discovery}")  # NOTE: label and variable are swapped here as well
+ discovery_coeqtl_path = workdir / f'{celltype_discovery}/eQTLsFDR0.05-ProbeLevel.txt.gz'
+ discovery_coeqtl_df = pd.read_csv(discovery_coeqtl_path, sep='\t', compression='gzip')
+ discovery_coeqtl_df['genename'] = [genename_dict.get(ensemblid) for ensemblid in
+ discovery_coeqtl_df['ProbeName']]
+ discovery_coeqtl_df['snp_gene'] = ['_'.join(item) for item in
+ discovery_coeqtl_df[['SNPName', 'genename']].values]
+ discovery_coeqtl_df = discovery_coeqtl_df.set_index('snp_gene')
+ discovery_coeqtl_df['metabeta'] = [float(item.split(' (')[0]) for item in discovery_coeqtl_df['Meta-Beta (SE)']]
+ discovery_coeqtl_df['SE'] = [float(item.split(' (')[1][:-2]) for item in discovery_coeqtl_df['Meta-Beta (SE)']]
+ tested_eqtls = list(set(replication_coeqtl_df.index) & set(discovery_coeqtl_df.index))  # keys present in both tables
+ merged_coeqtl_df = pd.concat([replication_coeqtl_df.loc[tested_eqtls],
+ discovery_coeqtl_df.loc[tested_eqtls].add_suffix('_replication')], # TODO: the suffix is wrong here too - these are the columns of the celltype_discovery table
+ axis=1)
+ merged_coeqtl_df = flip_direction(merged_coeqtl_df,  # negates metabeta_replication where the two assessed alleles differ
+ 'metabeta_replication',
+ 'AlleleAssessed',
+ 'AlleleAssessed_replication')
+ discovery_coexpression_df = read_alldataset_celltype_expression_df(celltype_discovery)
+ # find overlapping individuals (the columns of the expression tables) per gene
+ tested_genepairs = list(merged_coeqtl_df['genename'].unique())  # holds gene names in this section, despite the variable name
+ tested_coexpression_discovery_df = discovery_coexpression_df.loc[tested_genepairs]
+ tested_coexpression_discovery_df.replace([np.inf, -np.inf], np.nan, inplace=True)  # treat +/-inf as missing
+ tested_coexpression_replication_df = replication_coexpression_df.loc[tested_genepairs]
+ tested_coexpression_replication_df.replace([np.inf, -np.inf], np.nan, inplace=True)
+ other_col_dict = {genepair:np.nan for genepair in tested_genepairs}  # gene -> theta
+ for genepair in tested_genepairs:
+ tested_coexpression_discovery_genepair_nonan = tested_coexpression_discovery_df.loc[genepair].dropna()
+ tested_coexpression_replication_genepair_nonan = tested_coexpression_replication_df.loc[genepair].dropna()
+ common_individuals = list(set(tested_coexpression_discovery_genepair_nonan.index) & set(tested_coexpression_replication_genepair_nonan.index))
+ num_common = len(common_individuals)
+ num_discovery = tested_coexpression_discovery_genepair_nonan.shape[0]
+ num_replication = tested_coexpression_replication_genepair_nonan.shape[0]
+ rho = pearsonr(tested_coexpression_discovery_genepair_nonan[common_individuals],  # NOTE(review): pearsonr needs at least 2 common individuals - confirm this always holds
+ tested_coexpression_replication_genepair_nonan[common_individuals])[0]
+ other_col_dict[genepair] = rho * num_common / np.sqrt(num_discovery * num_replication)  # theta: correlation over shared individuals scaled by n_common / sqrt(n_discovery * n_replication)
+ merged_coeqtl_df['theta'] = [other_col_dict.get(genepair) for genepair in merged_coeqtl_df['genename']]
+ merged_coeqtl_df[['metabeta',
+ 'metabeta_replication',
+ 'SE',
+ 'SE_replication',
+ 'theta']].to_csv(f'./coeqtl_mapping/input/snp_selection/rb_calculations/discovery_{celltype_discovery}_replication_{celltype_replication}.tsv.gz',
+ sep='\t',
+ compression='gzip')
+ else:
+ continue
+
+
+
+filtertype = 'filtered_results'
+workdir = Path("./coeqtl_mapping")
+celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'B', 'NK']
+for celltype_replication in celltypes:
+ for celltype_discovery in celltypes:
+ if celltype_replication != celltype_discovery:
+ print(celltype_discovery, celltype_replication)
+ merged_coeqtl_df = pd.read_csv(workdir/f'output/{filtertype}/rb_calculations/discovery_{celltype_discovery}_replication_{celltype_replication}.tsv.gz',
+ sep='\t',
+ compression='gzip')
+ merged_coeqtl_df = merged_coeqtl_df.rename({
+ 'MetaBeta_replication': 'MetaBeta_discovery',
+ 'MetaSE_replication': 'MetaSE_discovery'
+ },
+ axis=1)
+ merged_coeqtl_df = merged_coeqtl_df.rename({
+ 'MetaBeta': 'MetaBeta_replication',
+ 'MetaSE': 'MetaSE_replication'
+ },
+ axis=1)
+ merged_coeqtl_df = merged_coeqtl_df.rename({
+ 'MetaBeta_discovery': 'MetaBeta',
+ 'MetaSE_discovery': 'MetaSE'
+ },
+ axis=1)
+ merged_coeqtl_df.to_csv(
+ workdir / f'output/{filtertype}/rb_calculations/discovery_{celltype_discovery}_replication_{celltype_replication}.fixed.tsv.gz',
+ sep='\t',
+ compression='gzip')
\ No newline at end of file
diff --git a/04_coeqtl_mapping/prepare_genelist_and_annotation_for_betaqtl.py b/04_coeqtl_mapping/prepare_genelist_and_annotation_for_betaqtl.py
new file mode 100644
index 0000000..c5ea9f3
--- /dev/null
+++ b/04_coeqtl_mapping/prepare_genelist_and_annotation_for_betaqtl.py
@@ -0,0 +1,73 @@
+import pandas as pd
+import numpy as np
+from pathlib import Path
+import argparse
+import os
+
+
+def parse():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--condition', dest = 'condition')
+ parser.add_argument('--celltype', dest='celltype')
+ return parser
+
+args = parse().parse_args()
+condition, celltype = args.condition , args.celltype
+
+# old code for creating the annotation file..
+workdir = Path("/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/")
+eqtl_annotations_path = workdir/f'input/snp_genepair_selection/annotations/{condition}_{celltype}.baseline.annotatedeQTL.tsv'
+savepath = workdir/f'output/{condition}_{celltype}/'
+if not os.path.isdir(savepath):
+ os.mkdir(savepath)
+
+eqtl_annotations = pd.read_csv(eqtl_annotations_path, sep='\t')  # SNP / gene-pair table for this condition and cell type
+
+annotation_cols = ['Platform', 'ArrayAddress', 'Symbol', 'Chr', 'ChrStart', 'ChrEnd', 'Probe', 'Seq']  # columns written to the annotation files below
+gene_annotation_dict = pd.read_csv('/groups/umcg-bios/tmp01/projects/1M_cells_scRNAseq/ongoing/eQTL_mapping/probeannotation/singleCell-annotation-stripped.tsv',
+ sep='\t').set_index('Ensembl').T.to_dict()  # dict: Ensembl id -> {column name: value}
+genename_ensembl_mapping = pd.read_csv(workdir/'../resources/features_v3_reformated_names.tsv',
+ sep='\t', names=['Ensembl', 'genename']).set_index('genename')['Ensembl'].T.to_dict()  # dict: gene name -> Ensembl id
+
+
+eqtl_annotations['ArrayAddress'] = eqtl_annotations['genepair_sorted']  # the gene pair itself serves as ArrayAddress, Symbol and Probe
+eqtl_annotations['Symbol'] = eqtl_annotations['genepair_sorted']
+eqtl_annotations['Probe'] = eqtl_annotations['genepair_sorted']
+eqtl_annotations['Seq'] = 'NNNNNNN'  # constant placeholder value
+getchr = lambda x:gene_annotation_dict.get(x)['Chr'] if x in gene_annotation_dict else np.nan  # position lookups return NaN for ids without an annotation entry
+getchrstart = lambda x:int(gene_annotation_dict.get(x)['ChrStart']) if x in gene_annotation_dict else np.nan
+getchrend = lambda x:int(gene_annotation_dict.get(x)['ChrEnd']) if x in gene_annotation_dict else np.nan
+eqtl_annotations['Platform'] = 'SingleCell'
+
+eqtl_annotations['eqtlgene'] = [item.split(';')[0] for item in eqtl_annotations['eqtlgene1_gene2']]  # part before the first ';'
+eqtl_annotations['eqtlgene_ensembl'] = [genename_ensembl_mapping.get(genename) for genename in eqtl_annotations['eqtlgene']]  # None when the gene name is not in the mapping
+
+eqtl_annotations['Chr'] = [getchr(gene) for gene in eqtl_annotations['eqtlgene_ensembl']]  # pair position = position of the eQTL gene
+eqtl_annotations['ChrStart'] = [getchrstart(gene) for gene in eqtl_annotations['eqtlgene_ensembl']]
+eqtl_annotations['ChrEnd'] = [getchrend(gene) for gene in eqtl_annotations['eqtlgene_ensembl']]
+counts = eqtl_annotations['genepair_sorted'].value_counts()
+duplicated_genepairs_set = set(counts[counts>1].index.values)  # gene pairs that occur in more than one row
+isdup = lambda x:True if x in duplicated_genepairs_set else False
+eqtl_annotations['isdup'] = [isdup(genepair) for genepair in eqtl_annotations['genepair_sorted']]
+eqtl_annotations[eqtl_annotations['isdup']==False][['snp', 'genepair_sorted']].to_csv(workdir/f'input/snp_genepair_selection/{condition}_{celltype}.baseline.noduplicated.tsv',  # set 1 ('noduplicated'): gene pairs that occur in exactly one row
+ sep='\t', index=False)
+eqtl_annotations[eqtl_annotations['isdup']==False][annotation_cols].to_csv(workdir/f'input/summary/{condition}_{celltype}.genepairs.annotation.gene1position.noduplicated.tsv',
+ sep='\t', index=False)
+eqtl_annotations[eqtl_annotations['isdup']==False][['genepair_sorted']].to_csv(savepath/'genelist.noduplicated.txt', header=None, index=False)
+
+
+duplicated = eqtl_annotations[eqtl_annotations['isdup']].drop_duplicates(subset=['genepair_sorted'], keep='first')  # set 2 ('duplicatedversion1'): first row of every repeated gene pair
+duplicated[['snp', 'genepair_sorted']].to_csv(workdir/f'input/snp_genepair_selection/{condition}_{celltype}.baseline.duplicatedversion1.tsv',
+ sep='\t', index=False)
+duplicated[annotation_cols].to_csv(workdir/f'input/summary/{condition}_{celltype}.genepairs.annotation.gene1position.duplicatedversion1.tsv',
+ sep='\t', index=False)
+duplicated[['genepair_sorted']].to_csv(savepath/'genelist.duplicatedversion1.txt', header=None, index=False)
+
+
+duplcated_version2 = eqtl_annotations[eqtl_annotations['isdup']].drop_duplicates(subset=['genepair_sorted'], keep='last')  # set 3 ('duplicatedversion2'): last row of every repeated gene pair; pairs repeated more than twice lose their middle rows
+duplcated_version2[['snp', 'genepair_sorted']].to_csv(workdir/f'input/snp_genepair_selection/{condition}_{celltype}.baseline.duplicatedversion2.tsv',
+ sep='\t', index=False)
+duplcated_version2[annotation_cols].to_csv(workdir/f'input/summary/{condition}_{celltype}.genepairs.annotation.gene1position.duplicatedversion2.tsv',
+ sep='\t', index=False)
+duplcated_version2[['genepair_sorted']].to_csv(savepath/'genelist.duplicatedversion2.txt', header=None, index=False)
+
diff --git a/04_coeqtl_mapping/rb_celltypes.ipynb b/04_coeqtl_mapping/rb_celltypes.ipynb
new file mode 100644
index 0000000..834ede3
--- /dev/null
+++ b/04_coeqtl_mapping/rb_celltypes.ipynb
@@ -0,0 +1,2026 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib as mpl\n",
+ "mpl.rcParams['pdf.fonttype'] = 42\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "\n",
+ "from pathlib import Path\n",
+ "workdir = Path(\"./coeqtl_mapping/\")\n",
+ "\n",
+ "celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'NK', 'B']\n",
+ "import matplotlib\n",
+    "def heatmap(data, row_labels, col_labels, ax=None,\n",
+    "            cbar_kw=None, cbarlabel=\"\", **kwargs):\n",
+    "    \"\"\"\n",
+    "    Create a heatmap from a 2D array and two lists of labels.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    data\n",
+    "        A 2D numpy array of shape (M, N).\n",
+    "    row_labels\n",
+    "        A list or array of length M with the labels for the rows.\n",
+    "    col_labels\n",
+    "        A list or array of length N with the labels for the columns.\n",
+    "    ax\n",
+    "        A `matplotlib.axes.Axes` instance to which the heatmap is plotted. If\n",
+    "        not provided, use current axes or create a new one. Optional.\n",
+    "    cbar_kw\n",
+    "        A dictionary with arguments to `matplotlib.Figure.colorbar`. Optional.\n",
+    "    cbarlabel\n",
+    "        The label for the colorbar. Optional.\n",
+    "    **kwargs\n",
+    "        All other arguments are forwarded to `pcolormesh`.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    im, cbar\n",
+    "        The mesh returned by `pcolormesh` and the colorbar attached to it.\n",
+    "    \"\"\"\n",
+    "    if ax is None:\n",
+    "        ax = plt.gca()\n",
+    "    # A None default avoids sharing one dict object between calls.\n",
+    "    if cbar_kw is None:\n",
+    "        cbar_kw = {}\n",
+    "\n",
+    "    # Plot the heatmap\n",
+    "    im = ax.pcolormesh(data, **kwargs)\n",
+    "\n",
+    "    # Create colorbar\n",
+    "    cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)\n",
+    "    cbar.ax.set_ylabel(cbarlabel, rotation=-90, va=\"bottom\")\n",
+    "\n",
+    "    # Let the horizontal axes labeling appear on top.\n",
+    "    ax.tick_params(top=True, bottom=False,\n",
+    "                   labeltop=True, labelbottom=False)\n",
+    "\n",
+    "    # Without explicit coordinates pcolormesh draws cell (i, j) over\n",
+    "    # [j, j+1] x [i, i+1]. Put one tick at the centre of every cell so that\n",
+    "    # each label lines up with its row/column, whatever ticks the automatic\n",
+    "    # locator would have chosen and whatever sequence type the labels are.\n",
+    "    ax.set_xticks(np.arange(len(col_labels)) + 0.5)\n",
+    "    ax.set_yticks(np.arange(len(row_labels)) + 0.5)\n",
+    "    ax.set_xticklabels(list(col_labels), rotation=-30, ha=\"right\",\n",
+    "                       rotation_mode=\"anchor\")\n",
+    "    ax.set_yticklabels(list(row_labels))\n",
+    "    return im, cbar\n",
+ "\n",
+ "\n",
+    "def annotate_heatmap(im, data=None, valfmt=\"{x:.2f}\",\n",
+    "                     textcolors=(\"black\", \"white\"),\n",
+    "                     threshold=None, **textkw):\n",
+    "    \"\"\"\n",
+    "    Write the value of every heatmap cell into the centre of that cell.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    im\n",
+    "        The mappable returned by `heatmap`; texts are added to `im.axes`.\n",
+    "    data\n",
+    "        2D array used for the annotations. If None, `im.get_array()` is\n",
+    "        used, which must then be two-dimensional. Optional.\n",
+    "    valfmt\n",
+    "        The format of the annotations inside the heatmap. This should either\n",
+    "        use the string format method, e.g. '$ {x:.2f}', or be a\n",
+    "        `matplotlib.ticker.Formatter`. Optional.\n",
+    "    textcolors, threshold\n",
+    "        Accepted for compatibility but currently unused: the value-dependent\n",
+    "        text color is disabled, so all texts use the default color unless\n",
+    "        `color` is passed through `textkw`.\n",
+    "    **textkw\n",
+    "        All other arguments are forwarded to each call to `text` used to\n",
+    "        create the text labels.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    list\n",
+    "        The created `Text` objects, row by row.\n",
+    "\n",
+    "    Raises\n",
+    "    ------\n",
+    "    ValueError\n",
+    "        If the annotation data is not two-dimensional.\n",
+    "    \"\"\"\n",
+    "    # Fall back to the data stored on the mappable, as documented.\n",
+    "    if data is None:\n",
+    "        data = im.get_array()\n",
+    "    if np.ndim(data) != 2:\n",
+    "        raise ValueError('annotate_heatmap needs 2D data; pass data explicitly')\n",
+    "    n_rows, n_cols = np.shape(data)\n",
+    "\n",
+    "    # Set default alignment to center, but allow it to be\n",
+    "    # overwritten by textkw.\n",
+    "    kw = dict(horizontalalignment=\"center\",\n",
+    "              verticalalignment=\"center\")\n",
+    "    kw.update(textkw)\n",
+    "\n",
+    "    # Get the formatter in case a string is supplied\n",
+    "    if isinstance(valfmt, str):\n",
+    "        valfmt = matplotlib.ticker.StrMethodFormatter(valfmt)\n",
+    "\n",
+    "    # One `Text` per cell, placed at the cell centre (j+0.5, i+0.5).\n",
+    "    texts = []\n",
+    "    for i in range(n_rows):\n",
+    "        for j in range(n_cols):\n",
+    "            text = im.axes.text(j+0.5, i+0.5, valfmt(data[i, j], None), **kw)\n",
+    "            texts.append(text)\n",
+    "\n",
+    "    return texts"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## celltypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "filtered_res_df = pd.read_csv(workdir/'output/filtered_results/rb_calculations/summary.csv', index_col=0)\n",
+ "unfiltered_res_df = pd.read_csv(workdir/'output/unfiltered_results/rb_calculations/summary.csv', index_col=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Drop the rows with B cells as discovery cell type and every row with a\n",
+    "# missing value, then export the remaining Rb summaries to Excel.\n",
+    "filtered_not_b = filtered_res_df['celltype_discovery'] != 'B'\n",
+    "filtered_res_df_clean = filtered_res_df.loc[filtered_not_b].dropna()\n",
+    "filtered_res_df_clean.to_excel(workdir/'output/summary/rb_values_replication_in_other_celltypes_filtered_results.xlsx')\n",
+    "\n",
+    "unfiltered_not_b = unfiltered_res_df['celltype_discovery'] != 'B'\n",
+    "unfiltered_res_df_clean = unfiltered_res_df.loc[unfiltered_not_b].dropna()\n",
+    "unfiltered_res_df_clean.to_excel(workdir/'output/summary/rb_values_replication_in_other_celltypes_unfiltered_results.xlsx')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### filtered results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# filtered results\n",
+ "# Pairwise cell-type replication tables for the filtered co-eQTL results.\n",
+ "# Every table is addressed as [replication cell type (row), discovery cell type (column)].\n",
+ "rb_df = pd.DataFrame(0.0, index=celltypes, columns=celltypes)\n",
+ "rbse_df = pd.DataFrame(0.0, index=celltypes, columns=celltypes)\n",
+ "rbpvalue_df = pd.DataFrame(0.0, index=celltypes, columns=celltypes)\n",
+ "numcoeqtl_df = pd.DataFrame(0.0, index=celltypes, columns=celltypes)\n",
+ "# The annotation tables hold text labels, so they are created with object dtype\n",
+ "# rather than as float zeros that later get overwritten with strings.\n",
+ "num_anno_df = pd.DataFrame('', index=celltypes, columns=celltypes, dtype=object)\n",
+ "rbse_anno_df = pd.DataFrame('', index=celltypes, columns=celltypes, dtype=object)\n",
+ "\n",
+ "for discovery_celltype in celltypes:\n",
+ "    for replication_celltype in celltypes:\n",
+ "        cell = (replication_celltype, discovery_celltype)\n",
+ "        if discovery_celltype == replication_celltype:\n",
+ "            # Diagonal: rb is fixed to 1 (SE 0, p 0); N is the number of rows in the\n",
+ "            # cell type's own '.sig' result file.\n",
+ "            replicated_coeqtls_num = pd.read_csv(\n",
+ "                workdir/f'output/filtered_results/UT_{discovery_celltype}/coeqtls_fullresults_fixed.sig.tsv.gz',\n",
+ "                compression='gzip',\n",
+ "                sep='\\t'\n",
+ "            ).shape[0]\n",
+ "            rb_df.loc[cell] = 1\n",
+ "            rbse_df.loc[cell] = 0\n",
+ "            rbpvalue_df.loc[cell] = 0\n",
+ "            rbse_anno_df.loc[cell] = f\"N={replicated_coeqtls_num}\"\n",
+ "        else:\n",
+ "            # Off-diagonal: replication of discovery_celltype co-eQTLs in replication_celltype.\n",
+ "            rb_results = filtered_res_df[(filtered_res_df['celltype_discovery'] == discovery_celltype) &\n",
+ "                                         (filtered_res_df['celltype_replication'] == replication_celltype)]\n",
+ "            replicated_coeqtls_num = pd.read_csv(\n",
+ "                workdir/f'output/filtered_results/rb_calculations/discovery_{discovery_celltype}_replication_{replication_celltype}.tsv.gz',\n",
+ "                compression='gzip',\n",
+ "                sep='\\t',\n",
+ "                index_col=0\n",
+ "            ).shape[0]\n",
+ "            # Raises IndexError when the pair is missing from filtered_res_df.\n",
+ "            rbvalue = rb_results['r'].values[0]\n",
+ "            # rb is reported only when it is below 10 and the discovery cell type is not B;\n",
+ "            # every other pair is blanked (NaN rb/SE, p set to 0) and annotated with N only.\n",
+ "            # NOTE(review): the reason for the cut-off of 10 and for excluding B is not visible here.\n",
+ "            if rbvalue < 10 and discovery_celltype != 'B':\n",
+ "                rb_df.loc[cell] = rbvalue\n",
+ "                rbse_df.loc[cell] = rb_results['se_r'].values[0]\n",
+ "                rbpvalue_df.loc[cell] = rb_results['p'].values[0]\n",
+ "                rbse_anno_df.loc[cell] = f\"{rbvalue:.2f}\\nN={replicated_coeqtls_num}\"\n",
+ "            else:\n",
+ "                rb_df.loc[cell] = np.nan\n",
+ "                rbse_df.loc[cell] = np.nan\n",
+ "                rbpvalue_df.loc[cell] = 0\n",
+ "                rbse_anno_df.loc[cell] = f\"N={replicated_coeqtls_num}\"\n",
+ "        # The count and its text label are recorded for every pair, diagonal included.\n",
+ "        numcoeqtl_df.loc[cell] = replicated_coeqtls_num\n",
+ "        num_anno_df.loc[cell] = f\"N={replicated_coeqtls_num}\"\n",
+ "\n",
+ "# Replicated ratio: each column (discovery cell type) divided by its own diagonal count.\n",
+ "discovery_counts = pd.Series(np.diag(numcoeqtl_df.values), index=numcoeqtl_df.columns)\n",
+ "replicated_ratio_df = numcoeqtl_df.div(discovery_counts, axis='columns')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CD4T | \n",
+ " CD8T | \n",
+ " monocyte | \n",
+ " DC | \n",
+ " NK | \n",
+ " B | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " CD4T | \n",
+ " 1.000000 | \n",
+ " 0.971596 | \n",
+ " 0.759425 | \n",
+ " 0.773429 | \n",
+ " 0.953264 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " CD8T | \n",
+ " 0.988285 | \n",
+ " 1.000000 | \n",
+ " 0.847118 | \n",
+ " 1.002450 | \n",
+ " 0.966100 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " monocyte | \n",
+ " 0.792142 | \n",
+ " 0.779688 | \n",
+ " 1.000000 | \n",
+ " 0.797139 | \n",
+ " 0.960618 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " DC | \n",
+ " 0.794745 | \n",
+ " 0.815816 | \n",
+ " 0.935905 | \n",
+ " 1.000000 | \n",
+ " 0.853924 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " NK | \n",
+ " 0.925802 | \n",
+ " 0.967842 | \n",
+ " 0.868747 | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " B | \n",
+ " 0.918479 | \n",
+ " 0.952496 | \n",
+ " 0.948709 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CD4T CD8T monocyte DC NK B\n",
+ "CD4T 1.000000 0.971596 0.759425 0.773429 0.953264 NaN\n",
+ "CD8T 0.988285 1.000000 0.847118 1.002450 0.966100 NaN\n",
+ "monocyte 0.792142 0.779688 1.000000 0.797139 0.960618 NaN\n",
+ "DC 0.794745 0.815816 0.935905 1.000000 0.853924 NaN\n",
+ "NK 0.925802 0.967842 0.868747 NaN 1.000000 NaN\n",
+ "B 0.918479 0.952496 0.948709 NaN NaN 1.0"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rb_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CD4T | \n",
+ " CD8T | \n",
+ " monocyte | \n",
+ " DC | \n",
+ " NK | \n",
+ " B | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " CD4T | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 1.126679e-35 | \n",
+ " 2.425843e-03 | \n",
+ " 0.000000e+00 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " CD8T | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 7.557685e-59 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " monocyte | \n",
+ " 1.052643e-121 | \n",
+ " 5.216640e-92 | \n",
+ " 0.000000e+00 | \n",
+ " 1.774726e-21 | \n",
+ " 1.393096e-317 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " DC | \n",
+ " 3.609987e-25 | \n",
+ " 4.217830e-39 | \n",
+ " 5.947381e-316 | \n",
+ " 0.000000e+00 | \n",
+ " 4.322965e-05 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " NK | \n",
+ " 2.552726e-264 | \n",
+ " 0.000000e+00 | \n",
+ " 8.365584e-06 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " B | \n",
+ " 2.320757e-144 | \n",
+ " 1.610287e-212 | \n",
+ " 1.074123e-78 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CD4T CD8T monocyte DC \\\n",
+ "CD4T 0.000000e+00 0.000000e+00 1.126679e-35 2.425843e-03 \n",
+ "CD8T 0.000000e+00 0.000000e+00 7.557685e-59 0.000000e+00 \n",
+ "monocyte 1.052643e-121 5.216640e-92 0.000000e+00 1.774726e-21 \n",
+ "DC 3.609987e-25 4.217830e-39 5.947381e-316 0.000000e+00 \n",
+ "NK 2.552726e-264 0.000000e+00 8.365584e-06 0.000000e+00 \n",
+ "B 2.320757e-144 1.610287e-212 1.074123e-78 0.000000e+00 \n",
+ "\n",
+ " NK B \n",
+ "CD4T 0.000000e+00 0.0 \n",
+ "CD8T 0.000000e+00 0.0 \n",
+ "monocyte 1.393096e-317 0.0 \n",
+ "DC 4.322965e-05 0.0 \n",
+ "NK 0.000000e+00 0.0 \n",
+ "B 0.000000e+00 0.0 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rbpvalue_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "from matplotlib import cm\n",
+ "from matplotlib.colors import ListedColormap, LinearSegmentedColormap"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "color_dict = {'CD4T': '#2E9D33',\n",
+ " 'CD8T': '#126725',\n",
+ " 'monocyte': '#EDBA1B',\n",
+ " 'NK': '#965EC8',\n",
+ " 'DC': '#E64B50',\n",
+ " 'B': '#009DDB',\n",
+ " 'cMono': 'peru',\n",
+ " 'ncMono': 'y',\n",
+ " 'CD4T_individual_100': '#2E9D33',\n",
+ " 'CD4T_individual_50': '#2E9D33',\n",
+ " 'CD4T_50': '#2E9D33',\n",
+ " 'CD4T_150': '#2E9D33',\n",
+ " 'CD4T_250': '#2E9D33'}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":60: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_xticklabels([\"\"]+col_labels)\n",
+ ":61: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels([\"\"]+row_labels)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# One single-column heatmap per discovery cell type, coloured white -> that cell type's colour\n",
+ "# over rb in [0.7, 1] and annotated with the rb / N text labels from rbse_anno_df.\n",
+ "matplotlib.rcParams.update({'font.size': 16})\n",
+ "fig, axes = plt.subplots(1, 6, figsize=(7, 7), sharey=True)\n",
+ "for i, discovery_celltype in enumerate(['CD4T', 'CD8T', 'monocyte', 'DC', 'NK', 'B']):\n",
+ "    colors = [\"white\", color_dict[discovery_celltype]]\n",
+ "    cmap1 = LinearSegmentedColormap.from_list(\"mycmap\", colors)\n",
+ "    rb_column = rb_df[discovery_celltype].values.reshape((6, 1))\n",
+ "    anno_column = rbse_anno_df[discovery_celltype].values.reshape((6, 1))\n",
+ "    im1, bar = heatmap(rb_column,\n",
+ "                       list(rb_df.index),\n",
+ "                       [discovery_celltype],\n",
+ "                       cmap=cmap1, ax=axes[i], vmin=0.7, vmax=1)\n",
+ "    # The per-panel colorbar is dropped.\n",
+ "    bar.remove()\n",
+ "    _ = annotate_heatmap(im1,\n",
+ "                         data=anno_column,\n",
+ "                         valfmt=\"{x:^}\",\n",
+ "                         textcolors=(\"white\", \"white\"),\n",
+ "                         threshold=1)\n",
+ "    # Only the first panel keeps its axis (row labels); the others are hidden.\n",
+ "    if i > 0:\n",
+ "        axes[i].axis('off')\n",
+ "plt.subplots_adjust(wspace=0, hspace=0)\n",
+ "plt.savefig('rb_values.filtered_results.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Colormap that fades from fully transparent white to opaque black (512 steps),\n",
+ "# registered under the name 'alpha' so heatmap cells can request it with cmap='alpha'.\n",
+ "c_white = matplotlib.colors.to_rgba('white', alpha=0)\n",
+ "c_black = matplotlib.colors.to_rgba('black', alpha=1)\n",
+ "cmap_rb = matplotlib.colors.LinearSegmentedColormap.from_list('rb_cmap', [c_white, c_black], 512)\n",
+ "\n",
+ "mpl.cm.register_cmap(cmap=cmap_rb, name='alpha')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":62: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_xticklabels([\"\"]+col_labels)\n",
+ ":63: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels([\"\"]+row_labels)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "im, bar = heatmap(replicated_ratio_df.values, \n",
+ " list(rb_df.index), \n",
+ " celltypes,\n",
+ " cmap='alpha', \n",
+ " vmin=0, vmax=1)\n",
+ "_ = annotate_heatmap(im, \n",
+ " data=replicated_ratio_df.values, \n",
+ " valfmt=\"{x:.0%}\", \n",
+ " textcolors=(\"white\", \"white\"),\n",
+ " threshold=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":62: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_xticklabels([\"\"]+col_labels)\n",
+ ":63: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels([\"\"]+row_labels)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "matplotlib.rcParams.update({'font.size': 16})\n",
+ "fig, ax = plt.subplots(figsize=(8, 7))\n",
+ "im, bar = heatmap(np.flip(rb_df.values, axis=0), \n",
+ " list(rb_df.index)[::-1], \n",
+ " celltypes,\n",
+ " cmap='alpha', \n",
+ " vmin=0.7, vmax=1)\n",
+ "_ = annotate_heatmap(im, \n",
+ " data=np.flip(rbse_anno_df.values, axis=0), \n",
+ " valfmt=\"{x:^}\", \n",
+ " textcolors=(\"white\", \"white\"),\n",
+ " threshold=1)\n",
+ "\n",
+ "plt.savefig('rb_values.filtered_results.varyingalpha.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":62: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_xticklabels([\"\"]+col_labels)\n",
+ ":63: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels([\"\"]+row_labels)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# One single-column heatmap per discovery cell type showing the replicated ratio (0-1),\n",
+ "# coloured white -> that cell type's colour and annotated as whole percentages.\n",
+ "fig, axes = plt.subplots(1, 6, figsize=(7, 7), sharey=True)\n",
+ "# Rows are drawn in reversed order, so the labels are reversed once here.\n",
+ "row_labels = list(rb_df.index)[::-1]\n",
+ "for i, discovery_celltype in enumerate(['CD4T', 'CD8T', 'monocyte', 'DC', 'NK', 'B']):\n",
+ "    colors = [\"white\", color_dict[discovery_celltype]]\n",
+ "    cmap1 = LinearSegmentedColormap.from_list(\"mycmap\", colors)\n",
+ "    # The same flipped column feeds both the image and the annotations; annotating with\n",
+ "    # the unflipped column would print each percentage on the wrong row.\n",
+ "    ratio_column = np.flip(replicated_ratio_df[discovery_celltype].values.reshape((-1, 1)),\n",
+ "                           axis=0)\n",
+ "    im1, bar = heatmap(ratio_column,\n",
+ "                       row_labels,\n",
+ "                       [discovery_celltype],\n",
+ "                       cmap=cmap1, ax=axes[i], vmin=0, vmax=1)\n",
+ "    # The per-panel colorbar is dropped.\n",
+ "    bar.remove()\n",
+ "    _ = annotate_heatmap(im1,\n",
+ "                         data=ratio_column,\n",
+ "                         valfmt=\"{x:.0%}\",\n",
+ "                         textcolors=(\"white\", \"white\"),\n",
+ "                         threshold=1)\n",
+ "    # Only the first panel keeps its axis (row labels); the others are hidden.\n",
+ "    if i > 0:\n",
+ "        axes[i].axis('off')\n",
+ "\n",
+ "plt.subplots_adjust(wspace=0, hspace=0)\n",
+ "plt.savefig('replicated_ratio.filtered_results.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAEoAAADBCAYAAABopyZqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAHyUlEQVR4nO2dfYxcZRXGf8/2w5XyhwWiiUFasIiW+JGgtQaj0oSASVMU8SOauiFKDIkiMfqXsVFSTYwfmJhIrUpAg6FBiDSmsQRpEInGsAroQoB+kIJWU6WAdm2x9PjHe7cZhpl7n51hd+fG80sms/POPffeOX2ec9+Z5txXEUHSzNhCn0BbyESZZKJMMlEmmSiTTJRJ6xIl6XRJ35X0W0nTkkLSSjN2XNI3JB2Q9J9qH+9yYluXKGAV8CHgEHDPLGN/BFwBbALWAweAnZLe0hgZEa16AGMdf38SCGClEffmatvLO8YWA48A25viW6eoiDg+YOgG4L/Ato59HQNuBi6S9LK64NYlagjOBfZFxHTX+BSwlGLpvvw/JeoUSl3r5qmO9/uyuO5NSfP9jXkKONLxemtEbH2J9i1Kjeo13khtogDGxuZPdMePHz8SEW+do90/BZzRY3x5x/t9GbVEzeXup4D3Szqpq06tBp4DdtcF12ZBEmNjY/P2mGO2A0uAD3Z8vsXAh4E7IuJoXXCjoiTLwvOKpMuqP8+rnt8r6SBwMCLulrQC2ANcExHXAETE/ZK2Ad+RtATYB1wJnAl8rOmYI2W9WXBL1+vvVc93A++hFOhFvNgxlwNfBTYDrwAeAC6OiD80HVB1PwUvWrQoxsfHjfN+aZienp6cw2I+FG1V1LzTyhq1EDRNOFNRFakok6xRJmk9k7SeSVrPJBVlkjXKJBVlkjXKpNF6qahCKsokE2WS1jNJRZnk9MAkJ5wmqSiTrFEmaT2TtJ5JWs8kFWWSNcokFWWSNcokvxSbpKJMMlEmaT2TVJRJTg9McsJpkooyyRplkooyyRplktYzSeuZpPVMGrMw8zVmPh4Okl4j6WeSnpH0rKTbJPXqw+sVe2YV+7Skw5J2SbJaSlpVoySdBNwFHAUmKB2dm4Fdkt4UEYdrYk8FfgP8C/gUMA18ropdExEP1x27bV+KrwDOAs6JiN0Akh4EHqN8+G/XxF4JvAp4d0fsXcBe4CuUeyn0pVWKorTk/27mgwJExD5J9wKXUJ+otcBjXbGHJd0DrJe0uGrv70nbatS5wJ97jE9RWl7reJ7SEtvNUeDlwGvrgkftqneapPs6Xnd3qde15C/vMd7JI8CFkk6NiH8CSBoD1nTsuy+jZr1/GI2Ng7bkbwGuAn4s6SpKMf8ipVUWoLbzu23WO0Tvf/nl9FbaCSJiL6V3+DxKR/pfgXcA11abHKiLHzXrNTFFqVPdrAYeagqOiFsl/Rx4HfBcROyRdB3wRETsr4ttm6K2A2slndVxfiuB86v3GomI5yPi4SpJr6a081/XFNeYqBG778EPgMeB2yVdImkDcDvwBPD9mY0krZB0TNKmjrElkq6V9D5J6yR9BriPotJvNR24VRPOat6zjlJXfkIp4r8Cro6If3ds2qudP4CzgY9SWvmfBK4HvhYRvaYNL2DUrnqNVLXkAw3bPE7XlbCaTK4f9LitS9RC0SrrLSSpKJP8hdOkbRPOBSMVZZI1yiQVZZI1yiStZ5LWM0nrmaSiTLJGmeSXYpNUlEnWKJO86pmk9UzSeiZpPZNUlEnWKJNUlEnWKJO0nklazyStZ5KKMskaZZK/R5mkokyyRpnkVc8krWeS1jNJ65mkokyyRpmkokyyRpmkokxat1iqhmvnP0PSjZL2S5qW9KikzZKWNcW2qphruHb+ZcCdlJVlvwTsB95G6VA/m9KO1pe2WW+Ydv7zKQm5KCLuqMZ2SToF+LxevH7xC2hbMR+mnX9p9fxs1/jTlBJUq4i2dYAO085/J0V5X5e0WtLJVe/f
Z4EtdbaF0atRc9bOHxFHJL0TuJWS2Bl+CHy66cRGrUbNWTu/pHFgG/BKYCOlmK8BNgHHKLck6UvbatTA7fzAJyirYq+KiD3V2K8lPQNslbQlIh7oFzxqimpimHb+NwKHOpI0w++r5zdQlgzvSdu61Idp5/8bsFzSqq7xt1fPf6nNQ92bIzgzH7idH7iBcu+oHZImJF0g6QvAN4FJ4N6BEzWTrFGZHlSX8HXAo5R2/puAfcC6pnb+qnN9LXA/ZTa/gzKB3QpcGBG192YZtelBI4O281fjD9FwQ61+5H+AmrROUQtF26YHC0bbJpwLRlrPJK1nktYzSUWZZI0ySUWZZI0ySUWZZI0ySeuZpPVM0nom+XuUSSrKJGuUSV71TFJRJlmjTNJ6Jmk9k7SeSSrKJGuUSSrKJGuUSVrPJK1nktYzyd+jTFJRJlmjTPKqZ5KKMskaZZLWM0nrmaT1TBp7YUapxaM6p4G61CV9WVL0eRxpim+VojRElzql0/OXXWPLqrHGhVbbVqMG7lKPiCcp636eQNJGSg5ubDpw2656w3Sp92IC+Duws2nDVrWhMVyXevfnOh24ALipWki1llbVKIZbdL6bjRShNNoOGhI1OTm5U9JpszyBYRhXfTs/DL7ofDcfB/4YEQ86G9cmKiIuHuAE5pJhutRPIGkN8HrgajdmpHxlMNSi8x1MUO518FM3oG2JGnrReUlLgY8AOyLioH3kiGjNgzJB3A38iTId2EC5V8Fe4OSO7VZQFLOpxz4updS5S2dz7FYpapgu9Q4mKFfJX8zm2KqynDTQKkUtJJkok0yUSSbKJBNlkokyyUSZZKJMMlEm/wPf4KV4Rxo9gAAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib as mpl\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(0.5, 6))\n",
+ "fig.subplots_adjust(bottom=0.5)\n",
+ "\n",
+ "colors = [\"white\", 'black']\n",
+ "cmap = LinearSegmentedColormap.from_list(\"mycmap\", colors)\n",
+ "norm = mpl.colors.Normalize(vmin=0.7, vmax=1)\n",
+ "\n",
+ "fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmap),\n",
+ " cax=ax, orientation='vertical')\n",
+ "plt.savefig('colorbar.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAEoAAADBCAYAAABopyZqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAJsElEQVR4nO2dbYxdVRWGn3f6wUD7gxLiD5XSNhjSNkqDVasgKTWmNZg2RBGC0YKC34ImYCCVRmuNEogSIApFowRJaABj+wOlAm1VAprWFLUoFWgtGgiFFqqUQpsuf+xz6+Vy7zlr7unMPVvWk9ycmX3OOvvOmrXW2XP3O3vLzAiqGRr0G8iFcJSTcJSTcJSTcJSTcJST7Bwl6a2SbpD0kKR9kkzSNKftsKRrJD0t6eXiHmd4bLNzFHAS8DFgD/DbEdr+GLgYWA58GHgauFfSnEpLM8vqBQy1fX0RYMA0h90pxbUXtrWNBx4D1lbZZxdRZnaoT9PFwAFgddu9DgJ3AAslHVVmnJ2jajAb2G5m+zratwITSSndkzeSo44j1bVOdred78n4spOSxvov5q3A/rbvV5nZqiN0b5FqVLf2SkodBTA0NHZBd+jQof1mNneUbr8bmNqlfUrb+Z40zVGjefutwNmSjumoU7OAV4HHy4xLvSCJoaGhMXuNMmuBCcA5bT/feOBcYJ2ZvVJmXBlRkiuFxxRJHy2+fGdx/JCkXcAuM9so6UTgCWCFma0AMLMtklYD10maAGwHPg9MBz5e1WejUm8E3Nnx/Q+K40ZgPqlAj+P1GXMh8G1gJXAs8AiwyMz+WNWhyj4KHjdunA0PDzve95Fh3759m0exmNci14gac7KsUYOgasAZEVUQEeUkapSTSD0nkXpOIvWcREQ5iRrlJCLKSdQoJ5WpFxGViIhyEo5yEqnnJCLKSQwPnMSA00lElJPKcGnavJ6kEyTdJelFSXsl/VxStxngbrZTJd0qaWchQtsmaaWkSVW2WaWepGOAB4BXgKUkLcFKYL2kd5jZSyW2k4D7SJOgVwE7gXcB3wTeRpoI7UluqXcxMAM42cweB5D0J+DvwGeB75XYnkZyyEIzW1e0rZd0HHBZl6n215Db8GAx8HDLSQBmtl3Sg8ASyh01sTju7Wh/gVSCSiOi0gutQedYvBzMBv7SpX0rSWxRxn2kyLta0ixJkyUtAC4FbipLW2hejTpe0qa27zv1UWVisCld2g9jZvslnQ7cTXJsix8BX6p6Y02rUc85ptT7EoNJGibpN98EfIJUzN9NUggfJAk2epJbjdpDdwnhFLpHWjufJgk4TjKzJ4q230h6EVgl6SYze6SXcW5/FG8l1alOZgGPVti+HdjT5qQWfyiOM0nqlq7kNuBcC8yTNKPVUPzXwmnFuTKeAaZI6lT/vqc4/qvUD1XvrGGOugXYAayRtETSYmAN8BRwc+siSSdKOihpeZvtT4F/A/dIWirpTEmXA9cCm4EH+3bUWA4NPClePMIXANuA24DbScq5BWb2n/a3ToeQzMx2APOALaTR/D2kAewq4INVQv/cijlmthP4SMU1O+jyJDSzR0n/RzNimjY8aCxNG3A2logoJ9nVqEEREeUkapSTSD0nkXpOIvWcREQ5iRrlJLfPowZGRJSTqFFO4qnnJFLPSaSek0g9JxFRTqJGOclqFqZ4T30LyQr7mZLulPSc0qpkj0m6tMouq4hSDSFZYT+3sN9AWqTrRZJmanJV31k5ihpCMklDwK3A/WZ2dtup9Z6Oc0u9rkIy0izvkgrb+SSNQpnYrCe5TanXEZKdXhyHJT0s6YCkZyVdL+noqo6bNjwYNSEZ8ObiuBq4EbgCmAusAE4Azu5hBzRvwDlqQjL+lz0/M7OWeGODpHHAdyXNKqbcS4170rAaVUdI9nxx/HVHe0shPKfMOLenXh0hWUu32RmRrd9QqZolt4iqIyT7JWn8taijfWFx3EQJTatRVdxCUvCukfR1
UnR8iy5CMl6/Itnzkr4DXCVpL2ngOZckdr21fcjRjaxSz8xeUtKGf58kJBNwP/CVKiFZwQqS6u4LwGWktYKvITm7lKYNDyqpKSQz0oBzxIPO3FJvYGQXUYMiqxo1SGIC1ElElJOoUU7iqeckUs9JpJ6TSD0nEVFOokY5iYhyEjXKSaSek0g9J5F6TiKinESNchKfRznJTaRRW0jWdp8rlbbF/J3n+qxqlGoKydruMwNYBjzr7Tu3p16dFcna+SFpcYmTcQQL5Jd6dYRkAEg6HzgVuNJz/WE/OG7cJO1BHSEZkqaQZpm/Zmal++l10rTUG00hGaTp822kBW1GRNOK+WiuSPZ+4JPAqVa2s1kPchtw1hGS3Uza0Pmfko4t2sYD44rvXy7bjLBpEVVFHSHZzOL1uS7n9gBfBa7rZdy0GlXFWuBaSTPM7El4jZDsigrbM7u0XUeSB32Zij1Ac4uoOkKyDZ03k/QCML7buU6yiqgjICTrm9yKeS0hWZfr5nv7zS31BkZWqTdIIqKcZFejBkVElJOoUU4i9ZxE6jmJ1HMSEeUkapSTmAB1EhHlJGqUk3jqOYnUcxKp5yRSz0lElJOoUU5yE2n0LSSTNFfSKkl/U9rWcqek2yVN9/SbVY2qKSQ7jzTLfD1pxvktpC0uN0maY2ZPlfWdW42qIyS72sx2tTco7fS4vbjv8q5WBbnVqL63tux0UtH2D0m7SNFVSlapR0qdNV3atwLnjPRmkmaSNib8a9W1TUu90RaSHUbSeOAmYBdJDlRK01JvNFck6+RG4H3AWWZWpa3K7vOoOkKywygtj/QZYGnbnsWlNC2iqqgjJANA0jKSluoSM7vN23FuA846K5Ih6RLSuGuZmd3g6bBFbk+9voVkks4jKex+BTwgaV7bffeWrZgIzXvqlVJTSLaoaF/E69e520ha+bUnudWovoVkZnYBcEG//eaWegMjq9QbJNml3qCIiHISNcpJRJSTqFFOIvWcROo5idRzktvnUQMjIspJ1Cgn8dRzEhHlJGqUk0g9J5F6TiL1nOS2bVzf+qjCdljSNZKeVtqt8SFJZ3hss4qomvooSBqDs4DLgSeBLwL3SnqvmW0pM8ytRtXZsfEU4HzgU2b2k6JtI2n2eQVJUtSTytT7P1poazFwgLS/Xsv2IHAHsFDSUWXGuU2p11loazaw3cz2dbGdCJxUZpxVjaKePqrMtnW+J6WO2rx5872Sjq94A0eSYZULyaB/fZRq2JY7ysw65+gHTR191G6g2zBiStv5njQqrxzU3bFxejHE6LR9lYr1o3JzVB191FpgAm2iWCUd57nAurJl2wAws2xewCTSb/7PpOHAYuAR0uBxctt1JwIHgeUd9neQUvQi4APAXcB+0gKB5X0P+ofvw1lTgbuBvaTdF38BTOu4ZhqpcH+jo/1o0qD0mcJBvwfme/qVjXylxTckudWogRGOchKOchKOchKOchKOchKOchKOchKOcvJfcsY5cEDXPTUAAAAASUVORK5CYII=\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "fig, ax = plt.subplots(figsize=(0.5, 6))\n",
+ "fig.subplots_adjust(bottom=0.5)\n",
+ "\n",
+ "colors = [\"white\", 'black']\n",
+ "cmap = LinearSegmentedColormap.from_list(\"mycmap\", colors)\n",
+ "norm = mpl.colors.Normalize(vmin=0, vmax=1)\n",
+ "\n",
+ "fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmap),\n",
+ " cax=ax, orientation='vertical')\n",
+ "plt.savefig('colorbar.replication_ratio.pdf')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### celltype comparison for unfiltered results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pairwise cell-type replication tables for the unfiltered co-eQTL results.\n",
+ "# Every table is addressed as [replication cell type (row), discovery cell type (column)].\n",
+ "unrb_df = pd.DataFrame(0.0, index=celltypes, columns=celltypes)\n",
+ "unrbse_df = pd.DataFrame(0.0, index=celltypes, columns=celltypes)\n",
+ "unrbpvalue_df = pd.DataFrame(0.0, index=celltypes, columns=celltypes)\n",
+ "unnumcoeqtl_df = pd.DataFrame(0.0, index=celltypes, columns=celltypes)\n",
+ "# The annotation tables hold text labels, so they are created with object dtype\n",
+ "# rather than as float zeros that later get overwritten with strings.\n",
+ "unanno_df = pd.DataFrame('', index=celltypes, columns=celltypes, dtype=object)\n",
+ "unnum_anno_df = pd.DataFrame('', index=celltypes, columns=celltypes, dtype=object)\n",
+ "\n",
+ "for discovery_celltype in celltypes:\n",
+ "    for replication_celltype in celltypes:\n",
+ "        cell = (replication_celltype, discovery_celltype)\n",
+ "        if discovery_celltype == replication_celltype:\n",
+ "            # Diagonal: rb is fixed to 1 (SE 0, p 0); N is the number of rows in the\n",
+ "            # cell type's own '.sig' result file.\n",
+ "            unreplicated_coeqtls_num = pd.read_csv(\n",
+ "                workdir/f'output/unfiltered_results/UT_{discovery_celltype}/coeqtls_fullresults_fixed.sig.tsv.gz',\n",
+ "                compression='gzip',\n",
+ "                sep='\\t'\n",
+ "            ).shape[0]\n",
+ "            unrb_df.loc[cell] = 1\n",
+ "            unrbse_df.loc[cell] = 0\n",
+ "            unrbpvalue_df.loc[cell] = 0\n",
+ "            unanno_df.loc[cell] = f\"N={unreplicated_coeqtls_num}\"\n",
+ "        else:\n",
+ "            # Off-diagonal: replication of discovery_celltype co-eQTLs in replication_celltype.\n",
+ "            unrb_results = unfiltered_res_df[(unfiltered_res_df['celltype_discovery'] == discovery_celltype) &\n",
+ "                                             (unfiltered_res_df['celltype_replication'] == replication_celltype)]\n",
+ "            unreplicated_coeqtls_num = pd.read_csv(\n",
+ "                workdir/f'output/unfiltered_results/rb_calculations/discovery_{discovery_celltype}_replication_{replication_celltype}.tsv.gz',\n",
+ "                compression='gzip',\n",
+ "                sep='\\t',\n",
+ "                index_col=0\n",
+ "            ).shape[0]\n",
+ "            # Raises IndexError when the pair is missing from unfiltered_res_df.\n",
+ "            unrbvalue = unrb_results['r'].values[0]\n",
+ "            # The cut-off is tested on this pair's own unfiltered rb; the previous code read\n",
+ "            # `rb_results`, a variable left over from the filtered-results cell.\n",
+ "            # rb is reported only when it is below 10 and the discovery cell type is not B;\n",
+ "            # every other pair is blanked (NaN rb/SE, p set to 0) and annotated with N only.\n",
+ "            if unrbvalue < 10 and discovery_celltype != 'B':\n",
+ "                unrb_df.loc[cell] = unrbvalue\n",
+ "                unrbse_df.loc[cell] = unrb_results['se_r'].values[0]\n",
+ "                unrbpvalue_df.loc[cell] = unrb_results['p'].values[0]\n",
+ "                unanno_df.loc[cell] = f\"{unrbvalue:.2f}\\nN={unreplicated_coeqtls_num}\"\n",
+ "            else:\n",
+ "                unrb_df.loc[cell] = np.nan\n",
+ "                unrbse_df.loc[cell] = np.nan\n",
+ "                unrbpvalue_df.loc[cell] = 0\n",
+ "                unanno_df.loc[cell] = f\"N={unreplicated_coeqtls_num}\"\n",
+ "        # The count and its text label are recorded for every pair, diagonal included.\n",
+ "        unnumcoeqtl_df.loc[cell] = unreplicated_coeqtls_num\n",
+ "        unnum_anno_df.loc[cell] = f\"N={unreplicated_coeqtls_num}\"\n",
+ "\n",
+ "# Replicated ratio: each column (discovery cell type) divided by its own diagonal count.\n",
+ "undiscovery_counts = pd.Series(np.diag(unnumcoeqtl_df.values), index=unnumcoeqtl_df.columns)\n",
+ "unreplicated_ratio_df = unnumcoeqtl_df.div(undiscovery_counts, axis='columns')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":62: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_xticklabels([\"\"]+col_labels)\n",
+ ":63: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels([\"\"]+row_labels)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# One single-column heatmap per discovery cell type showing the unfiltered replicated\n",
+ "# ratio (0-1), coloured white -> that cell type's colour and annotated as whole percentages.\n",
+ "matplotlib.rcParams.update({'font.size': 14})\n",
+ "fig, axes = plt.subplots(1, 6, figsize=(7, 7), sharey=True)\n",
+ "# Rows are drawn in reversed order, so the labels are reversed once here.\n",
+ "row_labels = list(rb_df.index)[::-1]\n",
+ "for i, discovery_celltype in enumerate(['CD4T', 'CD8T', 'monocyte', 'DC', 'NK', 'B']):\n",
+ "    colors = [\"white\", color_dict[discovery_celltype]]\n",
+ "    cmap1 = LinearSegmentedColormap.from_list(\"mycmap\", colors)\n",
+ "    # The same flipped column feeds both the image and the annotations; annotating with\n",
+ "    # the unflipped column would print each percentage on the wrong row.\n",
+ "    ratio_column = np.flip(unreplicated_ratio_df[discovery_celltype].values.reshape((-1, 1)),\n",
+ "                           axis=0)\n",
+ "    im1, bar = heatmap(ratio_column,\n",
+ "                       row_labels,\n",
+ "                       [discovery_celltype],\n",
+ "                       cmap=cmap1, ax=axes[i], vmin=0, vmax=1)\n",
+ "    # The per-panel colorbar is dropped.\n",
+ "    bar.remove()\n",
+ "    _ = annotate_heatmap(im1,\n",
+ "                         data=ratio_column,\n",
+ "                         valfmt=\"{x:.0%}\",\n",
+ "                         textcolors=(\"white\", \"white\"),\n",
+ "                         threshold=1)\n",
+ "    # Only the first panel keeps its axis (row labels); the others are hidden.\n",
+ "    if i > 0:\n",
+ "        axes[i].axis('off')\n",
+ "\n",
+ "plt.subplots_adjust(wspace=0, hspace=0)\n",
+ "plt.savefig('replication_ratio.unfiltered_results.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":62: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_xticklabels([\"\"]+col_labels)\n",
+ ":63: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels([\"\"]+row_labels)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# One single-column heatmap per discovery cell type showing the unfiltered rb values (0-1),\n",
+ "# coloured white -> that cell type's colour and annotated with the rb / N labels from unanno_df.\n",
+ "matplotlib.rcParams.update({'font.size': 14})\n",
+ "fig, axes = plt.subplots(1, 6, figsize=(7, 7), sharey=True)\n",
+ "# Rows are drawn in reversed order, so the labels are reversed once here.\n",
+ "row_labels = list(rb_df.index)[::-1]\n",
+ "for i, discovery_celltype in enumerate(['CD4T', 'CD8T', 'monocyte', 'DC', 'NK', 'B']):\n",
+ "    colors = [\"white\", color_dict[discovery_celltype]]\n",
+ "    cmap1 = LinearSegmentedColormap.from_list(\"mycmap\", colors)\n",
+ "    # Values and annotation text are flipped together so each label stays on its own row;\n",
+ "    # previously only the values were flipped.\n",
+ "    rb_column = np.flip(unrb_df[discovery_celltype].values.reshape((-1, 1)), axis=0)\n",
+ "    anno_column = np.flip(unanno_df[discovery_celltype].values.reshape((-1, 1)), axis=0)\n",
+ "    im1, bar = heatmap(rb_column,\n",
+ "                       row_labels,\n",
+ "                       [discovery_celltype],\n",
+ "                       cmap=cmap1, ax=axes[i], vmin=0, vmax=1)\n",
+ "    # The per-panel colorbar is dropped.\n",
+ "    bar.remove()\n",
+ "    _ = annotate_heatmap(im1,\n",
+ "                         data=anno_column,\n",
+ "                         valfmt=\"{x:^}\",\n",
+ "                         textcolors=(\"white\", \"white\"),\n",
+ "                         threshold=1)\n",
+ "    # Only the first panel keeps its axis (row labels); the others are hidden.\n",
+ "    if i > 0:\n",
+ "        axes[i].axis('off')\n",
+ "\n",
+ "plt.subplots_adjust(wspace=0, hspace=0)\n",
+ "plt.savefig('rb_values.unfiltered_results.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## BIOS replication"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bios_replication_filtered_df = pd.read_csv(\n",
+ " workdir/'bios/onlyRNAAlignMetrics_rmLLD/filtered_results/replication_summary.csv', \n",
+ " index_col=0\n",
+ ").set_index('celltype')\n",
+ "bios_replication_unfiltered_df = pd.read_csv(\n",
+ " workdir/'bios/onlyRNAAlignMetrics_rmLLD/unfiltered_results/replication_summary.csv', \n",
+ " index_col=0\n",
+ ").set_index('celltype')\n",
+ "color_dict = {'CD4T': '#2E9D33',\n",
+ " 'CD8T': 'darkgreen',\n",
+ " 'monocyte': '#EDBA1B',\n",
+ " 'NK': '#E64B50',\n",
+ " 'DC': '#965EC8',\n",
+ " 'B': '#009DDB',\n",
+ " 'cMono': 'peru',\n",
+ " 'ncMono': 'y',\n",
+ " 'CD4T_individual_100': '#2E9D33',\n",
+ " 'CD4T_individual_50': '#2E9D33',\n",
+ " 'CD4T_50': '#2E9D33',\n",
+ " 'CD4T_150': '#2E9D33',\n",
+ " 'CD4T_250': '#2E9D33'}\n",
+ "\n",
+ "bios_replication_filtered_df['color'] = [color_dict.get(celltype) for celltype in \n",
+ " bios_replication_filtered_df.index]\n",
+ "bios_replication_unfiltered_df['color'] = [color_dict.get(celltype) for celltype in \n",
+ " bios_replication_unfiltered_df.index]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Export the BIOS replication summaries to Excel without the 'B' row and without the\n",
+ "# plotting-only 'color' column.\n",
+ "bios_replication_filtered_df_clean = bios_replication_filtered_df.drop(index=['B'], columns=['color'])\n",
+ "bios_replication_filtered_df_clean.to_excel(\n",
+ "    workdir/'output/summary/rb_values_bios_replication_filtered_results.xlsx'\n",
+ ")\n",
+ "\n",
+ "bios_replication_unfiltered_df_clean = bios_replication_unfiltered_df.drop(index=['B'], columns=['color'])\n",
+ "bios_replication_unfiltered_df_clean.to_excel(\n",
+ "    workdir/'output/summary/rb_values_bios_replication_unfiltered_results.xlsx'\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":3: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string \".\" (-> marker='.'). The keyword argument will take precedence.\n",
+ " ax2.errorbar(y=bios_replication_filtered_df.loc[sorted_celltypes]['r'].values,\n",
+ ":8: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string \".\" (-> marker='.'). The keyword argument will take precedence.\n",
+ " ax2.errorbar(y=bios_replication_unfiltered_df.loc[sorted_celltypes]['r'].values,\n",
+ ":12: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax2.set_xticklabels(['', 'CD4T', '', 'CD8T', '', 'monocyte', '', 'DC', '', 'NK'])\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# rb estimate (+/- SE) per cell type: filtered (filled markers) vs unfiltered (open markers).\n",
+ "sorted_celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'NK']\n",
+ "x_positions = list(range(len(sorted_celltypes)))\n",
+ "fig, ax2 = plt.subplots()\n",
+ "# fmt='o' already selects the circle marker with no connecting line; passing marker='o'\n",
+ "# next to fmt='.' made matplotlib warn about a redundantly defined marker.\n",
+ "ax2.errorbar(x=x_positions,\n",
+ "             y=bios_replication_filtered_df.loc[sorted_celltypes]['r'].values,\n",
+ "             yerr=bios_replication_filtered_df.loc[sorted_celltypes]['se_r'].values,\n",
+ "             fmt='o', markersize=6, color='black', label='Filtered results')\n",
+ "# NOTE: in-place edit of bios_replication_unfiltered_df -- the 'DC' row is set to NaN\n",
+ "# (and created if it does not exist yet), so no unfiltered DC point is drawn.\n",
+ "bios_replication_unfiltered_df.loc['DC'] = [np.nan, np.nan, np.nan, np.nan]\n",
+ "# The unfiltered series is shifted by 0.05 so the two error bars do not overlap.\n",
+ "ax2.errorbar(x=[pos + 0.05 for pos in x_positions],\n",
+ "             y=bios_replication_unfiltered_df.loc[sorted_celltypes]['r'].values,\n",
+ "             yerr=bios_replication_unfiltered_df.loc[sorted_celltypes]['se_r'].values,\n",
+ "             fmt='o', markersize=6, markerfacecolor='white', color='black', label='Unfiltered results')\n",
+ "# Pin the tick positions before labelling them; labelling auto-placed ticks raised the\n",
+ "# 'FixedFormatter should only be used together with FixedLocator' warning.\n",
+ "ax2.set_xticks(x_positions)\n",
+ "ax2.set_xticklabels(sorted_celltypes)\n",
+ "plt.legend()\n",
+ "plt.ylabel(\"rb (SE)\")\n",
+ "plt.savefig('sf20.comparison_rb_values_bios_replication.pdf')\n",
+ "plt.savefig('sf20.comparison_rb_values_bios_replication.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":14: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string \".\" (-> marker='.'). The keyword argument will take precedence.\n",
+ " ax.errorbar(y=bios_replication_filtered_df.loc[celltype]['r'],\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# One narrow panel per cell type showing the filtered rb estimate (+/- SE);\n",
+ "# each panel's frame is drawn in that cell type's colour.\n",
+ "sorted_celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'NK']\n",
+ "fig, axes = plt.subplots(1, len(sorted_celltypes), figsize=(4, 5), sharey=True)\n",
+ "for ind, celltype in enumerate(sorted_celltypes):\n",
+ "    ax = axes[ind]\n",
+ "    filtered_row = bios_replication_filtered_df.loc[celltype]\n",
+ "    # fmt='o' selects the circle marker; combining fmt='.' with marker='o' triggered a\n",
+ "    # matplotlib UserWarning about a redundantly defined marker.\n",
+ "    ax.errorbar(y=filtered_row['r'],\n",
+ "                x=[0.4],\n",
+ "                yerr=filtered_row['se_r'],\n",
+ "                fmt='o', markersize=6, ecolor='black',\n",
+ "                markeredgecolor='black', markerfacecolor='black')\n",
+ "    ax.set_xlim([0, 1])\n",
+ "    for spine_name in ('bottom', 'top', 'right', 'left'):\n",
+ "        ax.spines[spine_name].set_color(filtered_row['color'])\n",
+ "    # The x axis carries no scale here; the cell type name is the only x annotation.\n",
+ "    ax.set_xticklabels([])\n",
+ "    ax.set_xlabel(celltype)\n",
+ "\n",
+ "plt.savefig('bios_replication.filtered_results.pdf')\n",
+ "plt.savefig('bios_replication.filtered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":14: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string \".\" (-> marker='.'). The keyword argument will take precedence.\n",
+ " ax.errorbar(y=bios_replication_filtered_df.loc[celltype]['r'],\n",
+ ":20: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string \".\" (-> marker='.'). The keyword argument will take precedence.\n",
+ " ax.errorbar(y=bios_replication_unfiltered_df.loc[celltype]['r'],\n",
+ "/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/envs/scpy3.8/lib/python3.8/site-packages/numpy/core/_asarray.py:102: UserWarning: Warning: converting a masked element to nan.\n",
+ " return array(a, dtype, copy=False, order=order)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAtUAAAFtCAYAAADIwpbuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAmbklEQVR4nO3dfZhcZX3/8ffXJARTfGjZpCRVSEWtwaAi0Vq64KaaSimtv2rFpls1ujYVKNpy+VC7Xlatq60PVH5YlKUroXVNoe3P+lxBmhBXREzUYjCVohB8wJhFRSAQQvz+/jizZXezm+zmnLOzM/t+XddcydznYb/ZO2fmM/fcc09kJpIkSZIO38OaXYAkSZLU6gzVkiRJUkmGakmSJKkkQ7UkSZJUkqFakiRJKslQLUmSJJU0v9kFlPXswefksqOWNbuMOesbd+4Y/q9XbF1cxbk6/rwjlx+9vIpT6TBs27ltOC/NSvryx5tPzIcd+dgqTqVp2n/3jcO/sOa7lfQjwGXnbstHdCys6nSaht233Tt89uW/Wklf/uAPX5rzlyyp4lQ6DPtu+dbw0k9+tJrnysFv5/KjFlRxKh2GbXfuHc5XPH7Cvmz5UL3sqGVsfP4/NbuMOeupH1q1s6pzLT96OVvftLWq02ma4o+jsr582JGP5VHP+nRVp9M0/Ojqx1TWjwCP6FjI7791ZZWn1BR94GVfqqwv5y9ZQsf73lPV6TRNd5z5e9U9Vx61gK3Pd9CiWeJDt0zal07/kCRJkkoyVEuSJEklGaolSZKkkgzVkiRJUkmGakmSJKkkQ7UkSZJUUi2hOiJOi4iPR8T3IiIjYt0UjjkxIq6NiPsax705IqKO+iRJkqQq1TVSfRSwHXgNcN+hdo6IRwJXA7uAZwCvBl4HnF9TfZIkSVJlavnyl8z8NPBpgIjYMIVDuoFFwMsy8z5ge0SsAM6PiAsyM+uoU5IkSarCbJlT/WvA5xuBesRngWXA8qZUJEmSJE3RbAnVx1BM/Rht16htY0TE+ojYGhFbdw/vrr041WdMX+62L1uZ12X78LpsD/Zj+/DxtTXMllANMH6KR0zSTmb2Z+aqzFy1uGNx/ZWpNmP6crF92cq8LtuH12V7sB/bh4+vrWG2hOofcOCI9JLGn+NHsCVJkjRLdHV10dXV1ewymm62hOovAqdGxJGj2tYA3wdua0pFkiRJ0hTVtU71URHxtIh4WuNnHNu4f2xj+zsj4ppRh3wE2ANsiIiVEfEC4C8AV/6QJEnSrFfXSPUq4KuN28OBtzb+/rbG9qXA8SM7Z+ZdFCPTy4CtwN8D7wUuqKk+SZIkqTJ1rVO9mYc+aDjR9nUTtH0dOK2OeiRJkqQ6zZY51ZIkSVLLMlRLkiRJJRmqJUmSpJIM1ZIkSVJJhmpJkiSpJEO1JEmSVJKhWpIkSSrJUC1JkiSVZKiWJEmSSjJUS5IkSSUZqiVJkqSSDNWSJElSSYZqSZIkqSRDtSRJklSSoVqSJEkqyVAtSZIklWSoliRJkkoyVEuSJEklGaolSZKkkgzVkiRJUkmGakmSJKkkQ7UkSZJUkqFakiRJKslQLUmSJJVkqJYkSZJKMlRLkiRJJRmqJUmSpJIM1ZIkSVJJhmpJkiSpJEO1JEmSVFJtoToizomIWyPi/ojYFhGnHmL/50XEFyPi7ogYjoiPRcQT66pPkiRJqkotoToiXgxcCLwDOAm4DvhMRBw7yf6/DHwM+Hxj/+cCDwc+XUd9kiRJUpXqGqk+H9iQmZdm5o7MPA+4Azh7kv1PBhYAb8zMWzLza8A7geMjoqOmGiVJkqRKVB6qI+IIipB81bhNVwGnTHLYVmAf8MqImBcRjwBeBnw5M4errlGSJEmqUh0j1R3APGDXuPZdwDETHZCZtwFrgLcCe4G7gBOBMyfaPyLWR8TWiNi6e3h3RWWrGcb05W77spV5XbYPr8v2YD+2Dx9fW0Odq3/kuPsxQVuxIeIYYAD4
R+AZQBdwN3BlRBxQY2b2Z+aqzFy1uGNxpUVrZo3py8X2ZSvzumwfXpftwX5sHz6+tob5NZxzGNjPgaPSSzhw9HrEucC9mfn6kYaI+CPgOxRTRoZqqFOSJEmqROUj1Zn5ALCNYjrHaGsoVgGZyCKKID7ayH3X0pYkSdKsVldgvQBYFxGvjIgVEXEhsAz4IEBEvDMirhm1/6eAp0fEX0XEEyLi6cBlFCPV22qqUZIkSapEHdM/yMwrIuJo4E3AUmA7cEZm7mzsshQ4ftT+/xkRfwi8HngdcB9wPXB6Zt5bR42SJElSVWoJ1QCZeTFw8STb1k3Q9s/AP9dVjyRJklQX5ytLkiRJJRmqJUmSpJIM1ZIkSVJJhmpJkiSpJEO1JEmSVJKhWpIkSSrJUC1JkiSVZKiWJEmSSjJUSwfR1dVFV1dXs8uQJEmznKFakiRJKslQLUmSJJVkqJYkSZJKMlRLkiRJJRmqJUmSpJIM1ZIkSVJJhmpJkiSpJEO1JEmSVJKhWpIkSSrJUC1JkiSVZKiWJEmSSjJU16Crq4uurq5mlyFJkqQZYqiWJEmSSjJUS5IkSSUZqiVJkqSSDNWSJElSSYZqSZIkqSRDtSRJklSSoVqSJEkqyVAtSZIklWSortjg4CDXX3891157LcuXL2dwcLDZJUmSJKlmtYXqiDgnIm6NiPsjYltEnHqI/SMi/iwi/jsi9kbEHRHxN3XVV4fBwUHWr1/P3r17Adi5cyfr1683WEuSJLW5WkJ1RLwYuBB4B3AScB3wmYg49iCHvRc4B3gDsAI4A9hSR3116e3tZc+ePWPa9uzZQ29vb5MqkiRJ0kyYX9N5zwc2ZOaljfvnRcTpwNnAG8fvHBG/ApwHPCUzd4za9NWa6qvF7bffPq12SZLmsq6uLgA2b97c1DqkKlQ+Uh0RRwAnA1eN23QVcMokhz0f+DZwekR8OyJui4jLI2LJJD9jfURsjYitu4d3V1Z7WcceO/FA/GTtGteXu2dPX2r6Zut1qenzumwP9mP78PG1NdQx/aMDmAfsGte+CzhmkmMeBxwH/AGwDngJ8CTgExFxQI2Z2Z+ZqzJz1eKOxVXVXVpfXx+LFi0a07Zo0SL6+vqaVNHsN6YvF8+evtT0zdbrUtPnddke7Mf24eNra6hz9Y8cdz8maBtdx0LgJZm5JTM/TxGsnwk8o74Sq9Xd3U1/fz8LFy4E4LjjjqO/v5/u7u4mV6bD4UoukiRpquqYUz0M7OfAUeklHDh6PeIO4MHMvHlU2/8ADwLHAl+qusi6dHd3c+mlxVRy54i1rslWcgF8kSRJkg5Q+Uh1Zj4AbAPWjNu0hmIVkIl8AZgfEcePanscRejfWXWN0qG4koskSZqOuqZ/XACsi4hXRsSKiLgQWAZ8ECAi3hkR14za/3PAV4APRcRJEXES8CGKEeqtNdUoTcqVXCRJ0nTUEqoz8wrgz4A3AV8DOoEzMnNk1HkpcPyo/X8GnAn8kGJt6s8C3wWe39gmzShXcpEkSdNR2wcVM/PizFyemQsz8+TM3DJq27rMXD5u/zsy80WZ+YjMXJKZ3Zk52RxsqVau5CJJkqajztU/pJblSi6SJGk66vpGRanluZKLJEmaKkN1DQxgkiRJc4vTPyRJkqSSDNWSJGnG+a21ajeGakmSNKMm+9Zag7VamaFakiTNKL+1Vu3IUC1JkmaU31qrdmSoliRJM8pvrVU7MlRLkqQZ5bfWqh0ZqiVJ0ozyW2vVjvzyF0mSNOP81lq1G0eqJUmSpJIM1ZIkSVJJhmpJkiSpJEO1JEmSVJKhWpIkSSrJUC1JkiSVZKiWJEmSSjJUS5IkSSUZqiVJkqSSDNWS5oSuri66urqaXYYkqU0ZqiVJkqSS5je7AEmSNDdt3ry52SVIlXGkWpIkSSrJUC1JkiSVZKiW1PYGBwe5/vrrufbaa1m+fDmDg4PNLkmS1GYM1ZLa2uDgIOvXr2fv3r0A7Ny5k/Xr1xusJUmVMlRLB7F5
82Y/SNPient72bNnz5i2PXv20Nvb26SKJEntqLZQHRHnRMStEXF/RGyLiFOneNwTIuLuiLinrtokzR233377tNolSToctYTqiHgxcCHwDuAk4DrgMxFx7CGOOwL4Z2BLHXVJmnuOPXbih53J2iVJOhx1jVSfD2zIzEszc0dmngfcAZx9iOP+FrgR+Jea6pI0x/T19bFo0aIxbYsWLaKvr69JFUmS2lHlobox2nwycNW4TVcBpxzkuN8GzgReXXVNkuau7u5u+vv7WbhwIQDHHXcc/f39dHd3N7kySVI7qWOkugOYB+wa174LOGaiAyJiKXAp8JLMvPtQPyAi1kfE1ojYunt4d9l61URj+nK3fdnKZvN12d3dzbOe9Sye/exnc9tttxmoD8Hrsj3Yj+1jNj++6iF1rv6R4+7HBG0jPgx8IDOvn9KJM/szc1VmrlrcsbhMjWqyMX252L5sZV6X7cPrsj3Yj+3Dx9fWUEeoHgb2c+Co9BIOHL0e8RvAX0XEgxHxIDAA/Fzj/voaapQkSZIqM7/qE2bmAxGxDVjD2A8crgH+bZLDThx3//lAL/BM4HtV1yhJal1dXV0AriEvaVapPFQ3XAD8U0TcAHwBeBWwDPggQES8E3hmZj4HIDO3jz44IlYBPxvfLkmSJM1GtYTqzLwiIo4G3gQsBbYDZ2TmzsYuS4Hj6/jZkjQRRzUlSXWqa6SazLwYuHiSbesOcewGYEPlRUmSJEk1qHP1D0mSJGlOMFRLkiRJJRmqJUmSpJIM1ZIkSTosg4ODXH/99Vx77bUsX76cwcHBZpfUNIZqSZIkTdvg4CDr169n7969AOzcuZP169fP2WBtqJYkSdK09fb2smfPnjFte/bsobe3t0kVNZehWpIkSdN2++23T6u93RmqJUmSNG3HHnvstNrbnaFakiRJ09bX18eiRYvGtC1atIi+vr4mVdRchmpJmqauri66urqaXYYkNVV3dzf9/f0sXLgQgOOOO47+/n66u7ubXFlz1PY15ZIkSWpv3d3dXHrppQBs3ry5ucU0mSPVkqSW4Zq4kmYrQ7UkqSW4Jq6k2cxQLUlqCa6JK2k2M1RL0jRs3LiRm266iS1btrBy5Uo2btzY7JLmDNfElTSbGaolaYo2btxIb28vV155JXv37uWiiy6it7fXYD1DXBNX0mxmqJakKerr62NgYIDVq1ezYMECVq9ezcDAwJxdk3WmuSaupNnMUC1JU7Rjxw46OzvHtHV2drJjx44mVTS3uCaupNnMdaolaYpWrFjB0NAQq1ev/t+2oaEhVqxY0cSq5hbXxJU0WzlSLUlT1NvbS09PD5s2bWLfvn1s2rSJnp4eV5+QJDlSLUlTtXbtWgDOOuss7rzzTk444QT6+vr+t12SNHcZqiVpGtauXcsll1wCOP1AkvQQp39IkiRJJRmqJUmSpJKc/iFJ0+S0D0nSeI5US5IkSSUZqiVJkqSSDNWSJElSSYZqSZIkqSRDtSRJklRSbaE6Is6JiFsj4v6I2BYRpx5k366I+FhE3BEReyLixoh4RV21SZIkSVWqZUm9iHgxcCFwDjDU+PMzEXFCZt4+wSGnAF8H3gXcATwP6I+I+zPzI3XUKElqTS5pKGk2qmud6vOBDZl5aeP+eRFxOnA28MbxO2fmO8Y1fSAiVgMvBAzVkiRJmtUqn/4REUcAJwNXjdt0FcWI9FQ9EvhxVXVJkiRJdaljTnUHMA/YNa59F3DMVE4QEWcCzwH6J9m+PiK2RsTW3cO7y9SqJhvTl7vty1bmddk+vC7bg/3YPnx8bQ11rv6R4+7HBG0HiIhfp5jy8erMvGHCE2f2Z+aqzFy1uGNx+UrVNGP6crF92cq8LtuH12V7sB/bh4+vraGOUD0M7OfAUeklHDh6PUZEdAKfAd6cmR+ooTZJkiSpcpWH6sx8ANgGrBm3aQ1w3WTHRcRpFIH6rZn5vqrrkiRJkupS1+ofFwD/FBE3AF8AXgUsAz4IEBHvBJ6Z
mc9p3O8CPgVcDAxGxMgo9/7MdPKQJEmSZrVaQnVmXhERRwNvApYC24EzMnNnY5elwPGjDlkHLAJe27iN2Aksr6NGSZIkqSp1jVSTmRdTjDxPtG3dBPfXTbSvJEmSNNvVufqHJEmSNCcYqiVJkqSSDNWSJElSSYZqSZIkqSRDtSRJklSSoVqSJEkqyVAtSZIklWSoliRJkkoyVEuSJEklGaolSZKkkgzVkiRJUkmGakmSJKkkQ7U0Q7q6uujq6mp2GZIkqQaGakmSJM0arToIZaiWJEmSSjJUS5IkSSUZqqUZsHHjRm666Sa2bNnCypUr2bhxY7NLkiRJFZrf7AKkdrdx40Z6e3u58sor6ezsZGhoiJ6eHgDWrl3b5OokSVIVHKmWatbX18fAwACrV69mwYIFrF69moGBAfr6+ppdmiS1jVb9cJvah6FaqtmOHTvo7Owc09bZ2cmOHTuaVJEkSaqaoVqq2YoVKxgaGhrTNjQ0xIoVK5pUkSRJqpqhWqpZb28vPT09bNq0iX379rFp0yZ6enro7e1tdmmSJKkiflBRqtnIhxHPOuss7rzzTk444QT6+vr8kKIkSW3EUC3NgLVr13LJJZcAsHnz5uYWI0mSKuf0D0mSJKkkQ7U0QzZv3uwotSTVwC/Y0mzg9A9JktSy/IItzRaOVEuSpJblF2xptjBUS5KkluUXbGm2qC1UR8Q5EXFrRNwfEdsi4tRD7H9iRFwbEfdFxPci4s0REXXVJ0mSWp9fsKXZopZQHREvBi4E3gGcBFwHfCYijp1k/0cCVwO7gGcArwZeB5xfR32SJKk9+AVbmi3q+qDi+cCGzLy0cf+8iDgdOBt44wT7dwOLgJdl5n3A9ohYAZwfERdkZtZUZ0vo6uoCXN9YkqTx/IItzRaVh+qIOAI4GXjPuE1XAadMctivAZ9vBOoRnwX+GlgO3FpxmZIkqU34BVuaDeqY/tEBzKOYyjHaLuCYSY45ZpL9R7aNERHrI2JrRGzdPby7TK2zXruvvTmmL3e3d1+2u7l0XbY7r8v2YD+2Dx9fW0Odq3+Mn7IRE7Qdav+J2snM/sxclZmrFncsLlHi7DZ67c29e/dy0UUX0dvb21bBekxfLm7fvpwL5sp1ORd4XbYH+7F9+PjaGuoI1cPAfg4cYV7CgaPRI34wyf4c5Ji259qbkiRJraHyUJ2ZDwDbgDXjNq2hWAVkIl8ETo2II8ft/33gtqprbBWuvSlJktQa6pr+cQGwLiJeGRErIuJCYBnwQYCIeGdEXDNq/48Ae4ANEbEyIl4A/AUwp1f+cO1NSZKk1lDLknqZeUVEHA28CVgKbAfOyMydjV2WAseP2v+uiFgD/D2wFfgx8F6KcD5njay9OTAwQGdnJ0NDQ/T09Dj9Q5KkcVz1Q81W1zrVZObFwMWTbFs3QdvXgdPqqqcVufamJElSa6gtVKsaa9euNURLkiTNcnUuqSdJkiTNCYZqSZIkqSRDtSRJklSSoVqSJEkqyVAtSZIklWSoliRJkkoyVEuSJEklGaolSZKkkgzVkiRJUkmGakmSJKkkQ7UkSZJUkqFakiRJKslQLUmSJJVkqJYkSZJKMlRLkiRJJRmqJUmSpJIM1ZIkSVJJhmpJkiSpJEO1JEmSVJKhWpIkSSrJUC1JkiSVZKiWJEmSSjJUS5IkSSUZqiVJkqSSDNWSJElSSYZqSZIkqSRDtSRJklSSoVqSJEmzwsaNG7npppvYsmULK1euZOPGjc0uacrmN7sASZIkaePGjfT29nLllVfS2dnJ0NAQPT09AKxdu7bJ1R1a5SPVEbEwIi6KiOGIuDciPh4RjznEMX8cEZ+PiB9FxE8iYlNEdFZdmyRJkmanvr4+BgYGWL16NQsWLGD16tUMDAzQ19fX7NKmpI7pH+8DXgisBU4FHgl8MiLmHeSYLuAK4DnArwLfBD4bEU+ooT5JkiTNMjt27KCzc+yYamdnJzt27GhSRdNTaaiOiEcBPcDrMvPqzPwK
8BLgKcBzJzsuM7sz8/2Z+dXM/CZwNnA3cHqV9UmSJGl2WrFiBUNDQ2PahoaGWLFiRZMqmp6qR6pPBhYAV400ZOZ3gB3AKdM4zxHAkcCPK61OkiRJs1Jvby89PT1s2rSJffv2sWnTJnp6eujt7W12aVNSdag+BtgPDI9r39XYNlVvB+4BPj7RxohYHxFbI2Lr7uHdh1WoZocxfbnbvmxlXpftw+uyPdiP7WOuPL6uXbuWvr4+zjrrLBYuXMh5551HX19fS3xIEaYYqiPi7RGRh7h1HewUQE7xZ70G+BPgBZn504n2ycz+zFyVmasWdyyeymk1S43py8X2ZSvzumwfXpftwX5sH3Pp8XXt2rU8+clP5rTTTmP79u0tE6hh6kvqvQ/48CH2uR14FjAP6ABGv5RaAmw51A9pBOq3A7+VmTdMsTZJkiSpqaYUqjNzmAOndBwgIrYB+4A1wEcabY8BVgDXHeLY84G3AWdk5tDB9pUkSZJmk0q//CUz74qIAeDdEfFD4E7gAuBG4HMj+0XENcANmfnGxv3XAX3AHwE3R8TI/Ov7MvOuKmuUJEmSqlbHNyr+OfAgxbrTDweuAV6amftH7XM88J1R98+lWDXkinHnuhxYV0ONkiRJUmUqD9WZeT9wXuM22T7LD3ZfkiRJaiV1fKOiJEmSNKcYqiVJkqSSDNWSJElSSYZqSZIkqSRDtSRJklSSoVqSJEkqyVAtSZIklWSoliRJkkoyVEuSJEklGaolSZKkkgzVkiRJUkmGakmSJKkkQ7UkSZJUkqFakiRJKslQLUmSJJVkqJYkSZJKMlRLkiRJJc1vdgGSJElqXZs3b252CbOCI9WSJElSSYZqSZIkqSRDtSRJklSSoVqSJEkqyVAtSZIklWSoliRJkkoyVEuSJEklGaolSZKkkgzVkiRJUkmGakmSJKkkQ7UkSZJUUuWhOiIWRsRFETEcEfdGxMcj4jHTOH5tRGREfLLq2iRJkqQ61DFS/T7ghcBa4FTgkcAnI2LeoQ6MiMcB7wY+X0NdkiRJUi0qDdUR8SigB3hdZl6dmV8BXgI8BXjuIY5dAGwEeoFvV1mXJEmSVKeqR6pPBhYAV400ZOZ3gB3AKYc4tg+4LTMvr7gmSZIkqVZVh+pjgP3A8Lj2XY1tE4qI3wReDLxqKj8kItZHxNaI2Lp7ePfh1qpZYExf7rYvW5nXZfvwumwP9mP78PG1NUwpVEfE2xsfHjzYretgpwByknN3ABuAl2Xmj6dST2b2Z+aqzFy1uGPxVA7RLDWmLxfbl63M67J9eF22B/uxffj42hrmT3G/9wEfPsQ+twPPAuYBHcDol1JLgC2THLcSWAp8LiJG2h4GEBEPAk/OzG9OsU5JkiRpxk0pVGfmMAdO6ThARGwD9gFrgI802h4DrACum+SwLwMnjmt7O/DzwLnArVOpUZIkSWqWqY5UT0lm3hURA8C7I+KHwJ3ABcCNwOdG9ouIa4AbMvONmXkvsH30eSLiJ8D8zBzTLkmSJM1GlYbqhj8HHgSuAB4OXAO8NDP3j9rneOA7NfxsSZIkacZVHqoz837gvMZtsn2WH+Ic66qtSpIkSapPHd+oKEmSJM0phmpJkiSpJEO1JEmSVJKhWpIkSSrJUC1JkiSVZKiWJEmSSjJUS5IkSSUZqiVJkqSS6vhGRUmSJOmwbN68udklHBZHqiVJkqSSDNWSJElSSYZqSZIkqSRDtSRJklSSoVqSJEkqyVAtSZIklWSoliRJkkoyVEuSJEklGaolSZKkkgzVkiRJUkmGakmSJKkkQ7UkSZJUkqFakiRJKslQLUmSJJU0v9kFlLX9jpvueeqHVn2z2XUcrn137etY8KgFw82u43D97IGf/UpV59r2rW33xB9Hy/Yle+hgES3blzxIZX157/B/3bP/6se0bF/+8Ef7O5b8wryW7Mv792Zl/QjwvVt+dM8HXvalluzLn953Z8cjH350S/YjwAMP7q2sL+/+5s337Dvz91qy
HwF+uPf+jiULj2zZvrx///7qnivvuOue+NDelu1L7trdwaMWt2xf8sD9k/ZlZOZMllK5iNiamauaXcfhsv56ztUM1l/PuZqhleuvunZ/F83jNfkQ66/nXM3QzvU7/UOSJEkqyVAtSZIkldQOobq/2QWUZP31nKsZrL+eczVDK9dfde3+LprHa/Ih1l/PuZqhbetv+TnVkiRJUrO1w0i1JEmS1FSGakmSJKkkQ7UkSZJUkqFakiRJKslQLUmSJJVkqJYkSZJKMlSXFBG/GBEXRsS3ImJvRHwvIj4TEWc0tt8WEdm43R8R34mIj0bE7xzknEdGxH81jlnVaHvLqPNMdls+Q//stlRlX0bEEyPi3yNiOCLujojrI+L0xjb7co6JiHURcU+z62gnEbFh1PWyLyJ+GBGbIuLciFgwbt/jI2Kgcc3ubVzL/xoRpzSrfo01qj/fNK69q9HeERHLRz8vNrYvioj/iIhbI+IJM1+5pmLc9ZqN58ZPRsSTml1blQzVJTSCz1eA5wFvBJ4CPBf4FPDBUbu+DVgKPBH4A+A24KMRcdEkp34P8N0J2paOun0TeO+4tu+U/CfNWTX05SeBI4HnACcBQ8DHIuJ47EupKp+juF6WA78JfAJ4K/D5iPg5gEYA+wrwZOAc4ATgd4FtwGSPwWqO+4HXR8TiqewcET9P8X/gl4Bfz8z/qbM4lTZyvS6luF4fDny0qRVVLTO9HeYN+DTwfeCoCbb9fOPP24DXTrB9PZDA6nHtzwduAlY0tq+a5GdvB97S7N9Bu9yq7EugY3zfAvOB/cDv25cH7YfNwAcoXmT8CNgNvAZYCPw98BPgduAlo445keLB+r7GMRuAR43avoHiRc5rgO8BPwYuAxaN2mch8D5gF8UT+/VA57jangR8HLgLuAf4YuNnnwbsA44Zt38fcCPQ1fj/MPr2lsY+RwB/S/Ei+l7gy8Dzmt0PrXAb6dcJ2lcCD1CE62hcX18F5k2w76Ob/e/wNqY/P924Zv7vqPaR66eD4sVTAquAZcDXgetGHqO9zd7bRNcrcGajPx/e7PqqujlSfZgi4heA04H3Z+YBb+tm5o8PcYoBiif3F44652MoAkU3RUDQDKihL+8EdgAviYijImIeRfC+G/hCZYW3r26K39WvAn9DEXb/HbiZ4sn0cuAfImJZRCwC/oMi5D4T+D3gFOBD4855KkXYei7w4sZ+rxm1/V2N9ldQvLPwdeA/ImIpQEQso3i3IYE1wNMpQv68zNwCfAt46cjJIuJhjfsDFE/6fwbs4aFRmvc0dr0MeDbwhxQB/XLgExHx1On9yjQiM7dT/J94IfA0ihHqd2fm/gn2/cmMFqdD+RnwF8CrGu/qTebxFI+l3wWeO4XHaM0yEfEIisfcr2dm2+QdQ/XhezzFKMiOwzm48QB/M/A4gEbwGgTem5lfq6hGTU2lfZnFS/A1FCHup8Be4C3Ab2XmHRXU2+5uysy3ZPFW7gXAMLAvMy/MzFsopuAERXjuBo6iGLn+emZeS/EC5gUR8fhR5/wpcHZm7sjMq4B/oZiaQ2OawNnAGzLzU5m5A3gVxaj1uY3jz6UYSX5RZt6QmTdn5odHXav/ALx81M97HrAE+HBmPkAxup2Z+YPG7Z5GaFgLnJWZWzLz25n5forRuj+p5Dc5d32D4nocmWN7WNe2Zl5mfpoiMPcdZLfLgTuA383MPTNSmKpwekTc0/h8yU95aEChbRiqD19UdI5s/P0vKd5CvqCC82p6Ku3LiAjgYooR61MpRlD/Ffi3iPilCn5Wu7tx5C+NFyg/pBg5HmnbR/HOwBKKaVI3Zubdo46/jmLE64RRbd/IzAdH3f9+43iA44EFjHoXofFC6YujznESMNQIyBO5HHjcqA++vQL498y88yD/zqdT/L/5xsgTTePJ5rcbNenwjVyPVVzbmnmvB140+gOJ43yM4nH1D2auJFVgC8W7R0+jeCfyP4GrIuKxTaypUobqw/c/FA/aKw7n
4MbI9BOBbzeangOsBvZFxIPALY326yNisGStOriq+/I3gN8B1mbmFzLzK5l5DsVI58snOY0esm/c/Zyk7WGMfWE63uj2yY6Hh4LXROfJcftM/IMyd1PMt35FRBxN8UG4gYMd0/j5CTyDh55onkbx//AVhzhWB3cCxfV4c+P+YV3bao7M/DLwbxSfN5jIuygGojZExLqZqkul7cnMWxq3G4Ae4JEU7y62BUP1YcrMHwGfBf40Io4avz0iHn2IU7wSeDTFCCYUYeupPPTEekajvRt4Q9l6Nbka+nJR48+fjdvvZ3jNVe0bwFMb8/NGnELxe57qW/63UHywrXOkofFC6dca54di9YjOiDjiIOe5FDiLYurGLooPT454AJg3bv+vUoT1Y0Y90YzcvjfF2jVORKyk+IzEvwJfo+jD1zX6dPy+j57R4jQdf0nxTt/pE23MzHcBrwUGIuKVM1mYKpMUz4uLDrVjq/AJvpxzKJ4Ut0bEiyLiVyLiSRFxNqPewgYeERHHRMRjI+KUiPg7ig85vb8xB5TMvDUzt4/ceGiE5VuZOX55PVWvsr6kmDbwI+CyiHhqY83qd1PM8fzkDP6b5oJBincA/jEiToyI04BLgP/XmH99SJl5L8UHhP8mIs6IiBWN+79IMY2Hxp9HAVdGxDMi4vERsTYinjbqVFdTTPn5K+CyzBz9ouo24MiIWNNYb3dRZt7cqH9DRPx+RDwuIlZFxGsj4gWH9+uYcxY2rsdljWvtfIoVZLYB72lMH3o5xXSaL0TEmY01q0+MiNcz9oWPZpHG9dvP2A8Uj9/n7xrbL4kIP4cw+41cr8c0Hmcvonhc/UST66qMobqEzLyVYl7k1RRvU91IMUfodxn7QaM3U3yo4hbgSuCXgRdk5nkzWrAmVWVfZuYwxejKUY1zbKVYdu3/ZOZXav/HzCGNDyk9j+ItxBso5lp+kelPn3gDRX9eRjG6+RTg9JEPljZGjk+jWAJvE8Uo83nA/87TbgS4yyjmZ182rs7rKNY730ixTODrG5te3tj3XcB/U7zoOg3YOc3656rnUlyPtwPXUFyvbwVOa7xYovE288kU71x8sPHnpyjm5P5pE2rW1L2NUdfYRBof7j0XuDgizpmRqnS4Rq7XO4AvUUx9e1Fmbm5mUVWK4nlAklRWRHwAeHxmrml2LZKkmTW/2QVIUquLiEdRjIa+lGJetSRpjjFUS1J5I0t8DWTmp5pdjCRp5jn9Q5IkSSrJDypKkiRJJRmqJUmSpJIM1ZIkSVJJhmpJkiSpJEO1JEmSVJKhWpIkSSrp/wORGRA+wc0kDgAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# compare between filtered and unfiltered: one panel per cell type with the filtered rb\n",
+ "# (filled marker, x=0.4) next to the unfiltered rb (open marker, x=0.6), both +/- SE.\n",
+ "sorted_celltypes = ['CD4T', 'CD8T', 'monocyte', 'DC', 'NK', 'B']\n",
+ "fig, axes = plt.subplots(1, len(sorted_celltypes), figsize=(12, 6), sharey=True)\n",
+ "for ind, celltype in enumerate(sorted_celltypes):\n",
+ "    ax = axes[ind]\n",
+ "    filtered_row = bios_replication_filtered_df.loc[celltype]\n",
+ "    unfiltered_row = bios_replication_unfiltered_df.loc[celltype]\n",
+ "    # fmt='o' selects the circle marker; combining fmt='.' with marker='o' triggered a\n",
+ "    # matplotlib UserWarning about a redundantly defined marker.\n",
+ "    ax.errorbar(y=filtered_row['r'],\n",
+ "                x=[0.4],\n",
+ "                yerr=filtered_row['se_r'],\n",
+ "                fmt='o', markersize=6, ecolor='black',\n",
+ "                markeredgecolor='black', markerfacecolor='black')\n",
+ "    ax.errorbar(y=unfiltered_row['r'],\n",
+ "                x=[0.6],\n",
+ "                yerr=unfiltered_row['se_r'],\n",
+ "                fmt='o', markersize=6, ecolor='black',\n",
+ "                markeredgecolor='black', markerfacecolor='white')\n",
+ "    ax.set_xlim([0, 1])\n",
+ "    # The frame colour comes from the filtered table's 'color' column.\n",
+ "    for spine_name in ('bottom', 'top', 'right', 'left'):\n",
+ "        ax.spines[spine_name].set_color(filtered_row['color'])\n",
+ "    ax.set_xticklabels([])\n",
+ "    ax.set_xlabel(celltype)\n",
+ "\n",
+ "plt.savefig('bios_replication_comparison.filter_and_unfilter.pdf')\n",
+ "plt.savefig('bios_replication_comparison.filter_and_unfilter.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# compare between filtered and unfiltered\n",
+ "# One row of panels per cell type: the left panel scatters numcoeqtl_df / unnumcoeqtl_df,\n",
+ "# the right panel draws rb_df / unrb_df with rbse_df / unrbse_df as error bars.\n",
+ "celltypes = ['CD4T', 'CD8T', 'monocyte', 'B', 'NK', 'DC']\n",
+ "replication_celltypes = list(celltypes)\n",
+ "fig, axes = plt.subplots(len(celltypes), 2, figsize=(12, 12), sharex=True)\n",
+ "for i, celltype in enumerate(celltypes):\n",
+ "    ax1, ax2 = axes[i, :]\n",
+ "    for count_df in (numcoeqtl_df, unnumcoeqtl_df):\n",
+ "        ax1.scatter(x=replication_celltypes,\n",
+ "                    y=count_df[celltype].loc[replication_celltypes])\n",
+ "    for value_df, error_df, series_label in ((rb_df, rbse_df, 'filtered'),\n",
+ "                                             (unrb_df, unrbse_df, 'Unfiltered')):\n",
+ "        ax2.errorbar(x=replication_celltypes, fmt='.', markersize=12,\n",
+ "                     y=value_df[celltype].loc[replication_celltypes],\n",
+ "                     yerr=error_df[celltype].loc[replication_celltypes], label=series_label)\n",
+ "    ax1.set_ylabel(celltype)\n",
+ "# Only the last right-hand panel receives a legend.\n",
+ "ax2.legend()\n",
+ "\n",
+ "plt.savefig('celltype_rb.comparison_filtered_unfiltered_results.pdf')\n",
+ "plt.savefig('celltype_rb.comparison_filtered_unfiltered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Sub-cell types in monocytes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " r | \n",
+ " se_r | \n",
+ " p | \n",
+ " celltype_discovery | \n",
+ " celltype_replication | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 0.971431 | \n",
+ " 0.048402 | \n",
+ " 1.351820e-89 | \n",
+ " ncMono | \n",
+ " cMono | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.929081 | \n",
+ " 0.088678 | \n",
+ " 1.101982e-25 | \n",
+ " ncMono | \n",
+ " monocyte | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.936797 | \n",
+ " 0.025409 | \n",
+ " 1.468276e-297 | \n",
+ " cMono | \n",
+ " ncMono | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.999726 | \n",
+ " 0.000613 | \n",
+ " 0.000000e+00 | \n",
+ " cMono | \n",
+ " monocyte | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 0.896203 | \n",
+ " 0.036240 | \n",
+ " 5.115902e-135 | \n",
+ " monocyte | \n",
+ " ncMono | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 0.949824 | \n",
+ " 0.008640 | \n",
+ " 0.000000e+00 | \n",
+ " monocyte | \n",
+ " cMono | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " r se_r p celltype_discovery celltype_replication\n",
+ "1 0.971431 0.048402 1.351820e-89 ncMono cMono\n",
+ "2 0.929081 0.088678 1.101982e-25 ncMono monocyte\n",
+ "3 0.936797 0.025409 1.468276e-297 cMono ncMono\n",
+ "4 0.999726 0.000613 0.000000e+00 cMono monocyte\n",
+ "5 0.896203 0.036240 5.115902e-135 monocyte ncMono\n",
+ "6 0.949824 0.008640 0.000000e+00 monocyte cMono"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the rb summary for the monocyte sub-cell types (first CSV column becomes the index) and show it.\n",
+ "mono_summary_csv = workdir/'output/filtered_results/rb_calculations/monocyte_subcelltypes/summary.csv'\n",
+ "filtered_mono_res_df = pd.read_csv(mono_summary_csv, index_col=0)\n",
+ "filtered_mono_res_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# filtered results\n",
+ "# Builds square (replication row x discovery column) tables over the monocyte sub-cell types:\n",
+ "# rb, its SE, its p-value, the coeQTL table row counts, two text-annotation tables and,\n",
+ "# finally, the row count relative to the discovery cell type's own count.\n",
+ "mono_subcelltypes = ['monocyte', 'cMono', 'ncMono']\n",
+ "# rb values that are not below this bound are stored as NaN and annotated as 'rb=NA'.\n",
+ "RB_VALID_UPPER_BOUND = 10\n",
+ "\n",
+ "\n",
+ "def _square_frame(labels, fill_value=0.0, dtype=float):\n",
+ "    # Square DataFrame indexed and labelled by `labels`, every cell set to `fill_value`.\n",
+ "    return pd.DataFrame(fill_value, index=labels, columns=labels, dtype=dtype)\n",
+ "\n",
+ "\n",
+ "monorb_df = _square_frame(mono_subcelltypes)\n",
+ "monorbse_df = _square_frame(mono_subcelltypes)\n",
+ "monorbpvalue_df = _square_frame(mono_subcelltypes)\n",
+ "mononumcoeqtl_df = _square_frame(mono_subcelltypes)\n",
+ "# The annotation tables hold strings, so they are created with object dtype instead of\n",
+ "# float zeros (writing strings into a float column is an incompatible-dtype assignment).\n",
+ "monoanno_df = _square_frame(mono_subcelltypes, fill_value='', dtype=object)\n",
+ "mononum_anno_df = _square_frame(mono_subcelltypes, fill_value='', dtype=object)\n",
+ "\n",
+ "for discovery_celltype in mono_subcelltypes:\n",
+ "    # replication in other celltypes\n",
+ "    for replication_celltype in mono_subcelltypes:\n",
+ "        if discovery_celltype != replication_celltype:\n",
+ "            monorb_results = filtered_mono_res_df[(filtered_mono_res_df['celltype_discovery'] == discovery_celltype) &\n",
+ "                                                  (filtered_mono_res_df['celltype_replication'] == replication_celltype)]\n",
+ "            if monorb_results.empty:\n",
+ "                # Fail with a readable message rather than an IndexError from .values[0].\n",
+ "                raise ValueError(f'no rb summary row for discovery={discovery_celltype}, replication={replication_celltype}')\n",
+ "            monoreplicated_coeqtls_num = pd.read_csv(\n",
+ "                workdir/f'output/filtered_results/rb_calculations/monocyte_subcelltypes/discovery_{discovery_celltype}_replication_{replication_celltype}.tsv.gz',\n",
+ "                compression='gzip',\n",
+ "                sep='\\t',\n",
+ "                index_col=0\n",
+ "            ).shape[0]\n",
+ "            monorbvalue = monorb_results['r'].values[0]\n",
+ "            monorbsevalue = monorb_results['se_r'].values[0]\n",
+ "            mononumcoeqtl_df.loc[replication_celltype, discovery_celltype] = monoreplicated_coeqtls_num\n",
+ "            mononum_anno_df.loc[replication_celltype, discovery_celltype] = \\\n",
+ "                f\"N={monoreplicated_coeqtls_num}\"\n",
+ "            if monorbvalue < RB_VALID_UPPER_BOUND:\n",
+ "                monorb_df.loc[replication_celltype, discovery_celltype] = monorbvalue\n",
+ "                monorbse_df.loc[replication_celltype, discovery_celltype] = monorbsevalue\n",
+ "                monorbpvalue_df.loc[replication_celltype, discovery_celltype] = monorb_results['p'].values[0]\n",
+ "                monoanno_df.loc[replication_celltype, discovery_celltype] = \\\n",
+ "                    f\"rb={monorbvalue:.2f}\\nN={monoreplicated_coeqtls_num}\"\n",
+ "            else:\n",
+ "                monorb_df.loc[replication_celltype, discovery_celltype] = np.nan\n",
+ "                monorbse_df.loc[replication_celltype, discovery_celltype] = np.nan\n",
+ "                monorbpvalue_df.loc[replication_celltype, discovery_celltype] = 0\n",
+ "                monoanno_df.loc[replication_celltype, discovery_celltype] = \\\n",
+ "                    f\"rb=NA\\nN={monoreplicated_coeqtls_num}\"\n",
+ "        else:\n",
+ "            # Diagonal: rb is fixed to 1 (SE 0, p 0) and N is the row count of the\n",
+ "            # cell type's own significant-coeQTL table.\n",
+ "            monorb_df.loc[replication_celltype, discovery_celltype] = 1\n",
+ "            monorbse_df.loc[replication_celltype, discovery_celltype] = 0\n",
+ "            monorbpvalue_df.loc[replication_celltype, discovery_celltype] = 0\n",
+ "            monoreplicated_coeqtls_num = pd.read_csv(\n",
+ "                workdir/f'output/filtered_results/UT_{discovery_celltype}/coeqtls_fullresults_fixed.sig.tsv.gz',\n",
+ "                compression='gzip',\n",
+ "                sep='\\t'\n",
+ "            ).shape[0]\n",
+ "            mononumcoeqtl_df.loc[replication_celltype, discovery_celltype] = monoreplicated_coeqtls_num\n",
+ "            monoanno_df.loc[replication_celltype, discovery_celltype] = \\\n",
+ "                f\"N={monoreplicated_coeqtls_num}\"\n",
+ "            mononum_anno_df.loc[replication_celltype, discovery_celltype] = \\\n",
+ "                f\"N={monoreplicated_coeqtls_num}\"\n",
+ "\n",
+ "# ratio[replication, discovery] = N[replication, discovery] / N[discovery, discovery]:\n",
+ "# every column is divided by its own diagonal entry.\n",
+ "mono_discovery_totals = pd.Series(np.diag(mononumcoeqtl_df.values), index=mononumcoeqtl_df.columns)\n",
+ "monoreplicated_ratio_df = mononumcoeqtl_df.div(mono_discovery_totals, axis='columns')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " monocyte | \n",
+ " cMono | \n",
+ " ncMono | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " monocyte | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0.826087 | \n",
+ "
\n",
+ " \n",
+ " cMono | \n",
+ " 0.996441 | \n",
+ " 1.000000 | \n",
+ " 0.826087 | \n",
+ "
\n",
+ " \n",
+ " ncMono | \n",
+ " 0.985765 | \n",
+ " 0.980645 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " monocyte cMono ncMono\n",
+ "monocyte 1.000000 1.000000 0.826087\n",
+ "cMono 0.996441 1.000000 0.826087\n",
+ "ncMono 0.985765 0.980645 1.000000"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "monoreplicated_ratio_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":60: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_xticklabels([\"\"]+col_labels)\n",
+ ":61: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels([\"\"]+row_labels)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Two heatmaps side by side: monoreplicated_ratio_df annotated as percentages (left) and\n",
+ "# monorb_df annotated with the strings from monoanno_df (right).\n",
+ "fig, axes = plt.subplots(1, 2, figsize=(10, 5))\n",
+ "ax1, ax2 = axes\n",
+ "# Both panels take their row/column labels from monorb_df.\n",
+ "mono_row_labels = list(monorb_df.index)\n",
+ "mono_col_labels = list(monorb_df.columns)\n",
+ "\n",
+ "im1, bar = heatmap(monoreplicated_ratio_df.values, mono_row_labels, mono_col_labels,\n",
+ "                   cmap=\"viridis\", ax=ax1)\n",
+ "_ = annotate_heatmap(im1, data=monoreplicated_ratio_df.values,\n",
+ "                     valfmt=\"{x:.0%}\", color=\"black\", threshold=1)\n",
+ "\n",
+ "im2, bar = heatmap(monorb_df.values, mono_row_labels, mono_col_labels,\n",
+ "                   cmap=\"viridis\", ax=ax2)\n",
+ "_ = annotate_heatmap(im2, data=monoanno_df.values,\n",
+ "                     valfmt=\"{x:^}\", color=\"black\", threshold=1)\n",
+ "\n",
+ "plt.savefig('cmono_ncmono_mono.filtered_results.pdf')\n",
+ "plt.savefig('cmono_ncmono_mono.filtered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Non-zero ratio and co-expression mean and variance"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read the annotated coeQTL table of every cell type, keep the columns used below,\n",
+ "# tag each row with its cell type and stack everything into one DataFrame.\n",
+ "annotation_columns = ['mean_onemillionv2', 'var_onemillionv2',\n",
+ "                      'gene2_nonzeroratio_onemillionv2',\n",
+ "                      'eqtlgene_nonzeroratio_onemillionv2',\n",
+ "                      'gene2_isSig']\n",
+ "# Frames are collected in a list and concatenated once; concatenating inside the loop\n",
+ "# copies the accumulated frame again on every iteration.\n",
+ "celltype_annotated_frames = []\n",
+ "for celltype in celltypes:\n",
+ "    celltype_annotated_coeqtl_df = pd.read_csv(\n",
+ "        workdir/f'output/filtered_results/UT_{celltype}/coeqtls_fullresults_fixed.all.annotated.tsv.gz',\n",
+ "        compression='gzip',\n",
+ "        sep='\\t'\n",
+ "    )[annotation_columns].assign(celltype=celltype)\n",
+ "    celltype_annotated_frames.append(celltype_annotated_coeqtl_df)\n",
+ "\n",
+ "# pd.concat rejects an empty list, so fall back to an empty frame in that case.\n",
+ "annotated_coeqtl_df = (pd.concat(celltype_annotated_frames, axis=0)\n",
+ "                       if celltype_annotated_frames else pd.DataFrame())\n",
+ "\n",
+ "# Infinite values become NaN in the copy used for plotting.\n",
+ "annotated_coeqtl_df_clean = annotated_coeqtl_df.replace([np.inf, -np.inf], np.nan)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Absolute co-expression mean per cell type, split by gene2_isSig; outlier points hidden.\n",
+ "sns.boxplot(\n",
+ "    x=annotated_coeqtl_df_clean['celltype'],\n",
+ "    y=annotated_coeqtl_df_clean['mean_onemillionv2'].abs(),\n",
+ "    hue=annotated_coeqtl_df_clean['gene2_isSig'],\n",
+ "    palette='viridis',\n",
+ "    fliersize=1,\n",
+ "    showfliers=False,\n",
+ ")\n",
+ "# plt.savefig('mean_onemillionv2.filtered_results.pdf')\n",
+ "# plt.savefig('mean_onemillionv2.filtered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Co-expression variance per cell type, split by gene2_isSig; outlier points hidden.\n",
+ "sns.boxplot(\n",
+ "    data=annotated_coeqtl_df_clean,\n",
+ "    x='celltype',\n",
+ "    y='var_onemillionv2',\n",
+ "    hue='gene2_isSig',\n",
+ "    palette='viridis',\n",
+ "    fliersize=1,\n",
+ "    showfliers=False,\n",
+ ")\n",
+ "# plt.savefig('var_onemillionv2.filtered_results.pdf')\n",
+ "# plt.savefig('var_onemillionv2.filtered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Non-zero ratio of gene2 per cell type, split by gene2_isSig; outlier points hidden.\n",
+ "sns.boxplot(\n",
+ "    data=annotated_coeqtl_df_clean,\n",
+ "    x='celltype',\n",
+ "    y='gene2_nonzeroratio_onemillionv2',\n",
+ "    hue='gene2_isSig',\n",
+ "    palette='viridis',\n",
+ "    fliersize=1,\n",
+ "    showfliers=False,\n",
+ ")\n",
+ "# plt.savefig('gene2_nonzeroratio_onemillionv2.filtered_results.pdf')\n",
+ "# plt.savefig('gene2_nonzeroratio_onemillionv2.filtered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Non-zero ratio of the eQTL gene per cell type, split by gene2_isSig; outlier points hidden.\n",
+ "sns.boxplot(\n",
+ "    data=annotated_coeqtl_df_clean,\n",
+ "    x='celltype',\n",
+ "    y='eqtlgene_nonzeroratio_onemillionv2',\n",
+ "    hue='gene2_isSig',\n",
+ "    palette='viridis',\n",
+ "    fliersize=1,\n",
+ "    showfliers=False,\n",
+ ")\n",
+ "# plt.savefig('eqtlgene_nonzeroratio_onemillionv2.filtered_results.pdf')\n",
+ "# plt.savefig('eqtlgene_nonzeroratio_onemillionv2.filtered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### unfiltered results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CD4T\n",
+ "CD8T\n",
+ "monocyte\n",
+ "DC\n",
+ "NK\n",
+ "B\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Columns used by the per-cell-type summary boxplots in the cells below.\n",
+ "summary_columns = ['mean_onemillionv2', 'var_onemillionv2',\n",
+ "                   'gene2_nonzeroratio_onemillionv2',\n",
+ "                   'eqtlgene_nonzeroratio_onemillionv2',\n",
+ "                   'gene2_isSig']\n",
+ "\n",
+ "# One frame per cell type, each tagged with a 'celltype' column.\n",
+ "celltype_frames = []\n",
+ "for celltype in celltypes:\n",
+ "    print(celltype)\n",
+ "    celltype_annotated_coeqtl_df = pd.read_csv(\n",
+ "        workdir/f'output/unfiltered_results/UT_{celltype}/coeqtls_fullresults_fixed.all.annotated.tsv.gz',\n",
+ "        compression='gzip',\n",
+ "        sep='\\t',\n",
+ "        usecols=summary_columns\n",
+ "    )[summary_columns].assign(celltype=celltype)  # assign() returns a new frame, so no write to a slice\n",
+ "    celltype_frames.append(celltype_annotated_coeqtl_df)\n",
+ "\n",
+ "# Concatenate once instead of re-copying the accumulated frame on every iteration.\n",
+ "annotated_coeqtl_df = pd.concat(celltype_frames, axis=0) if celltype_frames else pd.DataFrame()\n",
+ "# Replace +/-inf by NaN (presumably so they are ignored by the plots below).\n",
+ "annotated_coeqtl_df_clean = annotated_coeqtl_df.replace([np.inf, -np.inf], np.nan)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Absolute co-expression mean per cell type, split by gene2_isSig; outlier points hidden.\n",
+ "sns.boxplot(\n",
+ "    x=annotated_coeqtl_df_clean['celltype'],\n",
+ "    y=annotated_coeqtl_df_clean['mean_onemillionv2'].abs(),\n",
+ "    hue=annotated_coeqtl_df_clean['gene2_isSig'],\n",
+ "    palette='Paired',\n",
+ "    fliersize=1,\n",
+ "    showfliers=False,\n",
+ ")\n",
+ "# Written to the current working directory in both vector and raster form.\n",
+ "plt.savefig('mean_onemillionv2.unfiltered_results.pdf')\n",
+ "plt.savefig('mean_onemillionv2.unfiltered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Co-expression variance per cell type, split by gene2_isSig; outlier points hidden.\n",
+ "sns.boxplot(\n",
+ "    data=annotated_coeqtl_df_clean,\n",
+ "    x='celltype',\n",
+ "    y='var_onemillionv2',\n",
+ "    hue='gene2_isSig',\n",
+ "    palette='Paired',\n",
+ "    fliersize=1,\n",
+ "    showfliers=False,\n",
+ ")\n",
+ "# Written to the current working directory in both vector and raster form.\n",
+ "plt.savefig('var_onemillionv2.unfiltered_results.pdf')\n",
+ "plt.savefig('var_onemillionv2.unfiltered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Non-zero ratio of gene2 per cell type, split by gene2_isSig; outlier points hidden.\n",
+ "sns.boxplot(\n",
+ "    data=annotated_coeqtl_df_clean,\n",
+ "    x='celltype',\n",
+ "    y='gene2_nonzeroratio_onemillionv2',\n",
+ "    hue='gene2_isSig',\n",
+ "    palette='Paired',\n",
+ "    fliersize=1,\n",
+ "    showfliers=False,\n",
+ ")\n",
+ "# Written to the current working directory in both vector and raster form.\n",
+ "plt.savefig('gene2_nonzeroratio_onemillionv2.unfiltered_results.pdf')\n",
+ "plt.savefig('gene2_nonzeroratio_onemillionv2.unfiltered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAEGCAYAAACHGfl5AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAp2klEQVR4nO3de5xVddn38c+XAQUERAXSQAXPp5RwyENqpaGkKd6pt5p3glZYHiB9rLR60oo7M7M8S+TjqSgtzTxxE2UhaZZyMuWgkpqAJ8AjAcMA1/PHWsO9GfbM7DWz157T9/167dfsvdZvrX2tPTP7Wmv9TooIzMysc+vS2gGYmVnrczIwMzMnAzMzczIwMzOcDMzMDOja2gE0R79+/WLw4MGtHYaZWbsya9as5RHRv9i6dpkMBg8ezMyZM1s7DDOzdkXSvxpa59tEZmbmZGBmZk4GZmZGO60zKKa2tpYlS5awZs2a1g6l1XXv3p1BgwbRrVu31g7FzNqJDpMMlixZQu/evRk8eDCSWjucVhMRrFixgiVLljBkyJDWDsfM2olcbxNJulXSm5KebWC9JF0naZGkf0ga1tz3WrNmDdttt12nTgQAkthuu+18hWRmmeRdZ3A7MLKR9Z8Cdk8fY4GbW/JmnT0R1PHnYGZZ5XqbKCJmSBrcSJFRwJ2RjKP9N0l9Je0QEa/lGZeZWUtcf/31LFq0aLPlS5cuBWDgwIGbLN9tt9244IILKhJbc7V2a6KBwOKC10vSZZuRNFbSTEkzly1bVpHgzMyyWL16NatXr27tMJqltSuQi93PKDrbTkRMAiYBVFdXt4kZeRYvXsyZZ57J66+/TpcuXRg7dizjx49vsPy3v/1tjjjiCD75yU8WXf/cc89xzjnn8M4771BTU8Phhx/OpEmTmDlzJnfeeSfXXXddXodiZhk0dJZf9/9/7bXXVjKcsmjtZLAE2LHg9SDg1VaKJbOuXbty9dVXM2zYMN5//30OPPBARowYwT777FO0/He/+91G9zdu3DguvPBCRo0aBcAzzzwDQHV1NdXV1eUN3sysQGvfJnoAODNtVXQw8G656wu+973vsddeezFixAhOP/10fvSjH/HPf/6TkSNHcuCBB3L44YezcOFCAMaMGcO4ceM49NBD2WWXXbjnnns27ueqq65i+PDh7L///lx22WUA7LDDDgwbljSA6t27N3vvvffGe4bFjBkzZuM+L7nkEvbZZx/2339/Lr74YgBee+01Bg0atLH8hz70IQCmT5/Opz/9aQCWLVvGiBEjGDZsGOeccw4777wzy5cvL9fHZWadVK5XBpJ+BXwc6CdpCXAZ0A0gIiYCU4BjgUXAKuCscr7/zJkzuffee5kzZw7r1q1j2LBhHHjggYwdO5aJEyey++678/e//51zzz2XP/3pT0DyhfzYY4+xcOFCTjjhBE4++WSmTZvGCy+8wJNPPklEcMIJJzBjxgyOOOKIje/18ssvM2fOHA466KAm43rrrbe47777WLhwIZJ45513ALjwwgs58sgjOfTQQzn66KM566yz6Nu37ybbfuc73+HII4/k0ksvZerUqUyaNKlsn5eZdV55tyY6vYn1AZyX1/s/9thjjBo1ih49egBw/PHHs2bNGv76179yyimnbCxXU1Oz8fmJJ55Ily5d2GeffXjjjTcAmDZtGtOmTePDH/4wACtXruSFF17YmAxWrlzJSSedxDXXXEOfPn2ajKtPnz50796dL3zhCxx33HEbz/rPOussjjnmGKZOncr999/PT3/6U55++unNjum+++4DYOTIkWyzzTbN/XjMzDZq7TqDXCW5ZlMbNmygb9++zJ07t+g2W2655WbbRwSXXnop55xzzmbla2trOemkkzjjjDP4zGc+U1JcXbt25cknn+SRRx7hrrvu4oYbbth4ZfLBD36Qs88+m7PPPpv99tuPZ5/dtL9esWMyM2up1q4zyNVhhx3Ggw8+yJo1a1i5
ciUPP/wwPXv2ZMiQIfzmN78Bki/X+mff9R1zzDHceuutrFy5EkjaEr/55ptEBJ///OfZe++9ueiii0qOa+XKlbz77rsce+yxXHPNNRsT09SpU6mtrQXg9ddfZ8WKFZu1Vz7ssMP49a9/DSRXLG+//XbJ72tm1pAOfWUwfPhwTjjhBA444AB23nlnqqur2XrrrZk8eTJf/vKXmTBhArW1tZx22mkccMABDe7n6KOPZsGCBRxyyCEA9OrVi1/84hc8//zz/PznP+dDH/oQQ4cOBeD73/8+xx57bKNxvf/++4waNYo1a9YQEfzkJz8Bki/38ePH0717dyCptN5+++03VnADXHbZZZx++uncfffdfOxjH2OHHXagd+/eLfmYzMxQe7ztUF1dHfVnOluwYAF77733ZmVXrlxJr169WLVqFUcccQSTJk3a2AKoPaqpqaGqqoquXbvyxBNP8OUvf7noLa+GPg/rmL1HrW1o6/0MJM2KiKLt1Dv0lQHA2LFjmT9/PmvWrGH06NHtOhEAvPLKK/znf/4nGzZsYIsttuBnP/tZa4fUYbTXnqNm5dDhk8Evf/nLir/neeedx+OPP77JsvHjx3PWWS1vObv77rszZ86cFu+nM+uIvUfNWqqkZCBpL5Ixg/4eESsLlo+MiKl5Bdde3Xjjja0dgplZJk22JpI0DrgfuAB4VtKogtXfzyswMzOrnFKuDL4IHBgRK9PhqO+RNDgirqX4QHNmZtbOlJIMqupuDUXEy5I+TpIQdsbJwMysQyglGbwuaWhEzAVIrxA+DdwKfCjP4Mrla5d+g7ffebds+9um79b88IrG75BVVVVtHGgO4He/+x2DBw8uWrZXr14bO7SZWedw/fXXM3Xq5lWuq1atyjTSgCR69uy5ybKRI0dmbg5dSjI4E1hXuCAi1pGMNvrTTO/WSt5+512O/cKFZdvflFt+0mSZHj16NDjkhVVOQ30Kiqkr19icFIXc/6BzyOtvaOnSpdTU1GwyBE5rajIZRMQSAEnXAndHxF8L1j3e4Ia2iZUrVzJq1CjefvttamtrmTBhwsZ5C+q89tprnHrqqbz33nusW7eOm2++mcMPP5xp06Zx2WWXUVNTw6677sptt91Gr169WulI2pdFixYx99kFrO+5bZNlu6xNzsZmvfhGk2WrVr3V4tisfVi0aBELnnue/gN3bLpw124ALF/ZdJ+V91b+m3333bfNNGXO0s9gNvAtSXsA95EkhplNbNNprV69euMQFXVjId1333306dOH5cuXc/DBB3PCCSdsMnn9L3/5S4455hi++c1vsn79elatWsXy5cuZMGECf/zjH9lqq6248sor+fGPf8y3v/3tVjqy9md9z21ZvVfjQ4Rk1WPhlLLuz9q2/gN35ORzv1bWfd5z0w/Lur+WKjkZRMQdwB2StgVOAq6UtFNE7J5bdO1Y/dtEtbW1fOMb32DGjBl06dKFpUuX8sYbb7D99ttvLDN8+HDOPvtsamtrOfHEExk6dCiPPvoo8+fP56Mf/SgAa9eu3ThGkplZuTSnB/JuwF7AYGB+WaPpwCZPnsyyZcuYNWsW3bp1Y/DgwaxZs2aTMkcccQQzZszg4Ycf5nOf+xxf/epX2WabbRgxYgS/+tWvWilyM+sMSh7CWtKVkl4AvgvMI+l7cHxukXUw7777LgMGDKBbt278+c9/5l//+tdmZf71r38xYMAAvvjFL/L5z3+e2bNnc/DBB/P4449vrJhatWoVzz//fKXDN7MOLsuVwUvAIRHR7ibc3abv1iW1AMqyv6zOOOMMjj/+eKqrqxk6dCh77bXXZmWmT5/OVVddRbdu3ejVqxd33nkn/fv35/bbb+f000/fOCPbhAkT2GOPPVp8HGZmdbLUGUyUNFDSoYXbRcSMXCIro6b6BOShfr+Bfv368cQTTzRadvTo0YwePXqz9UceeSRPPfVU+YM0M0uVnAwk/QA4jaSeYH26OIA2nwzMzKxxWW4T/QewZ0TU
NFnSzMzalSxzIL8IdMsrEDMzaz1ZrgxWAXMlPQJsvDqIiHFlj8qsEyg2zEFDU2+Ch7+wfGVJBg+kDzPLiafetNaSqQeypC2AujaNz0VEbT5hmXV8xc7yPfWmtZYsrYk+DtwBvEwyj8GOkka3h6al51/0Vd5cXr6BxQb025YbfnxVg+tXrFjBUUcdBcDrr79OVVUV/fv3B+DJJ59kiy22KFssZmblkOU20dXA0RHxHEA6YN2vgAPzCKyc3lz+Fv/8wMfKt8M3Hm109XbbbbdxXKLLL7+cXr16cfHFF29cv27dOrp2bc5IIGZm+cjyjdStLhEARMTzkty6qERjxoxh2223Zc6cOQwbNozevXtvkiT2228/HnroIQYPHswvfvELrrvuOtauXctBBx3ETTfdRFVVVSsfgZl1ZFmals6U9P8kfTx9/AyYlVdgHdHzzz/PH//4R66++uoGyyxYsIC7776bxx9/nLlz51JVVcXkyZMrGKWZdUZZrgy+DJwHjCOpM5gB3JRHUB3VKaec0uQZ/iOPPMKsWbMYPnw4kLQuGTBgQCXCM7NOLEtrohrgx+nDmmGrrbba+Lxr165s2LBh4+u64awjgtGjR3PFFVdUPD4z67yavE0k6dfpz2ck/aP+o4TtR0p6TtIiSZcUWb+1pAclPS1pnqSzmnco7cvgwYOZPXs2ALNnz+all14C4KijjuKee+7hzTffBOCtt94qOty1mVk5lXJlUDez86ez7lxSFXAjMAJYAjwl6YGIKJwU5zxgfkQcL6k/8JykyRGxNuv7NWRAv22bbAGUeX8tdNJJJ3HnnXcydOhQhg8fvnFI6n322YcJEyZw9NFHs2HDBrp168aNN97Izjvv3OL3NDNrSJPJICJeS3825/T0I8CiiHgRQNJdwCg2nSEtgN5KJgPuBbwFrGvGezWosT4Bebv88suLLu/RowfTpk0ruu7UU0/l1FNPzTEqM7NNNZkMJL1P8oW92SogIqJPI5sPBBYXvF4CHFSvzA0kw1y8CvQGTo2IDfXKIGksMBZgp512aipsMzPLoMk6g4joHRF9ijx6N5EIIEkYm+2y3utjgLnAB4GhwA2SNttvREyKiOqIqK7rzWtmZuVRypVBozfII6KxcR6WADsWvB5EcgVQ6CzgBxERwCJJLwF7AU82FVuRWEjuNnVuyUdpZla6UiqQZ5GczTd0lr9LI9s+BewuaQiwlGSmtM/WK/MKcBTwF0kfAPYkmTshk+7du7NixQq22267Tp0QIoIVK1bQvXv31g7FzNqRUiqQhzR35xGxTtL5wO+BKuDWiJgn6Uvp+onA94DbJT1DknC+HhHLs77XoEGDWLJkCcuWLWtuuB1G9+7dGTRoUGuHYWbtSCm3ifaKiIWShhVbHxGzG9s+IqYAU+otm1jw/FXg6NLCbVi3bt0YMqTZecvMrFMr5TbRRSSteIoNqBPAkWWNyMzMKq6U20Rj05+fyD8cMzNrDVkmt6kCjgMGF24XER6ryMysncsyaumDwBrgGWCzTmFmZtZ+ZUkGgyJi/9wiMTOzVpNlcpv/kdTiVj9mZtb2ZLky+Btwn6QuQC2ljU1kZmbtQJZkcDVwCPBMeLwDM7MOJcttoheAZ50IzMw6nixXBq8B0yX9D1BTt9BNS83M2r8syeCl9LFF+jAzsw6i5GQQEd8BkLRVRPw7v5DMzKzSSq4zkHSIpPnAgvT1AZJuyi0yMzOrmCwVyNeQzEq2AiAingaOyCEmMzOrsCzJgIhYXG/R+jLGYmZmrSRLBfJiSYcCIWkLYBzpLSMzM2vfslwZfAk4DxhIMrfx0PS1mZm1c1laEy0HzsgxFjMzayVZ5jPoD3yRzeczOLv8YZmZWSVlqTO4H/gL8EdccWxm1qFkSQY9I+LruUViZmatJksF8kOSjs0tEjMzazVZksF4koSwRtL76eO9vAIzM7PKydKaqHeegZiZWevJUmeApBP43yEopkfEQ+UPyczMKi3LQHU/ILlV
ND99jE+XmZlZO5flyuBYYGhEbACQdAcwB7gkj8DMzKxyMg1UB/QteL51GeMwM7NWlOXK4ApgjqQ/AyKpO7g0l6jMzKyisrQm+pWk6cBwkmTw9Yh4vW69pH0jYl75QzQzs7xlnc/gtYh4ICLuL0wEqZ8X20bSSEnPSVokqWj9gqSPS5oraZ6kR7PEZGZmLZepaWkTtNkCqQq4ERhBMuz1U5IeiIj5BWX6AjcBIyPiFUkDyhiTmZmVIGsFcmOiyLKPAIsi4sWIWAvcBYyqV+azwG8j4hWAiHizjDGZmVkJypkMihkIFE6VuSRdVmgPYBtJ0yXNknRmsR1JGitppqSZy5YtyylcM7POqZzJYG2RZZvdOmLzK4iuwIHAccAxwP+VtMdmG0VMiojqiKju379/i4M1M7P/1ZLhKB6NiAfr1kXEwUU2WQLsWPB6EPBqkTLLI+LfwL8lzQAOAJ7PEpuZmTVfluEormDT4SjGpcsa8xSwu6QhkrYATgMeqFfmfuBwSV0l9QQOAhaUGpeZmbVcliuD4yg+HEWDHc8iYp2k84HfA1XArRExT9KX0vUTI2KBpKnAP4ANwC0R8WzzDsfMzJoja9PSvsBb6fOShqOIiCnAlHrLJtZ7fRVwVcZYzMysTDwchZmZlW84CjMza7+arECWtFf6cxiwA0nrn8XAB9NlZmbWzpVyZXARMBa4usi6AI4sa0RmZlZxTSaDiBibPv1URKwpXCepey5RmZlZRWXpgfzXEpeZmVk70+SVgaTtScYT6iHpw/zvEBN9gJ45xmZmZhVSSp3BMcAYkqEkflyw/H3gGznEZGZmFVZKncEdwB2SToqIeysQk5mZVViWfgb3SjoO2BfoXrD8u3kEZmZmlZNloLqJwKnABST1BqcAO+cUl5mZVVCW1kSHRsSZwNsR8R3gEDYdntrMzNqpLMmgro/BKkkfBGqBIeUPyczMKi3LQHUPppPXXwXMJul9/LM8gjIzs8oqKRlI6gI8EhHvAPdKegjoHhHv5hmcmZlVRkm3idIJba4ueF3jRGBm1nFkqTOYJukkScUmuTczs3YsS53BRcBWwHpJq0mal0ZE9MklMjMzq5gsnc565xmImZm1nkxzIEs6gWS6S4DpEfFQ+UMyM7NKy9ID+QfAeGB++hifLjMzs3Yuy5XBscDQtGURku4A5gCX5BGYmZlVTpbWRAB9C55vXcY4zMysFWW5Mvg+MEfSn0laEh0BXJpLVGZmVlFZeiBvAA4GhpMkg69HxOs5xmZmZhVSUjKIiA2Szo+IXwMP5ByTmZlVWJY6gz9IuljSjpK2rXvkFpmZmVVMljqDs9Of5xUsC2CX8oVjZmatIUsPZM9dYGbWQWXpdNZT0rckTUpf7y7p0/mFZmZmlZKlzuA2YC1waPp6CTCh7BGZmVnFZUkGu0bED0mmuyQi6kYubZSkkZKek7RIUoO9lSUNl7Re0skZYjIzszLIkgzWSupBUmmMpF2BmsY2kFQF3Ah8CtgHOF3SPg2UuxL4fYZ4zMysTLIkg8uBqcCOkiYDjwBfa2KbjwCLIuLFiFgL3AWMKlLuAuBe4M0M8ZiZWZlkaU00TdIskl7IAsZHxPImNhsILC54vQQ4qLCApIHAfwBHkvRuLkrSWGAswE477VRq2GZmVoIsrYkeAQ6KiIcj4qGIWF7XsqixzYosi3qvryEZ2mJ9YzuKiEkRUR0R1f379y81bDMzK0GWTmdDgK9LGh4R30mXVTexzRJgx4LXg4BX65WpBu5Kp1buBxwraV1E/C5DbGZt1vXXX8+iRYtKKltXbvz48SWV32233bjggguaHZtZnSzJ4B3gKOA6SQ8C/1XCNk8Bu0saAiwFTgM+W1igsDObpNuBh5wIrCNZtGgRC557nv4Dd2y6cNduACxfubrJosuWLm6yjFmpsiQDRcQ64FxJY4DHgG0a2yAi1kk6n6SVUBVwa0TMk/SldP3E5oVt1r70H7gjJ5/bVHuLbO656Ydl3Z91
blmSwcYv7oi4XdIzbDpOUVERMQWYUm9Z0SQQEWMyxGNmZmWSpTXRT9P+AB9It1tG0tzUzMzauZKTQXq753LgDZKJbiBpGbR/+cMyK17xunTpUgAGDhy4WXlXppo1X5bbRF8B9oyIFTnFYtak1aubrlg1s+yyJIPFwLt5BWJWX7Gz/Loml9dee22lwzHr0LIkgxeB6ZIepmBMooj4cdmjMjOzisqSDF5JH1ukDzMz6yCytCb6DoCk3snLWJlbVGZmVlFZxibaT9Ic4FlgnqRZkvbNLzQzM6uULENYTwIuioidI2Jn4P8AP8snLDMzq6QsyWCriPhz3YuImA5sVfaIzMys4jK1JpL0f4Gfp6//C3ip/CGZmVmlZbkyOBvoD/wWuC99flYeQZmZWWVlaU30NjAux1jMzKyVZBmbaA/gYmBw4XYRcWT5wzIzs0rKUmfwG5JhrG8BGp2i0szM2pcsyWBdRNycWyRm1qF41Nn2JUsyeFDSuSSVx4VjE71V9qjMrEPyqLNtV5ZkMDr9+dWCZQHsUr5wzKyj8Kiz7UuW1kRDGlsvaURE/KHlIZmZWaVl6WfQlCvLuC8zM6ugciYDlXFfZmZWQeVMBlHGfZmZWQVlqUC2NqZY0z1ouPmem+6ZZbd06VLeW/lv7rnph2Xd77Kli6np1XbG+ixnMni5jPuyFnDzPTPLKstwFD1J5jDYKSK+KGl3YM+IeAggIj6TU4zN1tHPnBuK1c33zMpn4MCBbLlyNSef+7Wy7veem35Iv149yrrPlshSZ3AbSWezQ9LXS4AJZY+oAlavXu2zZzOzAlluE+0aEadKOh0gIlZLatMtiHzmbGZWmizJYK2kHqSthiTtSsGwFGZWeUuXLuW4447bZNmqVauIKL1xnyR69uy52fKRI0e2q9um1jJZksFlwFRgR0mTgY8CY/IIyszMKivLcBR/kDQbOJikg9n4iFieW2Rm1qSBAwf6dqeVRdZOZ92Bt4H3gH0kHdHUBpJGSnpO0iJJlxRZf4akf6SPv0o6IGNMZmbWQlmall4JnArMAzakiwOY0cg2VcCNwAiS1kdPSXogIuYXFHsJ+FhEvC3pU8Ak4KBMR2FmZi2Spc7gRJJ+BVkqjT8CLIqIFwEk3QWMAjYmg4j4a0H5vwGDMuzfzMzKIMttoheBbhn3PxBYXPB6SbqsIZ8H/qfYCkljJc2UNHPZsmUZwzAzs8ZkuTJYBcyV9AibznQ2rpFtivVDKNrmTdInSJLBYcXWR8QkkltIVFdXe1A8M7MyypIMHkgfWSwBdix4PQh4tX4hSfsDtwCfiogVGd/DzMxaKEvT0jvSTmc7RcRzJW72FLC7pCHAUuA04LOFBSTtBPwW+FxEPF9qPGZmVj4l1xlIOh6YS9LxDElDJTV6pRAR64Dzgd8DC4BfR8Q8SV+S9KW02LeB7YCbJM2VNDP7YZiZWUtkuU10OUnroOkAETE3PeNvVERMAabUWzax4PkXgC9kiMPMzMosS2uidRHxbr1lrsg1M+sAslwZPCvps0BVOpfBOOCvTWxjZmbtQJZkcAHwTZJmpb8iqQf4Xh5BWetqjUmBGnrP+urK1A1D3pQkZs/uataULK2JVpEkg2/mF461ZXlOCLRo0SIWPPc8/Qfu2HjBrkm/x+Urm45l2dLFdKvqAl16lyPEZussc+ha+5ZlbKIH2byO4F1gJvDTiFhTzsCs9bTWpED9B+5Y1qkF77nph7zz+tKy7c+sI8ty/fwi0J/kFhEkg9a9AewB/Az4XHlDM+sYOsscuta+ZUkGH46IwiGrH5Q0IyKOkDSv3IGZmVnlZEkG/SXtFBGvwMaew/3SdWvLHpmZWRHFGhs01LgBytPAoTPIkgz+D/CYpH+SDEA3BDhX0lbAHXkEZ9ZSNTU1VMVaeiyc0nThDKpWrWDp0nVl3Wd7VWpLMMjeGqzUL/I8Gzd0FllaE01J+xfsRZIMFhZUGl+TQ2xm1g6U3BIMMrcGK6ZYcsi7cUNn
0GQykPSZBlbtIomI+G2ZYzIrmy233JI1XXqzeq9jy7rfHgunMHDgB8q6z/as3C3BgLI3xbXGlXJlcHwj64JkxFHLWVu4FDezjqvJZBARZwFIGhIRLxWuK2WgOiuPSl+Km1nnkqUC+V5gWL1l9wAHli+c5usMZ86+FDezvJRSZ7AXsC+wdb36gz5A97wCy8pnzmZmzVfKlcGewKeBvmxaf/A+8MUcYmo2nzmbWR6WLV1c0nfBO8vfBKBvvwEl7bPfnnu0OLZyKaXO4H7gfkmHRMQTFYjJzKzN2G233Uou+87rtQAlDRPSb889Mu07b1nqDD4r6fSGVkbEuDLEY2bWpmSpL2zP/R2yJIMtgX2Au9PXpwCzSOZFNjMrq3eWv8k7r9eW1NCjvTYKaUuyJIPdgU9ERC2ApInAtIi4MJfIrCL8D2dtVW1NDTW165n14htNlu2yNhldv5SyVaveanFsHVGWZPBBoDdQ90n2SpdZO+Z/OGvL1vfcNpfe47a5LMngB8BsSdPT1x8DLi93QFZ5/ofLX2dojWLtW5ZkcDuwHvgKSRL4NrB92SOyTimPqSGXLV3MhnW10KN1p73sLK1RrH3LkgxuAjYAPSLiAUnbkPRKHp5LZGYdREdvjZLXHM9r16ymqqbGw49XSJZkcFBEDJM0ByAi3pa0RU5xWSeTx9SQdXMge3Jus6ZlSQa1kqpIRipFUn+SKwWrgLzOvmprauiy/r2y7tM6l7zmeL75mxewqtvWHn68QrIkg+uA+4ABkv4bOBn4Vi5RtQOees/MOpIsM51NljQLOIpkprMTI2JBbpFllNeZ87Kli6nptVVJZfOcei+vs6/rv3YO2lDr+7LWJlWtequkv80ua5Kr2w3d+5S0T/CVQX1ZrgyIiIXAwpxiaVc89V774S+U9qnbllvSraqW3XZp+nNetOh9gJLKwgfcCquITMmgLcvrzPmW717M6tWrO2wP3a7dtmBtB74v26NHj5L/8f2F0rb07TeAfr16lHRy5ROxluswySAvbamHbh4dl2praqiqbRtnzqUcX9ZOWXvvuUfJXxD+Qmk+d6pr/3JPBpJGAtcCVcAtEfGDeuuVrj8WWAWMiYjZzXmvPP4g165ZDYiqVSuaDmDDegCqVtU2XXb9uo0VzqXIq+NSzXbbApR0Fp/nmXOpZd0pq+3pjJ3qGppZsaG7A23hLkBTck0GaVPUG4ERwBLgKUkPRMT8gmKfIhkEb3fgIODm9Gcmef1BrujenXXr1rHllk13qairQO7RvZTuF1vQt2/fEsol2kLHpTzPnEs9vo509l7sC6WxW41t9Qul0n+bbflz69Gj6e+UtirvK4OPAIsi4kUASXcBo4DCZDAKuDMiAvibpL6SdoiI17K8UbFfdpZ5kSHbH01jf5DFElMef5B5nZ20lbOevP7p28rxFdOev0zqq+SXdqU/t7aYlFsq72QwECicRHgJm5/1FyszENgkGUgaC4wF2GmnnVoUVF5/OG3lH7kjH1+eMfgLJX/l+Iw74+dWCUpOyHPauXQKcExEfCF9/TngIxFxQUGZh4ErIuKx9PUjwNciYlZD+62uro6ZM2fmFreZWUckaVZEVBdb1yXn914C7FjwehDwajPKmJlZjvJOBk8Bu0sakg5qdxrwQL0yDwBnKnEw8G7W+gIzM2uZXOsMImKdpPOB35M0Lb01IuZJ+lK6fiIwhaRZ6SKSpqVn5RmTmZltLvd+BhExheQLv3DZxILnAZyXdxxmZtawvG8TmZlZO+BkYGZmTgZmZuZkYGZm5NzpLC+SlgH/quBb9gOWV/D9Ks3H13515GMDH1+57RwR/YutaJfJoNIkzWyo115H4ONrvzrysYGPr5J8m8jMzJwMzMzMyaBUk1o7gJz5+Nqvjnxs4OOrGNcZmJmZrwzMzMzJwMzM6KTJQNL2ku6S9E9J8yVNkbSHpNWS5khaIOlJSaOLbDtc0npJJ0vaTtLc9PG6pKUFr0uZCDkXzTk+SVtL
elDS05LmSTqrrR5fJUkaI+mDbSCO9ennPi/9HV0kqUvB+o9ImiHpOUkLJd0iqWdrxpyFpJB0dcHriyVdnj6/XNLF6fPukv4g6bJWCrUsCn6fT0uaLenQ1o4p91FL2xpJAu4D7oiI09JlQ4EPAP+MiA+ny3YBfiupS0Tcli6rAq4kGZKbiFgBDE3XXQ6sjIgfVfJ46mvB8Z0HzI+I4yX1B54DJkfE0LT85bSB42sFY4Bnaf0Jl1YX/C4GAL8EtgYuk/QB4DfAaRHxRPo3cBLQm2RY+PagBviMpCsiomgnrPQE5F5gVkR8p6LRlV/h7/MY4ArgY60ZUGe8MvgEUFtvGO25bDoPMxHxInARMK5g8QUkf4xv5h9mszX3+ALonX6R9ALeAtZVImBJgwvOZp+VNFnSJyU9LumF9Kx3W0m/k/QPSX+TtH+67eWSbpU0XdKLksYV7PeidH/PSvpKwfIz0/08LennknpLeklSt3R9H0kvK5m2tRqYnJ7F9ZB0oKRHJc2S9HtJO1TiMyoUEW+SzAd+fvr7Oo8k+T+Rro+IuCci3qh0bC2wjqRlzYUNrO8K3AW8EBGXVCyqyugDvN3aQXS6KwNgP6DB+ZXrmQ3sBSBpIPAfwJHA8HxCK4tmHR9wA8msc6+SnFGeGhEbyh9eg3YDTiH5knsK+CxwGHAC8A2SZDYnIk6UdCRwJ+lVGckxfCKN+zlJNwP7k0yUdBAg4O+SHgXWAt8EPhoRyyVtGxHvS5oOHAf8jmRGvnsj4jeSzgMujoiZabK4HhgVEcsknQr8N3B2jp9LURHxYnqbaADJ7/yOSseQgxuBf0j6YZF1XwP+GBFfqWxIuekhaS7QHdiB5HulVXXGZJCFCp5fA3w9ItYnJ2MdQuGBHAPMJfmj3BX4g6S/RMR7FYrlpYh4BkDSPOCRiAhJzwCDgZ1Jbn0QEX9K6zO2Trd9OCJqgBpJb5LcEjsMuC8i/p3u87fA4SRXQPfU3YqIiLfSfdxC8oXzO5Ik8sUiMe5J8sX7h/RvoApozSlaO8wfIkBEvCfpTpKr1dX1Vj8GHCJpj4h4vvLRlV3hbaJDgDsl7Ret2Na/M94mmgccWGLZDwML0ufVwF2SXgZOBm6SdGLZo2u55h7fWcBv01sMi4CX+N+rhkqoKXi+oeD1BpKTlmJffHX/OIXbrm+kPOnyzf7hIuJxYLCkjwFVEfFsA9vOi4ih6eNDEXF0QweUp7TOZz3JLcssv/O27hrg88BW9ZbPAL4C/E9bqNAvp/T2Xj+g6AByldIZk8GfgC0lbTzzkzSc5MyTgmWDgR+R3BYgIoZExOCIGAzcA5wbEb+rUMxZNOv4gFeAo9J1HyA5C36xAvGWagZwBoCkjwPLm7hqmQGcKKmnpK1IbvH9BXgE+E9J26X72rZgmzuBXwG3FSx7n+T2EySV6v3TMzkkdZO0bwuPK7O0gn8icEN6JnkDMFrSQQVl/kvS9pWOraXSK7VfkySE+uvuBa4CpkrqW+HQciNpL5KrzBWtGUenu02U3nr4D+AaSZcAa4CXSc46dpU0h+Q+3vvA9XUtidqLFhzf94Db09syIrkl1paGDr4cuE3SP0hayGzW7LdQRMyWdDvwZLroloiYAyDpv4FHJa0H5pC0GAKYDEwgSQh1bgcmSloNHEJyVXhdeouqK8mZ7LyWHVpJ6u4xdyOpbP058GOAiHhD0mnAj9KWRhtIkuFvKxBXHq4Gzi+2IiImpknuAUlHR8SayoZWNnW/T0j+30ZHxPpWjMfDUZjVkXQySeXw51o7FrNK63RXBmbFSLoe+BRwbGvHYtYafGVgZmadsgLZzMzqcTIwMzMnAzMzczIwy0TJKKY3pM8LR9NsE6ObmjWXk4FZeYwBnAys3XIyMKPoSKb9Jd0r6an08dFGtj2ZTUc3PU7SfQXrR6RjIyFppaSrlYxh/0jamxhJu0qamo6G+pe0V6pZxTgZWKeXDinxTeDIiDgAGA9cC/wkIoaTDJB3S0Pb
R8Q9wEzgjHTwsSnA3nVf9CTjPtX19N4KmB0Rw4BHgbpJWiYBF0TEgcDFwE3lO0KzprnTmVkyUusmI5lK+iSwT8EItX0k9W5oB4XSIUF+DvyXpNtIhrE4M129Abg7ff4LkgmGegGHAr8peL8tW3hMZpk4GZgVH8m0C3BIRGwylHKG4ctvAx4kGRvqNxHR0ERBkb7XO3VDGpu1Bt8mMis+kuk0CgZLUzJ1aGMKRzclIl4lmSjoWySD3dXpQjLYHSQT+DyWjr76kpKZ1VDigBYcj1lmvjKwTi8i5hUZyXQccGM6SmpXklFAv9TIbm6nYHTT9IpiMtA/IuYXlPs3sK+kWcC7wKnp8jOAmyV9i2Rk0ruAp8t1jGZN8dhEZjlJ+yPMiYj/V7BsZUT0asWwzIpyMjDLQXrm/29gRDolZ91yJwNrk5wMzMzMFchmZuZkYGZmOBmYmRlOBmZmhpOBmZkB/x+ObU1a6UJD/AAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Non-zero ratio of the eQTL gene per cell type, split by gene2_isSig; outlier points hidden.\n",
+ "sns.boxplot(\n",
+ "    data=annotated_coeqtl_df_clean,\n",
+ "    x='celltype',\n",
+ "    y='eqtlgene_nonzeroratio_onemillionv2',\n",
+ "    hue='gene2_isSig',\n",
+ "    palette='Paired',\n",
+ "    fliersize=1,\n",
+ "    showfliers=False,\n",
+ ")\n",
+ "# Written to the current working directory in both vector and raster form.\n",
+ "plt.savefig('eqtlgene_nonzeroratio_onemillionv2.unfiltered_results.pdf')\n",
+ "plt.savefig('eqtlgene_nonzeroratio_onemillionv2.unfiltered_results.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/04_coeqtl_mapping/replication_in_bios.py b/04_coeqtl_mapping/replication_in_bios.py
new file mode 100644
index 0000000..e20c8c2
--- /dev/null
+++ b/04_coeqtl_mapping/replication_in_bios.py
@@ -0,0 +1,233 @@
+import argparse
+import os
+import subprocess
+from pathlib import Path
+
+import pandas as pd
+import statsmodels.api as sm
+from statsmodels.stats.multitest import multipletests
+from tqdm import tqdm
+
+# Base directory of the co-eQTL mapping work; the __main__ block resolves the
+# SNP-selection shell script relative to it.
+workdir = Path("./coeqtl_mapping")
+# Gzip-compressed, tab-separated BIOS expression table (first column is used as
+# the index when it is read below).
+bios_exp_path = 'BIOS_NoRNAPhenoNA_NoSexNA_NoMixups_NoMDSOutlier_20RNAseqAlignemntMetrics/data/gene_read_counts_BIOS_and_LLD_passQC.tsv.SampleSelection.ProbesWithZeroVarianceRemoved.TMM.SampleSelection.ProbesWithZeroVarianceRemoved.Log2Transformed.ProbesCentered.SamplesZTransformed.CovariatesRemovedOLS.txt.gz'
+
+# Header-less, tab-separated gene id / gene name table.
+# NOTE(review): it is read with three column names in
+# make_snps_genes_files_for_coeqtls but with two in replicate -- confirm the
+# actual column count of the file.
+unique_mappingfile = "./resources/features_v3_reformated_names.tsv"
+# Directory holding one 'chr<N>/GenotypeData.vcf.gz' per chromosome.
+bios_gt_prefix = Path('./genotypes-hrc-imputed-vcf/')
+# Tab-separated table with 'gt' and 'exp' columns linking genotype sample ids to
+# expression sample ids (same path that make_gte_mapping_file writes).
+gte_mapping_path = "./coeqtl_mapping/bios/gte.tsv"
+
+
+def get_snps_from_vcffile(bashfile_path, vcf_path, snps_path, savepath):
+ response = subprocess.run([bashfile_path, vcf_path, snps_path, savepath])
+ print(response)
+ return None
+
+
+def get_genes_from_gzipfile(expression_path, gene_path, savepath):
+ print("Loading exp dataframe...")
+ exp_df = pd.read_csv(expression_path, sep='\t', index_col=0, compression='gzip')
+ print("Full exp loaded.")
+ genes = pd.read_csv(gene_path, sep='\t')['ensembl']
+ common_genes = list(set(genes) & set(exp_df.index.values))
+ print(f"Selecting {len(common_genes)} to save. {len(genes) - len(common_genes)} genes not found in BIOS")
+ selected_exp_df = exp_df.loc[common_genes]
+ genes_dic = pd.read_csv(gene_path, sep='\t').set_index('ensembl')['symbol'].T.to_dict()
+ common_genes_names = [genes_dic.get(geneid) for geneid in common_genes]
+ selected_exp_df.index = common_genes_names
+ selected_exp_df.to_csv(savepath, sep='\t')
+ print(f"Selected {selected_exp_df.shape[0]} genes in {savepath}.")
+ return selected_exp_df
+
+
+def make_snps_genes_files_for_coeqtls(coeqtl_path):
+ significant_coeqtls = pd.read_csv(coeqtl_path, sep='\t', compression='gzip', index_col=0)
+ mappings = pd.read_csv(unique_mappingfile, sep='\t', names=['geneid', 'genename', 'type']).set_index('genename')[
+ 'geneid'].T.to_dict()
+ snps = significant_coeqtls['SNP'].unique()
+ genes = [ele for item in significant_coeqtls['Gene'].values for ele in item.split(';')]
+ genes = list(set([item for item in genes if item]))
+ genes_df = pd.DataFrame(data=[[item, mappings.get(item)] for item in genes],
+ columns=['symbol', 'ensembl']).dropna(subset=['ensembl'])
+ print(f"Writing {len(snps)} snps and {len(genes)} genes from coeQTLs.")
+ with open(f"{str(coeqtl_path)[:-len('.tsv.gz')]}.snps.txt", 'w') as f:
+ f.write('\n'.join(snps))
+ genes_df.to_csv(f"{str(coeqtl_path)[:-len('.tsv.gz')]}.genes.tsv",
+ sep='\t', index=False)
+ return snps, genes_df
+
+
+def replicate(annotated_coeqtl_path,
+ bios_gt_path,
+ bios_gene_path,
+ saveprefix,
+ gte_mapping_path,
+ vcf_header_rows=6):
+ import warnings
+ def find_gene2(eqtlgene, genepair):
+ gene1, gene2 = genepair.split(';')
+ if eqtlgene == gene1:
+ return gene2
+ else:
+ return gene1
+ warnings.simplefilter(action='ignore', category=FutureWarning)
+ gte_mapping = pd.read_csv(gte_mapping_path, sep='\t').set_index('gt')['exp'].T.to_dict()
+ # transform the GT columns into expression ids
+ gt = pd.read_csv(bios_gt_path, skiprows=vcf_header_rows, sep='\t')
+ sc_individuals = pd.read_csv(
+ './coeqtl_mapping/input/summary/gte-fix.tsv',
+ sep='\t'
+ )['genotypesampleID']
+ # remove LLD individuals
+ remove_individuals = list(set(sc_individuals) & set(gt.columns))
+ gt = gt.drop(remove_individuals, axis=1)
+ gt_snp_set = set(gt['ID'].values)
+ # map genotype and expression individual names
+ find_name = lambda x: gte_mapping.get(x) if x in gte_mapping else x
+ gt = gt.rename({item: find_name(item) for item in gt.columns}, axis=1)
+ # load expression data
+ exp = pd.read_csv(bios_gene_path, index_col=0, sep='\t', compression='gzip')
+ genename_mapping = pd.read_csv(unique_mappingfile, sep='\t', names=['gene_id', 'gene_name']).set_index('gene_id')[
+ 'gene_name'].T.to_dict()
+ exp['genename'] = [genename_mapping.get(geneid) for geneid in exp.index]
+ exp = exp.dropna(subset=['genename']).set_index('genename')
+ expression_gene_name_set = set(exp.index)
+ common_indidvidauls = list(set(exp.columns) & set(gt.columns))
+ gt_df = gt.set_index('ID')
+ exp_common = exp[common_indidvidauls]
+ coeqtls = pd.read_csv(annotated_coeqtl_path, sep='\t', compression='gzip', index_col=0)
+ coeqtls['gene1'] = [item.split('_')[1] for item in coeqtls['snp_eqtlgene']]
+ coeqtls['gene2'] = [find_gene2(gene1, genepair) for (gene1, genepair) in coeqtls[['gene1', 'Gene']].values]
+ coeqtl_pairs = coeqtls[['SNP', 'gene1', 'gene2']].values
+ # start solving the interaction models
+ i = 0
+ res_df = pd.DataFrame()
+ for snp, gene1, gene2 in tqdm(coeqtl_pairs):
+ if snp in gt_snp_set and gene1 in expression_gene_name_set and gene2 in expression_gene_name_set: # todo: ESNG id to genename
+ i += 1
+ gt_selected = gt_df.loc[snp]
+ gene1_selected = exp_common.loc[gene1]
+ gene2_selected = exp_common.loc[gene2]
+ x_df = pd.concat([gt_selected[common_indidvidauls], gene2_selected], axis=1)
+ x_df[f'{snp}_dosage'] = [float(item.split(':')[1]) for item in x_df[snp]]
+ x_df[f'{snp}_{gene2}'] = x_df[f'{snp}_dosage'] * x_df[gene2]
+ X = sm.add_constant(x_df[[f'{snp}_dosage', gene2, f'{snp}_{gene2}']])
+ model = sm.OLS(gene1_selected.T, X)
+ results_data = model.fit().summary().tables[1].data
+ results = pd.DataFrame(data=results_data[1:], columns=results_data[0]).set_index('')
+ results['gene1'] = gene1
+ results['gene2'] = gene2
+ results['assessed_allele'] = gt_selected['ALT']
+ results['num_individuals'] = len(common_indidvidauls)
+ res_df = pd.concat([res_df, results], axis=0)
+ if len(coeqtl_pairs) > 10000:
+ if i % 10000 == 0 and i > 1:
+ res_df.to_csv(f"{saveprefix}.part{int(i / 10000)}.tsv", sep='\t')
+ res_df = pd.DataFrame()
+ print(f"results part {int(i / 10000)} has been saved in {saveprefix}.part{int(i / 10000)}.tsv")
+ part_ind = 1 + int(i / 10000)
+ res_df.to_csv(f"{saveprefix}.part{part_ind}.tsv.gz", sep='\t', compression='gzip')
+ print(f"results part {part_ind} has been saved in {saveprefix}.part{part_ind}.tsv")
+ return res_df
+
+
+def make_gte_mapping_file():
+ prefix = Path("./tmp03boxy/input/")
+ gtm = pd.DataFrame()
+ for filename in os.listdir(prefix / "hrcGTM"):
+ sub_gtm = pd.read_csv(prefix / f"hrcGTM/{filename}", compression='gzip', sep='\t', names=['gt', 'met'])
+ gtm = pd.concat([gtm, sub_gtm], axis=0)
+ gtm = gtm.set_index('met')
+ mte_path = prefix / 'hrcMTE/CODAM_LLDeep_LLS660Q_LLSOmni_NTR_RS_MTE.txt'
+ mte = pd.read_csv(mte_path, sep='\t', names=['met', 'exp']).set_index('met')
+ all_mapping = pd.concat([gtm, mte], axis=1)
+ gte = all_mapping[['gt', 'exp']]
+ gte = gte.dropna()
+ gte.to_csv('./coeqtl_mapping/bios/gte.tsv',
+ sep='\t', index=False)
+ return gte
+
+
+def examine_replicated_in_bios(replication_res_path, savepath):
+ from statsmodels.stats.multitest import multipletests
+ bios_replication = pd.read_csv(replication_res_path, sep='\t')
+ bios_replication['snp_genepair'] = ['_'.join(item) for item in bios_replication[['Unnamed: 0', 'gene1']].values]
+ tobesave = lambda x: True if 'dosage' not in x and 'const' not in x and x.startswith('rs') else False
+ bios_replication['isinteractionterm'] = [tobesave(item) for item in bios_replication['snp_genepair']]
+ bios_interactions_df = bios_replication[bios_replication['isinteractionterm']]
+ bios_interactions_df['corrected_p'] = multipletests(bios_interactions_df['P>|t|'], method='fdr_bh')[1]
+ bios_interactions_df.to_csv(savepath, sep='\t')
+ significant_res = bios_interactions_df[bios_interactions_df['corrected_p'] <= 0.05]
+ print("Significantly replicated coeQTLs: ", significant_res.shape[0])
+ return bios_interactions_df
+
+
+def arguments():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--saveprefix', type=str, dest='saveprefix')
+ parser.add_argument('--coeqtlpath', type=str, dest='coeqtlpath')
+ parser.add_argument('--selection', type=str, dest='selection')
+ parser.add_argument('--chromosome', type=str, dest='chr')
+ parser.add_argument('--replicate', type=str, dest='replicate')
+ parser.add_argument('--bios_selected_vcf', type=str, dest='bios_selected_vcf')
+ return parser
+
+
+if __name__ == '__main__':
+ # _ = make_gte_mapping_file() # make the GTE files for BIOS
+ args = arguments().parse_args()
+ print("Arguments:")
+ print(args)
+ # get snps and genes from coeQTL results
+ coeqtl_path = args.coeqtlpath
+ savedirectory = Path(args.saveprefix)
+ if not os.path.isdir(savedirectory):
+ os.makedirs(savedirectory)
+ if args.selection == 'snp':
+ if not os.path.exists(f"{coeqtl_path[:-len('.tsv.gz')]}.snps.txt"):
+ _ = make_snps_genes_files_for_coeqtls(coeqtl_path)
+ # get snps from vcf
+ chromosome = args.chr
+ snp_bashfile_path = workdir / 'bios/select_snps_from_vcf.sh'
+ dataname = f"chr{chromosome}"
+ bios_gt_path = bios_gt_prefix / f"chr{chromosome}/GenotypeData.vcf.gz"
+ snp_savepath = savedirectory / f'bios_selected.chr{chromosome}.vcf'
+ get_snps_from_vcffile(snp_bashfile_path, bios_gt_path, f"{coeqtl_path[:-len('.tsv.gz')]}.snps.txt",
+ snp_savepath)
+ elif args.selection == 'gene':
+ if not os.path.exists(f"{coeqtl_path[:-len('.tsv.gz')]}.snps.txt"):
+ _ = make_snps_genes_files_for_coeqtls(coeqtl_path)
+ # get genes from gzip files
+ gene_savepath = savedirectory / f'bios_selected_gene_expression.tsv'
+ _ = get_genes_from_gzipfile(bios_exp_path, f"{coeqtl_path[:-len('.tsv.gz')]}.genes.tsv", gene_savepath)
+ if args.replicate:
+ # perform replication in bios
+ work_prefix = Path("./coeqtl_mapping/")
+ bios_gt_path = args.bios_selected_vcf
+ vcf_header_rows = 6
+ bios_gene_path = bios_exp_path
+ saveprefix = savedirectory / 'bios_replication_results.eqtlgene1_gene2'
+ replicate(annotated_coeqtl_path=coeqtl_path,
+ bios_gt_path=bios_gt_path,
+ bios_gene_path=bios_gene_path,
+ saveprefix=saveprefix,
+ gte_mapping_path=gte_mapping_path,
+ vcf_header_rows=vcf_header_rows)
+ # concatenate replication results saved in parts
+ res_df = pd.DataFrame()
+ for filename in os.listdir(savedirectory):
+ if filename.startswith('bios_replication_results.eqtlgene1_gene2.') and 'part' in filename and filename.endswith('gz'):
+ print(filename)
+ df = pd.read_csv(savedirectory / filename, sep='\t', compression='gzip')
+ res_df = pd.concat([res_df, df], axis=0)
+ tobesave = lambda x: True if 'dosage' not in x and 'const' not in x and x.startswith('rs') else False
+ res_df['isinteractionterm'] = [tobesave(item) for item in res_df['Unnamed: 0']]
+ bios_interactions_df = res_df[res_df['isinteractionterm']]
+ bios_interactions_df['snp_genepair'] = ['_'.join([item[0].split('_')[0],
+ ';'.join(sorted([item[0].split('_')[1], item[1]]))]) for item
+ in
+ bios_interactions_df[['Unnamed: 0', 'gene1']].values]
+ bios_interactions_df['corrected_p'] = multipletests(bios_interactions_df['P>|t|'], method='fdr_bh')[1]
+ bios_interactions_df.to_csv(savedirectory / 'bios_replication_results.eqtlgene1_gene2.all.tsv.gz',
+ sep='\t', index=False, compression='gzip')
+ bios_interactions_df[bios_interactions_df['corrected_p'] <= 0.05].to_csv(
+ savedirectory / 'bios_replication_results.eqtlgene1_gene2.sig.tsv.gz',
+ sep='\t', index=False, compression='gzip')
diff --git a/04_coeqtl_mapping/screen_permutation_p_values.py b/04_coeqtl_mapping/screen_permutation_p_values.py
new file mode 100644
index 0000000..2812dc8
--- /dev/null
+++ b/04_coeqtl_mapping/screen_permutation_p_values.py
@@ -0,0 +1,170 @@
+"""Reduce co-eQTL permutation p-values to one row per eQTL SNP / eQTL gene.
+
+Every ``*-Permutations.txt.gz`` batch found below ``--result_prefix`` is read,
+each tested gene pair is mapped back to its eQTL gene through the gene-pair
+annotation files, and the element-wise minimum of the permutation p-values is
+kept over all tests that share the same ``<SNP>_<eQTL gene>`` key.
+"""
+import argparse
+import os
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+import gzip
+
+
+# Number of permutation columns (Perm0 .. Perm99) read from every batch file.
+N_PERMUTATIONS = 100
+# Sub-runs combined by main(); each has its own annotation file and output folder.
+RUN_VERSIONS = ('noduplicated', 'duplicatedversion1', 'duplicatedversion2')
+
+workdir = Path('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping')
+annotation_path = '/groups/umcg-bios/tmp01/projects/1M_cells_scRNAseq/ongoing/eQTL_mapping/probeannotation/singleCell-annotation-stripped.tsv'
+# Maps the 'geneid' column of the features file to its 'genename' column.
+mappingdic = pd.read_csv('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/resources/features_v3_reformated_names.tsv',
+                         sep='\t', names=['geneid', 'genename']).set_index('geneid')['genename'].T.to_dict()
+annotation_df = pd.read_csv(annotation_path, sep='\t')
+annotation_df['chr_pos'] = ['_'.join([str(ele) for ele in item]) for item in annotation_df[['Chr', 'ChrStart', 'ChrEnd']].values]
+annotation_df['genename'] = [mappingdic.get(ensemblid) for ensemblid in annotation_df['Ensembl']]
+# 'Chr_ChrStart_ChrEnd' -> gene name; used to recover the eQTL gene of a gene pair.
+annotation_dict = annotation_df.set_index('chr_pos')['genename'].T.to_dict()
+
+
+def update_perm(old_p_list, new_p_list):
+    """Return the element-wise minimum of two equally long p-value sequences."""
+    return np.min([old_p_list, new_p_list], axis=0)
+
+
+def find_eqtlsnp_gene(snp, genepair, coeqtl_annotation_dic):
+    """Build the ``<snp>_<eQTL gene>`` key for one tested gene pair.
+
+    ``genepair`` is looked up in ``coeqtl_annotation_dic`` to get its
+    'Chr_ChrStart_ChrEnd' string, which ``annotation_dict`` maps to a gene name.
+
+    Raises:
+        KeyError: if no gene name is found (previously an opaque ``TypeError``
+            raised by ``str.join``).
+    """
+    genepair_chrpos = coeqtl_annotation_dic.get(genepair)
+    eqtlgene = annotation_dict.get(genepair_chrpos)
+    if not isinstance(eqtlgene, str):
+        raise KeyError(f'no eQTL gene for gene pair {genepair!r} (position {genepair_chrpos!r})')
+    return '_'.join([snp, eqtlgene])
+
+
+def loop_through_one_batch_perm(batch_perm_path, snpgene1_minpvalues_dict, coeqtl_annotation_dict):
+    """Stream one gzipped permutation file and update the per-key minima in place.
+
+    Columns are read by position: gene pair, SNP, then ``N_PERMUTATIONS`` p-values.
+    Every key encountered must already be present in ``snpgene1_minpvalues_dict``;
+    the same (mutated) dictionary is returned.
+    """
+    with gzip.open(batch_perm_path, 'rt', encoding='utf-8') as f:
+        next(f, None)  # skip the header row
+        for line in f:
+            linecontent = line.strip().split('\t')
+            perm_ps = [float(ele) for ele in linecontent[2:2 + N_PERMUTATIONS]]
+            snp_gene1 = find_eqtlsnp_gene(linecontent[1], linecontent[0], coeqtl_annotation_dict)
+            snpgene1_minpvalues_dict[snp_gene1] = update_perm(snpgene1_minpvalues_dict[snp_gene1],
+                                                              perm_ps)
+    return snpgene1_minpvalues_dict
+
+
+def update_dictionary_per_permutation_batch(batch_perm_path, snpgene1_minpvalues_df, coeqtl_annotation_dict,
+                                            n_permutations=N_PERMUTATIONS):
+    """Fold one permutation batch file into the running table of minimum p-values.
+
+    Args:
+        batch_perm_path: gzipped, tab-separated file with ``Gene``, ``SNP`` and
+            ``Perm0`` .. ``Perm<n-1>`` columns.
+        snpgene1_minpvalues_df: current table with a ``snp_eqtlgene`` column and
+            the same ``Perm*`` columns.
+        coeqtl_annotation_dict: maps a ``Gene`` value to 'Chr_ChrStart_ChrEnd'.
+        n_permutations: number of ``Perm*`` columns to use.
+
+    Returns:
+        DataFrame indexed by ``snp_eqtlgene`` (also kept as a column) holding
+        the per-column minimum over the batch rows and the previous table.
+
+    Raises:
+        ValueError: if a row cannot be mapped to an eQTL gene name.
+    """
+    batch_perm_df = pd.read_csv(batch_perm_path, compression='gzip', sep='\t')
+    batch_perm_df['chr_pos'] = [coeqtl_annotation_dict.get(genepair) for genepair in batch_perm_df['Gene']]
+    batch_perm_df['eqtlgene'] = [annotation_dict.get(chrpos) for chrpos in batch_perm_df['chr_pos']]
+    unmapped = batch_perm_df['eqtlgene'].isna()
+    if unmapped.any():
+        # Fail with context instead of the TypeError that str.join would raise below.
+        raise ValueError(f'{int(unmapped.sum())} row(s) of {batch_perm_path} have no eQTL gene annotation, '
+                         f"e.g. Gene={batch_perm_df.loc[unmapped, 'Gene'].iloc[0]!r}")
+    batch_perm_df['snp_eqtlgene'] = ['_'.join(item) for item in batch_perm_df[['SNP', 'eqtlgene']].values]
+    merge_columns = ['snp_eqtlgene'] + [f'Perm{ind}' for ind in range(n_permutations)]
+    merged_df = pd.concat([batch_perm_df[merge_columns], snpgene1_minpvalues_df[merge_columns]],
+                          axis=0)
+    # GroupBy.min() replaces .agg(min): passing the builtin is deprecated in recent pandas.
+    reduced_df = merged_df.groupby(by='snp_eqtlgene').min()
+    reduced_df['snp_eqtlgene'] = reduced_df.index
+    return reduced_df
+
+
+def save_numpy(data_df, prefix):
+    """Save a DataFrame as ``<prefix>.npy`` plus ``.cols.txt`` / ``.rows.txt`` label files."""
+    np.save(f'{prefix}.npy', data_df.values)
+    with open(f'{prefix}.cols.txt', 'w') as f:
+        f.write('\n'.join([str(ele) for ele in data_df.columns]))
+    with open(f'{prefix}.rows.txt', 'w') as f:
+        f.write('\n'.join([str(ele) for ele in data_df.index]))
+    return None
+
+
+def arguments():
+    """Build the command-line parser (all four options are required)."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('--eqtl_path', dest='eqtl_path', required=True,
+                        help='tab-separated eQTL table with SNPName and genename columns')
+    parser.add_argument('--result_prefix', dest='result_prefix', required=True,
+                        help='directory containing <version>/output/*-Permutations.txt.gz')
+    parser.add_argument('--save_prefix', dest='save_prefix', required=True,
+                        help='directory that receives concated_alltests_permutations_fixed.tsv.gz')
+    parser.add_argument('--annotation_prefix', dest='annotation_prefix', required=True,
+                        help='prefix of the .genepairs.annotation.gene1position.<version>.tsv files')
+    return parser
+
+
+def _load_coeqtl_annotation(path):
+    """Return {ArrayAddress: 'Chr_ChrStart_ChrEnd'} read from one gene-pair annotation file."""
+    coeqtl_annotation_df = pd.read_csv(path, sep='\t')
+    coeqtl_annotation_df['chr_pos'] = ['_'.join([str(ele) for ele in item]) for item in
+                                       coeqtl_annotation_df[['Chr', 'ChrStart', 'ChrEnd']].values]
+    return coeqtl_annotation_df.set_index('ArrayAddress')['chr_pos'].T.to_dict()
+
+
+def main():
+    """Build the reduced permutation table, write it to ``--save_prefix`` and return it."""
+    args = arguments().parse_args()
+    eqtl_path, results_prefix, save_prefix = args.eqtl_path, Path(args.result_prefix), Path(args.save_prefix)
+    eqtl_df = pd.read_csv(eqtl_path, sep='\t')
+    eqtl_df['snp_gene1'] = ['_'.join(item) for item in eqtl_df[['SNPName', 'genename']].values]
+    unique_snpgene1 = eqtl_df['snp_gene1'].values
+    # Start every eQTL SNP / gene at 1.0, the neutral element for a running minimum of p-values.
+    snpgene1_minpvalues_df = pd.DataFrame(data=np.ones((len(unique_snpgene1), N_PERMUTATIONS)),
+                                          columns=[f'Perm{ind}' for ind in range(N_PERMUTATIONS)])
+    snpgene1_minpvalues_df['snp_eqtlgene'] = unique_snpgene1
+    # The three sub-runs used to be three copy-pasted stanzas; the processing order is unchanged.
+    for version in RUN_VERSIONS:
+        coeqtl_annotation_dict = _load_coeqtl_annotation(
+            f'{args.annotation_prefix}.genepairs.annotation.gene1position.{version}.tsv')
+        output_dir = results_prefix / version / 'output'
+        for filename in tqdm(os.listdir(output_dir)):
+            if '-Permutations.txt.gz' in filename:
+                snpgene1_minpvalues_df = update_dictionary_per_permutation_batch(
+                    output_dir / filename, snpgene1_minpvalues_df, coeqtl_annotation_dict)
+    snpgene1_minpvalues_df.to_csv(save_prefix / 'concated_alltests_permutations_fixed.tsv.gz',
+                                  sep='\t', compression='gzip')
+    return snpgene1_minpvalues_df
+
+
+if __name__ == '__main__':
+    _ = main()
diff --git a/04_coeqtl_mapping/select_snps_from_vcf.sh b/04_coeqtl_mapping/select_snps_from_vcf.sh
new file mode 100644
index 0000000..cc710c4
--- /dev/null
+++ b/04_coeqtl_mapping/select_snps_from_vcf.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+#
+# Write the records of a VCF whose ID is listed in a SNP file to a new file.
+# Usage: select_snps_from_vcf.sh <vcfpath> <snpspath> <savepath>
+#   vcfpath   input VCF passed to bcftools view
+#   snpspath  file of variant IDs, used in the bcftools expression ID==@<file>
+#   savepath  file that receives bcftools' standard output
+
+if [[ $# -ne 3 ]]; then
+    echo "Usage: $0 <vcfpath> <snpspath> <savepath>" >&2
+    exit 1
+fi
+
+vcfpath=$1
+snpspath=$2
+savepath=$3
+
+ml BCFtools
+# Quote every expansion so paths containing spaces or glob characters stay single arguments.
+bcftools view --include "ID==@${snpspath}" "${vcfpath}" > "${savepath}"
diff --git a/04_coeqtl_mapping/submit_individual_networks.sh b/04_coeqtl_mapping/submit_individual_networks.sh
new file mode 100644
index 0000000..7a57463
--- /dev/null
+++ b/04_coeqtl_mapping/submit_individual_networks.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+#SBATCH --time=16:00:00
+#SBATCH --mem=80gb
+#SBATCH --nodes=1
+#SBATCH --export=NONE
+#SBATCH --get-user-env=L
+
+# Slurm job: run individual_networks.py with the three given arguments.
+# Usage: sbatch submit_individual_networks.sh <datasetname> <celltype> <condition>
+
+if [[ $# -lt 3 ]]; then
+    echo "Usage: $0 <datasetname> <celltype> <condition>" >&2
+    exit 1
+fi
+
+module purge
+
+conda init bash
+source /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/etc/profile.d/conda.sh
+conda activate scpy3.8
+
+
+# Quote the positional arguments so values containing spaces are passed through intact.
+python /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/individual_networks/individual_networks.py \
+--datasetname "$1" \
+--celltype "$2" \
+--condition "$3"
diff --git a/04_coeqtl_mapping/submit_merge_coexpression.sh b/04_coeqtl_mapping/submit_merge_coexpression.sh
new file mode 100644
index 0000000..52f6789
--- /dev/null
+++ b/04_coeqtl_mapping/submit_merge_coexpression.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+#SBATCH --time=8:00:00
+#SBATCH --mem=80gb
+#SBATCH --nodes=1
+#SBATCH --export=NONE
+#SBATCH --get-user-env=L
+
+# Slurm job: run merge_coexpression_for_betaeqtl.py and then
+# prepare_genelist_and_annotation_for_betaqtl.py for one cell type / condition.
+# Usage: sbatch submit_merge_coexpression.sh <celltype> <condition>
+
+if [[ $# -lt 2 ]]; then
+    echo "Usage: $0 <celltype> <condition>" >&2
+    exit 1
+fi
+
+module purge
+
+conda init bash
+source /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/etc/profile.d/conda.sh
+conda activate scpy3.8
+
+
+# Quote the positional arguments so values containing spaces are passed through intact.
+python /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/individual_networks/merge_coexpression_for_betaeqtl.py \
+--celltype "$1" \
+--condition "$2"
+
+python /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/input/individual_networks/prepare_genelist_and_annotation_for_betaqtl.py \
+--celltype "$1" \
+--condition "$2"
diff --git a/04_coeqtl_mapping/submit_process_betaqtl_results.sh b/04_coeqtl_mapping/submit_process_betaqtl_results.sh
new file mode 100644
index 0000000..e3c45c8
--- /dev/null
+++ b/04_coeqtl_mapping/submit_process_betaqtl_results.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+#SBATCH --time=01:00:00
+#SBATCH --mem=20gb
+#SBATCH --nodes=1
+#SBATCH --open-mode=append
+#SBATCH --export=NONE
+#SBATCH --get-user-env=L
+
+# Slurm job: for one cell type, run concat_betaqtl_results.fixed.py,
+# screen_permutation_p_values.py and multipletesting_correction.fixed.py, first on
+# the unfiltered and then on the filtered results.
+# Usage: sbatch submit_process_betaqtl_results.sh <celltype> [celltype_individual]
+
+module purge
+
+conda init bash
+source /groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/tools/Beeline/miniconda/etc/profile.d/conda.sh
+conda activate scpy3.8
+
+
+if [[ $# -lt 1 ]]; then
+    echo "Usage: $0 <celltype> [celltype_individual]" >&2
+    exit 1
+fi
+celltype=$1
+# ${celltype_individual} was used in every result path but never assigned, so the
+# paths silently collapsed to "<condition>_". It is now an optional second argument.
+# NOTE(review): falling back to <celltype> is an assumption - confirm the folder naming.
+celltype_individual=${2:-${celltype}}
+
+condition='UT'
+workdir="./"
+
+# Run the three post-processing steps for one result set ($1: unfiltered_results | filtered_results).
+process_results() {
+    local result_dir="${workdir}/output/$1/${condition}_${celltype_individual}"
+    local eqtl_path="${workdir}/input/snp_selection/eqtl/${condition}_${celltype}_eQTLProbesFDR0.05-ProbeLevel.tsv"
+    local annotation_prefix="${workdir}/input/summary/${condition}_${celltype}"
+
+    python "${workdir}/output/concat_betaqtl_results.fixed.py" \
+        --prefix "${result_dir}" \
+        --savepath "${result_dir}/concated_alltests_output_fixed.tsv.gz" \
+        --annotation_prefix "${annotation_prefix}"
+    python "${workdir}/output/screen_permutation_p_values.py" \
+        --eqtl_path "${eqtl_path}" \
+        --result_prefix "${result_dir}" \
+        --save_prefix "${result_dir}" \
+        --annotation_prefix "${annotation_prefix}"
+    python "${workdir}/output/multipletesting_correction.fixed.py" \
+        --permutation_pvalue_path "${result_dir}/concated_alltests_permutations_fixed.tsv.gz" \
+        --coeqtl_path "${result_dir}/concated_alltests_output_fixed.tsv.gz" \
+        --eqtl_path "${eqtl_path}" \
+        --save_prefix "${result_dir}/coeqtls_fullresults_fixed"
+}
+
+process_results unfiltered_results
+process_results filtered_results
diff --git a/05_coeqtl_interpretation/.ipynb_checkpoints/LDTRAIT-checkpoint.ipynb b/05_coeqtl_interpretation/.ipynb_checkpoints/LDTRAIT-checkpoint.ipynb
new file mode 100644
index 0000000..659b6fa
--- /dev/null
+++ b/05_coeqtl_interpretation/.ipynb_checkpoints/LDTRAIT-checkpoint.ipynb
@@ -0,0 +1,944 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import requests\n",
+ "from tqdm import tqdm\n",
+ "import os\n",
+ "from io import StringIO\n",
+ "from pathlib import Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "savedir = Path(\"./annotated_coeqtl_snps/ldtrait\")\n",
+ "\n",
+ "celltypesnps = {}\n",
+ "merged_dict = pd.read_excel('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/summary/coeQTLs_6majorcelltypes.filtered.xlsx',\n",
+ " sheet_name=None)\n",
+ "for celltype in merged_dict.keys():\n",
+ " celltypesnps[celltype] = list(merged_dict[celltype]['SNP'].unique())\n",
+ "allcelltypes_snps = list(set([ele for l in celltypesnps.values() for ele in l]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "72"
+ ]
+ },
+ "execution_count": 90,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(allcelltypes_snps)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 17%|█▋ | 12/72 [05:53<34:19, 34.33s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs62480001\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 18%|█▊ | 13/72 [06:52<39:48, 40.48s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs817352\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 19%|█▉ | 14/72 [07:11<33:43, 34.89s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs80164297\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 24%|██▎ | 17/72 [09:23<37:43, 41.16s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs11772922\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 26%|██▋ | 19/72 [10:43<35:14, 39.89s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs3758833\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 29%|██▉ | 21/72 [11:39<28:28, 33.49s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs11047696\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 31%|███ | 22/72 [12:09<27:15, 32.70s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs9971029\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 32%|███▏ | 23/72 [12:35<24:55, 30.53s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs4949655\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 42%|████▏ | 30/72 [16:56<26:11, 37.41s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs6007595\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 43%|████▎ | 31/72 [17:27<24:21, 35.64s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs7309189\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 44%|████▍ | 32/72 [18:20<27:11, 40.79s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs9657360\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 49%|████▊ | 35/72 [19:43<20:09, 32.70s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs731835\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 53%|█████▎ | 38/72 [26:08<51:53, 91.57s/it] "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs260503\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 58%|█████▊ | 42/72 [28:04<22:59, 46.00s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs13140099\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 60%|█████▉ | 43/72 [28:34<19:51, 41.10s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs2235910\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 71%|███████ | 51/72 [42:19<40:20, 115.26s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs1628955\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 74%|███████▎ | 53/72 [43:55<25:05, 79.24s/it] "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs12443580\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 82%|████████▏ | 59/72 [48:18<09:34, 44.16s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs150458741\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 85%|████████▍ | 61/72 [49:39<07:29, 40.88s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs62423804\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 86%|████████▌ | 62/72 [50:16<06:36, 39.69s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs2267989\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 89%|████████▉ | 64/72 [50:56<03:58, 29.82s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs7605964\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 99%|█████████▊| 71/72 [54:21<00:24, 24.54s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs1261896\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 72/72 [54:53<00:00, 45.75s/it]\n"
+ ]
+ }
+ ],
+ "source": [
+    "# curl -k -H \"Content-Type: application/json\" -X POST -d '{\"snps\": \"rs3\\nrs4\", \"pop\": \"YRI\", \"r2_d\": \"r2\", \"r2_d_threshold\": \"0.1\", \"window\": \"500000\", \"genome_build\": \"grch37\"}' 'https://ldlink.nci.nih.gov/LDlinkRest/ldtrait?token=faketoken123'\n",
+    "# Query the LDlink LDtrait REST API once per SNP; each reply is cached as <snp>.tsv in savedir.\n",
+    "token = os.environ['LDLINK_TOKEN']  # was undefined on a fresh kernel; read from the environment so it is never committed\n",
+    "for snp in tqdm(allcelltypes_snps):\n",
+    "    if os.path.exists(savedir/f'{snp}.tsv'):\n",
+    "        continue  # already downloaded in an earlier run\n",
+    "    params = {\"snps\": snp,\n",
+    "              \"pop\": \"CEU\",\n",
+    "              \"r2_d\": \"r2\",\n",
+    "              \"r2_d_threshold\": \"0.8\",\n",
+    "              \"window\": \"500000\",\n",
+    "              \"genome_build\": \"grch37\"}\n",
+    "    try:\n",
+    "        r = requests.request(headers={\"Content-Type\": \"application/json\"},\n",
+    "                             method='POST',\n",
+    "                             json=params,\n",
+    "                             url=f'https://ldlink.nci.nih.gov/LDlinkRest/ldtrait?token={token}')\n",
+    "        if \"No entries in the GWAS Catalog are identified using the LDtrait search criteria.\" in r.text:\n",
+    "            print('no GWAS:', snp)\n",
+    "            continue\n",
+    "        r_df = pd.read_csv(StringIO(r.text), sep='\\t')\n",
+    "        r_df.to_csv(savedir/f'{snp}.tsv', sep='\\t', index=False)\n",
+    "    except (requests.RequestException, pd.errors.ParserError, pd.errors.EmptyDataError, OSError) as err:\n",
+    "        # keep going so one bad SNP does not abort the whole download, but report why it failed\n",
+    "        print('failed entry:', snp, repr(err))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 72/72 [00:00<00:00, 298.96it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Gather the cached per-SNP LDtrait tables (skipping saved API error replies) into one summary file.\n",
+    "ldtrait_tables = []  # collect first, concatenate once: growing a DataFrame inside the loop is quadratic\n",
+    "for snp in tqdm(allcelltypes_snps):\n",
+    "    if os.path.exists(savedir/f'{snp}.tsv'):\n",
+    "        df = pd.read_csv(savedir/f'{snp}.tsv', sep='\\t')\n",
+    "        if 'error' not in df.iloc[0].values[0]:\n",
+    "            ldtrait_tables.append(df)\n",
+    "allsnps_inld_gwas_df = pd.concat(ldtrait_tables, axis=0) if ldtrait_tables else pd.DataFrame()\n",
+    "allsnps_inld_gwas_df.to_csv(savedir/'summary.tsv', sep='\\t', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "allsnps_inld_gwas_df = pd.read_csv(savedir/'summary.tsv', sep='\\t')\n",
+ "magma_df = pd.read_csv(savedir/'coeqtl_with_gwas_and_magma.tsv', sep='\\t')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " VARIABLE | \n",
+ " celltype | \n",
+ " SNP | \n",
+ " gene | \n",
+ " TYPE | \n",
+ " NGENES | \n",
+ " BETA | \n",
+ " BETA_STD | \n",
+ " SE | \n",
+ " P | \n",
+ " ... | \n",
+ " non_effect_allele | \n",
+ " current_build | \n",
+ " frequency | \n",
+ " sample_size | \n",
+ " zscore | \n",
+ " pvalue | \n",
+ " effect_size | \n",
+ " standard_error | \n",
+ " imputation_status | \n",
+ " n_cases | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " B_rs1131017_RPS26 | \n",
+ " B | \n",
+ " rs1131017 | \n",
+ " RPS26 | \n",
+ " SET | \n",
+ " 38 | \n",
+ " -0.199320 | \n",
+ " -0.008952 | \n",
+ " 0.12542 | \n",
+ " 0.943980 | \n",
+ " ... | \n",
+ " G | \n",
+ " hg38 | \n",
+ " 0.580808 | \n",
+ " 54612 | \n",
+ " 0.138937 | \n",
+ " 0.889500 | \n",
+ " 0.002200 | \n",
+ " 0.015600 | \n",
+ " original | \n",
+ " 17008.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " B_rs1131017_RPS26 | \n",
+ " B | \n",
+ " rs1131017 | \n",
+ " RPS26 | \n",
+ " SET | \n",
+ " 38 | \n",
+ " 0.201320 | \n",
+ " 0.009042 | \n",
+ " 0.12905 | \n",
+ " 0.059382 | \n",
+ " ... | \n",
+ " G | \n",
+ " hg38 | \n",
+ " 0.580808 | \n",
+ " 53293 | \n",
+ " 1.735682 | \n",
+ " 0.082620 | \n",
+ " 0.023902 | \n",
+ " 0.013700 | \n",
+ " original | \n",
+ " 19099.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " B_rs1131017_RPS26 | \n",
+ " B | \n",
+ " rs1131017 | \n",
+ " RPS26 | \n",
+ " SET | \n",
+ " 37 | \n",
+ " 0.163610 | \n",
+ " 0.007256 | \n",
+ " 0.12608 | \n",
+ " 0.097201 | \n",
+ " ... | \n",
+ " G | \n",
+ " hg38 | \n",
+ " 0.580808 | \n",
+ " 29344 | \n",
+ " -2.348664 | \n",
+ " 0.018841 | \n",
+ " -0.010569 | \n",
+ " 0.004363 | \n",
+ " original | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " B_rs1131017_RPS26 | \n",
+ " B | \n",
+ " rs1131017 | \n",
+ " RPS26 | \n",
+ " SET | \n",
+ " 38 | \n",
+ " -0.010395 | \n",
+ " -0.000467 | \n",
+ " 0.11668 | \n",
+ " 0.535490 | \n",
+ " ... | \n",
+ " G | \n",
+ " hg38 | \n",
+ " 0.580808 | \n",
+ " 15954 | \n",
+ " -0.324182 | \n",
+ " 0.745800 | \n",
+ " -0.009950 | \n",
+ " 0.025700 | \n",
+ " original | \n",
+ " 7387.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " B_rs1131017_RPS26 | \n",
+ " B | \n",
+ " rs1131017 | \n",
+ " RPS26 | \n",
+ " SET | \n",
+ " 38 | \n",
+ " 0.282350 | \n",
+ " 0.012677 | \n",
+ " 0.11706 | \n",
+ " 0.007937 | \n",
+ " ... | \n",
+ " G | \n",
+ " hg38 | \n",
+ " 0.580808 | \n",
+ " 337159 | \n",
+ " -1.597883 | \n",
+ " 0.110069 | \n",
+ " -0.000210 | \n",
+ " 0.000132 | \n",
+ " original | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 44 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " VARIABLE celltype SNP gene TYPE NGENES BETA \\\n",
+ "0 B_rs1131017_RPS26 B rs1131017 RPS26 SET 38 -0.199320 \n",
+ "1 B_rs1131017_RPS26 B rs1131017 RPS26 SET 38 0.201320 \n",
+ "2 B_rs1131017_RPS26 B rs1131017 RPS26 SET 37 0.163610 \n",
+ "3 B_rs1131017_RPS26 B rs1131017 RPS26 SET 38 -0.010395 \n",
+ "4 B_rs1131017_RPS26 B rs1131017 RPS26 SET 38 0.282350 \n",
+ "\n",
+ " BETA_STD SE P ... non_effect_allele current_build \\\n",
+ "0 -0.008952 0.12542 0.943980 ... G hg38 \n",
+ "1 0.009042 0.12905 0.059382 ... G hg38 \n",
+ "2 0.007256 0.12608 0.097201 ... G hg38 \n",
+ "3 -0.000467 0.11668 0.535490 ... G hg38 \n",
+ "4 0.012677 0.11706 0.007937 ... G hg38 \n",
+ "\n",
+ " frequency sample_size zscore pvalue effect_size standard_error \\\n",
+ "0 0.580808 54612 0.138937 0.889500 0.002200 0.015600 \n",
+ "1 0.580808 53293 1.735682 0.082620 0.023902 0.013700 \n",
+ "2 0.580808 29344 -2.348664 0.018841 -0.010569 0.004363 \n",
+ "3 0.580808 15954 -0.324182 0.745800 -0.009950 0.025700 \n",
+ "4 0.580808 337159 -1.597883 0.110069 -0.000210 0.000132 \n",
+ "\n",
+ " imputation_status n_cases \n",
+ "0 original 17008.0 \n",
+ "1 original 19099.0 \n",
+ "2 original NaN \n",
+ "3 original 7387.0 \n",
+ "4 original NaN \n",
+ "\n",
+ "[5 rows x 44 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "magma_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "VARIABLE B_rs1131017_RPS26\n",
+ "celltype B\n",
+ "SNP rs1131017\n",
+ "gene RPS26\n",
+ "TYPE SET\n",
+ "NGENES 38\n",
+ "BETA -0.19932\n",
+ "BETA_STD -0.008952\n",
+ "SE 0.12542\n",
+ "P 0.94398\n",
+ "prefix results/current/magma/AD\n",
+ "trait AD\n",
+ "FDR 0.973479\n",
+ "Tag IGAP_Alzheimer\n",
+ "PUBMED_Paper_Link http://www.ncbi.nlm.nih.gov/pubmed/24162737\n",
+ "Phenotype Alzheimer\n",
+ "RSID rs10876864\n",
+ "RSALIAS rs57455456\n",
+ "CHR 12\n",
+ "POS1 56435929\n",
+ "POS2 56401085\n",
+ "DIST -34844\n",
+ "R2 0.991789\n",
+ "D 0.240643\n",
+ "DPRIME 0.995886\n",
+ "MAJOR A\n",
+ "MINOR G\n",
+ "MAF 0.408549\n",
+ "CMMB 0.155229\n",
+ "CM 71.092406\n",
+ "panel_variant_id chr12_56007301_G_A_b38\n",
+ "chromosome chr12\n",
+ "position 56007301\n",
+ "effect_allele A\n",
+ "non_effect_allele G\n",
+ "current_build hg38\n",
+ "frequency 0.580808\n",
+ "sample_size 54612\n",
+ "zscore 0.138937\n",
+ "pvalue 0.8895\n",
+ "effect_size 0.0022\n",
+ "standard_error 0.0156\n",
+ "imputation_status original\n",
+ "n_cases 17008.0\n",
+ "Name: 0, dtype: object"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "magma_df.iloc[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[0.041321, 'Inflammatory Bowel Disease'],\n",
+ " [0.030935, 'Non-cancer illness code, self-reported: psoriasis'],\n",
+ " [0.0090688,\n",
+ " 'Non-cancer illness code, self-reported: schizophrenia'],\n",
+ " [0.0042454,\n",
+ " 'Overall breast cancer in Europeans, imputed genotype'],\n",
+ " [0.032584, 'Diagnoses - main ICD10: G40 Epilepsy'],\n",
+ " [0.0013766,\n",
+ " 'Estrogen-receptor-negative breast cancer in Europeans, imputed genotype'],\n",
+ " [0.025212,\n",
+ " 'Non-cancer illness code, self-reported: high cholesterol']],\n",
+ " dtype=object)"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "magma_df[(magma_df['SNP']=='rs4147638') & (magma_df['P']<0.05)][['P', 'Phenotype']].values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Query | \n",
+ " GWAS Trait | \n",
+ " RS Number | \n",
+ " Position (GRCh37) | \n",
+ " Alleles | \n",
+ " R2 | \n",
+ " D' | \n",
+ " Risk Allele | \n",
+ " Effect Size (95% CI) | \n",
+ " Beta or OR | \n",
+ " P-value | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " rs2954654 | \n",
+ " Type 2 diabetes | \n",
+ " rs2294120 | \n",
+ " chr8:146003567 | \n",
+ " A=0.52, G=0.48 | \n",
+ " 0.846295 | \n",
+ " 0.957895 | \n",
+ " 0.455879299759268 | \n",
+ " 0.04430 | \n",
+ " 0.029-0.06 | \n",
+ " 2.000000e-08 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " rs4840568 | \n",
+ " Albumin-globulin ratio | \n",
+ " rs2409780 | \n",
+ " chr8:11337587 | \n",
+ " C=0.237, T=0.763 | \n",
+ " 0.897156 | \n",
+ " 1.000000 | \n",
+ " NR | \n",
+ " 0.04604 | \n",
+ " 0.035-0.057 | \n",
+ " 1.000000e-16 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " rs4840568 | \n",
+ " Non-albumin protein levels | \n",
+ " rs2409780 | \n",
+ " chr8:11337587 | \n",
+ " C=0.237, T=0.763 | \n",
+ " 0.897156 | \n",
+ " 1.000000 | \n",
+ " NR | \n",
+ " 0.04456 | \n",
+ " 0.034-0.055 | \n",
+ " 1.000000e-15 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " rs4840568 | \n",
+ " Rheumatoid arthritis | \n",
+ " rs2618444 | \n",
+ " chr8:11338370 | \n",
+ " A=0.763, C=0.237 | \n",
+ " 0.897156 | \n",
+ " 1.000000 | \n",
+ " NR | \n",
+ " 0.10050 | \n",
+ " 0.072-0.129 | \n",
+ " 7.000000e-12 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " rs4840568 | \n",
+ " Systemic lupus erythematosus | \n",
+ " rs2618444 | \n",
+ " chr8:11338370 | \n",
+ " A=0.763, C=0.237 | \n",
+ " 0.897156 | \n",
+ " 1.000000 | \n",
+ " NR | \n",
+ " 1.36000 | \n",
+ " 1.22-1.51 | \n",
+ " 7.000000e-09 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Query GWAS Trait RS Number Position (GRCh37) \\\n",
+ "0 rs2954654 Type 2 diabetes rs2294120 chr8:146003567 \n",
+ "1 rs4840568 Albumin-globulin ratio rs2409780 chr8:11337587 \n",
+ "2 rs4840568 Non-albumin protein levels rs2409780 chr8:11337587 \n",
+ "3 rs4840568 Rheumatoid arthritis rs2618444 chr8:11338370 \n",
+ "4 rs4840568 Systemic lupus erythematosus rs2618444 chr8:11338370 \n",
+ "\n",
+ " Alleles R2 D' Risk Allele \\\n",
+ "0 A=0.52, G=0.48 0.846295 0.957895 0.455879299759268 \n",
+ "1 C=0.237, T=0.763 0.897156 1.000000 NR \n",
+ "2 C=0.237, T=0.763 0.897156 1.000000 NR \n",
+ "3 A=0.763, C=0.237 0.897156 1.000000 NR \n",
+ "4 A=0.763, C=0.237 0.897156 1.000000 NR \n",
+ "\n",
+ " Effect Size (95% CI) Beta or OR P-value \n",
+ "0 0.04430 0.029-0.06 2.000000e-08 \n",
+ "1 0.04604 0.035-0.057 1.000000e-16 \n",
+ "2 0.04456 0.034-0.055 1.000000e-15 \n",
+ "3 0.10050 0.072-0.129 7.000000e-12 \n",
+ "4 1.36000 1.22-1.51 7.000000e-09 "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "allsnps_inld_gwas_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "allsnps_inld_gwas_df.to_excel('./coeqtl_mapping/output/snps_in_ld_with_gwas_catelogue.xlsx')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/05_coeqtl_interpretation/.ipynb_checkpoints/TEM_NAIVE-checkpoint.ipynb b/05_coeqtl_interpretation/.ipynb_checkpoints/TEM_NAIVE-checkpoint.ipynb
new file mode 100644
index 0000000..314c2f6
--- /dev/null
+++ b/05_coeqtl_interpretation/.ipynb_checkpoints/TEM_NAIVE-checkpoint.ipynb
@@ -0,0 +1,1437 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import re\n",
+ "from itertools import combinations\n",
+ "from pathlib import Path\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import scanpy as sc\n",
+ "from scipy.stats import spearmanr\n",
+ "from scipy.stats import t, norm\n",
+ "from tqdm import tqdm\n",
+ "import argparse\n",
+ "from scipy.stats import rankdata\n",
+ "from collections import namedtuple\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from scipy import stats\n",
+ "%matplotlib inline\n",
+ "\n",
+ "\n",
+ "def get_time(x):\n",
+ " if x == 'UT':\n",
+ " return x\n",
+ " else:\n",
+ " pattern = re.compile(r'\\d+h')\n",
+ " return re.findall(pattern, x)[0]\n",
+ "\n",
+ "\n",
+ "class DATASET:\n",
+ " def __init__(self, datasetname):\n",
+ " self.name = datasetname\n",
+ " self.path_prefix = Path(\"./seurat_objects\")\n",
+ " self.information = self.get_information()\n",
+ " def get_information(self):\n",
+ " if self.name == 'onemillionv2':\n",
+ " self.path = '1M_v2_mediumQC_ctd_rnanormed_demuxids_20201029.sct.h5ad'\n",
+ " self.individual_id_col = 'assignment'\n",
+ " self.timepoint_id_col = 'time'\n",
+ " self.celltype_id = 'cell_type_lowerres'\n",
+ " self.chosen_condition = {'UT': 'UT',\n",
+ " 'stimulated': '3h'}\n",
+ " elif self.name == 'onemillionv3':\n",
+ " self.path = '1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.SCT.h5ad'\n",
+ " self.individual_id_col = 'assignment'\n",
+ " self.timepoint_id_col = 'time'\n",
+ " self.celltype_id = 'cell_type_lowerres'\n",
+ " self.chosen_condition = {'UT': 'UT',\n",
+ " 'stimulated': '3h'}\n",
+ " elif self.name == 'stemiv2':\n",
+ " self.path = 'cardio.integrated.20210301.stemiv2.h5ad'\n",
+ " self.individual_id_col = 'assignment.final'\n",
+ " self.timepoint_id_col = 'timepoint.final'\n",
+ " self.celltype_id = 'cell_type_lowerres'\n",
+ " self.chosen_condition = {'UT': 't8w',\n",
+ " 'stimulated': 'Baseline'}\n",
+ " elif self.name == 'ng':\n",
+ " self.path = 'pilot3_seurat3_200420_sct_azimuth.h5ad'\n",
+ " self.individual_id_col = 'snumber'\n",
+ " self.celltype_id = 'cell_type_mapped_to_onemillion'\n",
+ " else:\n",
+ " raise IOError(\"Dataset name not understood.\")\n",
+ " def load_dataset(self):\n",
+ " self.get_information()\n",
+ " print(f'Loading dataset {self.name} from {self.path_prefix} {self.path}')\n",
+ " self.data_sc = sc.read_h5ad(self.path_prefix / self.path)\n",
+ " if self.name.startswith('onemillion'):\n",
+ " self.data_sc.obs['time'] = [get_time(item) for item in self.data_sc.obs['timepoint']]\n",
+ " elif self.name == 'ng':\n",
+ " celltype_maping = {'CD4 T': 'CD4T', 'CD8 T': 'CD8T', 'Mono': 'monocyte', 'DC': 'DC', 'NK': 'NK',\n",
+ " 'other T': 'otherT', 'other': 'other', 'B': 'B'}\n",
+ " self.data_sc.obs['cell_type_mapped_to_onemillion'] = [celltype_maping.get(name) for name in\n",
+ " self.data_sc.obs['predicted.celltype.l1']]\n",
+ "\n",
+ "def corr_to_z(coef, num):\n",
+ " t_statistic = coef * np.sqrt((num - 2) / (1 - coef ** 2))\n",
+ " prob = t.cdf(t_statistic, num - 2)\n",
+ " z_score = norm.ppf(prob)\n",
+ " positive_coef_probs = 1 - prob\n",
+ " positive_coef_probs[coef < 0] = 0\n",
+ " negative_coef_probs = prob\n",
+ " negative_coef_probs[coef > 0] = 0\n",
+ " probs = negative_coef_probs + positive_coef_probs\n",
+ " return z_score, probs\n",
+ "\n",
+ "\n",
+ "def get_individual_networks_selected_genepairs(data_df, data_sc, individual_colname, genepair):\n",
+ "    \"\"\"Compute one co-expression z-score per individual for a single gene pair.\n",
+ "\n",
+ "    For every individual with more than 10 cells, the Spearman correlation of\n",
+ "    the two genes across that individual's cells is converted with corr_to_z.\n",
+ "\n",
+ "    Args:\n",
+ "        data_df: expression DataFrame (cells x genes) containing both genes as columns.\n",
+ "        data_sc: object whose ``obs`` table holds ``individual_colname``; its rows are\n",
+ "            matched to ``data_df`` through a boolean mask built from ``obs``.\n",
+ "        individual_colname: name of the ``obs`` column identifying the individual.\n",
+ "        genepair: the two gene names joined by ';', e.g. 'RPS26;RUNX3'.\n",
+ "\n",
+ "    Returns:\n",
+ "        (data_selected_df, zscore_df, zscore_p_df): the two selected expression\n",
+ "        columns, and two single-row DataFrames (row label: the alphabetically\n",
+ "        sorted gene pair) with one column per retained individual.\n",
+ "\n",
+ "    Raises:\n",
+ "        ValueError: if selecting the two genes does not yield exactly two columns\n",
+ "            (for example because ``data_df`` has duplicated gene column names).\n",
+ "    \"\"\"\n",
+ "    gene1, gene2 = genepair.split(';')\n",
+ "    sorted_genepair = [';'.join(sorted([gene1, gene2]))]\n",
+ "    zscore_df = pd.DataFrame(index=sorted_genepair)\n",
+ "    zscore_p_df = pd.DataFrame(index=sorted_genepair)\n",
+ "    data_selected_df = data_df[[gene1, gene2]]\n",
+ "    if data_selected_df.shape[1] != 2:\n",
+ "        # The multi-column code path previously relied on an undefined name and could\n",
+ "        # only fail with a NameError; report the real problem instead.\n",
+ "        raise ValueError(\n",
+ "            f'Expected exactly 2 expression columns for {genepair}, got {data_selected_df.shape[1]}'\n",
+ "        )\n",
+ "    individual_labels = data_sc.obs[individual_colname]\n",
+ "    individual_ids = individual_labels.unique()\n",
+ "    print(\n",
+ "        f\"Begin calculating networks for {len(individual_ids)} individuals and;\\n{genepair}\"\n",
+ "    )\n",
+ "    for ind_id in tqdm(individual_ids):\n",
+ "        is_individual = individual_labels == ind_id\n",
+ "        cell_num = int(is_individual.sum())\n",
+ "        if cell_num <= 10:\n",
+ "            print(\"Deleted this individual because of low cell number\", cell_num)\n",
+ "            continue\n",
+ "        individual_df = data_selected_df.loc[is_individual]\n",
+ "        # With two columns spearmanr returns a single coefficient and p-value.\n",
+ "        individual_coef, _ = spearmanr(individual_df.values, axis=0)\n",
+ "        try:\n",
+ "            individual_zscore, individual_zscore_p = corr_to_z(np.array([individual_coef]), cell_num)\n",
+ "        except Exception as err:\n",
+ "            # Best effort: skip this individual, but say so instead of hiding the failure.\n",
+ "            print(f'Skipped individual {ind_id}: correlation could not be converted to a z-score ({err})')\n",
+ "            continue\n",
+ "        zscore_df[ind_id] = individual_zscore\n",
+ "        zscore_p_df[ind_id] = individual_zscore_p\n",
+ "    return data_selected_df, zscore_df, zscore_p_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### One million data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the genotype (GT) table; the first 6 lines of the VCF file are skipped.\n",
+ "def change_colnames(col):\n",
+ "    # Column names containing 'LLDeep' lose their first '_'-separated token; others are kept as-is.\n",
+ "    if 'LLDeep' not in col:\n",
+ "        return col\n",
+ "    return '_'.join(col.split('_')[1:])\n",
+ "\n",
+ "gt = pd.read_csv('./coeqtl_interpretation/rs1131017_TEM_ratio/rs1131017.vcf',\n",
+ "                 sep='\\t', skiprows=6).rename(columns=change_colnames)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " #CHROM | \n",
+ " POS | \n",
+ " ID | \n",
+ " REF | \n",
+ " ALT | \n",
+ " QUAL | \n",
+ " FILTER | \n",
+ " INFO | \n",
+ " FORMAT | \n",
+ " LLDeep_1191 | \n",
+ " ... | \n",
+ " s21 | \n",
+ " s43 | \n",
+ " s24 | \n",
+ " s23 | \n",
+ " s45 | \n",
+ " s26 | \n",
+ " s25 | \n",
+ " s28 | \n",
+ " s27 | \n",
+ " s29 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 12 | \n",
+ " 56435929 | \n",
+ " rs1131017 | \n",
+ " C | \n",
+ " G | \n",
+ " . | \n",
+ " . | \n",
+ " . | \n",
+ " GT:DS | \n",
+ " 0/0:0.0 | \n",
+ " ... | \n",
+ " 1/1:2.0 | \n",
+ " 1/1:2.0 | \n",
+ " 1/1:2.0 | \n",
+ " 0/1:1.0 | \n",
+ " 1/1:2.0 | \n",
+ " 0/1:1.0 | \n",
+ " 0/0:0.0 | \n",
+ " 1/1:2.0 | \n",
+ " 0/0:0.06000000000000005 | \n",
+ " 1/1:2.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1 rows × 182 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT LLDeep_1191 \\\n",
+ "0 12 56435929 rs1131017 C G . . . GT:DS 0/0:0.0 \n",
+ "\n",
+ " ... s21 s43 s24 s23 s45 s26 s25 \\\n",
+ "0 ... 1/1:2.0 1/1:2.0 1/1:2.0 0/1:1.0 1/1:2.0 0/1:1.0 0/0:0.0 \n",
+ "\n",
+ " s28 s27 s29 \n",
+ "0 1/1:2.0 0/0:0.06000000000000005 1/1:2.0 \n",
+ "\n",
+ "[1 rows x 182 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "gt.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### CD4T+CD8T"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load onemillion v2 data. The cells below read `dataset.data_sc`, so this has to run\n",
+ "# on a fresh kernel; the guard only skips the (slow) reload when the v2 dataset is\n",
+ "# already present in this session.\n",
+ "if 'dataset' not in globals() or dataset.name != 'onemillionv2':\n",
+ "    dataset = DATASET('onemillionv2')\n",
+ "    dataset.load_dataset()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " assignment | \n",
+ " predicted.celltype.l1 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " LLDeep_0022 | \n",
+ " B | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " CD4 T | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " CD8 T | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " DC | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " Mono | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0\n",
+ "assignment predicted.celltype.l1 \n",
+ "LLDeep_0022 B 0\n",
+ " CD4 T 0\n",
+ " CD8 T 0\n",
+ " DC 0\n",
+ " Mono 0"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " assignment | \n",
+ " predicted.celltype.l2 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " LLDeep_0022 | \n",
+ " ASDC | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " B intermediate | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " B memory | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " B naive | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " CD14 Mono | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0\n",
+ "assignment predicted.celltype.l2 \n",
+ "LLDeep_0022 ASDC 0\n",
+ " B intermediate 0\n",
+ " B memory 0\n",
+ " B naive 0\n",
+ " CD14 Mono 0"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CD4T TEM | \n",
+ " CD4T Naive | \n",
+ " CD8T TEM | \n",
+ " CD8T Naive | \n",
+ " CD4T | \n",
+ " CD8T | \n",
+ " CD4T TCM | \n",
+ " CD8T TCM | \n",
+ " all_num | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " LLDeep_1370 | \n",
+ " 20 | \n",
+ " 85 | \n",
+ " 74 | \n",
+ " 13 | \n",
+ " 224 | \n",
+ " 89 | \n",
+ " 85 | \n",
+ " 8 | \n",
+ " 827 | \n",
+ "
\n",
+ " \n",
+ " LLDeep_0434 | \n",
+ " 42 | \n",
+ " 417 | \n",
+ " 95 | \n",
+ " 93 | \n",
+ " 989 | \n",
+ " 209 | \n",
+ " 406 | \n",
+ " 29 | \n",
+ " 1496 | \n",
+ "
\n",
+ " \n",
+ " LLDeep_1319 | \n",
+ " 110 | \n",
+ " 132 | \n",
+ " 223 | \n",
+ " 4 | \n",
+ " 922 | \n",
+ " 236 | \n",
+ " 546 | \n",
+ " 19 | \n",
+ " 1504 | \n",
+ "
\n",
+ " \n",
+ " LLDeep_0269 | \n",
+ " 21 | \n",
+ " 58 | \n",
+ " 80 | \n",
+ " 7 | \n",
+ " 211 | \n",
+ " 88 | \n",
+ " 101 | \n",
+ " 8 | \n",
+ " 529 | \n",
+ "
\n",
+ " \n",
+ " LLDeep_0471 | \n",
+ " 27 | \n",
+ " 242 | \n",
+ " 81 | \n",
+ " 14 | \n",
+ " 570 | \n",
+ " 103 | \n",
+ " 254 | \n",
+ " 10 | \n",
+ " 1241 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CD4T TEM CD4T Naive CD8T TEM CD8T Naive CD4T CD8T CD4T TCM \\\n",
+ "LLDeep_1370 20 85 74 13 224 89 85 \n",
+ "LLDeep_0434 42 417 95 93 989 209 406 \n",
+ "LLDeep_1319 110 132 223 4 922 236 546 \n",
+ "LLDeep_0269 21 58 80 7 211 88 101 \n",
+ "LLDeep_0471 27 242 81 14 570 103 254 \n",
+ "\n",
+ " CD8T TCM all_num \n",
+ "LLDeep_1370 8 827 \n",
+ "LLDeep_0434 29 1496 \n",
+ "LLDeep_1319 19 1504 \n",
+ "LLDeep_0269 8 529 \n",
+ "LLDeep_0471 10 1241 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 0, 'rs1131017')"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Per-individual cell counts in the 'UT' cells of onemillion v2, using the azimuth\n",
+ "# cell-type predictions (l1 and l2 levels) read from the TSV file.\n",
+ "onemillionv2_celltype_df = pd.read_csv(\n",
+ "    './1M_v2_20201029_azimuth.tsv',\n",
+ "    sep='\\t', index_col=0\n",
+ ")\n",
+ "onemillionv2_annotated = pd.concat([dataset.data_sc.obs, onemillionv2_celltype_df], axis=1)\n",
+ "onemillionv2 = onemillionv2_annotated[onemillionv2_annotated['timepoint'] == 'UT']\n",
+ "onemillionv2_l1_cellratio_df = onemillionv2.groupby(['assignment', 'predicted.celltype.l1']).size().to_frame()\n",
+ "display(onemillionv2_l1_cellratio_df.head())\n",
+ "onemillionv2_celltyperatio = onemillionv2.groupby(['assignment', 'predicted.celltype.l2']).size().to_frame()\n",
+ "display(onemillionv2_celltyperatio.head())\n",
+ "onemillionv2_allcells = onemillionv2['assignment'].value_counts()\n",
+ "\n",
+ "# Calculate the per-individual cell counts used for the CD4T / CD8T TEM and naive ratios.\n",
+ "# to_frame() on the unnamed size() result yields a single column labelled 0.\n",
+ "l1_counts = onemillionv2_l1_cellratio_df[0]\n",
+ "l2_counts = onemillionv2_celltyperatio[0]\n",
+ "# Output column -> (count table, cell-type label inside that table).\n",
+ "count_sources = {\n",
+ "    'CD4T TEM': (l2_counts, 'CD4 TEM'),\n",
+ "    'CD4T Naive': (l2_counts, 'CD4 Naive'),\n",
+ "    'CD8T TEM': (l2_counts, 'CD8 TEM'),\n",
+ "    'CD8T Naive': (l2_counts, 'CD8 Naive'),\n",
+ "    'CD4T': (l1_counts, 'CD4 T'),\n",
+ "    'CD8T': (l1_counts, 'CD8 T'),\n",
+ "    'CD4T TCM': (l2_counts, 'CD4 TCM'),\n",
+ "    'CD8T TCM': (l2_counts, 'CD8 TCM'),\n",
+ "}\n",
+ "individuals = list(onemillionv2['assignment'].unique())\n",
+ "count_records = []\n",
+ "for individual in individuals:\n",
+ "    record = {column: table.loc[(individual, label)] for column, (table, label) in count_sources.items()}\n",
+ "    record['all_num'] = onemillionv2_allcells.loc[individual]\n",
+ "    count_records.append(record)\n",
+ "# Build the frame once instead of growing it column by column and transposing.\n",
+ "individual_ratio_df = pd.DataFrame(count_records, index=individuals,\n",
+ "                                   columns=list(count_sources) + ['all_num'])\n",
+ "display(individual_ratio_df.head())\n",
+ "\n",
+ "# Keep individuals that also have a genotype; sorted so the row order is reproducible.\n",
+ "common_individuals = sorted(set(individual_ratio_df.index) & set(gt.columns))\n",
+ "# .copy() so the new columns are added to an independent frame, not to a slice.\n",
+ "common_individuals_individual_ratio_df = individual_ratio_df.loc[common_individuals].copy()\n",
+ "# Genotype entries look like 'GT:DS' values (e.g. '0/1:1.0'); the part after ':' is used.\n",
+ "common_individuals_individual_ratio_df['gt'] = [float(gt[col].values[0].split(':')[1])\n",
+ "                                                for col in common_individuals]\n",
+ "common_individuals_individual_ratio_df['chemistry'] = 'v2'\n",
+ "\n",
+ "# (numerator column, denominator column, y-axis label) for each panel.\n",
+ "cd4t_panels = [\n",
+ "    ('CD4T TEM', 'CD4T', 'CD4 TEM / CD4T'),\n",
+ "    ('CD4T Naive', 'CD4T', 'CD4 Naive / CD4T'),\n",
+ "    ('CD4T Naive', 'CD4T TEM', 'CD4 Naive / CD4T TEM'),\n",
+ "]\n",
+ "fig, axes = plt.subplots(1, 3, figsize=(15, 5))\n",
+ "for ax, (numerator, denominator, ylabel) in zip(axes, cd4t_panels):\n",
+ "    ratio = (common_individuals_individual_ratio_df[numerator]\n",
+ "             / common_individuals_individual_ratio_df[denominator])\n",
+ "    sns.regplot(x=common_individuals_individual_ratio_df['gt'], y=ratio, ax=ax)\n",
+ "    r, p = stats.pearsonr(common_individuals_individual_ratio_df['gt'], ratio)\n",
+ "    ax.set_title('Oelen v2 r={:.2f}, p={:.2g}'.format(r, p))\n",
+ "    ax.set_ylabel(ylabel)\n",
+ "    ax.set_xlabel('rs1131017')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 0, 'rs1131017')"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# CD8T counterpart of the CD4T panels: Pearson correlation of rs1131017 dosage ('gt')\n",
+ "# with three per-individual cell-count ratios.\n",
+ "# (numerator column, denominator column, y-axis label) for each panel.\n",
+ "cd8t_panels = [\n",
+ "    ('CD8T TEM', 'CD8T', 'CD8 TEM / CD8T'),\n",
+ "    ('CD8T Naive', 'CD8T', 'CD8 Naive / CD8T'),\n",
+ "    ('CD8T Naive', 'CD8T TEM', 'CD8 Naive / CD8T TEM'),\n",
+ "]\n",
+ "fig, axes = plt.subplots(1, 3, figsize=(15, 5))\n",
+ "for ax, (numerator, denominator, ylabel) in zip(axes, cd8t_panels):\n",
+ "    ratio = (common_individuals_individual_ratio_df[numerator]\n",
+ "             / common_individuals_individual_ratio_df[denominator])\n",
+ "    sns.regplot(x=common_individuals_individual_ratio_df['gt'], y=ratio, ax=ax)\n",
+ "    r, p = stats.pearsonr(common_individuals_individual_ratio_df['gt'], ratio)\n",
+ "    ax.set_title('Oelen v2 r={:.2f}, p={:.2g}'.format(r, p))\n",
+ "    ax.set_ylabel(ylabel)\n",
+ "    ax.set_xlabel('rs1131017')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 0, 'rs1131017')"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Ratio of TEM cells (CD8 + CD4) to naive cells (CD8 + CD4) per individual,\n",
+ "# rank-correlated (Spearman) with the rs1131017 dosage in column 'gt'.\n",
+ "ratio_df = common_individuals_individual_ratio_df\n",
+ "tem_cells = ratio_df['CD8T TEM'] + ratio_df['CD4T TEM']\n",
+ "naive_cells = ratio_df['CD8T Naive'] + ratio_df['CD4T Naive']\n",
+ "# Variable name kept from the earlier cells; here it holds the combined CD8+CD4 ratio.\n",
+ "cd8tydata = tem_cells / naive_cells\n",
+ "fig, ax = plt.subplots()\n",
+ "sns.regplot(x=ratio_df['gt'], y=cd8tydata, ax=ax)\n",
+ "r, p = stats.spearmanr(ratio_df['gt'], cd8tydata)\n",
+ "ax.set_title(f'Oelen v2 r={r:.2f}, p={p:.2g}')\n",
+ "ax.set_ylabel('CD8+CD4 TEM / CD8+CD4 Naive')\n",
+ "ax.set_xlabel(\"rs1131017\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Begin calculating networks for 72 individuals and;\n",
+ "RPS26;RUNX3\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 72/72 [00:00<00:00, 207.95it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "SpearmanrResult(correlation=0.29651955126400936, pvalue=0.011433091246178868)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Per-individual RPS26;RUNX3 co-expression z-scores in 'UT' CD4T cells, compared with\n",
+ "# the rs1131017 genotype.\n",
+ "# NOTE: the '*_monocytes_ut' names are kept from the original cell, but the selection\n",
+ "# below is on cell_type_lowerres == 'CD4T'.\n",
+ "onemillionv2_datasc = dataset.data_sc\n",
+ "cell_obs = onemillionv2_datasc.obs\n",
+ "is_cd4t = cell_obs['cell_type_lowerres'] == 'CD4T'\n",
+ "is_ut = cell_obs['time'] == 'UT'\n",
+ "onemillionv2_monocytes_ut = onemillionv2_datasc[is_cd4t & is_ut]\n",
+ "onemillionv2_monocytes_ut_df = pd.DataFrame(\n",
+ "    onemillionv2_monocytes_ut.X.toarray(),\n",
+ "    index=onemillionv2_monocytes_ut.obs.index,\n",
+ "    columns=onemillionv2_monocytes_ut.var.index,\n",
+ ")\n",
+ "genepair = 'RPS26;RUNX3'\n",
+ "data_selected_df, zscore_df, zscore_p_df = get_individual_networks_selected_genepairs(\n",
+ "    onemillionv2_monocytes_ut_df, onemillionv2_monocytes_ut, 'assignment', genepair\n",
+ ")\n",
+ "# After transposing, individuals are rows: the z-scores sit in the column named after\n",
+ "# the gene pair and the genotype strings in column 0 (the single row of `gt`).\n",
+ "concated_df = pd.concat([zscore_df.T, gt.T], axis=1).dropna()\n",
+ "# Number of '1' alleles in the GT part (before ':') of each genotype string.\n",
+ "concated_df['gt'] = [genotype.split(':')[0].count('1') for genotype in concated_df[0]]\n",
+ "print(spearmanr(concated_df[genepair], concated_df['gt']))\n",
+ "sns.regplot(x='gt', y=genepair, data=concated_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Begin calculating networks for 72 individuals and;\n",
+ "RPS26;RUNX3\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 72/72 [00:00<00:00, 214.76it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "SpearmanrResult(correlation=0.2201691525430256, pvalue=0.06311568667519006)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEGCAYAAABsLkJ6AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAqEklEQVR4nO3dfXRcd33n8ff3zoMe/SDFMkljm0QJ1BBKmtRkScl63R4WQtsk7TZtYbu0tGTtbpcFuktOoLRpG7pdcmi7DduHdU7K2WWXQlvTlrQloWRT46YkhTxgiBsBQU6QA0Z+kG1JI2ke7nf/uDPSjCxnpNG9Gs3M53WOzozuPP08vvp97/3e7+/3M3dHREQ6T9DsBoiISHMoAIiIdCgFABGRDqUAICLSoRQAREQ6VLrZDViJLVu2+GWXXdbsZoiItJQnnnjipLsPLd7eUgHgsssu4/HHH292M0REWoqZPb/UdqWAREQ6lAKAiEiHUgAQEelQCgAiIh1KAUBEpEO1VBWQiEinOTgyzv5Do4xN5Ng+0Mu+3cPs2bk1lvfWGYCIyDp1cGScO+8/wvjkLJt7MoxPznLn/Uc4ODIey/srAIiIrFP7D42SSRm92TRm0W0mZew/NBrL+ysAiIisU2MTOXoyqZptPZkUxyZysby/AoCIyDq1faCXmUKpZttMocS2gd5Y3l8BQERkndq3e5hCycnli7hHt4WSs2/3cCzvrwAgIrJO7dm5lbtuvoqtG7o5O1Ng64Zu7rr5qtiqgFQGKiKyju3ZuTW2Dn8xnQGIiHQoBQARkQ6lACAi0qEUAEREOpQuAousQpLztIgkTWcAIg1Kep4WkaQpAIg0KOl5WkSSpgAg0qCk52kRSZoCgEiDkp6nRSRpCgAiDUp6nhaRpCkAiDQo6XlaRJKmMlCRVUhynhaRpOkMQESkQykAiIh0KAUAEZEOpQAgItKhFABERDqUAoCISIdqWgAws+1m9vdm9oyZHTGzdzWrLSIinaiZ4wCKwH9x9yfNbAPwhJl91t3/uYltEhHpGE07A3D3b7v7k+X7k8AzwKXNao+ISKdZF9cAzOwy4Brgn5rcFBGRjtH0AGBm/cAngXe7+7klHt9rZo+b2eMnTpxY+waKiLSppgYAM8sQdf4fc/e/WOo57n6vu+9y911DQ0Nr20ARkTbWzCogA/4YeMbdf7dZ7RAR6VTNrAJ6HfBW4Ctm9qXytl929083r0kiIuvLwZFx9h8aZWwix/aBXvbtHo5tBtqmBQB3fwSwZn2+iMh6d3BknDvvP0ImZWzuyTA+Ocud9x/hLoglCDT9IrCIiCxt/6FRMimjN5vGLLrNpIz9h0ZjeX8tCCMisk6NTeRIGYyemCJfCsmmArb0Zzk2kYvl/XUGICKyTm3oSvPCmVmKoZMKjGLovHBmlv6ueI7d2/4MIMkLKCIiSXL38h1qbue3r1JbnwFULqCMT87WXEA5ODLe7KaJiNQ1lS9x6eZu0imj5E46ZVy6uZvpfCmW92/rAJD0BRQRkSRtH+glnQoYHupn58UbGR7qJ50K2DbQG8v7t3UAGJvI0ZNJ1WzryaRiu4AiIpKkfbuHKZScXL6Ie3RbKDn7dg/H8v5tHQC2D/QyU6g9VZoplGKLniIiSdqzcyt33XwVWzd0c3amwNYN3dx181WtPxBsLezbPcyd9x8hly/Sk0kxUyjFGj1FRJK2Z+fWxApX2voMIOnoKSLSytr6DACSjZ4iIq2src8ARETkwtr+DEAkSR9+6Gvc98hRpvMl+rIpbrvhct75+pc3u1kiy6IAINKgDz/0Ne55+FkCg3QQVZjd8/CzAAoC0hKUAhJp0H2PHC13/gGBBeXbaLtIK1AAEGnQdL5EsGhFi8CIbZi+SNIUAEQa1JdNES6akyv0aLtIK1AAEGnQbTdcTuhQDENCD8u30XaRVqCLwCINqlzoVRWQtCqLa17ptbBr1y5//PHHm90MEZGWYmZPuPuu
xduVAhIR6VAKACIiHUoBQESkQykAiIh0KFUBiazCwZFx9h8aZWwix/aBXvbtHtbssxKrJPcxnQGINOjgyDh33n+E8clZNvdkGJ+c5c77j3BwZLzZTZM2kfQ+pgAg0qD9h0bJpIzebBqz6DaTMvYfGm1206RNJL2PKQCINGhsIkdPpnbah55MimMTuSa1SNpN0vuYAoBIg7YP9DJTqJ34baZQYttAb5NaJO0m6X1MAUCkQft2D1MoObl8EffotlBy9u0ebnbTpE0kvY8pAIg0aM/Ordx67aWcmJzjmeOTnJic49ZrL1UVkMRmz86t3HXzVWzd0M3ZmQJbN3Rz181XxbaPqQxUpEEHR8Y58OQLDG3oYkcmxUyhxIEnX+DV2zYrCEhs9uzcmtj+pDMAkQapCkhanQKASINUBSStrqkBwMw+YmbjZvZ0M9sh0ghVAUmrqxsAzCwws6B8P2tm15rZYEyf/7+AG2N6L5E1pSogaXUvGgDM7EeBbwMvmNktwD8Avw182cxuWu2Hu/sh4PRq30ekGZKu0BBJWr0qoF8DrgZ6gMPAa9z9q2b2UuCTwF8n3D7MbC+wF2DHjh1Jf5xIQ1pnXT2RBXVTQO5+3N2PAt9096+Wtz2/nNfGwd3vdfdd7r5raGhoLT5SZFk0GZy0umVdAyjf/fmqbSkgm1SjRFqBykCl1dULAHspd/Tu/oWq7duBDybVKJFWoDJQaXX1rgE84+6zize6+3Pls4BVMbOPA3uALWZ2DPg1d//j1b5vNS3YIUnZPtDL+OQsvdmFPyOVgUorqXcGcNjMfrJ6g5l1m9lvAg+u9sPd/S3ufom7Z9x9WxKdv3K0khSVgUqrqxcA3gD8nJl91syuLJeCfgXoAq5JvHWrpBytJElloNLqXjQF5O7fAN5kZrcDI8Bx4I3ufmQtGrdaYxM5NvdkarYpRytxSnKiLhFo4prAZpY2s/cB+4BfBB4HPmxm3x3Lpyds+0Avp6bnGD0xxcjxc4yemOLU9JxytCLSEg6OjPOeA4d5amyC75yb5amxCd5z4PCarQn8FHAp8H3levwfBf478Ckz+61YWpCg64cHGZ/Mky+FBAb5Usj4ZJ7rh+OayUJEJDkffOAZzuQKeAgpMzyEM7kCH3zgmVjev14AeJu7v8Pdz1Y2uPvfEOX/1/3gx0dHTzPUnyWbCggdsqmAof4sj45q9gkRWf+OnsoBTiEMmSuGFMIQ8PL21at3DeCJC2yfAd4fSwsSNDaRY0t/F0Mbuue3ubuuAUhsPvzQ17jvkaNM50v0ZVPcdsPlvPP1L292s6RNhKFTDKP7ZuAORQeL6fj7RQOAmR2l9kjfqn53d78illYkRHXakqQPP/Q17nn4WQKDdBDtW/c8/CyAgoCsmLtTKDmFUlj+cVKBUQjLXW5VT5xJxTMTT7132QW8purnOuB3iALBl2JpQYJUpy1Juu+Ro+XOPyCwoHwbbRd5MWHozBZKnJ0pcGJyjmMTOZ47lePYRI7vnJvl9HSeydkC3Zlon4KF/j8VQG82ngBQLwV0CubnA3orcDtRx//D7v7PsbQgQXt2buUuovEAxyZybNNIYInRdL5EetHfYWDRdpFiKaQYOvlSSKEY3S+GTrEUUgqXl8K57KJ+Rk9OMjlbwonSQBu707zsJRtjaWO9FFCGaBK4XwIeAW4pjw1oGarTlqT0ZaOF4CtHaAChR9ulMxTLqZp8OW1TLKdwiqHjvvo8/TXbN3H42Jn5o//Q4exMMbZKxnpzAR0FisDvAd8ErjazqysPuvtfxNIKkRZ02w2Xc8/Dz1IMozLj0KOf2264vNlNkxi5Rx18pXPPlzv9QjEkjKGTfzGf+/rJmt+tfBX2gaePx3KdqV4AeCj6OK4u/1RzQAFAOlblD1BVQO0hLKdrKimb6guyzTI2kSMdQCoIMItONUthyOjJ6Vjev941gLfF8ikibeqdr3+5OvwW5O7MFUPmCiFzxRKzhZBi2LyOvlnqXQP4z4s2
OXASeKS8SphIR9N0460hDJ3ZYom5QshsucOPI0eftO2be3j+dA4LnSCIxgGEDlduiaeUvV4t0YZFPxuJSkMfMLM3x9ICkRal6cbXr2IpZGquyMmpSonlNMfPzjKRyzOTL7VE5w+wd/cVbOzJYAGU3LEANvdmeO+bXhHL+9dLAf3GUtvNbJDo+sAnYmmFSAuqnm4coDebJpcvsv/QqM4C1kDl4my+GP0UYq7AWQ+uGx7kjjfu5MCTx/jOudnYS9nrXQRekruftsoVCZEONTaRI2UwemKKfCkkmwrY0p/VVCMJKIULFTj5YjQvTr7YGmmc1bpueJA3vOpiBvviX4a9oQBgZj8ITMTcFpGW0p9N8eyJaVJmpMwolpwXzsxy5VBfs5vWcioVOMXQKZWiyc8qnX4p9GUPnJKVqXcR+CucP+vnIPAt4GeTapRIKzCzKA3hHv2VWHRRTSfHS6t08nPFMBoN61HHXih6R1bgLNcXRk9z4MljHD83G3uhQb0zgB9Z9LsDp9w9niJUkRZ2YmouulPp/8tBYH57B6pMaFYKo6P49VJP36q+MHqauz8zQi5fJHQ4OTXHew4c5rdvvTqWIFDvIvDzS203s83Af3T3/7rqFoi0qHwxJAiMbLBQTFcMo9x0JyhW5eQreXl18vG699A3ODdTiNKMQe2CMIkHADPbDvwq8F3AXwF/AnwA+Jny/XVPddqSlEzKmClEqY3KXO0A2VR7pYCqpyfOF8P535WXT97YmRkCgyAwzCzaz8I1WhAG+CjwOeCTwI3AY8AR4Hvc/XgsLUhQpU47k7KaOu27QEFAVu3lL9nI0ZNTTM4W56uANnRnuHxLf7ObtmLVVTbzc94U26ukUs5XLwAMuvuvl+9/xsy+A7zG3Vsiybn/0CiFUolTUwt/oBt70qrTlljs2z3Mnfcf4eJNaXoy0cyg63m9iepKm0LlSD5cm0nNZHkKpZCJ6TynpvOcns6zqTvDqek8JY/SjSkzQoeXxVRpVrcM1MwGiBaAATgO9JpZH0TjAWJpRUK+Pj7J2Vwh+uICoxg6JyfzFEqTzW6atIH1uN7E4lWlqo/olbJpnpl8iVPTc1HHPrXQwVduT0/nOTU1x7nZ4pKvdwcvOem0MdCb4Y4bd8bSrnoBYBPwBAsBAODJSpuA9XmoU5YvhoREdcXu0VSqZnTMRTpJXrPWm8gXq2atDKP8/EoWGpHVC905N1NY1IlX3Z+/nWO2sPI+pzsdMNifJZsKODtTwB1eccnGtSsDdffLYvmUJnF3qosS3IlK9nS6KzFJqsggLK8eVSmnLJY7+ELoHTMCtlkWp2EqnXllW+UofiKXp9hAwN3YnWawL8tFfVkG+7ui28rvlfv92Zq1zDf3ZtfPSOBWYWYL62mWzwAq20VW6+DIOLcfOMzkbJFiGHJyco7bDxzmQ8us0a5eMrC6g1dOPhmVNMziI/TqI/dT03nOzhRW/N6BwUBVJ35RX9SxV7Zd1B9tH+jNkl28jmgTLSsAmNmT7n5t1e/PlO/+gbv/fiIti0EmFeX+A2y+TC/E265MT5rj7gdHmMgVSAVGOhXgDhPlGu3rr7xofgqD+R93wjAaK1AoqbomDu7OuZkip3NRDr0m9VLOtU/kovszhZWv1VxJwyx05os69r4sg/1ZNnZnSAWt168sKwBUd/7l319hZhcBr02kVTF5+Us2MnL8LGdnolF0gcGmnvgWVJbOVSiFfOPEFIZj2PyEKYbzjZPTvDAx09wGtrhiKWQiV5jPoS8+Sj9d9dNoGqb2iD1Kxwz2LhytX9SXpTebauuMwYpSQGa2EXgZMOrup4C/TaRVMbl+eJAvPHeaVGBkymu2npstxbagsrQPdyf0qB4+9Ooj9oX7pdCjVE25Nr7S7eg4fvlmCqWo456q6tir0zC56LGzM4UVf6+L0zC1efWumu3rKQ3TTPVGAv9f4N3uftLM3gjcB3wVeJmZvcfd/3wtGtmoR0dPM9SfXTRQJ82jo6d5
Z7MbJ4mr5NhDjzrtyoXV+VuPqsMqnf5KVVZromokcOjw0oGeBP4165e7c262OF/KuGSOvXyby688DdOVDmo69MXpl4v6uhjsy7KppzXTMM1U7wzganevLEv/a8C/dPfnzGwL8P+AdR0AxiZybOnvYmhD9/w2d9d87S2mcnTuXptHD8OoFC8sPz5/5B76moxg3bv7Cu7+zAjT+SJh6ASBsTGbYe/uKxL93LVSCr2qA5/j9HSB05Va9qqj9olcnkJp5d/1hu40g73ZhRx7Of1SfSF1sD9LX5unYZqpXgAIzGyju58DQuCbAOUzglVXEJnZjcA9QAq4z90/uNr3rLZ9oJfxydmacqqZQoltA/GspykrE4ZR2qQ6zRKWUyle7tQrnXflsXAdl+1WVmv6xBfHOH5uhos39vDm12znunWeYpwtlOZLGU/nKrn1uZpSx9PTec7kGkvDVEoWa4/SozTMYF9m/ohdaZjmq9eJ/wbw92b2B8A/An9uZp8CfhB4cDUfbGYp4A+Afw0cA75oZve7+z+v5n2rVYbq5/LFlhiqv96EYe3R9UKnHT1W8oXUiWF4eV7k6qNx92gt0/Xaia/WdcOD66LDd3cmZ4vn16tPz5138bSRNEwmZfMdd/VF0tpSxy6lYVpMvYFgf2ZmTwL/Hnh5+fnXAx9398+s8rOvA55191EAM/sEcAsQWwBYj0P1k+DlXDZEFyQrna1T7rgXpUp8Uade83jVfWm+UuhM5F5spOlCSqaRNEx/V7qmE6908LW59i76upSGaUd10zju/ixwRwKffSkwVvX7MeBfLH6Sme0F9gLs2LGj4Q+LozurdJyV/HPlCLgywjj6nHLnW9UhV17rVdsX2rXoiUbN75XXOF6+yBjdzt9n/aZI5MLmKmmYxdMGTNV26qtNw5w3wrSqkx/szdKVSSXy75PWsJzJ4N4IbAMeql4gxsx+3t0/sorPXupw4rx93d3vBe4F2LVr14r+Fqqng97QlebbZ2d4/189zR03fjevu3LL/AdWOtjq9EV1R+us71y0rA/uztRc8YIjTOfTMbk803OrS8MMnpdfX9i2uTerNIwsS70y0N8CbiCaAO6Xzez33P1/lB9+B7CaAHAM2F71+zaitYZjs//QKJmU0ZtNky9GZaClsMRHHnmOV126Kc6PkjZWCp0zuSXKGqcWOvbT0wVO5/INTTTY15Uqd+yZ+Xr1gb4sW/prO/b+rrTSMB2omWsC3wRc4+5FM/t14E/MbNjdf4mlj+BX4otE4wkuB14A3gz821W+Z42xiRybezI127ozAcfPaZSmRDNqnqq6SFo7g+PCtjO5PCsdbGrA5t7MfCnj0oOTop9upWHkAr4wepp7Hv46XekgkUWt6gWAtLsXAdz9jJndBNxrZn8OrGpqunJQeQfwGaIy0I+4+5HVvOdiS5WBzhZCLt7YWQN1Oom7Mz23MPf6fEXMEtMITM0tPff6i8mkbL5efXq2yDeXmPLhp77vUm7bfYXSMLJqn/jiGOnA6MlEF+F7s2ly+WJsi1rVCwDfMLN/5e6fA3D3EvB2M/tN4MdX++Hu/mng06t9nwupLgNNmTFTKFEMnTe/Znv9F8u6UgqdszMFTk3N1R6lV6YPqJpKoKE0TDY1f3F0YH4+mK6aUsct/bVpmDf87ueWfK9PPvkC+/Zcuap/rwjAt8/NsLG7tpvuyaRiG8xaLwD8xFIb3f1XzOyPYmlBgqrLQJ87OcVLWmSgTifJF8OlR5pOVdeyN56G2dSTqUnBLAxIWriIOtiXpaeBNEzxAu250HYRiKajNyAoL/JeWeoxCCBlRjoICILo8ZcO9nFiapZseuFsMs7BrPUCwFbgDDBTbvgPAD8KPA+s22mgl6K/ybXj7kznS0uONF185D55gSXwXkx1GmZwUcdenV8f6M2QTmm0qcTHLOqsazru8v1g0f3K81JVj630Iv4v7rmCO+8/wmyhlMhg1noB4M+AHwPOmtn3Es3989+Aq4E/
BG6LpRUJqS4D3did4dT0HPc8/HXexct0FtCA0J0z1VP0Ti094dfp6TxzDaRhestpmJp69UWrJg32ZdnYvT6qYdK29NF+uvlNa1uVo2ezaPT5wiJPCwtAGQudbWCUO1+bP6qeP/Iuv3j+PZb4rMp2a6DzjkPSg1nrBYAed6+UZv47ogu1v2NmAfClWFqQoMVloJUI+okvjikAVMkXw/lpeBfKGs+fpndieuVpGIDNS6Zhqo7YV5GGaaYN3WkmZs4/g9nQveppshJT3YEGFi2YlAqspkOdP3otd5jV3V5ldHll8GNlxHiw6PmLO+j515fH1VTetboDh4XXGOujA14Pklx3ut6eWv2N/yDwPgB3D1vhP6OTy0DdnVy+dN4I06UW1DjXQBomHdgFBiTVTvjVzmmYqQvMqXOh7ctRnWKodMbVt9VHt5U8chAspBmWek4nd57y4uoFgIfN7M+AbwMDwMMAZnYJkE+4bau2faCXoyenmJwtMlcskUkF9GVTbBvoa3bTGhZ6VA1TfZF0YfKvuYWqmKk8s3GmYeanEIhWTdrYsz7SMM2WCSAIFgJcGC5852ZGOlg4yq7kgVOLtq0mRyyyGvUCwLuBnwIuAW5w98pqyRcD70+wXbGorAgW5QGjZfxO50JuevX6GwWcL4bzk35Vr4w037GXj9YncgVKDeRhNvVkaueBOa+Tj47Ye7KtlYZZa5UOO50yXjrQw9FTOWzRgjBXbunlsov6CDQOQNa5erOBupnNEg3UehXRiF3c/ak1aNuqPTp6mk3dac7MFAg9ymdt7E7z1NhZ3roGn+/uzBRKSw5CiiphFipjVpOGGejLnreW6eIVlDJtmoaJSyX1EgSQDoKokw+MVCq6TQcB6cBqOvVf+ZGreM+Bw0zNFSmFTiplbOzK8L4feqU6f2kJ9eYC+iPglcDngQ+Y2XXu/oE1aVkMvj4+yeRskUwqmJ9Nc3quyPOnp1f1vqE752YKi/Lr1R38Qi17I2mY7kxw/qRf583D3sWGnvR8JYPUMjOy6YBMyhYqP6i9WFlJz6QDayj1smfnVn7mtS/lvkeOMl0q0ZMK+JnXvrTtphuX5jo4Ms7+Q6OMTeTWfC6gf0m0LGTJzHqBfwBaJgDkiyEhTqnk82cAZlxwpGihFNZcHK2d8GthabxG0zAbu9NRDn2pKXr7Fzp2pWGWLx0EpFNRSiZTvp9NB2RTQeL59IMj4xx48gWGNnSxo1xhduDJF3j1ts0KAhKLgyPj3H7gMJOzRYphyMnJOW4/cJgP3Xr1mkwFkS9P/4C756zFrlC5O6Wqvr4y9XOhGHLvodFF0/bONZSGSQXGQO9C1ctSaZjKj9Iwy7c4JVPp4CspmdUcucdl/6FRJmfznJ0pEnp0nWlTTzq2eVpE7n5whIlcoXzdKcAdJnIF7n5wZE0CwE4z+3L5vgFXlH83oksEr151CxJUKYNbfKyeD51PfHFsqZfM604Hi0obl07FbOzJKA3ToEwqmL+gWjl6z6SC+e3r3dMvnGGyal7/0GEiV+TpF840r1HSVkZPTs8PZoMog+HmjJ5cXRq7ol4AeEUsn9IkmXJqYPFSeYHBNTsGXnS1pMrse9K4ylFLpnLEnopy8ukgum3173f6AvX+F9oust7UqwJ6fqnt5QXd30w0J9C69fKXbOToySnO5grkSyGZlNHflWbbQB8funVdn7y0hEqdeyYV1BzFV+63eyXMhS4DNTJaWmQpl1/Uy7MnppcsNY7DiyalzWyjmb3PzH7fzN5gkf8EjAI/GUsLErRv9zDZdIrvGujhiqF+hjZ0k0mnNB30CmVSAX1daTb1ZNiyoYtLNvWwY7CXy7f0sX2wl4s3dbOlv4tNvRn6utJ0pVNt3/mLrIX3vukVbO7NYEG0BrkF0UJD731TPMmZeimg/wNMAI8STfx2O9FCMLe4+5diaUGCNB308lWO5isVNJlyCeVaVNOIyNL27NzKb996ddMmgxt29+8BMLP7gJPADnef
jOXT10BlIqWx0zkKpZXX5LebVLBwoXW+s09Z287XI9LqmjkZXGXqB8pjAY62Uuff6TKpgK501NF3pVNk061RXSMia6NeALjazM6V7xvQU/69Uga6MdHWybJUjuqjka/lTj/V/hdhm22pEuPKdpFWUK8KSENS15HK9AbZVOWovnVq5tvRtoEexiZmzpsvf9tAT7OaJLIi63flig6XDso5+nRQ0+nL+vGBW17Fu/70Kc7NFHEqaxCn+cAtr2p200SWRQGgycwsqrZJB3SlUvMdvo7q1789O7fy9tddHk0Gly/Rl03x9tddrmkgpGUoAKyRdBDQlVlI2aTLUyCozLJ1aTI4aXUKAAmopG+60lGnn00FKrNsQ9VrTgP0ZtPk8kVNBictQwFgFdJBQCZtNTX1qr7pHEutOd2TSXFsItekFomsjAJAHemaWSpr571RR9/Ztg/0Mj45O38GADBTKLFtIJ55WkSSpgAANTNVVtI1GXXyUse+3cPcef8RcvkiPeVrAIWSs2/3cLObJrIsHRMAgsDIWjA/DULlAqzq6KVR1XNNJTFPiwg0d0nItnHpZg3OkeRoBmhJwsGRce68/wiZlLG5J8P45Cx33n+EuyCWIKDSFJEGVf44xydna/44D46MN7tp0iaqK83MottMyth/aDSW91cAEGlQ0n+cImMTOXoytTPyxFlppgAg0qCk/zhFtg/0MlOoXWI0zkozBQCRBiX9xymyb/cwhZKTyxdxj27jrDRrSgAws58wsyNmFprZrma0QWS1kv7jFNmzcyt33XwVWzd0c3amwNYN3dx181UtXwX0NPBvgP1N+nyRVduzcyu3HjtTMxncbTdoMjiJVzNXBEuEuz8DaBI0aWmaDE5a3bofB2Bme4G9ADt27Fjx65McRCGdTZPBSatL7BqAmT1kZk8v8XPLSt7H3e91913uvmtoaGhFbVCdtiRJVUDS6hI7A3D31yf13sulIzRJkiaDk1bX1mWgYxM5iqWQ0RNTjBw/x+iJKYqlUEdoEgtVAUmra1YZ6I+Z2THgeuBvzewzSXxOfzbFC2dmKZaclBnFkvPCmVn6slrrXlZvz86t3HrtpZyYnOOZ45OcmJzj1msv1dmltIymBAB3/0t33+buXe7+End/YxKfM19lZFU/qPpI4nFwZJyPPvY8+VJIYJAvhXz0sed1jUlaRlungCbnigz0pimUQmYLIYVSyEBvmqm5YrObJm3ggw88w5lcAQ8hZYaHcCZX4IMPPNPspoksS1sHgP5siolckUwQ0J0OyAQBE7miUkASi6OncgQWrTVhZgSBEVi0XaQVtHUAUApIROTC1v1AsNWYnCty6eZuTk7lyZdCsqmAizd2KQUksRje0sfXx6cwd8zAHUKHlw31Nbtp0kaSHMza1mcA2wd6SacChof62XnxRoaH+kmnAtVpSyzuuHEnA70ZDCiWQgwY6M1wx407m900aRNJD2Zt6wCgOm1J0p6dW/nQrVdzzY4BLtnUwzU7BvjQrVerDFRik/SiQ22dAtKi3ZK0JGdqFBmbyLG5J1OzLc7pRto6AID+QCVZmmxQkpT0dCNtnQISSdLBkXFuP3CYp745wfGzMzz1zQluP3BYA8EkNm25IphIO7j7wREmcgUcSKcCHJjIFbj7wZFmN03aRLuuCCbS8kZPTkcDwcrjSszAzRk9Od3klkk7absVwUTaRehOsVjCPQoAgUXTQoi0AqWARBo01JehGEaDv5zothhG20VagQKASIM29GQJiI78Kd8G5e0irUApIJEGTc4V2T7YUzPVyJb+rKYakZahACDSoEqN9vBQ//y2XL7I1g3dTWyVyPK1fQro4Mg4b7n3MW64+2Hecu9jqtGW2GiqEWl1bR0Akp5ISTpb0jXaIklr6xRQ9URKAL3ZNLl8kf2HRvVHKrHQVCPSytr6DGBsIkdPpnb1rzgnUhIRaWVtfQawfaCX505NcW6mOF+lsbEnzWUX9dd/scgyaDI4aWVtfQZw
/fAg45NRiV5gkC+FjE/muX54sNlNkzaga0zS6to6ADw6epqh/izZVEDokE0FDPVneXT0dLObJm0g6cU6RJLW1imgsYkcW/q7GKqqy3Z3XQOQWCS9WIdI0tr6DGD7QC8zhVLNtjgXU5DOpv1LWl1bBwAN1JEkaf+SVtfWAUADdSRJ2r+k1bX1NQDQQB1ZG97sBog0oK3PAESSpDJQaXUKACINUhmotDoFAJEGaaoRaXUKACINUhmotDoFAJEGqQxUWl1TqoDM7EPATUAe+Abwc+5+JonP0mRdkpQ9O7dyF9G1gGMTObZp/5IWY+5rX8BmZm8AHnb3opndDeDud9R73a5du/zxxx9f9udUqjQyKaMnk2KmUKJQctVqi0hHMbMn3H3X4u1NSQG5+9+5e2Xl7MeAbUl8jqo0REQubD0MBPt54E8v9KCZ7QX2AuzYsWNFb6zJukSk1SWZxk7sDMDMHjKzp5f4uaXqOe8HisDHLvQ+7n6vu+9y911DQ0MraoOqNESklSU92DCxAODur3f3Vy3x8ykAM/tZ4EeAn/aELkSoSkNEWlnSaeymXAMwsxuBO4Cb3T2xfIwm6xKRVpb0YMNmXQP4faAL+KyZATzm7r+QxAdpMjgRaVXbB3oZn5ylN7vQVceZxm5WFdCV7r7d3b+3/JNI5y8i0sqSTmNrJLCIyDqVdBp7PZSBiojIBSSZxtYZgIhIh1IAEBHpUAoAIiIdSgFARKRDKQCIiHSopkwH3SgzOwE83+DLtwAnY2xOXNSulVG7VkbtWpn12i5YXdte6u7nTabWUgFgNczs8aXmw242tWtl1K6VUbtWZr22C5Jpm1JAIiIdSgFARKRDdVIAuLfZDbgAtWtl1K6VUbtWZr22CxJoW8dcAxARkVqddAYgIiJVFABERDpUWwQAM7vRzL5qZs+a2XuXeNzM7MPlx79sZtcu97UJt+uny+35spl93syurnrsOTP7ipl9ycweX+N27TGzs+XP/pKZ3bnc1ybcrtur2vS0mZXMbLD8WCLfl5l9xMzGzezpCzzerH2rXruatW/Va1ez9q167Vrzfav83tvN7O/N7BkzO2Jm71riOcntY+7e0j9ACvgGMAxkgcPAKxc954eABwADXgv803Jfm3C7vh8YKN9/U6Vd5d+fA7Y06fvaA/xNI69Nsl2Lnn8T8PAafF+7gWuBpy/w+JrvW8ts15rvW8ts15rvW8tpVzP2rfJ7XwJcW76/AfjaWvZf7XAGcB3wrLuPunse+ARwy6Ln3AJ81COPAZvN7JJlvjaxdrn75919ovzrY8C2mD57Ve1K6LVxv/dbgI/H9NkX5O6HgNMv8pRm7Ft129WkfWs539eFNPX7WmRN9i0Ad/+2uz9Zvj8JPANcuuhpie1j7RAALgXGqn4/xvlf4IWes5zXJtmuam8nivIVDvydmT1hZntjatNK2nW9mR02swfM7KoVvjbJdmFmvcCNwCerNif1fdXTjH1rpdZq31qutd63lq2Z+5aZXQZcA/zToocS28faYUUwW2Lb4trWCz1nOa9t1LLf28x+gOiP9Iaqza9z92+Z2Vbgs2Y2Uj6KWYt2PUk0d8iUmf0Q8FfAy5b52iTbVXET8I/uXn1El9T3VU8z9q1lW+N9azmasW+tRFP2LTPrJwo673b3c4sfXuIlsexj7XAGcAzYXvX7NuBby3zOcl6bZLsws1cD9wG3uPupynZ3/1b5dhz4S6LTvTVpl7ufc/ep8v1PAxkz27Kc1ybZripvZtEpeoLfVz3N2LeWpQn7Vl1N2rdWYs33LTPLEHX+H3P3v1jiKcntY0lc2FjLH6KzmFHgchYuhFy16Dk/TO1FlC8s97UJt2sH8Czw/Yu29wEbqu5/HrhxDdt1MQuDBK8Dvln+7pr6fZWft4kol9u3Ft9X+T0v48IXNdd831pmu9Z831pmu9Z831pOu5q4bxnwUeD3XuQ5ie1jLZ8Ccveimb0D+AzRVfGPuPsRM/uF8uP/E/g0
0ZX0Z4Ec8HMv9to1bNedwEXAH5oZQNGj2f5eAvxleVsa+BN3f3AN23Ur8B/MrAjMAG/2aI9r9vcF8GPA37n7dNXLE/u+zOzjRJUrW8zsGPBrQKaqTWu+by2zXWu+by2zXWu+by2zXbDG+1bZ64C3Al8xsy+Vt/0yUQBPfB/TVBAiIh2qHa4BiIhIAxQAREQ6lAKAiEiHUgAQEelQCgAiIh1KAUAkRmb2NjP7rma3Q2Q5FABE4vU2QAFAWkLLDwQTaRYz+1Xgp4km5DoJPAHsAj5mZjPA9e4+08QmirwoBQCRBpjZLuDHiWZvTBNNcvYE8DjwHnePdeEQkSQoAIg05gbgU5UjfDP76ya3R2TFdA1ApDFLTcUr0lIUAEQa8whwk5l1l+dy/+Hy9kmipf1E1j1NBifSIDP7daLlA58HTgAHiaYT/i2imS51EVjWNQUAkQaZWb9HK1v1AoeAvV5e31WkFegisEjj7jWzVwLdwP9W5y+tRmcAIiIdSheBRUQ6lAKAiEiHUgAQEelQCgAiIh1KAUBEpEP9f2bPVnu4Rv72AAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "onemillionv2_datasc = dataset.data_sc\n",
+ "onemillionv2_monocytes_ut = onemillionv2_datasc[(onemillionv2_datasc.obs['cell_type_lowerres']=='CD8T') & \n",
+ " (onemillionv2_datasc.obs['time']=='UT')]\n",
+ "onemillionv2_monocytes_ut_df = pd.DataFrame(\n",
+ " data=onemillionv2_monocytes_ut.X.toarray(),\n",
+ " columns=onemillionv2_monocytes_ut.var.index,\n",
+ " index=onemillionv2_monocytes_ut.obs.index\n",
+ ")\n",
+ "data_selected_df, zscore_df, zscore_p_df = \\\n",
+ "get_individual_networks_selected_genepairs(onemillionv2_monocytes_ut_df, \n",
+ " onemillionv2_monocytes_ut, \n",
+ " 'assignment', \n",
+ " ';'.join(['RPS26', 'RUNX3']))\n",
+ "concated_df = pd.concat([zscore_df.T,\n",
+ " gt.T],\n",
+ " axis=1).dropna()\n",
+ "concated_df['gt'] = [item.split(':')[0].count('1') for item in concated_df[0]]\n",
+ "print(spearmanr(concated_df['RPS26;RUNX3'], concated_df['gt']))\n",
+ "sns.regplot(x='gt', y='RPS26;RUNX3', data=concated_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### onemillion v3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# # load onemillion v3 data\n",
+ "# datasetv3 = DATASET('onemillionv3')\n",
+ "# datasetv3.load_dataset()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CD4T TEM | \n",
+ " CD4T Naive | \n",
+ " CD8T TEM | \n",
+ " CD8T Naive | \n",
+ " CD4T | \n",
+ " CD8T | \n",
+ " CD4T TCM | \n",
+ " CD8T TCM | \n",
+ " all_num | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " LLDeep_0117 | \n",
+ " 39 | \n",
+ " 248 | \n",
+ " 70 | \n",
+ " 75 | \n",
+ " 625 | \n",
+ " 170 | \n",
+ " 289 | \n",
+ " 26 | \n",
+ " 1599 | \n",
+ "
\n",
+ " \n",
+ " LLDeep_1300 | \n",
+ " 34 | \n",
+ " 149 | \n",
+ " 73 | \n",
+ " 39 | \n",
+ " 658 | \n",
+ " 119 | \n",
+ " 450 | \n",
+ " 10 | \n",
+ " 1382 | \n",
+ "
\n",
+ " \n",
+ " LLDeep_0615 | \n",
+ " 68 | \n",
+ " 84 | \n",
+ " 175 | \n",
+ " 23 | \n",
+ " 550 | \n",
+ " 212 | \n",
+ " 347 | \n",
+ " 19 | \n",
+ " 1277 | \n",
+ "
\n",
+ " \n",
+ " LLDeep_0923 | \n",
+ " 39 | \n",
+ " 80 | \n",
+ " 205 | \n",
+ " 40 | \n",
+ " 316 | \n",
+ " 249 | \n",
+ " 140 | \n",
+ " 6 | \n",
+ " 907 | \n",
+ "
\n",
+ " \n",
+ " LLDeep_0705 | \n",
+ " 13 | \n",
+ " 114 | \n",
+ " 123 | \n",
+ " 61 | \n",
+ " 322 | \n",
+ " 188 | \n",
+ " 166 | \n",
+ " 3 | \n",
+ " 947 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CD4T TEM CD4T Naive CD8T TEM CD8T Naive CD4T CD8T CD4T TCM \\\n",
+ "LLDeep_0117 39 248 70 75 625 170 289 \n",
+ "LLDeep_1300 34 149 73 39 658 119 450 \n",
+ "LLDeep_0615 68 84 175 23 550 212 347 \n",
+ "LLDeep_0923 39 80 205 40 316 249 140 \n",
+ "LLDeep_0705 13 114 123 61 322 188 166 \n",
+ "\n",
+ " CD8T TCM all_num \n",
+ "LLDeep_0117 26 1599 \n",
+ "LLDeep_1300 10 1382 \n",
+ "LLDeep_0615 19 1277 \n",
+ "LLDeep_0923 6 907 \n",
+ "LLDeep_0705 3 947 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 0, 'rs1131017')"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "onemillionv3 = datasetv3.data_sc.obs.copy()\n",
+ "onemillionv3_celltype_df = pd.read_csv(\n",
+ " './1M_v3_20201106_azimuth.tsv',\n",
+ " sep='\\t', index_col=0\n",
+ ")\n",
+ "onemillionv3 = pd.concat([onemillionv3, onemillionv3_celltype_df], axis=1)\n",
+ "onemillionv3 = onemillionv3[onemillionv3['timepoint']=='UT']\n",
+ "onemillionv3_l1_cellratio_df = onemillionv3.groupby(['assignment', 'predicted.celltype.l1']).size().to_frame()\n",
+ "# display(onemillionv3_l1_cellratio_df.head())\n",
+ "onemillionv3_celltyperatio = onemillionv3.groupby(['assignment', 'predicted.celltype.l2']).size().to_frame()\n",
+ "# display(onemillionv3_celltyperatio.head())\n",
+ "onemillionv3_allcells = onemillionv3['assignment'].value_counts()\n",
+ "\n",
+ "# calculate per-individual cell counts for the CD4T/CD8T TEM, Naive and TCM subsets\n",
+ "individual_ratio = pd.DataFrame()\n",
+ "for individual in onemillionv3['assignment'].unique():\n",
+ " tem_num = onemillionv3_celltyperatio.loc[individual, \"CD4 TEM\"].values[0]\n",
+ " naive_num = onemillionv3_celltyperatio.loc[individual, \"CD4 Naive\"].values[0]\n",
+ " cd8t_tem_num = onemillionv3_celltyperatio.loc[individual, \"CD8 TEM\"].values[0]\n",
+ " tcm_num = onemillionv3_celltyperatio.loc[individual, \"CD4 TCM\"].values[0]\n",
+ " cd8t_tcm_num = onemillionv3_celltyperatio.loc[individual, \"CD8 TCM\"].values[0]\n",
+ " cd8t_naive_num = onemillionv3_celltyperatio.loc[individual, \"CD8 Naive\"].values[0]\n",
+ " cd4t_num = onemillionv3_l1_cellratio_df.loc[individual, 'CD4 T'].values[0]\n",
+ " cd8t_num = onemillionv3_l1_cellratio_df.loc[individual, 'CD8 T'].values[0]\n",
+ " all_num = onemillionv3_allcells.loc[individual]\n",
+ " individual_ratio[individual] = [tem_num, naive_num, \n",
+ " cd8t_tem_num, cd8t_naive_num,\n",
+ " cd4t_num, cd8t_num,\n",
+ " tcm_num, cd8t_tcm_num, all_num]\n",
+ "\n",
+ "individual_ratio_dfv3 = individual_ratio.T\n",
+ "individual_ratio_dfv3 = individual_ratio_dfv3.rename({0: 'CD4T TEM', 1:'CD4T Naive', \n",
+ " 2: 'CD8T TEM', 3: 'CD8T Naive',\n",
+ " 4: 'CD4T', 5: 'CD8T',\n",
+ " 6: 'CD4T TCM', 7: 'CD8T TCM',\n",
+ " 8: 'all_num'}, \n",
+ " axis=1)\n",
+ "display(individual_ratio_dfv3.head())\n",
+ "\n",
+ "\n",
+ "# Keep only individuals that have both v3 cell counts and a genotype column in `gt`.\n",
+ "# sorted() makes the row order reproducible across kernels; .copy() gives an\n",
+ "# independent frame so the column assignments below do not act on a slice.\n",
+ "common_individuals = sorted(set(individual_ratio_dfv3.index) & set(gt.columns))\n",
+ "common_individuals_individual_ratio_dfv3 = individual_ratio_dfv3.loc[common_individuals].copy()\n",
+ "# The second ':'-separated field of the first genotype row is parsed as a float.\n",
+ "common_individuals_individual_ratio_dfv3['gt'] = [float(gt[col].values[0].split(':')[1]) for col in \n",
+ "                                                   common_individuals_individual_ratio_dfv3.index]\n",
+ "# These rows come from the onemillion v3 dataset, so label them 'v3' (was 'v2').\n",
+ "common_individuals_individual_ratio_dfv3['chemistry'] = 'v3'\n",
+ "\n",
+ "fig, axes = plt.subplots(1, 3, figsize=(15, 5))\n",
+ "ax1, ax2, ax3 = axes\n",
+ "cd4ydata = (common_individuals_individual_ratio_dfv3['CD4T TEM']) / common_individuals_individual_ratio_dfv3['CD4T']\n",
+ "sns.regplot(x=common_individuals_individual_ratio_dfv3['gt'],\n",
+ " y=cd4ydata, \n",
+ " ax=ax1)\n",
+ "r, p = stats.pearsonr(common_individuals_individual_ratio_dfv3['gt'],\n",
+ " cd4ydata)\n",
+ "ax1.set_title('Oelen v3 r={:.2f}, p={:.2g}'.format(r, p))\n",
+ "ax1.set_ylabel('CD4 TEM / CD4T')\n",
+ "ax1.set_xlabel(\"rs1131017\")\n",
+ "\n",
+ "cd8tydata = (common_individuals_individual_ratio_dfv3['CD4T Naive']) / common_individuals_individual_ratio_dfv3['CD4T']\n",
+ "sns.regplot(x=common_individuals_individual_ratio_dfv3['gt'],\n",
+ " y= cd8tydata, \n",
+ " ax=ax2)\n",
+ "r, p = stats.pearsonr(common_individuals_individual_ratio_dfv3['gt'],\n",
+ " cd8tydata)\n",
+ "ax2.set_title('Oelen v3 r={:.2f}, p={:.2g}'.format(r, p))\n",
+ "ax2.set_ylabel('CD4 Naive / CD4T')\n",
+ "ax2.set_xlabel(\"rs1131017\")\n",
+ "\n",
+ "cd8tydata = (common_individuals_individual_ratio_dfv3['CD4T Naive']) / common_individuals_individual_ratio_dfv3['CD4T TEM']\n",
+ "sns.regplot(x=common_individuals_individual_ratio_dfv3['gt'],\n",
+ " y= cd8tydata, \n",
+ " ax=ax3)\n",
+ "r, p = stats.pearsonr(common_individuals_individual_ratio_dfv3['gt'],\n",
+ " cd8tydata)\n",
+ "ax3.set_title('Oelen v3 r={:.2f}, p={:.2g}'.format(r, p))\n",
+ "ax3.set_ylabel('CD4 Naive / CD4T TEM')\n",
+ "ax3.set_xlabel(\"rs1131017\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 0, 'rs1131017')"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "fig, axes = plt.subplots(1, 3, figsize=(15, 5))\n",
+ "ax1, ax2, ax3 = axes\n",
+ "# Panel 1: CD8T TEM count divided by CD8T count per individual, against genotype.\n",
+ "# (The variable name `cd4ydata` is kept as-is; it holds CD8 data in this cell.)\n",
+ "cd4ydata = (common_individuals_individual_ratio_dfv3['CD8T TEM']) / common_individuals_individual_ratio_dfv3['CD8T']\n",
+ "sns.regplot(x=common_individuals_individual_ratio_dfv3['gt'],\n",
+ "            y=cd4ydata, \n",
+ "            ax=ax1)\n",
+ "r, p = stats.spearmanr(common_individuals_individual_ratio_dfv3['gt'],\n",
+ "                       cd4ydata)\n",
+ "ax1.set_title('Oelen v3 r={:.2f}, p={:.2g}'.format(r, p))\n",
+ "# The denominator is the CD8T count, so the label must say CD8T (it said CD4T).\n",
+ "ax1.set_ylabel('CD8 TEM / CD8T')\n",
+ "ax1.set_xlabel(\"rs1131017\")\n",
+ "\n",
+ "cd8tydata = (common_individuals_individual_ratio_dfv3['CD8T Naive']) / common_individuals_individual_ratio_dfv3['CD8T']\n",
+ "sns.regplot(x=common_individuals_individual_ratio_dfv3['gt'],\n",
+ " y= cd8tydata, \n",
+ " ax=ax2)\n",
+ "r, p = stats.spearmanr(common_individuals_individual_ratio_dfv3['gt'],\n",
+ " cd8tydata)\n",
+ "ax2.set_title('Oelen v3 r={:.2f}, p={:.2g}'.format(r, p))\n",
+ "ax2.set_ylabel('CD8 Naive / CD8T')\n",
+ "ax2.set_xlabel(\"rs1131017\")\n",
+ "\n",
+ "cd8tydata = (common_individuals_individual_ratio_dfv3['CD8T Naive']) / common_individuals_individual_ratio_dfv3['CD8T TEM']\n",
+ "sns.regplot(x=common_individuals_individual_ratio_dfv3['gt'],\n",
+ " y= cd8tydata, \n",
+ " ax=ax3)\n",
+ "r, p = stats.spearmanr(common_individuals_individual_ratio_dfv3['gt'],\n",
+ " cd8tydata)\n",
+ "ax3.set_title('Oelen v3 r={:.2f}, p={:.2g}'.format(r, p))\n",
+ "ax3.set_ylabel('CD8 Naive / CD8T TEM')\n",
+ "ax3.set_xlabel(\"rs1131017\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(104, 11) (72, 11) (32, 11)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CD4T TEM | \n",
+ " CD4T Naive | \n",
+ " CD8T TEM | \n",
+ " CD8T Naive | \n",
+ " CD4T | \n",
+ " CD8T | \n",
+ " CD4T TCM | \n",
+ " CD8T TCM | \n",
+ " all_num | \n",
+ " gt | \n",
+ " chemistry | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " LLDeep_1035 | \n",
+ " 17 | \n",
+ " 152 | \n",
+ " 36 | \n",
+ " 54 | \n",
+ " 625 | \n",
+ " 108 | \n",
+ " 424 | \n",
+ " 28 | \n",
+ " 1006 | \n",
+ " 1.0 | \n",
+ " v2 | \n",
+ "
\n",
+ " \n",
+ " LLDeep_0960 | \n",
+ " 36 | \n",
+ " 89 | \n",
+ " 113 | \n",
+ " 13 | \n",
+ " 586 | \n",
+ " 143 | \n",
+ " 374 | \n",
+ " 26 | \n",
+ " 1546 | \n",
+ " 0.0 | \n",
+ " v2 | \n",
+ "
\n",
+ " \n",
+ " LLDeep_1004 | \n",
+ " 65 | \n",
+ " 54 | \n",
+ " 440 | \n",
+ " 9 | \n",
+ " 583 | \n",
+ " 462 | \n",
+ " 437 | \n",
+ " 16 | \n",
+ " 1493 | \n",
+ " 1.0 | \n",
+ " v2 | \n",
+ "
\n",
+ " \n",
+ " LLDeep_0918 | \n",
+ " 19 | \n",
+ " 34 | \n",
+ " 119 | \n",
+ " 7 | \n",
+ " 249 | \n",
+ " 136 | \n",
+ " 168 | \n",
+ " 14 | \n",
+ " 598 | \n",
+ " 1.0 | \n",
+ " v2 | \n",
+ "
\n",
+ " \n",
+ " LLDeep_0067 | \n",
+ " 51 | \n",
+ " 101 | \n",
+ " 122 | \n",
+ " 89 | \n",
+ " 753 | \n",
+ " 231 | \n",
+ " 546 | \n",
+ " 30 | \n",
+ " 1429 | \n",
+ " 1.0 | \n",
+ " v2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CD4T TEM CD4T Naive CD8T TEM CD8T Naive CD4T CD8T CD4T TCM \\\n",
+ "LLDeep_1035 17 152 36 54 625 108 424 \n",
+ "LLDeep_0960 36 89 113 13 586 143 374 \n",
+ "LLDeep_1004 65 54 440 9 583 462 437 \n",
+ "LLDeep_0918 19 34 119 7 249 136 168 \n",
+ "LLDeep_0067 51 101 122 89 753 231 546 \n",
+ "\n",
+ " CD8T TCM all_num gt chemistry \n",
+ "LLDeep_1035 28 1006 1.0 v2 \n",
+ "LLDeep_0960 26 1546 0.0 v2 \n",
+ "LLDeep_1004 16 1493 1.0 v2 \n",
+ "LLDeep_0918 14 598 1.0 v2 \n",
+ "LLDeep_0067 30 1429 1.0 v2 "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "concate_v2_v3 = pd.concat([common_individuals_individual_ratio_df,\n",
+ " common_individuals_individual_ratio_dfv3],\n",
+ " axis=0)\n",
+ "print(concate_v2_v3.shape,common_individuals_individual_ratio_df.shape, common_individuals_individual_ratio_dfv3.shape)\n",
+ "concate_v2_v3.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Combined v2 + v3 figure: (CD8T Naive + CD4T Naive) / (CD8T TEM + CD4T TEM)\n",
+ "# per individual, plotted against the genotype column `gt`.\n",
+ "fig, ax1 = plt.subplots()\n",
+ "cd4ydata = (\n",
+ "    concate_v2_v3['CD8T Naive'] + concate_v2_v3['CD4T Naive']\n",
+ ") / (\n",
+ "    concate_v2_v3['CD8T TEM'] + concate_v2_v3['CD4T TEM']\n",
+ ")\n",
+ "sns.regplot(x=concate_v2_v3['gt'],\n",
+ "            y=cd4ydata, \n",
+ "            ax=ax1)\n",
+ "r, p = stats.spearmanr(concate_v2_v3['gt'],\n",
+ "                       cd4ydata)\n",
+ "ax1.set_title('Oelen v2 & v3 r={:.2f}, p={:.2g}'.format(r, p))\n",
+ "# The plotted ratio is Naive / TEM; the label previously had it inverted.\n",
+ "ax1.set_ylabel('CD8+CD4 Naive / CD8+CD4 TEM')\n",
+ "ax1.set_xlabel(\"rs1131017\")\n",
+ "\n",
+ "plt.savefig('TEM_naive_CD4_CD8_v2_v3_rs1131017.pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/05_coeqtl_interpretation/LDTRAIT.ipynb b/05_coeqtl_interpretation/LDTRAIT.ipynb
new file mode 100644
index 0000000..659b6fa
--- /dev/null
+++ b/05_coeqtl_interpretation/LDTRAIT.ipynb
@@ -0,0 +1,944 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import requests\n",
+ "from tqdm import tqdm\n",
+ "import os\n",
+ "from io import StringIO\n",
+ "from pathlib import Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "savedir = Path(\"./annotated_coeqtl_snps/ldtrait\")\n",
+ "\n",
+ "celltypesnps = {}\n",
+ "merged_dict = pd.read_excel('/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/summary/coeQTLs_6majorcelltypes.filtered.xlsx',\n",
+ " sheet_name=None)\n",
+ "for celltype in merged_dict.keys():\n",
+ " celltypesnps[celltype] = list(merged_dict[celltype]['SNP'].unique())\n",
+ "allcelltypes_snps = list(set([ele for l in celltypesnps.values() for ele in l]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "72"
+ ]
+ },
+ "execution_count": 90,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(allcelltypes_snps)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 17%|█▋ | 12/72 [05:53<34:19, 34.33s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs62480001\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 18%|█▊ | 13/72 [06:52<39:48, 40.48s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs817352\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 19%|█▉ | 14/72 [07:11<33:43, 34.89s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs80164297\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 24%|██▎ | 17/72 [09:23<37:43, 41.16s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs11772922\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 26%|██▋ | 19/72 [10:43<35:14, 39.89s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs3758833\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 29%|██▉ | 21/72 [11:39<28:28, 33.49s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs11047696\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 31%|███ | 22/72 [12:09<27:15, 32.70s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs9971029\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 32%|███▏ | 23/72 [12:35<24:55, 30.53s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs4949655\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 42%|████▏ | 30/72 [16:56<26:11, 37.41s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs6007595\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 43%|████▎ | 31/72 [17:27<24:21, 35.64s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs7309189\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 44%|████▍ | 32/72 [18:20<27:11, 40.79s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs9657360\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 49%|████▊ | 35/72 [19:43<20:09, 32.70s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs731835\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 53%|█████▎ | 38/72 [26:08<51:53, 91.57s/it] "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs260503\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 58%|█████▊ | 42/72 [28:04<22:59, 46.00s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs13140099\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 60%|█████▉ | 43/72 [28:34<19:51, 41.10s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs2235910\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 71%|███████ | 51/72 [42:19<40:20, 115.26s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs1628955\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 74%|███████▎ | 53/72 [43:55<25:05, 79.24s/it] "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs12443580\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 82%|████████▏ | 59/72 [48:18<09:34, 44.16s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs150458741\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 85%|████████▍ | 61/72 [49:39<07:29, 40.88s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs62423804\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 86%|████████▌ | 62/72 [50:16<06:36, 39.69s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs2267989\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 89%|████████▉ | 64/72 [50:56<03:58, 29.82s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs7605964\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 99%|█████████▊| 71/72 [54:21<00:24, 24.54s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no GWAS: rs1261896\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 72/72 [54:53<00:00, 45.75s/it]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Query the LDlink LDtrait REST endpoint once per SNP and cache each answer as a TSV.\n",
+ "# Equivalent curl call, for reference:\n",
+ "# curl -k -H \"Content-Type: application/json\" -X POST -d '{\"snps\": \"rs3\\nrs4\", \"pop\": \"YRI\", \"r2_d\": \"r2\", \"r2_d_threshold\": \"0.1\", \"window\": \"500000\", \"genome_build\": \"grch37\"}' 'https://ldlink.nci.nih.gov/LDlinkRest/ldtrait?token=faketoken123'\n",
+ "\n",
+ "# `token` is not defined in any visible cell; fall back to the LDLINK_TOKEN\n",
+ "# environment variable so the loop also runs on a fresh kernel.\n",
+ "if 'token' not in globals():\n",
+ "    token = os.environ['LDLINK_TOKEN']\n",
+ "# Make sure the cache directory exists before the first to_csv call.\n",
+ "savedir.mkdir(parents=True, exist_ok=True)\n",
+ "\n",
+ "for snp in tqdm(allcelltypes_snps):\n",
+ "    outfile = savedir/f'{snp}.tsv'\n",
+ "    # Results are cached per SNP, so an interrupted run can simply be restarted.\n",
+ "    if outfile.exists():\n",
+ "        continue\n",
+ "    params = {\"snps\": snp,\n",
+ "              \"pop\": \"CEU\",\n",
+ "              \"r2_d\": \"r2\",\n",
+ "              \"r2_d_threshold\": \"0.8\",\n",
+ "              \"window\": \"500000\",\n",
+ "              \"genome_build\": \"grch37\"}\n",
+ "    # The request now sits inside the try block: one failed SNP is reported and\n",
+ "    # skipped instead of aborting the whole loop. `except Exception` (rather than\n",
+ "    # a bare except) still lets KeyboardInterrupt stop the run.\n",
+ "    try:\n",
+ "        r = requests.request(headers={\"Content-Type\": \"application/json\"},\n",
+ "                             method='POST',\n",
+ "                             json=params,\n",
+ "                             url=f'https://ldlink.nci.nih.gov/LDlinkRest/ldtrait?token={token}')\n",
+ "        if \"No entries in the GWAS Catalog are identified using the LDtrait search criteria.\" in r.text:\n",
+ "            print('no GWAS:', snp)\n",
+ "            continue\n",
+ "        r_df = pd.read_csv(StringIO(r.text), sep='\\t')\n",
+ "        r_df.to_csv(outfile, sep='\\t', index=False)\n",
+ "    except Exception as err:\n",
+ "        print('failed entry:', snp, repr(err))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 72/72 [00:00<00:00, 298.96it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Merge the cached per-SNP LDtrait tables into one summary table.\n",
+ "gwas_frames = []\n",
+ "for snp in tqdm(allcelltypes_snps):\n",
+ "    snp_file = savedir/f'{snp}.tsv'\n",
+ "    if not snp_file.exists():\n",
+ "        continue\n",
+ "    df = pd.read_csv(snp_file, sep='\\t')\n",
+ "    # Skip files with no rows and files whose first cell contains 'error'.\n",
+ "    # str() keeps the check from raising when that cell is not a string (e.g. NaN).\n",
+ "    if df.empty or 'error' in str(df.iloc[0].values[0]):\n",
+ "        continue\n",
+ "    gwas_frames.append(df)\n",
+ "\n",
+ "# Concatenate once instead of growing the frame inside the loop.\n",
+ "allsnps_inld_gwas_df = pd.concat(gwas_frames, axis=0) if gwas_frames else pd.DataFrame()\n",
+ "allsnps_inld_gwas_df.to_csv(savedir/'summary.tsv', sep='\\t', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "allsnps_inld_gwas_df = pd.read_csv(savedir/'summary.tsv', sep='\\t')\n",
+ "magma_df = pd.read_csv(savedir/'coeqtl_with_gwas_and_magma.tsv', sep='\\t')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " VARIABLE | \n",
+ " celltype | \n",
+ " SNP | \n",
+ " gene | \n",
+ " TYPE | \n",
+ " NGENES | \n",
+ " BETA | \n",
+ " BETA_STD | \n",
+ " SE | \n",
+ " P | \n",
+ " ... | \n",
+ " non_effect_allele | \n",
+ " current_build | \n",
+ " frequency | \n",
+ " sample_size | \n",
+ " zscore | \n",
+ " pvalue | \n",
+ " effect_size | \n",
+ " standard_error | \n",
+ " imputation_status | \n",
+ " n_cases | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " B_rs1131017_RPS26 | \n",
+ " B | \n",
+ " rs1131017 | \n",
+ " RPS26 | \n",
+ " SET | \n",
+ " 38 | \n",
+ " -0.199320 | \n",
+ " -0.008952 | \n",
+ " 0.12542 | \n",
+ " 0.943980 | \n",
+ " ... | \n",
+ " G | \n",
+ " hg38 | \n",
+ " 0.580808 | \n",
+ " 54612 | \n",
+ " 0.138937 | \n",
+ " 0.889500 | \n",
+ " 0.002200 | \n",
+ " 0.015600 | \n",
+ " original | \n",
+ " 17008.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " B_rs1131017_RPS26 | \n",
+ " B | \n",
+ " rs1131017 | \n",
+ " RPS26 | \n",
+ " SET | \n",
+ " 38 | \n",
+ " 0.201320 | \n",
+ " 0.009042 | \n",
+ " 0.12905 | \n",
+ " 0.059382 | \n",
+ " ... | \n",
+ " G | \n",
+ " hg38 | \n",
+ " 0.580808 | \n",
+ " 53293 | \n",
+ " 1.735682 | \n",
+ " 0.082620 | \n",
+ " 0.023902 | \n",
+ " 0.013700 | \n",
+ " original | \n",
+ " 19099.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " B_rs1131017_RPS26 | \n",
+ " B | \n",
+ " rs1131017 | \n",
+ " RPS26 | \n",
+ " SET | \n",
+ " 37 | \n",
+ " 0.163610 | \n",
+ " 0.007256 | \n",
+ " 0.12608 | \n",
+ " 0.097201 | \n",
+ " ... | \n",
+ " G | \n",
+ " hg38 | \n",
+ " 0.580808 | \n",
+ " 29344 | \n",
+ " -2.348664 | \n",
+ " 0.018841 | \n",
+ " -0.010569 | \n",
+ " 0.004363 | \n",
+ " original | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " B_rs1131017_RPS26 | \n",
+ " B | \n",
+ " rs1131017 | \n",
+ " RPS26 | \n",
+ " SET | \n",
+ " 38 | \n",
+ " -0.010395 | \n",
+ " -0.000467 | \n",
+ " 0.11668 | \n",
+ " 0.535490 | \n",
+ " ... | \n",
+ " G | \n",
+ " hg38 | \n",
+ " 0.580808 | \n",
+ " 15954 | \n",
+ " -0.324182 | \n",
+ " 0.745800 | \n",
+ " -0.009950 | \n",
+ " 0.025700 | \n",
+ " original | \n",
+ " 7387.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " B_rs1131017_RPS26 | \n",
+ " B | \n",
+ " rs1131017 | \n",
+ " RPS26 | \n",
+ " SET | \n",
+ " 38 | \n",
+ " 0.282350 | \n",
+ " 0.012677 | \n",
+ " 0.11706 | \n",
+ " 0.007937 | \n",
+ " ... | \n",
+ " G | \n",
+ " hg38 | \n",
+ " 0.580808 | \n",
+ " 337159 | \n",
+ " -1.597883 | \n",
+ " 0.110069 | \n",
+ " -0.000210 | \n",
+ " 0.000132 | \n",
+ " original | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 44 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " VARIABLE celltype SNP gene TYPE NGENES BETA \\\n",
+ "0 B_rs1131017_RPS26 B rs1131017 RPS26 SET 38 -0.199320 \n",
+ "1 B_rs1131017_RPS26 B rs1131017 RPS26 SET 38 0.201320 \n",
+ "2 B_rs1131017_RPS26 B rs1131017 RPS26 SET 37 0.163610 \n",
+ "3 B_rs1131017_RPS26 B rs1131017 RPS26 SET 38 -0.010395 \n",
+ "4 B_rs1131017_RPS26 B rs1131017 RPS26 SET 38 0.282350 \n",
+ "\n",
+ " BETA_STD SE P ... non_effect_allele current_build \\\n",
+ "0 -0.008952 0.12542 0.943980 ... G hg38 \n",
+ "1 0.009042 0.12905 0.059382 ... G hg38 \n",
+ "2 0.007256 0.12608 0.097201 ... G hg38 \n",
+ "3 -0.000467 0.11668 0.535490 ... G hg38 \n",
+ "4 0.012677 0.11706 0.007937 ... G hg38 \n",
+ "\n",
+ " frequency sample_size zscore pvalue effect_size standard_error \\\n",
+ "0 0.580808 54612 0.138937 0.889500 0.002200 0.015600 \n",
+ "1 0.580808 53293 1.735682 0.082620 0.023902 0.013700 \n",
+ "2 0.580808 29344 -2.348664 0.018841 -0.010569 0.004363 \n",
+ "3 0.580808 15954 -0.324182 0.745800 -0.009950 0.025700 \n",
+ "4 0.580808 337159 -1.597883 0.110069 -0.000210 0.000132 \n",
+ "\n",
+ " imputation_status n_cases \n",
+ "0 original 17008.0 \n",
+ "1 original 19099.0 \n",
+ "2 original NaN \n",
+ "3 original 7387.0 \n",
+ "4 original NaN \n",
+ "\n",
+ "[5 rows x 44 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "magma_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "VARIABLE B_rs1131017_RPS26\n",
+ "celltype B\n",
+ "SNP rs1131017\n",
+ "gene RPS26\n",
+ "TYPE SET\n",
+ "NGENES 38\n",
+ "BETA -0.19932\n",
+ "BETA_STD -0.008952\n",
+ "SE 0.12542\n",
+ "P 0.94398\n",
+ "prefix results/current/magma/AD\n",
+ "trait AD\n",
+ "FDR 0.973479\n",
+ "Tag IGAP_Alzheimer\n",
+ "PUBMED_Paper_Link http://www.ncbi.nlm.nih.gov/pubmed/24162737\n",
+ "Phenotype Alzheimer\n",
+ "RSID rs10876864\n",
+ "RSALIAS rs57455456\n",
+ "CHR 12\n",
+ "POS1 56435929\n",
+ "POS2 56401085\n",
+ "DIST -34844\n",
+ "R2 0.991789\n",
+ "D 0.240643\n",
+ "DPRIME 0.995886\n",
+ "MAJOR A\n",
+ "MINOR G\n",
+ "MAF 0.408549\n",
+ "CMMB 0.155229\n",
+ "CM 71.092406\n",
+ "panel_variant_id chr12_56007301_G_A_b38\n",
+ "chromosome chr12\n",
+ "position 56007301\n",
+ "effect_allele A\n",
+ "non_effect_allele G\n",
+ "current_build hg38\n",
+ "frequency 0.580808\n",
+ "sample_size 54612\n",
+ "zscore 0.138937\n",
+ "pvalue 0.8895\n",
+ "effect_size 0.0022\n",
+ "standard_error 0.0156\n",
+ "imputation_status original\n",
+ "n_cases 17008.0\n",
+ "Name: 0, dtype: object"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "magma_df.iloc[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[0.041321, 'Inflammatory Bowel Disease'],\n",
+ " [0.030935, 'Non-cancer illness code, self-reported: psoriasis'],\n",
+ " [0.0090688,\n",
+ " 'Non-cancer illness code, self-reported: schizophrenia'],\n",
+ " [0.0042454,\n",
+ " 'Overall breast cancer in Europeans, imputed genotype'],\n",
+ " [0.032584, 'Diagnoses - main ICD10: G40 Epilepsy'],\n",
+ " [0.0013766,\n",
+ " 'Estrogen-receptor-negative breast cancer in Europeans, imputed genotype'],\n",
+ " [0.025212,\n",
+ " 'Non-cancer illness code, self-reported: high cholesterol']],\n",
+ " dtype=object)"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "magma_df[(magma_df['SNP']=='rs4147638') & (magma_df['P']<0.05)][['P', 'Phenotype']].values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Query | \n",
+ " GWAS Trait | \n",
+ " RS Number | \n",
+ " Position (GRCh37) | \n",
+ " Alleles | \n",
+ " R2 | \n",
+ " D' | \n",
+ " Risk Allele | \n",
+ " Effect Size (95% CI) | \n",
+ " Beta or OR | \n",
+ " P-value | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " rs2954654 | \n",
+ " Type 2 diabetes | \n",
+ " rs2294120 | \n",
+ " chr8:146003567 | \n",
+ " A=0.52, G=0.48 | \n",
+ " 0.846295 | \n",
+ " 0.957895 | \n",
+ " 0.455879299759268 | \n",
+ " 0.04430 | \n",
+ " 0.029-0.06 | \n",
+ " 2.000000e-08 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " rs4840568 | \n",
+ " Albumin-globulin ratio | \n",
+ " rs2409780 | \n",
+ " chr8:11337587 | \n",
+ " C=0.237, T=0.763 | \n",
+ " 0.897156 | \n",
+ " 1.000000 | \n",
+ " NR | \n",
+ " 0.04604 | \n",
+ " 0.035-0.057 | \n",
+ " 1.000000e-16 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " rs4840568 | \n",
+ " Non-albumin protein levels | \n",
+ " rs2409780 | \n",
+ " chr8:11337587 | \n",
+ " C=0.237, T=0.763 | \n",
+ " 0.897156 | \n",
+ " 1.000000 | \n",
+ " NR | \n",
+ " 0.04456 | \n",
+ " 0.034-0.055 | \n",
+ " 1.000000e-15 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " rs4840568 | \n",
+ " Rheumatoid arthritis | \n",
+ " rs2618444 | \n",
+ " chr8:11338370 | \n",
+ " A=0.763, C=0.237 | \n",
+ " 0.897156 | \n",
+ " 1.000000 | \n",
+ " NR | \n",
+ " 0.10050 | \n",
+ " 0.072-0.129 | \n",
+ " 7.000000e-12 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " rs4840568 | \n",
+ " Systemic lupus erythematosus | \n",
+ " rs2618444 | \n",
+ " chr8:11338370 | \n",
+ " A=0.763, C=0.237 | \n",
+ " 0.897156 | \n",
+ " 1.000000 | \n",
+ " NR | \n",
+ " 1.36000 | \n",
+ " 1.22-1.51 | \n",
+ " 7.000000e-09 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Query GWAS Trait RS Number Position (GRCh37) \\\n",
+ "0 rs2954654 Type 2 diabetes rs2294120 chr8:146003567 \n",
+ "1 rs4840568 Albumin-globulin ratio rs2409780 chr8:11337587 \n",
+ "2 rs4840568 Non-albumin protein levels rs2409780 chr8:11337587 \n",
+ "3 rs4840568 Rheumatoid arthritis rs2618444 chr8:11338370 \n",
+ "4 rs4840568 Systemic lupus erythematosus rs2618444 chr8:11338370 \n",
+ "\n",
+ " Alleles R2 D' Risk Allele \\\n",
+ "0 A=0.52, G=0.48 0.846295 0.957895 0.455879299759268 \n",
+ "1 C=0.237, T=0.763 0.897156 1.000000 NR \n",
+ "2 C=0.237, T=0.763 0.897156 1.000000 NR \n",
+ "3 A=0.763, C=0.237 0.897156 1.000000 NR \n",
+ "4 A=0.763, C=0.237 0.897156 1.000000 NR \n",
+ "\n",
+ " Effect Size (95% CI) Beta or OR P-value \n",
+ "0 0.04430 0.029-0.06 2.000000e-08 \n",
+ "1 0.04604 0.035-0.057 1.000000e-16 \n",
+ "2 0.04456 0.034-0.055 1.000000e-15 \n",
+ "3 0.10050 0.072-0.129 7.000000e-12 \n",
+ "4 1.36000 1.22-1.51 7.000000e-09 "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "allsnps_inld_gwas_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "allsnps_inld_gwas_df.to_excel('./coeqtl_mapping/output/snps_in_ld_with_gwas_catelogue.xlsx')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/05_coeqtl_interpretation/LD_description.R b/05_coeqtl_interpretation/LD_description.R
new file mode 100644
index 0000000..8e1d5e4
--- /dev/null
+++ b/05_coeqtl_interpretation/LD_description.R
@@ -0,0 +1,164 @@
+#install.packages("LDlinkR")
+
+library('LDlinkR')
+
+cells <- c("B", "CD4T", "CD8T", "NK", "DC", "monocyte")
+
+# LDlink API token: read from the LDLINK_TOKEN environment variable so that no credential is stored in the script.
+ldlink_token <- Sys.getenv("LDLINK_TOKEN")
+if (!nzchar(ldlink_token)) stop("Set the LDLINK_TOKEN environment variable to your LDlink API token")
+
+# To run this script, the LD structure of a given population is needed: each significant SNP is matched with the other SNPs in high LD. Alternatively, we could also use an LD package, which performs the LD mapping as well.
+outPath <- '/groups/umcg-lld/tmp01/projects/1MCellRNAseq/GRN_reconstruction/ongoing/coeqtl_mapping/output/filtered_results/'
+
+# Collect the SNP column of the significant co-eQTL results of every cell type.
+output_file <- c()
+for (cell in cells){
+  name <- paste(outPath, "UT_", cell, '/coeqtls_fullresults.sig.tsv.gz', sep="")
+  # stringsAsFactors = FALSE keeps the SNP IDs as character strings, so c() below cannot turn factor levels into integer codes
+  tab <- read.table(name, sep='\t', header=T, stringsAsFactors = FALSE)
+  output_file <- c(tab[,'SNP'], output_file)
+}
+
+# number of SNPs before and after removing duplicates
+length(output_file)
+output_file <- unique(output_file)
+length(output_file)
+
+# Run one LDexpress query for a single SNP with the settings used throughout this script.
+query_ldexpress <- function(snp){
+  LDexpress(snp,
+            pop = "CEU",
+            tissue = "ALL",
+            r2d = "r2",
+            r2d_threshold = 0.8,
+            p_threshold = 0.1,
+            win_size = 500000,
+            token = ldlink_token,
+            file = FALSE
+  )
+}
+
+# Query every SNP once. The loop used to start at a hard-coded index (63), which skipped SNPs 2-62 on a fresh run
+# and produced NA queries when fewer than 63 SNPs were found. New results are prepended, as before.
+LD_Score <- NULL
+for (i in output_file){
+  print(i)
+  LD_Score_ind <- query_ldexpress(i)
+  LD_Score <- rbind(LD_Score_ind,LD_Score)
+}
+
+# keep only the columns Query, RS_ID and R2, with one row per RS_ID
+LD_Score_subset <- subset(LD_Score, select = c('Query',"RS_ID","R2" ))
+dim(LD_Score_subset)
+LD_Score_subset <- LD_Score_subset[!duplicated(LD_Score_subset$RS_ID),]
+dim(LD_Score_subset)
+
+
+write.table(LD_Score,'/groups/umcg-franke-scrna/tmp01/projects/sc-eqtlgen-consortium-pipeline/ongoing/wg3-QTL-mapping/GRN_downstream_analysis/sign_LD_SNPs_18_12.txt', quote = F, col.names = F, row.names = F, sep='\t')
+
+write.table(LD_Score_subset,'/groups/umcg-franke-scrna/tmp01/projects/sc-eqtlgen-consortium-pipeline/ongoing/wg3-QTL-mapping/GRN_downstream_analysis/sign_LD_SNPs_subset_18_12.txt', quote = F, col.names = F, row.names = F, sep='\t')
+
+######### LDtrait
+# Query LDtrait for all significant co-eQTL SNPs (output_file) and write the combined result
+# to sign_LD_SNPs_23_01.txt.
+
+# LDlink API token: read from the LDLINK_TOKEN environment variable so that no credential is stored in the script.
+ldlink_token <- Sys.getenv("LDLINK_TOKEN")
+if (!nzchar(ldlink_token)) stop("Set the LDLINK_TOKEN environment variable to your LDlink API token")
+
+# a query without SNPs makes no sense, so stop before contacting the API
+stopifnot(length(output_file) > 0)
+
+# Run one LDtrait query with the settings used throughout this script.
+# snps: character vector of SNP IDs (the LDlinkR documentation allows 1-50 variants per request)
+# returns: the LDtrait result for these SNPs (assumed to be a table that can be stacked with rbind - TODO confirm)
+query_ldtrait <- function(snps){
+  LDtrait(snps,
+          pop = "CEU",
+          r2d = "r2",
+          r2d_threshold = 0.8,
+          token = ldlink_token,
+          file = FALSE
+  )
+}
+
+# quick look at the output for a single SNP
+LD_Score <- query_ldtrait(output_file[2])
+LD_Score
+
+# Query all SNPs in consecutive batches and stack the results in batch order.
+# The batches used to be hard-coded as output_file[1:49] and output_file[50:72], which is only right for exactly
+# 72 SNPs: fewer SNPs put NA entries into the query, more SNPs were silently left out. A batch size of 49
+# reproduces those two batches for 72 SNPs and stays below the 50-variant limit of LDtrait.
+# The earlier per-SNP loop was dropped: its result was overwritten by the batched result, so it only cost API calls.
+ldtrait_batch_size <- 49
+snp_batches <- split(output_file, ceiling(seq_along(output_file) / ldtrait_batch_size))
+LD_Score_batches <- lapply(snp_batches, function(snps){
+  print(snps)
+  query_ldtrait(snps)
+})
+LD_Score <- do.call(rbind, unname(LD_Score_batches))
+
+# for (i in output_file){
+#   print(i)
+#   if(i %in% LD_Score$Query){
+#     print('SNP is analyzed')
+#   } else {
+#
+#     tryCatch({
+#       LD_Score_ind <- LDtrait(i,
+#                               pop = "CEU",
+#                               r2d = "r2",
+#                               r2d_threshold = 0.8,
+#                               token = ldlink_token,
+#                               file = FALSE
+#       )
+#       LD_Score <- rbind(LD_Score_ind,LD_Score)
+#     }, error = function(e){
+#       output_file <- output_file[-i]
+#       print(length(output_file))
+#     })
+#   }
+# }
+
+# LD_Score_subset <- subset(LD_Score, select = c('Query',"RS_ID","R2" ))
+# dim(LD_Score_subset)
+# LD_Score_subset <- LD_Score_subset[!duplicated(LD_Score_subset$RS_ID),]
+# dim(LD_Score_subset)
+
+
+# tab-separated, without header or row names
+write.table(LD_Score,'/groups/umcg-franke-scrna/tmp01/projects/sc-eqtlgen-consortium-pipeline/ongoing/wg3-QTL-mapping/GRN_downstream_analysis/sign_LD_SNPs_23_01.txt', quote = F, col.names = F, row.names = F, sep='\t')
+
+
+#write.table(output_file,'/groups/umcg-franke-scrna/tmp01/projects/sc-eqtlgen-consortium-pipeline/ongoing/wg3-QTL-mapping/GRN_downstream_analysis/sign_SNPs_17_12.txt', quote = F, col.names = F, row.names = F, sep='\t')
+
+
+
+expand_ld_table <- function(ld){
+  # Make a pairwise LD table symmetric (every pair listed as A-B and as B-A) and add every SNP in LD with itself (R2 = 1).
+  # ld: data.frame with the columns CHR_A, BP_A, SNP_A, CHR_B, BP_B, SNP_B and R2. Returns the expanded data.frame.
+  cols_a <- c('CHR_A', 'BP_A', 'SNP_A')
+  cols_b <- c('CHR_B', 'BP_B', 'SNP_B')
+  # fail early with a clear message, e.g. when the file path is passed instead of the table read from it
+  if (!is.data.frame(ld)) stop("expand_ld_table: 'ld' must be a data.frame, not ", class(ld)[1])
+  missing_cols <- setdiff(c(cols_a, cols_b, 'R2'), colnames(ld))
+  if (length(missing_cols) > 0) stop("expand_ld_table: missing column(s): ", paste(missing_cols, collapse = ", "))
+  # double the ld table, so we can easily select just from the left or right
+  ld_copy <- setNames(ld[, c(cols_b, cols_a, 'R2')], c(cols_a, cols_b, 'R2'))
+  ld <- rbind(ld, ld_copy)
+  # add each SNP in max LD with itself by copying the unique snps on the left and right
+  ld_left_right <- unique(rbind(ld[, cols_a], setNames(ld[, cols_b], cols_a)))
+  ld_self <- setNames(cbind(ld_left_right, ld_left_right), c(cols_a, cols_b))
+  # ld with itself is of course 1
+  ld_self$R2 <- 1
+  return(rbind(ld, ld_self))
+}
+# location of the LD file
+ld_loc <- '/groups/umcg-bios/tmp01/projects/1M_cells_scRNAseq/ongoing/LD_DB/genotypes_eur/EUR.chrAll.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.positions_plus_RSID.plink1.ldwindow10000.r2_075.ld'
+# read the LD table first (expand_ld_table needs the table with its CHR_A ... R2 header, not the file path), then make it a bit easier to work with
+ld <- read.table(ld_loc, header = TRUE, stringsAsFactors = FALSE)
+ld <- expand_ld_table(ld)
+# confine the ld table to eQTL snps on the left <- subset here to what SNPs you need the LD of, with the other SNPs. NOTE(review): `eqtls` is not defined in this script; it has to be loaded beforehand with the SNP IDs in column V1
+ld <- ld[ld$SNP_A %in% eqtls$V1, ]
\ No newline at end of file
diff --git a/05_coeqtl_interpretation/MS1_Libraries.r b/05_coeqtl_interpretation/MS1_Libraries.r
new file mode 100644
index 0000000..9018e46
--- /dev/null
+++ b/05_coeqtl_interpretation/MS1_Libraries.r
@@ -0,0 +1,10 @@
+### Libraries for all the project scripts
+library(stringr, quietly = TRUE, verbose = FALSE)
+library(dplyr)
+library(data.table)
+library(tidyverse)
+library('reshape2')
+library(caret)
+library('gprofiler2')
+library('coloc')
+library('biomaRt')
diff --git a/05_coeqtl_interpretation/R1_TRANSFAC_enrichment.ipynb b/05_coeqtl_interpretation/R1_TRANSFAC_enrichment.ipynb
new file mode 100644
index 0000000..9b30961
--- /dev/null
+++ b/05_coeqtl_interpretation/R1_TRANSFAC_enrichment.ipynb
@@ -0,0 +1,3005 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 161,
+ "id": "8839a2ee-257d-4182-8c82-01d012d8f888",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### Execute TRANSFAC enrichment analysis based on co-eqtl results"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "26bafe98-a70d-4052-ba5d-fca1b4115633",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 162,
+ "id": "946a1c00-83e8-4260-9093-e79e373c1fe0",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "source('MS1_Libraries.r')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cb04a3b3-e6f5-4458-8b8e-c925813cee89",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6b9b799d-d266-4fc8-a49b-ebb550fa64fd",
+ "metadata": {},
+ "source": [
+ "# Parameters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 163,
+ "id": "35c8b11a-e882-4bbb-9706-6c13b9c522f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### Path to input data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 164,
+ "id": "c517df12-ff3c-492e-a0a3-53d7cbdb945d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "path<-\"\"\n",
+ "outdir<-\"\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "309ead5e-4bbd-4370-8089-5dfa0c53a194",
+ "metadata": {},
+ "source": [
+ "# Data "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "759d077f-409c-4e52-857c-47af7be21134",
+ "metadata": {},
+ "source": [
+ "## Enrichment Data Input"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 165,
+ "id": "8dfbd479-4b4a-4abd-828b-c668708fa7e9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### Exemplary data input load for a cell-type"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 166,
+ "id": "4bbb56ca-273c-42dd-857c-be3d358ded78",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cell_type_var = \"CD4T\"\n",
+ "# c(\"CD4T\",\"CD8T\",\"monocyte\",\"NK\",\"B\",\"DC\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 167,
+ "id": "54fb40a9-c9c6-4f88-9ab8-fc98fa60d279",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for(cell_type in cell_type_var){\n",
+ " coeqtls <- fread(paste0(path, \"UT_\",cell_type, \n",
+ " \"_coeqtls_fullresults_fixed.all.tsv.gz\"))\n",
+ " coeqtls$gene1<-gsub(\";.*\",\"\",coeqtls$Gene)\n",
+ " coeqtls$gene2<-gsub(\".*;\",\"\",coeqtls$Gene)\n",
+ " # second_gene = the gene of the 'gene1;gene2' pair that is not the eQTL gene; the column is spelled out as 'eqtlgene' because 'eqtlgen' only resolved through partial matching of `$`\n",
+ " coeqtls$second_gene<-ifelse(coeqtls$gene1 == coeqtls$eqtlgene, coeqtls$gene2,\n",
+ " coeqtls$gene1)\n",
+ " coeqtls$gene1<-NULL\n",
+ " coeqtls$gene2<-NULL\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 168,
+ "id": "7e950293-2939-4bd8-905d-729dd8c1a278",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#unique(coeqtls$eqtlgene)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 169,
+ "id": "78dd2696-0f6c-40a8-93c4-0f1d6b3b99a0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "372"
+ ],
+ "text/latex": [
+ "372"
+ ],
+ "text/markdown": [
+ "372"
+ ],
+ "text/plain": [
+ "[1] 372"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "nrow(coeqtls[(coeqtls$eqtlgene == 'RPS26') & (coeqtls$gene2_isSig == TRUE),c('eqtlgene', 'second_gene')])\n",
+ "\n",
+ "# validity check --> 372 significant co-egenes for RPS26"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 170,
+ "id": "1535e95a-ec90-42e9-a5a7-de98dea38ea8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "742"
+ ],
+ "text/latex": [
+ "742"
+ ],
+ "text/markdown": [
+ "742"
+ ],
+ "text/plain": [
+ "[1] 742"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "nrow(coeqtls[(coeqtls$eqtlgene == 'RPS26'),c('eqtlgene', 'second_gene')])\n",
+ "# overall 742 --> those that would not have been tested"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 171,
+ "id": "599506b1-e79b-4c3a-9c38-b32dd14bf5ad",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "A data.table: 2 × 38\n",
+ "\n",
+ "\tsnp_genepair | Gene | GeneChr | GenePos | GeneStrand | GeneSymbol | SNP | SNPChr | SNPPos | SNPAlleles | ⋯ | multipletestP | eqtlgene | snp_eqtlgene | snp_beta_shape1 | snp_beta_shape2 | snp_pvalbeta | snp_qval | gene2_pthreshold | gene2_isSig | second_gene |
\n",
+ "\t<chr> | <chr> | <int> | <int> | <lgl> | <chr> | <chr> | <int> | <int> | <chr> | ⋯ | <dbl> | <chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <lgl> | <chr> |
\n",
+ "\n",
+ "\n",
+ "\trs11587831_C1orf86;NUDT22 | C1orf86;NUDT22 | 1 | 2115903 | NA | C1orf86;NUDT22 | rs11587831 | 1 | 2110848 | T/G | ⋯ | 0.6354470 | C1orf86 | rs11587831_C1orf86 | 1.197903 | 127.555 | 0.5044989 | 0.7012273 | 4.539067e-05 | FALSE | NUDT22 |
\n",
+ "\trs11587831_C1orf86;SDHC | C1orf86;SDHC | 1 | 2115903 | NA | C1orf86;SDHC | rs11587831 | 1 | 2110848 | T/G | ⋯ | 0.9144163 | C1orf86 | rs11587831_C1orf86 | 1.197903 | 127.555 | 0.5044989 | 0.7012273 | 4.539067e-05 | FALSE | SDHC |
\n",
+ "\n",
+ "
\n"
+ ],
+ "text/latex": [
+ "A data.table: 2 × 38\n",
+ "\\begin{tabular}{lllllllllllllllllllll}\n",
+ " snp\\_genepair & Gene & GeneChr & GenePos & GeneStrand & GeneSymbol & SNP & SNPChr & SNPPos & SNPAlleles & ⋯ & multipletestP & eqtlgene & snp\\_eqtlgene & snp\\_beta\\_shape1 & snp\\_beta\\_shape2 & snp\\_pvalbeta & snp\\_qval & gene2\\_pthreshold & gene2\\_isSig & second\\_gene\\\\\n",
+ " & & & & & & & & & & ⋯ & & & & & & & & & & \\\\\n",
+ "\\hline\n",
+ "\t rs11587831\\_C1orf86;NUDT22 & C1orf86;NUDT22 & 1 & 2115903 & NA & C1orf86;NUDT22 & rs11587831 & 1 & 2110848 & T/G & ⋯ & 0.6354470 & C1orf86 & rs11587831\\_C1orf86 & 1.197903 & 127.555 & 0.5044989 & 0.7012273 & 4.539067e-05 & FALSE & NUDT22\\\\\n",
+ "\t rs11587831\\_C1orf86;SDHC & C1orf86;SDHC & 1 & 2115903 & NA & C1orf86;SDHC & rs11587831 & 1 & 2110848 & T/G & ⋯ & 0.9144163 & C1orf86 & rs11587831\\_C1orf86 & 1.197903 & 127.555 & 0.5044989 & 0.7012273 & 4.539067e-05 & FALSE & SDHC \\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/markdown": [
+ "\n",
+ "A data.table: 2 × 38\n",
+ "\n",
+ "| snp_genepair <chr> | Gene <chr> | GeneChr <int> | GenePos <int> | GeneStrand <lgl> | GeneSymbol <chr> | SNP <chr> | SNPChr <int> | SNPPos <int> | SNPAlleles <chr> | ⋯ ⋯ | multipletestP <dbl> | eqtlgene <chr> | snp_eqtlgene <chr> | snp_beta_shape1 <dbl> | snp_beta_shape2 <dbl> | snp_pvalbeta <dbl> | snp_qval <dbl> | gene2_pthreshold <dbl> | gene2_isSig <lgl> | second_gene <chr> |\n",
+ "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
+ "| rs11587831_C1orf86;NUDT22 | C1orf86;NUDT22 | 1 | 2115903 | NA | C1orf86;NUDT22 | rs11587831 | 1 | 2110848 | T/G | ⋯ | 0.6354470 | C1orf86 | rs11587831_C1orf86 | 1.197903 | 127.555 | 0.5044989 | 0.7012273 | 4.539067e-05 | FALSE | NUDT22 |\n",
+ "| rs11587831_C1orf86;SDHC | C1orf86;SDHC | 1 | 2115903 | NA | C1orf86;SDHC | rs11587831 | 1 | 2110848 | T/G | ⋯ | 0.9144163 | C1orf86 | rs11587831_C1orf86 | 1.197903 | 127.555 | 0.5044989 | 0.7012273 | 4.539067e-05 | FALSE | SDHC |\n",
+ "\n"
+ ],
+ "text/plain": [
+ " snp_genepair Gene GeneChr GenePos GeneStrand\n",
+ "1 rs11587831_C1orf86;NUDT22 C1orf86;NUDT22 1 2115903 NA \n",
+ "2 rs11587831_C1orf86;SDHC C1orf86;SDHC 1 2115903 NA \n",
+ " GeneSymbol SNP SNPChr SNPPos SNPAlleles ⋯ multipletestP eqtlgene\n",
+ "1 C1orf86;NUDT22 rs11587831 1 2110848 T/G ⋯ 0.6354470 C1orf86 \n",
+ "2 C1orf86;SDHC rs11587831 1 2110848 T/G ⋯ 0.9144163 C1orf86 \n",
+ " snp_eqtlgene snp_beta_shape1 snp_beta_shape2 snp_pvalbeta snp_qval \n",
+ "1 rs11587831_C1orf86 1.197903 127.555 0.5044989 0.7012273\n",
+ "2 rs11587831_C1orf86 1.197903 127.555 0.5044989 0.7012273\n",
+ " gene2_pthreshold gene2_isSig second_gene\n",
+ "1 4.539067e-05 FALSE NUDT22 \n",
+ "2 4.539067e-05 FALSE SDHC "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "head(coeqtls,2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e1a2e697-badf-4802-ad22-26a5ac2a9101",
+ "metadata": {},
+ "source": [
+ "## ReMap Results for comparison"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 173,
+ "id": "128f1402-b5bf-411d-b812-5dbed446f4a8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Load supplementary table (with ReMap Results to compare):\n",
+ "# \"supptable15.TFenrichment_co-eGenes.xlsx - Sheet1.csv\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 174,
+ "id": "4c21cd7a-c9da-4c2b-8877-bac38c095960",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "old_enrichments = read.csv( paste0(path, \"supptable15.TFenrichment_co-eGenes.xlsx - Sheet1.csv\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 175,
+ "id": "8b28f114-298e-4170-9e98-0644683d93fd",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "963"
+ ],
+ "text/latex": [
+ "963"
+ ],
+ "text/markdown": [
+ "963"
+ ],
+ "text/plain": [
+ "[1] 963"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "nrow(old_enrichments)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 176,
+ "id": "f159bb08-15b4-4a74-86a0-dd723fcc83b0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "A data.frame: 2 × 13\n",
+ "\n",
+ "\t | Cell.type | eQTL..SNP.eGene. | TF | TF.is.a.co.eGene. | enrichment.p.value | X..TF.overlap...co.eGene | X..TF.overlap...background | X..no.TF.overlap...co.eGene | X..background.gene...not.co.eGene | enrichment.fdr | eQTL.SNP | SNP.overlaps.TF. | Names.of.overlapping.SNPs |
\n",
+ "\t | <chr> | <chr> | <chr> | <lgl> | <dbl> | <int> | <int> | <int> | <int> | <dbl> | <chr> | <lgl> | <chr> |
\n",
+ "\n",
+ "\n",
+ "\t1 | CD4T | rs111454690_HLA-DRB5 | CDK8 | FALSE | 9.630369e-06 | 14 | 5 | 2778 | 8515 | 1.640373e-03 | rs111454690 | FALSE | |
\n",
+ "\t2 | CD4T | rs111454690_HLA-DRB5 | SNRNP70 | FALSE | 1.209254e-09 | 11 | 8 | 649 | 10644 | 6.179288e-07 | rs111454690 | FALSE | |
\n",
+ "\n",
+ "
\n"
+ ],
+ "text/latex": [
+ "A data.frame: 2 × 13\n",
+ "\\begin{tabular}{r|lllllllllllll}\n",
+ " & Cell.type & eQTL..SNP.eGene. & TF & TF.is.a.co.eGene. & enrichment.p.value & X..TF.overlap...co.eGene & X..TF.overlap...background & X..no.TF.overlap...co.eGene & X..background.gene...not.co.eGene & enrichment.fdr & eQTL.SNP & SNP.overlaps.TF. & Names.of.overlapping.SNPs\\\\\n",
+ " & & & & & & & & & & & & & \\\\\n",
+ "\\hline\n",
+ "\t1 & CD4T & rs111454690\\_HLA-DRB5 & CDK8 & FALSE & 9.630369e-06 & 14 & 5 & 2778 & 8515 & 1.640373e-03 & rs111454690 & FALSE & \\\\\n",
+ "\t2 & CD4T & rs111454690\\_HLA-DRB5 & SNRNP70 & FALSE & 1.209254e-09 & 11 & 8 & 649 & 10644 & 6.179288e-07 & rs111454690 & FALSE & \\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/markdown": [
+ "\n",
+ "A data.frame: 2 × 13\n",
+ "\n",
+ "| | Cell.type <chr> | eQTL..SNP.eGene. <chr> | TF <chr> | TF.is.a.co.eGene. <lgl> | enrichment.p.value <dbl> | X..TF.overlap...co.eGene <int> | X..TF.overlap...background <int> | X..no.TF.overlap...co.eGene <int> | X..background.gene...not.co.eGene <int> | enrichment.fdr <dbl> | eQTL.SNP <chr> | SNP.overlaps.TF. <lgl> | Names.of.overlapping.SNPs <chr> |\n",
+ "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
+ "| 1 | CD4T | rs111454690_HLA-DRB5 | CDK8 | FALSE | 9.630369e-06 | 14 | 5 | 2778 | 8515 | 1.640373e-03 | rs111454690 | FALSE | |\n",
+ "| 2 | CD4T | rs111454690_HLA-DRB5 | SNRNP70 | FALSE | 1.209254e-09 | 11 | 8 | 649 | 10644 | 6.179288e-07 | rs111454690 | FALSE | |\n",
+ "\n"
+ ],
+ "text/plain": [
+ " Cell.type eQTL..SNP.eGene. TF TF.is.a.co.eGene. enrichment.p.value\n",
+ "1 CD4T rs111454690_HLA-DRB5 CDK8 FALSE 9.630369e-06 \n",
+ "2 CD4T rs111454690_HLA-DRB5 SNRNP70 FALSE 1.209254e-09 \n",
+ " X..TF.overlap...co.eGene X..TF.overlap...background\n",
+ "1 14 5 \n",
+ "2 11 8 \n",
+ " X..no.TF.overlap...co.eGene X..background.gene...not.co.eGene enrichment.fdr\n",
+ "1 2778 8515 1.640373e-03 \n",
+ "2 649 10644 6.179288e-07 \n",
+ " eQTL.SNP SNP.overlaps.TF. Names.of.overlapping.SNPs\n",
+ "1 rs111454690 FALSE \n",
+ "2 rs111454690 FALSE "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "head(old_enrichments,2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 177,
+ "id": "c7fc8d4c-f820-4793-b58d-b194e56d6b4c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Check out some results of ReMap mentioned in paper"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 178,
+ "id": "4b439eaa-c15a-4917-9ae8-4df0afd7475a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "0.0132347204977557"
+ ],
+ "text/latex": [
+ "0.0132347204977557"
+ ],
+ "text/markdown": [
+ "0.0132347204977557"
+ ],
+ "text/plain": [
+ "[1] 0.01323472"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "max(old_enrichments$enrichment.p.value)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 179,
+ "id": "7899b194-8054-4a2a-b9d7-3e051f7bc380",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "0.0498313855060291"
+ ],
+ "text/latex": [
+ "0.0498313855060291"
+ ],
+ "text/markdown": [
+ "0.0498313855060291"
+ ],
+ "text/plain": [
+ "[1] 0.04983139"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "max(old_enrichments$enrichment.fdr)\n",
+ "# check to use same cut-off for TRANSFAC --> 0.05"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 180,
+ "id": "384f4ce7-13c5-4230-a0d7-5aaa262d1112",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "- 'rs111454690_HLA-DRB5'
- 'rs1131017_RPS26'
- 'rs4147638_SMDT1'
- 'rs7605824_SH3YL1'
- 'rs7632486_CMTM8'
- 'rs9271520_HLA-DQA2'
- 'rs1131017_RPS26_positive'
- 'rs1131017_RPS26_negative'
- 'rs6708265_PASK'
\n"
+ ],
+ "text/latex": [
+ "\\begin{enumerate*}\n",
+ "\\item 'rs111454690\\_HLA-DRB5'\n",
+ "\\item 'rs1131017\\_RPS26'\n",
+ "\\item 'rs4147638\\_SMDT1'\n",
+ "\\item 'rs7605824\\_SH3YL1'\n",
+ "\\item 'rs7632486\\_CMTM8'\n",
+ "\\item 'rs9271520\\_HLA-DQA2'\n",
+ "\\item 'rs1131017\\_RPS26\\_positive'\n",
+ "\\item 'rs1131017\\_RPS26\\_negative'\n",
+ "\\item 'rs6708265\\_PASK'\n",
+ "\\end{enumerate*}\n"
+ ],
+ "text/markdown": [
+ "1. 'rs111454690_HLA-DRB5'\n",
+ "2. 'rs1131017_RPS26'\n",
+ "3. 'rs4147638_SMDT1'\n",
+ "4. 'rs7605824_SH3YL1'\n",
+ "5. 'rs7632486_CMTM8'\n",
+ "6. 'rs9271520_HLA-DQA2'\n",
+ "7. 'rs1131017_RPS26_positive'\n",
+ "8. 'rs1131017_RPS26_negative'\n",
+ "9. 'rs6708265_PASK'\n",
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "[1] \"rs111454690_HLA-DRB5\" \"rs1131017_RPS26\" \n",
+ "[3] \"rs4147638_SMDT1\" \"rs7605824_SH3YL1\" \n",
+ "[5] \"rs7632486_CMTM8\" \"rs9271520_HLA-DQA2\" \n",
+ "[7] \"rs1131017_RPS26_positive\" \"rs1131017_RPS26_negative\"\n",
+ "[9] \"rs6708265_PASK\" "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "unique(old_enrichments[,c( 'eQTL..SNP.eGene.')])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 181,
+ "id": "6810f798-9fa5-4aea-9954-b517aa0b49b8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "9"
+ ],
+ "text/latex": [
+ "9"
+ ],
+ "text/markdown": [
+ "9"
+ ],
+ "text/plain": [
+ "[1] 9"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "length(unique(old_enrichments[,c( 'eQTL..SNP.eGene.')])) # subtract positive and negative case for RPS26 --> yields the 7 mentioned in paper for which there were significant TF enrichments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 182,
+ "id": "528cc4ad-2052-42b3-a6a4-b47bcfaa2bbb",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "- 'rs1131017_RPS26'
- 'rs4147638_SMDT1'
- 'rs7605824_SH3YL1'
- 'rs9271520_HLA-DQA2'
- 'rs1131017_RPS26_positive'
- 'rs1131017_RPS26_negative'
\n"
+ ],
+ "text/latex": [
+ "\\begin{enumerate*}\n",
+ "\\item 'rs1131017\\_RPS26'\n",
+ "\\item 'rs4147638\\_SMDT1'\n",
+ "\\item 'rs7605824\\_SH3YL1'\n",
+ "\\item 'rs9271520\\_HLA-DQA2'\n",
+ "\\item 'rs1131017\\_RPS26\\_positive'\n",
+ "\\item 'rs1131017\\_RPS26\\_negative'\n",
+ "\\end{enumerate*}\n"
+ ],
+ "text/markdown": [
+ "1. 'rs1131017_RPS26'\n",
+ "2. 'rs4147638_SMDT1'\n",
+ "3. 'rs7605824_SH3YL1'\n",
+ "4. 'rs9271520_HLA-DQA2'\n",
+ "5. 'rs1131017_RPS26_positive'\n",
+ "6. 'rs1131017_RPS26_negative'\n",
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "[1] \"rs1131017_RPS26\" \"rs4147638_SMDT1\" \n",
+ "[3] \"rs7605824_SH3YL1\" \"rs9271520_HLA-DQA2\" \n",
+ "[5] \"rs1131017_RPS26_positive\" \"rs1131017_RPS26_negative\""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "unique(old_enrichments[old_enrichments$SNP.overlaps.TF. == TRUE,c('eQTL..SNP.eGene.')]) # results in the 4 pairs mentioned in paper"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 183,
+ "id": "1e28ceb8-6515-4561-ac51-f1ab0860ad36",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## rs1131017–RPS26 examples: RBM39, TCF7, LEF1, KLF6, CD74, MAF"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 184,
+ "id": "0a75dc32-fc85-4e53-ad4a-fa630f0460d4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "A data.frame: 7 × 13\n",
+ "\n",
+ "\t | Cell.type | eQTL..SNP.eGene. | TF | TF.is.a.co.eGene. | enrichment.p.value | X..TF.overlap...co.eGene | X..TF.overlap...background | X..no.TF.overlap...co.eGene | X..background.gene...not.co.eGene | enrichment.fdr | eQTL.SNP | SNP.overlaps.TF. | Names.of.overlapping.SNPs |
\n",
+ "\t | <chr> | <chr> | <chr> | <lgl> | <dbl> | <int> | <int> | <int> | <int> | <dbl> | <chr> | <lgl> | <chr> |
\n",
+ "\n",
+ "\n",
+ "\t19 | CD4T | rs1131017_RPS26 | MAF | TRUE | 3.654557e-06 | 92 | 280 | 1747 | 9546 | 5.187441e-05 | rs1131017 | TRUE | rs1131017 |
\n",
+ "\t34 | CD4T | rs1131017_RPS26 | RBM39 | TRUE | 2.128100e-06 | 244 | 128 | 6041 | 5252 | 3.295330e-05 | rs1131017 | TRUE | rs10876864,rs1131017,rs7297175 |
\n",
+ "\t50 | CD4T | rs1131017_RPS26 | TCF7 | TRUE | 7.468026e-03 | 134 | 238 | 3379 | 7914 | 3.052929e-02 | rs1131017 | TRUE | rs1131017 |
\n",
+ "\t84 | CD4T | rs1131017_RPS26 | LEF1 | TRUE | 4.859147e-05 | 153 | 219 | 3529 | 7764 | 4.598193e-04 | rs1131017 | TRUE | rs10876864,rs1131017 |
\n",
+ "\t116 | CD4T | rs1131017_RPS26 | KLF6 | TRUE | 1.597304e-03 | 139 | 233 | 3385 | 7908 | 8.236538e-03 | rs1131017 | TRUE | rs10876864,rs1131017,rs7297175 |
\n",
+ "\t119 | CD4T | rs1131017_RPS26 | CD74 | TRUE | 3.954534e-06 | 172 | 200 | 3915 | 7378 | 5.461532e-05 | rs1131017 | TRUE | rs1131017 |
\n",
+ "\t730 | monocyte | rs1131017_RPS26 | CD74 | TRUE | 7.422301e-03 | 63 | 69 | 3526 | 6028 | 3.134542e-02 | rs1131017 | TRUE | rs1131017 |
\n",
+ "\n",
+ "
\n"
+ ],
+ "text/latex": [
+ "A data.frame: 7 × 13\n",
+ "\\begin{tabular}{r|lllllllllllll}\n",
+ " & Cell.type & eQTL..SNP.eGene. & TF & TF.is.a.co.eGene. & enrichment.p.value & X..TF.overlap...co.eGene & X..TF.overlap...background & X..no.TF.overlap...co.eGene & X..background.gene...not.co.eGene & enrichment.fdr & eQTL.SNP & SNP.overlaps.TF. & Names.of.overlapping.SNPs\\\\\n",
+ " & & & & & & & & & & & & & \\\\\n",
+ "\\hline\n",
+ "\t19 & CD4T & rs1131017\\_RPS26 & MAF & TRUE & 3.654557e-06 & 92 & 280 & 1747 & 9546 & 5.187441e-05 & rs1131017 & TRUE & rs1131017 \\\\\n",
+ "\t34 & CD4T & rs1131017\\_RPS26 & RBM39 & TRUE & 2.128100e-06 & 244 & 128 & 6041 & 5252 & 3.295330e-05 & rs1131017 & TRUE & rs10876864,rs1131017,rs7297175\\\\\n",
+ "\t50 & CD4T & rs1131017\\_RPS26 & TCF7 & TRUE & 7.468026e-03 & 134 & 238 & 3379 & 7914 & 3.052929e-02 & rs1131017 & TRUE & rs1131017 \\\\\n",
+ "\t84 & CD4T & rs1131017\\_RPS26 & LEF1 & TRUE & 4.859147e-05 & 153 & 219 & 3529 & 7764 & 4.598193e-04 & rs1131017 & TRUE & rs10876864,rs1131017 \\\\\n",
+ "\t116 & CD4T & rs1131017\\_RPS26 & KLF6 & TRUE & 1.597304e-03 & 139 & 233 & 3385 & 7908 & 8.236538e-03 & rs1131017 & TRUE & rs10876864,rs1131017,rs7297175\\\\\n",
+ "\t119 & CD4T & rs1131017\\_RPS26 & CD74 & TRUE & 3.954534e-06 & 172 & 200 & 3915 & 7378 & 5.461532e-05 & rs1131017 & TRUE & rs1131017 \\\\\n",
+ "\t730 & monocyte & rs1131017\\_RPS26 & CD74 & TRUE & 7.422301e-03 & 63 & 69 & 3526 & 6028 & 3.134542e-02 & rs1131017 & TRUE & rs1131017 \\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/markdown": [
+ "\n",
+ "A data.frame: 7 × 13\n",
+ "\n",
+ "| | Cell.type <chr> | eQTL..SNP.eGene. <chr> | TF <chr> | TF.is.a.co.eGene. <lgl> | enrichment.p.value <dbl> | X..TF.overlap...co.eGene <int> | X..TF.overlap...background <int> | X..no.TF.overlap...co.eGene <int> | X..background.gene...not.co.eGene <int> | enrichment.fdr <dbl> | eQTL.SNP <chr> | SNP.overlaps.TF. <lgl> | Names.of.overlapping.SNPs <chr> |\n",
+ "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
+ "| 19 | CD4T | rs1131017_RPS26 | MAF | TRUE | 3.654557e-06 | 92 | 280 | 1747 | 9546 | 5.187441e-05 | rs1131017 | TRUE | rs1131017 |\n",
+ "| 34 | CD4T | rs1131017_RPS26 | RBM39 | TRUE | 2.128100e-06 | 244 | 128 | 6041 | 5252 | 3.295330e-05 | rs1131017 | TRUE | rs10876864,rs1131017,rs7297175 |\n",
+ "| 50 | CD4T | rs1131017_RPS26 | TCF7 | TRUE | 7.468026e-03 | 134 | 238 | 3379 | 7914 | 3.052929e-02 | rs1131017 | TRUE | rs1131017 |\n",
+ "| 84 | CD4T | rs1131017_RPS26 | LEF1 | TRUE | 4.859147e-05 | 153 | 219 | 3529 | 7764 | 4.598193e-04 | rs1131017 | TRUE | rs10876864,rs1131017 |\n",
+ "| 116 | CD4T | rs1131017_RPS26 | KLF6 | TRUE | 1.597304e-03 | 139 | 233 | 3385 | 7908 | 8.236538e-03 | rs1131017 | TRUE | rs10876864,rs1131017,rs7297175 |\n",
+ "| 119 | CD4T | rs1131017_RPS26 | CD74 | TRUE | 3.954534e-06 | 172 | 200 | 3915 | 7378 | 5.461532e-05 | rs1131017 | TRUE | rs1131017 |\n",
+ "| 730 | monocyte | rs1131017_RPS26 | CD74 | TRUE | 7.422301e-03 | 63 | 69 | 3526 | 6028 | 3.134542e-02 | rs1131017 | TRUE | rs1131017 |\n",
+ "\n"
+ ],
+ "text/plain": [
+ " Cell.type eQTL..SNP.eGene. TF TF.is.a.co.eGene. enrichment.p.value\n",
+ "19 CD4T rs1131017_RPS26 MAF TRUE 3.654557e-06 \n",
+ "34 CD4T rs1131017_RPS26 RBM39 TRUE 2.128100e-06 \n",
+ "50 CD4T rs1131017_RPS26 TCF7 TRUE 7.468026e-03 \n",
+ "84 CD4T rs1131017_RPS26 LEF1 TRUE 4.859147e-05 \n",
+ "116 CD4T rs1131017_RPS26 KLF6 TRUE 1.597304e-03 \n",
+ "119 CD4T rs1131017_RPS26 CD74 TRUE 3.954534e-06 \n",
+ "730 monocyte rs1131017_RPS26 CD74 TRUE 7.422301e-03 \n",
+ " X..TF.overlap...co.eGene X..TF.overlap...background\n",
+ "19 92 280 \n",
+ "34 244 128 \n",
+ "50 134 238 \n",
+ "84 153 219 \n",
+ "116 139 233 \n",
+ "119 172 200 \n",
+ "730 63 69 \n",
+ " X..no.TF.overlap...co.eGene X..background.gene...not.co.eGene\n",
+ "19 1747 9546 \n",
+ "34 6041 5252 \n",
+ "50 3379 7914 \n",
+ "84 3529 7764 \n",
+ "116 3385 7908 \n",
+ "119 3915 7378 \n",
+ "730 3526 6028 \n",
+ " enrichment.fdr eQTL.SNP SNP.overlaps.TF. Names.of.overlapping.SNPs \n",
+ "19 5.187441e-05 rs1131017 TRUE rs1131017 \n",
+ "34 3.295330e-05 rs1131017 TRUE rs10876864,rs1131017,rs7297175\n",
+ "50 3.052929e-02 rs1131017 TRUE rs1131017 \n",
+ "84 4.598193e-04 rs1131017 TRUE rs10876864,rs1131017 \n",
+ "116 8.236538e-03 rs1131017 TRUE rs10876864,rs1131017,rs7297175\n",
+ "119 5.461532e-05 rs1131017 TRUE rs1131017 \n",
+ "730 3.134542e-02 rs1131017 TRUE rs1131017 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "old_enrichments[(old_enrichments$eQTL..SNP.eGene. %in% c('rs1131017_RPS26')) & (old_enrichments$TF.is.a.co.eGene. == TRUE) & ((old_enrichments$SNP.overlaps.TF. == TRUE)),]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 185,
+ "id": "973e7d59-4d0d-4aaa-90a6-5ec718d3cdc7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# MAF and CD74 only negative effect directions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 186,
+ "id": "41107400-6b96-4024-939d-bffa6790c377",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# TMEM176A nothing found with remap"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5c479bb0-aeee-4286-8107-70eaebb4968e",
+ "metadata": {},
+ "source": [
+ "# Run TRANSFAC enrichment for all cell-types"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 213,
+ "id": "7ee02d44-c7a6-4c44-8957-794c603da722",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### Set parameters for function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 214,
+ "id": "5b6b137d-6865-4bea-bf5d-94bdbf71c96b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "p_val_thres = 0.05"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 215,
+ "id": "aea1119e-9ccb-40e4-a43c-ad3022bf4281",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "correction_var = 'fdr'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 216,
+ "id": "52ba5757-2a17-4a25-9975-f73fe9d49888",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### Decide on whether to restrict the background set\n",
+ "restrict_background_set = FALSE\n",
+ "\n",
+ "# set to TRUE for adaption"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 217,
+ "id": "5fd364b9-2b0d-40a6-aadc-6c32925e8a33",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### Run enrichments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 218,
+ "id": "33a0acf7-c544-4278-b8e0-60506109d9aa",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ },
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"DC with 58 co-eQTLs\"\n",
+ "[1] \"rs7935082_MS4A7\"\n",
+ "[1] 6054\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs9271520_HLA-DQA2\"\n",
+ "[1] 6054\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"CD4T with 500 co-eQTLs\"\n",
+ "[1] \"rs111454690_HLA-DRB5\"\n",
+ "[1] 11300\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs1131017_RPS26\"\n",
+ "[1] 11300\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs2741159_KRT1\"\n",
+ "[1] 11300\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs4147638_SMDT1\"\n",
+ "[1] 11300\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs7605824_SH3YL1\"\n",
+ "[1] 11300\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs7632486_CMTM8\"\n",
+ "[1] 11300\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs9022_CLN8\"\n",
+ "[1] 11300\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs9271520_HLA-DQA2\"\n",
+ "[1] 11300\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n",
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"CD8T with 420 co-eQTLs\"\n",
+ "[1] \"rs1131017_RPS26\"\n",
+ "[1] 9579\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs4147638_SMDT1\"\n",
+ "[1] 9579\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs6708265_PASK\"\n",
+ "[1] 9579\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs7605824_SH3YL1\"\n",
+ "[1] 9579\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs9271520_HLA-DQA2\"\n",
+ "[1] 9579\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs9306156_PRMT2\"\n",
+ "[1] 9579\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"monocyte with 281 co-eQTLs\"\n",
+ "[1] \"rs111454690_HLA-DRB5\"\n",
+ "[1] 9557\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs1131017_RPS26\"\n",
+ "[1] 9557\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs11577318_CD52\"\n",
+ "[1] 9557\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs3758833_CTSC\"\n",
+ "[1] 9557\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs4782899_DNAAF1\"\n",
+ "[1] 9557\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs5756736_LGALS2\"\n",
+ "[1] 9557\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs7806458_TMEM176A\"\n",
+ "[1] 9557\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs7806458_TMEM176B\"\n",
+ "[1] 9557\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs9271520_HLA-DQA2\"\n",
+ "[1] 9557\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"NK with 123 co-eQTLs\"\n",
+ "[1] \"rs1131017_RPS26\"\n",
+ "[1] 7271\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs12151742_GNLY\"\n",
+ "[1] 7271\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"rs62480001_MYOM2\"\n",
+ "[1] 7271\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n",
+ "No results to show\n",
+ "Please make sure that the organism is correct or set significant = FALSE\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"B with 35 co-eQTLs\"\n",
+ "[1] \"rs1131017_RPS26\"\n",
+ "[1] 1729\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Detected custom background input, domain scope is set to 'custom'\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Run a g:Profiler enrichment limited to TRANSFAC ('TF') terms for one gene set.\n",
+    "# correction_var and p_val_thres are the notebook-level settings defined above.\n",
+    "# Returns the gost() output, or NULL if the query is empty or gost() returned NULL.\n",
+    "run_tf_enrichment <- function(query_genes, custom_background){\n",
+    "    if(length(query_genes) == 0){\n",
+    "        return(NULL) # nothing to test\n",
+    "    }\n",
+    "    gost(\n",
+    "        query_genes,\n",
+    "        organism = \"hsapiens\",\n",
+    "        ordered_query = FALSE,\n",
+    "        multi_query = FALSE,\n",
+    "        significant = TRUE,\n",
+    "        exclude_iea = FALSE,\n",
+    "        measure_underrepresentation = FALSE,\n",
+    "        evcodes = FALSE,\n",
+    "        correction_method = correction_var,\n",
+    "        user_threshold = p_val_thres,\n",
+    "        custom_bg = custom_background,\n",
+    "        sources = 'TF' # only do transfac enrichment\n",
+    "    )\n",
+    "}\n",
+    "\n",
+    "# Background gene set for one eQTL: with restrict_background_set == TRUE only the\n",
+    "# genes tested together with this eQTL, otherwise all genes tested in the cell type.\n",
+    "select_background <- function(coeqtls, eqtl, cell_type_background){\n",
+    "    if(restrict_background_set == TRUE){\n",
+    "        tested_with_eqtl <- coeqtls$snp_eqtlgene == eqtl\n",
+    "        # The eQTL gene column is addressed as eqtlgen, exactly as in the main loop below\n",
+    "        return(unique(c(coeqtls$eqtlgen[tested_with_eqtl],\n",
+    "                        coeqtls$second_gene[tested_with_eqtl])))\n",
+    "    }\n",
+    "    cell_type_background\n",
+    "}\n",
+    "\n",
+    "# Extract the TF rows of a gost() output and annotate them with cell type and eQTL label.\n",
+    "label_enrichment <- function(enrich_out, cell_type, label){\n",
+    "    res<-enrich_out$result[enrich_out$result$source == 'TF',]\n",
+    "    res$cell_type<-cell_type\n",
+    "    res$snp_eGene<-label\n",
+    "    res\n",
+    "}\n",
+    "\n",
+    "enrichment<-NULL\n",
+    "enrichment_summary<-NULL\n",
+    "coegenes_counts_total<-NULL\n",
+    "for(cell_type in c(\"DC\",\"CD4T\",\"CD8T\",\"monocyte\",\"NK\",\"B\" )){\n",
+    "    # Read in the data\n",
+    "    coeqtls <- fread(paste0(path, \"UT_\",cell_type, \n",
+    "                        \"_coeqtls_fullresults_fixed.all.tsv.gz\"))\n",
+    "\n",
+    "    # Gene holds a ';'-separated gene pair; second_gene is the partner of the eQTL gene\n",
+    "    coeqtls$gene1<-gsub(\";.*\",\"\",coeqtls$Gene)\n",
+    "    coeqtls$gene2<-gsub(\".*;\",\"\",coeqtls$Gene)\n",
+    "    coeqtls$second_gene<-ifelse(coeqtls$gene1 == coeqtls$eqtlgen, coeqtls$gene2,\n",
+    "                               coeqtls$gene1)\n",
+    "    coeqtls$gene1<-NULL\n",
+    "    coeqtls$gene2<-NULL\n",
+    "\n",
+    "    # Take all tested genes as background (cell-type wide; never overwritten per eQTL)\n",
+    "    background_genes <- union(coeqtls$eqtlgen,coeqtls$second_gene)\n",
+    "\n",
+    "    coeqtls_sign<-coeqtls[coeqtls$gene2_isSig,]\n",
+    "\n",
+    "    print(paste(cell_type,\"with\",nrow(coeqtls_sign),\"co-eQTLs\"))\n",
+    "\n",
+    "    # Identify all eQTLs with at least 5 coeGenes\n",
+    "    coegene_count<-coeqtls_sign%>%\n",
+    "        group_by(snp_eqtlgene)%>%\n",
+    "        summarise(count_coeGenes=n())%>%\n",
+    "        filter(count_coeGenes>4)\n",
+    "\n",
+    "    coegene_count$cell_type<-cell_type\n",
+    "    coegenes_counts_total<-rbind(coegenes_counts_total,\n",
+    "                                coegene_count)\n",
+    "\n",
+    "    enrichment_found<-0\n",
+    "    # Perform the TF enrichment separately for each eQTL\n",
+    "    for(eqtl in coegene_count$snp_eqtlgene){\n",
+    "        print(eqtl)\n",
+    "\n",
+    "        # Optional restricted background set, chosen per eQTL\n",
+    "        eqtl_background <- select_background(coeqtls, eqtl, background_genes)\n",
+    "        print(length(eqtl_background))\n",
+    "\n",
+    "        # Run enrichment analysis with background set\n",
+    "        enrich_out <- run_tf_enrichment(\n",
+    "            coeqtls_sign$second_gene[coeqtls_sign$snp_eqtlgene == eqtl],\n",
+    "            eqtl_background)\n",
+    "\n",
+    "        if(!is.null(enrich_out)){\n",
+    "            # Count the eQTL as enriched and save its result rows\n",
+    "            enrichment_found<-enrichment_found+1\n",
+    "            res<-label_enrichment(enrich_out, cell_type, eqtl)\n",
+    "            enrichment<-rbind(enrichment,\n",
+    "                              res)\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    enrichment_summary<-rbind(enrichment_summary,\n",
+    "                             data.frame(cell_type,\n",
+    "                                       n_eqtls_freq=nrow(coegene_count),\n",
+    "                                       n_enrich=enrichment_found,\n",
+    "                                       freq_enrich=enrichment_found/nrow(coegene_count)))\n",
+    "\n",
+    "    # Check for CD4T specifically for RPS26 the positive & negative coeGenes separately.\n",
+    "    # These two extra tests are not counted in enrichment_summary.\n",
+    "    if(cell_type==\"CD4T\"){\n",
+    "        eqtl<-\"rs1131017_RPS26\"\n",
+    "        is_eqtl <- coeqtls_sign$snp_eqtlgene == eqtl\n",
+    "\n",
+    "        # Use the background belonging to this eQTL, not the one left over from the loop above\n",
+    "        eqtl_background <- select_background(coeqtls, eqtl, background_genes)\n",
+    "\n",
+    "        # Labels kept from the original analysis: MetaPZ < 0 is 'positive', MetaPZ > 0 is\n",
+    "        # 'negative' (original note: MAF not correctly flipped here)\n",
+    "        direction_masks <- list(positive = is_eqtl & coeqtls_sign$MetaPZ < 0,\n",
+    "                                negative = is_eqtl & coeqtls_sign$MetaPZ > 0)\n",
+    "\n",
+    "        for(direction in names(direction_masks)){\n",
+    "            enrich_out <- run_tf_enrichment(\n",
+    "                coeqtls_sign$second_gene[direction_masks[[direction]]],\n",
+    "                eqtl_background)\n",
+    "\n",
+    "            if(!is.null(enrich_out)){\n",
+    "                # Save result dataframe\n",
+    "                res<-label_enrichment(enrich_out, cell_type,\n",
+    "                                      paste0(eqtl,\"_\",direction))\n",
+    "                enrichment<-rbind(enrichment,\n",
+    "                                  res)\n",
+    "            }\n",
+    "        }\n",
+    "    }\n",
+    "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 219,
+ "id": "2161ef04-9fdc-4e14-b318-089eb91db546",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### Inspect result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 220,
+ "id": "0ae39aef-8d0c-4f59-821e-d1672811783e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "A data.frame: 6 × 16\n",
+ "\n",
+ "\t | query | significant | p_value | term_size | query_size | intersection_size | precision | recall | term_id | source | term_name | effective_domain_size | source_order | parents | cell_type | snp_eGene |
\n",
+ "\t | <chr> | <lgl> | <dbl> | <int> | <int> | <int> | <dbl> | <dbl> | <chr> | <chr> | <chr> | <int> | <int> | <list> | <chr> | <chr> |
\n",
+ "\n",
+ "\n",
+ "\t1 | query_1 | TRUE | 0.049610826 | 2342 | 27 | 22 | 0.8148148 | 0.009393681 | TF:M00665 | TF | Factor: Sp3; motif: ASMCTTGGGSRGGG | 5705 | 7882 | TF:M00000 | DC | rs7935082_MS4A7 |
\n",
+ "\t2 | query_1 | TRUE | 0.049610826 | 2303 | 27 | 22 | 0.8148148 | 0.009552757 | TF:M03582 | TF | Factor: TWIST; motif: CACCTGG | 5705 | 8844 | TF:M00000 | DC | rs7935082_MS4A7 |
\n",
+ "\t3 | query_1 | TRUE | 0.003978389 | 3447 | 351 | 163 | 0.4643875 | 0.047287496 | TF:M11438 | TF | Factor: SAP-1; motif: NTCGTAAATGCN | 10167 | 1882 | TF:M00000 | CD4T | rs1131017_RPS26 |
\n",
+ "\t4 | query_1 | TRUE | 0.022537569 | 3025 | 20 | 16 | 0.8000000 | 0.005289256 | TF:M08413 | TF | Factor: TEF-3:C/EBPdelta; motif: RGWATGYNRTTRCGYAAY | 10167 | 8434 | TF:M00000 | CD4T | rs7605824_SH3YL1 |
\n",
+ "\t5 | query_1 | TRUE | 0.002470867 | 3285 | 191 | 95 | 0.4973822 | 0.028919330 | TF:M10785 | TF | Factor: hoxa9; motif: RTCGTWANNN | 10167 | 3774 | TF:M00000 | CD4T | rs1131017_RPS26_positive |
\n",
+ "\t6 | query_1 | TRUE | 0.003339438 | 1184 | 191 | 46 | 0.2408377 | 0.038851351 | TF:M04696_1 | TF | Factor: YY1; motif: GCCGCCATNTTGNNNNNGGNCN; match class: 1 | 10167 | 9013 | TF:M04696 | CD4T | rs1131017_RPS26_positive |
\n",
+ "\n",
+ "
\n"
+ ],
+ "text/latex": [
+ "A data.frame: 6 × 16\n",
+ "\\begin{tabular}{r|llllllllllllllll}\n",
+ " & query & significant & p\\_value & term\\_size & query\\_size & intersection\\_size & precision & recall & term\\_id & source & term\\_name & effective\\_domain\\_size & source\\_order & parents & cell\\_type & snp\\_eGene\\\\\n",
+ " & & & & & & & & & & & & & & & & \\\\\n",
+ "\\hline\n",
+ "\t1 & query\\_1 & TRUE & 0.049610826 & 2342 & 27 & 22 & 0.8148148 & 0.009393681 & TF:M00665 & TF & Factor: Sp3; motif: ASMCTTGGGSRGGG & 5705 & 7882 & TF:M00000 & DC & rs7935082\\_MS4A7 \\\\\n",
+ "\t2 & query\\_1 & TRUE & 0.049610826 & 2303 & 27 & 22 & 0.8148148 & 0.009552757 & TF:M03582 & TF & Factor: TWIST; motif: CACCTGG & 5705 & 8844 & TF:M00000 & DC & rs7935082\\_MS4A7 \\\\\n",
+ "\t3 & query\\_1 & TRUE & 0.003978389 & 3447 & 351 & 163 & 0.4643875 & 0.047287496 & TF:M11438 & TF & Factor: SAP-1; motif: NTCGTAAATGCN & 10167 & 1882 & TF:M00000 & CD4T & rs1131017\\_RPS26 \\\\\n",
+ "\t4 & query\\_1 & TRUE & 0.022537569 & 3025 & 20 & 16 & 0.8000000 & 0.005289256 & TF:M08413 & TF & Factor: TEF-3:C/EBPdelta; motif: RGWATGYNRTTRCGYAAY & 10167 & 8434 & TF:M00000 & CD4T & rs7605824\\_SH3YL1 \\\\\n",
+ "\t5 & query\\_1 & TRUE & 0.002470867 & 3285 & 191 & 95 & 0.4973822 & 0.028919330 & TF:M10785 & TF & Factor: hoxa9; motif: RTCGTWANNN & 10167 & 3774 & TF:M00000 & CD4T & rs1131017\\_RPS26\\_positive\\\\\n",
+ "\t6 & query\\_1 & TRUE & 0.003339438 & 1184 & 191 & 46 & 0.2408377 & 0.038851351 & TF:M04696\\_1 & TF & Factor: YY1; motif: GCCGCCATNTTGNNNNNGGNCN; match class: 1 & 10167 & 9013 & TF:M04696 & CD4T & rs1131017\\_RPS26\\_positive\\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/markdown": [
+ "\n",
+ "A data.frame: 6 × 16\n",
+ "\n",
+ "| | query <chr> | significant <lgl> | p_value <dbl> | term_size <int> | query_size <int> | intersection_size <int> | precision <dbl> | recall <dbl> | term_id <chr> | source <chr> | term_name <chr> | effective_domain_size <int> | source_order <int> | parents <list> | cell_type <chr> | snp_eGene <chr> |\n",
+ "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
+ "| 1 | query_1 | TRUE | 0.049610826 | 2342 | 27 | 22 | 0.8148148 | 0.009393681 | TF:M00665 | TF | Factor: Sp3; motif: ASMCTTGGGSRGGG | 5705 | 7882 | TF:M00000 | DC | rs7935082_MS4A7 |\n",
+ "| 2 | query_1 | TRUE | 0.049610826 | 2303 | 27 | 22 | 0.8148148 | 0.009552757 | TF:M03582 | TF | Factor: TWIST; motif: CACCTGG | 5705 | 8844 | TF:M00000 | DC | rs7935082_MS4A7 |\n",
+ "| 3 | query_1 | TRUE | 0.003978389 | 3447 | 351 | 163 | 0.4643875 | 0.047287496 | TF:M11438 | TF | Factor: SAP-1; motif: NTCGTAAATGCN | 10167 | 1882 | TF:M00000 | CD4T | rs1131017_RPS26 |\n",
+ "| 4 | query_1 | TRUE | 0.022537569 | 3025 | 20 | 16 | 0.8000000 | 0.005289256 | TF:M08413 | TF | Factor: TEF-3:C/EBPdelta; motif: RGWATGYNRTTRCGYAAY | 10167 | 8434 | TF:M00000 | CD4T | rs7605824_SH3YL1 |\n",
+ "| 5 | query_1 | TRUE | 0.002470867 | 3285 | 191 | 95 | 0.4973822 | 0.028919330 | TF:M10785 | TF | Factor: hoxa9; motif: RTCGTWANNN | 10167 | 3774 | TF:M00000 | CD4T | rs1131017_RPS26_positive |\n",
+ "| 6 | query_1 | TRUE | 0.003339438 | 1184 | 191 | 46 | 0.2408377 | 0.038851351 | TF:M04696_1 | TF | Factor: YY1; motif: GCCGCCATNTTGNNNNNGGNCN; match class: 1 | 10167 | 9013 | TF:M04696 | CD4T | rs1131017_RPS26_positive |\n",
+ "\n"
+ ],
+ "text/plain": [
+ " query significant p_value term_size query_size intersection_size\n",
+ "1 query_1 TRUE 0.049610826 2342 27 22 \n",
+ "2 query_1 TRUE 0.049610826 2303 27 22 \n",
+ "3 query_1 TRUE 0.003978389 3447 351 163 \n",
+ "4 query_1 TRUE 0.022537569 3025 20 16 \n",
+ "5 query_1 TRUE 0.002470867 3285 191 95 \n",
+ "6 query_1 TRUE 0.003339438 1184 191 46 \n",
+ " precision recall term_id source\n",
+ "1 0.8148148 0.009393681 TF:M00665 TF \n",
+ "2 0.8148148 0.009552757 TF:M03582 TF \n",
+ "3 0.4643875 0.047287496 TF:M11438 TF \n",
+ "4 0.8000000 0.005289256 TF:M08413 TF \n",
+ "5 0.4973822 0.028919330 TF:M10785 TF \n",
+ "6 0.2408377 0.038851351 TF:M04696_1 TF \n",
+ " term_name \n",
+ "1 Factor: Sp3; motif: ASMCTTGGGSRGGG \n",
+ "2 Factor: TWIST; motif: CACCTGG \n",
+ "3 Factor: SAP-1; motif: NTCGTAAATGCN \n",
+ "4 Factor: TEF-3:C/EBPdelta; motif: RGWATGYNRTTRCGYAAY \n",
+ "5 Factor: hoxa9; motif: RTCGTWANNN \n",
+ "6 Factor: YY1; motif: GCCGCCATNTTGNNNNNGGNCN; match class: 1\n",
+ " effective_domain_size source_order parents cell_type\n",
+ "1 5705 7882 TF:M00000 DC \n",
+ "2 5705 8844 TF:M00000 DC \n",
+ "3 10167 1882 TF:M00000 CD4T \n",
+ "4 10167 8434 TF:M00000 CD4T \n",
+ "5 10167 3774 TF:M00000 CD4T \n",
+ "6 10167 9013 TF:M04696 CD4T \n",
+ " snp_eGene \n",
+ "1 rs7935082_MS4A7 \n",
+ "2 rs7935082_MS4A7 \n",
+ "3 rs1131017_RPS26 \n",
+ "4 rs7605824_SH3YL1 \n",
+ "5 rs1131017_RPS26_positive\n",
+ "6 rs1131017_RPS26_positive"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "head(enrichment)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 221,
+ "id": "724fe97f-5620-4d55-9d2f-154c00f7bcd8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "A data.frame: 6 × 4\n",
+ "\n",
+ "\t | cell_type | n_eqtls_freq | n_enrich | freq_enrich |
\n",
+ "\t | <chr> | <int> | <dbl> | <dbl> |
\n",
+ "\n",
+ "\n",
+ "\t1 | DC | 2 | 1 | 0.5000000 |
\n",
+ "\t2 | CD4T | 8 | 2 | 0.2500000 |
\n",
+ "\t3 | CD8T | 6 | 0 | 0.0000000 |
\n",
+ "\t4 | monocyte | 9 | 1 | 0.1111111 |
\n",
+ "\t5 | NK | 3 | 2 | 0.6666667 |
\n",
+ "\t6 | B | 1 | 1 | 1.0000000 |
\n",
+ "\n",
+ "
\n"
+ ],
+ "text/latex": [
+ "A data.frame: 6 × 4\n",
+ "\\begin{tabular}{r|llll}\n",
+ " & cell\\_type & n\\_eqtls\\_freq & n\\_enrich & freq\\_enrich\\\\\n",
+ " & & & & \\\\\n",
+ "\\hline\n",
+ "\t1 & DC & 2 & 1 & 0.5000000\\\\\n",
+ "\t2 & CD4T & 8 & 2 & 0.2500000\\\\\n",
+ "\t3 & CD8T & 6 & 0 & 0.0000000\\\\\n",
+ "\t4 & monocyte & 9 & 1 & 0.1111111\\\\\n",
+ "\t5 & NK & 3 & 2 & 0.6666667\\\\\n",
+ "\t6 & B & 1 & 1 & 1.0000000\\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/markdown": [
+ "\n",
+ "A data.frame: 6 × 4\n",
+ "\n",
+ "| | cell_type <chr> | n_eqtls_freq <int> | n_enrich <dbl> | freq_enrich <dbl> |\n",
+ "|---|---|---|---|---|\n",
+ "| 1 | DC | 2 | 1 | 0.5000000 |\n",
+ "| 2 | CD4T | 8 | 2 | 0.2500000 |\n",
+ "| 3 | CD8T | 6 | 0 | 0.0000000 |\n",
+ "| 4 | monocyte | 9 | 1 | 0.1111111 |\n",
+ "| 5 | NK | 3 | 2 | 0.6666667 |\n",
+ "| 6 | B | 1 | 1 | 1.0000000 |\n",
+ "\n"
+ ],
+ "text/plain": [
+ " cell_type n_eqtls_freq n_enrich freq_enrich\n",
+ "1 DC 2 1 0.5000000 \n",
+ "2 CD4T 8 2 0.2500000 \n",
+ "3 CD8T 6 0 0.0000000 \n",
+ "4 monocyte 9 1 0.1111111 \n",
+ "5 NK 3 2 0.6666667 \n",
+ "6 B 1 1 1.0000000 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "head(enrichment_summary)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 223,
+ "id": "90e7aea5-2e69-4412-89b7-cbd605fb836d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "A tibble: 6 × 3\n",
+ "\n",
+ "\tsnp_eqtlgene | count_coeGenes | cell_type |
\n",
+ "\t<chr> | <int> | <chr> |
\n",
+ "\n",
+ "\n",
+ "\trs7935082_MS4A7 | 30 | DC |
\n",
+ "\trs9271520_HLA-DQA2 | 13 | DC |
\n",
+ "\trs111454690_HLA-DRB5 | 19 | CD4T |
\n",
+ "\trs1131017_RPS26 | 372 | CD4T |
\n",
+ "\trs2741159_KRT1 | 8 | CD4T |
\n",
+ "\trs4147638_SMDT1 | 19 | CD4T |
\n",
+ "\n",
+ "
\n"
+ ],
+ "text/latex": [
+ "A tibble: 6 × 3\n",
+ "\\begin{tabular}{lll}\n",
+ " snp\\_eqtlgene & count\\_coeGenes & cell\\_type\\\\\n",
+ " & & \\\\\n",
+ "\\hline\n",
+ "\t rs7935082\\_MS4A7 & 30 & DC \\\\\n",
+ "\t rs9271520\\_HLA-DQA2 & 13 & DC \\\\\n",
+ "\t rs111454690\\_HLA-DRB5 & 19 & CD4T\\\\\n",
+ "\t rs1131017\\_RPS26 & 372 & CD4T\\\\\n",
+ "\t rs2741159\\_KRT1 & 8 & CD4T\\\\\n",
+ "\t rs4147638\\_SMDT1 & 19 & CD4T\\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/markdown": [
+ "\n",
+ "A tibble: 6 × 3\n",
+ "\n",
+ "| snp_eqtlgene <chr> | count_coeGenes <int> | cell_type <chr> |\n",
+ "|---|---|---|\n",
+ "| rs7935082_MS4A7 | 30 | DC |\n",
+ "| rs9271520_HLA-DQA2 | 13 | DC |\n",
+ "| rs111454690_HLA-DRB5 | 19 | CD4T |\n",
+ "| rs1131017_RPS26 | 372 | CD4T |\n",
+ "| rs2741159_KRT1 | 8 | CD4T |\n",
+ "| rs4147638_SMDT1 | 19 | CD4T |\n",
+ "\n"
+ ],
+ "text/plain": [
+ " snp_eqtlgene count_coeGenes cell_type\n",
+ "1 rs7935082_MS4A7 30 DC \n",
+ "2 rs9271520_HLA-DQA2 13 DC \n",
+ "3 rs111454690_HLA-DRB5 19 CD4T \n",
+ "4 rs1131017_RPS26 372 CD4T \n",
+ "5 rs2741159_KRT1 8 CD4T \n",
+ "6 rs4147638_SMDT1 19 CD4T "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "head(coegenes_counts_total)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 225,
+ "id": "a7788afe-58c2-41bf-bd53-bec13a912a46",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "0.0496108257480342"
+ ],
+ "text/latex": [
+ "0.0496108257480342"
+ ],
+ "text/markdown": [
+ "0.0496108257480342"
+ ],
+ "text/plain": [
+ "[1] 0.04961083"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "max(enrichment$p_value) # set to same level"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 226,
+ "id": "ba254471-a07d-4b06-9280-970d02582110",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "### Number of enrichments found per cell type at the chosen p-value threshold"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 227,
+ "id": "6729f307-8da0-4700-9b99-d4cc8f9b2fce",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "A grouped_df: 5 × 2\n",
+ "\n",
+ "\tcell_type | n |
\n",
+ "\t<chr> | <int> |
\n",
+ "\n",
+ "\n",
+ "\tB | 3 |
\n",
+ "\tCD4T | 54 |
\n",
+ "\tDC | 2 |
\n",
+ "\tmonocyte | 40 |
\n",
+ "\tNK | 21 |
\n",
+ "\n",
+ "
\n"
+ ],
+ "text/latex": [
+ "A grouped\\_df: 5 × 2\n",
+ "\\begin{tabular}{ll}\n",
+ " cell\\_type & n\\\\\n",
+ " & \\\\\n",
+ "\\hline\n",
+ "\t B & 3\\\\\n",
+ "\t CD4T & 54\\\\\n",
+ "\t DC & 2\\\\\n",
+ "\t monocyte & 40\\\\\n",
+ "\t NK & 21\\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/markdown": [
+ "\n",
+ "A grouped_df: 5 × 2\n",
+ "\n",
+ "| cell_type <chr> | n <int> |\n",
+ "|---|---|\n",
+ "| B | 3 |\n",
+ "| CD4T | 54 |\n",
+ "| DC | 2 |\n",
+ "| monocyte | 40 |\n",
+ "| NK | 21 |\n",
+ "\n"
+ ],
+ "text/plain": [
+ " cell_type n \n",
+ "1 B 3\n",
+ "2 CD4T 54\n",
+ "3 DC 2\n",
+ "4 monocyte 40\n",
+ "5 NK 21"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "enrichment %>% group_by(cell_type) %>% count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 228,
+ "id": "7d3dbfdc-2862-4ead-be5a-7645b8a170ef",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "120"
+ ],
+ "text/latex": [
+ "120"
+ ],
+ "text/markdown": [
+ "120"
+ ],
+ "text/plain": [
+ "[1] 120"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "nrow(enrichment)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 230,
+ "id": "8a481a49-f784-45a6-b640-e33cf122469f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### Save the enrichment result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 231,
+ "id": "f5e929c8-3fd4-4023-8483-3ef1bb3335d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# 'parents' is a list column (type <list> in the head(enrichment) output above);\n",
+    "# drop it so the table can be exported as a flat CSV\n",
+    "enrichment$parents = NULL"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 232,
+ "id": "3db92875-75a3-436f-b26c-e116bb15e0ab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "write.csv(enrichment, paste0(path, \"transfac_results/TRANSFAC_Enrichments.csv\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1c57807-ebfa-4ef6-ad86-4022bcee0fe0",
+ "metadata": {},
+ "source": [
+ "# Compare to previous enrichment results with Remap"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 233,
+ "id": "08a4064d-3059-422a-9406-0cac4c75830b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "A data.frame: 2 × 13\n",
+ "\n",
+ "\t | Cell.type | eQTL..SNP.eGene. | TF | TF.is.a.co.eGene. | enrichment.p.value | X..TF.overlap...co.eGene | X..TF.overlap...background | X..no.TF.overlap...co.eGene | X..background.gene...not.co.eGene | enrichment.fdr | eQTL.SNP | SNP.overlaps.TF. | Names.of.overlapping.SNPs |
\n",
+ "\t | <chr> | <chr> | <chr> | <lgl> | <dbl> | <int> | <int> | <int> | <int> | <dbl> | <chr> | <lgl> | <chr> |
\n",
+ "\n",
+ "\n",
+ "\t1 | CD4T | rs111454690_HLA-DRB5 | CDK8 | FALSE | 9.630369e-06 | 14 | 5 | 2778 | 8515 | 1.640373e-03 | rs111454690 | FALSE | |
\n",
+ "\t2 | CD4T | rs111454690_HLA-DRB5 | SNRNP70 | FALSE | 1.209254e-09 | 11 | 8 | 649 | 10644 | 6.179288e-07 | rs111454690 | FALSE | |
\n",
+ "\n",
+ "
\n"
+ ],
+ "text/latex": [
+ "A data.frame: 2 × 13\n",
+ "\\begin{tabular}{r|lllllllllllll}\n",
+ " & Cell.type & eQTL..SNP.eGene. & TF & TF.is.a.co.eGene. & enrichment.p.value & X..TF.overlap...co.eGene & X..TF.overlap...background & X..no.TF.overlap...co.eGene & X..background.gene...not.co.eGene & enrichment.fdr & eQTL.SNP & SNP.overlaps.TF. & Names.of.overlapping.SNPs\\\\\n",
+ " & & & & & & & & & & & & & \\\\\n",
+ "\\hline\n",
+ "\t1 & CD4T & rs111454690\\_HLA-DRB5 & CDK8 & FALSE & 9.630369e-06 & 14 & 5 & 2778 & 8515 & 1.640373e-03 & rs111454690 & FALSE & \\\\\n",
+ "\t2 & CD4T & rs111454690\\_HLA-DRB5 & SNRNP70 & FALSE & 1.209254e-09 & 11 & 8 & 649 & 10644 & 6.179288e-07 & rs111454690 & FALSE & \\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/markdown": [
+ "\n",
+ "A data.frame: 2 × 13\n",
+ "\n",
+ "| | Cell.type <chr> | eQTL..SNP.eGene. <chr> | TF <chr> | TF.is.a.co.eGene. <lgl> | enrichment.p.value <dbl> | X..TF.overlap...co.eGene <int> | X..TF.overlap...background <int> | X..no.TF.overlap...co.eGene <int> | X..background.gene...not.co.eGene <int> | enrichment.fdr <dbl> | eQTL.SNP <chr> | SNP.overlaps.TF. <lgl> | Names.of.overlapping.SNPs <chr> |\n",
+ "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
+ "| 1 | CD4T | rs111454690_HLA-DRB5 | CDK8 | FALSE | 9.630369e-06 | 14 | 5 | 2778 | 8515 | 1.640373e-03 | rs111454690 | FALSE | |\n",
+ "| 2 | CD4T | rs111454690_HLA-DRB5 | SNRNP70 | FALSE | 1.209254e-09 | 11 | 8 | 649 | 10644 | 6.179288e-07 | rs111454690 | FALSE | |\n",
+ "\n"
+ ],
+ "text/plain": [
+ " Cell.type eQTL..SNP.eGene. TF TF.is.a.co.eGene. enrichment.p.value\n",
+ "1 CD4T rs111454690_HLA-DRB5 CDK8 FALSE 9.630369e-06 \n",
+ "2 CD4T rs111454690_HLA-DRB5 SNRNP70 FALSE 1.209254e-09 \n",
+ " X..TF.overlap...co.eGene X..TF.overlap...background\n",
+ "1 14 5 \n",
+ "2 11 8 \n",
+ " X..no.TF.overlap...co.eGene X..background.gene...not.co.eGene enrichment.fdr\n",
+ "1 2778 8515 1.640373e-03 \n",
+ "2 649 10644 6.179288e-07 \n",
+ " eQTL.SNP SNP.overlaps.TF. Names.of.overlapping.SNPs\n",
+ "1 rs111454690 FALSE \n",
+ "2 rs111454690 FALSE "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "head(old_enrichments,2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "238a1fe5-72cf-4dd4-96b2-c25e18fa41e5",
+ "metadata": {},
+ "source": [
+    "## Compare number of enrichments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 234,
+ "id": "8058ccda-a072-443d-9b29-66fef671cd7c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "amount_enrichments_old = old_enrichments %>% group_by(Cell.type, eQTL..SNP.eGene.) %>% count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 235,
+ "id": "baac2391-4638-4e42-923e-912226128e43",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "A grouped_df: 6 × 3\n",
+ "\n",
+ "\tCell.type | eQTL..SNP.eGene. | n |
\n",
+ "\t<chr> | <chr> | <int> |
\n",
+ "\n",
+ "\n",
+ "\tB | rs1131017_RPS26 | 82 |
\n",
+ "\tCD4T | rs111454690_HLA-DRB5 | 14 |
\n",
+ "\tCD4T | rs1131017_RPS26 | 134 |
\n",
+ "\tCD4T | rs1131017_RPS26_negative | 93 |
\n",
+ "\tCD4T | rs1131017_RPS26_positive | 125 |
\n",
+ "\tCD4T | rs4147638_SMDT1 | 14 |
\n",
+ "\n",
+ "
\n"
+ ],
+ "text/latex": [
+ "A grouped\\_df: 6 × 3\n",
+ "\\begin{tabular}{lll}\n",
+ " Cell.type & eQTL..SNP.eGene. & n\\\\\n",
+ " & & \\\\\n",
+ "\\hline\n",
+ "\t B & rs1131017\\_RPS26 & 82\\\\\n",
+ "\t CD4T & rs111454690\\_HLA-DRB5 & 14\\\\\n",
+ "\t CD4T & rs1131017\\_RPS26 & 134\\\\\n",
+ "\t CD4T & rs1131017\\_RPS26\\_negative & 93\\\\\n",
+ "\t CD4T & rs1131017\\_RPS26\\_positive & 125\\\\\n",
+ "\t CD4T & rs4147638\\_SMDT1 & 14\\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/markdown": [
+ "\n",
+ "A grouped_df: 6 × 3\n",
+ "\n",
+ "| Cell.type <chr> | eQTL..SNP.eGene. <chr> | n <int> |\n",
+ "|---|---|---|\n",
+ "| B | rs1131017_RPS26 | 82 |\n",
+ "| CD4T | rs111454690_HLA-DRB5 | 14 |\n",
+ "| CD4T | rs1131017_RPS26 | 134 |\n",
+ "| CD4T | rs1131017_RPS26_negative | 93 |\n",
+ "| CD4T | rs1131017_RPS26_positive | 125 |\n",
+ "| CD4T | rs4147638_SMDT1 | 14 |\n",
+ "\n"
+ ],
+ "text/plain": [
+ " Cell.type eQTL..SNP.eGene. n \n",
+ "1 B rs1131017_RPS26 82\n",
+ "2 CD4T rs111454690_HLA-DRB5 14\n",
+ "3 CD4T rs1131017_RPS26 134\n",
+ "4 CD4T rs1131017_RPS26_negative 93\n",
+ "5 CD4T rs1131017_RPS26_positive 125\n",
+ "6 CD4T rs4147638_SMDT1 14"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "head(amount_enrichments_old)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 236,
+ "id": "6f219e98-8ec6-45fa-b145-7b8086806dd9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "colnames(amount_enrichments_old) = c('cell_type', 'snp_eGene', 'ReMap_amount')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 237,
+ "id": "9bcc8ddb-a123-4fe6-b1bb-8e6a5d451519",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "transfac_enrichments = enrichment %>% group_by(cell_type, snp_eGene) %>% count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 238,
+ "id": "ce779ce7-e46a-466f-be5b-1cab27eadd2c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "colnames(transfac_enrichments)= c('cell_type', 'snp_eGene', 'TRANSFAC_amount')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 239,
+ "id": "06ae3596-604f-4ebc-b6e9-66150ffb9559",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "overview = merge(amount_enrichments_old, transfac_enrichments, all.x = TRUE, all.y = TRUE)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 240,
+ "id": "7b372d6d-1e65-4750-9414-6cd3b5294a05",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### Result of comparison"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 245,
+ "id": "6754a2a0-cdef-4126-9042-5820a8f65f62",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "overview[is.na(overview)]= 0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 246,
+ "id": "6fc4ba13-519b-4ccd-bf51-967295a82d06",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "A data.frame: 19 × 4\n",
+ "\n",
+ "\t | cell_type | snp_eGene | ReMap_amount | TRANSFAC_amount |
\n",
+ "\t | <chr> | <chr> | <dbl> | <dbl> |
\n",
+ "\n",
+ "\n",
+ "\t5 | CD4T | rs1131017_RPS26_positive | 125 | 51 |
\n",
+ "\t16 | monocyte | rs1131017_RPS26 | 145 | 40 |
\n",
+ "\t18 | NK | rs1131017_RPS26 | 132 | 20 |
\n",
+ "\t1 | B | rs1131017_RPS26 | 82 | 3 |
\n",
+ "\t14 | DC | rs7935082_MS4A7 | 0 | 2 |
\n",
+ "\t3 | CD4T | rs1131017_RPS26 | 134 | 1 |
\n",
+ "\t4 | CD4T | rs1131017_RPS26_negative | 93 | 1 |
\n",
+ "\t7 | CD4T | rs7605824_SH3YL1 | 58 | 1 |
\n",
+ "\t19 | NK | rs12151742_GNLY | 0 | 1 |
\n",
+ "\t2 | CD4T | rs111454690_HLA-DRB5 | 14 | 0 |
\n",
+ "\t6 | CD4T | rs4147638_SMDT1 | 14 | 0 |
\n",
+ "\t8 | CD4T | rs7632486_CMTM8 | 4 | 0 |
\n",
+ "\t9 | CD4T | rs9271520_HLA-DQA2 | 5 | 0 |
\n",
+ "\t10 | CD8T | rs1131017_RPS26 | 62 | 0 |
\n",
+ "\t11 | CD8T | rs4147638_SMDT1 | 78 | 0 |
\n",
+ "\t12 | CD8T | rs6708265_PASK | 3 | 0 |
\n",
+ "\t13 | CD8T | rs7605824_SH3YL1 | 9 | 0 |
\n",
+ "\t15 | monocyte | rs111454690_HLA-DRB5 | 1 | 0 |
\n",
+ "\t17 | monocyte | rs9271520_HLA-DQA2 | 4 | 0 |
\n",
+ "\n",
+ "
\n"
+ ],
+ "text/latex": [
+ "A data.frame: 19 × 4\n",
+ "\\begin{tabular}{r|llll}\n",
+ " & cell\\_type & snp\\_eGene & ReMap\\_amount & TRANSFAC\\_amount\\\\\n",
+ " & & & & \\\\\n",
+ "\\hline\n",
+ "\t5 & CD4T & rs1131017\\_RPS26\\_positive & 125 & 51\\\\\n",
+ "\t16 & monocyte & rs1131017\\_RPS26 & 145 & 40\\\\\n",
+ "\t18 & NK & rs1131017\\_RPS26 & 132 & 20\\\\\n",
+ "\t1 & B & rs1131017\\_RPS26 & 82 & 3\\\\\n",
+ "\t14 & DC & rs7935082\\_MS4A7 & 0 & 2\\\\\n",
+ "\t3 & CD4T & rs1131017\\_RPS26 & 134 & 1\\\\\n",
+ "\t4 & CD4T & rs1131017\\_RPS26\\_negative & 93 & 1\\\\\n",
+ "\t7 & CD4T & rs7605824\\_SH3YL1 & 58 & 1\\\\\n",
+ "\t19 & NK & rs12151742\\_GNLY & 0 & 1\\\\\n",
+ "\t2 & CD4T & rs111454690\\_HLA-DRB5 & 14 & 0\\\\\n",
+ "\t6 & CD4T & rs4147638\\_SMDT1 & 14 & 0\\\\\n",
+ "\t8 & CD4T & rs7632486\\_CMTM8 & 4 & 0\\\\\n",
+ "\t9 & CD4T & rs9271520\\_HLA-DQA2 & 5 & 0\\\\\n",
+ "\t10 & CD8T & rs1131017\\_RPS26 & 62 & 0\\\\\n",
+ "\t11 & CD8T & rs4147638\\_SMDT1 & 78 & 0\\\\\n",
+ "\t12 & CD8T & rs6708265\\_PASK & 3 & 0\\\\\n",
+ "\t13 & CD8T & rs7605824\\_SH3YL1 & 9 & 0\\\\\n",
+ "\t15 & monocyte & rs111454690\\_HLA-DRB5 & 1 & 0\\\\\n",
+ "\t17 & monocyte & rs9271520\\_HLA-DQA2 & 4 & 0\\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/markdown": [
+ "\n",
+ "A data.frame: 19 × 4\n",
+ "\n",
+ "| | cell_type <chr> | snp_eGene <chr> | ReMap_amount <dbl> | TRANSFAC_amount <dbl> |\n",
+ "|---|---|---|---|---|\n",
+ "| 5 | CD4T | rs1131017_RPS26_positive | 125 | 51 |\n",
+ "| 16 | monocyte | rs1131017_RPS26 | 145 | 40 |\n",
+ "| 18 | NK | rs1131017_RPS26 | 132 | 20 |\n",
+ "| 1 | B | rs1131017_RPS26 | 82 | 3 |\n",
+ "| 14 | DC | rs7935082_MS4A7 | 0 | 2 |\n",
+ "| 3 | CD4T | rs1131017_RPS26 | 134 | 1 |\n",
+ "| 4 | CD4T | rs1131017_RPS26_negative | 93 | 1 |\n",
+ "| 7 | CD4T | rs7605824_SH3YL1 | 58 | 1 |\n",
+ "| 19 | NK | rs12151742_GNLY | 0 | 1 |\n",
+ "| 2 | CD4T | rs111454690_HLA-DRB5 | 14 | 0 |\n",
+ "| 6 | CD4T | rs4147638_SMDT1 | 14 | 0 |\n",
+ "| 8 | CD4T | rs7632486_CMTM8 | 4 | 0 |\n",
+ "| 9 | CD4T | rs9271520_HLA-DQA2 | 5 | 0 |\n",
+ "| 10 | CD8T | rs1131017_RPS26 | 62 | 0 |\n",
+ "| 11 | CD8T | rs4147638_SMDT1 | 78 | 0 |\n",
+ "| 12 | CD8T | rs6708265_PASK | 3 | 0 |\n",
+ "| 13 | CD8T | rs7605824_SH3YL1 | 9 | 0 |\n",
+ "| 15 | monocyte | rs111454690_HLA-DRB5 | 1 | 0 |\n",
+ "| 17 | monocyte | rs9271520_HLA-DQA2 | 4 | 0 |\n",
+ "\n"
+ ],
+ "text/plain": [
+ " cell_type snp_eGene ReMap_amount TRANSFAC_amount\n",
+ "5 CD4T rs1131017_RPS26_positive 125 51 \n",
+ "16 monocyte rs1131017_RPS26 145 40 \n",
+ "18 NK rs1131017_RPS26 132 20 \n",
+ "1 B rs1131017_RPS26 82 3 \n",
+ "14 DC rs7935082_MS4A7 0 2 \n",
+ "3 CD4T rs1131017_RPS26 134 1 \n",
+ "4 CD4T rs1131017_RPS26_negative 93 1 \n",
+ "7 CD4T rs7605824_SH3YL1 58 1 \n",
+ "19 NK rs12151742_GNLY 0 1 \n",
+ "2 CD4T rs111454690_HLA-DRB5 14 0 \n",
+ "6 CD4T rs4147638_SMDT1 14 0 \n",
+ "8 CD4T rs7632486_CMTM8 4 0 \n",
+ "9 CD4T rs9271520_HLA-DQA2 5 0 \n",
+ "10 CD8T rs1131017_RPS26 62 0 \n",
+ "11 CD8T rs4147638_SMDT1 78 0 \n",
+ "12 CD8T rs6708265_PASK 3 0 \n",
+ "13 CD8T rs7605824_SH3YL1 9 0 \n",
+ "15 monocyte rs111454690_HLA-DRB5 1 0 \n",
+ "17 monocyte rs9271520_HLA-DQA2 4 0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "overview[order(overview$TRANSFAC_amount, decreasing = TRUE),]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 247,
+ "id": "c04f7c9b-1e74-401d-bcf4-569919447df6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "write.csv(overview, paste0(path, \"transfac_results/TRANSFAC_ReMap_comparison.csv\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "80ddea9d-1cc5-4ce3-b595-e15d06593b83",
+ "metadata": {},
+ "source": [
+ "## Compare the TFs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 254,
+ "id": "8dce9b80-35b0-4f3e-9023-9912c1348657",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Paper: six TFs—RBM39, TCF7, LEF1, KLF6, CD74 and MAF—whose binding sites were enriched in the promoter region of the rs1131017–RPS26 co-eGenes\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 280,
+ "id": "fce20036-c368-467e-9f62-07dd7cf63b6e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "enrichment$tf = str_extract(enrichment$term_name, '.*;')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 281,
+ "id": "8ffb3678-a8e1-4648-bcfc-d9fc3d1b30c3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "enrichment$tf = str_replace(enrichment$tf, 'Factor: ', '')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 282,
+ "id": "9b91aee3-698d-488c-8e7f-d68cc00bdb17",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "enrichment$tf = str_replace(enrichment$tf, ';', '')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 283,
+ "id": "ea937e43-2fdf-47c3-8f1a-e9919889e5f5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "enrichment$tf = str_replace(enrichment$tf , 'motif.*', '')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 284,
+ "id": "0855320a-54d6-422c-9991-d2d9329c3fb8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "enrichment$tf = str_replace(enrichment$tf , '-', '')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 285,
+ "id": "9f0f674c-8690-4e2c-84e2-949048f3e891",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "enrichment$tf = toupper(enrichment$tf)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 286,
+ "id": "891df001-7839-41b9-96f5-6b2521e3b6eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "enrichment$tf = str_replace(enrichment$tf , ' ', '')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 292,
+ "id": "2873020c-ae31-4cfb-bb8b-953da1a03c61",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "enrichment$tf = str_replace(enrichment$tf, 'CETS-1', 'ETS1')  # rename TRANSFAC factor labels to gene symbols; presumably so they match the ReMap TF names in the later merge\n",
+ "enrichment$tf = str_replace(enrichment$tf, 'C/EBPBETA|C/EBPbeta|GCMA:CEBPB', 'CEBPB')\n",
+ "enrichment$tf = str_replace(enrichment$tf, 'C/EBPDELTA|TEF3:CEBPD', 'CEBPD')\n",
+ "enrichment$tf = str_replace(enrichment$tf, 'C/EBPGAMMA', 'CEBPG')\n",
+ "enrichment$tf = str_replace(enrichment$tf, 'ELK1:HOXB13', 'ELK1')  # composite motif label reduced to a single factor\n",
+ "enrichment$tf = str_replace(enrichment$tf, 'GTF2IRD1ISOFORM2', 'GTF2I')\n",
+ "enrichment$tf = str_replace(enrichment$tf, 'MEIS1:ELF1', 'ELF1')\n",
+ "enrichment$tf = str_replace(enrichment$tf, fixed('PU.1'), 'SPI1')  # fixed(): match the literal label; a bare '.' is a regex wildcard\n",
+ "enrichment$tf = str_replace(enrichment$tf, 'TEF3:ERG', 'ERG')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 293,
+ "id": "f2c5345b-0fdd-40fb-a982-96794b5f6440",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "A data.frame: 2 × 16\n",
+ "\n",
+ "\t | query | significant | p_value | term_size | query_size | intersection_size | precision | recall | term_id | source | term_name | effective_domain_size | source_order | cell_type | snp_eGene | tf |
\n",
+ "\t | <chr> | <lgl> | <dbl> | <int> | <int> | <int> | <dbl> | <dbl> | <chr> | <chr> | <chr> | <int> | <int> | <chr> | <chr> | <chr> |
\n",
+ "\n",
+ "\n",
+ "\t1 | query_1 | TRUE | 0.04961083 | 2342 | 27 | 22 | 0.8148148 | 0.009393681 | TF:M00665 | TF | Factor: Sp3; motif: ASMCTTGGGSRGGG | 5705 | 7882 | DC | rs7935082_MS4A7 | SP3 |
\n",
+ "\t2 | query_1 | TRUE | 0.04961083 | 2303 | 27 | 22 | 0.8148148 | 0.009552757 | TF:M03582 | TF | Factor: TWIST; motif: CACCTGG | 5705 | 8844 | DC | rs7935082_MS4A7 | TWIST |
\n",
+ "\n",
+ "
\n"
+ ],
+ "text/latex": [
+ "A data.frame: 2 × 16\n",
+ "\\begin{tabular}{r|llllllllllllllll}\n",
+ " & query & significant & p\\_value & term\\_size & query\\_size & intersection\\_size & precision & recall & term\\_id & source & term\\_name & effective\\_domain\\_size & source\\_order & cell\\_type & snp\\_eGene & tf\\\\\n",
+ " & & & & & & & & & & & & & & & & \\\\\n",
+ "\\hline\n",
+ "\t1 & query\\_1 & TRUE & 0.04961083 & 2342 & 27 & 22 & 0.8148148 & 0.009393681 & TF:M00665 & TF & Factor: Sp3; motif: ASMCTTGGGSRGGG & 5705 & 7882 & DC & rs7935082\\_MS4A7 & SP3 \\\\\n",
+ "\t2 & query\\_1 & TRUE & 0.04961083 & 2303 & 27 & 22 & 0.8148148 & 0.009552757 & TF:M03582 & TF & Factor: TWIST; motif: CACCTGG & 5705 & 8844 & DC & rs7935082\\_MS4A7 & TWIST\\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/markdown": [
+ "\n",
+ "A data.frame: 2 × 16\n",
+ "\n",
+ "| | query <chr> | significant <lgl> | p_value <dbl> | term_size <int> | query_size <int> | intersection_size <int> | precision <dbl> | recall <dbl> | term_id <chr> | source <chr> | term_name <chr> | effective_domain_size <int> | source_order <int> | cell_type <chr> | snp_eGene <chr> | tf <chr> |\n",
+ "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
+ "| 1 | query_1 | TRUE | 0.04961083 | 2342 | 27 | 22 | 0.8148148 | 0.009393681 | TF:M00665 | TF | Factor: Sp3; motif: ASMCTTGGGSRGGG | 5705 | 7882 | DC | rs7935082_MS4A7 | SP3 |\n",
+ "| 2 | query_1 | TRUE | 0.04961083 | 2303 | 27 | 22 | 0.8148148 | 0.009552757 | TF:M03582 | TF | Factor: TWIST; motif: CACCTGG | 5705 | 8844 | DC | rs7935082_MS4A7 | TWIST |\n",
+ "\n"
+ ],
+ "text/plain": [
+ " query significant p_value term_size query_size intersection_size\n",
+ "1 query_1 TRUE 0.04961083 2342 27 22 \n",
+ "2 query_1 TRUE 0.04961083 2303 27 22 \n",
+ " precision recall term_id source term_name \n",
+ "1 0.8148148 0.009393681 TF:M00665 TF Factor: Sp3; motif: ASMCTTGGGSRGGG\n",
+ "2 0.8148148 0.009552757 TF:M03582 TF Factor: TWIST; motif: CACCTGG \n",
+ " effective_domain_size source_order cell_type snp_eGene tf \n",
+ "1 5705 7882 DC rs7935082_MS4A7 SP3 \n",
+ "2 5705 8844 DC rs7935082_MS4A7 TWIST"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "head(enrichment,2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 299,
+ "id": "6ff9d9b7-e3b8-46d7-93ae-8b2f765f381a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "colnames(enrichment) = paste0('TRANSFAC_', colnames(enrichment))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 296,
+ "id": "b940e752-4002-4c52-b066-bb3c2ff83e36",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### Merge with ReMap results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 298,
+ "id": "60ff3520-ffac-4660-bb4b-9744db63c309",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "A data.frame: 2 × 13\n",
+ "\n",
+ "\t | Cell.type | eQTL..SNP.eGene. | TF | TF.is.a.co.eGene. | enrichment.p.value | X..TF.overlap...co.eGene | X..TF.overlap...background | X..no.TF.overlap...co.eGene | X..background.gene...not.co.eGene | enrichment.fdr | eQTL.SNP | SNP.overlaps.TF. | Names.of.overlapping.SNPs |
\n",
+ "\t | <chr> | <chr> | <chr> | <lgl> | <dbl> | <int> | <int> | <int> | <int> | <dbl> | <chr> | <lgl> | <chr> |
\n",
+ "\n",
+ "\n",
+ "\t1 | CD4T | rs111454690_HLA-DRB5 | CDK8 | FALSE | 9.630369e-06 | 14 | 5 | 2778 | 8515 | 1.640373e-03 | rs111454690 | FALSE | |
\n",
+ "\t2 | CD4T | rs111454690_HLA-DRB5 | SNRNP70 | FALSE | 1.209254e-09 | 11 | 8 | 649 | 10644 | 6.179288e-07 | rs111454690 | FALSE | |
\n",
+ "\n",
+ "
\n"
+ ],
+ "text/latex": [
+ "A data.frame: 2 × 13\n",
+ "\\begin{tabular}{r|lllllllllllll}\n",
+ " & Cell.type & eQTL..SNP.eGene. & TF & TF.is.a.co.eGene. & enrichment.p.value & X..TF.overlap...co.eGene & X..TF.overlap...background & X..no.TF.overlap...co.eGene & X..background.gene...not.co.eGene & enrichment.fdr & eQTL.SNP & SNP.overlaps.TF. & Names.of.overlapping.SNPs\\\\\n",
+ " & & & & & & & & & & & & & \\\\\n",
+ "\\hline\n",
+ "\t1 & CD4T & rs111454690\\_HLA-DRB5 & CDK8 & FALSE & 9.630369e-06 & 14 & 5 & 2778 & 8515 & 1.640373e-03 & rs111454690 & FALSE & \\\\\n",
+ "\t2 & CD4T & rs111454690\\_HLA-DRB5 & SNRNP70 & FALSE & 1.209254e-09 & 11 & 8 & 649 & 10644 & 6.179288e-07 & rs111454690 & FALSE & \\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/markdown": [
+ "\n",
+ "A data.frame: 2 × 13\n",
+ "\n",
+ "| | Cell.type <chr> | eQTL..SNP.eGene. <chr> | TF <chr> | TF.is.a.co.eGene. <lgl> | enrichment.p.value <dbl> | X..TF.overlap...co.eGene <int> | X..TF.overlap...background <int> | X..no.TF.overlap...co.eGene <int> | X..background.gene...not.co.eGene <int> | enrichment.fdr <dbl> | eQTL.SNP <chr> | SNP.overlaps.TF. <lgl> | Names.of.overlapping.SNPs <chr> |\n",
+ "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
+ "| 1 | CD4T | rs111454690_HLA-DRB5 | CDK8 | FALSE | 9.630369e-06 | 14 | 5 | 2778 | 8515 | 1.640373e-03 | rs111454690 | FALSE | |\n",
+ "| 2 | CD4T | rs111454690_HLA-DRB5 | SNRNP70 | FALSE | 1.209254e-09 | 11 | 8 | 649 | 10644 | 6.179288e-07 | rs111454690 | FALSE | |\n",
+ "\n"
+ ],
+ "text/plain": [
+ " Cell.type eQTL..SNP.eGene. TF TF.is.a.co.eGene. enrichment.p.value\n",
+ "1 CD4T rs111454690_HLA-DRB5 CDK8 FALSE 9.630369e-06 \n",
+ "2 CD4T rs111454690_HLA-DRB5 SNRNP70 FALSE 1.209254e-09 \n",
+ " X..TF.overlap...co.eGene X..TF.overlap...background\n",
+ "1 14 5 \n",
+ "2 11 8 \n",
+ " X..no.TF.overlap...co.eGene X..background.gene...not.co.eGene enrichment.fdr\n",
+ "1 2778 8515 1.640373e-03 \n",
+ "2 649 10644 6.179288e-07 \n",
+ " eQTL.SNP SNP.overlaps.TF. Names.of.overlapping.SNPs\n",
+ "1 rs111454690 FALSE \n",
+ "2 rs111454690 FALSE "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "head(old_enrichments,2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 300,
+ "id": "1e378133-c45d-4ca4-bfe0-b6b89e2dd7f2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "colnames(old_enrichments) = paste0('ReMap', colnames(old_enrichments))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 301,
+ "id": "2f103548-f736-4a81-ba6c-af85ac4da9c4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "combined = merge(enrichment, old_enrichments, by.x = c('TRANSFAC_cell_type', 'TRANSFAC_snp_eGene', 'TRANSFAC_tf'), by.y = c('ReMapCell.type', 'ReMapeQTL..SNP.eGene.', 'ReMapTF'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 303,
+ "id": "2b4a65ee-ecf1-4d93-9718-88b5f9f49d20",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "31"
+ ],
+ "text/latex": [
+ "31"
+ ],
+ "text/markdown": [
+ "31"
+ ],
+ "text/plain": [
+ "[1] 31"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "nrow(combined)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 306,
+ "id": "0ae3f173-266e-4720-bea0-fbd297c726d5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "