diff --git a/Collagen_Bx2-4.ipynb b/Collagen_Bx2-4.ipynb
new file mode 100755
index 0000000..a30d41c
--- /dev/null
+++ b/Collagen_Bx2-4.ipynb
@@ -0,0 +1,506 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#load libraries\n",
+ "\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import os\n",
+ "import copy\n",
+ "import seaborn as sns\n",
+ "import importlib\n",
+ "import scipy\n",
+ "\n",
+ "import scanpy as sc\n",
+ "from sklearn.cluster import KMeans\n",
+ "from sklearn.preprocessing import scale, minmax_scale\n",
+ "from sklearn.metrics import silhouette_score\n",
+ "import matplotlib as mpl\n",
+ "mpl.rc('figure', max_open_warning = 0)\n",
+ "#mpl.font_manager._rebuild()\n",
+ "mpl.rcParams['mathtext.fontset'] = 'custom'\n",
+ "mpl.rcParams['mathtext.it'] = 'Arial:italic'\n",
+ "mpl.rcParams['mathtext.rm'] = 'Arial'\n",
+ "mpl.rcParams['font.sans-serif'] = \"Arial\"\n",
+ "mpl.rcParams['font.family'] = \"sans-serif\"\n",
+ "mpl.rc('font', serif='Arial') \n",
+ "codedir = os.getcwd()\n",
+ "#load cmif libraries\n",
+ "#os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF')\n",
+ "from mplex_image import visualize as viz, process, preprocess, normalize"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "os.chdir(codedir)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "np.random.seed(222)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Table of contents \n",
+ "1. [Load Data](#load)\n",
+ "2. [Normalize](#norm)\n",
+ "3. [Visualize Normalization](#normviz)\n",
+ "4. [Leiden for cell typing](#clusterlei)\n",
+ "5. [Cluster K means](#cluster)\n",
+ "6. [Leiden cluster](#clust1)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#load data\n",
+ "os.chdir(f'{codedir}/paper_data')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s_date = '20210402'\n",
+ "if not os.path.exists(s_date):\n",
+ " os.mkdir(s_date)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Load Data \n",
+ "\n",
+ "2.\tAs Ki67 is not a continuous antigen, can you count positive cells (Proliferative cluster) by distance (<25, 25-50, 50-75, >75) from collagen I in each Bx?\n",
+ "\n",
+ "3.\tCould you map cells by distance (<25, 25-50, 50-75, >75) from collagen I in each Bx? If you can add a distance column (1-4) in the cluster csv, I can make it in Qi.\n",
+ "\n",
+ "4.\tCould you try to see the correlation between ER/PCNA and (VIM+aSMA+CD31)? – not necessary to show significance. (see attached image from Bx1 Scene-003)\n",
+ "\n",
+ "[contents](#contents)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### not normalized"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_mi = pd.read_csv('20210324_SMTBx1-4_JE-TMA-43_60_62_FilteredMeanIntensity.csv',index_col=0) \n",
+ "df_mi['slide'] = [item.split('_')[0] for item in df_mi.index]\n",
+ "df_mi['slide_scene'] = [item.split('_cell')[0] for item in df_mi.index]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for s_file in os.listdir():\n",
+ " if s_file.find('MaskDistances') > -1:\n",
+ " print(s_file)\n",
+ "# Read every per-sample mask-distance table and stack them with a single pd.concat (DataFrame.append was removed in pandas 2.0)\n",
+ "df_mask = pd.concat([pd.read_csv(f'features_{s_sample}_MaskDistances.csv',index_col=0)\n",
+ "                     for s_sample in ['SMT101Bx1-16','SMTBx2-5','SMTBx3','SMTBx4-3','HTA-33']]) #'SMT101Bx4-3',"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_mask.columns\n",
+ "ls_target = ['Vim_dist','CD31_dist', 'PDPN_dist', 'aSMA_dist', 'CD68_dist','ColI_dist', 'ColIV_dist']\n",
+ "ls_marker = ['ER_nuclei','Ki67_nuclei','PCNA_nuclei']\n",
+ "ls_drop = ['HTA-33_scene001','SMTBx1-16_scene001'#,'SMT101Bx4-3_scene001','SMT101Bx4-3_scene002'\n",
+ " ]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df_mi.merge(df_mask.loc[:,ls_target],left_index=True,right_index=True)\n",
+ "df = df[(~df.Vim_dist.isna()) & (~df.slide_scene.isin(ls_drop))]\n",
+ "df.loc[:,ls_target] = df.loc[:,ls_target]*.325"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "#fit\n",
+ "data = df.loc[:,ls_marker].T\n",
+ "batch = df.slide\n",
+ "bayesdata = normalize.combat(data, batch)\n",
+ "df_norm = bayesdata.T"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_norm['slide'] = df.slide\n",
+ "df_norm.groupby('slide').mean()\n",
+ "df_norm.groupby('slide').std()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['Vim-CD31-aSMA_dist'] = df.loc[:,['Vim_dist','CD31_dist','aSMA_dist']].min(axis=1)\n",
+ "ls_target = ls_target + ['Vim-CD31-aSMA_dist']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "mpl.rcParams['pdf.fonttype'] = 42\n",
+ "mpl.rcParams['ps.fonttype'] = 42\n",
+ "%matplotlib inline\n",
+ "#by tissue no Bx1\n",
+ "sns.set(style='white')\n",
+ "import matplotlib.ticker as tic\n",
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')\n",
+ "tot = 0 # running count of panels drawn/tested (incremented once per slide x marker x target)\n",
+ "ls_dist = [25, 50, 75] # upper edges of the distance bins, in the units of the *_dist columns\n",
+ "i_diff = 25 # bin width: each bin is s_dist - i_diff <= distance < s_dist\n",
+ "ls_slide = ['SMTBx2-5', 'SMTBx3','SMTBx4-3'] # every entry must be a d_slide key: d_slide[s_slide] labels the row ('SMT1Bx4-3' raised KeyError)\n",
+ "d_slide = {'SMTBx1-16':'Bx1', 'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3','HTA-33':'Bx4-HTAN','SMTBx4-3':'Bx4'}\n",
+ "for s_target in ['ColI_dist', 'ColIV_dist','Vim-CD31-aSMA_dist']:\n",
+ " print(s_target)\n",
+ " fig, ax = plt.subplots(3,2, figsize=(4.5,4),sharex=True,dpi=300)\n",
+ " for idxc, s_slide in enumerate(ls_slide):\n",
+ " print(s_slide)\n",
+ " df_slide = df[df.slide==s_slide]\n",
+ " for idx, s_marker in enumerate(['ER_nuclei', 'PCNA_nuclei']): #,'Ki67_nuclei']):\n",
+ " print(s_marker)\n",
+ " df_result = pd.DataFrame(index=df_slide.index)\n",
+ " for s_dist in ls_dist:\n",
+ " b_bool = (df_slide.loc[:,s_target] < s_dist) & (df_slide.loc[:,s_target] >= s_dist - i_diff)\n",
+ " df_result.loc[b_bool,f'{s_marker}_{s_dist}'] = df_slide.loc[b_bool,s_marker]\n",
+ " for s_col in df_result.columns:\n",
+ " sns.kdeplot(df_result.loc[:,s_col].dropna(), ax=ax[idxc,idx],\n",
+ " label=f\"< {s_col.split('_')[2]}\"#,fill=True, alpha=0.3\n",
+ " )\n",
+ " if df_result.mean().fillna(0)[2] == 0:\n",
+ " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna())\n",
+ " print(len(df_result.iloc[:,0].dropna()))\n",
+ " print(len(df_result.iloc[:,1].dropna()))\n",
+ " else:\n",
+ " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna(),df_result.iloc[:,2].dropna())\n",
+ " print(len(df_result.iloc[:,0].dropna()))\n",
+ " print(len(df_result.iloc[:,1].dropna()))\n",
+ " print('over75')\n",
+ " print(len(df_result.iloc[:,2].dropna()))\n",
+ " ax[idxc,idx].set_xlabel(f\"{s_col.split('_')[0]} Intensity\",fontname=\"Arial\",fontsize=18)\n",
+ " ax[idxc,idx].set_ylabel(f\"\")\n",
+ " ax[idxc,idx].set_title(f\"\")\n",
+ " temp = tic.MaxNLocator(3)\n",
+ " ax[idxc,idx].set_yticklabels(())\n",
+ " ax[idxc,idx].xaxis.set_major_locator(temp)\n",
+ " tot+=1\n",
+ " if pvalue < 0.001: # 0.05/30: #bonferoni correction\n",
+ " ax[idxc,idx].text(0.42, 0.87, '*',\n",
+ " horizontalalignment='center',\n",
+ " verticalalignment='center',\n",
+ " transform=ax[idxc,idx].transAxes)\n",
+ " ax[idxc,idx].set_xlim(-1000,5500)\n",
+ " ax[idxc,idx].spines['right'].set_visible(False)\n",
+ " ax[idxc,idx].spines['left'].set_visible(False)\n",
+ " ax[idxc,idx].spines['top'].set_visible(False)\n",
+ " #print(ax[idxc,idx].get_xticklabels())\n",
+ " #ax[idxc,idx].set_xticklabels(ax[idxc,idx].get_xticklabels(),{'fontsize':16})\n",
+ " ax[idxc,0].set_ylabel(f\"{d_slide[s_slide]}\",fontname=\"Arial\",fontsize=18)\n",
+ " ax[2,1].legend(title='$\\mu$m',borderpad=.3,labelspacing=.3,loc=4,fontsize=14)\n",
+ " plt.subplots_adjust(wspace=.001,hspace=.001)\n",
+ " plt.suptitle(f\"Distance to {s_target.split('_')[0]}\",y=.93,fontname=\"Arial\",fontsize=24)\n",
+ " plt.tight_layout()\n",
+ " fig.savefig(f'./{s_date}/IntensityvsDistance_{i_diff}s_{s_target}_by_slide_noBx1.png',dpi=300)\n",
+ " #fig.savefig(f'./{s_date}/IntensityvsDistance_{i_diff}s_{s_target}_by_slide_noBx1.pdf',dpi=200)\n",
+ " #break"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ " 0.05/30"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "from matplotlib import gridspec\n",
+ "ax_objs = [] # axes are appended as they are created; ax_objs[-1] is the panel currently being drawn\n",
+ "ls_slide = ['SMTBx2-5', 'SMTBx3','SMTBx4-3'] # every entry must be a d_slide key: d_slide[s_slide] labels the row ('SMT1Bx4-3' raised KeyError)\n",
+ "d_slide = {'SMTBx1-16':'Bx1', 'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3','HTA-33':'Bx4-HTAN','SMTBx4-3':'Bx4'}\n",
+ "for s_target in ['ColI_dist', 'ColIV_dist','Vim-CD31-aSMA_dist']:\n",
+ " fig = plt.figure(figsize=(5.5,3.5),dpi=300)\n",
+ " gs = gridspec.GridSpec(nrows=3, ncols=2,figure=fig, \n",
+ " wspace=0.1, hspace=0.05,left=0.1, right=.75\n",
+ " )\n",
+ " for idxc, s_slide in enumerate(ls_slide):\n",
+ " df_slide = df[df.slide==s_slide]\n",
+ " for idx, s_marker in enumerate(['ER_nuclei', 'PCNA_nuclei']):\n",
+ " ax_objs.append(fig.add_subplot(gs[idxc,idx]))\n",
+ " df_result = pd.DataFrame(index=df_slide.index)\n",
+ " for s_dist in ls_dist:\n",
+ " b_bool = (df_slide.loc[:,s_target] < s_dist) & (df_slide.loc[:,s_target] >= s_dist - i_diff)\n",
+ " df_result.loc[b_bool,f'{s_marker}_{s_dist}'] = df_slide.loc[b_bool,s_marker]\n",
+ " for s_col in df_result.columns:\n",
+ " g =sns.kdeplot(df_result.loc[:,s_col].dropna(), ax=ax_objs[-1],\n",
+ " label=f\"< {s_col.split('_')[2]}\"#,fill=True,alpha=0.5\n",
+ " )\n",
+ " if df_result.mean().fillna(0)[2] == 0:\n",
+ " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna())\n",
+ " #print(pvalue)\n",
+ " else:\n",
+ " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna(),df_result.iloc[:,2].dropna())\n",
+ " ax_objs[-1].set_ylabel(f\"\")\n",
+ " ax_objs[-1].set_title(f\"\")\n",
+ " temp = tic.MaxNLocator(3)\n",
+ " ax_objs[-1].set_yticklabels(())\n",
+ " ax_objs[-1].xaxis.set_major_locator(temp)\n",
+ " tot+=1\n",
+ " if pvalue < 0.001: # 0.05/30: #bonferoni correction\n",
+ " ax_objs[-1].text(0.55, 0.65, '*',\n",
+ " horizontalalignment='center',\n",
+ " verticalalignment='center',\n",
+ " transform=ax_objs[-1].transAxes)\n",
+ " ax_objs[-1].set_xlim(-1000,5500)\n",
+ " ax_objs[-1].spines['right'].set_visible(False)\n",
+ " ax_objs[-1].spines['left'].set_visible(False)\n",
+ " ax_objs[-1].spines['top'].set_visible(False)\n",
+ " #ax_objs[-1].spines['bottom'].set_visible(False)\n",
+ " ax_objs[-1].set_xlabel('')\n",
+ " rect = ax_objs[-1].patch\n",
+ " rect.set_alpha(0)\n",
+ " if idx == 0:\n",
+ " ax_objs[-1].set_ylabel(f\"{d_slide[s_slide]}\",fontsize=18)\n",
+ " if idx==1:\n",
+ " if idxc == 2:\n",
+ " ax_objs[-1].legend(title='$\\mu$m',borderpad=.3,labelspacing=.3,fontsize=12,loc='upper left', bbox_to_anchor=(1.05, 1.5))\n",
+ " if idxc ==2:\n",
+ " ax_objs[-1].set_xlabel(f\"{s_col.split('_')[0]} Intensity\",fontsize=18)\n",
+ " else:\n",
+ " ax_objs[-1].set_xticklabels([]) \n",
+ " plt.suptitle(f\"Distance to {s_target.split('_')[0]}\",x=.45,y=.95,fontsize=20)\n",
+ " gs.update(bottom = 0.2)\n",
+ " fig.savefig(f'./{s_date}/IntensityvsDistance_{i_diff}s_{s_target}_by_slide_noBx1_bigger.png',dpi=200)\n",
+ " #break"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#by tissue w bx1\n",
+ "%matplotlib inline\n",
+ "sns.set(style='white')\n",
+ "import matplotlib.ticker as tic\n",
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')\n",
+ "tot = 0 # running count of panels drawn/tested (incremented once per slide x marker x target)\n",
+ "ls_dist = [25, 50, 75] # upper edges of the distance bins, in the units of the *_dist columns\n",
+ "i_diff = 25 # bin width: each bin is s_dist - i_diff <= distance < s_dist\n",
+ "ls_slide = ['SMTBx1-16','SMTBx2-5', 'SMTBx3','SMTBx4-3'] # every entry must be a d_slide key: d_slide[s_slide] labels the row ('SMT1Bx4-3' raised KeyError)\n",
+ "d_slide = {'SMTBx1-16':'Bx1', 'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3','HTA-33':'Bx4-HTAN','SMTBx4-3':'Bx4'}\n",
+ "for s_target in ls_target + ['Vim-CD31-aSMA_dist']: #['CD68_dist','ColI_dist', 'ColIV_dist']:\n",
+ " fig, ax = plt.subplots(4,3, figsize=(7,5),sharex=True,dpi=300)\n",
+ " for idxc, s_slide in enumerate(ls_slide):\n",
+ " df_slide = df[df.slide==s_slide]\n",
+ " for idx, s_marker in enumerate(ls_marker):\n",
+ " df_result = pd.DataFrame(index=df_slide.index)\n",
+ " for s_dist in ls_dist:\n",
+ " b_bool = (df_slide.loc[:,s_target] < s_dist) & (df_slide.loc[:,s_target] >= s_dist - i_diff)\n",
+ " df_result.loc[b_bool,f'{s_marker}_{s_dist}'] = df_slide.loc[b_bool,s_marker]\n",
+ " for s_col in df_result.columns:\n",
+ " sns.kdeplot(df_result.loc[:,s_col].dropna(), ax=ax[idxc,idx], label=f\"< {s_col.split('_')[2]}\")\n",
+ " if df_result.mean().fillna(0)[2] == 0:\n",
+ " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna())\n",
+ " #print(pvalue)\n",
+ " else:\n",
+ " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna(),df_result.iloc[:,2].dropna())\n",
+ " ax[idxc,idx].set_xlabel(f\"{s_col.split('_')[0]} Intensity\",fontsize=18)\n",
+ " ax[idxc,idx].set_ylabel(f\"\")\n",
+ " ax[idxc,idx].set_title(f\"\")\n",
+ " temp = tic.MaxNLocator(3)\n",
+ " ax[idxc,idx].set_yticklabels(())\n",
+ " ax[idxc,idx].xaxis.set_major_locator(temp)\n",
+ " tot+=1\n",
+ " if pvalue < 0.001: # 0.05/30: #bonferoni correction\n",
+ " ax[idxc,idx].text(0.5, 0.8, '*',\n",
+ " horizontalalignment='center',\n",
+ " verticalalignment='center',\n",
+ " transform=ax[idxc,idx].transAxes)\n",
+ " ax[idxc,idx].set_xlim(-1500,7000)\n",
+ " ax[idxc,idx].spines['right'].set_visible(False)\n",
+ " ax[idxc,idx].spines['left'].set_visible(False)\n",
+ " ax[idxc,idx].spines['top'].set_visible(False)\n",
+ " ax[idxc,0].set_ylabel(f\"{d_slide[s_slide]}\",fontsize=18)\n",
+ " ax[0,2].legend(title='$\\mu$m')\n",
+ " plt.subplots_adjust(wspace=.001,hspace=.001)\n",
+ " plt.suptitle(f\"Distance to {s_target.split('_')[0]}\",fontsize=20)\n",
+ " plt.tight_layout()\n",
+ " fig.savefig(f'./{s_date}/IntensityvsDistance_25s_{s_target}_by_slide.png',dpi=300)\n",
+ " #break"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#by tissue, all markers in ls_marker, without Bx1 (SMTBx1-16 is commented out of ls_slide)\n",
+ "%matplotlib inline\n",
+ "sns.set(style='white')\n",
+ "import matplotlib.ticker as tic\n",
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')\n",
+ "tot = 0 # running count of panels drawn/tested (incremented once per slide x marker x target)\n",
+ "ls_dist = [25, 50, 75] # upper edges of the distance bins, in the units of the *_dist columns\n",
+ "i_diff = 25 # bin width: each bin is s_dist - i_diff <= distance < s_dist\n",
+ "ls_slide = ['SMTBx2-5', 'SMTBx3','SMTBx4-3'] #'SMTBx1-16', (every entry must be a d_slide key; 'SMT1Bx4-3' raised KeyError)\n",
+ "d_slide = {'SMTBx1-16':'Bx1', 'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3','HTA-33':'Bx4-HTAN','SMTBx4-3':'Bx4'}\n",
+ "for s_target in ['ColI_dist', 'ColIV_dist']:\n",
+ " fig, ax = plt.subplots(3,3, figsize=(7,4),sharex=True)\n",
+ " for idxc, s_slide in enumerate(ls_slide):\n",
+ " df_slide = df[df.slide==s_slide]\n",
+ " for idx, s_marker in enumerate(ls_marker):\n",
+ " df_result = pd.DataFrame(index=df_slide.index)\n",
+ " for s_dist in ls_dist:\n",
+ " b_bool = (df_slide.loc[:,s_target] < s_dist) & (df_slide.loc[:,s_target] >= s_dist - i_diff)\n",
+ " df_result.loc[b_bool,f'{s_marker}_{s_dist}'] = df_slide.loc[b_bool,s_marker]\n",
+ " for s_col in df_result.columns:\n",
+ " sns.kdeplot(df_result.loc[:,s_col].dropna(), ax=ax[idxc,idx], label=f\"< {s_col.split('_')[2]}\")\n",
+ " if df_result.mean().fillna(0)[2] == 0:\n",
+ " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna())\n",
+ " #print(pvalue)\n",
+ " else:\n",
+ " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna(),df_result.iloc[:,2].dropna())\n",
+ " ax[idxc,idx].set_xlabel(f\"{s_col.split('_')[0]} Intensity\")\n",
+ " ax[idxc,idx].set_ylabel(f\"\")\n",
+ " ax[idxc,idx].set_title(f\"\")\n",
+ " temp = tic.MaxNLocator(3)\n",
+ " ax[idxc,idx].set_yticklabels(())\n",
+ " ax[idxc,idx].xaxis.set_major_locator(temp)\n",
+ " tot+=1\n",
+ " if pvalue < 0.001: # 0.05/30: #bonferoni correction\n",
+ " ax[idxc,idx].text(0.5, 0.8, '*',\n",
+ " horizontalalignment='center',\n",
+ " verticalalignment='center',\n",
+ " transform=ax[idxc,idx].transAxes)\n",
+ " ax[idxc,idx].set_xlim(-1500,7000)\n",
+ " ax[idxc,idx].spines['right'].set_visible(False)\n",
+ " ax[idxc,idx].spines['left'].set_visible(False)\n",
+ " ax[idxc,idx].spines['top'].set_visible(False)\n",
+ " ax[idxc,0].set_ylabel(f\"{d_slide[s_slide]}\")\n",
+ " ax[0,2].legend(title='$\\mu$m')\n",
+ " plt.subplots_adjust(wspace=.001,hspace=.001)\n",
+ " plt.suptitle(f\"Distance to {s_target.split('_')[0]}\")\n",
+ " plt.tight_layout()\n",
+ " fig.savefig(f'./{s_date}/IntensityvsDistance_25s_{s_target}_by_slide.png',dpi=200)\n",
+ " #break"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python3.9.5",
+ "language": "python",
+ "name": "python3.9.5"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/GateCellTypes.ipynb b/GateCellTypes.ipynb
new file mode 100755
index 0000000..bba7702
--- /dev/null
+++ b/GateCellTypes.ipynb
@@ -0,0 +1,573 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#load libraries\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import os\n",
+ "import copy\n",
+ "import seaborn as sns\n",
+ "import importlib\n",
+ "from matplotlib import cm\n",
+ "import matplotlib as mpl\n",
+ "mpl.rc('figure', max_open_warning = 0)\n",
+ "mpl.rcParams['pdf.fonttype'] = 42\n",
+ "mpl.rcParams['ps.fonttype'] = 42\n",
+ "mpl.rcParams['mathtext.fontset'] = 'custom'\n",
+ "mpl.rcParams['mathtext.it'] = 'Arial:italic'\n",
+ "mpl.rcParams['mathtext.rm'] = 'Arial'\n",
+ "codedir = os.getcwd()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#load cmif libraries\n",
+ "#os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF')\n",
+ "from mplex_image import visualize as viz, process, preprocess, gating"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "os.chdir(codedir)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Notes\n",
+ "\n",
+ "Use CD45 to gate immune cells (CD3 has more artifact).\n",
+ "\n",
+ "update 20200402: add SMT-Bx2-5 and HTA-33, simplified gating."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#set location of files\n",
+ "#load data\n",
+ "rootdir = f'{codedir}/paper_data'\n",
+ "# go to location of files\n",
+ "os.chdir(rootdir)\n",
+ "preprocess.cmif_mkdir(['GatingPlots'])\n",
+ "#os.listdir()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 3 define samples to work with/ image combos\n",
+ "ls_sample = ['20210402_SMT']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the manual-positive table of every sample and stack them with a single pd.concat (DataFrame.append was removed in pandas 2.0)\n",
+ "df_data = pd.concat(\n",
+ "    [pd.read_csv(f'{s_sample}_ManualPositive.csv',index_col=0) for s_sample in ls_sample])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_data.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "d_rename = {'CD4':'CD4_Ring','CD8':'CD8_Ring',\n",
+ " #'HER2':'HER2_Ring','ER':'ER_Nuclei'\n",
+ " }\n",
+ "df_data = df_data.rename(d_rename, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Specify Gating Strategy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#parameters\n",
+ "\n",
+ "# cell types\n",
+ "ls_endothelial = ['CD31']\n",
+ "ls_immune = ['CD45','CD68'] \n",
+ "ls_tumor = ['CK7','CK19','Ecad'] \n",
+ "ls_prolif = ['Ki67']\n",
+ "\n",
+ "#tcell/myeloid\n",
+ "s_tcell = 'CD45' \n",
+ "s_bcell = 'CD20'\n",
+ "s_myeloid = 'CD68'\n",
+ "ls_immune_functional = ['PD1','CD44','prolif'] # not in dataset: 'FoxP3_Nuclei','GRNZB_Nuclei',\n",
+ "\n",
+ "#luminal/basal/mesenchymal\n",
+ "ls_luminal = ['CK19','CK7'] # not in dataset 'CK8_Ring'\n",
+ "ls_basal = ['CK5','CK14'] \n",
+ "ls_mes = ['CD44', 'Vim'] \n",
+ "ls_tumor_plus = ['Ecad'] + ['Lum']\n",
+ "ls_stromal_function = ['Vim','aSMA','PDPN']\n",
+ "ls_tumor_prolif = ['PCNA','Ki67','pHH3'] \n",
+ "\n",
+ "#index of cell line samples (i.e. 100% tumor)\n",
+ "ls_cellline_index = []\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#custom gating\n",
+ "df_data = gating.main_celltypes(df_data,ls_endothelial,ls_immune,ls_tumor,ls_cellline_index)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_data.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#add normal liver\n",
+ "df_data.loc[(~df_data.loc[:,ls_luminal].any(axis=1) & df_data.loc[:,'Ecad'] & df_data.loc[:,'tumor']),'celltype'] = 'epithelial'\n",
+ "df_data.loc[df_data.celltype == 'epithelial','tumor'] = False\n",
+ "df_data.loc[df_data.celltype == 'epithelial','epithelial'] = True\n",
+ "df_data.loc[df_data.celltype != 'epithelial','epithelial'] = False\n",
+ "df_data.epithelial = df_data.epithelial.astype('bool')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "importlib.reload(gating)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Perform Gating"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%capture\n",
+ "#simple gating\n",
+ "df_data = gating.proliferation(df_data,ls_prolif)\n",
+ "df_data = gating.immune_types(df_data,s_myeloid,s_bcell,s_tcell)\n",
+ "df_data = gating.cell_prolif(df_data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%capture\n",
+ "#custom gating (skip)\n",
+ "'''\n",
+ "df_data = gating.immune_functional(df_data,ls_immune_functional)\n",
+ "df_data = gating.diff_hr_state(df_data,ls_luminal,ls_basal,ls_mes)\n",
+ "df_data = gating.celltype_gates(df_data,ls_tumor_prolif,s_new_name='TumorProlif',s_celltype='tumor')\n",
+ "#df_data = gating.celltype_gates(df_data,ls_tumor_plus,s_new_name='TumorDiffPlus',s_celltype='tumor')\n",
+ "df_data = gating.celltype_gates(df_data,ls_stromal_function,s_new_name='StromalType',s_celltype='stromal')\n",
+ "'''"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_data = gating.non_tumor(df_data)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Output Gating Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#check\n",
+ "ls_drop = ['ColI', 'ColIV', 'CD20', 'CD3', 'CD44', 'CK14',\n",
+ " 'CK5', 'ER', 'HER2', 'LamAC', 'PCNA', 'PD1', 'pHH3']\n",
+ "df_data.loc[:,df_data.dtypes=='object'].drop(ls_drop,axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#drop extra columns\n",
+ "df_gate = df_data.loc[:,df_data.dtypes!='bool'].drop(ls_drop,axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#handcrafted stromal populations (skip)\n",
+ "'''\n",
+ "d_rename_stroma = {'stromal_Vim_aSMA':'myofibroblast', 'stromal_aSMA':'myofibroblast', 'stromal___':'stromal', 'stromal_Vim':'fibroblast',\n",
+ " 'stromal_PDPN_Vim_aSMA':'myofibroblast', 'stromal_PDPN_Vim':'fibroblast', 'stromal_PDPN':'lymphatic',\n",
+ " 'stromal_PDPN_aSMA':'myofibroblast'}\n",
+ "df_gate.NonTumor = df_gate.NonTumor.replace(d_rename_stroma)\n",
+ "df_gate['FinalCell'] = df_gate.NonTumor.fillna(df_gate.CellProlif).fillna(df_gate.celltype)\n",
+ "df_gate.FinalCell = df_gate.FinalCell.replace({'tumor_nonprolif':'tumor','liver_nonprolif':'liver','liver_prolif':'liver'})\n",
+ "'''"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_gate.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s_out = '20210402_SMT'\n",
+ "if not os.path.exists(f'{s_out}_GatedPositiveCellNames.csv'):\n",
+ " print('saving new csv')\n",
+ " df_gate.to_csv(f'{s_out}_GatedPositiveCellNames.csv')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Plot\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#importlib.reload(viz)\n",
+ "s_out = '20210402_SMT'\n",
+ "f'{s_out}_GatedPositiveCellNames.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_data = pd.read_csv(f'{s_out}_GatedPositiveCellNames.csv',index_col=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#df_data['Stromal'] = df_data.StromalType.replace(d_rename_stroma)\n",
+ "#df_data['NonTumor'] = df_data.NonTumor.replace(d_rename_stroma)\n",
+ "#df_data['NonTumorFunc'] = df_data.NonTumorFunc.replace(d_rename_stroma)\n",
+ "#handcrafted stromal populations\n",
+ "#d_rename_stroma = {'stromal_Vim_aSMA':'myofibroblast', 'stromal_aSMA':'myofibroblast', 'stromal___':'stromal', 'stromal_Vim':'fibroblast',\n",
+ "# 'stromal_PDPN_Vim_aSMA':'myofibroblast', 'stromal_PDPN_Vim':'fibroblast', 'stromal_PDPN':'lymphatic',\n",
+ "# 'stromal_PDPN_aSMA':'myofibroblast'}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "(df_data.columns == 'FinalCell').any()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#combined cell type (run once)\n",
+ "if not (df_data.columns == 'FinalCell').any():\n",
+ " df_data.loc[df_data.celltype == 'tumor','FinalCell'] = df_data.loc[df_data.celltype == 'tumor','CellProlif']\n",
+ " df_data.loc[df_data.celltype != 'tumor','FinalCell'] = df_data.loc[df_data.celltype != 'tumor','celltype']\n",
+ " df_data.loc[df_data.celltype == 'immune','FinalCell'] = df_data.loc[df_data.celltype == 'immune','ImmuneType']\n",
+ "\n",
+ "#df_data.FinalCell.unique()\n",
+ "#df_data.to_csv(f'{s_out}_GatedPositiveCellNames.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ls_drop = df_data.loc[((df_data.index.str.contains('HTA')) & (df_data.FinalCell=='epithelial'))].index"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# relabel epithelial cells as stromal,\n",
+ "# then drop the HTA epithelial cells collected in ls_drop\n",
+ "df_data['FinalCell'] = df_data.FinalCell.replace({'epithelial':'stromal'})\n",
+ "df_data = df_data.drop(ls_drop)\n",
+ "df_data['countme'] = True\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%matplotlib inline\n",
+ "s_grouper='slide_scene'\n",
+ "\n",
+ "#calculate proportions\n",
+ "for s_cell in df_data.columns[(df_data.dtypes=='object') & ~(df_data.columns.isin([s_grouper]))].tolist():\n",
+ " df_prop = viz.prop_positive(df_data,s_cell=s_cell,s_grouper=s_grouper)\n",
+ " # make annotations\n",
+ " df_annot=pd.DataFrame(data={'ID': df_prop.index.tolist()},index=df_prop.index)\n",
+ " lut = dict(zip(sorted(df_annot.ID.unique()),cm.tab10.colors))\n",
+ " g, df_plot_less = viz.prop_clustermap(df_prop,df_annot,i_thresh =.01,lut=lut)\n",
+ " g.savefig(f'./GatingPlots/{s_cell}_clustermap.png',dpi=150)\n",
+ " plt.close()\n",
+ " fig = viz.prop_barplot(df_plot_less,s_cell,colormap=\"Spectral\")\n",
+ " fig.savefig(f'./GatingPlots/{s_cell}_bar.png',dpi=200)\n",
+ " break"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#group by tissue: exclude scenes while slide_scene still holds '<slide>_sceneNNN' (after collapsing to the slide name the isin filter can never match)\n",
+ "df_data_select = df_data.loc[~df_data.slide_scene.isin(['HTA-33_scene001','SMTBx1-16_scene001']),:].assign(slide_scene=lambda df_sel: df_sel.slide_scene.str.split('_').str[0])\n",
+ "df_data['slide_scene'] = [item.split('_')[0] for item in df_data.slide_scene]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#by tissue\n",
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')\n",
+ "s_grouper='slide_scene'\n",
+ "mpl.rcParams['pdf.fonttype'] = 42\n",
+ "mpl.rcParams['ps.fonttype'] = 42\n",
+ "\n",
+ "#calculate proportions\n",
+ "for s_cell in df_data.columns[(df_data.dtypes=='object') & ~(df_data.columns.isin([s_grouper]))].tolist():\n",
+ " df_prop = viz.prop_positive(df_data_select,s_cell=s_cell,s_grouper=s_grouper)\n",
+ " # make annotations\n",
+ " df_prop.to_csv(f'ManualGating_SMT_proportions_{s_cell}.csv')\n",
+ " df_annot=pd.DataFrame(data={'ID': df_prop.index.tolist()},index=df_prop.index)\n",
+ " lut = dict(zip(sorted(df_annot.ID.unique()),cm.tab10.colors))\n",
+ " g, df_plot_less = viz.prop_clustermap(df_prop,df_annot,i_thresh =.001,lut=lut)\n",
+ " g.savefig(f'./GatingPlots/{s_cell}_clustermap_tissue.pdf',dpi=150)\n",
+ " plt.close()\n",
+ " if df_plot_less.shape[1] < 8:\n",
+ " cmap = \"Spectral\"\n",
+ " elif df_plot_less.shape[1] < 11:\n",
+ " cmap = \"Paired\"\n",
+ " else:\n",
+ " cmap = \"tab20\"\n",
+ " fig = viz.prop_barplot(df_plot_less,s_cell,colormap=cmap)\n",
+ " fig.savefig(f'./GatingPlots/{s_cell}_bar_tissue.pdf',dpi=200)\n",
+ " break"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s_date = '20210402'\n",
+ "d_crop = {'SMTBx2-5_scene001': (2000,9000), # scene -> (x, y) exclusive lower bounds of a 2500 x 2500 window on DAPI_X / DAPI_Y\n",
+ "          'SMTBx3_scene004': (20000,16000), # NOTE(review): the Normalize_Bx2-4 notes list this ROI as 20900-15494 - confirm\n",
+ "          'HTA-33_scene002': (3271, 607),\n",
+ "          'SMTBx1-16_scene003': (2440,220),\n",
+ "         }\n",
+ "ls_crop = [] # one cropped frame per scene, concatenated once below (DataFrame.append was removed in pandas 2.0)\n",
+ "for s_tissue, tu_crop in d_crop.items():\n",
+ "    df_scene = df_data.loc[df_data.index.str.contains(s_tissue)]\n",
+ "    ls_crop.append(df_data.loc[df_scene.loc[((df_scene.DAPI_X > tu_crop[0]) & (df_scene.DAPI_X < tu_crop[0]+2500)) & (df_scene.DAPI_Y > tu_crop[1]) & (df_scene.DAPI_Y < tu_crop[1]+2500)].index])\n",
+ "df_result = pd.concat(ls_crop)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#by tissue\n",
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')\n",
+ "s_grouper='slide_scene'\n",
+ "mpl.rcParams['pdf.fonttype'] = 42\n",
+ "mpl.rcParams['ps.fonttype'] = 42\n",
+ "d_rename = {'HTA-33':'Bx4', 'SMTBx1-16':'Bx1', 'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3'}\n",
+ "\n",
+ "#calculate proportions\n",
+ "for s_cell in df_data.columns[(df_data.dtypes=='object') & ~(df_data.columns.isin([s_grouper]))].tolist():\n",
+ " df_prop = viz.prop_positive(df_result,s_cell=s_cell,s_grouper=s_grouper)\n",
+ " # make annotations\n",
+ " #df_prop.to_csv(f'ManualGating_SMT101_proportions_{s_cell}.csv')\n",
+ " df_annot=pd.DataFrame(data={'ID': df_prop.index.tolist()},index=df_prop.index)\n",
+ " lut = dict(zip(sorted(df_annot.ID.unique()),cm.tab10.colors))\n",
+ " g, df_plot_less = viz.prop_clustermap(df_prop,df_annot,i_thresh =.001,lut=lut)\n",
+ " g.savefig(f'./GatingPlots/{s_cell}_clustermap_tissue3.pdf',dpi=150)\n",
+ " plt.close()\n",
+ " if df_plot_less.shape[1] < 8:\n",
+ " cmap = \"Spectral\"\n",
+ " elif df_plot_less.shape[1] < 11:\n",
+ " cmap = \"Paired\"\n",
+ " else:\n",
+ " cmap = \"tab20\"\n",
+ " fig = viz.prop_barplot(df_plot_less.rename(d_rename),s_cell,colormap=cmap)\n",
+ " fig.set_size_inches(4.5, 2.3)\n",
+ " ax_list = fig.axes\n",
+ " ax_list[0].set_ylabel('')\n",
+ " ax_list[0].set_xlabel('Fraction of Cells')\n",
+ " ax_list[0].set_title('')\n",
+ " fig.suptitle('Gating Composition: Biopsies',x=0.5,y=0.9,fontsize=14)\n",
+ " plt.tight_layout()\n",
+ " fig.savefig(f'./GatingPlots/{s_cell}_bar_tissue3.png',dpi=200)\n",
+ " #fig.savefig(f'./{s_date}/{s_cell}_bar_tissue3.pdf',dpi=200)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s_date"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python3.9.5",
+ "language": "python",
+ "name": "python3.9.5"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Normalize_Bx2-4.ipynb b/Normalize_Bx2-4.ipynb
new file mode 100755
index 0000000..45a5f00
--- /dev/null
+++ b/Normalize_Bx2-4.ipynb
@@ -0,0 +1,1198 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#load libraries\n",
+ "\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import numpy as np\n",
+ "import os\n",
+ "import copy\n",
+ "import seaborn as sns\n",
+ "import importlib\n",
+ "from scipy.signal import argrelmax, find_peaks, peak_widths\n",
+ "import scanpy as sc\n",
+ "from sklearn.cluster import KMeans\n",
+ "from sklearn.preprocessing import scale, minmax_scale\n",
+ "from sklearn.metrics import silhouette_score\n",
+ "import matplotlib as mpl\n",
+ "mpl.rc('figure', max_open_warning = 0)\n",
+ "mpl.rcParams['pdf.fonttype'] = 42\n",
+ "mpl.rcParams['ps.fonttype'] = 42\n",
+ "mpl.rcParams['mathtext.it'] = 'Arial:italic'\n",
+ "mpl.rcParams['mathtext.rm'] = 'Arial'\n",
+ "codedir = os.getcwd()\n",
+ "#load cmif libraries\n",
+ "#os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF')\n",
+ "from mplex_image import visualize as viz, process, preprocess, normalize"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "os.chdir(codedir)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "np.random.seed(222)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Table of contents \n",
+ "1. [Load Data](#load)\n",
+ "2. [Normalize](#norm)\n",
+ "3. [Visualize Normalization](#normviz)\n",
+ "4. [leiden for cell typing](#clusterlei)\n",
+ "5. [Leiden cluster](#clust1)\n",
+ "\n",
+ "\n",
+ "note:\n",
+ "\n",
+ "    Could you make the composite fraction bar graph for only the following regions?\n",
+ "\n",
+ " Bx2: SMTBx2-5-Scene-001_ROI1-2000-9000-2500-2500\n",
+ " Bx3: SMTBx3-Scene-004_ROI2-20900-15494-2500-2500\n",
+ " Bx4: HTA-33-Scene-002_ROI1-3271-607-2500-2500\n",
+ "\n",
+ "    If possible, also include it for Bx1:\n",
+ " Bx: SMTBx1-Scene-003_ROI1-2440-220-2500-2500\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#load data\n",
+ "os.chdir(f'{codedir}/paper_data')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s_date = '20210402'\n",
+ "# dated output folder (relative to the current working directory); exist_ok keeps the cell re-runnable\n",
+ "os.makedirs(s_date, exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Load Data \n",
+ "\n",
+ "[contents](#contents)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "os.chdir(f'{codedir}/paper_data')\n",
+ "df_file = pd.DataFrame(index=os.listdir())\n",
+ "df_file = df_file[df_file.index.str.contains('FilteredMeanIntensity_DAPI')].copy()\n",
+ "df_file['tissue'] = [item.split('_')[1] for item in df_file.index]\n",
+ "df_file['dapi'] = ['DAPI' + item.split('y_DAPI')[1].split('.')[0] for item in df_file.index]\n",
+ "ls_sample = df_file.tissue.tolist()\n",
+ "d_dapi = dict(zip(df_file.tissue.tolist(),df_file.dapi.tolist()))\n",
+ "d_dapi.update({'JE-TMA-60': 'DAPI10_DAPI2'})  # force the DAPI10_DAPI2 file for JE-TMA-60\n",
+ "# gather the per-sample tables in lists and concatenate once after the loop:\n",
+ "# DataFrame.append re-copied the accumulated frame on every call and was removed in pandas 2.0\n",
+ "ls_mi, ls_xy, ls_edge = [], [], []\n",
+ "for s_sample in sorted(set(ls_sample)):\n",
+ "    print(f'loading {s_sample}')\n",
+ "    ls_mi.append(pd.read_csv(f'{codedir}/paper_data/features_{s_sample}_FilteredMeanIntensity_{d_dapi[s_sample]}.csv', index_col=0))\n",
+ "    ls_xy.append(pd.read_csv(f'{codedir}/paper_data/features_{s_sample}_CentroidXY.csv',index_col=0))\n",
+ "    if os.path.exists(f'{codedir}/paper_data/features_{s_sample}_EdgeCells153pixels_CentroidXY.csv'):  # edge-cell table is optional\n",
+ "        ls_edge.append(pd.read_csv(f'{codedir}/paper_data/features_{s_sample}_EdgeCells153pixels_CentroidXY.csv',index_col=0))\n",
+ "# pd.concat raises on an empty list, so fall back to the empty frame the original accumulators started from\n",
+ "df_mi, df_xy, df_edge = [pd.concat(ls_df) if ls_df else pd.DataFrame() for ls_df in (ls_mi, ls_xy, ls_edge)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#sorted(df_mi.columns[df_mi[~df_mi.index.str.contains('JE-TMA-60')].isna().sum() != 0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# marker columns carried forward (column order of df_mi follows this list)\n",
+ "# left out: 'CD44_nucadj2', 'ER_nuclei25', 'HER2_cytoplasm', 'PgR_nuclei', 'Vim_nucadj2';\n",
+ "# CD8R bad, 'gH2AX_nuclei' in R11 Bx3 not included\n",
+ "ls_marker = [\n",
+ "    'AR_nuclei', 'CD20_perinuc5', 'CD31_perinuc5', 'CD3_perinuc5', 'CD44_perinuc5',\n",
+ "    'CD45_perinuc5', 'CD4_perinuc5', 'CD68_perinuc5', 'CD8_perinuc5', 'CK14_cytoplasm',\n",
+ "    'CK17_cytoplasm', 'CK19_cytoplasm', 'CK5_cytoplasm', 'CK7_cytoplasm', 'CK8_cytoplasm',\n",
+ "    'ColI_perinuc5', 'ColIV_perinuc5', 'CoxIV_perinuc5', 'EGFR_cytoplasm', 'ER_nuclei',\n",
+ "    'Ecad_cytoplasm', 'FoxP3_nuclei', 'GRNZB_nuclei', 'H3K27_nuclei', 'H3K4_nuclei',\n",
+ "    'HER2_cellmem25', 'Ki67_nuclei', 'LamAC_nuclei', 'PCNA_nuclei', 'PD1_perinuc5',\n",
+ "    'PDPN_perinuc5', 'DAPI2_nuclei', 'Vim_perinuc5', 'aSMA_perinuc5', 'pHH3_nuclei',\n",
+ "    'pRB_nuclei', 'pS6RP_perinuc5', 'slide_scene',\n",
+ "]\n",
+ "\n",
+ "# keep only the selected columns (raises KeyError if any is missing)\n",
+ "df_mi = df_mi[ls_marker]\n",
+ "\n",
+ "# batch = text before the first underscore of each row label\n",
+ "df_mi['batch'] = [s_cell.split('_')[0] for s_cell in df_mi.index]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Deal with JE-TMA-60"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# markers in JE-TMA-60\n",
+ "#'JE-TMA-60_scene06', 'JE-TMA-60_scene08', 'JE-TMA-60_scene09', 'JE-TMA-60_scene10', 'JE-TMA-60_scene11', 'JE-TMA-60_scene13'\n",
+ "# R5 is CK17.PDPN.CD45.FoxP3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s_tma60 = f'{codedir}/paper_data/features_JE-TMA-60_FilteredMeanIntensity'  # shared path stem of the JE-TMA-60 tables\n",
+ "df_R5, df_R4 = [pd.read_csv(f'{s_tma60}_{s_dapi}_DAPI2.csv',index_col=0) for s_dapi in ('DAPI5','DAPI4')]\n",
+ "df_R10 = df_mi.loc[df_mi.batch.eq('JE-TMA-60')]  # JE-TMA-60 rows already present in df_mi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ls_scene = set(df_R10.slide_scene)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ls_na = {s_cell.split('_cell')[0] for s_cell in df_R5.index} - {s_cell.split('_cell')[0] for s_cell in df_R10.index}  # scenes in df_R5 but not in df_R10"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#select markers, scenes for normalization (based on JE-TMA-60 tissue loss)\n",
+ "ls_pos = ['HER2_cellmem25','CK19_cytoplasm','CK7_cytoplasm','CK8_cytoplasm','Ecad_cytoplasm','ER_nuclei','Ki67_nuclei','LamAC_nuclei',\n",
+ "          'PCNA_nuclei','pHH3_nuclei','Vim_perinuc5','DAPI2_nuclei','H3K27_nuclei','H3K4_nuclei', 'pRB_nuclei','pS6RP_perinuc5',\n",
+ "          'CoxIV_perinuc5','EGFR_cytoplasm']\n",
+ "ls_R5 = ['CK17_cytoplasm','PDPN_perinuc5','CD45_perinuc5','FoxP3_nuclei'] #\n",
+ "ls_R4 = ['pHH3_nuclei','CK14_cytoplasm','Ki67_nuclei','CK19_cytoplasm','CK5_cytoplasm','HER2_cellmem25',\n",
+ "         'Ecad_cytoplasm', 'ER_nuclei','CD44_perinuc5', 'PCNA_nuclei','aSMA_perinuc5','CD3_perinuc5','EGFR_cytoplasm']\n",
+ "ls_bad = ['CD20_perinuc5', 'CD31_perinuc5', 'CD4_perinuc5', 'CD68_perinuc5', 'CD8_perinuc5','PD1_perinuc5',\n",
+ "          'ColI_perinuc5', 'ColIV_perinuc5'] #'CK7_cytoplasm', #'LamAC_nuclei',\n",
+ "#ls_good = ['CK7_cytoplasm','Vim_perinuc5','LamAC_nuclei']\n",
+ "\n",
+ "#R4: drop the JE-TMA-60 rows, then add them back from the DAPI4 table (ls_R4 columns only; other columns become NaN)\n",
+ "df = df_mi[df_mi.batch!='JE-TMA-60']\n",
+ "df = pd.concat([df, df_R4.loc[:,ls_R4]])  # pd.concat replaces DataFrame.append (removed in pandas 2.0)\n",
+ "#R5: for cells present in both tables, fill the ls_R5 markers from the DAPI5 table\n",
+ "ls_index = df_R5.loc[df_R5.index.isin(df_R4.index)].index\n",
+ "df.loc[ls_index,ls_R5] = df_R5.loc[ls_index,ls_R5]\n",
+ "\n",
+ "#fill R6-8: cells in the df_R10 scenes that are also in the DAPI4 table get the ls_pos markers from df_R10\n",
+ "ls_index = df_mi.loc[(df_mi.slide_scene.isin(ls_scene)) & (df_mi.index.isin(df_R4.index))].index\n",
+ "df.loc[ls_index,ls_pos] = df_R10.loc[ls_index,ls_pos]\n",
+ "\n",
+ "# rebuild the label columns from the row index (the rows added from df_R4 have none)\n",
+ "df['batch'] = [item.split('_')[0] for item in df.index]\n",
+ "#df['scene'] = [item.split('_')[1] for item in df.index]\n",
+ "df['slide_scene'] = [item.split('_cell')[0] for item in df.index]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## filter edge cells"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#filter out unwanted cells\n",
+ "# each entry maps a slide_scene to a boolean mask over df_xy; cells of that scene where the mask is True are removed\n",
+ "d_filter = {#41 (not used)\n",
+ "    'JE-TMA-41_scene01':(df_xy.DAPI_Y > 5000),'JE-TMA-41_scene03':(df_xy.DAPI_Y > 5000),\n",
+ "    'JE-TMA-41_scene04':(df_xy.DAPI_Y < 1500),'JE-TMA-41_scene05':(df_xy.DAPI_Y > 5000),\n",
+ "    'JE-TMA-41_scene06':(df_xy.DAPI_Y < 1500),'JE-TMA-41_scene08':(df_xy.DAPI_Y < 1500),\n",
+ "    'JE-TMA-41_scene09':(df_xy.DAPI_Y > 5000),'JE-TMA-41_scene11':(df_xy.DAPI_Y < 1500),\n",
+ "    #43\n",
+ "    'JE-TMA-43_scene09':(df_xy.DAPI_Y < 1200),'JE-TMA-43_scene14':(df_xy.DAPI_Y < 1200),\n",
+ "    #60\n",
+ "    'JE-TMA-60_scene02':(df_xy.DAPI_X < 1500),'JE-TMA-60_scene05':(df_xy.DAPI_X < 1500),\n",
+ "    'JE-TMA-60_scene11':(df_xy.DAPI_Y < 1500),'JE-TMA-60_scene14':(df_xy.DAPI_X < 1500),\n",
+ "    'JE-TMA-60_scene06':(df_xy.DAPI_Y < 1500),'JE-TMA-60_scene08':(df_xy.DAPI_Y > 5000),\n",
+ "    'JE-TMA-60_scene10':(df_xy.DAPI_Y < 1500),\n",
+ "    #62\n",
+ "    'JE-TMA-62_scene01':(df_xy.DAPI_Y > 5000),\n",
+ "    'JE-TMA-62_scene02':(df_xy.DAPI_X > 5000),'JE-TMA-62_scene03':(df_xy.DAPI_X < 1000),\n",
+ "    'JE-TMA-62_scene04':(df_xy.DAPI_Y < 1500),'JE-TMA-62_scene06':(df_xy.DAPI_X < 1000),\n",
+ "    'JE-TMA-62_scene08':(df_xy.DAPI_Y > 5000),'JE-TMA-62_scene10':(df_xy.DAPI_Y < 1500),\n",
+ "    #'SMTBx1-16_scene001':(df_xy.DAPI_Y > 1), #keep scene 1 for manual thresholding\n",
+ "    'SMTBx2-3_scene002':(df_xy.DAPI_Y > 5000),'SMTBx3_scene004':(df_xy.DAPI_X <11000),\n",
+ "    'SMTBx3_scene005':(df_xy.DAPI_X > 0),'SMTBx4-3_scene001':(df_xy.DAPI_Y < 2400),\n",
+ "    'SMTBx2-5_scene002':(df_xy.DAPI_Y > 5000),'HTA-33_scene003':(df_xy.DAPI_Y > 9000)}\n",
+ "# second condition for a scene that already has an entry in d_filter (a dict cannot hold the key twice)\n",
+ "d_filter2 = {'JE-TMA-60_scene02':(df_xy.DAPI_Y > 4500)}\n",
+ "ls_filter_all = []\n",
+ "for s_scene, filtercon in list(d_filter.items()) + list(d_filter2.items()):\n",
+ "    ls_filter = df_xy[(df_xy.slide_scene==s_scene) & filtercon].index.tolist()\n",
+ "    ls_filter_all.extend(ls_filter)\n",
+ "#filter edge\n",
+ "ls_filter_all.extend(df_edge.index.tolist())\n",
+ "df_filter_mi = df[(~df.index.isin(ls_filter_all))].copy()  # copy: later cells add columns to this frame\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# put every retained cell in a single dummy cluster and plot it per scene\n",
+ "# NOTE(review): presumably a visual check of the spatial filters above - confirm against viz.plot_clusters\n",
+ "df_cluster = df_filter_mi.loc[:,['slide_scene']].copy()\n",
+ "df_cluster['cluster'] = 1\n",
+ "importlib.reload(viz)  # importlib is imported in the first cell; reload picks up edits to mplex_image.visualize\n",
+ "%matplotlib inline\n",
+ "viz.plot_clusters(df_cluster,df_xy,s_num='few')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#match controls to biopsies\n",
+ "import warnings  # no longer used in this cell; kept in case later cells reference `warnings`\n",
+ "d_replace = {'BC44290-146': 'JE-TMA-41',\n",
+ "             'SMTBx2-3': 'JE-TMA-41',\n",
+ "             'SMTBx2-5':'JE-TMA-43',\n",
+ "             'SMTBx3':'JE-TMA-60',\n",
+ "             'SMTBx4-3':'JE-TMA-62'}\n",
+ "# assign() returns a new frame, so no SettingWithCopyWarning is raised and warnings need not be silenced globally\n",
+ "df_filter_mi = df_filter_mi.assign(batch=df_filter_mi.batch.replace(d_replace))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#standardize the scenes\n",
+ "# relabel scene12 -> scene13 and scene13 -> scene14 on JE-TMA-41 and JE-TMA-62; result goes in a new 'scene' column\n",
+ "d_replace = {\n",
+ "    'JE-TMA-41_scene13':'JE-TMA-41_scene14', 'JE-TMA-41_scene12':'JE-TMA-41_scene13',\n",
+ "    'JE-TMA-62_scene13':'JE-TMA-62_scene14', 'JE-TMA-62_scene12':'JE-TMA-62_scene13'}\n",
+ "df_filter_mi['scene'] = df_filter_mi['slide_scene'].replace(d_replace)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "df_filter_mi.merge(df_xy.loc[:,['DAPI_X', 'DAPI_Y', 'nuclei_area', 'nuclei_eccentricity']],left_index=True,right_index=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_out = df_filter_mi.merge(df_xy[['DAPI_X', 'DAPI_Y', 'nuclei_area', 'nuclei_eccentricity']],left_index=True,right_index=True)  # add centroid/shape columns\n",
+ "len(df_out)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# export history\n",
+ "#   2-22: the NAs were filled with random gaussian data\n",
+ "#   2-23: contains NAs\n",
+ "#   0302: include scene 1 Bx1\n",
+ "#   0318: just Bx2 - 4, (Bx2-5)\n",
+ "#   20210324: has HTA9-1-33\n",
+ "s_out = '20210324_SMTBx1-4_JE-TMA-43_60_62_FilteredMeanIntensity.csv'\n",
+ "if not os.path.exists(s_out):  # write once; an existing export is left untouched\n",
+ "    print('saving csv'); df_out.to_csv(s_out)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# export history\n",
+ "#   2-22: the NAs were filled with random gaussian data\n",
+ "#   2-23: contains NAs\n",
+ "#   0302: include scene 1 Bx1\n",
+ "#   0318: just Bx2 - 4, (Bx2-5)\n",
+ "s_out = '20210320_SMTBx2-4_JE-TMA-43_60_62_FilteredMeanIntensity.csv'\n",
+ "if not os.path.exists(s_out):  # write once; an existing export is left untouched\n",
+ "    print('saving csv'); df_out.to_csv(s_out)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Normalization \n",
+ "\n",
+ "use ComBat.\n",
+ "\n",
+ "[contents](#contents)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_mi = pd.read_csv('20210320_SMTBx2-4_JE-TMA-43_60_62_FilteredMeanIntensity.csv',index_col=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_mi.scene.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ls_pos = ['HER2_cellmem25','CK19_cytoplasm','CK7_cytoplasm','CK8_cytoplasm','Ecad_cytoplasm','ER_nuclei','Ki67_nuclei','LamAC_nuclei',\n",
+ " 'PCNA_nuclei','pHH3_nuclei','Vim_perinuc5','DAPI2_nuclei','H3K27_nuclei','H3K4_nuclei', 'pRB_nuclei','pS6RP_perinuc5',\n",
+ " 'CoxIV_perinuc5','EGFR_cytoplasm']\n",
+ "ls_R5 = ['CK17_cytoplasm','PDPN_perinuc5','CD45_perinuc5','FoxP3_nuclei'] #\n",
+ "ls_R4 = ['pHH3_nuclei','CK14_cytoplasm','Ki67_nuclei','CK19_cytoplasm','CK5_cytoplasm','HER2_cellmem25',\n",
+ " 'Ecad_cytoplasm', 'ER_nuclei','CD44_perinuc5', 'PCNA_nuclei','aSMA_perinuc5','CD3_perinuc5','EGFR_cytoplasm']\n",
+ "ls_bad = ['CD20_perinuc5', 'CD31_perinuc5', 'CD4_perinuc5', 'CD68_perinuc5', 'CD8_perinuc5','PD1_perinuc5',\n",
+ " 'ColI_perinuc5', 'ColIV_perinuc5']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#select normalization scenes\n",
+ "ls_R10_scene = ['scene06', 'scene08', 'scene09', 'scene10', 'scene11', 'scene13']\n",
+ "ls_R10 = ['HER2_cellmem25', 'CK19_cytoplasm', 'CK7_cytoplasm', 'Ecad_cytoplasm', 'ER_nuclei', 'Ki67_nuclei', 'LamAC_nuclei',\n",
+ " 'PCNA_nuclei','pHH3_nuclei', 'Vim_perinuc5','CD44_perinuc5','DAPI2_nuclei', #adding following:\n",
+ " 'CK8_cytoplasm','CoxIV_perinuc5', 'EGFR_cytoplasm', 'H3K27_nuclei', 'H3K4_nuclei', 'pRB_nuclei', 'pS6RP_perinuc5']\n",
+ "#note: CK17 may have quenching artifact; PDPN not good in Bx1, so just CD45 important\n",
+ "#'CK17_cytoplasm','PDPN_perinuc5', #'FoxP3_nuclei' not in full set\n",
+ "ls_R5 = ['PDPN_perinuc5','CD45_perinuc5','FoxP3_nuclei', 'aSMA_perinuc5','CD3_perinuc5'] # aSMA because N breast, scene 01 better than 07 for immune\n",
+ "ls_R5_scene = ['scene01','scene03','scene04']\n",
+ "#old ls_R4 = ['pHH3_nuclei','CK14_cytoplasm','Ki67_nuclei','CK19_cytoplasm','CK5_cytoplasm','HER2_cellmem25',\n",
+ "# 'Ecad_cytoplasm', 'ER_nuclei','CD44_perinuc5', 'PCNA_nuclei','aSMA_perinuc5','CD3_perinuc5','DAPI2_nuclei']\n",
+ "#can scene 7 be good control for CD3 and CK14 and CK5?, yes. R1 doen't add much\n",
+ "ls_R4 = [ 'CK14_cytoplasm', 'CK5_cytoplasm','CK17_cytoplasm'] #'CD3_perinuc5',\n",
+ "ls_R4_scene = ['scene02','scene07']\n",
+ "ls_bad = ['CD20_perinuc5', 'CD31_perinuc5', 'CD4_perinuc5', 'CD68_perinuc5', 'CD8_perinuc5','PD1_perinuc5']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#df_mi = df_mi.loc[df_mi.batch!='JE-TMA-60']\n",
+ "# move the '<slide>_<scene>' label into slide_scene and keep only the part after the underscore in scene;\n",
+ "# only rows still carrying an underscore are touched, so re-running the cell is a no-op instead of an IndexError\n",
+ "b_full = df_mi.scene.str.contains('_', regex=False, na=False)\n",
+ "df_mi.loc[b_full,'slide_scene'] = df_mi.loc[b_full,'scene']\n",
+ "df_mi.loc[b_full,'scene'] = df_mi.loc[b_full,'scene'].str.split('_').str[1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# empty frame indexed like df_mi; the ComBat loop below left-merges each normalized marker group into it\n",
+ "#dropped 60 (alternative, disabled: it was immediately overwritten by the line below)\n",
+ "#df_norm_all=pd.DataFrame(index=df_mi.dropna().index)\n",
+ "#not dropped 60\n",
+ "df_norm_all=pd.DataFrame(index=df_mi.index)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ComBat is fitted per marker group on that group's control (JE-TMA) scenes, then applied to all cells\n",
+ "# outside batch SMTBx1-16. Groups run in the order R4, R5, R10, so the loop variables left behind\n",
+ "# (s_type, ls_pos, ls_scene, b_control, data, batch, df_norm) are the R10 ones.\n",
+ "d_group = {'R4': (ls_R4, ls_R4_scene),     #2 scenes that are good until R4, and R1-4 markers\n",
+ "           'R5': (ls_R5, ls_R5_scene),     #3 scenes that are good until R5, and R5 markers\n",
+ "           'R10': (ls_R10, ls_R10_scene)}  #1 scenes that are good through round 10, markers positive on those scenes\n",
+ "b_transform = df_mi.batch!='SMTBx1-16'\n",
+ "for s_type, (ls_pos, ls_scene) in d_group.items():\n",
+ "    #fit: control cells from the selected scenes with no missing value in this marker group\n",
+ "    b_complete = df_mi.loc[:,ls_pos].notna().all(axis=1)\n",
+ "    b_control = df_mi.index.str.contains('JE-TMA') & df_mi.scene.isin(ls_scene) & b_complete\n",
+ "    data = df_mi.loc[b_control,ls_pos].T\n",
+ "    batch = df_mi.loc[b_control,'batch']\n",
+ "    gamma_star, delta_star, stand_mean, var_pooled = normalize.combat_fit(data, batch)\n",
+ "    #transform (markers as rows, cells as columns)\n",
+ "    data = df_mi.loc[b_transform,ls_pos].T\n",
+ "    batch = df_mi.loc[b_transform,'batch']\n",
+ "    bayesdata = normalize.combat_transform(data,batch,gamma_star,delta_star,stand_mean, var_pooled)\n",
+ "    df_norm = bayesdata.T\n",
+ "    # left merge keeps every row of df_norm_all; cells that were not transformed get NaN\n",
+ "    df_norm_all = df_norm_all.merge(df_norm,left_index=True,right_index=True,how='left')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_norm_all.tail()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# run after #1, 2 and 3\n",
+ "df_norm_all = df_norm_all.merge(df_mi.loc[:,['batch','DAPI_X','DAPI_Y','scene','nuclei_area','nuclei_eccentricity']],left_index=True,right_index=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#old check\n",
+ "# merge the metadata only once, so re-running the cell does not create _x/_y duplicates and break the drop below\n",
+ "if 'batch' not in df_norm.columns:\n",
+ "    df_norm = df_norm.merge(df_mi.loc[:,['batch','DAPI_X','DAPI_Y','scene','nuclei_area','nuclei_eccentricity']],left_index=True,right_index=True)\n",
+ "#df_mi.loc[b_control,:].drop(['DAPI_X','DAPI_Y'],axis=1).groupby('batch').mean()\n",
+ "#df_mi[df_mi.index.str.contains('JE-TMA')].drop(['DAPI_X','DAPI_Y'],axis=1).groupby('batch').std()\n",
+ "#check: per-batch means over the control cells of the last fitted group\n",
+ "# numeric_only=True skips the string 'scene' column, which makes mean() raise in pandas >= 2\n",
+ "df_norm.loc[b_control,:].drop(['DAPI_X','DAPI_Y'],axis=1).groupby('batch').mean(numeric_only=True)\n",
+ "#df_norm[df_norm.index.str.contains('JE-TMA')].drop(['DAPI_X','DAPI_Y'],axis=1).groupby('batch').std()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#df_norm_all.to_csv('20210320_SMTBx2-4_JE-TMA-43_60_62_normalized.csv')\n",
+ "# the next section reads this file back, so write it when it is missing (an existing export is never overwritten)\n",
+ "if not os.path.exists('20210325_SMTBx2-4_JE-TMA-43_60_62_normalized.csv'):\n",
+ "    df_norm_all.to_csv('20210325_SMTBx2-4_JE-TMA-43_60_62_normalized.csv')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Umap Visualize Normalization \n",
+ "\n",
+ "[contents](#contents)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#s_sample = '20210320_SMTBx2-4_JE-TMA-43_60_62'\n",
+ "s_sample = '20210325_SMTBx2-4_JE-TMA-43_60_62'\n",
+ "# underscore-free names for the metadata columns, so the split below leaves them intact\n",
+ "d_short = {'nuclei_area':'area','nuclei_eccentricity':'eccentricity','DAPI_X':'DAPIX', 'DAPI_Y':\"DAPIY\"}\n",
+ "df_norm_all = pd.read_csv(f'{s_sample}_normalized.csv',index_col=0).rename(d_short,axis=1)\n",
+ "# keep only the text before the first underscore of every column name\n",
+ "df_norm_all.columns = [s_col.split('_')[0] for s_col in df_norm_all.columns]\n",
+ "# slide / scene / slide_scene labels are taken from the row index\n",
+ "df_norm_all['slide'] = [s_cell.split('_')[0] for s_cell in df_norm_all.index]\n",
+ "df_norm_all['scene'] = [s_cell.split('_')[1] for s_cell in df_norm_all.index]\n",
+ "df_norm_all['slide_scene'] = [s_cell.split('_cell')[0] for s_cell in df_norm_all.index]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "b_scene01 = df_norm_all.slide_scene.isin(['JE-TMA-43_scene01','JE-TMA-62_scene01'])  # rows to exclude\n",
+ "df_norm_all = df_norm_all.loc[~b_scene01]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# visualize\n",
+ "%matplotlib inline\n",
+ "s_type = 'w-60_no01'\n",
+ "#adata = sc.AnnData(df_norm_all.loc[:,df_norm_all.dtypes=='float64'].drop(['DAPIX','DAPIY'],axis=1)) \n",
+ "ls_drop = ['DAPIX','DAPIY','DAPI2','LamAC','pHH3','FoxP3','CoxIV',\n",
+ "           'H3K27','H3K4','pRB','pS6RP','aSMA','PDPN'] #aSMA, PDPN not well norm\n",
+ "# rows without any NaN, computed once and reused for the matrix and every obs column\n",
+ "df_complete = df_norm_all.dropna()\n",
+ "# float columns minus ls_drop form the AnnData matrix\n",
+ "adata = sc.AnnData(df_complete.loc[:,df_norm_all.dtypes=='float64'].drop(ls_drop,axis=1))\n",
+ "adata.obs['batch'] = df_complete.loc[:,'batch']\n",
+ "# the listed three-digit scene labels are collapsed into one 'Bx' label\n",
+ "adata.obs['scene'] = df_complete.loc[:,'scene'].replace({'scene001':'Bx', 'scene002':'Bx','scene003':'Bx', 'scene004':'Bx', 'scene005':'Bx'})\n",
+ "adata.obs['tissue'] = df_complete.loc[:,'slide']\n",
+ "# keep the unreduced matrix in .raw\n",
+ "adata.raw = adata\n",
+ "#reduce dimensionality (PCA)\n",
+ "sc.tl.pca(adata, svd_solver='auto')\n",
+ "#sc.pl.pca(adata)\n",
+ "sc.pl.pca_variance_ratio(adata, log=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# calculate neighbors\n",
+ "n_neighbors = 31\n",
+ "n_pcs=len(adata.var.index) - 1\n",
+ "results_file = f'{s_sample}_{n_neighbors}neighbors_{n_pcs}pcs_{len(adata.var.index)}markers.h5ad'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results_file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "d_celline = {'scene02':'HCC1143',\n",
+ " 'scene03':'HCC3153',\n",
+ " 'scene04':'N.Breast',\n",
+ " 'scene05':'T47D',\n",
+ " 'scene06':'T47D',\n",
+ " 'scene07':'Tonsil',\n",
+ " 'scene08':'BT474',\n",
+ " 'scene09':'BT474',\n",
+ " 'scene10':'AU565',\n",
+ " 'scene11':'AU565',\n",
+ " 'scene12':'MDAMB436',\n",
+ " 'scene13':'MDAMB436',\n",
+ " 'scene14':'MDAMB436'}\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# neighbors + umap: reuse the cached .h5ad when present, otherwise compute and cache it\n",
+ "if os.path.exists(results_file):\n",
+ "    adata = sc.read_h5ad(results_file)\n",
+ "    print('loading umap')\n",
+ "else:\n",
+ "    print('calculating umap')\n",
+ "    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)\n",
+ "    sc.tl.umap(adata)\n",
+ "    #save results (this branch is only reached when the file does not exist)\n",
+ "    adata.write(results_file)\n",
+ "\n",
+ "# label columns used by the overlays below\n",
+ "adata.obs['Tissue'] = adata.obs['tissue'].replace({'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3','SMTBx4-3':'Bx4'})\n",
+ "adata.obs['Subtype'] = adata.obs.scene.replace(d_celline)\n",
+ "\n",
+ "# one small umap per categorical label: (file-name prefix, obs column, panel title)\n",
+ "ls_overlay = [('UmapScene','scene','TMA Core'),\n",
+ "              ('UmapTissue','Tissue','Tissue'),\n",
+ "              ('UmapSubtype','Subtype','Subtype')]\n",
+ "for s_prefix, s_color, s_title in ls_overlay:\n",
+ "    fig,ax = plt.subplots(figsize=(3,2.5),dpi=600)\n",
+ "    figname = f'{s_prefix}_{s_type}_{n_pcs+1}markers.png'\n",
+ "    sc.pl.umap(adata, color=s_color,save=figname,title=s_title,ax=ax)\n",
+ "\n",
+ "#umap plot: one panel per marker, colour range clipped to the 1.5th-98.5th percentile\n",
+ "ls_marker = adata.var.index.tolist()\n",
+ "figname = f\"Umap_{s_type}_{n_pcs+1}markers.png\"\n",
+ "axes = sc.pl.umap(adata, color=ls_marker,wspace=.25,save=figname,vmin='p1.5',vmax='p98.5',ncols=3,show=False)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "#umap plot\n",
+ "ls_marker = adata.var.index.tolist()\n",
+ "figname = f\"Umap_{s_type}_{n_pcs+1}markers.png\"\n",
+ "fig = sc.pl.umap(adata, color=ls_marker,wspace=.25,vmin='p1.5',vmax='p98.5',ncols=3,show=False,return_fig=True)\n",
+ "ax_list = fig.axes\n",
+ "for ax in ax_list:\n",
+ " ax.set_title(ax.get_title(),fontsize=28)\n",
+ "fig.savefig(f'figures/{figname}',dpi=600)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## cluster leiden \n",
+ "\n",
+ "[contents](#contents)\n",
+ "\n",
+ "cluster on the markers that are normalized well"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "resolution = 0.45\n",
+ "results_file = f'{s_sample}_{n_neighbors}neighbors_{n_pcs}pcs_{len(adata.var.index)}markers_leiden{resolution}.h5ad'\n",
+ "# cluster, or load the cached clustering for this parameter combination\n",
+ "if not os.path.exists(results_file):\n",
+ "    sc.tl.leiden(adata,resolution=resolution)\n",
+ "else:\n",
+ "    adata = sc.read_h5ad(results_file)\n",
+ "    print('loading leiden') \n",
+ "fig,ax = plt.subplots(figsize=(3,2.5),dpi=600)\n",
+ "figname=f'leiden_{resolution}.png'\n",
+ "sc.pl.umap(adata, color='leiden',ax=ax,save=figname)\n",
+ "#seaborn clustermap\n",
+ "df_p = pd.DataFrame(data=adata.raw.X,index=adata.obs.index,columns=adata.var.index)\n",
+ "df_p['leiden'] = adata.obs['leiden']\n",
+ "# per-cluster marker means, computed once and reused for the heatmap and both orderings\n",
+ "df_mean = df_p.groupby('leiden').mean()\n",
+ "g = sns.clustermap(df_mean,z_score=1,figsize=(4,4),cmap='viridis',\n",
+ "                   vmin=-1.5,vmax=1.5) \n",
+ "#g.savefig(f'./figures/clustermap_leiden.png',dpi=200)\n",
+ "# reuse the clustermap's dendrogram order for the matrixplot columns (markers) and rows (clusters)\n",
+ "marker_genes = df_mean.iloc[:,g.dendrogram_col.reordered_ind].columns.tolist()\n",
+ "categories_order = df_mean.iloc[g.dendrogram_row.reordered_ind,:].index.tolist()\n",
+ "#scanpy matrixplot\n",
+ "fig,ax = plt.subplots(figsize=(5,5), dpi=200)\n",
+ "figname=f'Matrixplot_leiden_{resolution}.png'\n",
+ "sc.pl.matrixplot(adata, var_names=marker_genes, groupby=f'leiden',title='',categories_order=categories_order,\n",
+ "                 ax=ax,save=figname,standard_scale='var',colorbar_title='Relative\\nintensity',\n",
+ "                 #var_group_positions=[(3,23),(24,31),(32,42),(43,51)],\n",
+ "                 #var_group_labels=['tumor','T-cell','muscle\\n +AF','immune\\n+stroma'],\n",
+ "                 #var_group_rotation=0\n",
+ "                )\n",
+ "\n",
+ "#save (only when no cached file exists yet)\n",
+ "if not os.path.exists(results_file):\n",
+ "    adata.write(results_file)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Leiden barplots \n",
+ "\n",
+ "\n",
+ "[contents](#contents)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ls_order = [\n",
+ " 'Bx2','Bx3','Bx4',#'JE-TMA-43_scene01','JE-TMA-62_scene01',\n",
+ " 'JE-TMA-43_scene02', 'JE-TMA-62_scene02',\n",
+ " 'JE-TMA-43_scene03', 'JE-TMA-62_scene03', 'JE-TMA-43_scene04',\n",
+ " 'JE-TMA-62_scene04', 'JE-TMA-43_scene05', 'JE-TMA-62_scene05',\n",
+ " 'JE-TMA-43_scene06','JE-TMA-60_scene06', 'JE-TMA-62_scene06', 'JE-TMA-43_scene07',\n",
+ " 'JE-TMA-62_scene07', 'JE-TMA-43_scene08','JE-TMA-60_scene08', 'JE-TMA-62_scene08',\n",
+ " 'JE-TMA-43_scene09','JE-TMA-60_scene09', 'JE-TMA-62_scene09','JE-TMA-43_scene10', 'JE-TMA-62_scene10','JE-TMA-60_scene10',\n",
+ " 'JE-TMA-43_scene11', 'JE-TMA-60_scene11', 'JE-TMA-62_scene11', 'JE-TMA-43_scene13',\n",
+ " 'JE-TMA-62_scene12', 'JE-TMA-43_scene14','JE-TMA-60_scene13', 'JE-TMA-62_scene13'] "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ls_order_r = ls_order[::-1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#load original\n",
+ "'''\n",
+ "s_sample = '20210320_SMTBx2-4_JE-TMA-43_60_62'\n",
+ "n_neighbors = 30\n",
+ "n_pcs = 19\n",
+ "n_markers = n_pcs+1\n",
+ "resolution = 0.5\n",
+ "results_file = f'{s_sample}_{n_neighbors}neighbors_{n_pcs}pcs_{n_markers}markers_leiden{resolution}.h5ad'\n",
+ "adata1 = sc.read_h5ad(results_file) \n",
+ "\n",
+ "d_cluster = {'14': '14: Basal',\n",
+ "'5': '5: T cell',\n",
+ "'12': '12: T cell',\n",
+ "'10': '10: Myoepithelial',\n",
+ "'1': '1: Mesenchymal',\n",
+ "'16': '16: Prolif.',\n",
+ "'15': '15: Vim+ FB (Bx3)',\n",
+ "'11': '11: Vim+ FB (Bx4)',\n",
+ "'13': '13: Vim+ FB (Bx2)',\n",
+ "'7': '7: HER2++',\n",
+ "'9': '9: EGFR+ Basal',\n",
+ "'3': '3: HER2+',\n",
+ "'8': '8: HER2++, Ecad-',\n",
+ "'0': '0: ER+ (Bx4)',\n",
+ "'2': '2: ER+, PCNA+ ',\n",
+ "'4': '4: ER+, EGFR+ (Bx3)',\n",
+ "'6': '6: ER+ (Bx2)'}\n",
+ "d_clust_names = dict(zip([item[0] for item in d_cluster.items()],[item[1].split(': ')[1] for item in d_cluster.items()]))\n",
+ "'''"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#load\n",
+ "s_sample = '20210325_SMTBx2-4_JE-TMA-43_60_62'\n",
+ "n_neighbors = 31\n",
+ "n_pcs = 17\n",
+ "n_markers = n_pcs+1\n",
+ "resolution = 0.45\n",
+ "results_file = f'{s_sample}_{n_neighbors}neighbors_{n_pcs}pcs_{n_markers}markers_leiden{resolution}.h5ad'\n",
+ "adata1 = sc.read_h5ad(results_file) \n",
+ "print(results_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# hand-written annotation per leiden cluster id, kept separately for each resolution\n",
+ "if resolution == 0.5:\n",
+ "    d_cluster = {'14': '14: Basal','12': '12: T cell','16': '16: Prolif.','7': '7: ER+ (Bx2)','13': '13: Luminal (N.Breast)',\n",
+ "                 '1': '1: ER+ PCNA+ (T47D)','0': '0: ER+ (Bx4)','15': '15: ER+ CK8++ (Bx4)','4': '4: ER+, EGFR+ (Bx3)','18': '18: ER+, EGFR+ (Bx3)',\n",
+ "                 '17': '17: (Bx3)','10': '10: FB (Bx4)','11': '11: FB (Bx2)','3': '3: CD44+','9': '9: CD44+', '8': '8: EGFR+ Basal',\n",
+ "                 '5': '5: HER2++','6': '6: HER2+','2': '2: HER2++, Ecad-',}\n",
+ "elif resolution == 0.45:\n",
+ "    d_cluster = {'15':'15: Basal',\n",
+ "                 '12':'12: T cell',\n",
+ "                 '16': '16: prolif.',\n",
+ "                 '5':'5: ER+, EGFR+ (Bx3)',\n",
+ "                 '0':'0: ER+ (Bx4)',\n",
+ "                 '1':'1: ER+, PCNA+',\n",
+ "                 '7':'7: ER- (Bx2)',\n",
+ "                 '9':'9: ER+ (Bx2)',\n",
+ "                 '8':'8: EGFR+ Basal',\n",
+ "                 '4':'4: HER2+',\n",
+ "                 '3':'3: HER2+',\n",
+ "                 '6':'6: HER2+, Ecad-',\n",
+ "                 '2':'2: Mesenchymal',\n",
+ "                 '10':'10: Mesenchymal',\n",
+ "                 '14':'14: fibroblast',\n",
+ "                 '11':'11: fibroblast',\n",
+ "                 '13':'13: fibroblast'}\n",
+ "else:\n",
+ "    # fail loudly instead of leaving d_cluster undefined (or stale from an earlier run)\n",
+ "    raise ValueError(f'no cluster annotation defined for resolution {resolution}')\n",
+ "# cluster id -> label with the leading 'id: ' prefix removed\n",
+ "d_clust_names = {s_id: s_label.split(': ')[1] for s_id, s_label in d_cluster.items()}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "%matplotlib inline\n",
+ "mpl.rcParams['pdf.fonttype'] = 42\n",
+ "mpl.rcParams['ps.fonttype'] = 42\n",
+ "#sns.set(font_scale=1.19)\n",
+ "#seaborn clustermap\n",
+ "df_p = pd.DataFrame(data=adata1.raw.X,index=adata1.obs.index,columns=adata1.var.index)\n",
+ "df_p['leiden'] = adata1.obs['leiden']\n",
+ "# per-cluster marker means, computed once and reused for the heatmap and both orderings\n",
+ "df_mean = df_p.groupby('leiden').mean()\n",
+ "# rows are relabelled with the annotations in d_cluster for display only\n",
+ "g = sns.clustermap(df_mean.rename({'eccentricity':'eccen.'},axis=1).rename(d_cluster, axis=0),\n",
+ "                   z_score=1,figsize=(6.2,6),cmap='viridis',\n",
+ "                   vmin=-2,vmax=2,cbar_pos=(.05, .89, .10, .05),cbar_kws={'orientation': 'horizontal','label':'Z-score'}) #(left, bottom, width, height),\n",
+ "g.savefig(f'./{s_date}/clustermap_leiden_{resolution}_{n_markers}.pdf',dpi=300)\n",
+ "g.savefig(f'./{s_date}/clustermap_leiden_{resolution}_{n_markers}.png',dpi=300)\n",
+ "# dendrogram order of the heatmap, expressed with the original marker names and cluster ids\n",
+ "marker_genes = df_mean.iloc[:,g.dendrogram_col.reordered_ind].columns.tolist()\n",
+ "categories_order = df_mean.iloc[g.dendrogram_row.reordered_ind,:].index.tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# stacked bar vertical\n",
+ "\n",
+ "df = pd.DataFrame(data=adata1.raw.X,index=adata1.obs.index,columns=adata1.var.index)\n",
+ "df[f'leiden'] = [int(item) for item in adata1.obs.leiden]\n",
+ "s_markers = n_markers\n",
+ "k=resolution\n",
+ "\n",
+ "df['slide'] = [item.split('_')[0] for item in df.index]\n",
+ "df['slide_scene'] = [item.split('_cell')[0] for item in df.index]\n",
+ "df['slide_scene'] = df.slide_scene.replace({'SMTBx2-5_scene001':'Bx2', 'SMTBx2-5_scene002':'Bx2',\n",
+ " 'SMTBx3_scene004':'Bx3', 'SMTBx4-3_scene001':'Bx4',\n",
+ " 'SMTBx4-3_scene002':'Bx4'})#.replace(d_order)\n",
+ "df['scene'] = [item.split('_')[1] for item in df.index]\n",
+ "df_prop = (df.groupby([f'leiden','slide_scene']).PCNA.count())/(df.groupby(['slide_scene']).PCNA.count())\n",
+ "df_prop = df_prop.unstack().fillna(value=0).T\n",
+ "\n",
+ "fig,ax=plt.subplots(figsize=(5,6), dpi=200)\n",
+ "df_prop['slide'] =[item.split('_')[0] for item in df_prop.index]\n",
+ "#df_prop['scene'] =[item.split('_')[1] for item in df_prop.index]\n",
+ "df_prop = df_prop.loc[ls_order_r]\n",
+ "df_prop.columns = [str(item) for item in df_prop.columns]\n",
+ "#df_prop.rename(d_order).rename(d_cluster,axis=1).plot(kind='barh',stacked=True,ax=ax,legend=True,cmap='tab20',width=0.9)\n",
+ "df_prop.plot(kind='barh',stacked=True,ax=ax,legend=True,cmap='tab20',width=0.9)\n",
+ "ax.legend(bbox_to_anchor=(1.05, 1.00),ncol=1, fancybox=True,title='Cluster ID')\n",
+ "ax.set_xlabel('Fraction of Cells')\n",
+ "ax.set_ylabel('Tissue')\n",
+ "ax.set_title('')\n",
+ "plt.tight_layout()\n",
+ "fig.savefig(f'./{s_date}/StackedBar_{s_markers}markers_{k}Clusters_vertical.pdf')\n",
+ "fig.savefig(f'./{s_date}/StackedBar_{s_markers}markers_{k}Clusters_vertical.png')\n",
+ "#plt.close()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#save the cluster ID, not the annotation\n",
+ "#df_prop.to_csv(f'{s_sample}_{n_markers}markers_leiden{resolution}_frac_pos.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import matplotlib.ticker as tic\n",
+ "#SMT\n",
+ "fig,ax=plt.subplots(figsize=(2.8,3.2),dpi=200)\n",
+ "df_plot = df_prop.loc[['Bx2','Bx3','Bx4'],df_prop.dtypes=='float64'].T[::-1]\n",
+ "df_plot.plot(kind='barh',ax=ax,legend=True,width=.9)\n",
+ "ax.legend(title='Bx', loc='upper left',fancybox=True,borderpad=.2,bbox_to_anchor=(1.05, 1.05))\n",
+ "ax.set_xlabel('Fraction of Cells')\n",
+ "ax.set_ylabel('')\n",
+ "fig.suptitle(f'Cluster Composition: Biopsies',x=.5, y=.92)\n",
+ "for tick in ax.yaxis.get_major_ticks():\n",
+ " tick.tick1line.set_markersize(0)\n",
+ " tick.tick2line.set_markersize(0)\n",
+ "temp = tic.LinearLocator(numticks=18)\n",
+ "ax.yaxis.set_minor_locator(temp)\n",
+ "plt.grid(b=True, which='minor', axis='y')\n",
+ "plt.tight_layout()\n",
+ "fig.savefig(f'./{s_date}/Barplot_SMT{s_markers}_K{k}.pdf')\n",
+ "fig.savefig(f'./{s_date}/Barplot_SMT{s_markers}_K{k}.png')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ls_order = ['Bx2', 'Bx3', 'Bx4','AU565-2','AU565-3', 'AU565-4', 'BT474-2','BT474-3', 'BT474-4', \n",
+ " 'HCC1143-2', 'HCC1143-4', 'HCC3153-2', 'HCC3153-4', #'JE-TMA-43_scene01','JE-TMA-62_scene01', 'JE-TMA-43_scene10',\n",
+ " 'MDAMB-436-2','MDAMB-436-3', 'MDAMB-436-4', 'T47D-2','T47D-3', 'T47D-4',\n",
+ " 'N.Breast-2', 'N.Breast-4', 'tonsil-2', 'tonsil-4']\n",
+ "d_order = {#'\n",
+ " 'JE-TMA-43_scene02':'HCC1143-2', 'JE-TMA-62_scene02':'HCC1143-4',\n",
+ " 'JE-TMA-43_scene03':'HCC3153-2', 'JE-TMA-62_scene03':'HCC3153-4', 'JE-TMA-43_scene04':'N.Breast-2',\n",
+ " 'JE-TMA-62_scene04':'N.Breast-4', 'JE-TMA-43_scene05':'T47D-2', 'JE-TMA-62_scene05':'T47D-4',\n",
+ " 'JE-TMA-43_scene06':'T47D-2', 'JE-TMA-62_scene06':'T47D-4', 'JE-TMA-43_scene07':'tonsil-2',\n",
+ " 'JE-TMA-62_scene07':'tonsil-4', 'JE-TMA-43_scene08':'BT474-2', 'JE-TMA-62_scene08':'BT474-4',\n",
+ " 'JE-TMA-43_scene09':'BT474-2', 'JE-TMA-62_scene09':'BT474-4', 'JE-TMA-43_scene10':'AU565-2','JE-TMA-62_scene10':'AU565-4',\n",
+ " 'JE-TMA-43_scene11':'AU565-2', 'JE-TMA-62_scene11':'AU565-4', 'JE-TMA-43_scene13':'MDAMB-436-2',\n",
+ " 'JE-TMA-62_scene12':'MDAMB-436-4', 'JE-TMA-43_scene14':'MDAMB-436-2', 'JE-TMA-62_scene13':'MDAMB-436-4',\n",
+ " 'JE-TMA-60_scene13':'MDAMB-436-3', 'JE-TMA-60_scene11':'AU565-3', 'JE-TMA-60_scene10':'AU565-3',\n",
+ " 'JE-TMA-60_scene09':'BT474-3', 'JE-TMA-60_scene08':'BT474-3', 'JE-TMA-60_scene06':'T47D-3'}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "#stacked bar vertical tissue\n",
+ "df['coreID'] = df.slide_scene.replace(d_order)\n",
+ "df['celltype'] = df.leiden.astype('str').replace(d_clust_names)\n",
+ "df_prop = (df.groupby([f'celltype','coreID']).PCNA.count())/(df.groupby(['coreID']).PCNA.count())\n",
+ "df_prop = df_prop.unstack().fillna(value=0).T\n",
+ "\n",
+ "fig,ax=plt.subplots(figsize=(5,3.7), dpi=200)\n",
+ "df_prop['slide'] =[item.split('_')[0] for item in df_prop.index]\n",
+ "ls_order_r = ls_order[::-1]\n",
+ "df_prop = df_prop.loc[ls_order_r]\n",
+ "df_prop.columns = [str(item) for item in df_prop.columns]\n",
+ "df_prop.plot(kind='barh',stacked=True,ax=ax,legend=True,cmap='tab20',width=0.9) #.rename(d_order).rename(d_clust_names,axis=1)\n",
+ "ax.legend(loc='upper left', bbox_to_anchor=(1.1,1.02),ncol=1, fancybox=True,title='Cluster Annotation')\n",
+ "ax.set_xlabel('Fraction of Cells')\n",
+ "ax.set_ylabel('Tissue')\n",
+ "ax.set_title('Cluster Composition: Biopsies Plus Controls')\n",
+ "plt.tight_layout()\n",
+ "fig.savefig(f'./{s_date}/StackedBar_{s_markers}markers_{k}Clusters_withcontrols_vert.pdf')\n",
+ "fig.savefig(f'./{s_date}/StackedBar_{s_markers}markers_{k}Clusters_withcontrols_vert.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#stacked bar horizontal\n",
+ "df['coreID'] = df.slide_scene.replace(d_order)\n",
+ "df['celltype'] = df.leiden.astype('str').replace(d_clust_names)\n",
+ "df_prop = (df.groupby([f'celltype','coreID']).PCNA.count())/(df.groupby(['coreID']).PCNA.count())\n",
+ "df_prop = df_prop.unstack().fillna(value=0).T\n",
+ "\n",
+ "fig,ax=plt.subplots(figsize=(10,2.5), dpi=200)\n",
+ "df_prop['slide'] =[item.split('_')[0] for item in df_prop.index]\n",
+ "#df_prop['scene'] =[item.split('_')[1] for item in df_prop.index]\n",
+ "df_prop = df_prop.loc[ls_order]\n",
+ "df_prop.columns = [str(item) for item in df_prop.columns]\n",
+ "df_prop.plot(kind='bar',stacked=True,ax=ax,legend=True,cmap='tab20',width=0.9) #.rename(d_order).rename(d_clust_names,axis=1)\n",
+ "ax.legend(loc='upper center', bbox_to_anchor=(1.5, 1.05),ncol=2, fancybox=True,title='Cluster Annotation')\n",
+ "ax.set_ylabel('Fraction of Cells')\n",
+ "ax.set_xlabel('Tissue')\n",
+ "ax.set_title('')\n",
+ "plt.tight_layout()\n",
+ "fig.savefig(f'./{s_date}/StackedBar_{s_markers}markers_{k}Clusters_withcontrols.pdf')\n",
+ "fig.savefig(f'./{s_date}/StackedBar_{s_markers}markers_{k}Clusters_withcontrols.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#plot all groups spatially \n",
+ "from matplotlib.colors import ListedColormap, LinearSegmentedColormap\n",
+ "newcmap = ListedColormap(mpl.cm.tab20.colors)#ListedColormap(mpl.cm.tab20b.colors + mpl.cm.tab20c.colors)\n",
+ "from mplex_image import analyze\n",
+ "df_pos = analyze.celltype_to_bool(df_p,'leiden')\n",
+ "df_xy = df_mi.loc[df_pos.index]\n",
+ "ls_scene = ['SMTBx2-5_scene001', 'SMTBx3_scene004', 'SMTBx4-3_scene001', 'SMTBx4-3_scene002']\n",
+ "#ls_scene = ['JE-TMA-62_scene04', 'JE-TMA-43_scene04','JE-TMA-62_scene07','JE-TMA-43_scene07']\n",
+ "for s_slide in ls_scene:\n",
+ " fig,ax = plt.subplots(figsize=(10,10),dpi=200) #10,10\n",
+ " #plot negative cells\n",
+ " df_scene = df_xy[df_xy.index.str.contains(s_slide)]\n",
+ " ax.scatter(data=df_scene,x='DAPI_X',y='DAPI_Y',color='silver',s=0.1,label=f'')\n",
+ " for idxs, s_color_int in enumerate(range(len(df_pos.columns))):\n",
+ " s_color = str(s_color_int)\n",
+ " if len(df_xy[(df_xy.slide_scene==s_slide) & (df_pos.loc[:,s_color])])>=1:\n",
+ " #plot positive cells\n",
+ " ax.scatter(data=df_xy[(df_xy.slide_scene==s_slide) & (df_pos.loc[:,s_color])],x='DAPI_X',y='DAPI_Y',\n",
+ " label=f'{s_color}',s=0.1,color=newcmap.colors[idxs])\n",
+ " #break\n",
+ " ax.set_title(f\"{s_slide}\", fontsize=16)\n",
+ " ax.axis('equal')\n",
+ " ax.set_ylim(ax.get_ylim()[::-1])\n",
+ " #ax.set_xticklabels('')\n",
+ " #ax.set_yticklabels('')\n",
+ " #break\n",
+ " plt.legend(markerscale=10) \n",
+ " fig.savefig(f'{codedir}/paper_data/GatingPlots/{s_slide}_clustering_scatterplot.png')\n",
+ " #break"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if not os.path.exists(f'{s_sample}_{n_markers}markers_leiden{resolution}.csv'):\n",
+ " print('saving csv')\n",
+ " df.to_csv(f'{s_sample}_{n_markers}markers_leiden{resolution}.csv')\n",
+ " df_prop.to_csv(f'{s_sample}_{n_markers}markers_leiden{resolution}_frac_pos.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "f'{s_sample}_{n_markers}markers_leiden{resolution}.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "f'{s_sample}_{n_markers}markers_leiden{resolution}_frac_pos.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python3.9.5",
+ "language": "python",
+ "name": "python3.9.5"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/mplex_image/20210312_visualize.py b/mplex_image/20210312_visualize.py
new file mode 100755
index 0000000..f9f86b9
--- /dev/null
+++ b/mplex_image/20210312_visualize.py
@@ -0,0 +1,288 @@
+####
+# title: visualize.py
+#
+# language: Python3.6
+# date: 2019-05-00
+# license: GPL>=v3
+# author: Jenny
+#
+# description:
+# python3 library to visualize cyclic data and analysis
+####
+
+#load libraries
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import pandas as pd
+import numpy as np
+import os
+import skimage
+from skimage import io, segmentation
+import tifffile
+import copy
+import napari
+import seaborn as sns
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import scale
+
+#napari
def load_crops(viewer,s_crop,s_tissue):
    '''
    add every file in the current working directory whose name contains
    both s_tissue and s_crop to a napari viewer

    viewer = object exposing add_image and add_labels (e.g. napari.Viewer)
    s_crop = substring identifying the crop in the filename
    s_tissue = substring identifying the tissue in the filename

    files containing 'ome.tif': each channel is added as a hidden, additive image layer
    named after the channel Name found in the OME-XML metadata
    files containing 'SegmentationBasins': added as hidden label layer 'cell_seg',
    plus a second label layer with the outer cell boundaries

    returns: the last segmentation label image that was loaded,
    or an empty array if no segmentation file matched
    '''
    ls_color = ['blue','green','yellow','red','cyan','magenta','gray','green','yellow','red','cyan','magenta',
     'gray','gray','gray','gray','gray','gray','gray','gray']
    print(s_crop)
    #defined up front so the return value exists even when no file matches,
    #and so unrelated files can no longer overwrite a segmentation that was already loaded
    label_image = np.array([])
    for s_file in os.listdir():
        if s_tissue not in s_file or s_crop not in s_file:
            continue
        if 'ome.tif' in s_file:
            with tifffile.TiffFile(s_file) as tif:
                array = tif.asarray()
                omexml_string = tif.ome_metadata
            for idx in range(array.shape[0]):
                img = array[idx]
                #measure the prefix instead of assuming 31 characters:
                #the prefix grows by one character once idx has two digits
                s_prefix = f'Channel ID="Channel:0:{idx}" Name="'
                i_begin = omexml_string.find(s_prefix)
                i_end = omexml_string[i_begin:].find('" SamplesPerPixel')
                s_marker = omexml_string[i_begin + len(s_prefix):i_begin + i_end]
                #wrap around the palette rather than raising IndexError on images with >20 channels
                s_colormap = ls_color[idx % len(ls_color)]
                viewer.add_image(img,name=s_marker,rgb=False,visible=False,blending='additive',colormap=s_colormap,contrast_limits = (np.quantile(img,0),(np.quantile(img,0.9999)+1)*1.5))
        elif 'SegmentationBasins' in s_file:
            label_image = io.imread(s_file)
            viewer.add_labels(label_image, name='cell_seg',blending='additive',visible=False)
            cell_boundaries = segmentation.find_boundaries(label_image,mode='outer')
            viewer.add_labels(cell_boundaries,blending='additive')
        else:
            print('')
    return(label_image)
+
def pos_label(viewer,df_pos,label_image,s_cell):
    '''
    build a label image that keeps only the cells positive for s_cell and add it to the viewer

    viewer = object exposing add_labels
    df_pos = boolean dataframe; the index ends in '_cell<integer>'
    label_image = integer label array; a copy is modified, the input is left untouched
    s_cell = column of df_pos to display

    returns: the filtered copy of label_image (all other labels set to 0)
    '''
    def _cell_ids(ls_index):
        #the label id is the integer following 'cell' in the last '_'-separated field
        return [int(s_idx.split('_')[-1].split('cell')[1]) for s_idx in ls_index]

    ar_label = copy.deepcopy(label_image)
    #get rid of extra cells (filtered by DAPI, etc)
    li_all = _cell_ids(df_pos.index)
    ar_label[np.isin(ar_label, li_all, invert=True)] = 0
    #then keep only the cells flagged True for s_cell
    b_positive = df_pos.loc[:,s_cell]==True
    li_positive = _cell_ids(df_pos[b_positive].index)
    ar_label[np.isin(ar_label, li_positive, invert=True)] = 0
    s_layer = f'{s_cell.split("_")[0]}_seg'
    viewer.add_labels(ar_label, name=s_layer,blending='additive',visible=False)
    return ar_label
+
+#jupyter notebook
+#load manual thresholds
def new_thresh_csv(df_mi,d_combos):
    '''
    make an empty manual-threshold table

    df_mi = dataframe with a slide_scene column
    d_combos = dict of marker type -> collection of marker names

    returns: dataframe indexed by 'global' followed by the sorted unique scenes,
    with one column per marker, every cell holding an empty string
    '''
    ls_row = ['global'] + sorted(set(df_mi.slide_scene))
    df_man = pd.DataFrame(index=ls_row)
    #columns follow the dict order of d_combos, markers sorted within each type
    for es_marker in d_combos.values():
        for s_marker in sorted(es_marker):
            df_man[s_marker] = ''
    return df_man
+
def load_thresh_csv(s_sample):
    '''
    load thresh_JE_{s_sample}.csv from the current working directory and reshape it

    s_sample = sample name used in the csv filename

    the csv has one row named 'global' plus one row per scene, and one column per marker
    returns: dataframe indexed by scene with two columns per marker,
    {marker}_global and {marker}_local, each multiplied by 256;
    values equal to 0 are replaced by 12
    '''
    df_man = pd.read_csv(f'thresh_JE_{s_sample}.csv',header=0,index_col = 0)
    #split the table once into the global row and the per-scene rows
    b_global = df_man.index=='global'
    df_global = df_man[b_global]
    df_local = df_man[~b_global]
    ls_scene = df_man.index.tolist()
    ls_scene.remove('global')
    df_thresh = pd.DataFrame(index = ls_scene)
    #reformat the thresholds data and convert to 16 bit
    for s_marker in df_man.columns.tolist():
        df_thresh[f'{s_marker}_global'] = df_global.loc['global',f'{s_marker}']*256
        df_thresh[f'{s_marker}_local'] = df_local.loc[:,f'{s_marker}']*256
    return df_thresh.replace(to_replace=0, value = 12)
+
def threshold_postive(df_thresh,df_mi):
    '''
    make positive dataframe to check thresholds
    start with the local threshold, and if it is missing, use the global threshold
    note, this will break if there are two biomarker locations

    df_thresh = dataframe indexed by scene with {marker}_local and {marker}_global columns
    df_mi = dataframe with a slide_scene column; every float64 column is treated as a
    marker_location column, the marker being the text before the first '_'

    returns: (d_thresh_record, df_pos)
    d_thresh_record = dict '{scene}_{marker}' -> threshold used (nan if neither exists)
    df_pos = dataframe of (intensity >= threshold) per cell, scenes stacked in sorted order;
    a marker thresholded in only some scenes is NaN for the cells of the other scenes
    '''
    ls_scene = sorted(df_thresh.index.tolist())
    ls_sub = df_mi.columns[df_mi.dtypes=='float64'].tolist()
    #markers present in the threshold table, computed once instead of per cell column
    es_thresh_marker = set([item.split('_')[0] for item in df_thresh.columns])
    ls_other = []
    ls_df_scene = []
    d_thresh_record= {}
    for s_scene in ls_scene:
        ls_index = df_mi[df_mi.slide_scene==s_scene].index
        df_scene = pd.DataFrame(index=ls_index)
        for s_marker_loc in ls_sub:
            s_marker = s_marker_loc.split('_')[0]
            # only threshold markers in .csv
            if s_marker not in es_thresh_marker:
                ls_other.append(s_marker)
                continue
            i_local = df_thresh.loc[s_scene,f'{s_marker}_local']
            i_global = df_thresh.loc[s_scene,f'{s_marker}_global']
            if not pd.isna(i_local):
                #first choice: local threshold
                i_thresh = i_local
            elif not pd.isna(i_global):
                #otherwise use global
                i_thresh = i_global
            else:
                #np.nan: the np.NaN alias no longer exists in NumPy 2
                ls_other.append(s_marker)
                i_thresh = np.nan
            if not pd.isna(i_thresh):
                df_scene[s_marker_loc] = df_mi.loc[ls_index,s_marker_loc] >= i_thresh
            d_thresh_record.update({f'{s_scene}_{s_marker}':i_thresh})
        ls_df_scene.append(df_scene)
    #single concat: DataFrame.append was removed in pandas 2.0 and was quadratic in a loop
    if len(ls_df_scene) > 0:
        df_pos = pd.concat(ls_df_scene)
    else:
        df_pos = pd.DataFrame()
    print(f'Did not threshold {set(ls_other)}')
    return(d_thresh_record,df_pos)
+
def plot_positive(s_type,d_combos,df_pos,d_thresh_record,df_xy,b_save=True):
    '''
    for each scene, draw one scatter panel per thresholded marker of type s_type:
    positive cells in dark blue over all cells of the scene in silver

    s_type = key into d_combos
    d_combos = dict of marker type -> collection of marker names
    df_pos = positive dataframe (cells x marker_location columns)
    d_thresh_record = dict '{scene}_{marker}' -> threshold
    df_xy = dataframe with slide_scene, DAPI_X and DAPI_Y columns
    b_save = if True, save each figure to ./SpatialPlots/{scene}_{s_type}_manual.png

    returns: list of figures, one per scene in sorted order
    '''
    es_marker = set(d_combos[s_type])
    #columns of df_pos whose marker (text before the first '_') belongs to s_type
    ls_color = [s_col for s_col in df_pos.columns if s_col.split('_')[0] in es_marker]
    ls_scene = sorted(set(df_xy.slide_scene))
    ls_fig = []
    if b_save:
        #savefig does not create missing directories
        os.makedirs('./SpatialPlots', exist_ok=True)
    #at least one column, so the grid is valid even when no marker matches
    i_col = max(1, (len(ls_color)+1)//2)
    for s_scene in ls_scene:
        #negative cells = all cells even before dapi filtering
        df_neg = df_xy[(df_xy.slide_scene==s_scene)]
        #plot
        fig, ax = plt.subplots(2, i_col, figsize=(18,12)) #figsize=(18,12)
        ax = ax.ravel()
        for ax_num, s_color in enumerate(ls_color):
            s_marker = s_color.split('_')[0]
            #scenes without a recorded threshold are shown as nan instead of raising KeyError
            s_min = d_thresh_record.get(f"{s_scene}_{s_marker}", np.nan)
            #positive cells = positive cells based on threshold
            #compare to True so NaN entries count as negative instead of breaking the mask
            ls_pos_index = df_pos[df_pos.loc[:,s_color]==True].index
            df_color_pos = df_neg[df_neg.index.isin(ls_pos_index)]
            if len(df_color_pos)>=1:
                #plot negative cells
                ax[ax_num].scatter(data=df_neg,x='DAPI_X',y='DAPI_Y',color='silver',s=1)
                #plot positive cells
                ax[ax_num].scatter(data=df_color_pos, x='DAPI_X',y='DAPI_Y',color='DarkBlue',s=.5)

                ax[ax_num].axis('equal')
                #image coordinates: y axis points down
                ax[ax_num].set_ylim(ax[ax_num].get_ylim()[::-1])
                ax[ax_num].set_title(f'{s_marker} min={int(s_min)} ({len(df_color_pos)} cells)')
            else:
                ax[ax_num].set_title(f'{s_marker} min={(s_min)} ({(0)} cells)')
        fig.suptitle(s_scene)
        ls_fig.append(fig)
        if b_save:
            fig.savefig(f'./SpatialPlots/{s_scene}_{s_type}_manual.png')
    return(ls_fig)
+
+#gating analysis
def prop_positive(df_data,s_cell,s_grouper):
    '''
    fraction of each s_cell category within each s_grouper group

    df_data = dataframe holding the columns s_cell, s_grouper and 'countme'
    s_cell = column with the category to count
    s_grouper = column defining the groups that fractions are computed within

    rows with a missing value in any of the three columns are dropped first
    returns: dataframe with one row per group and one column per category
    '''
    #df_data['countme'] = True
    df_sub = df_data.loc[:,[s_cell,s_grouper,'countme']].dropna()
    se_count = df_sub.groupby([s_cell,s_grouper]).countme.count()
    se_total = df_sub.groupby([s_grouper]).countme.count()
    df_frac = (se_count/se_total).unstack()
    return df_frac.T
+
def prop_clustermap(df_prop,df_annot,i_thresh,lut,figsize=(10,5)):
    '''
    seaborn clustermap of df_prop with a row color strip taken from df_annot

    df_prop = dataframe of proportions (rows = samples, columns = categories)
    df_annot = dataframe indexed like df_prop with an 'ID' column
    i_thresh = if > 0, keep only columns whose mean over rows (NaN counted as 0)
    is greater than i_thresh; otherwise keep every column
    lut = dict mapping the 'ID' values to row colors
    figsize = figure size passed to sns.clustermap

    returns: (g, df_plot_less) = the clustermap object and the plotted dataframe
    '''
    #look the annotation up directly; the input df_prop is no longer modified
    species = df_annot.loc[df_prop.index,'ID']
    row_colors = species.map(lut)

    #clustermap plot without the low values - drop less than i_thresh of total
    df_plot = df_prop.drop(columns='ID',errors='ignore').fillna(0)
    if i_thresh > 0:
        df_plot_less = df_plot.loc[:,df_plot.sum()/len(df_plot) > i_thresh]
    else:
        #previously df_plot_less was undefined on this path (UnboundLocalError)
        df_plot_less = df_plot
    g = sns.clustermap(df_plot_less,figsize=figsize,cmap='viridis',row_colors=row_colors)
    return(g,df_plot_less)
+
def prop_barplot(df_plot_less,s_cell,colormap="Spectral",figsize=(10,5),b_sort=True):
    '''
    horizontal stacked bar plot, one bar per row of df_plot_less

    df_plot_less = dataframe to plot (rows = bars, columns = stacked segments)
    s_cell = plot title
    colormap = colormap passed to DataFrame.plot
    figsize = figure size passed to plt.subplots
    b_sort = if True, sort the rows by index in descending order before plotting

    returns: the matplotlib figure
    '''
    df_bar = df_plot_less.sort_index(ascending=False) if b_sort else df_plot_less
    fig,ax = plt.subplots(figsize=figsize)
    df_bar.plot(kind='barh',stacked=True,width=.9, ax=ax,colormap=colormap)
    ax.set_title(s_cell)
    ax.set_xlabel('Fraction Positive')
    #legend anchored just right of the axes
    ax.legend(bbox_to_anchor=(1.01, 1))
    plt.tight_layout()
    return fig
+
def plot_color_leg(lut,figsize = (2.3,3)):
    '''
    draw a color legend: one horizontal bar of length 1 per entry of lut,
    labeled with the entry's key and filled with the entry's value

    lut = dict (or anything pd.Series accepts) mapping subtype -> color
    figsize = figure size passed to plt.subplots

    returns: the matplotlib figure
    '''
    #colors
    series = pd.Series(lut)
    #bars keep the order of lut: the earlier series.sort_values() call discarded
    #its result and never reordered anything, so it is removed
    df_color = pd.DataFrame({'subtype':series.index,'color':series.values,'value':1})

    fig,ax = plt.subplots(figsize = figsize,dpi=100)
    df_color.plot(kind='barh',x='subtype',y='value',width=1,legend=False,color=df_color.color,ax=ax)
    ax.set_xticks([])
    ax.set_ylabel('')
    ax.set_title('subtype')
    plt.tight_layout()
    return(fig)
+
+#cluster analysis
+
def cluster_kmeans(df_mi,ls_columns,k,b_sil=False):
    '''
    log2 transform, zscore and kmeans cluster

    df_mi = dataframe holding the columns to cluster on
    ls_columns = columns of df_mi to use
    k = number of clusters
    b_sil = if True, also compute the silhouette score of the clustering

    returns: (g, df_cluster, score)
    g = seaborn clustermap of the per-cluster mean of the log2 values (z-scored per column)
    df_cluster = log2(value + 1) dataframe, columns renamed to the text before the
    first '_', plus a K{k} column holding the cluster label
    score = silhouette score, or nan when b_sil is False
    '''
    #+1 so that zero intensities stay finite under log2
    df_cluster = np.log2(df_mi.loc[:,ls_columns] + 1)

    #select figure size
    i_len = k
    i_width = len(df_cluster.columns)

    #scale data
    df_scale = scale(df_cluster)

    #kmeans cluster
    kmeans = KMeans(n_clusters=k, random_state=0).fit(df_scale)
    df_cluster.columns = [item.split('_')[0] for item in df_cluster.columns]
    df_cluster[f'K{k}'] = list(kmeans.labels_)
    g = sns.clustermap(df_cluster.groupby(f'K{k}').mean(),cmap="RdYlGn_r",z_score=1,figsize=(3+i_width/3,3+i_len/3))
    if b_sil:
        #imported here: this module never imported silhouette_score, so b_sil=True raised NameError
        from sklearn.metrics import silhouette_score
        score = silhouette_score(X = df_scale, labels=list(kmeans.labels_))
    else:
        score = np.nan
    return(g,df_cluster,score)
+
def plot_clusters(df_cluster,df_xy,s_num='many'):
    '''
    for each scene, draw one scatter panel per cluster:
    cells of the cluster in dark blue over all cells of the scene in silver

    df_cluster = dataframe with a slide_scene column; the first int64 column is
    used as the cluster label
    df_xy = dataframe with slide_scene, DAPI_X and DAPI_Y columns
    s_num = 'many' for a 3-row grid sized to the number of clusters,
    anything else for a fixed 2 x 1 grid
    NOTE(review): the 2 x 1 grid only has room for two clusters - confirm callers

    returns: dict scene -> figure
    '''
    s_type = df_cluster.columns[df_cluster.dtypes=='int64'][0]
    print(s_type)
    ls_scene = sorted(set(df_cluster.slide_scene))
    ls_color = sorted(set(df_cluster.loc[:,s_type].dropna()))
    b_many = s_num == 'many'
    d_fig = {}
    for s_scene in ls_scene:
        #negative cells = all cells even before dapi filtering
        df_neg = df_xy[(df_xy.slide_scene==s_scene)]
        #plot
        if b_many:
            #at least one column, so the grid is valid even without clusters
            fig, ax = plt.subplots(3, max(1, (len(ls_color)+2)//3), figsize=(18,12),dpi=200)
        else:
            fig, ax = plt.subplots(2, 1, figsize=(7,4),dpi=200)
        ax = ax.ravel()
        for ax_num, s_color in enumerate(ls_color):
            #positive cells = cells assigned to this cluster
            ls_pos_index = (df_cluster[df_cluster.loc[:,s_type]==s_color]).index
            df_color_pos = df_neg[df_neg.index.isin(ls_pos_index)]
            if len(df_color_pos)>=1:
                #plot negative cells
                ax[ax_num].scatter(data=df_neg,x='DAPI_X',y='DAPI_Y',color='silver',s=1)
                #plot positive cells
                ax[ax_num].scatter(data=df_color_pos, x='DAPI_X',y='DAPI_Y',color='DarkBlue',s=.5)

                ax[ax_num].axis('equal')
                #image coordinates: y axis points down
                ax[ax_num].set_ylim(ax[ax_num].get_ylim()[::-1])
                if b_many:
                    ax[ax_num].set_xticklabels('')
                    ax[ax_num].set_yticklabels('')
                ax[ax_num].set_title(f'{s_color} ({len(df_color_pos)} cells)')
            else:
                ax[ax_num].set_xticklabels('')
                ax[ax_num].set_yticklabels('')
                ax[ax_num].set_title(f'{s_color} ({(0)} cells)')

        fig.suptitle(s_scene)
        d_fig.update({s_scene:fig})
    return(d_fig)
diff --git a/mplex_image/__init__.py b/mplex_image/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/mplex_image/__pycache__/__init__.cpython-37.pyc b/mplex_image/__pycache__/__init__.cpython-37.pyc
new file mode 100755
index 0000000..e9e21ea
Binary files /dev/null and b/mplex_image/__pycache__/__init__.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/__init__.cpython-38.pyc b/mplex_image/__pycache__/__init__.cpython-38.pyc
new file mode 100755
index 0000000..95b1ebc
Binary files /dev/null and b/mplex_image/__pycache__/__init__.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/__init__.cpython-39.pyc b/mplex_image/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000..b8859ba
Binary files /dev/null and b/mplex_image/__pycache__/__init__.cpython-39.pyc differ
diff --git a/mplex_image/__pycache__/analyze.cpython-37.pyc b/mplex_image/__pycache__/analyze.cpython-37.pyc
new file mode 100755
index 0000000..2c8fcb1
Binary files /dev/null and b/mplex_image/__pycache__/analyze.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/analyze.cpython-38.pyc b/mplex_image/__pycache__/analyze.cpython-38.pyc
new file mode 100755
index 0000000..ff95f60
Binary files /dev/null and b/mplex_image/__pycache__/analyze.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/analyze.cpython-39.pyc b/mplex_image/__pycache__/analyze.cpython-39.pyc
new file mode 100644
index 0000000..842d212
Binary files /dev/null and b/mplex_image/__pycache__/analyze.cpython-39.pyc differ
diff --git a/mplex_image/__pycache__/cmif.cpython-37.pyc b/mplex_image/__pycache__/cmif.cpython-37.pyc
new file mode 100755
index 0000000..5e4ca2b
Binary files /dev/null and b/mplex_image/__pycache__/cmif.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/cmif.cpython-38.pyc b/mplex_image/__pycache__/cmif.cpython-38.pyc
new file mode 100755
index 0000000..571f31b
Binary files /dev/null and b/mplex_image/__pycache__/cmif.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/cmif.cpython-39.pyc b/mplex_image/__pycache__/cmif.cpython-39.pyc
new file mode 100755
index 0000000..3742d85
Binary files /dev/null and b/mplex_image/__pycache__/cmif.cpython-39.pyc differ
diff --git a/mplex_image/__pycache__/codex.cpython-37.pyc b/mplex_image/__pycache__/codex.cpython-37.pyc
new file mode 100755
index 0000000..6438d19
Binary files /dev/null and b/mplex_image/__pycache__/codex.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/codex.cpython-38.pyc b/mplex_image/__pycache__/codex.cpython-38.pyc
new file mode 100755
index 0000000..0010b93
Binary files /dev/null and b/mplex_image/__pycache__/codex.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/features.cpython-37.pyc b/mplex_image/__pycache__/features.cpython-37.pyc
new file mode 100755
index 0000000..c9df747
Binary files /dev/null and b/mplex_image/__pycache__/features.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/features.cpython-38.pyc b/mplex_image/__pycache__/features.cpython-38.pyc
new file mode 100755
index 0000000..c869dfe
Binary files /dev/null and b/mplex_image/__pycache__/features.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/features.cpython-39.pyc b/mplex_image/__pycache__/features.cpython-39.pyc
new file mode 100755
index 0000000..ed790ee
Binary files /dev/null and b/mplex_image/__pycache__/features.cpython-39.pyc differ
diff --git a/mplex_image/__pycache__/gating.cpython-38.pyc b/mplex_image/__pycache__/gating.cpython-38.pyc
new file mode 100755
index 0000000..93c662f
Binary files /dev/null and b/mplex_image/__pycache__/gating.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/gating.cpython-39.pyc b/mplex_image/__pycache__/gating.cpython-39.pyc
new file mode 100644
index 0000000..88ac253
Binary files /dev/null and b/mplex_image/__pycache__/gating.cpython-39.pyc differ
diff --git a/mplex_image/__pycache__/getdata.cpython-37.pyc b/mplex_image/__pycache__/getdata.cpython-37.pyc
new file mode 100755
index 0000000..59ac9ef
Binary files /dev/null and b/mplex_image/__pycache__/getdata.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/getdata.cpython-38.pyc b/mplex_image/__pycache__/getdata.cpython-38.pyc
new file mode 100755
index 0000000..83ae205
Binary files /dev/null and b/mplex_image/__pycache__/getdata.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/getdata.cpython-39.pyc b/mplex_image/__pycache__/getdata.cpython-39.pyc
new file mode 100755
index 0000000..d77f944
Binary files /dev/null and b/mplex_image/__pycache__/getdata.cpython-39.pyc differ
diff --git a/mplex_image/__pycache__/imagine.cpython-37.pyc b/mplex_image/__pycache__/imagine.cpython-37.pyc
new file mode 100755
index 0000000..306fa6b
Binary files /dev/null and b/mplex_image/__pycache__/imagine.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/imagine.cpython-38.pyc b/mplex_image/__pycache__/imagine.cpython-38.pyc
new file mode 100755
index 0000000..49741f4
Binary files /dev/null and b/mplex_image/__pycache__/imagine.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/metadata.cpython-37.pyc b/mplex_image/__pycache__/metadata.cpython-37.pyc
new file mode 100755
index 0000000..ec53895
Binary files /dev/null and b/mplex_image/__pycache__/metadata.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/metadata.cpython-38.pyc b/mplex_image/__pycache__/metadata.cpython-38.pyc
new file mode 100755
index 0000000..862b8f4
Binary files /dev/null and b/mplex_image/__pycache__/metadata.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/mics.cpython-38.pyc b/mplex_image/__pycache__/mics.cpython-38.pyc
new file mode 100755
index 0000000..2b21d7a
Binary files /dev/null and b/mplex_image/__pycache__/mics.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/mics.cpython-39.pyc b/mplex_image/__pycache__/mics.cpython-39.pyc
new file mode 100755
index 0000000..68abfea
Binary files /dev/null and b/mplex_image/__pycache__/mics.cpython-39.pyc differ
diff --git a/mplex_image/__pycache__/mpimage.cpython-37.pyc b/mplex_image/__pycache__/mpimage.cpython-37.pyc
new file mode 100755
index 0000000..7694f6a
Binary files /dev/null and b/mplex_image/__pycache__/mpimage.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/mpimage.cpython-38.pyc b/mplex_image/__pycache__/mpimage.cpython-38.pyc
new file mode 100755
index 0000000..25b868a
Binary files /dev/null and b/mplex_image/__pycache__/mpimage.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/mpimage.cpython-39.pyc b/mplex_image/__pycache__/mpimage.cpython-39.pyc
new file mode 100755
index 0000000..93be7a7
Binary files /dev/null and b/mplex_image/__pycache__/mpimage.cpython-39.pyc differ
diff --git a/mplex_image/__pycache__/normalize.cpython-38.pyc b/mplex_image/__pycache__/normalize.cpython-38.pyc
new file mode 100755
index 0000000..432c2cd
Binary files /dev/null and b/mplex_image/__pycache__/normalize.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/normalize.cpython-39.pyc b/mplex_image/__pycache__/normalize.cpython-39.pyc
new file mode 100755
index 0000000..376a0fc
Binary files /dev/null and b/mplex_image/__pycache__/normalize.cpython-39.pyc differ
diff --git a/mplex_image/__pycache__/ometiff.cpython-37.pyc b/mplex_image/__pycache__/ometiff.cpython-37.pyc
new file mode 100755
index 0000000..575debe
Binary files /dev/null and b/mplex_image/__pycache__/ometiff.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/ometiff.cpython-38.pyc b/mplex_image/__pycache__/ometiff.cpython-38.pyc
new file mode 100755
index 0000000..b3dbb77
Binary files /dev/null and b/mplex_image/__pycache__/ometiff.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/ometiff.cpython-39.pyc b/mplex_image/__pycache__/ometiff.cpython-39.pyc
new file mode 100755
index 0000000..789526e
Binary files /dev/null and b/mplex_image/__pycache__/ometiff.cpython-39.pyc differ
diff --git a/mplex_image/__pycache__/preprocess.cpython-37.pyc b/mplex_image/__pycache__/preprocess.cpython-37.pyc
new file mode 100755
index 0000000..61224ba
Binary files /dev/null and b/mplex_image/__pycache__/preprocess.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/preprocess.cpython-38.pyc b/mplex_image/__pycache__/preprocess.cpython-38.pyc
new file mode 100755
index 0000000..14db79b
Binary files /dev/null and b/mplex_image/__pycache__/preprocess.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/preprocess.cpython-39.pyc b/mplex_image/__pycache__/preprocess.cpython-39.pyc
new file mode 100755
index 0000000..a629aca
Binary files /dev/null and b/mplex_image/__pycache__/preprocess.cpython-39.pyc differ
diff --git a/mplex_image/__pycache__/process.cpython-37.pyc b/mplex_image/__pycache__/process.cpython-37.pyc
new file mode 100755
index 0000000..a2ab185
Binary files /dev/null and b/mplex_image/__pycache__/process.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/process.cpython-38.pyc b/mplex_image/__pycache__/process.cpython-38.pyc
new file mode 100755
index 0000000..18d3893
Binary files /dev/null and b/mplex_image/__pycache__/process.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/process.cpython-39.pyc b/mplex_image/__pycache__/process.cpython-39.pyc
new file mode 100755
index 0000000..5a6c4e6
Binary files /dev/null and b/mplex_image/__pycache__/process.cpython-39.pyc differ
diff --git a/mplex_image/__pycache__/register.cpython-37.pyc b/mplex_image/__pycache__/register.cpython-37.pyc
new file mode 100755
index 0000000..6b120c4
Binary files /dev/null and b/mplex_image/__pycache__/register.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/register.cpython-38.pyc b/mplex_image/__pycache__/register.cpython-38.pyc
new file mode 100755
index 0000000..1590041
Binary files /dev/null and b/mplex_image/__pycache__/register.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/register.cpython-39.pyc b/mplex_image/__pycache__/register.cpython-39.pyc
new file mode 100755
index 0000000..d5a71c0
Binary files /dev/null and b/mplex_image/__pycache__/register.cpython-39.pyc differ
diff --git a/mplex_image/__pycache__/segment.cpython-37.pyc b/mplex_image/__pycache__/segment.cpython-37.pyc
new file mode 100755
index 0000000..3204988
Binary files /dev/null and b/mplex_image/__pycache__/segment.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/segment.cpython-38.pyc b/mplex_image/__pycache__/segment.cpython-38.pyc
new file mode 100755
index 0000000..d6e2cbc
Binary files /dev/null and b/mplex_image/__pycache__/segment.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/segment.cpython-39.pyc b/mplex_image/__pycache__/segment.cpython-39.pyc
new file mode 100755
index 0000000..9015372
Binary files /dev/null and b/mplex_image/__pycache__/segment.cpython-39.pyc differ
diff --git a/mplex_image/__pycache__/visualize.cpython-37.pyc b/mplex_image/__pycache__/visualize.cpython-37.pyc
new file mode 100755
index 0000000..77489bc
Binary files /dev/null and b/mplex_image/__pycache__/visualize.cpython-37.pyc differ
diff --git a/mplex_image/__pycache__/visualize.cpython-38.pyc b/mplex_image/__pycache__/visualize.cpython-38.pyc
new file mode 100755
index 0000000..4f6e116
Binary files /dev/null and b/mplex_image/__pycache__/visualize.cpython-38.pyc differ
diff --git a/mplex_image/__pycache__/visualize.cpython-39.pyc b/mplex_image/__pycache__/visualize.cpython-39.pyc
new file mode 100755
index 0000000..d1843c1
Binary files /dev/null and b/mplex_image/__pycache__/visualize.cpython-39.pyc differ
diff --git a/mplex_image/_version.py b/mplex_image/_version.py
new file mode 100755
index 0000000..6526deb
--- /dev/null
+++ b/mplex_image/_version.py
@@ -0,0 +1 @@
+__version__ = "0.0.7"  # version string of the mplex_image package
diff --git a/mplex_image/analyze.py b/mplex_image/analyze.py
new file mode 100755
index 0000000..2887c41
--- /dev/null
+++ b/mplex_image/analyze.py
@@ -0,0 +1,300 @@
+####
+# title: analyze.py
+#
+# language: Python3.6
+# date: 2019-05-00
+# license: GPL>=v3
+# author: Jenny
+#
+# description:
+# python3 library to analyze cyclic data and images after manual thresholding
+####
+
+#load libraries
+import matplotlib as mpl
+mpl.use('agg')
+import pandas as pd
+import numpy as np
+import os
+import skimage
+from skimage import io
+import json
+from biotransistor import imagine
+import itertools
+
+#functions
+# import importlib
+# importlib.reload(analyze)
+
+def combinations(df_tn_tumor,ls_marker=['CK19_Ring','CK7_Ring','CK5_Ring','CK14_Ring','CD44_Ring','Vim_Ring']):
+ '''
+ get all combinations of the markers (can be overlapping)
+ '''
+ ls_combos = []
+ for i in range(0,len(ls_marker)):
+ for tu_combo in itertools.combinations(ls_marker,i+1):#'Ecad_Ring',
+ ls_combos.append(tu_combo)
+
+ #create the combos dataframe dataframe
+ df_tn_counts = pd.DataFrame(index=df_tn_tumor.index)
+ se_all = set(ls_marker)
+
+ #combos of 2 or more
+ for tu_combo in ls_combos:
+ print(tu_combo)
+ se_pos = df_tn_tumor[(df_tn_tumor.loc[:,tu_combo].sum(axis=1) ==len(tu_combo))] #those are pos
+ se_neg = df_tn_tumor[(df_tn_tumor.loc[:,(se_all)].sum(axis=1) == len(tu_combo))] #and only those
+ df_tn_counts['_'.join([item for item in tu_combo])] = df_tn_tumor.index.isin(se_pos.index.intersection(se_neg.index))
+
+ #other cells (negative for all)
+ df_tn_counts['__'] = df_tn_counts.loc[:,df_tn_counts.dtypes=='bool'].sum(axis=1)==0
+ if sum(df_tn_counts.sum(axis=1)!=1) !=0:
+ print('error in analyze.combinations')
+
+ return(df_tn_counts)
+
+def gated_combinations(df_data,ls_gate,ls_marker):
+ '''
+ df_data = boolean cell type dataframe
+ ls_gate = combine each of these cell types (full coverage and non-overlapping)
+ ls_marker = with these cell tpyes (full coverage and non-overlapping)
+ '''
+ es_all = set(ls_marker + ls_gate)
+ ls_old = df_data.columns
+ df_gate_counts = pd.DataFrame()
+ for s_gate in ls_gate:
+ df_tn_tumor = df_data[df_data.loc[:,s_gate]]
+ print(f'{s_gate} {len(df_tn_tumor)}')
+ #combos of 2
+ if len(df_tn_tumor) >=1:
+ for s_marker in ls_marker:
+ print(s_marker)
+ tu_combo = (s_gate,s_marker)
+ es_neg = es_all - set(tu_combo)
+ if ~df_data.loc[:,tu_combo].all(axis=1).any():
+ df_gate_counts[f"{s_gate}_{s_marker}"] = False
+ else:
+ df_gate_counts[f"{s_gate}_{s_marker}"] = df_data.loc[:,tu_combo].all(axis=1) & ~df_data.loc[:,es_neg].any(axis=1)
+ df_gate_counts.fillna(value=False, inplace=True)
+ return(df_gate_counts)
+
+def add_celltype(df_data, ls_cell_names, s_type_name):
+ '''
+ add gated cell type to data frame, and save the possible cell typesand cell type name in a csv
+ df_data = data frame with the cell types (boolean)
+ ls_cell_names = list of the cell names
+ s_type_name = the cell category
+ '''
+ #check cell types' exclusivity
+ if ((df_data.loc[:,ls_cell_names].sum(axis=1)>1)).sum()!=0:
+ print(f'Error in exclusive cell types: {s_type_name}')
+
+ #make cell type object columns
+ for s_marker in ls_cell_names:
+ df_data.loc[(df_data[df_data.loc[:,s_marker]]).index,s_type_name] = s_marker
+ d_record = {s_type_name:ls_cell_names}
+
+ #append the record json
+ if not os.path.exists('./Gating_Record.json'):
+ with open(f'Gating_Record.json','w') as f:
+ json.dump(d_record, f, indent=4, sort_keys=True)
+ else:
+ with open('Gating_Record.json','r') as f:
+ d_current = json.load(f)
+ d_current.update(d_record)
+ with open(f'Gating_Record.json','w') as f:
+ json.dump(d_current, f, indent=4, sort_keys=True)
+
+def thresh_meanint(df_thresh,d_crop={},s_thresh='minimum',):
+ """
+ threshold, and output positive and negative mean intensity and array
+ df_thresh = dataframe of images with columns having image attributes
+ and index with image names, column with threshold values
+ d_crop = image scene and crop coordinates
+
+ """
+ d_mask = {}
+ for idx, s_index in enumerate(df_thresh.index):
+ #load image, crop, thresh
+ a_image = skimage.io.imread(s_index)
+ if len(d_crop) != 0:
+ tu_crop = d_crop[df_thresh.loc[s_index,'scene']]
+ a_image = a_image[(tu_crop[1]):(tu_crop[1]+tu_crop[3]),(tu_crop[0]):(tu_crop[0]+tu_crop[2])]
+ i_min = df_thresh.loc[s_index,s_thresh]
+ a_mask = a_image > i_min
+ print(f'mean positive intensity = {np.mean(a_image[a_mask])}')
+ df_thresh.loc[s_index,'meanpos'] = np.mean(a_image[a_mask])
+ b_mask = a_image < i_min
+ print(f'mean negative intensity = {np.mean(a_image[b_mask])}')
+ df_thresh.loc[s_index,'meanneg'] = np.mean(a_image[b_mask])
+ d_mask.update({s_index:a_mask})
+ return(df_thresh,d_mask)
+
+def mask_meanint(df_img, a_mask):
+ '''
+ for each image in dataframe of image (df_img)
+ calculate mean intensity in pixels in mask (a_mask)
+ '''
+
+ #for each image, calculate mean intensity in the masked area
+ for s_index in df_img.index:
+ a_img = skimage.io.imread(s_index)
+ a_img_total = a_img[a_mask]
+ i_img_meanint = a_img_total.sum()/a_img_total.size
+ df_img.loc[s_index,'result'] = i_img_meanint
+ return(df_img)
+
+def make_border(s_sample,df_pos,ls_color,segmentdir,savedir,b_images=True,s_find = 'Cell Segmentation Basins.tif',s_split='Scene '):
+ """
+ load the positive cells dataframe and the segmentation basins;
+ output border images of the positive cells and the cells touching dictionary
+ s_sample = text a basins file name must contain; also used in the json file name
+ df_pos = boolean dataframe of positive cells with a 'scene' column; the cell id is
+ read from the text after 'cell' in each index label
+ ls_color = columns of df_pos for which a border image is written
+ segmentdir = directory with the basins files (becomes the working directory)
+ savedir = directory the border tiffs are written to
+ b_images = write the border tiffs if True
+ s_find = text identifying a basins file name
+ s_split = text directly preceding the scene number in a basins file name
+ returns the cells touching dictionary with cell ids converted to strings; the same
+ dictionary is saved as result_{s_sample}_cellstouching_dictionary.json
+ """
+ #load segmentation basins
+ #flattens ids into a set (stored in d_flatten)
+ # NOTE(review): the working directory is changed here and not restored
+ os.chdir(segmentdir)
+ ls_file = os.listdir()
+ ls_cellseg = []
+
+ # list of Basin files
+ for s_file in ls_file:
+ if s_file.find(s_find)>-1:
+ if s_file.find(s_sample)>-1:
+ ls_cellseg.append(s_file)
+
+ d_flatten = {}
+ dd_touch = {}
+
+ for s_file in ls_cellseg:
+ # scene number = text after s_split, cut at the first '_' or ' '
+ s_scene_num = s_file.split(s_split)[1].split('_')[0].split(' ')[0]
+ print(s_file)
+ print(s_scene_num)
+ a_img = io.imread(s_file)
+ # get all cell ids that exist in the images
+ es_cell = set(a_img.flatten())
+ # NOTE(review): set.remove raises KeyError if the image has no 0 label; set.discard would tolerate that
+ es_cell.remove(0)
+ s_scene = f'scene{s_scene_num}'
+ d_flatten.update({f'scene{s_scene_num}':es_cell})
+
+ #get a cell touching dictionary (only do this one (faster))
+ dd_touch.update({f'{s_sample}_{s_scene}':imagine.touching_cells(a_img, i_border_width=0)})
+
+ #s_type = 'Manual'
+ if b_images:
+ #save png of cell borders (single tiffs)
+ for idx, s_color in enumerate(ls_color):
+ print(f'Processing {s_color}')
+ #positive cells = positive cells based on thresholds
+ #dataframe of all the positive cells
+ df_color_pos = df_pos[df_pos.loc[:,s_color]]
+ # all positive cells; replaced below by the positive cells of the current scene
+ ls_index = df_color_pos.index.tolist()
+
+ if len(df_color_pos[(df_color_pos.scene==s_scene)])>=1:
+ ls_index = df_color_pos[(df_color_pos.scene==s_scene)].index.tolist()
+ es_cell_positive = set([int(s_index.split('cell')[-1]) for s_index in ls_index])
+
+ # erase all non positive basins
+ es_cell_negative = d_flatten[s_scene].difference(es_cell_positive)
+ a_pos = np.copy(a_img)
+ a_pos[np.isin(a_img, list(es_cell_negative))] = 0 # bue: this have to be a list, else it will not work!
+
+ # get cell border (a_pos_border)
+ a_pos_border = imagine.get_border(a_pos) # border has value 1
+ a_pos_border = np.uint16(a_pos_border * 65000) # border will have value 65000 (uint16)
+ #filename hack
+ # the file name mimics a registered image: round = idx+100, sample = index text before the first '_'
+ print('saving image')
+ io.imsave(f'{savedir}/Registered-R{idx+100}_{s_color.replace("_",".")}.border.border.border_{df_color_pos.index[0].split("_")[0]}-{s_scene.replace("scene","Scene-")}_c2_ORG.tif',a_pos_border)
+ else:
+ # no positive cell in this scene: print the count (0), write no image
+ print(len(df_color_pos[(df_color_pos.scene==s_scene)]))
+ #from elmar (reformat cells touching dictionary and save
+
+ # json needs string keys: convert every cell id and every (sorted) touching id to str
+ ddes_image = {}
+ for s_image, dei_image in dd_touch.items():
+ des_cell = {}
+ for i_cell, ei_touch in dei_image.items():
+ des_cell.update({str(i_cell): [str(i_touch) for i_touch in sorted(ei_touch)]})
+ ddes_image.update({s_image:des_cell})
+
+ #save dd_touch as json file
+ # written relative to the working directory (segmentdir after the chdir above)
+ with open(f'result_{s_sample}_cellstouching_dictionary.json','w') as f:
+ json.dump(ddes_image, f)
+ return(ddes_image)
+
+def make_border_all(s_sample,df_pos,segmentdir,savedir,b_images=True):
+ """
+ load the cells dataframe and the segmentation basins;
+ output one border image per scene for all cells listed in df_pos
+ s_sample = text a basins file name must contain
+ df_pos = dataframe of cells with a 'scene' column; the cell id is read from the
+ text after 'cell' in each index label
+ segmentdir = directory with the basins files (becomes the working directory)
+ savedir = directory the border tiffs are written to
+ b_images = write the border tiffs if True
+ nothing is returned; the cells touching dictionary is built but not saved here
+ """
+ #Specify which images to save
+ #ls_color = df_pos.columns.tolist()
+ #ls_color.remove('DAPI_X')
+ #ls_color.remove('DAPI_Y')
+ #ls_color.remove('scene')
+
+ #load segmentation basins
+ #flattens ids into a set (stored in d_flatten)
+ # NOTE(review): the working directory is changed here and not restored
+ os.chdir(segmentdir)
+ ls_file = os.listdir()
+ ls_cellseg = []
+ d_files = {}
+ #dictionary of file to scene ID , and a list of Basin files
+ # NOTE(review): d_files is filled but never read in this function
+ for s_file in ls_file:
+ if s_file.find('Cell Segmentation Basins.tif')>-1:
+ if s_file.find(s_sample)>-1:
+ ls_cellseg.append(s_file)
+ # scene number = second space-separated token of the file name
+ s_scene_num = s_file.split(' ')[1]
+ d_files.update({f'scene{s_scene_num}':s_file})
+
+ d_flatten = {}
+ dd_touch = {}
+
+ for s_file in ls_cellseg:
+ s_scene_num = s_file.split(' ')[1]
+ print(s_file)
+ a_img = skimage.io.imread(s_file)
+ # get all cell ids that exist in the images
+ es_cell = set(a_img.flatten())
+ # NOTE(review): set.remove raises KeyError if the image has no 0 label; set.discard would tolerate that
+ es_cell.remove(0)
+ s_scene = f'scene{s_scene_num}'
+ d_flatten.update({f'scene{s_scene_num}':es_cell})
+
+ #get a cell touching dictionary (only do this one (faster))
+ dd_touch.update({f'{s_sample}_{s_scene}':imagine.touching_cells(a_img, i_border_width=0)})
+
+ #s_type = 'Manual'
+ if b_images:
+ # idx is fixed at 0, so the file name below always carries round R100
+ idx=0
+ #save png of all cell borders (single tiffs)
+ #for idx, s_color in enumerate(ls_color):
+ # print(f'Processing {s_color}')
+ #positive cells = positive cells based on thresholds
+ #dataframe of all the positive cells
+ df_color_pos = df_pos #[df_pos.loc[:,s_color]]
+ # all cells; replaced below by the cells of the current scene
+ ls_index = df_color_pos.index.tolist()
+
+ if len(df_color_pos[(df_color_pos.scene==s_scene)])>=1:
+ ls_index = df_color_pos[(df_color_pos.scene==s_scene)].index.tolist()
+ es_cell_positive = set([int(s_index.split('cell')[-1]) for s_index in ls_index])
+
+ # erase all non positive basins
+ es_cell_negative = d_flatten[s_scene].difference(es_cell_positive)
+ a_pos = np.copy(a_img)
+ a_pos[np.isin(a_img, list(es_cell_negative))] = 0 # bue: this have to be a list, else it will not work!
+
+ # get cell border (a_pos_border)
+ a_pos_border = imagine.get_border(a_pos) # border has value 1
+ a_pos_border = a_pos_border.astype(np.uint8)
+ a_pos_border = a_pos_border * 255 # border will have value 255
+ #filename hack 2019-11-27
+ # sample name in the file name = index text before the first '_'
+ skimage.io.imsave(f'{savedir}/R{idx+100}_all.all_{df_color_pos.index[0].split("_")[0]}-{s_scene.replace("scene","Scene-")}_border_c3_ORG.tif',a_pos_border)
+
+def celltype_to_bool(df_data, s_column):
+ """
+ Input a dataframe and column name of cell tpyes
+ Output a new boolean dataframe with each col as a cell type
+ """
+ df_bool = pd.DataFrame(index=df_data.index)
+ for celltype in sorted(set(df_data.loc[:,s_column])):
+ df_bool.loc[df_data[df_data.loc[:,s_column]==celltype].index,celltype] = True
+ df_bool = df_bool.fillna(value=False)
+ df_data.columns = [str(item) for item in df_data.columns]
+ return(df_bool)
\ No newline at end of file
diff --git a/mplex_image/cmif.py b/mplex_image/cmif.py
new file mode 100755
index 0000000..62367dc
--- /dev/null
+++ b/mplex_image/cmif.py
@@ -0,0 +1,705 @@
+# wrapper functions for cmIF image processing
+
+from mplex_image import preprocess, mpimage, getdata, process, features, register, ometiff
+import copy
+import time
+import os
+import numpy as np
+import shutil
+import subprocess
+import pandas as pd
+import math
+from itertools import compress
+import skimage
+import sys
+import re
+from skimage import io
+from skimage.util import img_as_uint
+import tifffile
+
+#set src path (CHANGE ME)
+# NOTE(review): both values are machine-specific absolute paths; edit them per installation
+# s_src_path: run_registration_matlab copies f'{s_src_path}/src/wrapper.sh' from here
+s_src_path = '/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF'
+# s_work_path: not referenced in the part of this module reviewed here
+s_work_path = '/home/groups/graylab_share/Chin_Lab/ChinData/Work/engje'
+
+
+def parse_czi(czidir,type='r',b_scenes=True):
+ """
+ parse the names of the .czi files in czidir (koei's naming convention) into a dataframe
+ czidir = directory holding the .czi files
+ type = 's' for stitched: slide is the 6th '_'-separated field of the file name;
+ any other value: slide is the 3rd field
+ b_scenes = if True, the text between 'Scene-' and the next '.' becomes the 'scene' column
+ returns the dataframe indexed by file name with slide, rounds and markers columns
+ (rounds = 1st field, markers = 2nd field) plus the scene related columns
+ """
+ # NOTE(review): the parameter name 'type' shadows the builtin inside this function
+ cwd = os.getcwd()
+ #go to directory
+ # NOTE(review): cwd is only restored on the normal path, not if an exception escapes
+ os.chdir(czidir)
+ df_img = mpimage.filename_dataframe(s_end = ".czi",s_start='R',s_split='_')
+ df_img['slide'] = [item[2] for item in [item.split('_') for item in df_img.index]]
+ if type=='s':
+ df_img['slide'] = [item[5] for item in [item.split('_') for item in df_img.index]]
+ df_img['rounds'] = [item[0] for item in [item.split('_') for item in df_img.index]]
+ df_img['markers'] = [item[1] for item in [item.split('_') for item in df_img.index]]
+ if b_scenes:
+ try:
+ df_img['scene'] = [item[1].split('.')[0] for item in [item.split('Scene-') for item in df_img.index]]
+ except IndexError:
+ # a file name without 'Scene-': print the distinct names instead of failing; no 'scene' column is set
+ print(f"{set([item[0] for item in [item.split('Scene-') for item in df_img.index]])}")
+ # scanID = text after the last '__', cut at '-Scene'
+ df_img['scanID'] = [item[-1].split('-Scene')[0] for item in [item.split('__') for item in df_img.index]]
+ os.chdir(cwd)
+ return(df_img)
+
+def parse_stitched_czi(czidir,s_slide,b_scenes=True):
+ '''
+ parse .czi's wtitten in koei's naming convention, with periods changed to undescores
+ '''
+ cwd = os.getcwd()
+ #go to directory
+ os.chdir(czidir)
+ df_img = mpimage.filename_dataframe(s_end = ".czi",s_start='R',s_split='_').rename({'data':'rounds'},axis=1)
+ df_img['markers'] = [item[0] for item in [item.split(f'_{s_slide}') for item in df_img.index]]
+ for s_index in df_img.index:
+ df_img.loc[s_index,'markers_un'] = df_img.loc[s_index,'markers'].split(f"{df_img.loc[s_index,'rounds']}_")[1]
+ df_img['markers'] = df_img.markers_un.str.replace('_','.')
+ df_img.slide = s_slide
+ if b_scenes:
+ df_img['scene'] = [item[1].split('-')[0] for item in [item.split('Scene-') for item in df_img.index]]
+ os.chdir(cwd)
+ return(df_img)
+
+def count_images(df_img):
+ """
+ count and list slides, scenes, rounds
+ """
+ for s_sample in sorted(set(df_img.slide)):
+ print(s_sample)
+ df_img_slide = df_img[df_img.slide==s_sample]
+ print('scene names')
+ [print(f'{item}: {sum(df_img_slide.scene==item)}') for item in sorted(set(df_img_slide.scene))]
+ print(f'Number of images = {len(df_img_slide)}')
+ print(f'Rounds:')
+ [print(f'{item}: {sum(df_img_slide.rounds==item)}') for item in sorted(set(df_img_slide.rounds))]
+ print('\n')
+
+def visualize_raw_images(df_img,qcdir,color='c1'):
+ """
+ array raw images to check tissue identity, focus, etc.
+ """
+ for s_sample in sorted(set(df_img.slide)):
+ print(s_sample)
+
+ df_img_slide = df_img[df_img.slide==s_sample]
+ for s_scene in sorted(set(df_img_slide.scene)):
+ print(s_scene)
+ df_dapi = df_img_slide[(df_img_slide.color==color) & (df_img_slide.scene==s_scene)].sort_values(['round_ord','rounds'])
+ fig = mpimage.array_img(df_dapi,s_xlabel='slide',ls_ylabel=['scene','color'],s_title='rounds',tu_array=(2,len(df_dapi)//2+1),tu_fig=(24,10))
+ fig.savefig(f'{qcdir}/RawImages/{s_sample}-Scene-{s_scene}_{color}_all.png')
+
+def registration_python(s_sample,tiffdir,regdir,qcdir):
+ # register every round of s_sample to the round R1 c1 image, scene by scene
+ # s_sample = sample name; the raw tiffs are read from {tiffdir}/{s_sample}
+ # regdir = output root; images are written to {regdir}/{s_sample}-Scene-{scene}
+ # qcdir = {qcdir}/RegistrationPlots/ is created here; presumably filled by register.register(b_plot=True) - TODO confirm
+ # NOTE(review): changes the working directory and does not restore it
+ print(f'Registering {s_sample}')
+ preprocess.cmif_mkdir([f'{qcdir}/RegistrationPlots/'])
+ os.chdir(f'{tiffdir}/{s_sample}')
+ df_img = mpimage.parse_org(s_end = "ORG.tif",type='raw')
+ # numeric round order: the digits of the round name
+ df_img['round_ord'] = [int(re.sub('[^0-9]','', item)) for item in df_img.rounds]
+ df_img = df_img.sort_values(['round_ord','rounds','color','scene'])
+ for i_scene in sorted(set(df_img.scene)):
+ preprocess.cmif_mkdir([f'{regdir}/{s_sample}-Scene-{i_scene}'])
+ # c1 images of this scene; the R1 one is the registration target
+ df_dapi = df_img[(df_img.color=='c1') & (df_img.scene==i_scene)]
+ # raises IndexError if the scene has no R1 c1 image
+ target_file = df_dapi[df_dapi.rounds=='R1'].index[0]
+ target = io.imread(target_file)
+ for moving_file in df_dapi.index:
+ # round = file name text before the first '_'
+ s_round = moving_file.split('_')[0]
+ moving_pts, target_pts, transformer = register.register(target_file,moving_file,b_plot=True)
+ # apply the transform found on c1 to every channel of the same round and scene
+ for moving_channel in df_img[(df_img.rounds==s_round) & (df_img.scene==i_scene)].index:
+ moving = io.imread(moving_channel)
+ warped_img, warped_pts = register.apply_transform(moving, target, moving_pts, target_pts, transformer)
+ warped_img = img_as_uint(warped_img)
+ io.imsave(f"{regdir}/{s_sample}-Scene-{i_scene}/Registered-{moving_channel.split(s_sample)[0]}{s_sample}-Scene-{moving_channel.split('-Scene-')[1]}",warped_img)
+
+def run_registration_matlab(d_register, ls_order, tiffdir, regdir, N_colors='5'):
+ """
+ run registration on server with or without cropping
+ d_register = sample: crop dictionary; a non-empty crop dictionary selects the
+ large (cropped) registration, an empty one the regular registration
+ ls_order = passed on to the preprocess registration helpers
+ tiffdir = directory with the raw tiffs (becomes the working directory)
+ regdir = output root for the registered images
+ N_colors = passed on to the preprocess registration helpers (a string)
+ """
+ # NOTE(review): changes the working directory and does not restore it
+ os.chdir(tiffdir)
+ # wrapper.sh is copied from the module level s_src_path and submitted with sbatch below
+ shutil.copyfile(f'{s_src_path}/src/wrapper.sh', './wrapper.sh')
+ for s_sample, d_crop in d_register.items():
+ if len(d_crop) > 0:
+ print(f'Large registration {s_sample}')
+ for key, value in d_crop.items():
+ # zero pad the key to three digits; keys of three or more characters create no directory
+ if len(str(key)) == 1:
+ preprocess.cmif_mkdir([f'{regdir}/{s_sample.split("-Scene")[0]}-Scene-00{str(key)}'])
+ elif len(str(key)) == 2:
+ preprocess.cmif_mkdir([f'{regdir}/{s_sample.split("-Scene")[0]}-Scene-0{str(key)}'])
+ preprocess.large_registration_matlab(N_smpl='10000',N_colors=N_colors,s_rootdir=tiffdir, s_subdirname=regdir,
+ d_crop_regions=d_crop, s_ref_id='./R1_*_c1_ORG.tif', ls_order=ls_order)
+ # NOTE(review): the job is submitted but never waited on; MyOut is not read
+ MyOut = subprocess.Popen(['sbatch', 'wrapper.sh'], #the script runs fine
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT)
+ #regular registration
+ else:
+ print(f'Regular registration {s_sample}')
+ # parses the current working directory (tiffdir after the chdir above)
+ df_img = mpimage.parse_org(s_end = "ORG.tif",type='raw')
+ df_img['slide_scene'] = df_img.slide + '-Scene-' + df_img.scene
+ preprocess.cmif_mkdir([(f'{regdir}/{item}') for item in sorted(set(df_img.slide_scene))]) #this will break with diff slides
+ preprocess.registration_matlab(N_smpl='10000',N_colors=N_colors,s_rootdir=tiffdir, s_subdirname=f'{regdir}/',
+ s_ref_id='./R1_*_c1_ORG.tif',ls_order =ls_order)
+ # NOTE(review): the job is submitted but never waited on; MyOut is not read
+ MyOut = subprocess.Popen(['sbatch', 'wrapper.sh'], #the script runs fine
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT)
+
+def visualize_reg_images(regdir,qcdir,color='c1',s_sample=''):
+ """
+ array registered images to check tissue identity, focus, etc.
+ """
+ #check registration
+ preprocess.cmif_mkdir([f'{qcdir}/RegisteredImages'])
+ os.chdir(regdir)
+ for idx, s_dir in enumerate(sorted(os.listdir())):
+ if s_dir.find(s_sample) > -1:
+ os.chdir(s_dir)
+ s_sample_name = s_dir.split('-Scene')[0]
+ print(s_sample_name)
+ df_img = mpimage.parse_org(s_end = "ORG.tif",type='reg')
+ ls_scene = sorted(set(df_img.scene))
+ for s_scene in ls_scene:
+ print(s_scene)
+ df_img_scene = df_img[df_img.scene == s_scene]
+ df_img_stain = df_img_scene[df_img_scene.color==color]
+ df_img_sort = df_img_stain.sort_values(['round_ord','rounds'])
+ i_sqrt = math.ceil(math.sqrt(len(df_img_sort)))
+ fig = mpimage.array_img(df_img_sort,s_xlabel='marker',ls_ylabel=['scene','color'],s_title='rounds',tu_array=(2,len(df_img_sort)//2+1),tu_fig=(24,10))
+ #fig = mpimage.array_img(df_img_sort,s_column='color',s_row='rounds',s_label='scene',tu_array=(i_sqrt,i_sqrt),tu_fig=(16,14))
+ fig.savefig(f'{qcdir}/RegisteredImages/{s_scene}_registered_{color}.png')
+ os.chdir('..')
+ return(df_img_sort)
+
+def rename_files(d_rename,dir,b_test=True):
+ """
+ change file names
+ """
+ os.chdir(dir)
+ for idx, s_dir in enumerate(sorted(os.listdir())):
+ s_path = f'{dir}/{s_dir}'
+ os.chdir(s_path)
+ #s_sample = s_dir.split('-Scene')[0]
+ print(s_dir)
+ df_img = mpimage.parse_org(s_end = "ORG.tif",type='reg')
+ es_wrong= preprocess.check_names(df_img)
+ if b_test:
+ print('This is a test')
+ preprocess.dchange_fname(d_rename,b_test=True)
+ elif b_test==False:
+ print('Changing name - not a test')
+ preprocess.dchange_fname(d_rename,b_test=False)
+ else:
+ pass
+
+def autofluorescence_subtract_dir(regdir,codedir,d_channel,ls_exclude,subdir,d_early={}):
+ '''
+ AF subtract images
+ '''
+ os.chdir(regdir)
+ for idx, s_dir in enumerate(sorted(os.listdir())):
+ print(s_dir)
+ s_path = f'{regdir}/{s_dir}'
+ os.chdir(s_path)
+ #preprocess.cmif_mkdir([f'{s_path}/AFSubtracted'])
+ s_sample = s_dir.split('-Scene')[0]
+ df_img = mpimage.parse_org(s_end = "ORG.tif",type='reg')
+ #load exposure times csv
+ df_exp = pd.read_csv(f'{codedir}/{s_sample}_ExposureTimes.csv',index_col=0,header=0)#
+ #AF subtract images
+ df_img_exp = mpimage.add_exposure(df_img,df_exp,type='czi')
+ if len(d_early)>0:
+ df_markers, df_copy = mpimage.subtract_scaled_images(df_img_exp,d_late=d_channel,
+ d_early=d_early, ls_exclude=ls_exclude,subdir=subdir,b_8bit=False)
+ else:
+ df_markers, df_copy = mpimage.subtract_images(df_img_exp,d_channel=d_channel,
+ ls_exclude=ls_exclude,subdir=subdir,b_8bit=False)
+
+ return(df_markers)
+
+def autofluorescence_subtract(s_sample,df_img,codedir,d_channel,ls_exclude,subdir,d_early={}):
+ '''
+ AF subtract images
+ '''
+ df_img = mpimage.parse_org(s_end = "ORG.tif",type='reg')
+ #load exposure times csv
+ df_exp = pd.read_csv(f'{codedir}/{s_sample}_ExposureTimes.csv',index_col=0,header=0)#
+ #AF subtract images
+ df_img_exp = mpimage.add_exposure(df_img,df_exp,type='czi')
+ if len(d_early)>0:
+ df_markers, df_copy = mpimage.subtract_scaled_images(df_img_exp,d_late=d_channel,
+ d_early=d_early, ls_exclude=ls_exclude,subdir=subdir,b_8bit=False)
+ else:
+ df_markers, df_copy = mpimage.subtract_images(df_img_exp,d_channel=d_channel,
+ ls_exclude=ls_exclude,subdir=subdir,b_8bit=False)
+
+ return(df_markers)
+
+def multipage_ome_tiff(d_combos,d_crop,tu_dim,s_dapi,regdir,b_crop=False):
+ '''
+ make custom overlays, either original or AF subtracted, save at 8 bit for size, and thresholding
+ d_combos = dictionary whose values are collections of marker names
+ d_crop = scene: crop coordinates; only used if b_crop
+ tu_dim = crop dimensions, passed to process.custom_crop_overlays
+ s_dapi = dapi image name; the text before its first '_' selects the dapi images
+ regdir = directory with one sub-directory of registered images per scene
+ (the working directory is changed and not restored)
+ b_crop = True: cropped overlays for the scenes present in d_crop; False: full overlays
+ '''
+ os.chdir(regdir)
+ for idx, s_dir in enumerate(sorted(os.listdir())):
+ print(s_dir)
+ s_path = f'{regdir}/{s_dir}'
+ os.chdir(s_path)
+ df_img = mpimage.parse_org(s_end = "ORG.tif",s_start='R',type='reg')
+ df_dapi = df_img[df_img.marker.str.contains(s_dapi.split('_')[0])]
+ # NOTE(review): the stain filter uses the literal 'DAPI' while df_dapi is derived from s_dapi
+ df_img_stain = df_img[(~df_img.marker.str.contains('DAPI'))]
+ #check
+ # prints the stain markers that do not appear in any d_combos value
+ es_test = set()
+ for key, item in d_combos.items():
+ es_test = es_test.union(item)
+ print(set(df_img_stain.marker) - es_test)
+
+ #cropped
+ if b_crop:
+ # despite the s_ prefix, s_scene is a set: the scenes both in d_crop and in this directory
+ s_scene = set(d_crop).intersection(set(df_img.scene))
+ d_crop_scene={k: d_crop[k] for k in (sorted(s_scene))}
+ process.custom_crop_overlays(d_combos,d_crop_scene, df_img,s_dapi, tu_dim=tu_dim) #df_dapi,
+ else:
+ process.custom_overlays(d_combos, df_img_stain, df_dapi)
+
+def visualize_multicolor_overlay(s_scene,subdir,qcdir,d_overlay,d_crop,es_bright,high_thresh):
+ s_sample = s_scene.split('-Scene')[0]
+ preprocess.cmif_mkdir([f'{qcdir}/{s_sample}'])
+ if os.path.exists(f'{subdir}/{s_sample}'):
+ s_path = f'{subdir}/{s_sample}'
+ elif os.path.exists(f'{subdir}/{s_scene}'):
+ s_path = f'{subdir}/{s_scene}'
+ os.chdir(s_path)
+ df_img = mpimage.parse_org()
+ df_img['path'] = [f'{s_path}/{item}' for item in df_img.index]
+ df_dapi_round = df_img[(df_img.color=='c1')&(df_img.scene==s_scene) & (df_img.rounds=='R2')]
+ df_scene = df_img[(df_img.color!='c1') & (df_img.scene==s_scene)]
+ for s_round,ls_marker in d_overlay.items():
+ print(f'Generating multicolor overlay {[item for item in ls_marker]}')
+ df_round = df_scene[df_scene.marker.isin(ls_marker)]
+ high_thresh=0.999
+ d_overlay_round = {s_round:ls_marker}
+ d_result = mpimage.multicolor_png(df_round,df_dapi_round,s_scene=s_scene,d_overlay=d_overlay_round,d_crop=d_crop,es_dim={'nada'},es_bright=es_bright,low_thresh=2000,high_thresh=high_thresh)
+ for key, tu_result in d_result.items():
+ io.imsave(f'{qcdir}/{s_sample}/ColorArray_{s_scene}_{key}_{".".join(tu_result[0])}.png',tu_result[1])
+
+def cropped_ometiff(s_scene,subdir,cropdir,d_crop,d_combos,s_dapi,tu_dim,b_8bit=True):
+ s_sample = s_scene.split('-Scene')[0]
+ if os.path.exists(f'{subdir}/{s_sample}'):
+ os.chdir(f'{subdir}/{s_sample}')
+ elif os.path.exists(f'{subdir}/{s_scene}'):
+ os.chdir(f'{subdir}/{s_scene}')
+ df_img = mpimage.parse_org()
+ d_crop_scene = {s_scene:d_crop[s_scene]}
+ if b_8bit:
+ dd_result = mpimage.overlay_crop(d_combos,d_crop_scene,df_img,s_dapi,tu_dim)
+ else:
+ dd_result = mpimage.overlay_crop(d_combos,d_crop_scene,df_img,s_dapi,tu_dim,b_8bit=False)
+ for s_crop, d_result in dd_result.items():
+ for s_type, (ls_marker, array) in d_result.items():
+ print(f'Generating multi-page ome-tiff {[item for item in ls_marker]}')
+ new_array = array[np.newaxis,np.newaxis,:]
+ s_xml = ometiff.gen_xml(new_array, ls_marker)
+ with tifffile.TiffWriter(f'{cropdir}/{s_crop}_{s_type}.ome.tif') as tif:
+ tif.save(new_array, photometric = "minisblack", description=s_xml, metadata = None)
+
+def crop_registered(s_scene,bigdir,regdir,d_crop):
+ '''
+ crop a stack of tiffs to the specified coordinates
+ d_crop: crop to scene:(xmin, y_min, xmax, ymax)
+ '''
+ s_sample = s_scene.split('-Scene')[0]
+ print(s_scene)
+ os.chdir(f'{bigdir}/{s_scene}')
+ df_img = mpimage.parse_org()
+ df_scene = df_img[df_img.scene==s_scene]
+ for s_image in df_scene.index:
+ #print(s_image)
+ a_dapi = io.imread(s_image)
+ for idx, xy_cropcoor in d_crop.items():
+ #crop
+ a_crop = a_dapi[xy_cropcoor[1]:xy_cropcoor[3],xy_cropcoor[0]:xy_cropcoor[2]]
+ preprocess.cmif_mkdir([f'{regdir}/{s_sample}-Scene-{idx:03}'])
+ io.imsave(f'{regdir}/{s_sample}-Scene-{idx:03}/{s_image.replace(s_scene,f"{s_sample}-Scene-{idx:03}")}',a_crop,check_contrast=False)
+
+def multipage_tiff(d_combos,d_crop,tu_dim,s_dapi,regdir,b_crop=False):
+ '''
+ make custom overlays, either original of AF subtracted, save at 8 bit for size, and thresholding
+ '''
+ os.chdir(regdir)
+ for idx, s_dir in enumerate(sorted(os.listdir())):
+ print(s_dir)
+ s_path = f'{regdir}/{s_dir}'
+ os.chdir(s_path)
+ df_img = mpimage.parse_org(s_end = "ORG.tif",s_start='R',type='reg')
+ df_dapi = df_img[df_img.marker.str.contains(s_dapi.split('_')[0])]
+ df_img_stain = df_img[(~df_img.marker.str.contains('DAPI'))]
+ #check
+ es_test = set()
+ for key, item in d_combos.items():
+ es_test = es_test.union(item)
+ print(set(df_img_stain.marker) - es_test)
+
+ #cropped
+ if b_crop:
+ s_scene = set(d_crop).intersection(set(df_img.scene))
+ d_crop_scene={k: d_crop[k] for k in (sorted(s_scene))}
+ process.custom_crop_overlays(d_combos,d_crop_scene, df_img,s_dapi, tu_dim=tu_dim) #df_dapi,
+ else:
+ process.custom_overlays(d_combos, df_img_stain, df_dapi)
+
+def crop_basins(d_crop,tu_dim,segdir,cropdir,s_type='Cell'):
+ """
+ crop the segmentation basins (cell or nuclei) to the same coordinates as the images, for viewing in Napari
+ d_crop = scene: (x, y) of the upper left corner of the crop
+ tu_dim = (width, height) of the crop
+ segdir = basins are read from {segdir}/{sample}_Segmentation/, sample = scene text before '-Scene-'
+ cropdir = directory the cropped basins are written to
+ s_type = 'Cell' or 'Nuclei': selects the '{s_type} Segmentation Basins.tif' files
+ """
+ cwd = os.getcwd()
+ for s_scene, xy_cropcoor in d_crop.items():
+ print(s_scene)
+ s_sample = s_scene.split('-Scene-')[0]
+ os.chdir(f'{segdir}/{s_sample}_Segmentation/')
+
+ for s_file in os.listdir():
+ if s_file.find(f'{s_type} Segmentation Basins.tif') > -1: #Nuclei Segmentation Basins.tif #Cell Segmentation Basins.tif
+ # substring match on the scene number (the text after '-Scene-')
+ if s_file.find(s_scene.split('-Scene-')[1]) > -1:
+ a_seg = skimage.io.imread(s_file)
+ a_crop = a_seg[(xy_cropcoor[1]):(xy_cropcoor[1]+tu_dim[1]),(xy_cropcoor[0]):(xy_cropcoor[0]+tu_dim[0])]
+ # the crop origin replaces '.tif' in the file name
+ s_coor = f'x{xy_cropcoor[0]}y{xy_cropcoor[1]}.tif'
+ #crop file
+ s_file_new = f'{cropdir}/{s_sample}-{s_file.replace(" - ","_").replace(" ","").replace("Scene","Scene-").replace(".tif",s_coor)}'
+ print(s_file_new)
+ skimage.io.imsave(s_file_new,a_crop)
+ os.chdir(cwd)
+
+def load_crop_labels(d_crop,tu_dim,segdir,cropdir,s_find='Nuclei Segmentation Basins'):
+ """
+ crop the segmentation basins (cell or nuclei) to the same coordinates as the images, for viewing in Napari
+ d_crop = scene: (x, y) of the upper left corner of the crop
+ tu_dim = (width, height) of the crop
+ segdir = basins are read from {segdir}/{sample}Cellpose_Segmentation/, sample = scene text before '-Scene-'
+ cropdir = directory the cropped basins are written to
+ s_find: 'exp5_CellSegmentationBasins' or 'Nuclei Segmentation Basins'
+ """
+ cwd = os.getcwd()
+ for s_scene, xy_cropcoor in d_crop.items():
+ print(s_scene)
+ s_sample = s_scene.split('-Scene-')[0]
+ os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation/')
+
+ for s_file in os.listdir():
+ if s_file.find(s_find) > -1: #Nuclei Segmentation Basins.tif #Cell Segmentation Basins.tif
+ # substring match on the scene name with the sample removed (e.g. '-Scene-...')
+ if s_file.find(s_scene.split(s_sample)[1]) > -1:
+ a_seg = skimage.io.imread(s_file)
+ a_crop = a_seg[(xy_cropcoor[1]):(xy_cropcoor[1]+tu_dim[1]),(xy_cropcoor[0]):(xy_cropcoor[0]+tu_dim[0])]
+ # the crop origin replaces '.tif' in the file name
+ s_coor = f'x{xy_cropcoor[0]}y{xy_cropcoor[1]}.tif'
+ #crop file
+ s_file_new = f'{cropdir}/{s_file.replace(" ","").replace(".tif",s_coor)}'
+ print(s_file_new)
+ skimage.io.imsave(s_file_new,a_crop)
+ os.chdir(cwd)
+
+def load_labels(d_crop,segdir,s_find='Nuclei Segmentation Basins'):
+ """
+ load the segmentation basins (cell or nuclei)
+ d_crop = dictionary keyed by scene; only the keys are used here
+ segdir = basins are read from {segdir}/{sample}Cellpose_Segmentation/, sample = scene text before '-Scene-'
+ s_find: 'exp5_CellSegmentationBasins' or 'Nuclei Segmentation Basins'
+ returns a dictionary of scene: label image; if several files match a scene, the last one read wins
+ """
+ d_label={}
+ cwd = os.getcwd()
+ for s_scene, xy_cropcoor in d_crop.items():
+ print(s_scene)
+ s_sample = s_scene.split('-Scene-')[0]
+ os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation/')
+ for s_file in os.listdir():
+ if s_file.find(s_find) > -1: #Nuclei Segmentation Basins.tif #Cell Segmentation Basins.tif
+ # substring match on the scene name with the sample removed (e.g. '-Scene-...')
+ if s_file.find(s_scene.split(s_sample)[1]) > -1:
+ a_seg = skimage.io.imread(s_file)
+ d_label.update({s_scene:a_seg})
+ os.chdir(cwd)
+ return(d_label)
+
+def crop_labels(d_crop,d_label,tu_dim,cropdir,s_name='Nuclei Segmentation Basins'):
+ """
+ crop the segmentation basins (cell of nuceli) to same coord as images for veiwing in Napari
+ s_name =
+ """
+ for s_scene, xy_cropcoor in d_crop.items():
+ print(s_scene)
+ a_seg = d_label[s_scene]
+ a_crop = a_seg[(xy_cropcoor[1]):(xy_cropcoor[1]+tu_dim[1]),(xy_cropcoor[0]):(xy_cropcoor[0]+tu_dim[0])]
+ s_coor = f'x{xy_cropcoor[0]}y{xy_cropcoor[1]}.tif'
+ #crop file
+ s_file_new = f'{cropdir}/{s_name.replace(" ","").replace(".tif",s_coor)}'
+ print(s_file_new)
+ skimage.io.imsave(s_file_new,a_crop)
+
+
+#### OLD: for Guillaume's pipeline ###
+
+def copy_files(dir,dapi_copy, marker_copy,b_test=True):
+ """
+ copy and rename files if needed as dummies, in every sub-directory of dir
+ dir = directory whose sub-directories hold the images (the working directory is changed and not restored)
+ dapi_copy = old round text: new round number, handed to preprocess.copy_dapis
+ marker_copy = original marker: list of copies, handed to preprocess.copy_markers
+ b_test = True: dry run; False: copy; any other value that is neither truthy nor equal to False: do nothing
+ """
+ os.chdir(dir)
+ for idx, s_dir in enumerate(sorted(os.listdir())):
+ s_path = f'{dir}/{s_dir}'
+ os.chdir(s_path)
+ s_sample = s_dir.split('-Scene')[0]
+ df_img = mpimage.parse_org(s_end = "ORG.tif")
+ print(s_dir)
+ # the two branches below are identical except for the b_test flag passed to the helpers
+ if b_test:
+ for key, dapi_item in dapi_copy.items():
+ preprocess.copy_dapis(s_r_old=key,s_r_new=f'-R{dapi_item}_',s_c_old='_c1_',s_c_new='_c2_',s_find='_c1_ORG.tif',b_test=True)
+ # i_count accumulates the number of marker copies made so far and offsets i_last_round
+ i_count=0
+ # NOTE(review): this idx shadows the idx of the directory loop
+ for idx,(key, item) in enumerate(marker_copy.items()):
+ preprocess.copy_markers(df_img, s_original=key, ls_copy = item,i_last_round= dapi_item + i_count, b_test=True)
+ i_count=i_count + len(item)
+ elif b_test==False:
+ print('Changing name - not a test')
+ for key, dapi_item in dapi_copy.items():
+ preprocess.copy_dapis(s_r_old=key,s_r_new=f'-R{dapi_item}_',s_c_old='_c1_',s_c_new='_c2_',s_find='_c1_ORG.tif',b_test=False)
+ i_count=0
+ for idx,(key, item) in enumerate(marker_copy.items()):
+ preprocess.copy_markers(df_img, s_original=key, ls_copy = item,i_last_round= dapi_item + i_count, b_test=False)
+ i_count=i_count + len(item)
+ else:
+ pass
+
+def segmentation_thresholds(regdir,qcdir, d_segment):
+ """
+ visualize binary mask of segmentaiton threholds
+ """
+ preprocess.cmif_mkdir([f'{qcdir}/Segmentation'])
+ os.chdir(regdir)
+ for idx, s_dir in enumerate(sorted(os.listdir())):
+ s_path = f'{regdir}/{s_dir}'
+ os.chdir(s_path)
+ df_img = mpimage.parse_org(s_end = "ORG.tif",type='reg')
+ s_sample = s_dir.split('-Scene')[0]
+ print(s_sample)
+ if (len(set(df_img.scene))) < 3:
+ d_seg = preprocess.check_seg_markers(df_img,d_segment, i_rows=1, t_figsize=(10,6)) #few scenes
+ elif (len(set(df_img.scene))) > 8:
+ d_seg = preprocess.check_seg_markers(df_img,d_segment, i_rows=3, t_figsize=(10,6)) #more scenes
+ else:
+ d_seg = preprocess.check_seg_markers(df_img,d_segment, i_rows=2, t_figsize=(10,6)) #more scenes
+ for key, fig in d_seg.items():
+ fig.savefig(f'{qcdir}/Segmentation/{s_dir}_{key}_segmentation.png')
+
+def move_af_img(s_sample, regdir, subdir, dirtype='tma',b_move=False):
+ '''
+ move the images of a sample from regdir to subdir
+ s_sample = only sub-directories of regdir whose name contains this text are handled
+ regdir = directory with one sub-directory of registered images per scene (the working directory is changed and not restored)
+ subdir = destination root
+ dirtype = 'single' or 'tma' or 'unsub'
+ 'single': {regdir}/{dir}/AFSubtracted -> {subdir}/{dir}
+ 'tma': {regdir}/{dir}/AFSubtracted -> {subdir}/{s_sample}
+ 'unsub': {regdir}/{dir} -> {subdir}/{s_sample}
+ b_move = False: only print what would be moved; True: move the files
+ '''
+ # NOTE(review): any other dirtype leaves movedir undefined and fails when b_move is True
+ #move
+ os.chdir(regdir)
+ for s_dir in sorted(os.listdir()):
+ if s_dir.find(s_sample)>-1:
+ # create the destination directory
+ if dirtype =='single':
+ preprocess.cmif_mkdir([f'{subdir}/{s_dir}'])
+ elif dirtype == 'tma':
+ preprocess.cmif_mkdir([f'{subdir}/{s_sample}'])
+ elif dirtype == 'unsub':
+ preprocess.cmif_mkdir([f'{subdir}/{s_sample}'])
+ # enter the source directory
+ if dirtype != 'unsub':
+ print(f'{regdir}/{s_dir}/AFSubtracted')
+ os.chdir(f'{regdir}/{s_dir}/AFSubtracted')
+ else:
+ os.chdir(f'{regdir}/{s_dir}')
+ for s_file in sorted(os.listdir()):
+ # despite its name, movedir is the full destination path of the file
+ if dirtype =='single':
+ movedir = f'{subdir}/{s_dir}/{s_file}'
+ print(f'{regdir}/{s_dir}/AFSubtracted/{s_file} moved to {movedir}')
+ elif dirtype == 'tma':
+ movedir = f'{subdir}/{s_sample}/{s_file}'
+ print(f'{regdir}/{s_dir}/AFSubtracted/{s_file} moved to {movedir}')
+ elif dirtype == 'unsub':
+ movedir = f'{subdir}/{s_sample}/{s_file}'
+ print(f'{regdir}/{s_dir}/{s_file} moved to {movedir}')
+ if b_move:
+ if dirtype != 'unsub':
+ shutil.move(f'{regdir}/{s_dir}/AFSubtracted/{s_file}', f'{movedir}')
+ else:
+ shutil.move(f'{regdir}/{s_dir}/{s_file}', f'{movedir}')
def extract_dataframe(s_sample, segdir,qcdir,i_rows=1):
    '''
    get mean intensity, centroid dataframes and save full color segmentation QC figures

    s_sample: sample name; feature folders matching ^{s_sample}.*_Features$ are read
    segdir: segmentation directory (becomes the working directory)
    qcdir: QC directory; figures are written to {qcdir}/Segmentation
    i_rows: number of figure rows handed to process.check_seg

    side effect: leaves the working directory at {segdir}/{s_sample}_Segmentation
    '''
    s_qc = f'{qcdir}/Segmentation'
    preprocess.cmif_mkdir([s_qc])
    os.chdir(segdir)
    # the return value is not used here; presumably get_df writes the dataframes itself (verify in getdata)
    getdata.get_df(
        s_folder_regex=f"^{s_sample}.*_Features$",
        es_value_label = {"MeanIntensity","CentroidY","CentroidX"},
    )
    os.chdir(f'{s_sample}_Segmentation')
    d_fig = process.check_seg(s_sample=s_sample,ls_find=['Cell Segmentation Full Color'], i_rows=i_rows, t_figsize=(8,8))
    for s_name, o_fig in d_fig.items():
        o_fig.savefig(f'{s_qc}/FullColor_{s_name}.png')
+
def metadata_table(regdir,segdir):
    """
    output channel/marker mapping

    for every folder in regdir, write {segdir}/metadata_{sample}_RoundsCyclesTable.csv
    with one row per non-c1 image (sorted by round, then color) and constant
    minimum/maximum/exposure/refexp/location columns.

    side effect: changes the working directory and does not restore it
    """
    os.chdir(regdir)
    for s_dir in sorted(os.listdir()):
        os.chdir(f'{regdir}/{s_dir}')
        df_img = mpimage.parse_org(s_end = "ORG.tif",type='reg')
        ls_scene = sorted(set(df_img.scene))
        if len(ls_scene) > 1:
            # multi scene folder: the table is built from the second scene (sorted order) only
            df_img = df_img[df_img.scene==ls_scene[1]]
            s_sample = s_dir
        else:
            s_sample = s_dir.split('-Scene')[0]
        print(s_sample)
        # c1 images are left out of the marker table
        df_marker = df_img[df_img.color!='c1'].sort_values(['rounds','color'])
        df_dapi = pd.DataFrame(index = [df_marker.marker.tolist()],columns=['rounds','colors','minimum','maximum','exposure','refexp','location'])
        df_dapi['rounds'] = df_marker.loc[:,['rounds']].values
        df_dapi['colors'] = df_marker.loc[:,['color']].values
        for s_column, o_value in (('minimum',1003),('maximum',65535),('exposure',100),('refexp',100),('location','All')):
            df_dapi[s_column] = o_value
        df_dapi.to_csv(f'{segdir}/metadata_{s_sample}_RoundsCyclesTable.csv',header=True)
+
def segmentation_inputs(regdir,segdir, d_segment,tma_bool=False,b_start=False,i_counter=0,b_java=False):
    """
    make inputs for guillaumes segmentation

    regdir: directory with one sub-folder of registered images per slide/scene
    segdir: output directory for metadata_{sample}_RoundsCyclesTable.csv
    d_segment: marker -> value written to the 'minimum' column of that marker
    tma_bool: forwarded to preprocess.cluster_java as b_TMA
    b_java: also write RoundsCyclesTable.txt and create the cluster java job JE{idx + i_counter}
    b_start: run make / make slurm inside the job folder that was just created
    i_counter: offset added to the folder counter to number the job folders

    side effect: changes the working directory and does not restore it
    """

    os.chdir(regdir)
    for idx, s_dir in enumerate(sorted(os.listdir())):
        s_path = f'{regdir}/{s_dir}'
        os.chdir(s_path)
        df_img = mpimage.parse_org(s_end = "ORG.tif",type='reg')
        if len(set(df_img.scene)) > 1:
            # multi scene folder: the table is built from the second scene (sorted order) only
            df_img = df_img[df_img.scene==sorted(set(df_img.scene))[1]]
            s_sample = s_dir
        else:
            s_sample = s_dir.split('-Scene')[0]
        print(s_sample)
        df_marker = df_img[df_img.color!='c1']
        df_marker = df_marker.sort_values(['rounds','color'])
        df_dapi = pd.DataFrame(index = [df_marker.marker.tolist()],columns=['rounds','colors','minimum','maximum','exposure','refexp','location'])
        df_dapi['rounds'] = df_marker.loc[:,['rounds']].values
        df_dapi['colors'] = df_marker.loc[:,['color']].values
        df_dapi['minimum'] = 1003
        df_dapi['maximum'] = 65535
        df_dapi['exposure'] = 100
        df_dapi['refexp'] = 100
        df_dapi['location'] = 'All'
        for s_key,i_item in d_segment.items():
            df_dapi.loc[s_key,'minimum'] = i_item
        df_dapi.to_csv(f'{segdir}/metadata_{s_sample}_RoundsCyclesTable.csv',header=True)
        #create cluster.java file
        # one job name for creating, entering and reporting the job folder;
        # the start step used to enter JE{idx}, which is a different folder whenever i_counter != 0
        s_job = f'JE{idx + i_counter}'
        if b_java:
            df_dapi.to_csv('RoundsCyclesTable.txt',sep=' ',header=False)
            preprocess.cluster_java(s_dir=s_job,s_sample=s_sample,imagedir=f'{s_path}',segmentdir=segdir,type='exacloud',b_segment=True,b_TMA=tma_bool)
            if b_start:
                # NOTE(review): s_work_path is not defined in this function; presumably a module-level global - confirm
                os.chdir(f'{s_work_path}/exacloud/{s_job}') #exacloud
                #shutil.copyfile(f'{s_src_path}/src/javawrapper.sh', './javawrapper.sh')
                print(s_job)
                subprocess.run(["make"])
                subprocess.run(["make", "slurm"])
+
def prepare_dataframe(s_sample,ls_dapi,dapi_thresh,d_channel,ls_exclude,segdir,codedir,s_af='none', b_afsub=False):
    '''
    filter data by last dapi and standard location, write the filtered csv files
    ls_dapi[0] becomes s_dapi

    s_sample: sample name used to load the data and to name the output files
    ls_dapi: only ls_dapi[0] is used by the active code (DAPI filter and file names)
    dapi_thresh: threshold handed to process.filter_dapi
    d_channel: handed to process.filter_standard
    segdir: working directory; features_{s_sample}_CentroidXY.csv and
        features_{s_sample}_FilteredMeanIntensity_{ls_dapi[0]}{dapi_thresh}.csv are written there
    ls_exclude, codedir, s_af, b_afsub: only referenced inside the disabled block below

    NOTE: background quantiles, exposure normalization, AF subtraction and the
    threshold csv are disabled (wrapped in a string literal) and do not run.
    side effect: changes the working directory to segdir and does not restore it
    '''

    os.chdir(f'{segdir}')
    #load data
    df_mi = process.load_mi(s_sample)
    df_xy = process.load_xy(s_sample)
    #drop extra centroid columns,add scene column
    df_xy = df_xy.loc[:,['DAPI_X','DAPI_Y']]
    df_xy = process.add_scene(df_xy)
    df_xy.to_csv(f'features_{s_sample}_CentroidXY.csv')
    #filter by last DAPI
    df_dapi_mi = process.filter_dapi(df_mi,df_xy,ls_dapi[0],dapi_thresh,b_images=True)

    #filter mean intensity by biomarker location in metadata
    # es_standard is only consumed by the disabled block below
    df_filter_mi, es_standard = process.filter_standard(df_dapi_mi,d_channel,s_dapi=ls_dapi[0])

    df_filter_mi.to_csv(f'features_{s_sample}_FilteredMeanIntensity_{ls_dapi[0]}{dapi_thresh}.csv')
    #background quantiles
    # everything between the triple quotes is a bare string expression, i.e. disabled code
    '''
    df_bg = process.filter_background(df_mi, es_standard)
    df_bg.to_csv(f'features_{s_sample}_BackgroundQuantiles.csv')
    df_bg = process.filter_background(df_dapi_mi, es_standard)
    df_bg.to_csv(f'features_{s_sample}_FilteredBackgroundQuantiles.csv')

    df_t = pd.read_csv(f'metadata_{s_sample}_RoundsCyclesTable.csv',index_col=0,header=0)
    df_exp = pd.read_csv(f'{codedir}/{s_sample}_ExposureTimes.csv',index_col=0,header=0)
    df_tt = process.add_exposure_roundscyles(df_t, df_exp,es_standard, ls_dapi = ls_dapi)
    df_tt.to_csv(f'metadata_{s_sample}_RoundsCyclesTable_ExposureTimes.csv')
    if b_afsub:
        #load metadata
        df_t = pd.read_csv(f'metadata_{s_sample}_RoundsCyclesTable_ExposureTimes.csv',index_col=0,header=0)
        #normalize by exposure time, and save to csv
        lb_columns = [len(set([item]).intersection(set(df_t.index)))>0 for item in [item.split('_')[0] for item in df_filter_mi.columns]]
        df_filter_mi = df_filter_mi.loc[:,lb_columns]
        df_norm = process.exposure_norm(df_filter_mi,df_t)
        df_norm.to_csv(f'features_{s_sample}_ExpNormalizedMeanIntensity_{ls_dapi[0]}{dapi_thresh}.csv')
        #subtract AF channels in data
        df_sub,ls_sub,ls_record = process.af_subtract(df_norm,df_t,d_channel,ls_exclude)
        df_out = process.output_subtract(df_sub,df_t)
        df_sub.to_csv(f'features_{s_sample}_AFSubtractedMeanIntensityNegative{s_af}_{ls_dapi[0]}{dapi_thresh}.csv')
        df_out.to_csv(f'features_{s_sample}_AFSubtractedMeanIntensity{s_af}_{ls_dapi[0]}{dapi_thresh}.csv')
        f = open(f"{s_sample}_AFsubtractionData_{s_af}.txt", "w")
        f.writelines(ls_record)
        f.close()
    else:
        df_out = df_filter_mi
    #output thresholding csv
    #df_out = process.add_scene(df_out) #df_out
    #df_thresh = process.make_thresh_df(df_out,ls_drop=None)
    #df_thresh.to_csv(f'thresh_XX_{s_sample}.csv')
    '''
    print('Done')
+
def fetch_celllabel(s_sampleset, s_slide, s_ipath, s_opath = './', es_scene = None, es_filename_endswith ={'Cell Segmentation Basins.tif', 'Nuclei Segmentation Basins.tif'}, s_sep = ' - ', b_test=True):
    '''
    input:
        s_sampleset: sample set name. e.g. jptma
        s_slide: slide name. e.g. jp-tma1-1
        es_scene: set of scenes of interest. The scenes have to be written in the same way as in the basin file name.
            if None, all scenes are of interest. default is None.
        s_ipath: absolute or relative path where the basin files can be found.
            a trailing path separator is optional.
        s_opath: path to where the fetched basin files should be output.
            a folder, based on the s_sampleset, will be generated (if it does not already exist), where the basin files will be placed.
            a trailing path separator is optional.
        es_filename_endswith: set of patterns that define the endings of the files of interest.
        s_sep: separator to separate slide and scenes in the file name.
        b_test: test flag. if True no files will be copied, it is just a simulation mode.

    output:
        folder with basin files. placed at {s_opath}/{s_sampleset}_segmentation_basin/

    description:
        fetches basin (cell label) files from Guillaume's segmentation pipeline
        and copies them into a folder at s_opath, named according to s_sampleset name.
        exits via sys.exit if es_scene is given and the number of matching files
        differs from len(es_scene) * len(es_filename_endswith).
    '''
    # generate output directory
    # os.path.join gives the same path as plain concatenation when the caller
    # passes a trailing separator, and a correct one when the caller does not
    s_outdir = os.path.join(s_opath, '{}_segmentation_basin'.format(s_sampleset))
    os.makedirs(s_outdir, exist_ok=True)
    # processing
    if (es_scene is None):
        i_total = 'all'
        es_sanity_scene = set()
    else:
        i_total = len(es_scene) * len(es_filename_endswith)
        # private copy (also accepts a list or tuple) so the caller's collection is never modified
        es_sanity_scene = set(es_scene)
    tu_ending = tuple(es_filename_endswith)
    i = 0
    for s_file in sorted(os.listdir(s_ipath)):
        # check for file of interest
        if not s_file.endswith(tu_ending):
            continue
        b_flag = False
        if (es_scene is None):
            b_flag = True
        else:
            for s_scene in es_scene:
                if (s_file.startswith(s_scene + s_sep)):
                    es_sanity_scene.discard(s_scene)
                    b_flag = True
                    break
        # copy file
        if (b_flag):
            i += 1
            print('copy {}/{}: {}{}{} ...'.format(i, i_total, s_slide, s_sep, s_file))
            if not (b_test):
                shutil.copyfile(
                    src=os.path.join(s_ipath, s_file),
                    dst=os.path.join(s_outdir, '{}{}{}'.format(s_slide, s_sep, s_file)),
                )
    # sanity check
    if not (es_scene is None) and (i != i_total):
        sys.exit('Error: no file found for es_scene specified scene {}'.format(sorted(es_sanity_scene)))
\ No newline at end of file
diff --git a/mplex_image/codex.py b/mplex_image/codex.py
new file mode 100755
index 0000000..a67c58a
--- /dev/null
+++ b/mplex_image/codex.py
@@ -0,0 +1,452 @@
+# wrapper functions for codex image processing
+
+#from mplex_image import preprocess, mpimage, process,
+from mplex_image import features
+import os
+import pandas as pd
+import math
+import skimage
+from skimage import io, filters
+import re
+import numpy as np
+
def parse_img(s_end = ".tif",s_start='reg'):
    """
    parse the images in the current working directory that follow the akoya stitched naming convention

    s_end: file name ending to keep
    s_start: file name beginning to keep
    returns: dataframe indexed by file name with columns
        rounds (text after 'cyc' in the 2nd underscore token), color (4th token),
        slide (1st token), marker and marker_string (last token up to the first '.'),
        path (working directory + file name)
    """
    s_path = os.getcwd()
    ls_file = [s_file for s_file in os.listdir() if s_file.endswith(s_end) and s_file.startswith(s_start)]
    df_img = pd.DataFrame(index=ls_file)
    # split every file name once, then pick the tokens per column
    lls_token = [s_file.split('_') for s_file in ls_file]
    df_img['rounds'] = [ls_token[1].split('cyc')[1] for ls_token in lls_token]
    df_img['color'] = [ls_token[3] for ls_token in lls_token]
    df_img['slide'] = [ls_token[0] for ls_token in lls_token]
    ls_marker = [ls_token[-1].split('.')[0] for ls_token in lls_token]
    df_img['marker'] = ls_marker
    df_img['marker_string'] = ls_marker
    df_img['path'] = [f"{s_path}/{s_file}" for s_file in ls_file]
    return(df_img)
+
def load_li(ls_sample):
    '''
    load threshold on the segmentation marker images acquired during feature extraction

    ls_sample: sample names; thresh_{s_sample}_ThresholdLi.csv is read from the
        current working directory for each of them
    returns: all tables stacked in ls_sample order, with columns rounds, color,
        slide, scene, marker and slide_scene parsed from the file-name index;
        an empty dataframe when ls_sample is empty
    '''
    ldf_img = []
    for s_sample in ls_sample:
        df_img = pd.read_csv(f'thresh_{s_sample}_ThresholdLi.csv', index_col=0)
        df_img['rounds'] = [item.split('_')[1].split('cyc')[1] for item in df_img.index]
        df_img['color'] = [item.split('_')[3] for item in df_img.index]
        df_img['slide'] = s_sample
        df_img['scene'] = [item.split('_')[0].split('reg')[1] for item in df_img.index]
        df_img['marker'] = [item.split('_')[-1].split('.')[0] for item in df_img.index] #parse file name for biomarker
        df_img['slide_scene'] = df_img.slide + '_scene' + df_img.scene
        ldf_img.append(df_img)
    # DataFrame.append was removed in pandas 2.0; one concat at the end also avoids repeated copying
    if len(ldf_img) == 0:
        return(pd.DataFrame())
    return(pd.concat(ldf_img))
+
def underscore_to_dash(df_mi_full,df_img_all):
    '''
    the underscore in sample names will break downstream code; change to dash

    df_mi_full: dataframe indexed by '{slide}_scene{...}' cell names; its index is
        rewritten and the columns slide, scene and slide_scene are (re)written
    df_img_all: dataframe with slide and scene columns; slide and slide_scene are rewritten
    both frames are modified in place and returned as (df_mi_full, df_img_all)
    '''
    # cell table: dash the slide part (text before '_scene') and rebuild the cell names
    lls_part = [s_cell.split('_scene') for s_cell in df_mi_full.index]
    df_mi_full['slide'] = [ls_part[0].replace('_','-') for ls_part in lls_part]
    ls_cell = [f"{s_slide}_scene{ls_part[1]}" for s_slide, ls_part in zip(df_mi_full['slide'], lls_part)]
    df_mi_full.index = ls_cell
    df_mi_full['scene'] = [s_cell.split('_')[1] for s_cell in ls_cell]
    df_mi_full['slide_scene'] = df_mi_full['slide'] + '_' + df_mi_full['scene']
    # image table: same renaming of the slide column
    df_img_all['slide'] = [s_slide.replace('_','-') for s_slide in df_img_all['slide']]
    df_img_all['slide_scene'] = df_img_all['slide'] + '_scene' + df_img_all['scene']
    return(df_mi_full,df_img_all)
+
def extract_cellpose_features(s_sample, segdir, subdir, ls_seg_markers, nuc_diam, cell_diam,s_scene='reg001'):
    '''
    load the segmentation results, the input images, and the channels images
    extract mean intensity from each image for every subcellular region, plus
    centroid, area and eccentricity for the nuclei and euler number, area and
    eccentricity for the cells

    s_sample: sample name, used as prefix of the cell ids
    segdir: segmentation directory; {segdir}/{s_scene}Cellpose_Segmentation is used if it exists
    subdir: image directory; {subdir}/{s_sample}, {subdir}/{s_scene} or subdir itself is used
    ls_seg_markers, nuc_diam, cell_diam: identify the label files by name
    s_scene: only used to locate the segmentation folder (see NOTE below)
    returns: (df_sample, df_thresh): one row per nucleus with all features, and
        one row per image with li / otsu / triangle thresholds

    side effect: changes the working directory and does not restore it
    '''

    df_thresh = pd.DataFrame()
    ldf_feat = []
    if os.path.exists(f'{segdir}/{s_scene}Cellpose_Segmentation'):
        os.chdir(f'{segdir}/{s_scene}Cellpose_Segmentation')
    else:
        os.chdir(f'{segdir}')
    # map scene -> matched cell label file (two naming variants are accepted)
    s_seg_markers = ".".join(ls_seg_markers)
    d_match = {}
    for s_file in os.listdir():
        b_short = s_file.find(f'{s_seg_markers} matchedcell{cell_diam} - Cell Segmentation Basins')>-1
        b_long = s_file.find(f'{s_seg_markers} nuc{nuc_diam} matchedcell{cell_diam} - Cell Segmentation Basins')>-1
        if b_short or b_long:
            d_match.update({s_file.split('_')[0]:s_file})
    # NOTE(review): the scene is hard coded to 'reg001' and shadows the s_scene parameter - confirm this is intended
    for s_scene in ['reg001']: #one scene
        print(f'processing {s_scene}')
        s_dapi = None
        for s_file in os.listdir():
            if s_file.find(s_scene) > -1:
                if s_file.find("DAPI.png") > -1:
                    s_dapi = s_file
        if s_dapi is None:
            raise FileNotFoundError(f'no {s_scene} DAPI.png image found in {os.getcwd()}')
        dapi = io.imread(s_dapi)
        print(f'loading {s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif')
        labels = io.imread(f'{s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif')
        print(f'loading {d_match[s_scene]}')
        cell_labels = io.imread(d_match[s_scene])
        #nuclear features
        df_feat = features.extract_feat(labels,dapi, properties=(['mean_intensity']))
        df_feat.columns = [f'{item}_segmented-nuclei' for item in df_feat.columns]
        # the regionprops table has a 0..n-1 row index, not the label ids; use the label ids
        # (same idiom as for the marker tables below) so the cell names line up in the merge
        df_feat.index = sorted(np.unique(labels)[1::])
        df_feat.index = [f'{s_sample}_scene{s_scene.split("reg")[1]}_cell{item}' for item in df_feat.index]

        #get subcellular regions
        cyto = features.label_difference(labels,cell_labels)
        d_loc_nuc = features.subcellular_regions(labels, distance_short=2, distance_long=4)
        d_loc_cell = features.subcellular_regions(cell_labels, distance_short=2, distance_long=4)
        d_loc = {'nuclei':labels,'cell':cell_labels,'cytoplasm':cyto,
            'nucmem':d_loc_nuc['membrane'][0],'cellmem':d_loc_cell['membrane'][0],
            'perinuc4':d_loc_nuc['ring'][1],'exp4':d_loc_nuc['grown'][1],
            'nucadj2':d_loc_nuc['straddle'][0],'celladj2':d_loc_cell['straddle'][0]}
        #subdir organized by slide or scene
        if os.path.exists(f'{subdir}/{s_sample}'):
            os.chdir(f'{subdir}/{s_sample}')
        elif os.path.exists(f'{subdir}/{s_scene}'):
            os.chdir(f'{subdir}/{s_scene}')
        else:
            os.chdir(f'{subdir}')
        df_img = parse_img()
        df_img['round_int'] = [int(re.sub('[^0-9]','', item)) for item in df_img.rounds]
        # rounds numbered 90 and above are excluded
        df_img = df_img[df_img.round_int < 90]
        df_img = df_img.sort_values('round_int')
        df_scene = df_img# one scene [df_img.scene==s_scene.split("-Scene-")[1].split("_")[0]]

        #load each image
        for s_index in df_scene.index:
            intensity_image = io.imread(s_index)
            df_thresh.loc[s_index,'threshold_li'] = filters.threshold_li(intensity_image)
            # otsu and triangle thresholds are only computed for images that are not all zero
            if intensity_image.mean() > 0:
                df_thresh.loc[s_index,'threshold_otsu'] = filters.threshold_otsu(intensity_image)
                df_thresh.loc[s_index,'threshold_triangle'] = filters.threshold_triangle(intensity_image)
            s_marker = df_scene.loc[s_index,'marker']
            print(f'extracting features {s_marker}')
            #if s_marker == 'DAPI':
            #    s_marker = s_marker + f'{df_scene.loc[s_index,"rounds"].split("cyc")[1]}'
            for s_loc, a_loc in d_loc.items():
                if s_loc == 'nuclei':
                    df_marker_loc = features.extract_feat(a_loc,intensity_image, properties=(['mean_intensity','centroid','area','eccentricity']))
                    df_marker_loc.columns = [f'{s_marker}_{s_loc}',f'{s_marker}_{s_loc}_centroid-0',f'{s_marker}_{s_loc}_centroid-1',f'{s_marker}_{s_loc}_area',f'{s_marker}_{s_loc}_eccentricity']
                elif s_loc == 'cell':
                    df_marker_loc = features.extract_feat(a_loc,intensity_image, properties=(['mean_intensity','euler_number','area','eccentricity']))
                    df_marker_loc.columns = [f'{s_marker}_{s_loc}',f'{s_marker}_{s_loc}_euler',f'{s_marker}_{s_loc}_area',f'{s_marker}_{s_loc}_eccentricity']
                else:
                    df_marker_loc = features.extract_feat(a_loc,intensity_image, properties=(['mean_intensity']))
                    df_marker_loc.columns = [f'{s_marker}_{s_loc}']

                #drop zero from array, set array ids as index
                df_marker_loc.index = sorted(np.unique(a_loc)[1::])
                df_marker_loc.index = [f'{s_sample}_scene{s_scene.split("reg")[1]}_cell{item}' for item in df_marker_loc.index]
                df_feat = df_feat.merge(df_marker_loc, left_index=True,right_index=True,how='left',suffixes=('',f'{s_marker}_{s_loc}'))
        ldf_feat.append(df_feat)
    # DataFrame.append was removed in pandas 2.0; stack the per scene tables once
    if len(ldf_feat) == 0:
        df_sample = pd.DataFrame()
    else:
        df_sample = pd.concat(ldf_feat)
    return(df_sample, df_thresh)
+
def convert_tif(regdir,b_mkdir=True):
    '''
    convert codex tif to standard tif

    regdir: directory holding the 'reg...' image folders; every file with '.tif'
        in its name is re-written to {regdir}/converted_{folder}/{file}
    b_mkdir: create the converted_{folder} directory before writing
    the working directory is restored when the function returns or fails
    '''
    # function-scope import: the module-level import of preprocess is commented out in this file
    from mplex_image import preprocess
    cwd = os.getcwd()
    try:
        os.chdir(regdir)
        for s_dir in sorted(os.listdir()):
            if s_dir.find('reg')== 0:
                os.chdir(s_dir)
                for s_file in sorted(os.listdir()):
                    if s_file.find('.tif')>-1:
                        if b_mkdir:
                            preprocess.cmif_mkdir([f'{regdir}/converted_{s_dir}'])
                        a_dapi = skimage.io.imread(s_file)
                        # NOTE(review): skimage.external was dropped from newer scikit-image releases - confirm the pinned version
                        with skimage.external.tifffile.TiffWriter(f'{regdir}/converted_{s_dir}/{s_file}') as tif:
                            tif.save(a_dapi)
                os.chdir('..')
    finally:
        os.chdir(cwd)
+
def visualize_reg_images(s_sample,regdir,qcdir,color='ch001'):
    """
    array registered images to check tissue identity, focus, etc.

    s_sample: written to the slide column of the image table
    regdir: directory with the registered 'reg*.tif' images
    qcdir: QC directory; one figure per scene goes to {qcdir}/RegisteredImages
    color: channel token (3rd underscore token of the file name) to display
    returns: the sorted image table of the last scene processed
    the working directory is restored when the function returns or fails
    """
    # function-scope import: the module-level import of these modules is commented out in this file
    from mplex_image import mpimage, preprocess
    #check registration
    preprocess.cmif_mkdir([f'{qcdir}/RegisteredImages'])
    cwd = os.getcwd()
    try:
        os.chdir(regdir)
        df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='reg',s_split='_')
        df_img.rename({'data':'scene'},axis=1,inplace=True)
        df_img['slide'] = s_sample
        # file name tokens: {scene}_{rounds}_{color}_{marker}.tif
        lls_token = [item.split('_') for item in df_img.index]
        df_img['rounds'] = [ls_token[1] for ls_token in lls_token]
        df_img['color'] = [ls_token[2] for ls_token in lls_token]
        df_img['marker'] = [ls_token[3].split('.')[0] for ls_token in lls_token]
        ls_scene = sorted(set(df_img.scene))
        for s_scene in ls_scene:
            print(s_scene)
            df_img_scene = df_img[df_img.scene == s_scene]
            df_img_stain = df_img_scene[df_img_scene.color==color]
            df_img_sort = df_img_stain.sort_values(['rounds'])
            # smallest square grid that holds all rounds
            i_sqrt = math.ceil(math.sqrt(len(df_img_sort)))
            fig = mpimage.array_img(df_img_sort,s_column='color',s_row='rounds',s_label='marker',tu_array=(i_sqrt,i_sqrt),tu_fig=(16,14))
            fig.savefig(f'{qcdir}/RegisteredImages/{s_scene}_registered_{color}.png')
    finally:
        os.chdir(cwd)
    return(df_img_sort)
+
def rename_files(d_rename,dir,b_test=True):
    """
    change file names in every 'converted*' folder of dir

    d_rename: forwarded to preprocess.dchange_fname
        (presumably old -> new name fragments; verify there)
    dir: directory holding the 'converted*' folders
    b_test: True only simulates; False renames; any other falsy value does nothing
    the working directory is restored when the function returns or fails
    """
    # function-scope import: the module-level import of preprocess is commented out in this file
    from mplex_image import preprocess
    cwd = os.getcwd()
    try:
        os.chdir(dir)
        for s_dir in sorted(os.listdir()):
            if s_dir.find('converted') == 0:
                os.chdir(f'{dir}/{s_dir}')
                print(s_dir)
                if b_test:
                    print('This is a test')
                    preprocess.dchange_fname(d_rename,b_test=True)
                elif b_test==False:
                    print('Changing name - not a test')
                    preprocess.dchange_fname(d_rename,b_test=False)
    finally:
        os.chdir(cwd)
+
def rename_fileorder(s_sample, dir, b_test=True):
    """
    change file names from {scene}_{rounds}_{color}_{marker}.tif
    to {rounds}_{s_sample}-{scene}_{marker}_{color}_ORG.tif
    in every 'converted*' folder of dir

    s_sample: sample name put in front of the scene
    dir: directory holding the 'converted*' folders
    b_test: True only simulates; False renames; any other falsy value does nothing
    the working directory is restored when the function returns or fails
    """
    # function-scope import: the module-level import of these modules is commented out in this file
    from mplex_image import mpimage, preprocess
    cwd = os.getcwd()
    try:
        os.chdir(dir)
        for s_dir in sorted(os.listdir()):
            if s_dir.find('converted') == 0:
                os.chdir(f'{dir}/{s_dir}')
                print(s_dir)
                df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='Scene',s_split='_')
                df_img.rename({'data':'scene'},axis=1,inplace=True)
                lls_token = [item.split('_') for item in df_img.index]
                df_img['rounds'] = [ls_token[1] for ls_token in lls_token]
                df_img['color'] = [ls_token[2] for ls_token in lls_token]
                df_img['marker'] = [ls_token[3].split('.')[0] for ls_token in lls_token]
                for s_index in df_img.index:
                    s_round = df_img.loc[s_index,'rounds']
                    s_scene= f"{s_sample}-{df_img.loc[s_index,'scene']}"
                    s_marker = df_img.loc[s_index,'marker']
                    s_color = df_img.loc[s_index,'color']
                    s_index_rename = f'{s_round}_{s_scene}_{s_marker}_{s_color}_ORG.tif'
                    d_rename = {s_index:s_index_rename}
                    if b_test:
                        print('This is a test')
                        preprocess.dchange_fname(d_rename,b_test=True)
                    elif b_test==False:
                        print('Changing name - not a test')
                        preprocess.dchange_fname(d_rename,b_test=False)
    finally:
        os.chdir(cwd)
+
def copy_files(dir,dapi_copy, marker_copy,testbool=True,type='codex'):
    """
    copy and rename files if needed as dummies
    need to edit

    dir: directory holding the 'converted*' folders
    dapi_copy: {old round key: new round number}; the c1 image of the old round is
        copied with preprocess.copy_dapis to round R{number}_ as c2
    marker_copy: {original marker: list of copies}, handed to preprocess.copy_markers;
        rounds are counted on from the last number in dapi_copy
    testbool: forwarded as b_test to the preprocess helpers
    type: forwarded to the preprocess helpers
    returns: image table of the last 'converted*' folder processed

    side effect: changes the working directory and does not restore it
    """
    # function-scope import: the module-level import of these modules is commented out in this file
    from mplex_image import mpimage, preprocess
    os.chdir(dir)
    for s_dir in sorted(os.listdir()):
        if s_dir.find('converted') == 0:
            os.chdir(f'{dir}/{s_dir}')
            df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='R0',s_split='_')
            df_img.rename({'data':'rounds'},axis=1,inplace=True)
            # file name tokens: {rounds}_{scene}_{marker}_{color}_...
            lls_token = [item.split('_') for item in df_img.index]
            df_img['scene'] = [ls_token[1] for ls_token in lls_token]
            df_img['color'] = [ls_token[3] for ls_token in lls_token]
            df_img['marker'] = [ls_token[2].split('.')[0] for ls_token in lls_token]
            print(s_dir)
            for key, dapi_item in dapi_copy.items():
                df_dapi = df_img[(df_img.rounds== key.split('_')[0]) & (df_img.color=='c1')]
                # explicit positional access: the index holds file names, so [0] relied on a
                # positional fallback that newer pandas versions deprecate
                s_dapi = df_dapi['marker'].iloc[0]
                preprocess.copy_dapis(s_r_old=key,s_r_new=f'R{dapi_item}_',s_c_old='_c1_',
                    s_c_new='_c2_',s_find=f'_{s_dapi}_c1_ORG.tif',b_test=testbool,type=type)
            # marker copies continue counting from the last dapi round of the loop above
            i_count=0
            for key, item in marker_copy.items():
                preprocess.copy_markers(df_img, s_original=key, ls_copy = item,
                    i_last_round= dapi_item + i_count, b_test=testbool,type=type)
                i_count=i_count + len(item)
    return(df_img)
+
def segmentation_thresholds(regdir,qcdir, d_segment):
    """
    visualize binary mask of segmentation thresholds
    need to edit

    regdir: directory holding the 'converted*' folders
    qcdir: QC directory; figures are written to {qcdir}/Segmentation
    d_segment: passed unchanged to preprocess.check_seg_markers
        (presumably marker -> threshold; verify against that helper)
    returns: image table of the last 'converted*' folder processed

    side effect: changes the working directory and does not restore it
    """
    # function-scope import: the module-level import of these modules is commented out in this file
    from mplex_image import mpimage, preprocess
    preprocess.cmif_mkdir([f'{qcdir}/Segmentation'])
    os.chdir(regdir)
    for s_dir in sorted(os.listdir()):
        if s_dir.find('converted') == 0:
            os.chdir(f'{regdir}/{s_dir}')
            df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='R',s_split='_')
            df_img.rename({'data':'rounds'},axis=1,inplace=True)
            # file name tokens: {rounds}_{scene}_{marker}_{color}_...
            lls_token = [item.split('_') for item in df_img.index]
            df_img['scene'] = [ls_token[1] for ls_token in lls_token]
            df_img['color'] = [ls_token[3] for ls_token in lls_token]
            df_img['marker'] = [ls_token[2].split('.')[0] for ls_token in lls_token]
            print(s_dir)
            d_seg = preprocess.check_seg_markers(df_img,d_segment, i_rows=1, t_figsize=(6,6)) #few scenes
            for key, fig in d_seg.items():
                fig.savefig(f'{qcdir}/Segmentation/{s_dir}_{key}_segmentation.png')
    return(df_img)
+
def parse_converted(dir):
    '''
    parse codex filenames (converted)

    dir: folder with files named {rounds}_{scene}_{marker}_{color}_...tif (starting with 'R')
    returns: dataframe from mpimage.filename_dataframe with the columns
        rounds, scene, color and marker
    the working directory is restored when the function returns or fails
    '''
    # function-scope import: the module-level import of mpimage is commented out in this file
    from mplex_image import mpimage
    cwd = os.getcwd()
    try:
        os.chdir(dir)
        df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='R',s_split='_')
        df_img.rename({'data':'rounds'},axis=1,inplace=True)
        lls_token = [item.split('_') for item in df_img.index]
        df_img['scene'] = [ls_token[1] for ls_token in lls_token]
        df_img['color'] = [ls_token[3] for ls_token in lls_token]
        df_img['marker'] = [ls_token[2] for ls_token in lls_token]
    finally:
        os.chdir(cwd)
    return(df_img)
+
def segmentation_inputs(s_sample,regdir,segdir,d_segment,b_start=False,s_exacloud_dir='/home/groups/graylab_share/Chin_Lab/ChinData/Work/engje/exacloud'):
    """
    make inputs for guillaumes segmentation

    s_sample: sample name used in the metadata file name and for the java job
    regdir: directory holding the 'convert*' folders
    segdir: forwarded to preprocess.cluster_java as segmentdir
    d_segment: marker -> value written to the 'minimum' column of that marker
    b_start: run make_sh inside {s_exacloud_dir}/JE{idx} after the job was created
    s_exacloud_dir: directory holding the JE{idx} job folders
        (default is the path that used to be hard coded)

    side effect: changes the working directory and does not restore it
    """
    # function-scope import: the module-level import of these modules is commented out in this file
    from mplex_image import mpimage, preprocess
    os.chdir(regdir)
    for idx, s_dir in enumerate(sorted(os.listdir())):
        if s_dir.find('convert')== 0:
            s_path = f'{regdir}/{s_dir}'
            os.chdir(s_path)
            df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='R',s_split='_')
            df_img.rename({'data':'rounds'},axis=1,inplace=True)
            lls_token = [item.split('_') for item in df_img.index]
            df_img['color'] = [ls_token[3] for ls_token in lls_token]
            df_img['marker'] = [ls_token[2] for ls_token in lls_token]
            print(s_sample)
            # c1 images are left out of the marker table
            df_marker = df_img[df_img.color!='c1']
            df_marker = df_marker.sort_values(['rounds','color'])
            df_dapi = pd.DataFrame(index = [df_marker.marker.tolist()],columns=['rounds','colors','minimum','maximum','exposure','refexp','location'])
            df_dapi['rounds'] = df_marker.loc[:,['rounds']].values
            df_dapi['colors'] = df_marker.loc[:,['color']].values
            df_dapi['minimum'] = 1003
            df_dapi['maximum'] = 65535
            df_dapi['exposure'] = 100
            df_dapi['refexp'] = 100
            df_dapi['location'] = 'All'
            for s_key,i_item in d_segment.items():
                df_dapi.loc[s_key,'minimum'] = i_item
            df_dapi.to_csv('RoundsCyclesTable.txt',sep=' ',header=False)
            df_dapi.to_csv(f'metadata_{s_sample}_RoundsCyclesTable.csv',header=True)
            #create cluster.java file
            preprocess.cluster_java(s_dir=f'JE{idx}',s_sample=s_sample,imagedir=f'{s_path}',segmentdir=segdir,type='exacloud',b_segment=True,b_TMA=False)
            if b_start:
                os.chdir(f'{s_exacloud_dir}/JE{idx}') #exacloud
                print(f'JE{idx}')
                os.system('make_sh')
+
def prepare_dataframe(s_sample,s_dapi,dapi_thresh,d_channel,ls_exclude,segdir,b_afsub=False):
    '''
    filter data by last dapi, standard location, subtract AF, output threshold csv

    s_sample: sample name used to load the data and to name the output files
    s_dapi, dapi_thresh: DAPI column and threshold handed to process.filter_dapi
    d_channel, ls_exclude: handed to process.af_subtract (only used when b_afsub)
    segdir: working directory; all csv / txt files are read and written there
    b_afsub: if True exposure-normalize and subtract autofluorescence before
        the thresholding csv is written

    side effect: changes the working directory to segdir and does not restore it
    '''
    # function-scope import: the module-level import of process is commented out in this file
    from mplex_image import process

    os.chdir(f'{segdir}')
    #load data
    df_mi = process.load_mi(s_sample)
    df_xy = process.load_xy(s_sample)
    #drop extra centroid columns,add scene column
    df_xy = df_xy.loc[:,['DAPI_X','DAPI_Y']]
    df_xy = process.add_scene(df_xy)
    df_xy.to_csv(f'features_{s_sample}_CentroidXY.csv')
    #filter by last DAPI
    df_dapi_mi = process.filter_dapi(df_mi,df_xy,s_dapi,dapi_thresh,b_images=True)
    df_t = process.load_meta(s_sample, s_path='./',type='LocationCsv')
    #filter mean intensity by biomarker location in metadata
    df_filter_mi = process.filter_loc(df_dapi_mi,df_t)
    df_filter_mi.to_csv(f'features_{s_sample}_FilteredMeanIntensity_{s_dapi}{dapi_thresh}.csv')
    if b_afsub:
        #load metadata
        df_t = pd.read_csv(f'metadata_{s_sample}_RoundsCyclesTableExposure.csv',index_col=0,header=0)
        #normalize by exposure time, and save to csv
        # keep only the columns whose marker (text before the first '_') is listed in the metadata index
        es_marker = set(df_t.index)
        lb_columns = [item.split('_')[0] in es_marker for item in df_filter_mi.columns]
        df_filter_mi = df_filter_mi.loc[:,lb_columns]
        df_norm = process.exposure_norm(df_filter_mi,df_t)
        df_norm.to_csv(f'features_{s_sample}_ExpNormalizedMeanIntensity_{s_dapi}{dapi_thresh}.csv')
        #subtract AF channels in data
        df_sub,ls_sub,ls_record = process.af_subtract(df_norm,df_t,d_channel,ls_exclude)
        df_out = process.output_subtract(df_sub,df_t)
        df_out.to_csv(f'features_{s_sample}_AFSubtractedMeanIntensity_{s_dapi}{dapi_thresh}.csv')
        # context manager closes the record file even if writing fails
        with open(f"{s_sample}_AFsubtractionData.txt", "w") as f:
            f.writelines(ls_record)
    else:
        df_out = df_filter_mi
    #output thresholding csv
    df_out = process.add_scene(df_out) #df_out
    df_thresh = process.make_thresh_df(df_out,ls_drop=None)
    df_thresh.to_csv(f'thresh_XX_{s_sample}.csv')
+
def multipage_tiff(d_combos,s_dapi,regdir):
    '''
    make custom overlays, either original or AF subtracted, save at 8 bit for size, and thresholding

    d_combos: {name: collection of markers}, handed to process.custom_overlays
    s_dapi: DAPI identifier; the text before its first '_' selects the DAPI images by marker name
    regdir: directory holding the 'convert*' folders
    prints, per folder, the non-DAPI markers that are not part of any combination

    side effect: changes the working directory and does not restore it
    '''
    # function-scope import: the module-level import of these modules is commented out in this file
    from mplex_image import mpimage, process
    os.chdir(regdir)
    for s_dir in sorted(os.listdir()):
        if s_dir.find('convert')== 0:
            os.chdir(f'{regdir}/{s_dir}')
            df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='R',s_split='_')
            df_img.rename({'data':'rounds'},axis=1,inplace=True)
            # file name tokens: {rounds}_{scene}_{marker}_{color}_{imagetype}.tif
            lls_token = [item.split('_') for item in df_img.index]
            df_img['color'] = [ls_token[3] for ls_token in lls_token]
            df_img['marker'] = [ls_token[2] for ls_token in lls_token]
            df_img['scene'] = [ls_token[1] for ls_token in lls_token]
            df_img['imagetype'] = [ls_token[4].split('.')[0] for ls_token in lls_token]
            df_dapi = df_img[df_img.marker.str.contains(s_dapi.split('_')[0])]
            df_img_stain = df_img[(~df_img.marker.str.contains('DAPI'))]
            #check
            es_test = set()
            for key, item in d_combos.items():
                es_test = es_test.union(item)
            print(set(df_img_stain.marker) - es_test)
            process.custom_overlays(d_combos, df_img_stain, df_dapi)
+
def load_crop_labels(d_crop,tu_dim,segdir,cropdir,s_find='Nuclei Segmentation Basins'):
    """
    crop the segmentation basins (cell or nuclei) to the same coordinates as the images for viewing in Napari

    d_crop: {scene name: (x, y) upper left corner of the crop}
    tu_dim: (width, height) of the crop
    segdir: directory with the segmentation basin files
    cropdir: output directory for the cropped label images
    s_find: 'exp5_CellSegmentationBasins' or 'Nuclei Segmentation Basins'
    the working directory is restored at the end
    """
    s_start_dir = os.getcwd()
    for s_scene, xy_cropcoor in d_crop.items():
        print(s_scene)
        s_sample = s_scene.split('-Scene-')[0]
        os.chdir(f'{segdir}')

        for s_file in os.listdir():
            # keep only the basin files of the requested kind ...
            if s_find not in s_file: #Nuclei Segmentation Basins.tif #Cell Segmentation Basins.tif
                continue
            # ... that belong to this scene (scene name without the sample prefix)
            if s_scene.split(s_sample)[1] not in s_file:
                continue
            a_seg = skimage.io.imread(s_file)
            i_x = xy_cropcoor[0]
            i_y = xy_cropcoor[1]
            # rows are y, columns are x
            a_crop = a_seg[i_y:(i_y+tu_dim[1]), i_x:(i_x+tu_dim[0])]
            s_coor = f'x{i_x}y{i_y}.tif'
            #crop file
            s_file_new = f'{cropdir}/{s_sample}_{s_file.replace(" ","").replace(".tif",s_coor)}'
            print(s_file_new)
            skimage.io.imsave(s_file_new,a_crop)
    os.chdir(s_start_dir)
diff --git a/mplex_image/features.py b/mplex_image/features.py
new file mode 100755
index 0000000..7812462
--- /dev/null
+++ b/mplex_image/features.py
@@ -0,0 +1,603 @@
+####
+# title: features.py
+# language: Python3.7
+# date: 2020-06-00
+# license: GPL>=v3
+# author: Jenny
+# description:
+# python3 script for single cell feature extraction
+####
+
+#libraries
+import os
+import sys
+import numpy as np
+import pandas as pd
+import shutil
+import skimage
+import scipy
+from scipy import stats
+from scipy import ndimage as ndi
+from skimage import measure, segmentation, morphology
+from skimage import io, filters
+import re
+import json
+from biotransistor import imagine
+from PIL import Image
+from mplex_image import process
+import matplotlib.pyplot as plt
+Image.MAX_IMAGE_PIXELS = 1000000000
+
+#functions
def extract_feat(labels,intensity_image, properties=('centroid','mean_intensity','area','eccentricity')):
    '''
    measure the requested region properties of a label image against an
    intensity image (skimage regionprops_table) and return them as a dataframe
    '''
    d_props = measure.regionprops_table(
        labels,
        intensity_image=intensity_image,
        properties=properties,
    )
    return(pd.DataFrame(d_props))
+
def expand_label(labels,distance=3):
    '''
    grow the (nuclear) labels outward by a fixed number of pixels
    returns (ring_labels, grown_labels): the grown labels, and the grown labels
    minus the seed labels (seed = labels with touching boundaries set to 0)
    '''
    touching = segmentation.find_boundaries(labels,mode='outer') #thick
    seeds = labels.copy()
    seeds[touching] = 0
    is_empty = seeds == 0
    # distance of every empty pixel to the closest seed pixel, plus that pixel's coordinates
    edt, nearest = scipy.ndimage.distance_transform_edt(is_empty, return_indices=True)
    rows, cols = nearest
    fill = is_empty & (edt <= distance)
    expanded = labels.copy()
    # empty pixels within reach take the id of their closest seed
    expanded[fill] = seeds[rows[fill], cols[fill]]
    return(expanded - seeds, expanded)
+
def contract_label(labels,distance=3):
    '''
    shrink labels by a fixed number of pixels
    returns rim_labels: the original labels minus the shrunken labels
    '''
    touching = segmentation.find_boundaries(labels,mode='outer')
    core = labels.copy()
    core[touching] = 0
    is_label = core != 0
    # distance of every labelled pixel to the closest unlabelled pixel
    edt, nearest = scipy.ndimage.distance_transform_edt(is_label, return_indices=True)
    rows, cols = nearest
    peel = is_label & (edt <= distance)
    # labelled pixels close to the edge take the value of their closest unlabelled pixel
    core[peel] = core[rows[peel], cols[peel]]
    return(labels - core)
+
def straddle_label(labels,distance=3):
    '''
    grow and shrink labels by a fixed number of pixels
    returns (membrane_labels, grown_labels, shrunk_labels), where
    membrane_labels = grown_labels - shrunk_labels straddles the label edge
    '''
    touching = segmentation.find_boundaries(labels,mode='outer') #outer
    inner = labels.copy()
    outer = labels.copy()
    inner[touching] = 0
    is_label = inner != 0
    is_empty = inner == 0
    # inward distances: only the distance map is needed
    edt_in, (__, __) = scipy.ndimage.distance_transform_edt(is_label, return_indices=True)
    # outward distances: keep the coordinates of the closest labelled pixel
    edt_out, (rows, cols) = scipy.ndimage.distance_transform_edt(is_empty, return_indices=True)
    peel = is_label & (edt_in <= distance)
    fill = is_empty & (edt_out <= distance + 1)
    inner[peel] = 0
    outer[fill] = outer[rows[fill], cols[fill]]
    return(outer - inner, outer, inner)
+
def label_difference(labels,cell_labels):
    '''
    given nuclear and cell label images with matched IDs, return a copy of
    cell_labels that is zero wherever the two images hold the same value
    (cell minus nucleus); the inputs are not modified
    '''
    cytoplasm = cell_labels.copy()
    same_value = np.equal(cell_labels, labels)
    cytoplasm[same_value] = 0
    return(cytoplasm)
+
def get_mip(ls_img):
    '''
    maximum intensity projection: read every image in ls_img (list of filenames),
    stack them and return the pixel-wise maximum
    '''
    stack = np.stack([io.imread(s_img) for s_img in ls_img])
    return(stack.max(axis=0))
+
def thresh_li(img,area_threshold=100,low_thresh=1000):
    '''
    threshold an image with Li's iterative Minimum Cross Entropy method
    img: intensity image
    area_threshold: holes up to this area are filled in the thresholded mask
    low_thresh: intensity floor; pixels dimmer than this are never foreground,
        which guards against a Li threshold that is too low (e.g. negative tissue)
    returns a boolean mask
    '''
    mask = img >= filters.threshold_li(img)
    mask = morphology.remove_small_holes(mask, area_threshold=area_threshold)
    # compare the IMAGE with the floor; comparing the boolean mask with
    # low_thresh is always True and would blank the whole mask
    mask[img < low_thresh] = False
    return(mask)
+
def mask_border(mask,type='inner',pixel_distance = 50):
    '''
    find the band of pixels within pixel_distance of the edge of a boolean mask
    for inner, distance transform from mask to background (band lies inside the mask)
    for outer, distance transform from back ground to mask (band lies outside the mask)

    returns (mask_out, moved_mask, maskdist, distances)
    mask_out: the border band
    moved_mask: the mask shrunk (inner) or grown (outer) by the band
    maskdist: pixels of the measured region within pixel_distance of the edge
    distances: distance of each pixel of the measured region to the mask edge
    raises ValueError if type is neither 'inner' nor 'outer'
    '''
    if type == 'inner':
        region = mask
    elif type == 'outer':
        region = ~mask
    else:
        raise ValueError(f"type must be 'inner' or 'outer', got {type!r}")
    # distance of each region pixel to the closest pixel outside the region,
    # and the coordinates of that closest pixel
    distances, (i, j) = scipy.ndimage.distance_transform_edt(
        region, return_indices=True
    )
    maskdist = region & (distances <= pixel_distance)
    moved_mask = mask.copy()
    # band pixels take the value found across the edge: False for inner, True for outer
    moved_mask[maskdist] = moved_mask[i[maskdist], j[maskdist]]
    # the band is wherever the moved mask differs from the input
    mask_out = np.logical_xor(mask, moved_mask)
    return(mask_out,moved_mask,maskdist,distances)
+
def mask_labels(mask,labels):
    '''
    return the sorted unique label values found under the True pixels of mask
    '''
    return(np.unique(labels[mask]))
+
def parse_org(s_end = "ORG.tif",s_start='R'):
    """
    This function will parse images following koei's naming convention
    Example: Registered-R1_PCNA.CD8.PD1.CK19_Her2B-K157-Scene-002_c1_ORG.tif
    Files are read from the current working directory; only names that start
    with s_start and end with s_end are parsed.
    The output is a dataframe with image filename in index
    And rounds, color, slide, scene (/tissue), and marker in the columns
    The marker of a file with an unrecognized color channel is set to NaN
    (and 'Error' is printed) instead of silently repeating the previous file's marker.
    """
    # color channel -> position of its marker in the dot separated marker field
    # (c6/c7 address the same positions as c4/c5, as in the original lookup)
    d_position = {'c2':0,'c3':1,'c4':2,'c5':3,'c6':2,'c7':3}
    ls_file = [s_file for s_file in os.listdir()
        if s_file.endswith(s_end) and s_file.startswith(s_start)]
    df_img = pd.DataFrame(index=ls_file)
    df_img['rounds'] = [item.split('_')[0].split('Registered-')[1] for item in df_img.index]
    df_img['color'] = [item.split('_')[-2] for item in df_img.index]
    df_img['slide'] = [item.split('_')[2] for item in df_img.index]
    df_img['scene'] = [item.split('-Scene-')[1] for item in df_img.slide]
    #parse file name for biomarker
    ls_marker = []
    for s_index, s_color in zip(df_img.index, df_img.color):
        if s_color == 'c1':
            s_marker = 'DAPI'
        elif s_color in d_position:
            s_marker = s_index.split('_')[1].split('.')[d_position[s_color]]
        else:
            print('Error')
            s_marker = np.nan
        ls_marker.append(s_marker)
    df_img['marker'] = ls_marker
    return(df_img)
+
+def extract_cellpose_features(s_sample, segdir, subdir, ls_seg_markers, nuc_diam, cell_diam,b_big=False): #,b_thresh=False
+ '''
+ load the segmentation results, the input images, and the channels images
+ extract mean intensity from each image for every subcellular region, plus centroid,
+ area and eccentricity for the nuclei, and euler number, area and eccentricity for the cell
+
+ s_sample: slide name; segmentation is read from {segdir}/{s_sample}Cellpose_Segmentation
+ segdir: segmentation root directory
+ subdir: directory of the channel images, organized by slide, by scene, or flat
+ ls_seg_markers, nuc_diam, cell_diam: used to build the segmentation file names to look for
+ b_big: if True, also write one features_{s_sample}-{s_scene}.csv per scene
+ returns (df_sample, df_thresh): per cell features of all scenes, and per image thresholds
+ (threshold_li, threshold_otsu, threshold_triangle)
+
+ side effect: changes the working directory and does not restore it
+ NOTE(review): DataFrame.append (last line of the scene loop) was removed in pandas 2.0
+ '''
+
+ df_sample = pd.DataFrame()
+ df_thresh = pd.DataFrame()
+
+ os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation')
+ ls_scene = []
+ d_match = {}
+ # find the matched cell segmentation files; the scene name is the part of the
+ # file name before the first underscore
+ for s_file in os.listdir():
+ if s_file.find(f'{".".join(ls_seg_markers)} matchedcell{cell_diam} - Cell Segmentation Basins')>-1:
+ ls_scene.append(s_file.split('_')[0])
+ d_match.update({s_file.split('_')[0]:s_file})
+ elif s_file.find(f'{".".join(ls_seg_markers)} nuc{nuc_diam} matchedcell{cell_diam} - Cell Segmentation Basins')>-1:
+ ls_scene.append(s_file.split('_')[0])
+ d_match.update({s_file.split('_')[0]:s_file})
+ for s_scene in ls_scene:
+ os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation')
+ print(f'processing {s_scene}')
+ # NOTE(review): s_dapi is only assigned when a matching DAPI.png exists; otherwise the
+ # value of a previous scene (or an unbound name) is used below
+ for s_file in os.listdir():
+ if s_file.find(s_scene) > -1:
+ if s_file.find("DAPI.png") > -1:
+ s_dapi = s_file
+ dapi = io.imread(f'{segdir}/{s_sample}Cellpose_Segmentation/{s_dapi}')
+ print(f'loading {s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif')
+ labels = io.imread(f'{s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif')
+ cell_labels = io.imread(f'{segdir}/{s_sample}Cellpose_Segmentation/{d_match[s_scene]}')
+ print(f'loading {d_match[s_scene]}')
+ #nuclear features
+ # only the label ids are extracted here; they define the rows (one per nucleus)
+ df_feat = extract_feat(labels,dapi, properties=(['label']))
+ df_feat.columns = [f'{item}_segmented-nuclei' for item in df_feat.columns]
+ # row index format: {s_sample}_scene{scene number}_cell{label id}
+ df_feat.index = [f'{s_sample}_scene{s_scene.split("-Scene-")[1].split("_")[0]}_cell{item}' for item in df_feat.loc[:,'label_segmented-nuclei']]
+
+ #get subcellular regions
+ cyto = label_difference(labels,cell_labels)
+ d_loc_nuc = subcellular_regions(labels, distance_short=2, distance_long=5)
+ d_loc_cell = subcellular_regions(cell_labels, distance_short=2, distance_long=5)
+ # tuple index [0] is the short (2 pixel) region, [1] the long (5 pixel) region
+ d_loc = {'nuclei':labels,'cell':cell_labels,'cytoplasm':cyto,
+ 'nucmem':d_loc_nuc['membrane'][0],'cellmem':d_loc_cell['membrane'][0],
+ 'perinuc5':d_loc_nuc['ring'][1],'exp5':d_loc_nuc['grown'][1],
+ 'nucadj2':d_loc_nuc['straddle'][0],'celladj2':d_loc_cell['straddle'][0]}
+
+ #subdir organized by slide or scene
+ if os.path.exists(f'{subdir}/{s_sample}'):
+ os.chdir(f'{subdir}/{s_sample}')
+ elif os.path.exists(f'{subdir}/{s_scene}'):
+ os.chdir(f'{subdir}/{s_scene}')
+ else:
+ os.chdir(f'{subdir}')
+ df_img = parse_org()
+ df_img['round_int'] = [int(re.sub('[^0-9]','', item)) for item in df_img.rounds]
+ # rounds numbered 90 and above are skipped (presumably non-staining rounds - TODO confirm)
+ df_img = df_img[df_img.round_int < 90]
+ df_img = df_img.sort_values('round_int')
+ df_scene = df_img[df_img.scene==s_scene.split("-Scene-")[1].split("_")[0]]
+
+ #load each image
+ for s_index in df_scene.index:
+ intensity_image = io.imread(s_index)
+ df_thresh.loc[s_index,'threshold_li'] = filters.threshold_li(intensity_image)
+ if intensity_image.mean() > 0:
+ df_thresh.loc[s_index,'threshold_otsu'] = filters.threshold_otsu(intensity_image)
+ df_thresh.loc[s_index,'threshold_triangle'] = filters.threshold_triangle(intensity_image)
+ #if b_thresh:
+ # break
+ s_marker = df_scene.loc[s_index,'marker']
+ print(f'extracting features {s_marker}')
+ # DAPI is imaged every round: append the round number to keep the column names unique
+ if s_marker == 'DAPI':
+ s_marker = s_marker + f'{df_scene.loc[s_index,"rounds"].split("R")[1]}'
+ for s_loc, a_loc in d_loc.items():
+ if s_loc == 'nuclei':
+ df_marker_loc = extract_feat(a_loc,intensity_image, properties=(['mean_intensity','centroid','area','eccentricity','label']))
+ df_marker_loc.columns = [f'{s_marker}_{s_loc}',f'{s_marker}_{s_loc}_centroid-0',f'{s_marker}_{s_loc}_centroid-1',f'{s_marker}_{s_loc}_area',f'{s_marker}_{s_loc}_eccentricity',f'{s_marker}_{s_loc}_label']
+ elif s_loc == 'cell':
+ df_marker_loc = extract_feat(a_loc,intensity_image, properties=(['mean_intensity','euler_number','area','eccentricity','label']))
+ df_marker_loc.columns = [f'{s_marker}_{s_loc}',f'{s_marker}_{s_loc}_euler',f'{s_marker}_{s_loc}_area',f'{s_marker}_{s_loc}_eccentricity',f'{s_marker}_{s_loc}_label']
+ else:
+ df_marker_loc = extract_feat(a_loc,intensity_image, properties=(['mean_intensity','label']))
+ df_marker_loc.columns = [f'{s_marker}_{s_loc}',f'{s_marker}_{s_loc}_label']
+ #drop zero from array, set array ids as index
+ #old df_marker_loc.index = sorted(np.unique(a_loc)[1::])
+ df_marker_loc.index = df_marker_loc.loc[:,f'{s_marker}_{s_loc}_label']
+ df_marker_loc.index = [f'{s_sample}_scene{s_scene.split("-Scene-")[1].split("_")[0]}_cell{item}' for item in df_marker_loc.index]
+ # left merge: nuclei without a region of this kind get NaN
+ df_feat = df_feat.merge(df_marker_loc, left_index=True,right_index=True,how='left',suffixes=('',f'{s_marker}_{s_loc}'))
+ if b_big:
+ df_feat.to_csv(f'{segdir}/{s_sample}Cellpose_Segmentation/features_{s_sample}-{s_scene}.csv')
+ df_sample = df_sample.append(df_feat)
+ return(df_sample, df_thresh)
+
+def extract_bright_features(s_sample, segdir, subdir, ls_seg_markers, nuc_diam, cell_diam,ls_membrane):
+ '''
+ load the features, segmentation results, the input images, and the channels images
+ extract the mean intensity of the brightest 25% of pixels (pixels at or above the
+ 0.75 quantile) in each label region
+
+ s_sample: slide name; segmentation is read from {segdir}/{s_sample}Cellpose_Segmentation
+ segdir: segmentation root directory
+ subdir: directory of the channel images, organized by slide, by scene, or flat
+ ls_seg_markers, nuc_diam, cell_diam: used to build the segmentation file names to look for
+ ls_membrane: markers to process; images of all other markers are skipped
+ returns df_sample: one row per nucleus, one column per marker and region
+
+ side effect: changes the working directory and does not restore it
+ NOTE(review): DataFrame.append (last line of the scene loop) was removed in pandas 2.0
+ '''
+ df_sample = pd.DataFrame()
+ os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation')
+ ls_scene = []
+ d_match = {}
+ # find the matched cell segmentation files; the scene name is the part of the
+ # file name before the first underscore
+ for s_file in os.listdir():
+ if s_file.find(f'{".".join(ls_seg_markers)} matchedcell{cell_diam} - Cell Segmentation Basins')>-1:
+ ls_scene.append(s_file.split('_')[0])
+ d_match.update({s_file.split('_')[0]:s_file})
+ elif s_file.find(f'{".".join(ls_seg_markers)} nuc{nuc_diam} matchedcell{cell_diam} - Cell Segmentation Basins')>-1:
+ ls_scene.append(s_file.split('_')[0])
+ d_match.update({s_file.split('_')[0]:s_file})
+ for s_scene in ls_scene:
+ os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation')
+ print(f'processing {s_scene}')
+ # NOTE(review): s_dapi is only assigned when a matching DAPI.png exists
+ for s_file in os.listdir():
+ if s_file.find(s_scene) > -1:
+ if s_file.find("DAPI.png") > -1:
+ s_dapi = s_file
+ dapi = io.imread(f'{segdir}/{s_sample}Cellpose_Segmentation/{s_dapi}')
+ print(f'loading {s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif')
+ labels = io.imread(f'{s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif')
+ print(labels.shape)
+ cell_labels = io.imread(f'{segdir}/{s_sample}Cellpose_Segmentation/{d_match[s_scene]}')
+ print(cell_labels.shape)
+ print(f'loading {d_match[s_scene]}')
+ #nuclear features
+ # only the label ids are extracted here; they define the rows (one per nucleus)
+ df_feat = extract_feat(labels,dapi, properties=(['label']))
+ df_feat.columns = [f'{item}_segmented-nuclei' for item in df_feat.columns]
+ # row index format: {s_sample}_scene{scene number}_cell{label id}
+ df_feat.index = [f'{s_sample}_scene{s_scene.split("-Scene-")[1].split("_")[0]}_cell{item}' for item in df_feat.loc[:,'label_segmented-nuclei']]
+
+ #get subcellular regions
+ d_loc_nuc = subcellular_regions(labels, distance_short=2, distance_long=5)
+ d_loc_cell = subcellular_regions(cell_labels, distance_short=2, distance_long=5)
+ # tuple index [0] is the short (2 pixel) region, [1] the long (5 pixel) region
+ d_loc = {'nucmem25':d_loc_nuc['membrane'][0],'exp5nucmembrane25':d_loc_nuc['grown'][1],
+ 'cellmem25':d_loc_cell['membrane'][0],'nuclei25':labels}
+
+ #subdir organized by slide or scene
+ if os.path.exists(f'{subdir}/{s_sample}'):
+ os.chdir(f'{subdir}/{s_sample}')
+ elif os.path.exists(f'{subdir}/{s_scene}'):
+ os.chdir(f'{subdir}/{s_scene}')
+ else:
+ os.chdir(f'{subdir}')
+ df_img = parse_org()
+ df_img['round_int'] = [int(re.sub('[^0-9]','', item)) for item in df_img.rounds]
+ # rounds numbered 90 and above are skipped (presumably non-staining rounds - TODO confirm)
+ df_img = df_img[df_img.round_int < 90]
+ df_img = df_img.sort_values('round_int')
+ df_scene = df_img[df_img.scene==s_scene.split("-Scene-")[1].split("_")[0]]
+ df_marker = df_scene[df_scene.marker.isin(ls_membrane)]
+ #load each image
+ for s_index in df_marker.index:
+ print(f'loading {s_index}')
+ intensity_image = io.imread(s_index)
+ #print(intensity_image.shape)
+ s_marker = df_marker.loc[s_index,'marker']
+ print(f'extracting features {s_marker}')
+ # DAPI is imaged every round: append the round number to keep the column names unique
+ if s_marker == 'DAPI':
+ s_marker = s_marker + f'{df_marker.loc[s_index,"rounds"].split("R")[1]}'
+ for s_loc, a_loc in d_loc.items():
+ #print(a_loc.shape)
+ df_marker_loc = pd.DataFrame(columns = [f'{s_marker}_{s_loc}'])
+ # 'intensity_image' and 'image' are the per region intensity crop and its boolean footprint
+ df_prop = extract_feat(a_loc,intensity_image, properties=(['intensity_image','image','label']))
+ for idx in df_prop.index:
+ label_id = df_prop.loc[idx,'label']
+ intensity_image_small = df_prop.loc[idx,'intensity_image']
+ image = df_prop.loc[idx,'image']
+ # keep only the pixels that belong to this label
+ pixels = intensity_image_small[image]
+ # brightest quarter: pixels at or above the 75th percentile
+ pixels25 = pixels[pixels >= np.quantile(pixels,.75)]
+ df_marker_loc.loc[label_id,f'{s_marker}_{s_loc}'] = pixels25.mean()
+ df_marker_loc.index = [f'{s_sample}_scene{s_scene.split("-Scene-")[1].split("_")[0]}_cell{item}' for item in df_marker_loc.index]
+ # left merge: nuclei without a region of this kind get NaN
+ df_feat = df_feat.merge(df_marker_loc, left_index=True,right_index=True,how='left',suffixes=('',f'{s_marker}_{s_loc}'))
+ df_sample = df_sample.append(df_feat)
+ #break
+ return(df_sample)
+
def subcellular_regions(labels, distance_short=2, distance_long=5):
    '''
    calculate subcellular segmentation regions from a segmentation mask
    returns a dictionary with the keys membrane, ring, straddle, grown and shrunk;
    each value is a tuple (region at distance_short, region at distance_long)
    '''
    d_region = {'membrane':[],'ring':[],'straddle':[],'grown':[],'shrunk':[]}
    for i_distance in (distance_short, distance_long):
        a_ring, a_grown = expand_label(labels,distance=i_distance)
        a_straddle, __, a_shrunk = straddle_label(labels,distance=i_distance)
        d_region['membrane'].append(contract_label(labels,distance=i_distance))
        d_region['ring'].append(a_ring)
        d_region['straddle'].append(a_straddle)
        d_region['grown'].append(a_grown)
        d_region['shrunk'].append(a_shrunk)
    d_loc_sl = {s_region: tuple(la_region) for s_region, la_region in d_region.items()}
    return(d_loc_sl)
+
+def combine_labels(s_sample,segdir, subdir, ls_seg_markers, nuc_diam, cell_diam, df_mi_full,s_thresh):
+ '''
+ - load cell labels; delete cells that were not used for cytoplasm (i.e. ecad neg)
+ - nuc labels, expand to perinuc 5 and then cut out the cell labels
+ - keep track of cells that are completely covered by another cell (or two or three: counts as touching).
+
+ s_sample: slide name; rows of df_mi_full with slide == s_sample are used
+ segdir: segmentation directory ({segdir}/{s_sample}Cellpose_Segmentation is used when it exists)
+ ls_seg_markers, nuc_diam, cell_diam: used to build the segmentation file names
+ df_mi_full: dataframe with a slide column and a boolean {s_thresh}_negative column
+ s_thresh: name of the thresholded marker; cells that are True in {s_thresh}_negative
+ have their cell label removed
+ subdir is not used in this function
+
+ writes one combined label tiff per scene and result_{s_sample}_cellsatop_dictionary.json
+ into the working directory, which is changed and not restored
+ returns (labels, combine, dd_result); labels and combine are those of the last scene
+ NOTE(review): when a nuclei or cell label file is missing only a message is printed;
+ the following lines then use the previous scene's arrays (or fail on an unbound name)
+ '''
+ se_neg = df_mi_full[df_mi_full.slide == s_sample].loc[:,f'{s_thresh}_negative']
+ print(len(se_neg))
+ dd_result = {}
+ if os.path.exists(f'{segdir}/{s_sample}Cellpose_Segmentation'):
+ os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation')
+ else:
+ os.chdir(segdir)
+ print(segdir)
+ # scenes that have a DAPI png in the segmentation folder
+ ls_scene = []
+ for s_file in os.listdir():
+ if s_file.find(' - DAPI.png') > -1:
+ ls_scene.append(s_file.split(' - DAPI.png')[0])
+ # keep the scenes that are present in both the dataframe index and the folder
+ ls_scene_all = sorted(set([item.split('_cell')[0].replace('_scene','-Scene-') for item in se_neg.index]) & set(ls_scene))
+ # fallback: retry with the part of the index after the first underscore
+ if len(ls_scene_all) == 0:
+ ls_scene_all = sorted(set([item.split('_cell')[0].replace('_scene','-Scene-').split('_')[1] for item in se_neg.index]) & set(ls_scene))
+ print(ls_scene_all)
+ for s_scene in ls_scene_all:
+ se_neg_scene = se_neg[se_neg.index.str.contains(s_scene.replace("Scene ","scene")) | se_neg.index.str.contains(s_scene.replace("-Scene-","_scene"))]
+ print(f'Processing combined segmentation labels for {s_scene}')
+ if os.path.exists(f'{s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif'):
+ labels = io.imread(f'{s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif')
+ else:
+ print('no nuclei labels found')
+ # the cell label file may follow one of three naming patterns
+ if os.path.exists(f'{s_scene} matchedcell{cell_diam} - Cell Segmentation Basins.tif'):
+ cell_labels = io.imread(f'{s_scene} matchedcell{cell_diam} - Cell Segmentation Basins.tif')
+ elif os.path.exists(f'{s_scene}_{".".join(ls_seg_markers)} matchedcell{cell_diam} - Cell Segmentation Basins.tif'):
+ cell_labels = io.imread(f'{s_scene}_{".".join(ls_seg_markers)} matchedcell{cell_diam} - Cell Segmentation Basins.tif')
+ elif os.path.exists(f'{s_scene}_{".".join(ls_seg_markers)} nuc{nuc_diam} matchedcell{cell_diam} - Cell Segmentation Basins.tif'):
+ cell_labels = io.imread(f'{s_scene}_{".".join(ls_seg_markers)} nuc{nuc_diam} matchedcell{cell_diam} - Cell Segmentation Basins.tif')
+ else:
+ print('no cell labels found')
+ #set non-ecad cell labels to zero
+ # ids of the cells flagged True in the {s_thresh}_negative series
+ a_zeros = np.array([int(item.split('_cell')[1]) for item in se_neg_scene[se_neg_scene].index]).astype('int64')
+ mask = np.isin(cell_labels, a_zeros)
+ cell_labels_copy = cell_labels.copy()
+ cell_labels_copy[mask] = 0
+ #make the nuclei under cells zero
+ labels_copy = labels.copy()
+ distance = 5
+ perinuc5, labels_exp = expand_label(labels,distance=distance)
+ labels_exp[cell_labels_copy > 0] = 0
+ #combine cells and expanded nuclei
+ # the two images no longer overlap, so the sum keeps each label id
+ combine = (labels_exp + cell_labels_copy)
+ if s_scene.find('Scene') == 0:
+ io.imsave(f'{s_sample}_{s_scene.replace("Scene ","scene")}_cell{cell_diam}_nuc{nuc_diam}_CombinedSegmentationBasins.tif',combine)
+ else:
+ io.imsave(f'{s_scene}_{".".join(ls_seg_markers)}-cell{cell_diam}_exp{distance}_CellSegmentationBasins.tif',combine)
+ #figure out the covered cells...labels + combined
+ # pixels that are labelled in both the nuclei image and the combined image
+ not_zero_pixels = np.array([labels.ravel() !=0,combine.ravel() !=0]).all(axis=0)
+ a_tups = np.array([combine.ravel()[not_zero_pixels],labels.ravel()[not_zero_pixels]]).T #combined over nuclei
+ unique_rows = np.unique(a_tups, axis=0)
+ # combined label id -> nuclei ids found underneath it that carry a different id
+ new_dict = {}
+ for key, value in unique_rows:
+ if key == value:
+ continue
+ else:
+ if key in new_dict:
+ new_dict[key].append(value)
+ else:
+ new_dict[key] = [value]
+ #from elmar (reformat cells touching dictionary and save
+ # json needs string keys and values
+ d_result = {}
+ for i_cell, li_touch in new_dict.items():
+ d_result.update({str(i_cell): [str(i_touch) for i_touch in li_touch]})
+ dd_result.update({f'{s_sample}_{s_scene.replace("Scene ","scene")}':d_result})
+ #save dd_touch as json file
+ with open(f'result_{s_sample}_cellsatop_dictionary.json','w') as f:
+ json.dump(dd_result, f)
+ print('')
+ return(labels,combine,dd_result)
+
def check_basins(cell_labels, cell_diam):
    '''
    return the sorted ids of the cells whose x_relative or y_relative value, as
    reported by imagine.membrane_px, is larger than 10 times cell_diam
    '''
    df_px = imagine.membrane_px(cell_labels, {'a':cell_labels})
    i_limit = 10*cell_diam
    b_too_big = (df_px.x_relative > i_limit) | (df_px.y_relative > i_limit)
    return(sorted(set(df_px[b_too_big].cell)))
+
def check_combined(segdir,s_sample,cell_diam,ls_seg_markers):
    '''
    run check_basins on the combined (exp5) cell segmentation basins of every scene
    that has a ' - DAPI.png' file in the segmentation folder

    segdir: segmentation directory ({segdir}/{s_sample}Cellpose_Segmentation is used when it exists)
    s_sample: slide name
    cell_diam, ls_seg_markers: used to build the combined basins file name
    returns a dataframe indexed by {slide}_scene{n}_cell{id} with a bad_match column
    that is True for every flagged cell; empty dataframe if nothing was flagged

    side effect: changes the working directory and does not restore it
    '''
    if os.path.exists(f'{segdir}/{s_sample}Cellpose_Segmentation'):
        os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation')
    else:
        os.chdir(segdir)
    ls_scene = [s_file.split(' - DAPI.png')[0] for s_file in os.listdir()
        if s_file.find(' - DAPI.png') > -1]
    # collect the per scene frames and concatenate once
    # (DataFrame.append no longer exists in pandas >= 2.0)
    ls_df_bad = []
    for s_scene in sorted(ls_scene):
        print(s_scene)
        s_basins = f'{s_scene}_{".".join(ls_seg_markers)}-cell{cell_diam}_exp5_CellSegmentationBasins.tif'
        if os.path.exists(s_basins):
            cell_labels = io.imread(s_basins)
            print(f'Loaded {s_basins}')
            ls_bad = check_basins(cell_labels, cell_diam)
            ls_bad_cells = [f"{s_scene.replace('-Scene-','_scene')}_cell{item}" for item in ls_bad]
            df_bad = pd.DataFrame(index=ls_bad_cells,columns=['bad_match'],data=[True]*len(ls_bad_cells))
            ls_df_bad.append(df_bad)
        else:
            print('no combined cell labels found')
    if len(ls_df_bad) == 0:
        return(pd.DataFrame())
    return(pd.concat(ls_df_bad))
+
def edge_mask(s_sample,segdir,subdir,i_pixel=154, dapi_thresh=350,i_fill=50000,i_close=20):
    '''
    find the edge of the tissue, scene by scene
    first, find tissue by thresholding DAPI R1 (pixels above dapi_thresh); drop objects
    smaller than 100 pixels, close with an octagon (i_close, i_close//2) and fill
    holes smaller than i_fill
    then, mask all tissue pixels within i_pixel distance of the tissue border
    saves the binary mask as {segdir}/TissueEdgeMask{i_pixel}_{s_sample}_scene{s_scene}.png
    (255 = edge); nothing is returned
    the working directory is changed to segdir
    '''
    os.chdir(segdir)
    df_img = process.load_li([s_sample],s_thresh='', man_thresh=100)
    for s_scene in sorted(set(df_img.scene)):
        print(f'Calculating tissue edge mask for Scene {s_scene}')
        b_dapi_r1 = (df_img.scene == s_scene) & (df_img.rounds == 'R1') & (df_img.color =='c1')
        s_index = df_img[b_dapi_r1].index[0]
        # images are filed per slide or per scene
        img_dapi = None
        for s_path in (f'{subdir}/{s_sample}/{s_index}',f'{subdir}/{s_sample}-Scene-{s_scene}/{s_index}'):
            if os.path.exists(s_path):
                img_dapi = io.imread(s_path)
                break
        if img_dapi is None:
            print('no DAPI found')
            img_dapi = np.zeros([2,2])
        tissue = img_dapi > dapi_thresh
        tissue = morphology.remove_small_objects(tissue, min_size=100)
        tissue = morphology.binary_closing(tissue, morphology.octagon(i_close,i_close//2))
        tissue = morphology.remove_small_holes(tissue, i_fill)
        border_mask = mask_border(tissue,type='inner',pixel_distance = i_pixel)[0]
        img = np.zeros(border_mask.shape,dtype='uint8')
        img[border_mask] = 255
        io.imsave(f"{segdir}/TissueEdgeMask{i_pixel}_{s_sample}_scene{s_scene}.png", img)
+
def edge_hull(s_sample,segdir,subdir,i_pixel=154, dapi_thresh=350,i_fill=50000,i_close=40,i_small=30000):
    '''
    find the edge of the tissue, scene by scene
    first, find tissue by thresholding DAPI R1 (pixels above dapi_thresh); drop objects
    smaller than 100 pixels, close with an octagon (i_close, i_close//2) and fill
    holes smaller than i_fill
    then, mask all tissue pixels within i_pixel distance of the tissue border
    saves the binary mask as {segdir}/TissueEdgeMask{i_pixel}_{s_sample}_scene{s_scene}.png
    (255 = edge); nothing is returned
    the working directory is changed to segdir
    NOTE(review): the convex hull of the thresholded mask (objects >= i_small) is
    calculated but not used for the saved mask, and the output file name is the same
    as the one written by edge_mask - confirm the intent
    '''
    os.chdir(segdir)
    df_img = process.load_li([s_sample],s_thresh='', man_thresh=100)
    for s_scene in sorted(set(df_img.scene)):
        print(f'Calculating tissue edge mask for Scene {s_scene}')
        b_dapi_r1 = (df_img.scene == s_scene) & (df_img.rounds == 'R1') & (df_img.color =='c1')
        s_index = df_img[b_dapi_r1].index[0]
        # images are filed per slide or per scene
        img_dapi = None
        for s_path in (f'{subdir}/{s_sample}/{s_index}',f'{subdir}/{s_sample}-Scene-{s_scene}/{s_index}'):
            if os.path.exists(s_path):
                img_dapi = io.imread(s_path)
                break
        if img_dapi is None:
            print('no DAPI found')
            img_dapi = np.zeros([2,2])
        raw_mask = img_dapi > dapi_thresh
        tissue = morphology.remove_small_objects(raw_mask, min_size=100)
        tissue = morphology.binary_closing(tissue, morphology.octagon(i_close,i_close//2))
        tissue = morphology.remove_small_holes(tissue, i_fill)
        # hull of the large objects of the raw thresholded mask (currently unused)
        mask_smaller = morphology.remove_small_objects(raw_mask, min_size=i_small)
        mask_hull = morphology.convex_hull_image(mask_smaller)
        border_mask = mask_border(tissue,type='inner',pixel_distance = i_pixel)[0]
        img = np.zeros(border_mask.shape,dtype='uint8')
        img[border_mask] = 255
        io.imsave(f"{segdir}/TissueEdgeMask{i_pixel}_{s_sample}_scene{s_scene}.png", img)
+
def edge_cells(s_sample,segdir,nuc_diam,i_pixel=154):
    '''
    load the binary tissue edge mask, the nuclei labels, and the xy coordinate dataframe
    ({segdir}/features_{s_sample}_CentroidXY.csv) of a slide
    return a dataframe of the cells whose nuclei label falls within the edge mask
    also saves one {segdir}/TissueEdgeMask{i_pixel}_{s_sample}-Scene-{s_scene}_cells.png
    overview figure per scene

    s_sample: slide name
    segdir: segmentation directory
    nuc_diam: nuclear diameter used in the nuclei segmentation file name
    i_pixel: edge width used in the mask file name
    '''
    #load xy
    df_xy = pd.read_csv(f'{segdir}/features_{s_sample}_CentroidXY.csv',index_col=0)
    df_xy['cells'] = [int(item.split('cell')[1]) for item in df_xy.index]
    ls_scene = sorted(set([item.split('_')[1].split('scene')[1] for item in df_xy.index]))
    # collect the per scene frames and concatenate once
    # (DataFrame.append no longer exists in pandas >= 2.0)
    ls_df_edge = []
    #load masks
    for s_scene in ls_scene:
        print(f'Calculating edge cells for Scene {s_scene}')
        mask = io.imread(f"{segdir}/TissueEdgeMask{i_pixel}_{s_sample}_scene{s_scene}.png")
        mask_gray = mask == 255
        labels = io.imread(f'{segdir}/{s_sample}Cellpose_Segmentation/{s_sample}-Scene-{s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif')
        edge = mask_labels(mask_gray,labels)
        df_scene = df_xy[df_xy.index.str.contains(f'{s_sample}_scene{s_scene}')]
        # label ids under the mask that are also cells of this scene
        es_cells = set(edge.astype('int')).intersection(set(df_scene.cells))
        df_edge = df_scene[df_scene.cells.isin(es_cells)]
        fig,ax=plt.subplots()
        ax.imshow(mask_gray)
        ax.scatter(df_edge.DAPI_X,df_edge.DAPI_Y,s=1)
        fig.savefig(f'{segdir}/TissueEdgeMask{i_pixel}_{s_sample}-Scene-{s_scene}_cells.png')
        # release the figure; one is created per scene
        plt.close(fig)
        ls_df_edge.append(df_edge)
    if len(ls_df_edge) == 0:
        return(pd.DataFrame())
    return(pd.concat(ls_df_edge))
+
def cell_distances(df_xy,s_scene,distances):
    '''
    look up the value of a distance image at the centroid of every cell of one scene
    df_xy: dataframe with DAPI_X and DAPI_Y columns; both columns are cast to int64
        in place (the caller's dataframe is modified)
    s_scene: scene name; '-Scene-' is replaced by '_scene' to select rows by index
    distances: 2D array indexed as [DAPI_Y, DAPI_X]
    returns a copy of the scene's rows with an added pixel_dist column
    '''
    for s_axis in ('DAPI_Y','DAPI_X'):
        df_xy[s_axis] = df_xy[s_axis].astype('int64')
    print(f'Calculating distances for Scene {s_scene}')
    s_pattern = f"{s_scene.replace('-Scene-','_scene')}"
    b_scene = df_xy.index.str.contains(s_pattern)
    df_scene = df_xy[b_scene].copy()
    df_scene['pixel_dist'] = distances[df_scene.DAPI_Y,df_scene.DAPI_X]
    return(df_scene)
+
def cell_coords():
    '''
    TBD: find cell coordinates within a mask
    not implemented yet: raises NotImplementedError
    (the previous stub failed with a NameError on undefined names)
    '''
    # sketch of the intended approach, per scene
    #old (use if you have coordinates, not labels)
    #mask_gray = mask#[:,:,0]
    #contour = skimage.measure.find_contours(mask_gray,0)
    #coords = skimage.measure.approximate_polygon(contour[0], tolerance=5)
    #fig,ax=plt.subplots()
    #ax.imshow(mask_gray)
    #ax.plot(coords[:, 1], coords[:, 0], '-r', linewidth=2)
    #fig.savefig(f'TissueEdgeMask_{s_sample}_Scene-{s_scene}_polygon.png')
    #x = np.array(df_scene.DAPI_X.astype('int').values)
    #y = np.array(df_scene.DAPI_Y.astype('int').values)
    #points = np.array((y,x)).T
    #mask = skimage.measure.points_in_poly(points, coords)
    raise NotImplementedError('cell_coords is not implemented yet')
\ No newline at end of file
diff --git a/mplex_image/gating.py b/mplex_image/gating.py
new file mode 100755
index 0000000..a3665fc
--- /dev/null
+++ b/mplex_image/gating.py
@@ -0,0 +1,205 @@
+#####
+# gating.py
+# author: engje, grael
+# date: 2020-04-07
+# license: GPLv3
+#####
+
+# library
+import os
+import pandas as pd
+import shutil
+from mplex_image import analyze
+import numpy as np
+
+
def main_celltypes(df_data,ls_endothelial,ls_immune,ls_tumor,ls_cellline_index):
    '''
    gate the main cell types from boolean marker columns, in order of priority:
    endothelial, then immune, then tumor; everything else is stromal
    df_data: dataframe of boolean marker columns, indexed by {slide_scene}_cell{id}
    ls_endothelial, ls_immune, ls_tumor: marker columns defining each type
    ls_cellline_index: slide_scene names of cell lines; all of their cells are set to tumor
    adds the columns endothelial, immune, tumor, stromal, slide_scene and (through
    analyze.add_celltype) celltype to df_data in place and returns df_data
    '''
    #celltype
    #1 endothelial
    df_data['endothelial'] = df_data.loc[:,ls_endothelial].any(axis=1)
    #2 immune
    ls_exclude = ls_endothelial
    df_data['immune'] = df_data.loc[:,ls_immune].any(axis=1) & ~df_data.loc[:,ls_exclude].any(axis=1)
    #3 tumor
    ls_exclude = ls_endothelial + ls_immune
    df_data['tumor'] = df_data.loc[:,ls_tumor].any(axis=1) & ~df_data.loc[:,ls_exclude].any(axis=1)
    #4 stromal
    ls_exclude = ls_immune + ls_endothelial + ls_tumor
    df_data['stromal'] = ~df_data.loc[:,ls_exclude].any(axis=1)
    #add celltype
    ls_cell_names = ['stromal','endothelial','tumor','immune']
    s_type_name = 'celltype'
    analyze.add_celltype(df_data, ls_cell_names, s_type_name)
    #fix cell lines (all tumor!)
    df_data['slide_scene'] = [item.split('_cell')[0] for item in df_data.index]
    df_data.loc[df_data[df_data.slide_scene.isin(ls_cellline_index)].index,'celltype'] = 'tumor'
    # re-derive every boolean flag from the corrected celltype; tumor is included
    # so that cell line cells do not end up with all four flags False
    for s_celltype in ['immune','stromal','endothelial','tumor']:
        df_data[s_celltype] = df_data.loc[:,'celltype'] == s_celltype
    return(df_data)
+
def proliferation(df_data,ls_prolif):
    '''
    gate proliferating cells: prolif is True when any column of ls_prolif is True,
    nonprolif is its complement
    adds prolif and nonprolif to df_data in place, registers them as the cell type
    category proliferation through analyze.add_celltype, and returns df_data
    '''
    b_prolif = df_data.loc[:,ls_prolif].any(axis=1)
    df_data['prolif'] = b_prolif
    df_data['nonprolif'] = ~b_prolif
    #add proliferation
    analyze.add_celltype(df_data, ['prolif','nonprolif'], 'proliferation')
    return(df_data)
+
+def immune_types(df_data,s_myeloid,s_bcell,s_tcell):
+ # Gate the immune cells (rows with immune == True) into exclusive immune types.
+ # s_myeloid, s_bcell, s_tcell: names of the boolean marker columns for each lineage.
+ # Priority: myeloid (CD68Mac) first, then T cell over B cell when both markers are positive.
+ # Adds the type columns and ImmuneType to df_data in place and returns df_data.
+ # Requires the columns immune, celltype and stromal to exist already.
+ # NOTE(review): the T cell subtype columns are only created when both CD8_Ring and
+ # CD4_Ring exist, but ls_immuntype always lists them - confirm behavior without them.
+ ## T cell, B cell or myeloid
+ df_data['CD68Mac'] = df_data.loc[:,[s_myeloid,'immune']].all(axis=1)
+ df_data['CD20Bcell'] = df_data.loc[:,[s_bcell,'immune']].all(axis=1) & ~df_data.loc[:,['CD68Mac',s_tcell]].any(axis=1)
+ df_data['TcellImmune'] = df_data.loc[:,[s_tcell,'immune']].all(axis=1) & ~df_data.loc[:,['CD20Bcell','CD68Mac']].any(axis=1)
+ df_data['UnspecifiedImmune'] = df_data.loc[:,'immune'] & ~df_data.loc[:,['CD20Bcell','TcellImmune','CD68Mac']].any(axis=1)
+ ## CD4 and CD8
+ if df_data.columns.isin(['CD8_Ring','CD4_Ring']).sum()==2:
+ #print('CD4 AND CD8')
+ # CD8 takes priority over CD4 when both are positive
+ df_data['CD8Tcell'] = df_data.loc[: ,['CD8_Ring','TcellImmune']].all(axis=1)
+ df_data['CD4Tcell'] = df_data.loc[: ,['CD4_Ring','TcellImmune']].all(axis=1) & ~df_data.loc[: ,'CD8Tcell']
+ df_data['UnspecifiedTcell'] = df_data.TcellImmune & ~df_data.loc[:,['CD8Tcell','CD4Tcell']].any(axis=1) #if cd4 or 8 then sum = 2
+ ## check
+ # every T cell must belong to exactly one of the three T cell subtypes
+ ls_immune = df_data[df_data.loc[:,'TcellImmune']].index.tolist()
+ if ((df_data.loc[ls_immune,['CD8Tcell','CD4Tcell','UnspecifiedTcell']].sum(axis=1)!=1)).any():
+ print('Error in Tcell cell types')
+ ls_immuntype = ['CD68Mac','CD20Bcell','UnspecifiedImmune','CD8Tcell','CD4Tcell','UnspecifiedTcell'] #'TcellImmune',
+ #add Immunetype
+ ls_cell_names = ls_immuntype
+ s_type_name = 'ImmuneType'
+ analyze.add_celltype(df_data, ls_cell_names, s_type_name)
+
+ #get rid of unspecified immune cells (make them stroma)
+ # matches both UnspecifiedImmune and UnspecifiedTcell; missing ImmuneType is filled with 'x' so it never matches
+ ls_index = df_data[df_data.ImmuneType.fillna('x').str.contains('Unspecified')].index
+ df_data.loc[ls_index,'celltype'] = 'stromal'
+ df_data.loc[ls_index,'ImmuneType'] = np.nan
+ df_data.loc[ls_index,'stromal'] = True
+ df_data.loc[ls_index,'immune'] = False
+ return(df_data)
+
def immune_functional(df_data,ls_immune_functional):
    '''
    immune functional states: combinations of the functional markers, alone and
    gated by immune type
    ls_immune_functional: marker columns; each is renamed in place (on the caller's
        dataframe) to the part of its name before the first underscore
    returns a new dataframe: df_data merged with the marker combinations, the
    immune type gated combinations, and the FuncImmune cell type category
    '''
    ls_short = [item.split('_')[0] for item in ls_immune_functional]
    df_data.rename(dict(zip(ls_immune_functional,ls_short)),axis=1,inplace=True)
    df_func = analyze.combinations(df_data,ls_short)
    df_data = df_data.merge(df_func,how='left', left_index=True, right_index=True, suffixes = ('_all',''))
    #gated combinations: immune type plus functional status
    ls_gate = sorted(df_data.ImmuneType.dropna().unique())
    df_gate_counts = analyze.gated_combinations(df_data,ls_gate,df_func.columns.tolist())
    df_data = df_data.merge(df_gate_counts, how='left', left_index=True, right_index=True,suffixes = ('_all',''))
    #add FuncImmune
    analyze.add_celltype(df_data, df_gate_counts.columns.tolist(), 'FuncImmune')
    return(df_data)
+
+########################################
+#CellProlif combinations, main cell types and proliferation
+######################################
def cell_prolif(df_data, s_gate='celltype',ls_combo =['prolif','nonprolif']):
    '''
    combine the main cell types with the proliferation status
    s_gate: column holding the gate (cell type) of each cell
    ls_combo: boolean columns combined with every gate
    returns a new dataframe: df_data merged with the gated combinations
    (e.g. tumor_prolif, tumor_nonprolif) and the CellProlif cell type category
    '''
    ls_gate = df_data.loc[:,s_gate].unique().tolist()
    df_combo = analyze.gated_combinations(df_data,ls_gate,ls_combo)
    df_data = df_data.merge(df_combo, how='left', left_index=True, right_index=True,suffixes = ('_all',''))
    #add CellProlif
    # cell names are whatever gate/combo columns were produced
    analyze.add_celltype(df_data, df_combo.columns.tolist(), 'CellProlif')
    return(df_data)
+
+def diff_hr_state(df_data,ls_luminal,ls_basal,ls_mes):
+ # Adds tumor differentiation (Lum/Bas/Mes) and hormone receptor (ER/PR/HR/HER2)
+ # boolean columns to df_data, then three categorical columns via
+ # analyze.add_celltype: DiffState, HRStatus and DiffStateHRStatus.
+ # Requires the columns tumor, celltype, ER_Nuclei and HER2_Ring;
+ # PgR_Nuclei is optional. Returns the merged dataframe.
+ # NOTE(review): the Lum/Bas/Mes/ER/HER2/PR/HR columns are assigned on the
+ # dataframe passed in, before the first merge creates a new object.
+ # keep only the mesenchymal columns that exist as boolean columns, either
+ # under their full name or under the part before the first underscore
+ ls_mes = df_data.columns[(df_data.dtypes=='bool') & (df_data.columns.isin(ls_mes) | df_data.columns.isin([item.split('_')[0] for item in ls_mes]))].tolist()
+ print('differentiation')
+ # a cell is Lum/Bas/Mes when any listed marker is positive and it is a tumor cell
+ df_data['Lum'] = df_data.loc[:,ls_luminal].any(axis=1) & df_data.tumor
+ df_data['Bas'] = df_data.loc[:,ls_basal].any(axis=1) & df_data.tumor
+ df_data['Mes'] = df_data.loc[:,ls_mes].any(axis=1) & df_data.tumor
+
+ print('hormonal status')
+ df_data['ER'] = df_data.loc[:,['tumor','ER_Nuclei']].all(axis=1)
+ df_data['HER2'] = df_data.loc[:,['tumor','HER2_Ring']].all(axis=1)
+ ls_hr = ['ER']
+ # PR is only evaluated when the PgR_Nuclei column is present
+ if df_data.columns.isin(['PgR_Nuclei']).any():
+ df_data['PR'] = df_data.loc[:,['tumor','PgR_Nuclei']].all(axis=1)
+ ls_hr.append('PR')
+
+ # HR positive = ER or (if available) PR positive tumor cell
+ df_data['HR'] = df_data.loc[:,ls_hr].any(axis=1) & df_data.tumor
+
+ ls_marker = ['Lum','Bas','Mes'] #
+ df_diff = analyze.combinations(df_data,ls_marker)
+ df_data = df_data.merge(df_diff,how='left', left_index=True, right_index=True, suffixes = ('_all',''))
+
+ #add DiffState
+ ls_cell_names = df_diff.columns.tolist()
+ s_type_name = 'DiffState'
+ analyze.add_celltype(df_data, ls_cell_names, s_type_name)
+ #change non-tumor to NA (works!)
+ df_data.loc[df_data[df_data.celltype != 'tumor'].index,s_type_name] = np.nan
+
+ #2 ER/PR/HER2
+ ls_marker = ['HR','HER2']
+ df_hr = analyze.combinations(df_data,ls_marker)
+ # the combination column named '__' is relabelled TN
+ # (presumably the all-negative combination, i.e. triple negative -- confirm)
+ df_hr.rename({'__':'TN'},axis=1,inplace=True)
+ df_data = df_data.merge(df_hr,how='left', left_index=True, right_index=True,suffixes = ('_all',''))
+ ls_cell_names = df_hr.columns.tolist()
+ s_type_name = 'HRStatus'
+ analyze.add_celltype(df_data, ls_cell_names, s_type_name)
+ #change non-tumor to NA (works!)
+ df_data.loc[df_data[df_data.celltype != 'tumor'].index,s_type_name] = np.nan
+
+ #3 combinations: differentiation and HR status
+ ls_gate = df_diff.columns.tolist()
+ ls_marker = df_hr.columns.tolist()
+ df_gate_counts = analyze.gated_combinations(df_data,ls_gate,ls_marker)
+ df_data = df_data.merge(df_gate_counts, how='left', left_index=True, right_index=True,suffixes = ('_all',''))
+
+ # make Tumor Diff plus HR Status object column
+ ls_cell_names = df_gate_counts.columns.tolist()
+ s_type_name = 'DiffStateHRStatus'
+ analyze.add_celltype(df_data, ls_cell_names, s_type_name)
+ #change non-tumor to NA (works!)
+ df_data.loc[df_data[df_data.celltype != 'tumor'].index,s_type_name] = np.nan
+ return(df_data)
+
+def celltype_gates(df_data,ls_gate,s_new_name,s_celltype):
+    '''
+    multipurpose marker-combination gating restricted to one cell type
+    (e.g. stromal or tumor)
+
+    input:
+        df_data: per-cell dataframe with a celltype column.
+            NOTE: the gate columns are renamed in place on the caller's frame.
+        ls_gate: marker column names; only those present as boolean columns
+            (under the full name or the part before the first underscore)
+            are used
+        s_new_name: name handed to analyze.add_celltype for the new column
+        s_celltype: celltype value to keep; all other rows are set to NaN
+
+    output:
+        df_data: new dataframe merged with the analyze.combinations result;
+            labels in s_new_name are prefixed with "{s_celltype}_"
+    '''
+    #keep the requested gates that exist as boolean columns
+    ls_short = [s_column.split('_')[0] for s_column in ls_gate]
+    ab_gate = (df_data.dtypes=='bool') & (df_data.columns.isin(ls_gate) | df_data.columns.isin(ls_short))
+    ls_gate = df_data.columns[ab_gate].tolist()
+    #rename marker_location columns to marker
+    ls_marker = [s_column.split('_')[0] for s_column in ls_gate]
+    df_data.rename(dict(zip(ls_gate,ls_marker)),axis=1,inplace=True)
+    #functional states (don't forget to merge!)
+    df_func = analyze.combinations(df_data,ls_marker)
+    df_data = df_data.merge(df_func,how='left', left_index=True, right_index=True, suffixes = ('_all',''))
+    ls_cell_names = df_func.columns.tolist()
+    analyze.add_celltype(df_data, ls_cell_names, s_new_name)
+    #change cells of any other type to NA
+    index_other = df_data[df_data.celltype != s_celltype].index
+    df_data.loc[index_other,s_new_name] = np.nan
+    #prefix the remaining labels with the cell type
+    d_prefix = {s_name: f'{s_celltype}_{s_name}' for s_name in ls_cell_names}
+    df_data[s_new_name] = df_data.loc[:,s_new_name].replace(d_prefix)
+    return(df_data)
+
+def non_tumor(df_data):
+    '''
+    add summary columns describing all non-tumor cells
+
+    input:
+        df_data: per-cell dataframe with a celltype column; modified in place
+
+    output:
+        df_data: the same dataframe.
+            NonTumor is written when both ImmuneType and StromalType exist:
+            'endothelial', the ImmuneType or the StromalType; NaN for tumor.
+            NonTumorFunc is written when both FuncImmune and CellProlif exist:
+            CellProlif (endothelial), FuncImmune (immune) or StromalType
+            (stromal); NaN for tumor.
+    '''
+    #row labels of each main cell type
+    d_index = {}
+    for s_celltype in ['endothelial','immune','stromal','tumor']:
+        d_index[s_celltype] = df_data[df_data.celltype==s_celltype].index
+
+    if df_data.columns.isin(['ImmuneType','StromalType']).sum() == 2:
+        #fewer cell types
+        df_data.loc[d_index['endothelial'],'NonTumor'] = 'endothelial'
+        for s_celltype, s_source in [('immune','ImmuneType'),('stromal','StromalType')]:
+            df_data.loc[d_index[s_celltype],'NonTumor'] = df_data.loc[d_index[s_celltype],s_source]
+        df_data.loc[d_index['tumor'],'NonTumor'] = np.nan
+
+    if df_data.columns.isin(['FuncImmune','CellProlif']).sum() == 2:
+        #more cell types
+        for s_celltype, s_source in [('endothelial','CellProlif'),('immune','FuncImmune'),('stromal','StromalType')]:
+            df_data.loc[d_index[s_celltype],'NonTumorFunc'] = df_data.loc[d_index[s_celltype],s_source]
+        df_data.loc[d_index['tumor'],'NonTumorFunc'] = np.nan
+    return(df_data)
diff --git a/mplex_image/getdata.py b/mplex_image/getdata.py
new file mode 100755
index 0000000..aca70dc
--- /dev/null
+++ b/mplex_image/getdata.py
@@ -0,0 +1,176 @@
+####
+# title: getdata.py
+#
+# language: Python3.6
+# date: 2018-08-00
+# license: GPL>=v3
+# author: Jenny, bue (mostly bue)
+#
+# description:
+# python3 library to analyse Guillaume-segmented cyclic staining data.
+####
+
+# load library
+import csv
+import os
+import re
+
+
+# function implementaion
+# import importlib
+# importlib.reload(getdata)
+
+def get_df(
+ #s_gseg_folder_root='/graylab/share/engje/Data/',
+ #s_scene_label='Registered-Her'
+ s_folder_regex="^SlideName.*_Features$",
+ es_value_label = {"MeanIntensity","CentroidX","CentroidY"},
+ #s_df_folder_root="./",
+ #b_roundscycles=False,
+ ):
+ '''
+ input:
+ segmentation fiels from Guillaume's software, which have in the
+ "Label" column the "cell serial number" (cell)
+ and in other columns the "feature of intrests" and unintrest.
+
+ the segmentation files are ordered in such a path structure:
+ + {s_gseg_folder_root}
+ |+ {s_gseg_folder_run_regex}*_YYYY-MM-DD_* (run)
+ | |+ Scene 000 - Nuclei - CD32.txt (scene and protein)
+ | |+ Scene 000 - Location - ProteinName.txt
+ |
+ |+ {s_gseg_folder_run_regex}*_YYYY-MM-DD_*
+
+ output:
+ at {s_df_folder_root} tab separated value dataframe files
+ per run and feature of intrest.
+ y-axis: protein_location
+ x-axis: scene_cell
+ + runYYYYMMDD_MeanIntensity.tsv
+ + runYYYYMMDD_{s_gseg_feature_label}.tsv
+
+ run:
+ import getdata
+ getdata.get_df(s_gseg_folder_root='ihcData', s_gseg_folder_run_regex='^BM-Her2N75')
+
+ description:
+ function to extrtact dataframe like files of features of intrest
+ from segmentation files from guilaumes segmentation software.
+ '''
+ # Current parameters (the docstring above still describes arguments that
+ # are commented out of the signature):
+ # s_folder_regex: regex matched with re.search against every entry of the
+ # current working directory; matching entries are treated as run folders.
+ # es_value_label: names of the value columns to extract; one output file
+ # is written per run folder and value label.
+ # Returns dd_run, the {scene_cell: {protein: value}} dictionary of the
+ # last run folder / value label that was processed.
+ # NOTE(review): `sys` is used below (sys.exit) but the import block of this
+ # module only brings in csv, os and re, so those error paths would raise
+ # NameError instead of exiting -- confirm and add `import sys`.
+ # NOTE(review): dd_run appears to be assigned only once a folder matches
+ # s_folder_regex; if nothing matches, the final return fails on an
+ # unbound name.
+ # NOTE(review): the function chdir()s into each run folder and back with
+ # "..", so an exception in between leaves the process in the run folder.
+ # enter the data path
+ #os.chdir(s_gseg_folder_root)
+
+
+ # for each value label of intrest (such as MeanIntensity)
+ for s_value_label in es_value_label:
+
+ # for each run (such as folder BM-Her2N75-15_2017-08-07_Features)
+ # change re.search to somehow specify folder of interest
+ for s_dir in os.listdir():
+ if re.search(s_folder_regex, s_dir):
+ print(f"\nprocess {s_value_label} run: {s_dir}")
+ # enter the run directory
+ os.chdir(s_dir)
+ # extract run label from dir name
+ s_run = f"features_{s_dir.split('_')[0]}"
+ # get empty run dictionary
+ dd_run = {}
+
+ # for each data file
+ for s_file in os.listdir():
+ if re.search("^Scene", s_file):
+ print(f"process {s_value_label} file: {s_file} ...")
+ # extract scene from file name
+ ls_file = [s_splinter.strip() for s_splinter in s_file.split("-")]
+ s_scene = re.sub("[^0-9a-zA-Z]", "", ls_file[0].lower()) #take out any alpha numberic
+ # extract protein from file name
+ if (len(ls_file) < 3):
+ s_protein = f"{ls_file[1].split('.')[0]}" # this is dapi
+ else:
+ s_protein = f"{ls_file[2].split('.')[0]}_{ls_file[1]}" # others
+
+ # for each datarow in file
+ b_header = False # header row inside file not yet found, so set flag false
+ with open(s_file, newline='') as f_csv:
+ # NOTE(review): the delimiter shows as a single space in this view;
+ # confirm whether a literal tab character was intended.
+ o_reader = csv.reader(f_csv, delimiter=' ', quotechar='"')
+ for ls_row in o_reader:
+ if (b_header):
+ # extract cell label and data vale
+ s_cell = ls_row[i_xcell]
+ # left pad the cell label with zeros to at least 5 characters
+ s_cell = f"{'0'*(5 - len(s_cell))}{s_cell}"
+ o_value = ls_row[i_xvalue]
+ # update run dictionary via scene_cell dictionery (one scene_cell dictionary per dataframe row)
+ s_scene_cell = f"{s_scene}_cell{s_cell}"
+ try:
+ d_scene_cell = dd_run[s_scene_cell] # we have already some data from this scene_cell
+ except KeyError:
+ d_scene_cell = {} # this is the first time we deal with this scene_cell
+ # update scene_cell dictionary with data values (one value inside dataframe row)
+ try:
+ o_there = d_scene_cell[s_protein]
+ sys.exit(f"Error @ getDataframe : in run {s_run} code tries to populate dataframe row {s_scene_cell} column {s_protein} with a secound time (there:{o_there} new:{o_value}). this should never happen. code is messed up.")
+ except KeyError:
+ d_scene_cell.update({s_protein: o_value})
+ dd_run.update({s_scene_cell: d_scene_cell})
+ else:
+ # extract cell label and data value of intrest column position
+ i_xcell = ls_row.index("Label")
+ i_xvalue = ls_row.index(s_value_label)
+ b_header = True # header row found and information extracted, so set flag True
+
+ # write run dictionar of dictionary into dataframe like file
+ b_header = False
+ s_file_output = f"../{s_run}_{s_value_label}.tsv"
+ print(f"write file: {s_file_output}")
+ with open(s_file_output, 'w', newline='') as f:
+ for s_scene_cell in sorted(dd_run):
+ ls_datarow = [s_scene_cell]
+ # handle protein column label row
+ # the column set is taken from the first (sorted) scene_cell only;
+ # later rows are looked up with those same protein keys
+ if not (b_header):
+ ls_protein = sorted(dd_run[s_scene_cell])
+ print(ls_protein)
+ f.write("\t" + "\t".join(ls_protein) + "\n")
+ b_header = True
+ # handle data row
+ for s_protein in ls_protein:
+ o_value = dd_run[s_scene_cell][s_protein]
+ ls_datarow.append(o_value)
+ f.write("\t".join(ls_datarow) + "\n")
+ # sanity check
+ if (len(ls_protein) != (len(ls_datarow) -1)):
+ sys.exit(f"Error @ getDataframe : at {s_scene_cell} there are {len(ls_datarow) - len(ls_protein) -1} more proteins then in the aready writen rows")
+
+ # jump back to the data path
+ os.chdir("..")
+
+ return(dd_run)
+
+
+def dfextract(df_origin, s_extract, axis=0):
+    '''
+    input:
+        df_origin: dataframe with string row and column labels
+        s_extract: pattern handed to pandas str.contains; every label
+            containing it is kept
+        axis: 0 filters the rows by index label,
+            any other value filters the columns by column label
+
+    output:
+        df_extract: the filtered dataframe
+
+    run:
+        from mplex_image import getdata
+        getdata.dfextract(df_scene, s_extract='CD74', axis=1)
+        getdata.dfextract(df_run, s_extract='scene86')
+
+    description:
+        function can extract e.g.
+        a specific scene dataframe from a run dataframe or
+        a specific protein from a scene dataframe.
+    '''
+    if (axis != 0):
+        # column-wise selection
+        ab_keep = df_origin.columns.str.contains(s_extract)
+        return(df_origin.loc[:,ab_keep])
+    # row-wise selection
+    ab_keep = df_origin.index.str.contains(s_extract)
+    return(df_origin.loc[ab_keep,:])
diff --git a/mplex_image/imagine.py b/mplex_image/imagine.py
new file mode 100755
index 0000000..f705318
--- /dev/null
+++ b/mplex_image/imagine.py
@@ -0,0 +1,504 @@
+###
+# title: pysci.imagine.py
+#
+# language Python3
+# license: GPLv3
+# author: bue
+# date: 2019-01-31
+#
+# run:
+# from pysci import imagine
+#
+# description:
+# my image analysis library
+####
+
+# library
+import numpy as np
+import pandas as pd
+
+# function
+def slide_up(a):
+    """
+    input:
+        a: numpy array
+
+    output:
+        a: new numpy array with the content shifted one row up.
+            top row gets deleted,
+            bottom row of zeros is appended.
+
+    description:
+        inspired by np.roll function, though elements that roll
+        beyond the last position are not re-introduced at the first.
+    """
+    # np.insert places the new row BEFORE the given index, so index -1 would
+    # put the zero row above the last row; inserting at the row count appends it.
+    i_row = np.shape(a)[0]
+    a = np.delete(np.insert(a, i_row, 0, axis=0), 0, axis=0)
+    return(a)
+
+
+def slide_down(a):
+    """
+    input:
+        a: numpy array
+
+    output:
+        new numpy array with the content shifted one row down:
+        a row of zeros becomes the top row and the bottom row is dropped.
+
+    description:
+        like np.roll along axis 0, except that the row pushed over the
+        edge is discarded instead of wrapping around.
+    """
+    # put a zero row in front, then cut off the last row
+    a_padded = np.insert(a, 0, 0, axis=0)
+    return np.delete(a_padded, -1, axis=0)
+
+
+def slide_left(a):
+    """
+    input:
+        a: numpy array
+
+    output:
+        a: new numpy array with the content shifted one column left.
+            left most column gets deleted,
+            a column of zeros is appended as right most column.
+
+    description:
+        inspired by np.roll function, though elements that roll
+        beyond the last position are not re-introduced at the first.
+    """
+    # np.insert places the new column BEFORE the given index, so index -1
+    # would put the zero column left of the last column; inserting at the
+    # column count appends it.
+    i_column = np.shape(a)[1]
+    a = np.delete(np.insert(a, i_column, 0, axis=1), 0, axis=1)
+    return(a)
+
+
+def slide_right(a):
+    """
+    input:
+        a: numpy array
+
+    output:
+        new numpy array with the content shifted one column right:
+        a column of zeros becomes the left most column and the right most
+        column is dropped.
+
+    description:
+        like np.roll along axis 1, except that the column pushed over the
+        edge is discarded instead of wrapping around.
+    """
+    # put a zero column in front, then cut off the last column
+    a_padded = np.insert(a, 0, 0, axis=1)
+    return np.delete(a_padded, -1, axis=1)
+
+
+def slide_upleft(a):
+    """
+    input:
+        a: numpy array
+
+    output:
+        numpy array shifted one row up and then one column left.
+
+    description:
+        diagonal shift composed of slide_up followed by slide_left.
+    """
+    a_up = slide_up(a)
+    return slide_left(a_up)
+
+
+def slide_upright(a):
+    """
+    input:
+        a: numpy array
+
+    output:
+        numpy array shifted one row up and then one column right.
+
+    description:
+        diagonal shift composed of slide_up followed by slide_right.
+    """
+    a_up = slide_up(a)
+    return slide_right(a_up)
+
+
+def slide_downleft(a):
+    """
+    input:
+        a: numpy array
+
+    output:
+        numpy array shifted one row down and then one column left.
+
+    description:
+        diagonal shift composed of slide_down followed by slide_left.
+    """
+    a_down = slide_down(a)
+    return slide_left(a_down)
+
+
+def slide_downright(a):
+    """
+    input:
+        a: numpy array
+
+    output:
+        numpy array shifted one row down and then one column right.
+
+    description:
+        diagonal shift composed of slide_down followed by slide_right.
+    """
+    a_down = slide_down(a)
+    return slide_right(a_down)
+
+
+
+def get_border(ai_basin):
+    """
+    input:
+        ai_basin: numpy array representing a cells or nuclei basin file.
+            basin borders are expected to be 0,
+            basins any value different from 0.
+            ai_basin = skimage.io.imread("cells_basins.tif")
+
+    output:
+        ai_border: numpy array of the same shape; 1 where a pixel differs
+            from at least one of its eight shifted neighbours, else 0.
+
+    description:
+        compares the basin array with itself shifted by one pixel into each
+        of the eight directions and marks every position where they differ.
+    """
+    ab_border = None
+    for o_slide in (
+            slide_up, slide_down, slide_left, slide_right,
+            slide_upleft, slide_upright, slide_downleft, slide_downright,
+        ):
+        # True where the pixel and its shifted neighbour differ
+        ab_differ = (ai_basin - o_slide(ai_basin)) != 0
+        if ab_border is None:
+            ab_border = ab_differ
+        else:
+            ab_border = ab_border | ab_differ
+    # boolean mask to 0/1 integers
+    return ab_border * 1
+
+
+def collision(ai_basin, i_step_size=1):
+    """
+    input:
+        ai_basin: numpy array representing a cells basin file.
+            basin borders are expected to be 0,
+            basins any value different from 0.
+            ai_basin = skimage.io.imread("cells_basins.tif")
+
+        i_step_size: integer number of one-pixel shifts applied in each
+            direction before overlaps are checked. default is 1.
+
+    output:
+        eti_collision: set of (shifted basin label, original basin label)
+            tuples for every position where two different non-zero labels
+            overlap after shifting.
+
+    description:
+        shifts the basin array i_step_size pixels into each of the eight
+        directions and records which different basins end up on top of
+        each other.
+    """
+    lo_slide = [
+        slide_up, slide_down, slide_left, slide_right,
+        slide_upleft, slide_upright, slide_downleft, slide_downright,
+    ]
+    eti_collision = set()
+    for o_slide in lo_slide:
+        # walk a copy of the basins i_step_size pixels in this direction
+        ai_walk = ai_basin.copy()
+        for _ in range(i_step_size):
+            ai_walk = o_slide(ai_walk)
+        # positions covered by a basin both before and after the walk
+        ab_overlap = (ai_basin != 0) & (ai_walk != 0)
+        ai_alice = ai_walk[ab_overlap]
+        ai_bob = ai_basin[ab_overlap]
+        # keep only overlaps between two different basins
+        ab_other = ai_alice != ai_bob
+        eti_collision.update(zip(ai_alice[ab_other], ai_bob[ab_other]))
+    return eti_collision
+
+
+def grow(ai_basin, i_step=1):
+    """
+    input:
+        ai_basin: numpy array representing a cells basin file.
+            it is assumed that basin borders are represented by 0 values,
+            and basins are represented with any values different from 0.
+            ai_basin = skimage.io.imread("cells_basins.tif")
+
+        i_step: integer which specifies how many pixels the basin
+            to each direction should grow
+
+    output:
+        ai_grown: new numpy array with the grown basins;
+            ai_basin itself is not modified.
+
+    description:
+        algorithm to grow the basins in a given basin numpy array.
+        each step shifts the current state one pixel into each of the eight
+        directions and fills still empty (0) pixels with the shifted label.
+        directions are tried in a fixed counterclockwise order starting with
+        up; where several basins reach the same empty pixel in one step, the
+        first direction in that order wins. existing labels are never
+        overwritten.
+    """
+    # a tuple, not a set: set iteration order over functions is arbitrary,
+    # which made the winner of contested pixels unpredictable
+    to_slide = (
+        slide_up, slide_upleft, slide_left, slide_downleft,
+        slide_down, slide_downright, slide_right, slide_upright,
+    )
+    ai_grown = ai_basin.copy()
+    for _ in range(i_step):
+        # grow from the state reached so far, so that every step adds one
+        # more pixel (restarting from ai_basin made i_step > 1 a no-op)
+        ai_seed = ai_grown.copy()
+        for o_slide in to_slide:
+            ai_evolve = o_slide(ai_seed)
+            # only claim pixels nobody owns yet
+            ab_fill = (ai_grown == 0) & (ai_evolve != 0)
+            ai_grown[ab_fill] = ai_evolve[ab_fill]
+    # output
+    return(ai_grown)
+
+
+def touching_cells(ai_basin, i_border_width=0, i_step_size=1):
+    """
+    input:
+        ai_basin: numpy array representing a cells basin file.
+            it is assumed that basin borders are represented by 0 values,
+            and basins are represented with any values different from 0.
+            ai_basin = skimage.io.imread("cells_basins.tif")
+
+        i_border_width: maximal acceptable border width in pixels.
+            this is half of the range how far two adjacent cells maximal
+            can be apart and still are regarded as touching each other.
+
+        i_step_size: step size by which the border width is sampled for
+            touching cells.
+            increasing the step size beyond 1 will result in faster processing
+            but less certain results. step size < 1 makes no sense.
+            default step size is 1.
+
+    output:
+        dei_touch: a dictionary that for each basin label (0 excluded)
+            holds the set of basin labels it touches;
+            basins without neighbors map to an empty set.
+
+    description:
+        algorithm to extract the touching basins from a cell basin numpy array.
+        algorithm inspired by C=64 computer games with sprite collision.
+    """
+
+    # detect neighbors
+    eti_collision = set()
+    ai_evolve = ai_basin.copy()
+    for _ in range(-1, i_border_width, i_step_size):
+        # detect cell border collision
+        eti_collision = eti_collision.union(
+            collision(ai_basin=ai_evolve, i_step_size=i_step_size)
+        )
+        # grow basin
+        ai_evolve = grow(ai_basin=ai_evolve, i_step=i_step_size)
+
+    # transform set of tuple of alice and bob collision to dictionary of sets
+    # np.unique avoids building a python set over every single pixel
+    ei_alice = set(np.unique(ai_basin))
+    # discard, not remove: an array without any 0 pixel is not an error
+    ei_alice.discard(0)
+    dei_touch = {i_alice: set() for i_alice in ei_alice}
+    for i_alice, i_bob in eti_collision:
+        dei_touch[i_alice].add(i_bob)
+
+    # output
+    return(dei_touch)
+
+
+def detouch2df(deo_abc, ls_column=["cell_center","cell_touch"]):
+    """
+    input:
+        deo_abc: dictionary of sets as generated by touching_cells
+        ls_column: names of the key column and the value column
+
+    output:
+        df_touch: two column dataframe with one row per (key, set member)
+            pair. keys with an empty set get a single row with value 0.
+
+    description:
+        flattens a dictionary of sets into a long format dataframe.
+        set members are sorted by their integer value where possible.
+    """
+    lo_center = []
+    lo_touch = []
+    for o_center, eo_neighbor in deo_abc.items():
+        try:
+            lo_neighbor = sorted(eo_neighbor, key=int)
+        except ValueError:
+            lo_neighbor = sorted(eo_neighbor)
+        # a cell without neighbors is kept as one row with 0
+        if (len(lo_neighbor) == 0):
+            lo_neighbor = [0]
+        lo_center += [o_center] * len(lo_neighbor)
+        lo_touch += lo_neighbor
+    # two rows (keys, values) transposed into two columns
+    return pd.DataFrame([lo_center, lo_touch], index=ls_column).T
+
+
+def imgfuse(laaai_in):
+    """
+    input:
+        laaai_in: list of 3 channel (RGB) images as numpy arrays,
+            all of the same shape
+
+    output:
+        aaai_out: fused image as integer numpy array of the input shape.
+            every pixel is the mean over the non-zero input values at that
+            position, truncated to an integer; 0 where all inputs are 0.
+
+    description:
+        code to fuse many RGB images into one.
+        exits via sys.exit if the list is empty or the shapes differ.
+    """
+    # sys is not imported at module level in this file
+    import sys
+
+    # check shape
+    if (len(laaai_in) == 0):
+        sys.exit("Error: no input images given.")
+    ti_shape = laaai_in[0].shape
+    for aaai_in in laaai_in:
+        if (aaai_in.shape != ti_shape):
+            sys.exit(f"Error: input images have not the same shape. {aaai_in.shape} != {ti_shape}.")
+
+    # fuse images: vectorized mean over the non-zero values per pixel
+    a_stack = np.stack(laaai_in)
+    ai_count = (a_stack != 0).sum(axis=0)
+    # zeros add nothing to the sum, so only the divisor has to ignore them
+    ar_sum = a_stack.sum(axis=0, dtype=np.float64)
+    ar_mean = np.divide(
+        ar_sum,
+        ai_count,
+        out=np.zeros(ti_shape, dtype=np.float64),
+        where=(ai_count != 0),
+    )
+
+    # output: truncate towards zero like int() did per pixel
+    aaai_out = ar_mean.astype(int)
+    return(aaai_out)
+
+
+
+# test code
+if __name__ == "__main__":
+ # Manual smoke test: runs get_border, touching_cells and imgfuse on small
+ # hand-made arrays and prints the results; nothing is asserted.
+ # The triple-quoted blocks below are bare string literals, i.e. disabled code.
+
+ # load basins tiff into numpy array
+ '''
+ import matplotlib.pyplot as plt
+ import skimage as ski
+ a_tiff = ski.io.imread("cells_basins.tif")
+ plt.imshow(a_tiff)
+ '''
+
+ # generate test data
+ # a: three 3x3 basins (labels 1, 2, 3) and a single pixel basin (label 4)
+ a = np.array([
+ [0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0,4,0,0,0],
+ [0,0,0,1,1,1,0,0,0,0,0,0,0,0],
+ [0,0,0,1,1,1,0,0,0,0,0,0,0,0],
+ [0,0,0,1,1,1,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,2,2,2,0,0,0],
+ [0,0,0,0,3,3,3,0,2,2,2,0,0,0],
+ [0,0,0,0,3,3,3,0,2,2,2,0,0,0],
+ [0,0,0,0,3,3,3,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+ ])
+
+ # b: two diagonal one-pixel-wide basins (labels 1 and 2) lying side by side
+ b = np.array([
+ [0,0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,1,0,0,0,0,0,0,0],
+ [0,0,0,0,1,2,0,0,0,0,0],
+ [0,0,0,0,0,1,2,0,0,0,0],
+ [0,0,0,0,0,0,0,2,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0,0],
+ ])
+
+ # c: a single basin (label 1) of two diagonally adjacent pixels
+ c = np.array([
+ [0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,1,0,0,0,0,0],
+ [0,0,0,0,0,1,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0],
+ [0,0,0,0,0,0,0,0,0,0],
+ ])
+
+ # run get_border
+ print("\nborderwall_tm")
+ print(a)
+ print(get_border(a))
+ #plt.imshow(get_border(a_tiff))
+
+ # run grow
+ # NOTE(review): the disabled calls below pass i_step_size, but grow()
+ # names its parameter i_step; they would fail if re-enabled as written.
+ '''
+ print("\ngrow")
+ print(c)
+ print(grow(c))
+ print(grow(grow(c)))
+ print(grow(c, i_step_size=2))
+ print(b)
+ print(grow(b))
+ print(grow(grow(b)))
+ print(grow(b, i_step_size=2))
+ '''
+
+ # run collision
+ '''
+ print("\ncollision")
+ print(c)
+ print(collision(c))
+ print(b)
+ print(collision(b))
+ print(c)
+ print(collision(c))
+ '''
+
+ # run touching_cells
+ print("\ntouch")
+ #print(a)
+ print(touching_cells(a, i_border_width=0))
+ print(touching_cells(a, i_border_width=1))
+ print(touching_cells(a, i_border_width=2))
+ print(touching_cells(a, i_border_width=3))
+ print(touching_cells(a, i_border_width=4))
+ print(touching_cells(a, i_border_width=4, i_step_size=2))
+ #touching_cells(a_tiff, i_border_width=1)
+
+
+ # img fuse
+ # five 3x3x3 test images; each has the same 1/2/3 pattern in some of its
+ # three channels and zeros in the others
+ aaai_1 = np.array([
+ [[1,1,1],[2,2,2],[3,3,3]],
+ [[0,0,0,],[0,0,0],[0,0,0]],
+ [[0,0,0],[0,0,0],[0,0,0]],
+ ])
+ aaai_2 = np.array([
+ [[0,0,0,],[0,0,0],[0,0,0]],
+ [[1,1,1],[2,2,2],[3,3,3]],
+ [[0,0,0],[0,0,0],[0,0,0]],
+ ])
+ aaai_3 = np.array([
+ [[0,0,0,],[0,0,0],[0,0,0]],
+ [[0,0,0],[0,0,0],[0,0,0]],
+ [[1,1,1],[2,2,2],[3,3,3]],
+ ])
+ aaai_4 = np.array([
+ [[1,1,1],[2,2,2],[3,3,3]],
+ [[1,1,1],[2,2,2],[3,3,3]],
+ [[0,0,0],[0,0,0],[0,0,0]],
+ ])
+ aaai_5 = np.array([
+ [[0,0,0,],[0,0,0],[0,0,0]],
+ [[1,1,1],[2,2,2],[3,3,3]],
+ [[1,1,1],[2,2,2],[3,3,3]],
+ ])
+ aaai_out = imgfuse([aaai_1, aaai_2, aaai_3, aaai_4, aaai_5])
+ print("fused 3channel image:\n", aaai_out, type(aaai_out))
diff --git a/mplex_image/metadata.py b/mplex_image/metadata.py
new file mode 100755
index 0000000..4d49424
--- /dev/null
+++ b/mplex_image/metadata.py
@@ -0,0 +1,176 @@
+####
+# title: metadata.py
+#
+# language: Python3.7
+# date: 2020-07-00
+# license: GPL>=v3
+# author: Jenny
+#
+# description:
+# python3 library using python bioformats to extract image metadata
+####
+
+
+#libraries
+import matplotlib as mpl
+mpl.use('agg')
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import skimage
+import pandas as pd
+import bioformats
+#import javabridge
+import re
+import shutil
+from itertools import chain, compress
+import matplotlib.ticker as ticker
+from mplex_image import cmif
+
+# mpimage
+#functions
+
+def get_exposure(s_image, s_find=r"Information\|Image\|Channel\|ExposureTime\<\/Key\>\<Value\>"):
+    """
+    read the exposure times from the OME-XML metadata of one image
+    s_image = filename
+    s_find = regular expression locating the exposure time entry; the
+        bracketed, comma separated value list is expected directly after
+        the match
+    returns:
+        li_exposure = list of the values, each converted with int()/1000000
+        s_meta = the whole metadata string
+    raises:
+        IndexError if s_find does not occur in the metadata
+    """
+    # NOTE(review): the default pattern was an unterminated string literal in
+    # the source (it ended in a backslash before the closing quote); the
+    # trailing \<Value\> was restored on the assumption that the metadata is
+    # laid out as <Key>..</Key><Value>[..]</Value> -- confirm.
+    s_meta = bioformats.get_omexml_metadata(path=s_image)
+    o = bioformats.OMEXML(s_meta)
+    print(o.image().Name)
+    print(o.image().AcquisitionDate)
+
+    lo_match = list(re.finditer(s_find, s_meta))
+    if len(lo_match)!=1:
+        print('Error: found wrong number of exposure times')
+    if len(lo_match)==0:
+        raise IndexError(f'no match for {s_find} in the metadata of {s_image}')
+
+    # text behind the first match, limited to the 200 character window that
+    # starts at the beginning of the match. slicing at the end of the match
+    # replaces str.strip(s_find), which strips a character set, not a prefix.
+    o_match = lo_match[0]
+    s_exposure = s_meta[o_match.end():o_match.start()+200]
+    # drop the opening bracket and everything from the closing bracket on
+    s_exposure = s_exposure[1:s_exposure.find(']')]
+    ls_exposure = s_exposure.split(',')
+    li_exposure = [int(item)/1000000 for item in ls_exposure]
+    return(li_exposure,s_meta)
+
+def get_exposure_sample(s_sample,df_img):
+    """
+    return a dataframe with all exposure times for a sample (slide)
+    s_sample = pattern; images whose df_img index label contains it are read
+    df_img = dataframe indexed by image filename
+    returns:
+        df_exposure = one row per image (row label = filename),
+        one column per exposure time; empty dataframe if nothing matches
+    """
+    #make dataframe of exposure time metadata
+    df_sample = df_img[df_img.index.str.contains(s_sample)]
+    # collect one series per image and build the dataframe once;
+    # DataFrame.append was removed in pandas 2.0
+    lse_times = []
+    for s_image in df_sample.index:
+        print(s_image)
+        li_exposure, s_meta = get_exposure(s_image)
+        lse_times.append(pd.Series(li_exposure,name=s_image))
+    df_exposure = pd.DataFrame(lse_times)
+    return(df_exposure)
+
+def get_meta(s_image, s_find = r'Scene\|CenterPosition\<\/Key\>\<Value\>\['):
+    """czi scene metadata
+    s_image = filename
+    s_find = regular expression to find in the omexml metadata; the wanted
+        comma separated values are expected directly after the match and
+        are read up to the next closing bracket
+    returns:
+        ls_exposure = list of the value strings following s_find
+        s_meta = the whole metadata string
+    raises:
+        IndexError if s_find does not occur in the metadata
+    """
+    # NOTE(review): the default pattern in the source read
+    # 'Scene\|CenterPosition\<\/Key\>\\[' ; \<Value\> was restored before the
+    # bracket on the assumption that it was lost the same way as in
+    # get_exposure's default (whose literal was left unterminated) -- confirm
+    # against real metadata.
+    s_meta = bioformats.get_omexml_metadata(path=s_image)
+
+    lo_match = list(re.finditer(s_find, s_meta))
+    if len(lo_match)!=1:
+        print('Error: found wrong number of exposure times')
+    if len(lo_match)==0:
+        raise IndexError(f'no match for {s_find} in the metadata of {s_image}')
+
+    # text behind the first match, limited to the 200 character window that
+    # starts at the beginning of the match. slicing at the end of the match
+    # replaces str.strip(s_find), which strips a character set, not a prefix.
+    o_match = lo_match[0]
+    s_exposure = s_meta[o_match.end():o_match.start()+200]
+    s_exposure = s_exposure[0:s_exposure.find(']')]
+    ls_exposure = s_exposure.split(',')
+    #li_exposure = [int(item)/1000000 for item in ls_exposure]
+    return(ls_exposure,s_meta)
+
+def scene_position(czidir,type):
+    """
+    get a dataframe of scene positions for each round/scene in TMA
+    czidir = directory with the czi files; becomes the working directory
+    type = passed through to cmif.parse_czi
+    returns:
+        df_img = cmif.parse_czi dataframe with Scene_X and Scene_Y added
+        (first and second value returned by get_meta), sorted by
+        rounds, scanID and scene, without the data column
+    """
+    os.chdir(f'{czidir}')
+    df_img = cmif.parse_czi('.',type=type)
+
+    #javabridge.start_vm(class_path=bioformats.JARS)
+    for s_image in df_img.index:
+        print(s_image)
+        ls_exposure,s_meta = get_meta(s_image)
+        df_img.loc[s_image,'Scene_X'] = ls_exposure[0]
+        df_img.loc[s_image,'Scene_Y'] = ls_exposure[1]
+
+    #javabridge.kill_vm()
+
+    # the extra get_meta call that followed the return statement could
+    # never run and was removed
+    df_img = df_img.sort_values(['rounds','scanID','scene']).drop('data',axis=1)
+    return(df_img)
+
+def exposure_times_scenes(df_img,codedir,czidir,s_end='.czi'):
+    """
+    write one csv of exposure times per slide, using a single scene
+    df_img = dataframe with slide and scene columns, indexed by filename
+    codedir = directory the {slide}_ExposureTimes.csv files are written to
+    czidir = directory with the image files; becomes the working directory
+    s_end = text a filename must contain to be considered
+    """
+    #go to directory
+    os.chdir(czidir)
+    #the scene label is parsed from the second file (sorted by name)
+    ls_file = sorted(s_file for s_file in os.listdir() if s_file.find(s_end) > -1)
+    s_test = ls_file[1]#[0]
+    s_find = s_test.split('-Scene-')[1].split(s_end)[0]
+    #export exposure time
+    for s_sample in sorted(set(df_img.slide)):
+        print(s_sample)
+        ab_select = (df_img.slide==s_sample) & (df_img.scene==s_find)
+        df_img_slide = df_img[ab_select]
+        print(len(df_img_slide))
+        df_exp = get_exposure_sample(s_sample,df_img_slide)
+        df_exp.to_csv(f'{codedir}/{s_sample}_ExposureTimes.csv',header=True,index=True)
+
+def exposure_times(df_img,codedir,czidir):
+    """
+    write one csv of exposure times per slide
+    df_img = dataframe with a slide column, indexed by filename
+    codedir = directory the {slide}_ExposureTimes.csv files are written to
+    czidir = directory with the image files; becomes the working directory
+    """
+    #go to directory
+    os.chdir(czidir)
+    print(czidir)
+    #export exposure time
+    ls_slide = sorted(set(df_img.slide))
+    for s_sample in ls_slide:
+        df_exp = get_exposure_sample(s_sample,df_img[df_img.slide==s_sample])
+        s_csv = f'{codedir}/{s_sample}_ExposureTimes.csv'
+        df_exp.to_csv(s_csv,header=True,index=True)
+    #close java virtual machine
+    #javabridge.kill_vm()
+
+def exposure_times_slide(df_img,codedir,czidir):
+    """
+    export exposure times: exposure_times for a single scene,
+    exposure_times_scenes for several scenes, nothing if df_img has no scene
+    """
+    i_scene = len(df_img.scene.unique())
+    if i_scene > 1:
+        exposure_times_scenes(df_img,codedir,czidir,s_end='.czi')
+    elif i_scene == 1:
+        exposure_times(df_img,codedir,czidir)
+
+def export_tiffs(df_img, s_sample,tiffdir):
+ """
+ export the tiffs of each tile
+ """
+ # df_img: dataframe with a slide column, indexed by image path
+ # s_sample: slide whose images are exported
+ # tiffdir: directory the *_c{channel}_ORG.tif files are written to
+ # returns (a_test, aa_test, img): one channel, the whole 16 bit array and
+ # the array as loaded by bioformats.load_image
+ # NOTE(review): indentation was lost in this view; the two break
+ # statements appear to end the channel loop and the image loop after
+ # their first iteration, which would make this a debugging helper that
+ # only writes one file -- confirm.
+ # NOTE(review): if no row matches s_sample the loop body never runs and
+ # the names used after the loop are unbound.
+ #start java virtual machine
+ #javabridge.start_vm(class_path=bioformats.JARS)
+
+ #export tiffs
+ df_img_slide = df_img[df_img.slide==s_sample]
+ for path in df_img_slide.index:
+ print(path)
+ img = bioformats.load_image(path) #looks like it only loads the first tile
+ # scale by 65535 and cast to uint16
+ # (assumes load_image returns values in the range 0 to 1 -- confirm)
+ img_new = img*65535
+ img_16 = img_new.astype(np.uint16)
+ # the third array axis is treated as the channel axis
+ i_channels = img_16.shape[2]
+ for i_channel in range(i_channels):
+ print(f'channel {i_channel}')
+ bioformats.write_image(f'{tiffdir}/{path.split(".czi")[0]}_c{str(i_channel+1)}_ORG.tif', pixels=img_16[:,:,i_channel],pixel_type='uint16')
+ break
+ break
+ a_test = img_16[:,:,i_channel]
+ aa_test = img_16
+ #javabridge.kill_vm()
+ return(a_test,aa_test, img)
diff --git a/mplex_image/mics.py b/mplex_image/mics.py
new file mode 100755
index 0000000..d16b479
--- /dev/null
+++ b/mplex_image/mics.py
@@ -0,0 +1,581 @@
+# wrapper functions for codex image processing
+
+from mplex_image import preprocess, mpimage, getdata, process, analyze, cmif, features, ometiff
+import os
+import pandas as pd
+import math
+import skimage
+from skimage import io, filters
+import re
+import numpy as np
+import json
+from skimage.util import img_as_uint
+import tifffile
+
def parse_processed():
    '''
    Parse the file names of processed Macsima images.

    Reads the "ome.tif" files starting with 'R' in the working directory
    (via mpimage.filename_dataframe), rewrites the DAPI, autofluorescence and
    empty entries of the 'data' column so they contain a '__' separator like
    the other names, and then splits each name into its parts.

    return: df_img, indexed by file name, with the columns marker, cycle,
        rounds, clone, exposure, channel, color, rack, slide and scene added.
        Underscores inside the marker are replaced by dashes in 'data' only;
        the 'marker' column keeps the text as first parsed.
    '''
    df_img = mpimage.filename_dataframe(s_end ="ome.tif",s_start='R',s_split='___')

    def _standardize(s_find, s_old, s_new):
        #rewrite the data entry of each file whose name contains s_find
        ls_match = df_img.index[df_img.index.str.contains(s_find)].tolist()
        d_name = {s_name: s_name.replace(s_old, s_new) for s_name in ls_match}
        df_img['data'] = df_img.data.replace(d_name)

    def _marker_of(s_name):
        #text between the cycle token (4th '_' field) and the first '__'
        s_cycle = s_name.split('_')[3]
        return s_name.split(f"{s_cycle}_")[-1].split('__')[0]

    #standardize dapi, AF and empty naming
    _standardize('DAPI', 'DAPIV0', 'DAPI__DAPIV0')
    _standardize('autofluorescence', 'autofluorescence_FITC', 'autofluorescence-FITC__FITC')
    _standardize('autofluorescence', 'autofluorescence_PE', 'autofluorescence-PE__PE')
    _standardize('empty', 'empty', 'empty__empty')

    #fields read before the marker names are rewritten
    df_img['marker'] = [_marker_of(s_name) for s_name in df_img.data]
    df_img['cycle'] = [s_name.split('_')[3] for s_name in df_img.data]
    df_img['rounds'] = [s_name.split('_')[3].replace('C-','R') for s_name in df_img.data]
    df_img['clone'] = [s_name.split('__')[1].split('.')[0] for s_name in df_img.data]

    #standardize marker naming: '_' inside the marker becomes '-'
    d_dash = {s_marker: s_marker.replace('_','-') for s_marker in df_img.marker.tolist()}
    df_img['data'] = [s_name.replace(_marker_of(s_name), d_dash[_marker_of(s_name)]) for s_name in df_img.data]

    #fields read from the rewritten names
    df_img['exposure'] = [int(s_name.split('__')[1].split('_')[1].split('.')[0]) for s_name in df_img.data]
    df_img['channel'] = [s_name.split('__')[1].split('_')[0].split('.')[1] for s_name in df_img.data]
    d_color = {'DAPI':'c1', 'FITC':'c2', 'PE':'c3', 'APC':'c4'}
    df_img['color'] = [d_color[s_channel] for s_channel in df_img.channel]
    for i_field, s_column in enumerate(['rack','slide','scene']):
        df_img[s_column] = [s_name.split('_')[i_field] for s_name in df_img.data]
    return(df_img)
+
def parse_org():
    '''
    Parse the file names of copied (name-standardized) Macsima images.

    Reads the "tif" files starting with 'R' in the working directory (via
    mpimage.filename_dataframe) and splits each name held in the 'data'
    column into its parts.

    return: df_img, indexed by file name, with the columns marker, cycle,
        rounds, clone, exposure, channel, color, rack, slide, scene,
        slide_scene and path (working directory + file name) added
    '''
    s_cwd = os.getcwd()
    df_img = mpimage.filename_dataframe(s_end ="tif",s_start='R',s_split='___')
    d_color = {'DAPI':'c1', 'FITC':'c2', 'PE':'c3', 'APC':'c4'}

    def _cycle(s_name):
        #fourth '_' separated field
        return s_name.split('_')[3]

    def _tail(s_name):
        #second '__' separated field: clone/channel and exposure
        return s_name.split('__')[1]

    df_img['marker'] = [s_name.split(f"{_cycle(s_name)}_")[-1].split('__')[0] for s_name in df_img.data]
    df_img['cycle'] = [_cycle(s_name) for s_name in df_img.data]
    df_img['rounds'] = [_cycle(s_name).replace('C-','R') for s_name in df_img.data]
    df_img['clone'] = [_tail(s_name).split('.')[0] for s_name in df_img.data]
    df_img['exposure'] = [int(_tail(s_name).split('_')[1].split('.')[0]) for s_name in df_img.data]
    df_img['channel'] = [_tail(s_name).split('_')[0].split('.')[1] for s_name in df_img.data]
    df_img['color'] = [d_color[s_channel] for s_channel in df_img.channel]
    #rack, slide and scene are the first three '_' separated fields
    for i_field, s_column in enumerate(['rack','slide','scene']):
        df_img[s_column] = [s_name.split('_')[i_field] for s_name in df_img.data]
    df_img['slide_scene'] = df_img.slide + '_' + df_img.scene
    df_img['path'] = [f"{s_cwd}/{s_name}" for s_name in df_img.index]
    return(df_img)
+
def copy_processed(df_img,regdir,i_lines=32639):
    '''
    Copy the highest unsaturated exposure of each marker, and every DAPI
    image, into regdir for processing.

    df_img: dataframe indexed by image file name with the columns marker,
        cycle, exposure, rounds and data (as produced by parse_processed)
    regdir: destination folder; one subfolder per file-name prefix (the part
        of the name before the cycle token) is created inside it
    i_lines: pixel value overwritten with the image minimum; the comments
        below call these pixels "lines"

    NOTE(review): indentation was reconstructed from a flattened source. The
    `else` is paired with the `if` (not the `for`) and the DAPI loop is
    placed at function level - confirm against the original file.
    NOTE(review): as laid out here, nothing is saved for a marker/cycle whose
    images all reach 65535.
    '''
    #markers other than DAPI, autofluorescence and empty
    for s_marker in sorted(set(df_img.marker) - {'DAPI','autofluorescence','empty'}):
        df_marker = df_img[df_img.marker==s_marker]
        for s_cycle in sorted(set(df_marker.cycle)):
            #try the longest exposure first
            for s_index in df_marker[df_marker.cycle==s_cycle].sort_values('exposure',ascending=False).index.tolist():
                a_img = io.imread(s_index)
                #folder = file name up to the cycle token; file = data name without .ome.tif
                s_dir_new = s_index.split(f"_{df_img.loc[s_index,'cycle']}")[0]
                s_index_new = df_img.loc[s_index,'data'].split('.ome.tif')[0]
                preprocess.cmif_mkdir([f'{regdir}/{s_dir_new}'])
                print(a_img.max())
                #get rid of lines
                a_img[a_img==i_lines] = a_img.min()
                #keep the first image whose maximum is below the uint16 ceiling
                if a_img.max() < 65535:
                    io.imsave(f'{regdir}/{s_dir_new}/{s_index_new}.tif',a_img,plugin='tifffile',check_contrast=False)
                    break
                else:
                    print('Try lower exposure time')
    #every DAPI image is copied, whatever its maximum
    for s_index in df_img[df_img.marker=='DAPI'].index.tolist():
        a_img = io.imread(s_index)
        print(f'DAPI max: {a_img.max()}')
        if df_img.loc[s_index,'rounds'] != 'R0': #keep lines in R0 dapi, for segmentation
            a_img[a_img==i_lines] = a_img.min()
        s_dir_new = s_index.split(f"_{df_img.loc[s_index,'cycle']}")[0]
        s_index_new = df_img.loc[s_index,'data'].split('.ome.tif')[0]
        preprocess.cmif_mkdir([f'{regdir}/{s_dir_new}'])
        io.imsave(f'{regdir}/{s_dir_new}/{s_index_new}.tif',a_img,plugin='tifffile',check_contrast=False)
+
def extract_cellpose_features(s_sample, segdir, regdir, ls_seg_markers, nuc_diam, cell_diam):
    '''
    Extract per-cell features from Cellpose segmentation results.

    For every scene that has a matched cell segmentation file in
    {segdir}/{s_sample}Cellpose_Segmentation: load the DAPI image, the nuclei
    labels and the matched cell labels, derive the subcellular regions, and
    measure every registered image of that scene (found with parse_org) in
    every region. Mean intensity is measured in all regions; centroid, area
    and eccentricity are added for nuclei, and euler number, area and
    eccentricity for cells.

    s_sample: sample name; prefix of the segmentation folder and of the cell ids
    segdir: folder that holds the {s_sample}Cellpose_Segmentation folder
    regdir: folder with the registered images, optionally inside a subfolder
        named after the sample or the scene
    ls_seg_markers: cell segmentation markers ('.'-joined in the file names)
    nuc_diam, cell_diam: diameters that appear in the segmentation file names

    return: (df_sample, df_thresh)
        df_sample: one row per nucleus label, index {s_sample}_cell{label};
            empty dataframe when no scene was found
        df_thresh: li, otsu and triangle thresholds, one row per image

    side effect: changes the working directory

    NOTE(review): indentation was reconstructed from a flattened source;
    otsu and triangle thresholds are both placed under the mean > 0 check.
    '''
    #one feature table per scene, concatenated once at the end
    ls_feat = []
    df_thresh = pd.DataFrame()
    os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation')
    ls_scene = []
    d_match = {}
    for s_file in os.listdir():
        if s_file.find(f'{".".join(ls_seg_markers)} nuc{nuc_diam} matchedcell{cell_diam} - Cell Segmentation Basins')>-1:
            ls_scene.append(s_file.split(f'_{".".join(ls_seg_markers)}')[0])
            d_match.update({s_file.split(f'_{".".join(ls_seg_markers)}')[0]:s_file})
    for s_scene in ls_scene:
        os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation')
        print(f'processing {s_scene}')
        #the DAPI png of this scene
        for s_file in os.listdir():
            if s_file.find(s_scene) > -1:
                if s_file.find("DAPI.png") > -1:
                    s_dapi = s_file
        dapi = io.imread(f'{segdir}/{s_sample}Cellpose_Segmentation/{s_dapi}')
        print(f'loading {s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif')
        labels = io.imread(f'{s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif')
        cell_labels = io.imread(f'{segdir}/{s_sample}Cellpose_Segmentation/{d_match[s_scene]}')
        print(f'loading {d_match[s_scene]}')
        #nuclear features: start with one row per nucleus label
        df_feat = features.extract_feat(labels,dapi, properties=(['label']))
        df_feat.columns = [f'{item}_segmented-nuclei' for item in df_feat.columns]
        df_feat.index = [f'{s_sample}_cell{item}' for item in df_feat.loc[:,'label_segmented-nuclei']]

        #get subcellular regions
        cyto = features.label_difference(labels,cell_labels)
        d_loc_nuc = features.subcellular_regions(labels, distance_short=2, distance_long=5)
        d_loc_cell = features.subcellular_regions(cell_labels, distance_short=2, distance_long=5)
        d_loc = {'nuclei':labels,'cell':cell_labels,'cytoplasm':cyto,
            'nucmem':d_loc_nuc['membrane'][0],'cellmem':d_loc_cell['membrane'][0],
            'perinuc5':d_loc_nuc['ring'][1],'exp5':d_loc_nuc['grown'][1],
            'nucadj2':d_loc_nuc['straddle'][0],'celladj2':d_loc_cell['straddle'][0]}

        #subdir organized by slide or scene
        if os.path.exists(f'{regdir}/{s_sample}'):
            os.chdir(f'{regdir}/{s_sample}')
        elif os.path.exists(f'{regdir}/{s_scene}'):
            os.chdir(f'{regdir}/{s_scene}')
        else:
            os.chdir(f'{regdir}')
        df_img = parse_org()
        #rounds numbered 90 and above are left out
        df_img['round_int'] = [int(re.sub('[^0-9]','', item)) for item in df_img.rounds]
        df_img = df_img[df_img.round_int < 90]
        df_img = df_img.sort_values('round_int')
        #take into account slide (well)
        df_scene = df_img[df_img.slide_scene==s_scene]
        #load each image
        for s_index in df_scene.index:
            intensity_image = io.imread(s_index)
            df_thresh.loc[s_index,'threshold_li'] = filters.threshold_li(intensity_image)
            if intensity_image.mean() > 0:
                df_thresh.loc[s_index,'threshold_otsu'] = filters.threshold_otsu(intensity_image)
                df_thresh.loc[s_index,'threshold_triangle'] = filters.threshold_triangle(intensity_image)
            s_marker = df_scene.loc[s_index,'marker']
            print(f'extracting features {s_marker}')
            #DAPI is imaged every round: make the name unique with the round number
            if s_marker == 'DAPI':
                s_marker = s_marker + f'{df_scene.loc[s_index,"rounds"].split("R")[1]}'
            for s_loc, a_loc in d_loc.items():
                if s_loc == 'nuclei':
                    df_marker_loc = features.extract_feat(a_loc,intensity_image, properties=(['mean_intensity','centroid','area','eccentricity','label']))
                    df_marker_loc.columns = [f'{s_marker}_{s_loc}',f'{s_marker}_{s_loc}_centroid-0',f'{s_marker}_{s_loc}_centroid-1',f'{s_marker}_{s_loc}_area',f'{s_marker}_{s_loc}_eccentricity',f'{s_marker}_{s_loc}_label']
                elif s_loc == 'cell':
                    df_marker_loc = features.extract_feat(a_loc,intensity_image, properties=(['mean_intensity','euler_number','area','eccentricity','label']))
                    df_marker_loc.columns = [f'{s_marker}_{s_loc}',f'{s_marker}_{s_loc}_euler',f'{s_marker}_{s_loc}_area',f'{s_marker}_{s_loc}_eccentricity',f'{s_marker}_{s_loc}_label']
                else:
                    df_marker_loc = features.extract_feat(a_loc,intensity_image, properties=(['mean_intensity','label']))
                    df_marker_loc.columns = [f'{s_marker}_{s_loc}',f'{s_marker}_{s_loc}_label']
                #set array ids as index
                df_marker_loc.index = df_marker_loc.loc[:,f'{s_marker}_{s_loc}_label']
                df_marker_loc.index = [f'{s_sample}_cell{item}' for item in df_marker_loc.index]
                #left join keeps one row per nucleus label
                df_feat = df_feat.merge(df_marker_loc, left_index=True,right_index=True,how='left',suffixes=('',f'{s_marker}_{s_loc}'))
        ls_feat.append(df_feat)
    #DataFrame.append was removed in pandas 2.0: concatenate the scene tables instead
    if ls_feat:
        df_sample = pd.concat(ls_feat)
    else:
        df_sample = pd.DataFrame()
    return(df_sample, df_thresh)
+
def combine_labels(s_sample,segdir, subdir, ls_seg_markers, nuc_diam, cell_diam, df_mi_full,s_thresh):
    '''
    Combine the cell and nuclei segmentation labels of each scene.

    - load cell labels; delete cells that were not used for cytoplasm (i.e. ecad neg)
    - nuc labels, expand to perinuc 5 and then cut out the cell labels
    - keep track of cells that are completely covered by another cell (or two or three: counts as touching).

    s_sample: slide name, matched against df_mi_full.slide
    segdir: segmentation folder; {segdir}/{s_sample}Cellpose_Segmentation is
        used when it exists, otherwise segdir itself
    subdir: not used
    ls_seg_markers: cell segmentation markers ('.'-joined in the file names)
    nuc_diam, cell_diam: diameters that appear in the label file names
    df_mi_full: dataframe indexed by cell id (..._cell<label>) with the
        columns slide, scene and {s_thresh}_negative (boolean)
    s_thresh: name of the marker whose negative cells lose their cell label

    return: (labels, combine, dd_result) = nuclei labels and combined labels
        of the last processed scene (None when no scene was processed) and
        the dictionary {sample_scene: {combined label: [nuclei labels under it]}}

    side effects: changes the working directory; writes the combined label
    tiff of every processed scene and
    result_{s_sample}_cellsatop_dictionary.json

    A scene whose nuclei or cell label file is missing is reported and
    skipped. Previously the code carried on with undefined arrays or with the
    arrays of the previous scene.

    NOTE(review): indentation was reconstructed from a flattened source; the
    json file is written once, after the scene loop.
    '''
    se_neg = df_mi_full[df_mi_full.slide == s_sample].loc[:,f'{s_thresh}_negative']
    dd_result = {}
    #stay None when no scene has both label images
    labels = None
    combine = None
    if os.path.exists(f'{segdir}/{s_sample}Cellpose_Segmentation'):
        os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation')
    else:
        os.chdir(segdir)
    #scenes that have a DAPI png here and cells in df_mi_full
    ls_scene = []
    for s_file in os.listdir():
        if s_file.find(' - DAPI.png') > -1:
            ls_scene.append(s_file.split(' - DAPI.png')[0])
    ls_scene = sorted(set(df_mi_full[df_mi_full.slide == s_sample].scene) & set(ls_scene))
    s_seg = ".".join(ls_seg_markers)
    for s_scene in ls_scene:
        se_neg_scene = se_neg[se_neg.index.str.contains(s_scene)]

        print(f'Processing combined segmentaiton labels for {s_scene}')
        s_nuc_file = f'{s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif'
        #cell label naming schemes, tried in this order
        ls_cell_file = [
            f'{s_scene} matchedcell{cell_diam} - Cell Segmentation Basins.tif',
            f'{s_scene}_{s_seg} matchedcell{cell_diam} - Cell Segmentation Basins.tif',
            f'{s_scene}_{s_seg} nuc{nuc_diam} matchedcell{cell_diam} - Cell Segmentation Basins.tif',
        ]
        s_cell_file = next((s_file for s_file in ls_cell_file if os.path.exists(s_file)), None)
        b_missing = False
        if not os.path.exists(s_nuc_file):
            print('no nuclei labels found')
            b_missing = True
        if s_cell_file is None:
            print('no cell labels found')
            b_missing = True
        if b_missing:
            #nothing to combine for this scene
            continue
        labels = io.imread(s_nuc_file)
        cell_labels = io.imread(s_cell_file)
        #set non-ecad cell labels to zero
        a_zeros = np.array([int(item.split('_cell')[1]) for item in se_neg_scene[se_neg_scene].index]).astype('int64')
        mask = np.isin(cell_labels, a_zeros)
        cell_labels_copy = cell_labels.copy()
        cell_labels_copy[mask] = 0
        #make the nuclei under cells zero
        distance = 5
        _, labels_exp = features.expand_label(labels,distance=distance)
        labels_exp[cell_labels_copy > 0] = 0
        #combine cells and expanded nuclei
        combine = (labels_exp + cell_labels_copy)
        if s_scene.find('Scene') == 0:
            io.imsave(f'{s_sample}_{s_scene.replace("Scene ","scene")}_cell{cell_diam}_nuc{nuc_diam}_CombinedSegmentationBasins.tif',combine)
        else:
            io.imsave(f'{s_scene}_{".".join(ls_seg_markers)}-cell{cell_diam}_exp{distance}_CellSegmentationBasins.tif',combine)
        #figure out the covered cells...labels + combined
        not_zero_pixels = np.array([labels.ravel() !=0,combine.ravel() !=0]).all(axis=0)
        a_tups = np.array([combine.ravel()[not_zero_pixels],labels.ravel()[not_zero_pixels]]).T #combined over nuclei
        unique_rows = np.unique(a_tups, axis=0)
        #combined label -> nuclei labels found under it, other than its own
        d_atop = {}
        for i_combined, i_nuc in unique_rows:
            if i_combined != i_nuc:
                d_atop.setdefault(i_combined, []).append(i_nuc)
        #from elmar (reformat cells touching dictionary and save)
        d_result = {str(i_cell): [str(i_touch) for i_touch in li_touch] for i_cell, li_touch in d_atop.items()}
        dd_result.update({f'{s_sample}_{s_scene.replace("Scene ","scene")}':d_result})
    #save dd_touch as json file
    with open(f'result_{s_sample}_cellsatop_dictionary.json','w') as f:
        json.dump(dd_result, f)
    print('')
    return(labels,combine,dd_result)
+
def cropped_ometiff(s_sample,subdir,cropdir,d_crop,d_combos,s_dapi,tu_dim):
    '''
    Write cropped multi-page ome-tiffs for one sample.

    s_sample: sample name; key into d_crop and, when it exists, the subfolder
        of subdir that becomes the working directory
    subdir: folder that may hold a folder named after the sample
    cropdir: folder the {crop}_{type}.ome.tif files are written to
    d_crop: dictionary keyed by sample; only the s_sample entry is used
    d_combos, s_dapi, tu_dim: passed unchanged to mpimage.overlay_crop

    NOTE(review): indentation was reconstructed from a flattened source; only
    the chdir is placed under the `if`, so images are parsed from the current
    directory when the sample folder does not exist - confirm.
    '''
    if os.path.exists(f'{subdir}/{s_sample}'):
        os.chdir(f'{subdir}/{s_sample}')
    df_img = parse_org()
    #every image is assigned to the sample, replacing the parsed scene
    df_img['scene'] = s_sample
    d_crop_scene = {s_sample:d_crop[s_sample]}
    dd_result = mpimage.overlay_crop(d_combos,d_crop_scene,df_img,s_dapi,tu_dim)
    for s_crop, d_result in dd_result.items():
        for s_type, (ls_marker, array) in d_result.items():
            print(f'Generating multi-page ome-tiff {[item for item in ls_marker]}')
            #add two leading axes of length one before writing
            new_array = array[np.newaxis,np.newaxis,:]
            s_xml = ometiff.gen_xml(new_array, ls_marker)
            with tifffile.TiffWriter(f'{cropdir}/{s_crop}_{s_type}.ome.tif') as tif:
                tif.save(new_array, photometric = "minisblack", description=s_xml, metadata = None)
+
+
+#old
def convert_dapi(debugdir,regdir,b_mkdir=True):
    '''
    convert dapi to tif, rename to match Guillaumes pipeline requirements

    For every folder in debugdir whose name starts with 'R-1_', each file
    without 'bleach' in its name is read and written again as
    {regdir}/{tissue}/{scene}_R{round}_DAPI_V0_c1_ORG_5.0.tif, where the round
    is the text between "Cycle(" and ").ome.tif" in the file name.

    debugdir: folder with the 'R-1_...' folders
    regdir: destination folder
    b_mkdir: create {regdir}/{tissue} first (default True)

    The working directory is restored on return, also when an error is raised.
    Images are written with the tifffile module imported by this file;
    skimage.external.tifffile is not available in current scikit-image.

    NOTE(review): indentation was reconstructed from a flattened source.
    '''
    cwd = os.getcwd()
    os.chdir(debugdir)
    try:
        for s_dir in sorted(os.listdir()):
            if s_dir.find('R-1_') != 0:
                continue
            os.chdir(s_dir)
            for s_file in sorted(os.listdir()):
                if s_file.find('bleach') > -1:
                    continue
                s_round = s_file.split("Cycle(")[1].split(").ome.tif")[0]
                print(f'stain {s_round}')
                s_dir_new = s_dir.split('_')[2] + '-Scene-0' + s_dir.split('F-')[1]
                s_tissue_dir = s_dir.split('_F-')[0]
                if b_mkdir:
                    preprocess.cmif_mkdir([f'{regdir}/{s_tissue_dir}'])
                a_dapi = skimage.io.imread(s_file)
                #rename with standard name (no stain !!!!)
                with tifffile.TiffWriter(f'{regdir}/{s_tissue_dir}/{s_dir_new}_R{s_round}_DAPI_V0_c1_ORG_5.0.tif') as tif:
                    tif.save(a_dapi)
            os.chdir('..')
    finally:
        os.chdir(cwd)
+
def convert_channels(processdir, regdir, b_rename=True, testbool=True):
    '''
    convert channels to tif, select one exposure time of three, rename to match Guillaumes pipeline requirements

    For every folder in processdir whose name starts with 'R-1_':
    - b_rename True: only rename the files with preprocess.dchange_fname
      (testbool is passed on as its b_test argument)
    - b_rename False: parse the file names and, per marker, copy the image
      with the longest exposure whose maximum is below 65535 to
      {regdir}/{tissue}/{scene}_R{name}.tif; when every longer exposure is
      saturated the shortest exposure is copied whatever its maximum

    processdir: folder with the 'R-1_...' folders
    regdir: destination folder; {regdir}/{tissue} is not created here

    Changes compared with the earlier version: the working directory saved at
    the start is now restored on return (it was saved but never used), images
    are written with the tifffile module imported by this file instead of
    skimage.external.tifffile, and the two identical save branches are merged.

    NOTE(review): indentation was reconstructed from a flattened source.
    '''
    cwd = os.getcwd()
    os.chdir(processdir)
    try:
        for s_dir in sorted(os.listdir()):
            if s_dir.find('R-1_') != 0:
                continue
            os.chdir(s_dir)
            if b_rename:
                d_rename = {'autofluorescencePE_P':'autofluorescencePE_V0_P',
                    'autofluorescenceFITC_F':'autofluorescenceFITC_V0_F',
                    '000_DAPIi':'extra000_DAPIi',
                    '000_DAPIf':'extra000_DAPIf',
                    'extraextraextra':'extra',
                    'extraextra':'extra',
                    '_FITC_':'_c2_ORG_',
                    '_PE_':'_c3_ORG_',}
                preprocess.dchange_fname(d_rename,b_test=testbool)
            else:
                #parse file names
                ls_column = ['rounds','marker','dilution','fluor','ORG','exposure','expdecimal','imagetype1','imagetype']
                df_img = mpimage.parse_img(s_end =".tif",s_start='0',s_sep1='_',s_sep2='.',ls_column=ls_column,b_test=False)
                df_img['exposure'] = df_img.exposure.astype(dtype='int')
                for s_marker in sorted(set(df_img.marker)):
                    #longest exposure first
                    df_sort = df_img[df_img.marker==s_marker].sort_values(by=['exposure'],ascending=False)
                    i_last = len(df_sort.index) - 1
                    for idx, s_index in enumerate(df_sort.index):
                        a_img = skimage.io.imread(s_index)
                        df_file = df_sort.loc[s_index,:]
                        print(a_img.max())
                        b_lowest = idx == i_last
                        if not b_lowest and not a_img.max() < 65535:
                            #saturated and a shorter exposure is still available
                            print('Try lower exposure time')
                            continue
                        if b_lowest:
                            print(f'Selected as the lowest exposure time {df_file.exposure} for {df_file.marker}')
                        else:
                            print(f'Selected {df_file.exposure} for {df_file.marker}')
                        s_dir_new = s_dir.split('_')[2] + '-Scene-0' + s_dir.split('F-')[1]
                        s_tissue_dir = s_dir.split('_F-')[0]
                        s_index_new = s_index.split(".ome.tif")[0]
                        with tifffile.TiffWriter(f'{regdir}/{s_tissue_dir}/{s_dir_new}_R{s_index_new}.tif') as tif:
                            tif.save(a_img)
                        break
            os.chdir('..')
    finally:
        os.chdir(cwd)
+
def parse_converted(regdir):
    '''
    Parse the converted miltenyi file names found in the working directory.

    Reads the ".tif" files starting with 'G' (via mpimage.filename_dataframe,
    split on '_'); the parsed 'data' column is renamed to 'scene' and the
    remaining columns are taken from the '_' separated fields of each name.

    regdir: prefix used to build the 'path' column

    return: df_img, indexed by file name, with the columns scene, rounds,
        marker, dilution, color, scene_int, exposure, path and tissue.
        'tissue' is the working directory as returned by os.getcwd() and
        'path' is {regdir}/{working directory}/{file name}.
    '''
    s_cwd = os.getcwd()
    df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='G',s_split='_')
    df_img = df_img.rename(columns={'data':'scene'})
    #fields 1-4 of the file name
    for i_field, s_column in ((1,'rounds'),(2,'marker'),(3,'dilution'),(4,'color')):
        df_img[s_column] = [s_file.split('_')[i_field] for s_file in df_img.index]
    #number that follows 'Scene-' in the scene name
    df_img['scene_int'] = [s_scene.split('Scene-')[1] for s_scene in df_img.scene]
    df_img['scene_int'] = df_img.scene_int.astype(dtype='int')
    #field 6 up to the first '.'
    df_img['exposure'] = [s_file.split('_')[6].split('.')[0] for s_file in df_img.index]
    df_img['path'] = [f'{regdir}/{s_cwd}/{s_file}' for s_file in df_img.index]
    df_img['tissue'] = s_cwd
    return(df_img)
+
def parse_converted_dirs(regdir):
    '''
    parse the converted miltenyi file names,
    regdir is the master folder containing subfolders with ROIs/gates

    Every entry of regdir (in sorted order) is entered and parsed with
    parse_converted, which receives the entry name as its regdir argument.

    return: the parsed dataframes of all subfolders stacked into one;
        an empty dataframe when regdir has no entries

    side effect: the working directory is regdir on return
    '''
    os.chdir(regdir)
    ls_df_img = []
    for s_dir in sorted(os.listdir()):
        os.chdir(s_dir)
        print(s_dir)
        ls_df_img.append(parse_converted(s_dir))
        os.chdir('..')
    #DataFrame.append was removed in pandas 2.0: concatenate once instead
    if not ls_df_img:
        return(pd.DataFrame())
    return(pd.concat(ls_df_img))
+
def count_images(df_img,b_tile_count=True):
    """
    Print a summary of the images per tissue and count them per scene and color.

    df_img: dataframe with the columns scene, color, tissue and rounds
    b_tile_count: when True, also print and record the number of images for
        every scene/color pair

    return: df_count, scenes as rows and colors as columns; pairs without
        images (and every cell when b_tile_count is False) stay NaN
    """
    ls_row = sorted(set(df_img.scene))
    ls_column = sorted(set(df_img.color))
    df_count = pd.DataFrame(index=ls_row,columns=ls_column)
    for s_tissue in sorted(set(df_img.tissue)):
        df_tissue = df_img[df_img.tissue==s_tissue]
        ls_tile = sorted(set(df_tissue.scene))
        print(f'ROI {s_tissue}')
        print('tiles')
        for s_tile in ls_tile:
            print(s_tile)
        print(f'Number of images = {len(df_tissue)}')
        print(f'Rounds:')
        for s_round in sorted(set(df_tissue.rounds)):
            print(s_round)
        print('\n')
        if not b_tile_count:
            continue
        #images per scene and color
        for s_tile in ls_tile:
            df_tile = df_tissue[df_tissue.scene==s_tile]
            for s_color in sorted(set(df_tile.color)):
                i_image = len(df_tile[df_tile.color==s_color])
                print(f'{s_tile} {s_color} {i_image}')
                df_count.loc[s_tile,s_color] = i_image
    return(df_count)
+
def visualize_reg_images(regdir,qcdir,color='c1',tu_array=(3,2)):
    """
    Array the registered images of one color per scene, to check tissue
    identity, focus, etc.

    regdir: folder whose entries are each entered and parsed with parse_converted
    qcdir: folder in which RegisteredImages is created; one
        {scene}_registered_{color}.png is saved there per scene
    color: value of the color column to plot; 'c1' is labelled by marker and
        titled by round, any other color is labelled by color and titled by marker
    tu_array: grid shape passed to mpimage.array_img

    The working directory is restored at the end.
    """
    #check registration
    preprocess.cmif_mkdir([f'{qcdir}/RegisteredImages'])
    cwd = os.getcwd()
    os.chdir(regdir)
    for s_subdir in sorted(os.listdir()):
        os.chdir(s_subdir)
        print(s_subdir)
        df_img = parse_converted(s_subdir)
        for s_scene in sorted(set(df_img.scene)):
            print(s_scene)
            #images of this scene and color, ordered by round
            b_plot = (df_img.scene == s_scene) & (df_img.color==color)
            df_plot = df_img[b_plot].sort_values(['rounds'])
            if color == 'c1':
                s_xlabel, s_title, tu_fig = 'marker', 'rounds', (16,14)
            else:
                s_xlabel, s_title, tu_fig = 'color', 'marker', (16,12)
            fig = mpimage.array_img(df_plot,s_xlabel=s_xlabel,ls_ylabel=['rounds','exposure'],s_title=s_title,tu_array=tu_array,tu_fig=tu_fig)
            fig.savefig(f'{qcdir}/RegisteredImages/{s_scene}_registered_{color}.png')
        os.chdir('..')
    os.chdir(cwd)
+
def rename_files(d_rename,dir,b_test=True):
    """
    Change file names in every subfolder of dir.

    d_rename: dictionary passed to preprocess.dchange_fname
    dir: folder whose entries are each entered in turn (as {dir}/{entry})
    b_test: True runs dchange_fname with b_test=True, False runs it with
        b_test=False; a falsy value that does not equal False does nothing

    The file names of each subfolder are also parsed into a dataframe, which
    is not used further. The working directory is left in the last subfolder.
    """
    cwd = os.getcwd() #not used below
    os.chdir(dir)
    for s_subdir in sorted(os.listdir()):
        os.chdir(f'{dir}/{s_subdir}')
        print(s_subdir)
        df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='reg',s_split='_')
        df_img = df_img.rename(columns={'data':'scene'})
        df_img['rounds'] = [s_file.split('_')[1] for s_file in df_img.index]
        df_img['color'] = [s_file.split('_')[2] for s_file in df_img.index]
        df_img['marker'] = [s_file.split('_')[3].split('.')[0] for s_file in df_img.index]
        #dry run or real run
        if b_test:
            s_message, b_dry = 'This is a test', True
        elif b_test==False:
            s_message, b_dry = 'Changing name - not a test', False
        else:
            continue
        print(s_message)
        preprocess.dchange_fname(d_rename,b_test=b_dry)
+
def rename_fileorder(s_sample, dir, b_test=True):
    """
    Change the order of the fields in the file names of every subfolder of dir.

    Each ".tif" file starting with 'Scene' is mapped to
    {round}_{s_sample}-{scene}_{marker}_{color}_ORG.tif and that one-entry
    mapping is passed to preprocess.dchange_fname.

    s_sample: sample name put in front of the scene
    dir: folder whose entries are each entered in turn (as {dir}/{entry})
    b_test: True runs dchange_fname with b_test=True, False runs it with
        b_test=False; a falsy value that does not equal False does nothing

    NOTE(review): indentation was reconstructed from a flattened source; the
    rename call is placed inside the per-file loop, since d_rename holds one
    file at a time - confirm against the original file.
    NOTE(review): cwd is saved but never restored.
    """
    cwd = os.getcwd()
    os.chdir(dir)
    for idx, s_dir in enumerate(sorted(os.listdir())):
        s_path = f'{dir}/{s_dir}'
        os.chdir(s_path)
        print(s_dir)
        #parse scene, round, color and marker from the '_' separated file name
        df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='Scene',s_split='_')
        df_img.rename({'data':'scene'},axis=1,inplace=True)
        df_img['rounds'] = [item[1] for item in [item.split('_') for item in df_img.index]]
        df_img['color'] = [item[2] for item in [item.split('_') for item in df_img.index]]
        df_img['marker'] = [item[3].split('.')[0] for item in [item.split('_') for item in df_img.index]]
        for s_index in df_img.index:
            s_round = df_img.loc[s_index,'rounds']
            s_scene= f"{s_sample}-{df_img.loc[s_index,'scene']}"
            s_marker = df_img.loc[s_index,'marker']
            s_color = df_img.loc[s_index,'color']
            #new order: round, sample-scene, marker, color
            s_index_rename = f'{s_round}_{s_scene}_{s_marker}_{s_color}_ORG.tif'
            d_rename = {s_index:s_index_rename}
            if b_test:
                print('This is a test')
                preprocess.dchange_fname(d_rename,b_test=True)
            elif b_test==False:
                print('Changing name - not a test')
                preprocess.dchange_fname(d_rename,b_test=False)
            else:
                pass
+
+
def copy_files(dir,dapi_copy, marker_copy,testbool=True,type='codex'):
    """
    copy and rename files if needed as dummies
    need to edit

    dir: folder whose entries are each entered in turn (as {dir}/{entry})
    dapi_copy: dictionary {old round string: new cycle}; the key is split on
        '_' and its second field is matched against the rounds column
    marker_copy: dictionary {original marker: list of copies}
    testbool: passed on as b_test to preprocess.copy_dapis and copy_markers
    type: passed on to preprocess.copy_dapis and copy_markers

    NOTE(review): indentation was reconstructed from a flattened source; the
    marker loop is placed after the dapi loop, so dapi_item holds the value
    of the last dapi_copy entry when it is used - confirm against the
    original file.
    """
    os.chdir(dir)
    for idx, s_dir in enumerate(sorted(os.listdir())):
        s_path = f'{dir}/{s_dir}'
        os.chdir(s_path)
        #s_sample = s_dir.split('-Scene')[0]
        #parse scene, round, color and marker from the '_' separated file name
        df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='Scene',s_split='_')
        df_img.rename({'data':'scene'},axis=1,inplace=True)
        df_img['rounds'] = [item[1] for item in [item.split('_') for item in df_img.index]]
        df_img['color'] = [item[2] for item in [item.split('_') for item in df_img.index]]
        df_img['marker'] = [item[3].split('.')[0] for item in [item.split('_') for item in df_img.index]]
        print(s_dir)
        #if b_test:
        for key, dapi_item in dapi_copy.items():
            #c1 image of the round named in the key
            df_dapi = df_img[(df_img.rounds== key.split('_')[1]) & (df_img.color=='c1')]
            #first matching marker; [0] is meant as a position here
            s_dapi = df_dapi.loc[:,'marker'][0]
            preprocess.copy_dapis(s_r_old=key,s_r_new=f'_cyc{dapi_item}_',s_c_old='_c1_',
                s_c_new='_c2_',s_find=f'_c1_{s_dapi}_ORG.tif',b_test=testbool,type=type)
        #running total of copies made so far, added to the last dapi cycle
        i_count=0
        for idx,(key, item) in enumerate(marker_copy.items()):
            preprocess.copy_markers(df_img, s_original=key, ls_copy = item,
                i_last_round= dapi_item + i_count, b_test=testbool,type=type)
            i_count=i_count + len(item)
+
def segmentation_thresholds(regdir,qcdir, d_segment):
    """
    visualize binary mask of segmentation thresholds
    need to edit

    regdir: folder whose entries are each entered in turn (as {regdir}/{entry})
    qcdir: folder in which Segmentation is created; every figure returned by
        preprocess.check_seg_markers is saved there as
        {entry}_{key}_segmentation.png
    d_segment: passed unchanged to preprocess.check_seg_markers

    side effect: the working directory is left in the last subfolder
    """
    preprocess.cmif_mkdir([f'{qcdir}/Segmentation'])
    os.chdir(regdir)
    for s_subdir in sorted(os.listdir()):
        os.chdir(f'{regdir}/{s_subdir}')
        #parse scene, round, color and marker from the '_' separated file name
        df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='Scene',s_split='_')
        df_img = df_img.rename(columns={'data':'scene'})
        df_img['rounds'] = [s_file.split('_')[1] for s_file in df_img.index]
        df_img['color'] = [s_file.split('_')[2] for s_file in df_img.index]
        df_img['marker'] = [s_file.split('_')[3].split('.')[0] for s_file in df_img.index]
        print(s_subdir)
        d_fig = preprocess.check_seg_markers(df_img,d_segment, i_rows=1, t_figsize=(6,6)) #few scenes
        for s_key, fig in d_fig.items():
            fig.savefig(f'{qcdir}/Segmentation/{s_subdir}_{s_key}_segmentation.png')
+
+
def segmentation_inputs(s_sample,regdir,segdir,d_segment,b_start=False):
    """
    make inputs for guillaumes segmentation

    For every entry of regdir: parse the ".tif" files starting with 'R',
    build a table with one row per non-c1 image (rounds, colors, minimum,
    maximum, exposure, refexp, location), write it as RoundsCyclesTable.txt
    and metadata_{s_sample}_RoundsCyclesTable.csv in that folder, and create
    the cluster.java file with preprocess.cluster_java.

    s_sample: sample name used in the csv name and passed to cluster_java
    regdir: folder whose entries are each entered in turn (as {regdir}/{entry})
    segdir: passed to cluster_java as segmentdir
    d_segment: dictionary {marker: minimum} that overrides the default minimum
    b_start: when True, change to the JE{idx} job folder and run make_sh

    NOTE(review): indentation was reconstructed from a flattened source; all
    statements are placed inside the per-folder loop because they use idx
    and s_path - confirm against the original file.
    NOTE(review): the job folder is a hard-coded absolute path.
    """
    os.chdir(regdir)
    for idx, s_dir in enumerate(sorted(os.listdir())):
        s_path = f'{regdir}/{s_dir}'
        os.chdir(s_path)
        #here the first '_' field of the file name is the round
        df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='R',s_split='_')
        df_img.rename({'data':'rounds'},axis=1,inplace=True)
        #df_img['rounds'] = [item[1] for item in [item.split('_') for item in df_img.index]]
        df_img['color'] = [item[3] for item in [item.split('_') for item in df_img.index]]
        df_img['marker'] = [item[2] for item in [item.split('_') for item in df_img.index]]
        #s_sample = s_dir
        #s_sample = s_dir.split('-Scene')[0]
        print(s_sample)
        #one table row per image that is not c1, indexed by marker
        df_marker = df_img[df_img.color!='c1']
        df_marker = df_marker.sort_values(['rounds','color'])
        df_dapi = pd.DataFrame(index = [df_marker.marker.tolist()],columns=['rounds','colors','minimum','maximum','exposure','refexp','location'])
        df_dapi['rounds'] = df_marker.loc[:,['rounds']].values
        df_dapi['colors'] = df_marker.loc[:,['color']].values
        #defaults for every marker
        df_dapi['minimum'] = 1003
        df_dapi['maximum'] = 65535
        df_dapi['exposure'] = 100
        df_dapi['refexp'] = 100
        df_dapi['location'] = 'All'
        #marker specific minimum
        for s_key,i_item in d_segment.items():
            df_dapi.loc[s_key,'minimum'] = i_item
        df_dapi.to_csv('RoundsCyclesTable.txt',sep=' ',header=False)
        df_dapi.to_csv(f'metadata_{s_sample}_RoundsCyclesTable.csv',header=True)
        #create cluster.java file
        preprocess.cluster_java(s_dir=f'JE{idx}',s_sample=s_sample,imagedir=f'{s_path}',segmentdir=segdir,type='exacloud',b_segment=True,b_TMA=False)
        if b_start:
            os.chdir(f'/home/groups/graylab_share/Chin_Lab/ChinData/Work/engje/exacloud/JE{idx}') #exacloud
            print(f'JE{idx}')
            os.system('make_sh')
diff --git a/mplex_image/mpimage.py b/mplex_image/mpimage.py
new file mode 100755
index 0000000..86746e4
--- /dev/null
+++ b/mplex_image/mpimage.py
@@ -0,0 +1,817 @@
+####
+# title: mpimage.py
+#
+# language: Python3.6
+# date: 2019-05-00
+# license: GPL>=v3
+# author: Jenny
+#
+# description:
+# python3 library to display, normalize and crop multiplex images
+####
+
+#libraries
+import matplotlib as mpl
+mpl.use('agg')
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import skimage
+import pandas as pd
+#import bioformats
+import re
+import shutil
+from itertools import chain
+import matplotlib.ticker as ticker
+
+#os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF/')
+#from apeer_ometiff_library import omexmlClass
+
+#functions
+
+
+def parse_img(s_end =".tif",s_start='',s_sep1='_',s_sep2='.',s_exclude='Gandalf',ls_column=['rounds','color','imagetype','scene'],b_test=True):
+ '''
+ Parse the names of image files in the current working directory into a dataframe.
+ Each file name is split on s_sep1 and every piece is split again on s_sep2.
+ s_end = only files ending with this string are used
+ s_start = only files starting with this string are used
+ s_exclude = files containing this string are skipped
+ ls_column = column names for the parsed pieces, in file name order
+ b_test = if True, print the pieces of the first file name and their count; the columns are only filled when False
+ required columns: ['rounds','color','imagetype','scene']
+ meta names names=['rounds','color','minimum', 'maximum', 'exposure', 'refexp','location'],#'marker',
+ return = df_img
+ '''
+ ls_file = []
+ for file in os.listdir():
+ #find all filenames ending in s_end
+ if file.endswith(s_end):
+ if file.find(s_start)==0:
+ if file.find(s_exclude)==-1:
+ ls_file = ls_file + [file]
+
+ # NOTE(review): always prints the constant text 'test 1'; looks like leftover debug output
+ print(f'test {int(1.1)}')
+ #make a list of list of file name items separated by s_sep
+ llls_split = []
+ for items in [item.split(s_sep1)for item in ls_file]:
+ llls_split.append([item.split(s_sep2) for item in items])
+
+ # flatten the two-level split into one list of pieces per file
+ lls_final = []
+ for lls_split in llls_split:
+ lls_final.append(list(chain.from_iterable(lls_split)))
+
+ #make a blank dataframe with the index being the filename
+ df_img = pd.DataFrame(index=ls_file, columns=ls_column)
+ if b_test:
+ # NOTE(review): lls_final[0] raises IndexError when no file matched
+ print(lls_final[0])
+ print(f'Length = {len(lls_final[0])}')
+ #add a column for each part of the name
+ else:
+ for fidx, ls_final in enumerate(lls_final):
+ for idx, s_name in enumerate(ls_final):
+ # ls_column[idx] raises IndexError if a file name has more pieces than ls_column has names
+ df_img.loc[ls_file[fidx],ls_column[idx]] = s_name
+ print('Mean number of items in file name')
+ print(np.asarray([(len(item)) for item in lls_final]).mean())
+ # a non-integer mean means the file names do not all have the same number of pieces
+ if (np.asarray([(len(item)) for item in lls_final]).mean()).is_integer()==False:
+ print([(len(item)) for item in lls_final])
+ i_right = np.asarray([(len(item)) for item in lls_final]).max()
+ for fidx, ls_final in enumerate(lls_final):
+ if len(ls_final) < i_right:
+ print(f' inconsitent name: {ls_file[fidx]}')
+ return(df_img)
+
+def parse_org(s_end = "ORG.tif",s_start='R',type='reg'):
+    """
+    Parse image file names in the current working directory that follow
+    koei's naming convention.
+    Example: Registered-R1_PCNA.CD8.PD1.CK19_Her2B-K157-Scene-002_c1_ORG.tif
+
+    s_end = only files ending with this string are parsed
+    s_start = only files starting with this string are parsed
+    type = 'raw', 'noscenes', or anything else (default 'reg') for registered
+        images, whose first field is 'Registered-<round>'
+
+    return = dataframe with the image file name in the index and the columns
+        rounds, color, imagetype, scene (plus slide for 'raw' and 'noscenes'),
+        round_ord (round as a float, 'Q' counts as .5), round_num (rank of the
+        round) and marker. Rows are sorted by round_ord, rounds, color.
+
+    The marker is 'DAPI' for c1 and otherwise one of the '.'-separated names in
+    the second '_'-separated field. A color that is not c1-c7 prints 'Error'
+    and gets marker None; previously it silently reused the marker of the
+    preceding row, or raised UnboundLocalError on the first row.
+    """
+    ls_file = [s_file for s_file in os.listdir() if s_file.endswith(s_end) and s_file.startswith(s_start)]
+    lls_name = [item.split('_') for item in ls_file]
+    df_img = pd.DataFrame(index=ls_file)
+    if type in ('raw','noscenes'):
+        df_img['rounds'] = [item[0] for item in lls_name]
+    else:
+        df_img['rounds'] = [item[0].split('Registered-')[1] for item in lls_name]
+    df_img['color'] = [item[-2] for item in lls_name]
+    df_img['imagetype'] = [item[-1].split('.tif')[0] for item in lls_name]
+    if type == 'raw':
+        lls_scene = [item.split('-Scene-') for item in ls_file]
+        df_img['slide'] = [item[2] for item in lls_name]
+        try:
+            df_img['scene'] = [item[1].split('_')[0] for item in lls_scene]
+        except IndexError:
+            #at least one file name has no '-Scene-' part
+            print(f"{set([item[0] for item in lls_scene])}")
+    elif type == 'noscenes':
+        df_img['slide'] = [item[2] for item in lls_name]
+        df_img['scene'] = ['Scene-001'] * len(ls_file)
+    else:
+        df_img['scene'] = [item[2] for item in lls_name]
+    #numeric round for sorting: 'Q' becomes '.5', then everything but digits and dots is dropped
+    df_img['round_ord'] = [re.sub('Q','.5', item) for item in df_img.rounds]
+    df_img['round_ord'] = [float(re.sub('[^0-9.]','', item)) for item in df_img.round_ord]
+    df_img = df_img.sort_values(['round_ord','rounds','color'])
+    for idx, s_round in enumerate(df_img.rounds.unique()):
+        df_img.loc[df_img.rounds==s_round, 'round_num'] = idx
+    #parse file name for biomarker
+    #position of the marker name for each color
+    #c6 and c7 are only included in sardana shading corrected images
+    d_position = {'c2':0,'c3':1,'c4':2,'c5':3,'c6':2,'c7':3}
+    for s_index in df_img.index:
+        s_color = df_img.loc[s_index,'color']
+        if s_color == 'c1':
+            s_marker = 'DAPI'
+        elif s_color in d_position:
+            s_marker = s_index.split('_')[1].split('.')[d_position[s_color]]
+        else:
+            print('Error')
+            s_marker = None
+        df_img.loc[s_index,'marker'] = s_marker
+
+    return(df_img) #,lls_name)
+
+def filename_dataframe(s_end = ".czi",s_start='R',s_split='_'):
+    '''
+    Select files in the current working directory by name and return them
+    as the index of a dataframe.
+    s_end = string at end of file names
+    s_start = string at beginning of file names
+    s_split = separator; the text before its first occurrence goes to column 'data'
+    '''
+    ls_match = [s_name for s_name in os.listdir() if s_name.endswith(s_end) and s_name.find(s_start) == 0]
+    df_img = pd.DataFrame(index=ls_match)
+    df_img['data'] = [s_name.split(s_split)[0] for s_name in ls_match]
+    return(df_img)
+
+def underscore_to_dot(s_sample, s_end='ORG.tif', s_start='R',s_split='_'):
+    '''
+    Build a dictionary that maps the part of each file name before
+    '_{s_sample}' to the same text with '_' replaced by '.', keeping the
+    '_' that follows the leading field (column 'data' of filename_dataframe).
+    Old and new names are each sorted and then paired; a pair whose leading
+    fields differ is reported with a printed message.
+    '''
+    df_file = filename_dataframe(s_end,s_start,s_split)
+    s_tail = f'_{s_sample}'
+    es_old = set()
+    es_new = set()
+    for s_file in df_file.index:
+        s_prefix = s_file.split(s_tail)[0]
+        s_lead = df_file.loc[s_file,'data']
+        es_old.add(s_prefix)
+        es_new.add(s_prefix.replace('_','.').replace(f"{s_lead}.",f"{s_lead}_"))
+    d_replace = dict(zip(sorted(es_old),sorted(es_new)))
+    for s_old, s_new in d_replace.items():
+        if s_old.split('_')[0] != s_new.split('_')[0]:
+            print(f' Error {s_old} mathced to {s_new}')
+    return(d_replace)
+
+def add_exposure(df_img,df_t,type='roundcycles'):
+    """
+    Add an 'exposure' column to df_img from the metadata in df_t.
+
+    df_img = dataframe of images with the image names in the index
+        type 'roundcycles': needs column 'marker'
+        type 'czi': needs columns 'scene', 'color' and 'rounds'
+    df_t = metadata dataframe
+        type 'roundcycles': columns ['marker','exposure']; the first row matching
+            the marker is used
+        type 'czi': index whose first '_'-separated field is the round and which
+            contains the tissue name; one column per channel, named '0', '1', ...
+            (color 'c1' reads column '0'). A 'rounds' column is added to df_t.
+    type = 'roundcycles' (default) or 'czi'. The historical spelling
+        'roundscycles' is accepted as well: the default used to differ from the
+        spelling that was checked, so the default call silently did nothing.
+        Any other value leaves df_img unchanged.
+
+    return = df_img (modified in place)
+    """
+    if type in ('roundcycles','roundscycles'):
+        for s_index in df_img.index:
+            s_marker = df_img.loc[s_index,'marker']
+            #look up exposure time for marker in metadata
+            df_t_image = df_t[(df_t.marker==s_marker)]
+            if len(df_t_image) > 0:
+                i_exposure = df_t_image.iloc[0].loc['exposure']
+                df_img.loc[s_index,'exposure'] = i_exposure
+            else:
+                print(f'{s_marker} has no recorded exposure time')
+    elif type == 'czi':
+        #add exposure
+        df_t['rounds'] = [item.split('_')[0] for item in df_t.index]
+        for s_index in df_img.index:
+            s_tissue = df_img.loc[s_index,'scene'].split('-Scene')[0]
+            #color c<n> is stored in metadata column str(n-1)
+            s_color = str(int(df_img.loc[s_index,'color'].split('c')[1])-1)
+            s_round = df_img.loc[s_index,'rounds']
+            print(s_index)
+            #literal (non-regex) match of the tissue name; first matching row by position
+            b_match = (df_t.index.str.contains(s_tissue,regex=False)) & (df_t.rounds==s_round)
+            df_img.loc[s_index,'exposure'] = df_t[b_match].loc[:,s_color].iloc[0]
+
+    return(df_img)
+
+def subtract_images(df_img,d_channel={'c2':'L488','c3':'L555','c4':'L647','c5':'L750'},ls_exclude=[],subdir='SubtractedRegisteredImages',b_8bit=True):#b_mkdir=True,
+    """
+    This code loads 16 bit grayscale tiffs, performs AF subtraction of channels/rounds defined by the user,
+    and saves the AF subtracted tiffs in subdir (8 bit when b_8bit is True).
+    The data required is:
+    1. The RoundsCyclesTable with real exposure times
+    2. dataframe of images to process (df_img); can be created with any custom parsing function
+    df_img = dataframe of images with columns [ 'color', 'exposure', 'marker', 'scene']
+        and index with image names; the scene is also read from the third
+        '_'-separated field of the image name
+    d_channel = dictionary mapping color to the marker (background image) to subtract
+    ls_exclude = list of markers not needing subtraction; their images are copied to subdir unchanged
+    subdir = existing output directory
+    b_8bit = if True the result is divided by 256 and saved as uint8
+
+    return = (df_markers, df_copy): the images that were mapped to a background
+        (with added columns 'sub_image' and 'sub_exposure') and the copied images
+
+    Images without a matching background image (same color and scene) are
+    reported and skipped.
+    """
+    #generate dataframe of subtraction markers
+    es_subtract = set()
+    for s_key, s_value in d_channel.items():
+        es_subtract.add(s_value)
+        print(f'Subtracting {s_value} for all {s_key}')
+
+    #DataFrame.append no longer exists in pandas 2; concatenate the per-marker selections instead
+    ls_background = [df_img[df_img.marker==s_subtract] for s_subtract in sorted(es_subtract)]
+    df_subtract = pd.concat(ls_background) if ls_background else df_img.iloc[0:0]
+    print(f'The background images {df_subtract.index.tolist()}')
+    print(f'The background markers {df_subtract.marker.tolist()}')
+
+    #generate dataframe of how subtraction is set up
+    #set of markers minus the subtraction markers
+    es_markers = set(df_img.marker) - es_subtract
+    #dataframe of markers
+    df_markers = df_img[df_img.loc[:,'marker'].isin(sorted(es_markers))]
+    df_copy = df_img[df_img.marker.isin(ls_exclude)]
+    #independent copy, because columns are added below
+    df_markers = df_markers[~df_markers.marker.isin(ls_exclude)].copy()
+
+    for s_file in df_copy.index.tolist():
+        print(s_file)
+        print(f'copied to {subdir}/{s_file}')
+        shutil.copyfile(s_file,f'{subdir}/{s_file}')
+
+    #add columns with mapping of proper subtracted image to dataframe
+    for s_index in df_markers.index.tolist():
+        print('add colums')
+        print(s_index)
+        s_scene = s_index.split('_')[2]
+        s_color = df_markers.loc[s_index,'color']
+        df_match = df_subtract[(df_subtract.color==s_color) & (df_subtract.scene==s_scene)]
+        if len(df_match)==0:
+            print(f'missing {s_color} in {s_scene}')
+        else:
+            df_markers.loc[s_index,'sub_image'] = df_match.index[0]
+            df_markers.loc[s_index,'sub_exposure'] = df_match.exposure.iloc[0]
+
+    #loop to subtract
+    for s_index in df_markers.index.tolist():
+        #nothing to subtract when no background image was mapped above
+        if 'sub_image' not in df_markers.columns or pd.isna(df_markers.loc[s_index,'sub_image']):
+            print(f'Skipping {s_index}: no background image')
+            continue
+        print(f'Processing {s_index}')
+        s_image = s_index
+        s_color = '_' + df_markers.loc[s_index,'color'] + '_'
+        s_background = df_markers.loc[s_index,'sub_image']
+        print(f'From {s_image} subtracting \n {s_background}')
+        a_img = skimage.io.imread(s_image)
+        a_AF = skimage.io.imread(s_background)
+        #divide each image by exposure time
+        #subtract 1 ms AF from 1 ms signal
+        #multiply by original image exposure time
+        a_sub = (a_img/df_markers.loc[s_index,'exposure'] - a_AF/df_markers.loc[s_index,'sub_exposure'])*df_markers.loc[s_index,'exposure']
+        #negative values are set to zero
+        a_zero = (a_sub.clip(min=0)).astype(int)
+        if b_8bit:
+            a_bit = (a_zero/256).astype(np.uint8)
+        else:
+            a_bit = skimage.img_as_uint(a_zero)
+        #output name: '_Sub<background marker>' is inserted in front of the '_<color>_' field
+        s_fname = f'{subdir}/{s_index.split(s_color)[0]}_Sub{df_subtract.loc[df_markers.loc[s_index,"sub_image"],"marker"]}{s_color}{s_index.split(s_color)[1]}'
+        skimage.io.imsave(s_fname,a_bit)
+
+    return(df_markers,df_copy)#df_markers,es_subtract
+
+def subtract_scaled_images(df_img,d_late={'c2':'R5Qc2','c3':'R5Qc3','c4':'R5Qc4','c5':'R5Qc5'},d_early={'c2':'R0c2','c3':'R0c3','c4':'R0c4','c5':'R0c5'},ls_exclude=[],subdir='SubtractedRegisteredImages',b_8bit=False):
+    """
+    This code loads 16 bit grayscale tiffs, performs scaled AF subtraction
+    based on the round position between early and late AF channels/rounds defined by the user,
+    and saves the AF subtracted tiffs in subdir.
+    The data required is:
+    1. The RoundsCyclesTable with real exposure times
+    2. dataframe of images to process (df_img); can be created with any custom parsing function
+    df_img = dataframe of images with columns [ 'color', 'exposure', 'marker', 'scene', 'round_num']
+        and index with image names
+    d_late = dictionary mapping color to the marker name of the late AF image
+    d_early = dictionary mapping color to the marker name of the early AF image
+        (every color left in df_img after exclusion must be a key of both dictionaries)
+    ls_exclude = list of markers not needing subtraction; their images are copied to subdir unchanged
+    subdir = existing output directory
+    b_8bit = if True the result is divided by 256 and saved as uint8
+
+    return = (df_markers, df_copy): the marker images with the added 'sub_*'
+        mapping columns, and the copied images
+
+    The late AF weight is (round_num - early)/(late - early) clipped to [0,1];
+    the early AF weight is one minus that ratio, clipped to [0,1].
+    Images whose scene lacks the early or late AF image are reported and skipped.
+    """
+    #generate set of subtraction markers
+    es_subtract = set(d_early.values()) | set(d_late.values())
+
+    #markers minus the subtraction markers & excluded markers
+    es_markers = set(df_img.marker) - es_subtract
+    #dataframe of markers
+    df_markers = df_img[df_img.loc[:,'marker'].isin(es_markers)]
+    df_copy = df_img[df_img.marker.isin(ls_exclude)]
+    #independent copy, because columns are added below
+    df_markers = df_markers[~df_markers.marker.isin(ls_exclude)].copy()
+
+    #copy excluded markers
+    for s_file in df_copy.index.tolist():
+        print(s_file)
+        print(f'copied to {subdir}/{s_file}')
+        shutil.copyfile(s_file,f'{subdir}/{s_file}')
+
+    #add columns with mapping of proper AF images to marker images
+    for s_index in df_markers.index.tolist():
+        print('add colums')
+        print(s_index)
+        s_scene = df_markers.loc[s_index,'scene']
+        s_color = df_markers.loc[s_index,'color']
+        s_early = d_early[s_color]
+        s_late = d_late[s_color]
+        i_round = df_markers.loc[s_index,'round_num']
+        df_scene = df_img[df_img.scene==s_scene]
+        df_early = df_scene[df_scene.marker==s_early]
+        df_late = df_scene[df_scene.marker==s_late]
+        if len(df_early) == 0:
+            print(f' Missing early AF channel for {s_scene} {s_color}')
+        elif len(df_late) == 0:
+            print(f' Missing late AF channel for {s_scene} {s_color}')
+        else:
+            #first matching AF image by position
+            i_early = df_early.round_num.iloc[0]
+            i_late = df_late.round_num.iloc[0]
+            df_markers.loc[s_index,'sub_name'] = f'{s_early}{s_late}'
+            df_markers.loc[s_index,'sub_early'] = df_early.index[0]
+            df_markers.loc[s_index,'sub_early_exp'] = df_early.exposure.iloc[0]
+            df_markers.loc[s_index,'sub_late'] = df_late.index[0]
+            df_markers.loc[s_index,'sub_late_exp'] = df_late.exposure.iloc[0]
+            df_markers.loc[s_index,'sub_ratio_late'] = np.clip((i_round-i_early)/(i_late - i_early),0,1)
+            df_markers.loc[s_index,'sub_ratio_early'] = np.clip(1 - (i_round-i_early)/(i_late - i_early),0,1)
+
+    #loop to subtract
+    for s_index in df_markers.index.tolist():
+        #nothing to subtract when no AF images were mapped above
+        if 'sub_early' not in df_markers.columns or pd.isna(df_markers.loc[s_index,'sub_early']):
+            print(f'Skipping {s_index}: no AF images')
+            continue
+        print(f'Processing {s_index}')
+        s_color = '_' + df_markers.loc[s_index,'color'] + '_'
+        a_img = skimage.io.imread(s_index)
+        a_early = skimage.io.imread(df_markers.loc[s_index,'sub_early'])
+        a_late = skimage.io.imread(df_markers.loc[s_index,'sub_late'])
+        #divide each image by exposure time
+        a_img_exp = a_img/df_markers.loc[s_index,'exposure']
+        a_early_exp = a_early/df_markers.loc[s_index,'sub_early_exp']
+        a_late_exp = a_late/df_markers.loc[s_index,'sub_late_exp']
+        #combine early and late based on round_num
+        a_early_exp = a_early_exp * df_markers.loc[s_index,'sub_ratio_early']
+        a_late_exp = a_late_exp * df_markers.loc[s_index,'sub_ratio_late']
+        #subtract 1 ms AF from 1 ms signal
+        #multiply by original image exposure time
+        a_sub = (a_img_exp - a_early_exp - a_late_exp)*df_markers.loc[s_index,'exposure']
+        #negative values are set to zero
+        a_zero = (a_sub.clip(min=0)).astype(int)
+        if b_8bit:
+            a_bit = (a_zero/256).astype(np.uint8)
+        else:
+            a_bit = skimage.img_as_uint(a_zero)
+        #output name: '_Sub<early><late>' is inserted in front of the '_<color>_' field
+        s_fname = f'{subdir}/{s_index.split(s_color)[0]}_Sub{df_markers.loc[s_index,"sub_name"]}{s_color}{s_index.split(s_color)[1]}'
+        skimage.io.imsave(s_fname,a_bit)
+
+    return(df_markers,df_copy)
+
+def overlay_crop(d_combos,d_crop,df_img,s_dapi,tu_dim=(1000,1000),b_8bit=True):
+ """
+ build custom multi channel uint8 overlay arrays according to dictionary, with s_dapi as channel 1 in each overlay
+ BUG with 53BP1
+ d_crop : {slide_scene : (x,y) coord
+ tu_dim = (width, height)
+ df_img = dataframe of images with columns 'marker', 'rounds' and 'scene' and index with image names;
+ its DAPI markers are renamed in place to 'DAPI' + the round number
+ s_dapi = marker name of the DAPI image used as first channel
+ b_8bit = if True uint16 crops are divided by 256; otherwise they are first rescaled to 1.5 x their 0.9999 quantile
+ return = {'<scene>_x<x>y<y>' : {overlay name : (list of channel names, overlay array)}}
+ d_combos = {'Immune':{'CD45', 'PD1', 'CD8', 'CD4', 'CD68', 'FoxP3','GRNZB','CD20','CD3'},
+ 'Stromal':{'Vim', 'aSMA', 'PDPN', 'CD31', 'ColIV','ColI'},
+ 'Differentiation':{'CK19', 'CK7','CK5', 'CK14', 'CK17','CK8'},
+ 'Tumor':{'HER2', 'Ecad', 'ER', 'PgR','Ki67','PCNA'},
+ 'Proliferation':{'EGFR','CD44','AR','pHH3','pRB'},
+ 'Functional':{'pS6RP','H3K27','H3K4','cPARP','gH2AX','pAKT','pERK'},
+ 'Lamins':{'LamB1','LamAC', 'LamB2'}}
+ """
+ dd_result = {}
+ # rename every DAPI marker to DAPI<round number>; this modifies the caller's df_img
+ for s_index in df_img.index:
+ s_marker = df_img.loc[s_index,'marker']
+ if s_marker == 'DAPI':
+ s_marker = s_marker + f'{df_img.loc[s_index,"rounds"].split("R")[1]}'
+ df_img.loc[s_index,'marker'] = s_marker
+ #now make overlays
+ for s_scene, xy_cropcoor in d_crop.items():
+ d_result = {}
+ print(f'Processing {s_scene}')
+ df_slide = df_img[df_img.scene==s_scene]
+ # NOTE(review): .index[0] is evaluated before the checks below, so a scene without s_dapi
+ # raises IndexError here instead of reaching 'Error: dapi not found'; the checks also
+ # compare against s_dapi.split('_')[0] while this lookup uses s_dapi itself
+ s_image_round = df_slide[df_slide.marker==s_dapi].index[0]
+ if len(df_slide[df_slide.marker==s_dapi.split('_')[0]].index) == 0:
+ print('Error: dapi not found')
+ elif len(df_slide[df_slide.marker==s_dapi.split('_')[0]].index) > 1:
+ print('Error: too many dapi images found')
+ else:
+ print(s_image_round)
+ #exclude any missing biomarkers
+ es_all = set(df_slide.marker)
+ #iterate over overlay combinations
+ for s_type, es_combos in d_combos.items():
+ d_overlay = {}
+ es_combos_shared = es_combos.intersection(es_all)
+ for idx, s_combo in enumerate(sorted(es_combos_shared)):
+ # s_combo comes from the intersection with the scene's markers, so at least one image matches;
+ # the first match is used when there are several
+ s_filename = (df_slide[df_slide.marker==s_combo]).index[0]
+ if len((df_slide[df_slide.marker==s_combo]).index) == 0:
+ print(f'Error: {s_combo} not found')
+ elif len((df_slide[df_slide.marker==s_combo]).index) > 1:
+ print(f'\n Warning {s_combo}: too many marker images found, used {s_filename}')
+ else:
+ print(f'{s_combo}: {s_filename}')
+ d_overlay.update({s_combo:s_filename})
+ #d_overlay.update({s_dapi:s_image_round})
+ a_dapi = skimage.io.imread(s_image_round)
+ #crop
+ a_crop = a_dapi[(xy_cropcoor[1]):(xy_cropcoor[1]+tu_dim[1]),(xy_cropcoor[0]):(xy_cropcoor[0]+tu_dim[0])]
+ # one uint8 plane per marker plus one for DAPI, sized like the DAPI crop
+ a_overlay = np.zeros((len(d_overlay) + 1,a_crop.shape[0],a_crop.shape[1]),dtype=np.uint8)
+ if a_crop.dtype == 'uint16':
+ if b_8bit:
+ a_crop = (a_crop/256).astype(np.uint8)
+ else:
+ a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range=(0,1.5*np.quantile(a_crop,0.9999)))
+ a_crop = (a_rescale/256).astype(np.uint8)
+ print(f'rescale intensity')
+ a_overlay[0,:,:] = a_crop
+ ls_biomarker_all = [s_dapi]
+ # markers are added in sorted name order, after DAPI
+ for i, s_color in enumerate(sorted(d_overlay.keys())):
+ s_overlay= d_overlay[s_color]
+ ls_biomarker_all.append(s_color)
+ a_channel = skimage.io.imread(s_overlay)
+ #crop
+ a_crop = a_channel[(xy_cropcoor[1]):(xy_cropcoor[1]+tu_dim[1]),(xy_cropcoor[0]):(xy_cropcoor[0]+tu_dim[0])]
+ if a_crop.dtype == 'uint16':
+ if b_8bit:
+ a_crop = (a_crop/256).astype(np.uint8)
+ else:
+ a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range=(0,1.5*np.quantile(a_crop,0.9999)))
+ a_crop = (a_rescale/256).astype(np.uint8)
+ print(f'rescale intensity')
+ a_overlay[i + 1,:,:] = a_crop
+ d_result.update({s_type:(ls_biomarker_all,a_overlay)})
+ dd_result.update({f'{s_scene}_x{xy_cropcoor[0]}y{xy_cropcoor[1]}':d_result})
+ return(dd_result)
+
+def gen_xml(array, channel_names):
+ '''
+ copy and modify from apeer ome tiff
+ build the OME-XML metadata for a 5 dimensional image array and return it as encoded bytes
+ array = array with 5 dimensions in the order T, Z, C, Y, X (only its shape and dtype are read)
+ channel_names = channel names, indexed 0 .. (number of channels - 1)
+ raises AssertionError if array does not have 5 dimensions
+ NOTE(review): the only import of omexmlClass visible in this file is commented out,
+ so calling this function presumably raises NameError unless the name is provided elsewhere - confirm
+ '''
+ #for idx, s_marker in enumerate(ls_marker):
+ # old = bytes(f'Name="C:{idx}"','utf-8')
+ # new = bytes(f'Name="{s_marker}"','utf-8')
+ # s_xml = s_xml.replace(old,new,-1)
+ #Dimension order is assumed to be TZCYX
+ dim_order = "TZCYX"
+
+ metadata = omexmlClass.OMEXML()
+ shape = array.shape
+ assert ( len(shape) == 5), "Expected array of 5 dimensions"
+
+ metadata.image().set_Name("IMAGE")
+ metadata.image().set_ID("0")
+
+ pixels = metadata.image().Pixels
+ pixels.ome_uuid = metadata.uuidStr
+ pixels.set_ID("0")
+
+ pixels.channel_count = shape[2]
+
+ # sizes follow the TZCYX order of the array shape
+ pixels.set_SizeT(shape[0])
+ pixels.set_SizeZ(shape[1])
+ pixels.set_SizeC(shape[2])
+ pixels.set_SizeY(shape[3])
+ pixels.set_SizeX(shape[4])
+
+ # the dimension order is written reversed, i.e. "XYCZT"
+ pixels.set_DimensionOrder(dim_order[::-1])
+
+ pixels.set_PixelType(omexmlClass.get_pixel_type(array.dtype))
+
+ # channel_names[i] raises IndexError if fewer names than channels are given
+ for i in range(pixels.SizeC):
+ pixels.Channel(i).set_ID("Channel:0:" + str(i))
+ pixels.Channel(i).set_Name(channel_names[i])
+
+ for i in range(pixels.SizeC):
+ pixels.Channel(i).set_SamplesPerPixel(1)
+
+ pixels.populate_TiffData()
+
+ return metadata.to_xml().encode()
+
+def array_img(df_img,s_xlabel='color',ls_ylabel=['rounds','exposure'],s_title='marker',tu_array=(2,4),tu_fig=(10,20),cmap='gray',d_crop={}):
+    """
+    Show every image of df_img in one panel of a subplot grid.
+    df_img = dataframe of images with columns having image attributes
+        and index with image names
+    s_xlabel = column shown in the x label, followed by the displayed intensity range
+    ls_ylabel = two columns shown in the y label
+    s_title = column shown as panel title
+    tu_array = (rows, columns) of the subplot grid
+    tu_fig = figure size
+    cmap = colormap passed to imshow
+    d_crop = optional {scene: (x, y, width, height)}; when not empty each image
+        is cropped with the entry of its 'scene'
+    return = the figure
+    Intensities are rescaled from 0 to 1.5 times the 0.98 quantile of the whole image.
+    """
+    fig, ax = plt.subplots(tu_array[0],tu_array[1],figsize=tu_fig)
+    ax = ax.ravel()
+    for ax_num, s_index in enumerate(df_img.index):
+        se_image = df_img.loc[s_index]
+        a_image = skimage.io.imread(s_index)
+        f_high = 1.5*np.quantile(a_image,0.98)
+        a_show = skimage.exposure.rescale_intensity(a_image,in_range=(0,f_high))
+        if d_crop:
+            tu_crop = d_crop[df_img.loc[s_index,'scene']]
+            i_x, i_y, i_width, i_height = tu_crop[0], tu_crop[1], tu_crop[2], tu_crop[3]
+            a_show = a_show[i_y:(i_y+i_height),i_x:(i_x+i_width)]
+        ax_panel = ax[ax_num]
+        ax_panel.imshow(a_show,cmap=cmap)
+        ax_panel.set_title(se_image[s_title])
+        ax_panel.set_ylabel(f'{se_image[ls_ylabel[0]]}\n {se_image[ls_ylabel[1]]}')
+        ax_panel.set_xlabel(f'{se_image[s_xlabel]}\n 0 - {int(f_high)}')
+    plt.tight_layout()
+    return(fig)
+
+def array_roi(df_img,s_column='color',s_row='rounds',s_label='marker',tu_crop=(0,0,100,100),tu_array=(2,4),tu_fig=(10,20), cmap='gray',b_min_label=True,tu_rescale=(0,0)):
+    """
+    create a grid of cropped images
+    df_img = dataframe of images with columns having image attributes
+        and index with image names
+    s_column = column shown in the x label (replaced by the intensity range if b_min_label)
+    s_row = column shown in the y label
+    s_label = column shown as panel title
+    tu_crop = (upper left corner x, y , xlength, yheight)
+    tu_array = (rows, columns) of the subplot grid
+    tu_fig = figure size
+    cmap = colormap, used when tu_rescale is given (the automatic range is always drawn in gray)
+    b_min_label = if True hide the x tick labels and show the intensity range as x label
+    tu_rescale = (min, max) intensity range; (0,0) rescales each image from 0
+        to 1.5 times the 0.98 quantile of the whole image
+    return = the figure
+
+    The figure is created exactly once. Previously a second figure was created
+    when b_min_label was True and the first one was left open and unused.
+    """
+    #the x axis is only shared when the x tick labels are kept
+    fig, ax = plt.subplots(tu_array[0],tu_array[1],figsize=tu_fig,sharex=not b_min_label, sharey=True)
+    ax = ax.ravel()
+    for ax_num, s_index in enumerate(df_img.index):
+        s_row_label = df_img.loc[s_index,s_row]
+        s_col_label = df_img.loc[s_index,s_column]
+        s_label_img = df_img.loc[s_index,s_label]
+        #load image, crop, rescale
+        a_image=skimage.io.imread(s_index)
+        a_crop = a_image[(tu_crop[1]):(tu_crop[1]+tu_crop[3]),(tu_crop[0]):(tu_crop[0]+tu_crop[2])]
+        if tu_rescale==(0,0):
+            #the range comes from the whole image, not only from the crop
+            f_quantile = np.quantile(a_image,0.98)
+            tu_max = (0,f_quantile+f_quantile/2)
+            a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range=tu_max)
+            ax[ax_num].imshow(a_rescale,cmap='gray')
+        else:
+            print(f'original {a_crop.min()},{a_crop.max()}')
+            print(f'rescale to {tu_rescale}')
+            a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range=tu_rescale,out_range=tu_rescale)
+            tu_max=tu_rescale
+            ax[ax_num].imshow(a_rescale,cmap=cmap,vmin=0, vmax=tu_max[1])
+        ax[ax_num].set_title(s_label_img)
+        ax[ax_num].set_ylabel(s_row_label)
+        ax[ax_num].set_xlabel(s_col_label)
+        if b_min_label:
+            ax[ax_num].set_xticklabels('')
+            ax[ax_num].set_xlabel(f'{tu_max[0]} - {int(tu_max[1])}') #min/max =
+    plt.tight_layout()
+    return(fig)
+
+def load_labels(d_crop,segdir,s_find='Nuclei Segmentation Basins'):
+    """
+    load the segmentation basins (cell or nuclei) of every scene in d_crop
+    d_crop = dictionary whose keys are scene names (only the keys are used)
+    segdir = directory with the segmentation images
+    s_find: 'exp5_CellSegmentationBasins' or 'Nuclei Segmentation Basins';
+        text that the file name must contain
+    return = {scene name : label image array}
+
+    A file is loaded for a scene when its name contains s_find and the part of
+    the scene name that follows the sample name (the text before '-Scene-').
+    If several files match, the last one listed wins.
+    The files are read through their path inside segdir, so the working
+    directory of the process is never changed (it used to be switched to segdir
+    and was left there when reading a file failed).
+    """
+    d_label={}
+    for s_scene, xy_cropcoor in d_crop.items():
+        print(s_scene)
+        s_sample = s_scene.split('-Scene-')[0]
+        for s_file in os.listdir(f'{segdir}'):
+            if s_file.find(s_find) > -1: #Nuclei Segmentation Basins.tif #Cell Segmentation Basins.tif
+                if s_file.find(s_scene.split(s_sample)[1]) > -1:
+                    print(f'loading {s_file}')
+                    a_seg = skimage.io.imread(os.path.join(f'{segdir}',s_file))
+                    d_label.update({s_scene:a_seg})
+    return(d_label)
+
+def crop_labels(d_crop,d_label,tu_dim,cropdir,s_name='Nuclei Segmentation Basins'):
+    """
+    Crop the segmentation basins (cell or nuclei) to the same coordinates as the
+    images and save each crop, e.g. for viewing in Napari.
+    d_crop = {scene : (x, y)} upper left corner of the crop
+    d_label = {scene : label image array}
+    tu_dim = (width, height) of the crop
+    cropdir = output directory
+    s_name = label name; used without spaces in the output file name
+        '<cropdir>/<scene>_<name>x<x>y<y>.tif'
+    """
+    s_tag = s_name.replace(" ","")
+    for s_scene, xy_cropcoor in d_crop.items():
+        print(s_scene)
+        i_x = xy_cropcoor[0]
+        i_y = xy_cropcoor[1]
+        a_crop = d_label[s_scene][i_y:(i_y+tu_dim[1]),i_x:(i_x+tu_dim[0])]
+        #crop file
+        s_file_new = f'{cropdir}/{s_scene}_{s_tag}x{i_x}y{i_y}.tif'
+        print(s_file_new)
+        skimage.io.imsave(s_file_new,a_crop)
+
+
+def fmt(x, pos):
+    """
+    Format x as mathtext in scientific notation with a one digit mantissa,
+    e.g. 12345 gives '$1 \\times 10^{4}$'.
+    pos is not used; the (x, pos) signature is the one expected by
+    matplotlib.ticker.FuncFormatter.
+    """
+    s_mantissa, _, s_exponent = '{:.0e}'.format(x).partition('e')
+    i_exponent = int(s_exponent)
+    return r'${} \times 10^{{{}}}$'.format(s_mantissa, i_exponent)
+
+def array_roi_if(df_img,df_dapi,s_label='rounds',s_title='Title',tu_crop=(0,0,100,100),tu_array=(2,4),tu_fig=(10,20),tu_rescale=(0,0),i_expnorm=0,i_micron_per_pixel=.325):
+ """
+ create a grid of two color images: the marker image in the green channel and DAPI in the blue channel
+ df_img = dataframe of images with columns having image attributes
+ and index with image names
+ df_dapi = like df_img, but with the matching dapi images (only the first image in its index is read)
+ s_label= attribute to label axes
+ s_title = figure title
+ tu_crop = (upper left corner x, y , xlength, yheight)
+ tu_array = subplot array dimensions
+ tu_fig = size of figure
+ tu_rescale= range of rescaling; (0,0) rescales each crop between its 0.03 quantile and 1.5 times its 0.998 quantile
+ i_expnorm = normalize to an exposure time (requires 'exposure' column in dataframe
+ i_micron_per_pixel = only referenced in commented-out code
+ return = (fig, ax) with ax being the flattened array of axes
+ NOTE(review): tu_max is only assigned while looping over df_img, so an empty df_img presumably fails at the colorbar - confirm
+ """
+ # black to green colormap, used for the colorbar only
+ cmap = mpl.colors.LinearSegmentedColormap.from_list('cmap', [(0,0,0),(0,1,0)], N=256, gamma=1.0)
+ fig, ax = plt.subplots(tu_array[0],tu_array[1],figsize=tu_fig,sharey=True, squeeze=False) #
+ ax = ax.ravel()
+ for ax_num, s_index in enumerate(df_img.index):
+ s_col_label = df_img.loc[s_index,s_label]
+ #load image, copr, rescale
+ a_image=skimage.io.imread(s_index)
+ a_dapi = skimage.io.imread((df_dapi).index[0])# & (df_dapi.rounds=='R1')
+ a_crop = a_image[(tu_crop[1]):(tu_crop[1]+tu_crop[3]),(tu_crop[0]):(tu_crop[0]+tu_crop[2])]
+ a_crop_dapi = a_dapi[(tu_crop[1]):(tu_crop[1]+tu_crop[3]),(tu_crop[0]):(tu_crop[0]+tu_crop[2])]
+ #a_crop_dapi = (a_crop_dapi/255).astype('int')
+ if i_expnorm > 0:
+ a_crop = a_crop/df_img.loc[s_index,'exposure']*i_expnorm
+ if tu_rescale==(0,0):
+ a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range=(np.quantile(a_crop,0.03),1.5*np.quantile(a_crop,0.998)),out_range=(0, 255))
+ tu_max = (np.quantile(a_crop,0.03),1.5*np.quantile(a_crop,0.998))
+ else:
+ #print(f'original {a_crop.min()},{a_crop.max()}')
+ #print(f'rescale to {tu_rescale}')
+ a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range = tu_rescale,out_range=(0,255))
+ tu_max=tu_rescale
+ # DAPI is always rescaled between its 0.03 quantile and twice its 0.99 quantile
+ a_rescale_dapi = skimage.exposure.rescale_intensity(a_crop_dapi,in_range = (np.quantile(a_crop_dapi,0.03),2*np.quantile(a_crop_dapi,0.99)),out_range=(0,255))
+ a_rescale_dapi = a_rescale_dapi.astype(np.uint8)
+ a_rescale = a_rescale.astype(np.uint8)
+ #2 color png
+ # stacked as (zeros, marker, dapi)
+ zdh = np.dstack((np.zeros_like(a_rescale), a_rescale, a_rescale_dapi))
+ ax[ax_num].imshow(zdh)
+ ax[ax_num].set_title('')
+ ax[ax_num].set_ylabel('')
+ ax[ax_num].set_xlabel(s_col_label,fontsize = 'x-large')
+ if tu_rescale == (0,0):
+ if len(ax)>1:
+ ax[ax_num].set_xlabel(f'{s_col_label} ({int(np.quantile(a_crop,0.03))} - {int(1.5*np.quantile(a_crop,0.998))})')
+ ax[ax_num].set_xticklabels('')
+ #pixel to micron (apply after ax is returned)
+ #ax[0].set_yticklabels([str(int(re.sub(u"\u2212", "-", item.get_text()))*i_micron_per_pixel) for item in ax[0].get_yticklabels(minor=False)])
+ plt.suptitle(s_title,y=0.93,size = 'xx-large',weight='bold')
+ plt.subplots_adjust(wspace=.05, hspace=.05)
+ # Now adding the colorbar
+ # the colorbar range is the last tu_max that was assigned
+ norm = mpl.colors.Normalize(vmin=tu_max[0],vmax=tu_max[1])
+ sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
+ sm.set_array([])
+ if len(ax) == 1:
+ cbaxes = fig.add_axes([.88, 0.125, 0.02, 0.75]) #[left, bottom, width, height]
+ plt.colorbar(sm, cax=cbaxes)#,format=ticker.FuncFormatter(fmt))
+ plt.figtext(0.47,0.03,s_label.replace('_',' '),fontsize = 'x-large', weight='bold')
+ elif tu_rescale != (0,0):
+ cbaxes = fig.add_axes([.91, 0.15, 0.015, 0.7]) #[left, bottom, width, height]
+ plt.colorbar(sm, cax=cbaxes)#,format=ticker.FuncFormatter(fmt))
+ plt.figtext(0.42,0.03,s_label.replace('_',' '),fontsize = 'x-large', weight='bold')
+ else:
+ # several panels with automatic ranges: no common colorbar is drawn
+ print("Different ranges - can't use colorbar")
+ plt.figtext(0.43,0.03,s_label.replace('_',' '),fontsize = 'x-large', weight='bold')
+
+ return(fig,ax)
+
+def multicolor_png(df_img,df_dapi,s_scene,d_overlay,d_crop,es_dim={'CD8','FoxP3','ER','AR'},es_bright={'Ki67','pHH3'},low_thresh=4000,high_thresh=0.999):
+ '''
+ create RGB image with Dapi plus up to four marker channels
+ df_img = dataframe of marker images with columns 'scene', 'marker' and 'path'
+ df_dapi = dataframe of dapi images with columns 'scene' and 'path'
+ s_scene = scene to process
+ d_overlay = {overlay name : list of markers}; markers are colored by list position:
+ first green, second red+green, third red, fourth green+blue; DAPI is blue;
+ markers after the fourth are rescaled but not added
+ d_crop = {scene : crop corner}; tu_crop[1] indexes the first image axis and tu_crop[0] the second;
+ the crop is always 800 x 800 pixels
+ es_dim = markers rescaled with a lower upper quantile (high_thresh with its last two characters removed)
+ es_bright = markers rescaled with a higher upper quantile (high_thresh with '99' appended)
+ low_thresh = if 1.5 times the upper quantile is below this value, the range (low_thresh/2, low_thresh) is used
+ high_thresh = upper quantile for rescaling
+ return = {overlay name : (list of markers, uint8 RGB array)}
+ '''
+
+ d_result = {}
+ #print(s_scene)
+ tu_crop = d_crop[s_scene]
+ df_slide = df_img[df_img.scene == s_scene]
+ # NOTE(review): x is used as the first (row) index and y as the second (column) index below
+ x=tu_crop[1]
+ y=tu_crop[0]
+ img_dapi = skimage.io.imread(df_dapi[df_dapi.scene==s_scene].path[0])
+ a_crop = img_dapi[x:x+800,y:y+800]
+ # the DAPI quantiles are taken from the whole image, not from the crop
+ a_rescale_dapi = skimage.exposure.rescale_intensity(a_crop,in_range=(np.quantile(img_dapi,0.2),1.5*np.quantile(img_dapi,high_thresh)),out_range=(0, 255))
+ if 1.5*np.quantile(img_dapi,high_thresh) < low_thresh:
+ a_rescale_dapi = skimage.exposure.rescale_intensity(a_crop,in_range=(low_thresh/2,low_thresh),out_range=(0, 255))
+ elif len(es_dim.intersection(set(['DAPI'])))==1:
+ # the threshold is edited as text, e.g. 0.999 -> '0.9'
+ new_thresh = float(str(high_thresh)[:-2])
+ a_rescale_dapi = skimage.exposure.rescale_intensity(a_crop,in_range=(np.quantile(img_dapi,0.2),1.5*np.quantile(img_dapi,new_thresh)),out_range=(0, 255))
+ elif len(es_bright.intersection(set(['DAPI'])))==1:
+ # the threshold is edited as text, e.g. 0.999 -> '0.99999'
+ a_rescale_dapi = skimage.exposure.rescale_intensity(a_crop,in_range=(np.quantile(img_dapi,0.2),1.5*np.quantile(img_dapi,float(str(high_thresh) + '99'))),out_range=(0, 255))
+
+ #RGB
+ for s_type, ls_marker in d_overlay.items():
+ #print(s_type)
+ # start with DAPI in the third (blue) plane
+ zdh = np.dstack((np.zeros_like(a_rescale_dapi), np.zeros_like(a_rescale_dapi),a_rescale_dapi))
+ for idx, s_marker in enumerate(ls_marker):
+ #print(s_marker)
+ # raises IndexError when the scene has no image for s_marker
+ s_index = df_slide[df_slide.marker == s_marker].index[0]
+ img = skimage.io.imread(df_slide.loc[s_index,'path'])
+ a_crop = img[x:x+800,y:y+800]
+ # the marker quantiles are taken from the crop
+ in_range = (np.quantile(a_crop,0.2),1.5*np.quantile(a_crop,high_thresh))
+ a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range=in_range,out_range=(0, 255))
+ if 1.5*np.quantile(a_crop,high_thresh) < low_thresh:
+ #print('low thresh')
+ in_range=(low_thresh/2,low_thresh)
+ a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range=in_range,out_range=(0, 255))
+ elif len(es_dim.intersection(set([s_marker])))==1:
+ #print('dim')
+ new_thresh = float(str(high_thresh)[:-2])
+ in_range=(np.quantile(a_crop,0.2),1.5*np.quantile(a_crop,new_thresh))
+ a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range=in_range,out_range=(0, 255))
+ elif len(es_bright.intersection(set([s_marker])))==1:
+ #print('bright')
+ in_range=(np.quantile(a_crop,0.2),1.5*np.quantile(a_crop,float(str(high_thresh) + '99')))
+ a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range=in_range,out_range=(0, 255))
+
+ #print(f'low {int(in_range[0])} high {int(in_range[1])}')
+ # planes are stacked as (red, green, blue)
+ if idx == 0:
+ zdh = zdh + np.dstack((np.zeros_like(a_rescale), a_rescale,np.zeros_like(a_rescale)))
+
+ elif idx == 1:
+ zdh = zdh + np.dstack((a_rescale, a_rescale,np.zeros_like(a_rescale)))
+
+ elif idx == 2:
+ zdh = zdh + np.dstack((a_rescale, np.zeros_like(a_rescale),np.zeros_like(a_rescale) ))
+
+ elif idx == 3:
+ zdh = zdh + np.dstack((np.zeros_like(a_rescale), a_rescale, a_rescale))
+ #print(zdh.min())
+ # summed planes are limited to 0 - 255 before the conversion to uint8
+ zdh = zdh.clip(0,255)
+ zdh = zdh.astype('uint8')
+ #print(zdh.max())
+ d_result.update({s_type:(ls_marker,zdh)})
+ return(d_result)
+
+def roi_if_border(df_img,df_dapi,df_border,s_label='rounds',s_title='Title',tu_crop=(0,0,100,100),tu_array=(2,4),tu_fig=(10,20),tu_rescale=(0,0),i_expnorm=0,i_micron_per_pixel=.325):
+ """
+ create a grid of two color images (marker in the green channel, DAPI in the blue channel) with a white border drawn on top
+ df_img = dataframe of images with columns having image attributes
+ and index with image names
+ df_dapi = like df_img, but with the matching dapi images (only the first image in its index is read)
+ df_border: index is border image file name; the border image is looked up by the 'marker' column
+ s_label= attribute to label axes
+ s_title = figure title
+ tu_crop = (upper left corner x, y , xlength, yheight)
+ tu_array = subplot array dimensions
+ tu_fig = size of figure
+ tu_rescale= range of rescaling; (0,0) rescales each crop between its 0.03 quantile and 1.5 times its 0.998 quantile
+ i_expnorm = if greater than 0, the crop is divided by the image's 'exposure' and multiplied by this value
+ i_micron_per_pixel = only referenced in commented-out code
+ return = (fig, ax, a_crop_border): the figure, the flattened axes and the last cropped border image
+ """
+ # black to green colormap, used for the colorbar only
+ cmap = mpl.colors.LinearSegmentedColormap.from_list('cmap', [(0,0,0),(0,1,0)], N=256, gamma=1.0)
+ fig, ax = plt.subplots(tu_array[0],tu_array[1],figsize=tu_fig,sharey=True, squeeze=False) #
+ ax = ax.ravel()
+ for ax_num, s_index in enumerate(df_img.index):
+ s_col_label = df_img.loc[s_index,s_label]
+ #load image, copr, rescale
+ a_image=skimage.io.imread(s_index)
+ a_dapi = skimage.io.imread((df_dapi).index[0])# & (df_dapi.rounds=='R1')
+ a_crop = a_image[(tu_crop[1]):(tu_crop[1]+tu_crop[3]),(tu_crop[0]):(tu_crop[0]+tu_crop[2])]
+ a_crop_dapi = a_dapi[(tu_crop[1]):(tu_crop[1]+tu_crop[3]),(tu_crop[0]):(tu_crop[0]+tu_crop[2])]
+ #a_crop_dapi = (a_crop_dapi/255).astype('int')
+ if i_expnorm > 0:
+ a_crop = a_crop/df_img.loc[s_index,'exposure']*i_expnorm
+ if tu_rescale==(0,0):
+ a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range=(np.quantile(a_crop,0.03),1.5*np.quantile(a_crop,0.998)),out_range=(0, 255))
+ tu_max = (np.quantile(a_crop,0.03),1.5*np.quantile(a_crop,0.998))
+ else:
+ print(f'original {a_crop.min()},{a_crop.max()}')
+ print(f'rescale to {tu_rescale}')
+ a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range = tu_rescale,out_range=(0,255))
+ tu_max=tu_rescale
+ # DAPI is always rescaled between its 0.03 quantile and twice its 0.99 quantile
+ a_rescale_dapi = skimage.exposure.rescale_intensity(a_crop_dapi,in_range = (np.quantile(a_crop_dapi,0.03),2*np.quantile(a_crop_dapi,0.99)),out_range=(0,255))
+ a_rescale_dapi = a_rescale_dapi.astype(np.uint8)
+ a_rescale = a_rescale.astype(np.uint8)
+ #white border
+ # raises IndexError when df_border has no image for this marker
+ s_border_index = df_border[df_border.marker==(df_img.loc[s_index,'marker'])].index[0]
+ a_border = skimage.io.imread(s_border_index)
+ a_crop_border = a_border[(tu_crop[1]):(tu_crop[1]+tu_crop[3]),(tu_crop[0]):(tu_crop[0]+tu_crop[2])]
+ # border pixels are those with a value above 250
+ mask = a_crop_border > 250
+ #2 color png
+ zdh = np.dstack((np.zeros_like(a_rescale), a_rescale, a_rescale_dapi))
+ # all three planes are set to 255 where the mask is True
+ zdh[mask] = 255
+ #zdh = zdh.clip(0,255)
+ #zdh = zdh.astype('uint8')
+ ax[ax_num].imshow(zdh)
+ ax[ax_num].set_title('')
+ ax[ax_num].set_ylabel('')
+ ax[ax_num].set_xlabel(s_col_label,fontsize = 'x-large')
+ if tu_rescale == (0,0):
+ if len(ax)>1:
+ ax[ax_num].set_xlabel(f'{s_col_label} ({int(np.quantile(a_crop,0.03))} - {int(1.5*np.quantile(a_crop,0.998))})')
+ ax[ax_num].set_xticklabels('')
+ #pixel to micron (apply after ax is returned)
+ #ax[0].set_yticklabels([str(int(re.sub(u"\u2212", "-", item.get_text()))*i_micron_per_pixel) for item in ax[0].get_yticklabels(minor=False)])
+ plt.suptitle(s_title,y=0.93,size = 'xx-large',weight='bold')
+ plt.subplots_adjust(wspace=.05, hspace=.05)
+ # Now adding the colorbar
+ # the colorbar range is the last tu_max that was assigned
+ norm = mpl.colors.Normalize(vmin=tu_max[0],vmax=tu_max[1])
+ sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
+ sm.set_array([])
+ if len(ax) == 1:
+ cbaxes = fig.add_axes([.88, 0.125, 0.02, 0.75]) #[left, bottom, width, height]
+ plt.colorbar(sm, cax = cbaxes)
+ plt.figtext(0.47,0.03,s_label.replace('_',' '),fontsize = 'x-large', weight='bold')
+ elif tu_rescale != (0,0):
+ cbaxes = fig.add_axes([.92, 0.175, 0.02, 0.64]) #[left, bottom, width, height]
+ plt.colorbar(sm, cax = cbaxes)
+ plt.figtext(0.42,0.03,s_label.replace('_',' '),fontsize = 'x-large', weight='bold')
+ else:
+ # several panels with automatic ranges: no common colorbar is drawn
+ print("Different ranges - can't use colorbar")
+ plt.figtext(0.43,0.03,s_label.replace('_',' '),fontsize = 'x-large', weight='bold')
+
+ return(fig,ax,a_crop_border)
+
diff --git a/mplex_image/normalize.py b/mplex_image/normalize.py
new file mode 100755
index 0000000..2c03147
--- /dev/null
+++ b/mplex_image/normalize.py
@@ -0,0 +1,536 @@
+#from https://github.com/brentp/combat.py/blob/master/combat.py
+import patsy
+import sys
+import numpy.linalg as la
+import numpy as np
+import pandas as pd
+import sys
+import matplotlib.pyplot as plt
+
def aprior(gamma_hat):
    """Return ``(2 * var + mean**2) / var`` of ``gamma_hat``.

    ``gamma_hat`` must expose ``.mean()`` and ``.var()`` (numpy array or
    pandas Series). In this module the result is handed to ``it_sol`` as the
    ``a`` prior. A zero variance leads to a division by zero.
    """
    mean = gamma_hat.mean()
    variance = gamma_hat.var()
    numerator = 2 * variance + mean**2
    return numerator / variance
+
def bprior(gamma_hat):
    """Return ``(mean * var + mean**3) / var`` of ``gamma_hat``.

    ``gamma_hat`` must expose ``.mean()`` and ``.var()`` (numpy array or
    pandas Series). In this module the result is handed to ``it_sol`` as the
    ``b`` prior. A zero variance leads to a division by zero.
    """
    mean = gamma_hat.mean()
    variance = gamma_hat.var()
    numerator = mean * variance + mean**3
    return numerator / variance
+
def it_sol(sdat, g_hat, d_hat, g_bar, t2, a, b, conv=0.0001):
    """Iterate ``postmean``/``postvar`` until the estimates stop changing.

    Parameters
    ----------
    sdat : standardized data for one batch; rows are reduced with
        ``sum(axis=1)``, so one estimate per row is produced.
        (presumably a pandas.DataFrame of features x samples - confirm with caller)
    g_hat, d_hat : starting estimates of the per-row mean and variance
    g_bar, t2 : prior mean and prior variance forwarded to ``postmean``
    a, b : prior parameters forwarded to ``postvar``
    conv : stop once the largest relative change of either estimate is
        not greater than this value

    Returns
    -------
    (g_new, d_new) : the estimates from the last iteration
    """
    # number of non-NaN observations per row
    n_obs = (1 - np.isnan(sdat)).sum(axis=1)
    g_prev = g_hat.copy()
    d_prev = d_hat.copy()

    rel_change = 1
    while rel_change > conv:
        g_next = postmean(g_hat, g_bar, n_obs, d_prev, t2)
        # tile the new means across all columns so they can be subtracted from sdat
        g_column = g_next.values.reshape((g_next.shape[0], 1))
        resid = sdat - np.dot(g_column, np.ones((1, sdat.shape[1])))
        sum2 = (resid ** 2).sum(axis=1)
        d_next = postvar(sum2, n_obs, a, b)

        g_delta = (abs(g_next - g_prev) / g_prev).max()
        d_delta = (abs(d_next - d_prev) / d_prev).max()
        rel_change = max(g_delta, d_delta)
        g_prev, d_prev = g_next, d_next
    return (g_next, d_next)
+
def postmean(g_hat, g_bar, n, d_star, t2):
    """Weighted average of ``g_hat`` and ``g_bar``.

    ``g_hat`` is weighted by ``t2 * n`` and ``g_bar`` by ``d_star``; the
    result is divided by the sum of both weights. Works element-wise on
    scalars, numpy arrays or pandas objects.
    """
    data_weight = t2 * n
    weighted_sum = data_weight * g_hat + d_star * g_bar
    return weighted_sum / (data_weight + d_star)
+
def postvar(sum2, n, a, b):
    """Return ``(0.5 * sum2 + b) / (n / 2.0 + a - 1.0)``.

    ``sum2`` is the sum of squared residuals, ``n`` the number of
    observations and ``a``/``b`` the prior parameters. Works element-wise
    on scalars, numpy arrays or pandas objects.
    """
    numerator = 0.5 * sum2 + b
    denominator = n / 2.0 + a - 1.0
    return numerator / denominator
+
def design_mat(mod, numerical_covariates, batch_levels):
    """Build the design matrix: batch indicator columns plus covariates.

    Parameters
    ----------
    mod : pandas.DataFrame with a ``batch`` column and optional covariates
    numerical_covariates : iterable of integer positions (into ``mod``
        after ``batch`` is dropped) of the numerical covariate columns
    batch_levels : batch labels, in the order the indicator columns
        should appear

    Returns
    -------
    pandas.DataFrame whose first columns are the batch indicators, followed
    by the remaining (categorical) columns and then the numerical covariates.
    Progress messages are written to stderr.
    """
    # Passing the levels explicitly keeps the batch columns in the same
    # order as the rest of the script uses.
    formula = "~ 0 + C(batch, levels=%s)" % str(batch_levels)
    design = patsy.dmatrix(formula, mod, return_type="dataframe")

    mod = mod.drop(["batch"], axis=1)
    numerical_covariates = list(numerical_covariates)
    sys.stderr.write("found %i batches\n" % design.shape[1])

    # every column that is not listed as numerical is treated as categorical
    other_cols = []
    for position, column in enumerate(mod.columns):
        if position not in numerical_covariates:
            other_cols.append(column)
    design = pd.concat((design, mod[other_cols]), axis=1)

    sys.stderr.write("found %i numerical covariates...\n"
                     % len(numerical_covariates))
    for covariate_pos in numerical_covariates:
        cname = mod.columns[covariate_pos]
        sys.stderr.write("\t{0}\n".format(cname))
        design[cname] = mod[cname]

    sys.stderr.write("found %i categorical variables:" % len(other_cols))
    sys.stderr.write("\t" + ", ".join(other_cols) + '\n')
    return design
+
def combat(data, batch, model=None, numerical_covariates=None):
    """Correct for batch effects in a dataset (fit and apply in one call)

    Parameters
    ----------
    data : pandas.DataFrame
        A (n_features, n_samples) dataframe of the expression or methylation
        data to batch correct
    batch : pandas.Series
        A column corresponding to the batches in the data, with index same as
        the columns that appear in ``data``
    model : pandas.DataFrame, optional
        Metadata on the samples which could be causing batch effects.
        Only a ``pandas.DataFrame`` is used; any other value (including
        ``None``) is replaced by a model holding just ``batch``.
        NOTE: a ``batch`` column is written into the frame that is passed in.
    numerical_covariates : str or list-like, optional
        Covariates in the model which are numerical, rather than
        categorical (column names or integer positions)

    Returns
    -------
    corrected : pandas.DataFrame
        A (n_features, n_samples) dataframe of the batch-corrected data

    Notes
    -----
    Progress messages go to stderr, except "Adjusting data" which goes to
    stdout.
    """
    if isinstance(numerical_covariates, str):
        numerical_covariates = [numerical_covariates]
    if numerical_covariates is None:
        numerical_covariates = []

    if model is not None and isinstance(model, pd.DataFrame):
        model["batch"] = list(batch)
    else:
        model = pd.DataFrame({'batch': batch})

    # sample labels grouped per batch; the group order fixes the batch order used below
    batch_items = model.groupby("batch").groups.items()
    batch_levels = [k for k, v in batch_items]
    batch_info = [v for k, v in batch_items]
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))

    # drop intercept (any column that is 1 for every sample)
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    drop_cols = [cname for cname, inter in ((model == 1).all()).items() if inter == True]
    model = model[[c for c in model.columns if not c in drop_cols]]
    # covariates given by name are converted to column positions
    numerical_covariates = [list(model.columns).index(c) if isinstance(c, str) else c
                            for c in numerical_covariates if not c in drop_cols]

    design = design_mat(model, numerical_covariates, batch_levels)

    sys.stderr.write("Standardizing Data across genes.\n")
    # least-squares fit of the design matrix to every feature
    B_hat = np.dot(np.dot(la.inv(np.dot(design.T, design)), design.T), data.T)
    # batch-size weighted mean of the per-batch coefficients
    grand_mean = np.dot((n_batches / n_array).T, B_hat[:n_batch,:])
    # mean squared residual per feature
    var_pooled = np.dot(((data - np.dot(design, B_hat).T)**2), np.ones((int(n_array), 1)) / int(n_array))

    stand_mean = np.dot(grand_mean.T.reshape((len(grand_mean), 1)), np.ones((1, int(n_array))))
    # add the covariate contribution only: batch columns are zeroed out
    tmp = np.array(design.copy())
    tmp[:,:n_batch] = 0
    stand_mean += np.dot(tmp, B_hat).T

    s_data = ((data - stand_mean) / np.dot(np.sqrt(var_pooled), np.ones((1, int(n_array)))))

    sys.stderr.write("Fitting L/S model and finding priors\n")
    batch_design = design[design.columns[:n_batch]]
    gamma_hat = np.dot(np.dot(la.inv(np.dot(batch_design.T, batch_design)), batch_design.T), s_data.T)

    delta_hat = []

    for i, batch_idxs in enumerate(batch_info):
        delta_hat.append(s_data[batch_idxs].var(axis=1))

    gamma_bar = gamma_hat.mean(axis=1)
    t2 = gamma_hat.var(axis=1)

    a_prior = list(map(aprior, delta_hat))
    b_prior = list(map(bprior, delta_hat))

    sys.stderr.write("Finding parametric adjustments\n")
    gamma_star, delta_star = [], []
    for i, batch_idxs in enumerate(batch_info):
        temp = it_sol(s_data[batch_idxs], gamma_hat[i],
                      delta_hat[i], gamma_bar[i], t2[i], a_prior[i], b_prior[i])

        gamma_star.append(temp[0])
        delta_star.append(temp[1])

    sys.stdout.write("Adjusting data\n")
    # same object as s_data: the standardized frame is adjusted in place
    bayesdata = s_data
    gamma_star = np.array(gamma_star)
    delta_star = np.array(delta_star)

    for j, batch_idxs in enumerate(batch_info):

        dsq = np.sqrt(delta_star[j,:])
        dsq = dsq.reshape((len(dsq), 1))
        denom = np.dot(dsq, np.ones((1, n_batches[j])))
        numer = np.array(bayesdata[batch_idxs] - np.dot(batch_design.loc[batch_idxs], gamma_star).T)

        bayesdata[batch_idxs] = numer / denom

    # undo the standardization: scale by the pooled sd and add the mean back
    vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1))
    bayesdata = bayesdata * np.dot(vpsq, np.ones((1, int(n_array)))) + stand_mean

    return bayesdata
+
+#adapted from https://github.com/brentp/combat.py/blob/master/combat.py
+
+
def combat_fit(data, batch, model=None, numerical_covariates=None):
    """Fit the batch-correction parameters without adjusting the data

    Parameters
    ----------
    data : pandas.DataFrame
        A (n_features, n_samples) dataframe of the expression or methylation
        data to batch correct
    batch : pandas.Series
        A column corresponding to the batches in the data, with index same as
        the columns that appear in ``data``
    model : pandas.DataFrame, optional
        Metadata on the samples which could be causing batch effects.
        Only a ``pandas.DataFrame`` is used; any other value (including
        ``None``) is replaced by a model holding just ``batch``.
        NOTE: a ``batch`` column is written into the frame that is passed in.
    numerical_covariates : str or list-like, optional
        Covariates in the model which are numerical, rather than
        categorical (column names or integer positions)

    Returns
    -------
    gamma_star : centering parameters from combat fitting (list, one entry per batch)
    delta_star : scaling parameters from combat fitting (list, one entry per batch)
    stand_mean: pooled mean of batches (first sample column only, one value per feature)
    var_pooled: pooled variance of batches
    """
    if isinstance(numerical_covariates, str):
        numerical_covariates = [numerical_covariates]
    if numerical_covariates is None:
        numerical_covariates = []

    if model is not None and isinstance(model, pd.DataFrame):
        model["batch"] = list(batch)
    else:
        model = pd.DataFrame({'batch': batch})

    # sample labels grouped per batch; the group order fixes the batch order used below
    batch_items = model.groupby("batch").groups.items()
    batch_levels = [k for k, v in batch_items]
    batch_info = [v for k, v in batch_items]
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))

    # drop intercept (any column that is 1 for every sample)
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    drop_cols = [cname for cname, inter in ((model == 1).all()).items() if inter == True]
    model = model[[c for c in model.columns if not c in drop_cols]]
    # covariates given by name are converted to column positions
    numerical_covariates = [list(model.columns).index(c) if isinstance(c, str) else c
                            for c in numerical_covariates if not c in drop_cols]

    design = design_mat(model, numerical_covariates, batch_levels)

    sys.stderr.write("Standardizing Data across genes.\n")
    # least-squares fit of the design matrix to every feature
    B_hat = np.dot(np.dot(la.inv(np.dot(design.T, design)), design.T), data.T)
    # batch-size weighted mean of the per-batch coefficients
    grand_mean = np.dot((n_batches / n_array).T, B_hat[:n_batch,:])
    # mean squared residual per feature
    var_pooled = np.dot(((data - np.dot(design, B_hat).T)**2), np.ones((int(n_array), 1)) / int(n_array))

    stand_mean = np.dot(grand_mean.T.reshape((len(grand_mean), 1)), np.ones((1, int(n_array))))
    # add the covariate contribution only: batch columns are zeroed out
    tmp = np.array(design.copy())
    tmp[:,:n_batch] = 0
    stand_mean += np.dot(tmp, B_hat).T

    s_data = ((data - stand_mean) / np.dot(np.sqrt(var_pooled), np.ones((1, int(n_array)))))

    sys.stderr.write("Fitting L/S model and finding priors\n")
    batch_design = design[design.columns[:n_batch]]
    gamma_hat = np.dot(np.dot(la.inv(np.dot(batch_design.T, batch_design)), batch_design.T), s_data.T)

    delta_hat = []

    for i, batch_idxs in enumerate(batch_info):
        delta_hat.append(s_data[batch_idxs].var(axis=1))

    gamma_bar = gamma_hat.mean(axis=1)
    t2 = gamma_hat.var(axis=1)

    a_prior = list(map(aprior, delta_hat))
    b_prior = list(map(bprior, delta_hat))

    sys.stderr.write("Finding parametric adjustments\n")
    gamma_star, delta_star = [], []
    for i, batch_idxs in enumerate(batch_info):
        temp = it_sol(s_data[batch_idxs], gamma_hat[i],
                      delta_hat[i], gamma_bar[i], t2[i], a_prior[i], b_prior[i])

        gamma_star.append(temp[0])
        delta_star.append(temp[1])
    # just return one stand_mean array: the column of the first sample.
    # NOTE(review): with covariates in the model the columns of stand_mean
    # differ per sample, so this keeps only the first sample's mean - confirm intent.
    stand_mean = stand_mean[:,0]
    return(gamma_star, delta_star, stand_mean, var_pooled)
+
def combat_transform(data, batch, gamma_star, delta_star, stand_mean, var_pooled,model=None, numerical_covariates=None):
    """Correct for batch effects using previously fitted parameters

    Parameters
    ----------
    data : pandas.DataFrame
        A (n_features, n_samples) dataframe of the expression or methylation
        data to batch correct
    batch : pandas.Series
        A column corresponding to the batches in the data, with index same as
        the columns that appear in ``data``
    gamma_star : centering parameters from combat fitting
    delta_star : scaling parameters from combat fitting
    stand_mean: pooled mean of batches (one value per feature)
    var_pooled: pooled variance of batches
    model : pandas.DataFrame, optional
        Metadata on the samples which could be causing batch effects.
        Only a ``pandas.DataFrame`` is used; any other value (including
        ``None``) is replaced by a model holding just ``batch``.
        NOTE: a ``batch`` column is written into the frame that is passed in.
    numerical_covariates : str or list-like, optional
        Covariates in the model which are numerical, rather than
        categorical (column names or integer positions)

    Returns
    -------
    corrected : pandas.DataFrame
        A (n_features, n_samples) dataframe of the batch-corrected data

    Notes
    -----
    Row ``j`` of ``delta_star``/``gamma_star`` is applied to the ``j``-th
    batch of ``batch`` in groupby order.
    NOTE(review): this presumably requires the same batches, in the same
    order, as were used for fitting - confirm with callers.
    """
    #get design
    if isinstance(numerical_covariates, str):
        numerical_covariates = [numerical_covariates]
    if numerical_covariates is None:
        numerical_covariates = []

    if model is not None and isinstance(model, pd.DataFrame):
        model["batch"] = list(batch)
    else:
        model = pd.DataFrame({'batch': batch})
    # sample labels grouped per batch; the group order fixes the batch order used below
    batch_items = model.groupby("batch").groups.items()
    batch_levels = [k for k, v in batch_items]
    batch_info = [v for k, v in batch_items]
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))
    # drop intercept (any column that is 1 for every sample)
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    drop_cols = [cname for cname, inter in ((model == 1).all()).items() if inter == True]
    model = model[[c for c in model.columns if not c in drop_cols]]
    # covariates given by name are converted to column positions
    numerical_covariates = [list(model.columns).index(c) if isinstance(c, str) else c
                            for c in numerical_covariates if not c in drop_cols]
    design = design_mat(model, numerical_covariates, batch_levels)
    #standardize
    sys.stderr.write("Standardizing Data across genes.\n")

    #reshape stand mean: repeat the per-feature mean for every sample column of data
    stand_mean = np.dot(stand_mean.T.reshape((len(stand_mean), 1)), np.ones((1, int(data.shape[1]))))
    s_data = ((data - stand_mean) / np.dot(np.sqrt(var_pooled), np.ones((1, int(n_array)))))
    batch_design = design[design.columns[:n_batch]]
    # adjust data
    sys.stdout.write("Adjusting data\n")
    # same object as s_data: the standardized frame is adjusted in place
    bayesdata = s_data
    gamma_star = np.array(gamma_star)
    delta_star = np.array(delta_star)
    #for each batch
    for j, batch_idxs in enumerate(batch_info):

        dsq = np.sqrt(delta_star[j,:])
        dsq = dsq.reshape((len(dsq), 1))
        denom = np.dot(dsq, np.ones((1, n_batches[j]))) #divide by sqrt delta_star
        numer = np.array(bayesdata[batch_idxs] - np.dot(batch_design.loc[batch_idxs], gamma_star).T) #subtract gamma_star

        bayesdata[batch_idxs] = numer / denom
    #multiply by square root of variance and add mean
    vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1))
    bayesdata = bayesdata * np.dot(vpsq, np.ones((1, int(n_array)))) + stand_mean
    return bayesdata
+
+
def combat_fit_old(data, batch, model=None, numerical_covariates=None):
    """Fit the batch-correction parameters (variant returning only gamma/delta)

    Same computation as ``combat_fit`` in this module, but the pooled mean
    and variance are not returned.

    Parameters
    ----------
    data : pandas.DataFrame
        A (n_features, n_samples) dataframe of the expression or methylation
        data to batch correct
    batch : pandas.Series
        A column corresponding to the batches in the data, with index same as
        the columns that appear in ``data``
    model : pandas.DataFrame, optional
        Metadata on the samples which could be causing batch effects.
        Only a ``pandas.DataFrame`` is used; any other value (including
        ``None``) is replaced by a model holding just ``batch``.
        NOTE: a ``batch`` column is written into the frame that is passed in.
    numerical_covariates : str or list-like, optional
        Covariates in the model which are numerical, rather than
        categorical (column names or integer positions)

    Returns
    -------
    gamma_star : centering parameters from combat fitting (list, one entry per batch)
    delta_star : scaling parameters from combat fitting (list, one entry per batch)
    """
    if isinstance(numerical_covariates, str):
        numerical_covariates = [numerical_covariates]
    if numerical_covariates is None:
        numerical_covariates = []

    if model is not None and isinstance(model, pd.DataFrame):
        model["batch"] = list(batch)
    else:
        model = pd.DataFrame({'batch': batch})

    # sample labels grouped per batch; the group order fixes the batch order used below
    batch_items = model.groupby("batch").groups.items()
    batch_levels = [k for k, v in batch_items]
    batch_info = [v for k, v in batch_items]
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))

    # drop intercept (any column that is 1 for every sample)
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    drop_cols = [cname for cname, inter in ((model == 1).all()).items() if inter == True]
    model = model[[c for c in model.columns if not c in drop_cols]]
    # covariates given by name are converted to column positions
    numerical_covariates = [list(model.columns).index(c) if isinstance(c, str) else c
                            for c in numerical_covariates if not c in drop_cols]

    design = design_mat(model, numerical_covariates, batch_levels)

    sys.stderr.write("Standardizing Data across genes.\n")
    # least-squares fit of the design matrix to every feature
    B_hat = np.dot(np.dot(la.inv(np.dot(design.T, design)), design.T), data.T)
    # batch-size weighted mean of the per-batch coefficients
    grand_mean = np.dot((n_batches / n_array).T, B_hat[:n_batch,:])
    # mean squared residual per feature
    var_pooled = np.dot(((data - np.dot(design, B_hat).T)**2), np.ones((int(n_array), 1)) / int(n_array))

    stand_mean = np.dot(grand_mean.T.reshape((len(grand_mean), 1)), np.ones((1, int(n_array))))
    # add the covariate contribution only: batch columns are zeroed out
    tmp = np.array(design.copy())
    tmp[:,:n_batch] = 0
    stand_mean += np.dot(tmp, B_hat).T

    s_data = ((data - stand_mean) / np.dot(np.sqrt(var_pooled), np.ones((1, int(n_array)))))

    sys.stderr.write("Fitting L/S model and finding priors\n")
    batch_design = design[design.columns[:n_batch]]
    gamma_hat = np.dot(np.dot(la.inv(np.dot(batch_design.T, batch_design)), batch_design.T), s_data.T)

    delta_hat = []

    for i, batch_idxs in enumerate(batch_info):
        delta_hat.append(s_data[batch_idxs].var(axis=1))

    gamma_bar = gamma_hat.mean(axis=1)
    t2 = gamma_hat.var(axis=1)

    a_prior = list(map(aprior, delta_hat))
    b_prior = list(map(bprior, delta_hat))

    sys.stderr.write("Finding parametric adjustments\n")
    gamma_star, delta_star = [], []
    for i, batch_idxs in enumerate(batch_info):
        temp = it_sol(s_data[batch_idxs], gamma_hat[i],
                      delta_hat[i], gamma_bar[i], t2[i], a_prior[i], b_prior[i])

        gamma_star.append(temp[0])
        delta_star.append(temp[1])
    return(gamma_star, delta_star)
+
def combat_transform_old(data, batch, gamma_star, delta_star,model=None, numerical_covariates=None):
    """Correct for batch effects using previously fitted gamma/delta

    Unlike ``combat_transform`` in this module, the pooled mean and variance
    are re-estimated from ``data`` itself rather than passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        A (n_features, n_samples) dataframe of the expression or methylation
        data to batch correct
    batch : pandas.Series
        A column corresponding to the batches in the data, with index same as
        the columns that appear in ``data``
    gamma_star : centering parameters from combat fitting
    delta_star : scaling parameters from combat fitting
    model : pandas.DataFrame, optional
        Metadata on the samples which could be causing batch effects.
        Only a ``pandas.DataFrame`` is used; any other value (including
        ``None``) is replaced by a model holding just ``batch``.
        NOTE: a ``batch`` column is written into the frame that is passed in.
    numerical_covariates : str or list-like, optional
        Covariates in the model which are numerical, rather than
        categorical (column names or integer positions)

    Returns
    -------
    corrected : pandas.DataFrame
        A (n_features, n_samples) dataframe of the batch-corrected data

    Notes
    -----
    Row ``j`` of ``delta_star``/``gamma_star`` is applied to the ``j``-th
    batch of ``batch`` in groupby order.
    NOTE(review): this presumably requires the same batches, in the same
    order, as were used for fitting - confirm with callers.
    """
    #get design
    if isinstance(numerical_covariates, str):
        numerical_covariates = [numerical_covariates]
    if numerical_covariates is None:
        numerical_covariates = []

    if model is not None and isinstance(model, pd.DataFrame):
        model["batch"] = list(batch)
    else:
        model = pd.DataFrame({'batch': batch})
    # sample labels grouped per batch; the group order fixes the batch order used below
    batch_items = model.groupby("batch").groups.items()
    batch_levels = [k for k, v in batch_items]
    batch_info = [v for k, v in batch_items]
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))
    # drop intercept (any column that is 1 for every sample)
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    drop_cols = [cname for cname, inter in ((model == 1).all()).items() if inter == True]
    model = model[[c for c in model.columns if not c in drop_cols]]
    # covariates given by name are converted to column positions
    numerical_covariates = [list(model.columns).index(c) if isinstance(c, str) else c
                            for c in numerical_covariates if not c in drop_cols]
    design = design_mat(model, numerical_covariates, batch_levels)
    #standardize
    sys.stderr.write("Standardizing Data across genes.\n")
    # least-squares fit of the design matrix to every feature
    B_hat = np.dot(np.dot(la.inv(np.dot(design.T, design)), design.T), data.T)
    # batch-size weighted mean of the per-batch coefficients
    grand_mean = np.dot((n_batches / n_array).T, B_hat[:n_batch,:])
    # mean squared residual per feature
    var_pooled = np.dot(((data - np.dot(design, B_hat).T)**2), np.ones((int(n_array), 1)) / int(n_array))

    stand_mean = np.dot(grand_mean.T.reshape((len(grand_mean), 1)), np.ones((1, int(n_array))))
    # add the covariate contribution only: batch columns are zeroed out
    tmp = np.array(design.copy())
    tmp[:,:n_batch] = 0
    stand_mean += np.dot(tmp, B_hat).T
    s_data = ((data - stand_mean) / np.dot(np.sqrt(var_pooled), np.ones((1, int(n_array)))))
    batch_design = design[design.columns[:n_batch]]
    # adjust data
    sys.stdout.write("Adjusting data\n")
    # same object as s_data: the standardized frame is adjusted in place
    bayesdata = s_data
    gamma_star = np.array(gamma_star)
    delta_star = np.array(delta_star)
    #for each batch
    for j, batch_idxs in enumerate(batch_info):

        dsq = np.sqrt(delta_star[j,:])
        dsq = dsq.reshape((len(dsq), 1))
        denom = np.dot(dsq, np.ones((1, n_batches[j]))) #divide by sqrt delta_star
        numer = np.array(bayesdata[batch_idxs] - np.dot(batch_design.loc[batch_idxs], gamma_star).T) #subtract gamma_star

        bayesdata[batch_idxs] = numer / denom
    #multiply by square root of variance and add mean
    vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1))
    bayesdata = bayesdata * np.dot(vpsq, np.ones((1, int(n_array)))) + stand_mean
    return bayesdata
+
def plot_histograms(df_norm,df,s_train,s_tissue):
    '''
    for each marker, return a histogram of training data and transformed data (df_norm)

    Input:
     df_norm = transformed data; needs a `batch` column, every float64 column
       is treated as a marker
     df = raw data with the same marker columns; rows belong to a batch when
       the index label contains the batch name (literal match, not a regex)
     s_train, s_tissue = unused, kept so existing calls keep working
    Output: dictionary of marker name: figure (figures are closed, not shown)
    '''
    bins=50
    d_fig = {}
    ls_batch = sorted(set(df_norm.batch))
    for s_marker in df_norm.columns[df_norm.dtypes=='float64']:
        print(s_marker)
        fig,ax=plt.subplots(2,1,figsize = (3,4))
        for idxs, s_batch in enumerate(ls_batch):
            se_batch = df_norm.loc[df_norm.batch==s_batch,s_marker]
            #skip batches that have no values for this marker
            if len(se_batch.dropna()) == 0:
                continue
            #regex=False: batch names are literal text and may contain
            #characters such as + ( ) . that a regex would misinterpret
            b_raw = df.index.str.contains(s_batch, regex=False)
            ax[0].hist(df.loc[b_raw,s_marker],bins=bins,alpha=0.4, color=f'C{idxs}')
            ax[1].hist(se_batch,bins=bins,alpha=0.4, color=f'C{idxs}',label=s_batch)
        ax[0].set_yscale('log')
        ax[1].set_yscale('log')
        ax[0].set_title(f'{s_marker.split("_")[0]}: Raw Data')
        ax[1].set_title(f'{s_marker.split("_")[0]}: Combat')
        ax[1].legend()
        fig.tight_layout()
        #close this figure so that it is not displayed and many markers do not
        #pile up open figures; it stays usable through d_fig
        plt.close(fig)
        d_fig.update({s_marker:fig})
    return(d_fig)
\ No newline at end of file
diff --git a/mplex_image/ometiff.py b/mplex_image/ometiff.py
new file mode 100755
index 0000000..9986c6d
--- /dev/null
+++ b/mplex_image/ometiff.py
@@ -0,0 +1,76 @@
+####
+# title: mpimage.py
+#
+# language: Python3.6
+# date: 2019-05-00
+# license: GPL>=v3
+# author: Jenny
+#
+# description:
+# python3 library to display, normalize and crop multiplex images
+####
+
+#libraries
+import matplotlib as mpl
+mpl.use('agg')
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import skimage
+import pandas as pd
+#import bioformats
+import re
+import shutil
+from itertools import chain
+import matplotlib.ticker as ticker
+
+os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF/')
+from apeer_ometiff_library import omexmlClass
+
+#functions
+
def gen_xml(array, channel_names):
    '''
    build OME-XML metadata for a 5 dimensional image array
    (copied and modified from the apeer ome tiff library)
    Input:
     array = image array with 5 dimensions, assumed to be ordered TZCYX
     channel_names = one name per channel, indexed by channel number
    Output: the metadata XML, encoded to bytes
    '''
    #Dimension order is assumed to be TZCYX
    dim_order = "TZCYX"

    metadata = omexmlClass.OMEXML()
    shape = array.shape
    assert ( len(shape) == 5), "Expected array of 5 dimensions"
    i_size_t, i_size_z, i_size_c, i_size_y, i_size_x = shape

    metadata.image().set_Name("IMAGE")
    metadata.image().set_ID("0")

    pixels = metadata.image().Pixels
    pixels.ome_uuid = metadata.uuidStr
    pixels.set_ID("0")

    pixels.channel_count = i_size_c

    pixels.set_SizeT(i_size_t)
    pixels.set_SizeZ(i_size_z)
    pixels.set_SizeC(i_size_c)
    pixels.set_SizeY(i_size_y)
    pixels.set_SizeX(i_size_x)

    #the order is written reversed, i.e. XYCZT
    pixels.set_DimensionOrder(dim_order[::-1])

    pixels.set_PixelType(omexmlClass.get_pixel_type(array.dtype))

    #name every channel
    for i_channel in range(pixels.SizeC):
        pixels.Channel(i_channel).set_ID("Channel:0:" + str(i_channel))
        pixels.Channel(i_channel).set_Name(channel_names[i_channel])

    #one sample per pixel in every channel
    for i_channel in range(pixels.SizeC):
        pixels.Channel(i_channel).set_SamplesPerPixel(1)

    pixels.populate_TiffData()

    return metadata.to_xml().encode()
diff --git a/mplex_image/preprocess.py b/mplex_image/preprocess.py
new file mode 100755
index 0000000..a54e54b
--- /dev/null
+++ b/mplex_image/preprocess.py
@@ -0,0 +1,705 @@
+####
+# title: preprocess.py
+#
+# language: Python3.6
+# date: 2019-06-00
+# license: GPL>=v3
+# author: Jenny
+#
+# description:
+# python3 library to prepare images and other inputs for guillaumes segmentation software
+####
+
+#libraries
+import pandas as pd
+import matplotlib as mpl
+mpl.use('agg')
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import skimage
+import shutil
+import re
+
+#set src path (CHANGE ME)
+s_src_path = '/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF'
+s_work_path = '/home/groups/graylab_share/Chin_Lab/ChinData/Work/engje'
+
+# function
+# import importlib
+# importlib.reload(preprocess)
+
def check_names(df_img,s_type='tiff'):
    """
    (CHANGE ME)
    Checks the marker names in df_img against the standard list of
    biomarkers hard-coded below, prints the wrong and the right names
    Input:
     df_img = dataframe of images; needs a `marker` column if s_type='tiff'
       (one marker name per row) or a `markers` column if s_type='czi'
       (dot separated marker names per row)
     s_type = 'tiff' or 'czi'
    Output: set of names found in df_img that are not in the standard list
    Raises: ValueError if s_type is neither 'tiff' nor 'czi'
    """
    if s_type == 'tiff':
        es_names = set(df_img.marker)
    elif s_type == 'czi':
        lls_marker = [item.split('.') for item in df_img.markers]
        es_names = set([item for sublist in lls_marker for item in sublist])
    else :
        #previously only printed 'Unknown type' and then crashed on the
        #undefined es_names below; fail early with a clear message instead
        raise ValueError(f"Unknown type {s_type!r}: expected 'tiff' or 'czi'")
    es_standard = {'DAPI','PDL1','pERK','CK19','pHH3','CK14','Ki67','Ecad','PCNA','HER2','ER','CD44',
     'aSMA','AR','pAKT','LamAC','CK5','EGFR','pRB','FoxP3','CK7','PDPN','CD4','PgR','Vim',
     'CD8','CD31','CD45','panCK','CD68','PD1','CD20','CK8','cPARP','ColIV','ColI','CK17',
     'H3K4','gH2AX','CD3','H3K27','53BP1','BCL2','GRNZB','LamB1','pS6RP','BAX','RAD51',
     'R0c2','R0c3','R0c4','R0c5','R5Qc2','R5Qc3','R5Qc4','R5Qc5','R11Qc2','R11Qc3','R11Qc4','R11Qc5',
     'R7Qc2','R7Qc3','R7Qc4','R7Qc5','PDL1ab','PDL1d','R14Qc2','R14Qc3','R14Qc4','R14Qc5',
     'R8Qc2','R8Qc3','R8Qc4','R8Qc5','R12Qc2','R12Qc3','R12Qc4','R12Qc5','PgRc4','R1c2','CCND1',
     'Glut1','CoxIV','LamB2','S100','BMP4','BMP2','BMP6','pS62MYC', 'CGA', 'p63', 'SYP','PDGFRa', 'HIF1a','CC3',
     'MUC1','CAV1','MSH2','CSF1R','R13Qc4', 'R13Qc5', 'R13Qc3', 'R13Qc2','R10Qc2','R10Qc3','R10Qc4','R10Qc5',
     'R6Qc2', 'R6Qc3','R6Qc4', 'R6Qc5', 'TUBB3', 'CD90', 'GATA3'}#,'PDGFRB'CD66b (Neutrophils)
    #HLA class II or CD21(Dendritic cells)
    #BMP4 Fibronectin, CD11b (dendritic, macrophage/monocyte/granulocyte) CD163 (macrophages)
    #CD83 (dendritic cells) FAP
    es_wrong = es_names - es_standard
    es_right = es_standard.intersection(es_names)
    print(f'Wrong names {es_wrong}')
    print(f' Right names {es_right}')
    return(es_wrong)
+
def copy_dapis(s_r_old='-R11_',s_r_new='-R91_',s_c_old='_c1_',s_c_new='_c2_',s_find='_c1_ORG.tif',b_test=True,type='org'):
    """
    copy the dapi images of one round (files in the current working directory),
    naming the copies with a new round and color
    Input:
     s_r_old = old round
     s_r_new = new round on copied DAPI
     s_c_old = old color
     s_c_new = new color on copied DAPI
     s_find = how to identify dapis i.e. '_c1_ORG.tif'
     b_test = True if testing only (print the new names, copy nothing)
     type = 'org' also replaces the second '_' separated field of the name
       with DAPI<round number> repeated four times; anything else keeps it
    """
    #digits of the old round, e.g. '11'
    i_dapi = re.sub("[^0-9]", "", s_r_old)
    s_dapi_field = f'DAPI{i_dapi}.DAPI{i_dapi}.DAPI{i_dapi}.DAPI{i_dapi}'
    ls_test = []
    for s_file in os.listdir():
        #only dapi images of the old round
        if s_find not in s_file or s_r_old not in s_file:
            continue
        s_file_dapi = s_file.replace(s_r_old,s_r_new).replace(s_c_old,s_c_new)
        if type=='org':
            s_file_dapi = s_file_dapi.replace(s_file_dapi.split("_")[1],s_dapi_field)
        ls_test.append(s_file)
        print(f'copied file {s_file} \t and named {s_file_dapi}')
        if not b_test:
            shutil.copyfile(s_file, s_file_dapi)

    print(f'total number of files changed is {len(ls_test)}')
+
def copy_markers(df_img, s_original = 'panCK', ls_copy = ['CK19','CK5','CK7','CK14'],i_last_round = 97, b_test=True, type = 'org'):
    """
    copy the images of one marker once per entry of ls_copy, naming each copy
    with a new round number and the fake marker name
    Input:
     df_img = dataframe with images: index is the file name, needs the
       columns `marker` and `rounds`
     s_original = marker to copy
     ls_copy = list of fake channels to make
     i_last_round = the copies are numbered i_last_round + 1, + 2, ...
     b_test = True if testing only (print the new names, copy nothing)
     type = 'org' replaces the second '_' separated field of the name with
       the fake marker repeated four times; anything else replaces
       _<s_original>_ with _<fake marker>_
    """
    ls_test = []
    for s_index in df_img[df_img.marker==s_original].index:
        for idx, s_copy in enumerate(ls_copy):
            i_round = i_last_round + 1 + idx
            s_round = df_img.loc[s_index,'rounds']
            #round name without its digits, e.g. 'R5' gives 'R'
            s_round_pre = s_round.replace(re.sub("[^0-9]", "", s_round),'')
            s_file_new = s_index.replace(s_round,f'{s_round_pre}{i_round}')
            s_file_new = s_file_new.replace(f'_{s_round}_',f'_c{i_round}_')
            if type == 'org':
                s_file_dapi = s_file_new.replace(s_file_new.split("_")[1],f'{s_copy}.{s_copy}.{s_copy}.{s_copy}')
            else:
                s_file_dapi = s_file_new.replace(f'_{s_original}_',f'_{s_copy}_')
            ls_test.append(s_index)
            print(f'copied file {s_index} \t and named {s_file_dapi}')
            if not b_test:
                shutil.copyfile(s_index, s_file_dapi)
    print(f'total number of files changed is {len(ls_test)}')
+
def dchange_fname(d_rename={'_oldstring_':'_newstring_'},b_test=True):
    """
    replace anything in the names of the files in the current working directory
    Input:
     d_rename = dictionary of key = old string, value = new string
       e.g. {'Registered-R11_CD34.AR.':'Registered-R11_CD34.ARcst.','FoxP3b':'FoxP3bio'}
     b_test = True if testing only (report what would change, rename nothing)
    For every key the last matching file and the number of matches are printed
    """
    for s_old, s_new in d_rename.items():
        ls_test = []
        for s_file in os.listdir():
            if s_old not in s_file:
                continue
            s_file_print = s_file
            ls_test.append(s_file)
            s_file_new = s_file.replace(s_old,s_new)
            #really rename
            if not b_test:
                os.rename(s_file, s_file_new)
        if ls_test:
            print(f'changed file {s_file_print}\tto {s_file_new}')
        print(f'total number of files changed is {len(ls_test)}')
+
def csv_change_fname(i_scene_len=2, b_test=True):
    '''
    rename files with wrong scene numbers, based on FinalSceneNumbers.csv
    in the current working directory
    the csv has one column per wrong round, named wrong_<round number>, holding
    the wrong scene, and a column `correct` holding the correct scene
    matching files (name contains R<round number>_ and -Scene-<wrong scene>)
    are moved to ./Renamed, the correct scene is added after the wrong
    one, as +correct
    make the Renamed folder before running with b_test=False
    Input:
     i_scene_len = 2 or 3: scene numbers are zero padded to this many digits
       (for 3, 'nan' and a trailing '.0' are removed first);
       anything else: scene numbers are used as read
     b_test = True if testing only (print the new names, rename nothing)
    Output: the scene numbers as strings, before zero padding
    '''
    df_test = pd.read_csv('FinalSceneNumbers.csv',header=0)
    df_test = df_test.astype(str)#(works!)
    #column-wise Series.map is used for padding: DataFrame.applymap is
    #deprecated since pandas 2.1, and DataFrame.map does not exist before 2.1
    if i_scene_len == 2:
        df_scene = df_test.apply(lambda se_col: se_col.map('{:0>2}'.format))
    elif i_scene_len == 3:
        df_test = df_test.replace('nan','')
        #raw string: "\." in a normal string literal is an invalid escape sequence
        df_test = df_test.replace(to_replace = r"\.0+$",value = "", regex = True)
        df_scene = df_test.apply(lambda se_col: se_col.map('{:0>3}'.format))
    else:
        df_scene = df_test
    #for each round with wrong names
    for s_wrong in df_scene.columns[df_scene.columns.str.contains('wrong')]:
        s_round = f'R{s_wrong.split("_")[1]}_'
        for s_file in os.listdir():
            #find files in that round
            if s_file.find(s_round) > -1:
                #for each scene
                for s_index in df_scene.index:
                    s_wrong_scene = df_scene.loc[s_index,s_wrong]
                    if s_file.find(f'-Scene-{s_wrong_scene}') > -1:
                        s_correct = df_scene.loc[s_index,'correct']
                        print(s_correct)
                        s_replace = s_file.replace(f'-Scene-{s_wrong_scene}', f'-Scene-{s_wrong_scene}+{s_correct}')
                        s_file_new = f"./Renamed/{s_replace}"

                        if b_test:
                            print(f'changed file {s_file} to {s_file_new}')
                        else:
                            os.rename(s_file, s_file_new)
                            print(f'changed file {s_file} to {s_file_new}')
    return(df_test)
+
def check_seg_markers(df_img,d_segment = {'CK19':1002,'CK5':5002,'CD45':2002,'Ecad':802,'CD44':1202,'CK7':2002,'CK14':502}, i_rows=1, t_figsize=(20,10)):
    """
    This script makes binarized overviews of all the specified segmentation markers
    with specified thresholds, one figure per marker
    Input: df_img: dataframe of images; index is the image file name, needs a
             `marker` column (presumably the output of mpimage.parse_org() - TODO confirm)
           d_segment: segmentation marker names and thresholds
           i_rows = number of rows in figure
           t_figsize = (x, y) in inches size of figure
    Output: dictionary of marker name: figure
            (markers without any image in df_img are skipped)
    """
    d_result = {}
    for s_key,i_item in d_segment.items():
        #find all segmentation marker slides
        df_img_seg = df_img[df_img.marker==s_key]
        if len(df_img_seg) == 0:
            #plt.subplots raises an error for a grid with zero columns
            print(f'No images found for {s_key}')
            continue
        fig,ax = plt.subplots(i_rows,(len(df_img_seg)+(i_rows-1))//i_rows, figsize = t_figsize, squeeze=False)
        ax = ax.ravel()
        for idx,s_scene in enumerate(sorted(df_img_seg.index.tolist())):
            print(f'Processing {s_scene}')
            im_low = skimage.io.imread(s_scene)
            #intensity window of width 1 at the threshold, to binarize the image
            im = skimage.exposure.rescale_intensity(im_low,in_range=(i_item,i_item+1))
            ax[idx].imshow(im, cmap='gray')
            #the text between 'Scene' and the next '_' in the file name
            s_round = s_scene.split('Scene')[1].split('_')[0]
            ax[idx].set_title(f'{s_key} Scene{s_round} min={i_item}',{'fontsize':12})
        plt.tight_layout()
        d_result.update({s_key:fig})
    return(d_result)
+
def checkall_seg_markers(df_img,d_segment = {'CK19':1002,'CK5':5002,'CD45':2002,'Ecad':802,'CD44':1202,'CK7':2002,'CK14':502}, i_rows=2, t_figsize=(15,10)):
    """
    This script makes binarized overviews of all the specified segmentation markers
    with specified thresholds, and it puts all segmentation markers in one figure
    Input: df_img: dataframe of images; index is the image file name, needs a
             `marker` column (presumably the output of mpimage.parse_org() - TODO confirm)
           d_segment: segmentation marker names and thresholds
           i_rows = number of rows in figure
           t_figsize = (x, y) in inches size of figure
    Output: the figure, one panel per image
    """
    es_seg = set([s_key for s_key,i_item in d_segment.items()])
    df_img_seg = df_img[df_img.marker.isin(es_seg)]
    #one panel per image is needed; the grid used to be sized by the number of
    #markers only, which ran out of axes when a marker had more than one image
    i_panels = max(len(es_seg), len(df_img_seg))
    fig,ax = plt.subplots(i_rows,(i_panels+(i_rows-1))//i_rows, figsize = t_figsize, squeeze=False)
    ax = ax.ravel()
    for idx,s_scene in enumerate(sorted(df_img_seg.index.tolist())):
        s_key = df_img.loc[s_scene].marker
        i_item = d_segment[s_key]
        print(f'Processing {s_scene}')
        im_low = skimage.io.imread(s_scene)
        #intensity window of width 1 at the threshold, to binarize the image
        im = skimage.exposure.rescale_intensity(im_low,in_range=(i_item,i_item+1))
        ax[idx].imshow(im, cmap='gray')
        #the text between 'Scene' and the next '_' in the file name
        s_round = s_scene.split('Scene')[1].split('_')[0]
        ax[idx].set_title(f'{s_key} Scene{s_round} min={i_item}',{'fontsize':12})
    plt.tight_layout()
    return(fig)
+
def rounds_cycles(s_find='-Scene-001_c', d_segment = {'CK19':1002,'CK5':5002,'CD45':4502,'Ecad':802,'CD44':1202,'CK7':2002,'CK14':502}):
    """
    Based on filenames in the current working directory, make a dataframe
    with Rounds Cycles Info.
    File names are split on '_': field 0 is 'Registered-<round>', field 1 is
    the '.'-separated marker list and field 3 is the channel (c1 ... c5).
    Channel c1 is skipped; c2-c5 select marker 1-4 of the marker list.
    Input: s_find = substring selecting the files of one scene
           d_segment = marker names and thresholds written to 'minimum' (read only)
    Output: (df_dapi, ls_marker) = dataframe indexed by marker, and the list
            of markers in file order
    """
    ls_col = ['rounds','colors','minimum','maximum','exposure','refexp','location']
    d_position = {'c2':0,'c3':1,'c4':2,'c5':3}
    ls_marker = []
    ls_record = []
    for s_name in sorted(os.listdir()):
        if s_find not in s_name:
            continue
        ls_part = s_name.split('_')
        s_color = ls_part[3]
        if s_color == 'c1':
            continue
        if s_color in d_position:
            s_marker = ls_part[1].split('.')[d_position[s_color]]
        else:
            print('Error: unrecognized channel name')
            s_marker = 'error'
        ls_marker.append(s_marker)
        ls_record.append({'rounds':ls_part[0].split('Registered-')[1],
                          'colors':s_color,
                          'minimum':1003,
                          'maximum':65535,
                          'exposure':100,
                          'refexp':100,
                          'location':'All'})
    #build the table once (DataFrame.append no longer exists in pandas 2);
    #object dtype matches what the row-by-row construction used to produce
    if len(ls_record) > 0:
        df_dapi = pd.DataFrame(ls_record, index=ls_marker, columns=ls_col, dtype=object)
    else:
        df_dapi = pd.DataFrame()
    for s_key,i_item in d_segment.items():
        #creates the row if the marker was not found in any file name
        df_dapi.loc[s_key,'minimum'] = i_item
    #report every repeated marker once
    es_seen = set()
    es_repeat = set()
    for s_marker in ls_marker:
        if s_marker in es_seen and s_marker not in es_repeat:
            es_repeat.add(s_marker)
            print('Repeated marker!\n')
            print(s_marker)
        es_seen.add(s_marker)

    return(df_dapi, ls_marker)
+
def cluster_java(s_dir='JE1',s_sample='SampleID',imagedir='PathtoImages',segmentdir='PathtoSegmentation',type='exacloud',b_segment=True,b_TMA=True):
    """
    Fill in the Cluster.java template for one sample and write Cluster.java.
    The template is read from <s_work_path>/<type>/ (s_work_path is a
    module-level name defined outside this function) and the result is written
    to the s_dir subfolder of that directory. The working directory is changed
    along the way and is left in the s_dir subfolder.
    s_dir = directory to make Cluster.java file in
    s_sample = unique sample ID (text before '-Scene' names the output folders)
    imagedir = full /path/to/images
    segmentdir = folder holding the <sample>_Segmentation/ and <sample>_Features/ folders
    type = 'exacloud' or 'eppec' (selects the template)
    b_TMA = True if tissue is a TMA
    b_segment = True if segmentation is being done (or False if feature extraction only)
    Raises: ValueError if type is neither 'exacloud' nor 'eppec'
    """
    d_template = {'exacloud':'TemplateExacloudCluster.java',
                  'eppec':'TemplateEppecCluster.java'}
    if type not in d_template:
        #fail before touching the file system instead of hitting an undefined s_file later
        raise ValueError('Error: type must be exacloud or eppec')
    os.chdir(f'{s_work_path}/{type}/')
    with open(d_template[type]) as f:
        s_file = f.read()
    s_slide = s_sample.split("-Scene")[0]
    s_file = s_file.replace('PathtoImages',imagedir)
    s_file = s_file.replace('PathtoSegmentation',f'{segmentdir}/{s_slide}_Segmentation/')
    s_file = s_file.replace('PathtoFeatures',f'{segmentdir}/{s_slide}_Features/')
    if b_segment:
        #un-comment the segmentation section of the template
        s_file = s_file.replace('/*cif.Experiment','cif.Experiment')
        s_file = s_file.replace('("Segmentation Done!") ;*/','("Segmentation Done!") ;')
    if b_TMA:
        s_file = s_file.replace('cif.CROPS ;','cif.TMA ;')
    os.chdir(f'./{s_dir}/')
    with open('Cluster.java', 'w') as f:
        f.write(s_file)
+
def registration_matlab(N_smpl='10000',N_colors='5',s_rootdir='PathtoImages',s_subdirname='RegisteredImages/',s_ref_id='./R1_*_c1_ORG.tif',
    ls_order = ['R1','R2','R3','R4','R5','R6','R7','R8','R9','R10','R11','R0','R11Q']):
    """
    Fill in the matlab registration template and write registration_py.m into s_rootdir.
    The template is read from <s_src_path>/src (s_src_path is a module-level
    name defined outside this function). The working directory is changed and
    is left at s_rootdir.
    Input:
        N_smpl = number of features to detect in image (default = 10000); str or int
        N_colors = number of colors in R1 (default = 5); str or int
        ls_order = list of names and order of rounds (read only)
        s_rootdir = location of raw images in folder
        s_ref_id = shared unique identifier of reference dapi
        s_subdirname = location of folder where registered images will reside
    """
    #template placeholder -> replacement text, applied in this order
    #str() lets callers pass the counts as integers as well as strings
    ls_replace = [('PathtoImages',s_rootdir),
                  ('PathtoRegisteredImages',s_subdirname),
                  ('i_N_smpl',str(N_smpl)),
                  ('i_N_colors',str(N_colors)),
                  ("RoundOrderString",",".join([f"'{item}'" for item in ls_order])),
                  ('RefDapiUniqueID',s_ref_id)]
    #find template, open, edit
    os.chdir(f'{s_src_path}/src')
    with open('template_registration_server_multislide_roundorder_scenes_2019_11_11.m') as f:
        s_file = f.read()
    for s_placeholder, s_value in ls_replace:
        s_file = s_file.replace(s_placeholder,s_value)

    #save edited .m file
    os.chdir(s_rootdir)
    with open('registration_py.m', 'w') as f:
        f.write(s_file)
+
def large_registration_matlab(N_smpl='10000',N_colors='5',s_rootdir='PathtoImages',s_subdirname='RegisteredImages',s_ref_id='./R1_*_c1_ORG.tif',
    ls_order = ['R1','R2','R3','R4','R5','R6','R7','R8','R9','R10','R11','R0','R11Q'],d_crop_regions={1:'[0 0 1000 1000]'}):
    """
    Fill in the large-image matlab registration template and write
    registration_py.m into s_rootdir.
    The template is read from <s_src_path>/src (s_src_path is a module-level
    name defined outside this function). The working directory is changed and
    is left at s_rootdir.
    Input:
        N_smpl = number of features to detect in image (default = 10000); str or int
        N_colors = number of colors in R1 (default = 5); str or int
        ls_order = list of names and order of rounds (read only)
        s_rootdir = location of raw images in folder
        s_ref_id = shared unique identifier of reference dapi
        s_subdirname = location of folder where registered images will reside
        d_crop_regions = dictionary with crop integer as key, and string with crop
            array as value e.g. {1:'[0 0 1000 1000]'} (read only)
    """
    #template placeholder -> replacement text, applied in this order
    #str() lets callers pass the counts as integers as well as strings
    ls_replace = [('PathtoImages',s_rootdir),
                  ('PathtoRegisteredImages',s_subdirname),
                  ('i_N_smpl',str(N_smpl)),
                  ('i_N_colors',str(N_colors)),
                  ("RoundOrderString",",".join([f"'{item}'" for item in ls_order])),
                  ('RefDapiUniqueID',s_ref_id)]
    for i_crop_region, s_crop in d_crop_regions.items():
        #remove the %N%N% marker in front of the lines of crop N, then insert the crop array
        ls_replace.append((f'%{i_crop_region}%{i_crop_region}%',''))
        ls_replace.append((f'[a_crop_{i_crop_region}]',str(s_crop)))

    os.chdir(f'{s_src_path}/src')
    with open('template_registration_server_largeimages_roundorder_2019_11_11.m') as f:
        s_file = f.read()
    for s_placeholder, s_value in ls_replace:
        s_file = s_file.replace(s_placeholder,s_value)

    #save edited .m file
    os.chdir(s_rootdir)
    with open('registration_py.m', 'w') as f:
        f.write(s_file)
+
def cmif_mkdir(ls_dir):
    '''
    check if directories exist. if not, make them (including parents)
    Input: ls_dir = list of directory paths; a single path is also accepted
    Paths that already exist (as a directory or as a file) are left alone.
    '''
    if isinstance(ls_dir, (str, bytes, os.PathLike)):
        #a bare path would otherwise be iterated character by character
        ls_dir = [ls_dir]
    for s_dir in ls_dir:
        if not os.path.exists(s_dir):
            #exist_ok: no error if the folder appears between the check and the call
            os.makedirs(s_dir, exist_ok=True)
+
+######################### Old functions ############################
+
def check_reg_channels(ls_find=['c1_ORG','c2_ORG'], i_rows=2, t_figsize=(20,10), b_separate = False, b_mkdir=True):
    """
    Make overviews of all the specified channel images of registered tiff images
    in the current working directory (slides prepared for segmentation for example).
    One png per scene and channel is saved in a Check_Registration folder.
    Input: ls_find = list of channels to view (substrings of the file names; read only)
           i_rows = number of rows in figure
           t_figsize = (x, y) in inches size of figure
           b_separate = if True, Check_Registration lives in the parent directory,
                 otherwise in the working directory
           b_mkdir = boolean whether to make the Check_Registration folder
                 (an existing folder is reused)
    Output: dictionary with {slide_color:number of rounds found}
            images of all rounds of a certain slide_color
    """
    d_result = {}
    ls_error = []
    if b_separate:
        s_path = os.path.dirname(os.getcwd())
    else:
        s_path = os.getcwd()
    if b_mkdir:
        os.makedirs(f'{s_path}/Check_Registration', exist_ok=True)
    for s_find in ls_find:
        #find all images of this channel
        ls_dapis = [s_file for s_file in os.listdir() if s_find in s_file]

        #find all unique scenes (third '_' separated field of the file name)
        ls_scene = sorted(set(s_dapi.split('_')[2] for s_dapi in ls_dapis))

        for s_scene in ls_scene:
            print(f'Processing {s_scene}')
            ls_dapi = sorted(s_file for s_file in ls_dapis if s_scene in s_file)
            #squeeze=False keeps ax an array even for a single panel
            fig,ax = plt.subplots(i_rows,(len(ls_dapi)+(i_rows-1))//i_rows, figsize = t_figsize, squeeze=False)
            ax = ax.ravel()
            for x, s_file in enumerate(ls_dapi):
                im_low = skimage.io.imread(s_file)
                #display range: 2nd percentile up to 1.5 x the 98th percentile
                f_high = np.quantile(im_low,0.98)
                im = skimage.exposure.rescale_intensity(im_low,in_range=(np.quantile(im_low,0.02),f_high+f_high/2))
                ax[x].imshow(im, cmap='gray')
                s_round = s_file.split('_')[0].split('-')[1]
                ax[x].set_title(s_round,{'fontsize':12})
            s_slide = ls_dapi[0].split('_')[2]
            fig.tight_layout()
            fig.savefig(f'{s_path}/Check_Registration/{s_slide}_{s_find}.png')
            #figures are only needed on disk; release them so long runs do not pile up memory
            plt.close(fig)
            d_result.update({f'{s_slide}_{s_find}':len(ls_dapi)})
            ls_error = ls_error + [len(ls_dapi)]
    if(len(set(ls_error))==1):
        print("All checked scenes/channels have the same number of images")
    else:
        print("Warning: different number of images in some scenes/channels")
        for s_key, i_item in d_result.items():
            print(f'{s_key} has {i_item} images')
    return(d_result)
+
+
def check_names_deprecated(s_find='-Scene-001_c',b_print=False):
    """
    Based on filenames in the current working directory,
    checks marker names against standard list of biomarkers
    returns a dataframe with Rounds Cycles Info, and sets of wrong and correct names
    File names are split on '_': field 0 is 'Registered-<round>', field 1 is
    the '.'-separated marker list and field 3 is the channel (c1 ... c5).
    Input: s_find = string that will be unique to one scene to check in the folder
           b_print = print every file name that is parsed
    Output: (df_dapi, es_wrong, es_right)
    """
    ls_col = ['rounds','colors','minimum','maximum','exposure','refexp','location']
    d_position = {'c2':0,'c3':1,'c4':2,'c5':3}
    ls_marker = []
    ls_record = []
    for s_name in sorted(os.listdir()):
        if s_find not in s_name:
            continue
        ls_part = s_name.split('_')
        s_color = ls_part[3]
        if s_color == 'c1':
            continue
        if b_print:
            print(s_name)
        if s_color in d_position:
            s_marker = ls_part[1].split('.')[d_position[s_color]]
        else:
            print('Error: unrecognized channel name')
            s_marker = 'error'
        ls_marker.append(s_marker)
        ls_record.append({'rounds':ls_part[0].split('Registered-')[1],
                          'colors':s_color,
                          'minimum':1003,
                          'maximum':65535,
                          'exposure':100,
                          'refexp':100,
                          'location':'All'})
    #build the table once (DataFrame.append no longer exists in pandas 2);
    #object dtype matches what the row-by-row construction used to produce
    if len(ls_record) > 0:
        df_dapi = pd.DataFrame(ls_record, index=ls_marker, columns=ls_col, dtype=object)
    else:
        df_dapi = pd.DataFrame()
    es_names = set(df_dapi.index)
    es_standard = {'PDL1','pERK','CK19','pHH3','CK14','Ki67','Ecad','PCNA','HER2','ER','CD44',
                   'aSMA','AR','pAKT','LamAC','CK5','EGFR','pRB','FoxP3','CK7','PDPN','CD4','PgR','Vim',
                   'CD8','CD31','CD45','panCK','CD68','PD1','CD20','CK8','cPARP','ColIV','ColI','CK17',
                   'H3K4','gH2AX','CD3','H3K27','53BP1','BCL2','GRNZB','LamB1','pS6RP','BAX','RAD51',
                   'R0c2','R0c3','R0c4','R0c5','R5Qc2','R5Qc3','R5Qc4','R5Qc5','R11Qc2','R11Qc3','R11Qc4','R11Qc5',
                   'R7Qc2','R7Qc3','R7Qc4','R7Qc5','PDL1ab','PDL1d','R14Qc2','R14Qc3','R14Qc4','R14Qc5',
                   'R8Qc2','R8Qc3','R8Qc4','R8Qc5','R12Qc2','R12Qc3','R12Qc4','R12Qc5','PgRc4',
                   'Glut1','CoxIV','LamB2','S100','BMP4','BMP2','BMP6','pS62MYC', 'CGA', 'p63', 'SYP','PDGFRa', 'HIF1a'}#,'PDGFRB'CD66b (Neutrophils) HLA class II or CD21(Dendritic cells)
    #BMP4 Fibronectin, CD11b (dendritic, macrophage/monocyte/granulocyte) CD163 (macrophages)
    #CD83 (dendritic cells) FAP Muc1
    es_wrong = es_names - es_standard
    es_right = es_standard.intersection(es_names)
    print(f'Wrong names {es_wrong}')
    print(f' Right names {es_right}')
    return(df_dapi, es_wrong, es_right)
+
def file_sort(s_sample, s_path, i_scenes=14,i_rounds=12,i_digits=3,ls_quench=['R5Q','R11Q'],s_find='_ORG.tif',b_scene=False):
    '''
    count rounds and channels of images (koeis naming convention, not registered yet)
    Changes the working directory to s_path and prints the number of files
    per round, per quenched round, per scene (optional) and per color.
    Input: s_sample = sample name, only printed
           s_path = folder with the images
           i_scenes = number of scenes to count (used when b_scene is True)
           i_rounds = rounds R0 ... R(i_rounds-1) are counted
           i_digits = zero padded width of the scene number in the file names
           ls_quench = patterns identifying quenched rounds (read only)
           s_find = substring that selects the image files
           b_scene = also print the number of files per scene
    Output: dictionary {'Round <n>': set of scene IDs found in that round}
    '''
    os.chdir(s_path)
    ls_file = sorted(s_file for s_file in os.listdir() if s_find in s_file)

    print(s_sample)
    print(f'Total _ORG.tif: {len(ls_file)}')

    #count files in each round, and collect the scenes present in each round
    d_result = {}
    i_grand_tot = 0
    for x in range(i_rounds):
        ls_round = [s_file for s_file in ls_file if f'R{x}_' in s_file]
        s_round = 'Round ' + str(x)
        print(f'{s_round}: {len(ls_round)}')
        i_grand_tot = i_grand_tot + len(ls_round)
        d_result[s_round] = set(s_file.split('-Scene-')[1].split('_')[0] for s_file in ls_round if '-Scene-' in s_file)

    #quenched round special loop (patterns are regular expressions, as in pandas str.contains)
    se_file = pd.Series(ls_file, dtype=object)
    for s_quench in ls_quench:
        i_tot = int(se_file.str.contains(s_quench).sum())
        print(f'{s_quench}: {i_tot}')
        i_grand_tot = i_grand_tot + i_tot
    print(f'Total files containing Rxx_: {i_grand_tot}')

    if b_scene:
        #print number of files in each scene
        for x in range(1,i_scenes+1):
            s_scene = str(x).zfill(i_digits)
            i_tot = sum(f'Scene-{s_scene}_' in s_file for s_file in ls_file)
            print(f'Scene {x}: {i_tot}')

    #print number of files in each color
    for x in range(1,6):
        i_tot = sum(f'_c{x}_ORG' in s_file for s_file in ls_file)
        print(f'color {x}: {i_tot}')

    print('\n')
    return(d_result)
+
+
def change_fname(s_old='_oldstring_',s_new='_newstring_',b_test=True):
    """
    replace s_old with s_new in the names of all files in the working
    directory whose name contains s_old
    b_test = True only prints the planned changes, False really renames
    """
    i_changed = 0
    for s_file in os.listdir():
        if s_old not in s_file:
            continue
        i_changed += 1
        s_file_new = s_file.replace(s_old,s_new)
        print(f'changed file {s_file}\tto {s_file_new}')
        if not b_test:
            #really rename
            os.rename(s_file, s_file_new)
    print(f'total number of files changed is {i_changed}')
+
def check_reg_slides(i_rows=2, t_figsize=(20,10), b_mkdir=True):
    """
    Make overviews of all the dapi (c1_ORG) images of registered images in the
    working directory (slides prepared for segmentation for example).
    One png per scene is saved in ./Check_Registration.
    Input: i_rows = number of rows in figure
           t_figsize = (x, y) in inches size of figure
           b_mkdir = boolean whether to make the Check_Registration folder
                 (an existing folder is reused)
    """
    #find all dapi slides
    ls_dapis = [s_file for s_file in os.listdir() if 'c1_ORG' in s_file]

    #find all scenes (text between 'Scene' and the next '_')
    ls_scene = sorted(set(s_dapi.split('Scene')[1].split('_')[0] for s_dapi in ls_dapis))
    if b_mkdir:
        os.makedirs('./Check_Registration', exist_ok=True)
    for s_scene in ls_scene:
        print(f'Processing {s_scene}')
        ls_dapi = sorted(s_file for s_file in ls_dapis if f'Scene{s_scene}' in s_file)
        #squeeze=False keeps ax an array even for a single panel
        fig,ax = plt.subplots(i_rows,(len(ls_dapi)+(i_rows-1))//i_rows, figsize = t_figsize, squeeze=False)
        ax = ax.ravel()
        for x, s_file in enumerate(ls_dapi):
            im_low = skimage.io.imread(s_file)
            #display range: 2nd percentile up to 1.5 x the 98th percentile
            f_high = np.quantile(im_low,0.98)
            im = skimage.exposure.rescale_intensity(im_low,in_range=(np.quantile(im_low,0.02),f_high+f_high/2))
            ax[x].imshow(im, cmap='gray')
            s_round = s_file.split('_')[0].split('-')[1]
            ax[x].set_title(s_round,{'fontsize':12})
        s_slide = ls_dapi[0].split('_')[2]
        fig.tight_layout()
        fig.savefig(f'Check_Registration/{s_slide}.png')
        #figures are only needed on disk; release them so long runs do not pile up memory
        plt.close(fig)
+
def check_reg_dirs(s_dir='SlideName',s_subdir='Registered-SlideName', i_rows=2, t_figsize=(20,10), b_mkdir=True):
    """
    this checks registration when files are in subdirectories (such as with large tissues, i.e. NP005)
    Layout walked: <working dir>/<folder containing s_dir>/<folder containing s_subdir>/*_c1_ORG.tif
    One png per registered-image folder is saved in <working dir>/Check_Registration.
    The working directory is not changed.
    Input: s_dir = substring identifying the slide folders
           s_subdir = substring identifying the registered image folders
           i_rows = number of rows in figure
           t_figsize = (x, y) in inches size of figure
           b_mkdir = boolean whether to make the Check_Registration folder
                 (an existing folder is reused)
    """
    rootdir = os.getcwd()
    if b_mkdir:
        os.makedirs(f'{rootdir}/Check_Registration', exist_ok=True)
    #locate subdirectories
    #(separate loop names: reusing s_dir would shadow the parameter and match everything)
    for s_slide_dir in sorted(os.listdir(rootdir)):
        s_slide_path = os.path.join(rootdir, s_slide_dir)
        if s_dir not in s_slide_dir or not os.path.isdir(s_slide_path):
            continue
        #locate registered image folders
        for s_reg_dir in sorted(os.listdir(s_slide_path)):
            s_reg_path = os.path.join(s_slide_path, s_reg_dir)
            if s_subdir not in s_reg_dir or not os.path.isdir(s_reg_path):
                continue
            print(f'Processing {s_reg_dir}')
            ls_dapi = sorted(s_file for s_file in os.listdir(s_reg_path) if '_c1_ORG.tif' in s_file)
            if len(ls_dapi) == 0:
                #nothing to plot; plt.subplots raises on a grid with zero columns
                continue
            fig,ax = plt.subplots(i_rows,(len(ls_dapi)+(i_rows-1))//i_rows, figsize = t_figsize, squeeze=False)
            ax = ax.ravel()
            for x, s_file in enumerate(ls_dapi):
                im_low = skimage.io.imread(os.path.join(s_reg_path, s_file))
                #display range: 2nd percentile up to 1.5 x the 98th percentile
                f_high = np.quantile(im_low,0.98)
                im = skimage.exposure.rescale_intensity(im_low,in_range=(np.quantile(im_low,0.02),f_high+f_high/2))
                ax[x].imshow(im, cmap='gray')
                s_round = s_file.split('_')[0].split('-')[1]
                s_scene = s_file.split('-Scene')[1].split('_')[0]
                ax[x].set_title(f'{s_round} Scene{s_scene}',{'fontsize':12})
            fig.tight_layout()

            #save figure in the rootdir/Check_Registration folder
            fig.savefig(f'{rootdir}/Check_Registration/{s_reg_dir}.png')
            plt.close(fig)
+
def test(name="this_is_you_name"):
    '''
    Print a greeting for name and return True (import smoke test).
    '''
    s_greeting = f'hello {name}'
    print(s_greeting)
    return True
diff --git a/mplex_image/process.py b/mplex_image/process.py
new file mode 100755
index 0000000..9057580
--- /dev/null
+++ b/mplex_image/process.py
@@ -0,0 +1,1208 @@
+####
+# title: process.py
+#
+# language: Python3.6
+# date: 2019-05-00
+# license: GPL>=v3
+# author: Jenny
+#
+# description:
+# python3 library to process cyclic data and images after segmentation
+####
+
+#libraries
+import pandas as pd
+import matplotlib as mpl
+mpl.use('agg')
+import matplotlib.pyplot as plt
+import os
+import numpy as np
+import skimage
+import copy
+import re
+import seaborn as sns
+from PIL import Image
+Image.MAX_IMAGE_PIXELS = 1000000000
+
+#function cellpose
def load_cellpose_df(ls_sample, segdir):
    '''
    load all full feature dataframes in sample list
    Reads {segdir}/features_{sample}_MeanIntensity_Centroid_Shape.csv for every
    sample and stacks them.
    Added columns: slide = sample name up to '-Scene',
                   scene = second '_' separated field of the row index,
                   slide_scene = slide + '_' + scene
    Output: combined dataframe with alphabetically sorted feature columns and
            slide_scene as last column; empty dataframe if ls_sample is empty
    '''
    ls_df = []
    for s_sample in ls_sample:
        print(f'Loading features_{s_sample}_MeanIntensity_Centroid_Shape.csv')
        df_tt = pd.read_csv(f'{segdir}/features_{s_sample}_MeanIntensity_Centroid_Shape.csv',index_col=0)
        df_tt['slide'] = s_sample.split('-Scene')[0]
        df_tt['scene'] = [item.split('_')[1] for item in df_tt.index]
        ls_df.append(df_tt)
    if len(ls_df) == 0:
        print('')
        return(pd.DataFrame(columns=['scene','slide','slide_scene']))
    #single concat instead of DataFrame.append (removed in pandas 2)
    df_mi_full = pd.concat(ls_df, sort=True)
    #append(sort=True) always returned sorted columns; concat skips the sort
    #when all samples share the same columns, so enforce it
    df_mi_full = df_mi_full[sorted(df_mi_full.columns)]
    #add scene
    df_mi_full['slide_scene'] = df_mi_full.slide + '_' + df_mi_full.scene
    print('')
    return(df_mi_full)
+
+# load li thresholds
def load_li(ls_sample, s_thresh, man_thresh):
    '''
    load threshold on the segmentation marker images acquired during feature extraction
    Reads thresh_{sample}_ThresholdLi.csv from the working directory for every
    sample. The row index holds image file names that are split on '_':
    field 0 is 'Registered-<round>', field 1 the '.'-separated marker list,
    field 2 '<slide>-Scene-<scene>' and the second to last field the channel.
    Added columns: rounds, color, slide, scene, slide_scene, marker
    (c1 -> DAPI<round number>, c2-c5 -> marker 1-4, anything else -> 'error')
    Input: ls_sample = list of sample names
           s_thresh = marker whose threshold_li is raised to at least man_thresh
                 ('' = no override)
           man_thresh = minimum threshold for s_thresh
    Output: combined dataframe of all samples
    '''
    d_position = {'c2':0,'c3':1,'c4':2,'c5':3}
    ls_df = []
    for s_sample in ls_sample:
        print(f'Loading thresh_{s_sample}_ThresholdLi.csv')
        df_img = pd.read_csv(f'thresh_{s_sample}_ThresholdLi.csv', index_col=0)
        df_img['rounds'] = [item.split('_')[0].split('Registered-')[1] for item in df_img.index]
        df_img['color'] = [item.split('_')[-2] for item in df_img.index]
        df_img['slide'] = [item.split('_')[2].split('-Scene-')[0] for item in df_img.index]
        df_img['scene'] = [item.split('_')[2].split('-Scene-')[1] for item in df_img.index]
        df_img['slide_scene'] = df_img.slide + '_scene' + df_img.scene
        #parse file name for biomarker
        ls_marker = []
        for s_index, s_round, s_color in zip(df_img.index, df_img.rounds, df_img.color):
            if s_color == 'c1':
                s_marker = f"DAPI{s_round.split('R')[1]}"
            elif s_color in d_position:
                s_marker = s_index.split('_')[1].split('.')[d_position[s_color]]
            else:
                #explicit placeholder instead of silently reusing the previous row's marker
                print('Error')
                s_marker = 'error'
            ls_marker.append(s_marker)
        df_img['marker'] = ls_marker
        ls_df.append(df_img)
    #single concat instead of DataFrame.append (removed in pandas 2)
    df_img_all = pd.concat(ls_df) if len(ls_df) > 0 else pd.DataFrame()
    print('')
    #manually override too low threshold of the s_thresh marker
    if s_thresh !='' and len(df_img_all) > 0:
        b_low = (df_img_all.marker==s_thresh) & (df_img_all.threshold_li < man_thresh)
        df_img_all.loc[b_low, 'threshold_li'] = man_thresh
    return(df_img_all)
+
def filter_cellpose_xy(df_mi_full,ls_centroid = ['DAPI2_nuclei_area', 'DAPI2_nuclei_centroid-0', 'DAPI2_nuclei_centroid-1','DAPI2_nuclei_eccentricity']):
    '''
    pick nuclei area, centroid (row, column) and eccentricity columns of one
    marker (default: DAPI2), drop cells with any missing value and rename the
    columns to nuclei_area, DAPI_Y, DAPI_X, nuclei_eccentricity
    slide_scene is the part of the cell index before '_cell'
    '''
    ls_rename = ['nuclei_area','DAPI_Y','DAPI_X','nuclei_eccentricity']
    df_select = df_mi_full.loc[:,ls_centroid]
    se_missing = df_select.isna().sum()
    print('QC: make sure centroids dont have too many NAs')
    print(se_missing)
    print('')
    df_xy = df_select.dropna(axis=0,how='any').set_axis(ls_rename, axis=1)
    ls_slide_scene = [s_cell.split('_cell')[0] for s_cell in df_xy.index]
    return(df_xy.assign(slide_scene=ls_slide_scene))
+
def drop_last_rounds(df_img_all,ls_filter,df_mi_full):
    '''
    drop any rounds after the last round DAPI filter
    Input: df_img_all = image table with 'rounds' and 'marker' columns; a numeric
               'round_ord' column is added in place (quenched rounds 'Q' count as +0.5)
           ls_filter = feature names (marker is the text before the first '_')
               whose latest round is the last round to keep
           df_mi_full = feature table; columns of markers imaged in later rounds
               are dropped in place
    Output: (df_mi_full, i_max) = the filtered table and the last round kept
    Raises: IndexError if no marker of ls_filter is found in df_img_all
    '''
    df_img_all['round_ord'] = [float(re.sub('[^0-9.]','', re.sub('Q','.5', item))) for item in df_img_all.rounds]
    es_filter = set(item.split('_')[0] for item in ls_filter)
    se_filter_ord = df_img_all.loc[df_img_all.marker.isin(es_filter),'round_ord']
    if len(se_filter_ord) == 0:
        raise IndexError(f'none of the filter markers {sorted(es_filter)} found in df_img_all.marker')
    i_max = se_filter_ord.max()
    print(f'Dropping markers after round {i_max}')
    ls_drop_marker = [item + '_' for item in sorted(set(df_img_all[(df_img_all.round_ord>i_max)].marker))]
    for item in ls_drop_marker:
        print(item)
    print('')
    #match the marker as a column name prefix: a substring/regex match would also
    #drop e.g. 'pAKT_cytoplasm' when only 'AKT_' belongs to a late round
    t_drop = tuple(ls_drop_marker)
    ls_drop_col = [s_col for s_col in df_mi_full.columns if str(s_col).startswith(t_drop)]
    df_mi_full.drop(ls_drop_col,axis=1,inplace=True)
    return(df_mi_full,i_max)
+
def plot_thresh(df_img_all,s_thresh,df_mi_full=None,qcdir='.'):
    '''
    tissues: plot threshold across all tissues
    (negative scenes will drive down the mean)
    Input: df_img_all = image table with 'marker', 'threshold_li' and 'slide' columns
           s_thresh = marker whose Li thresholds are plotted
           df_mi_full = optional feature table; if given, its 'slide' column
               names the output file, otherwise df_img_all.slide is used
           qcdir = folder where QC_EcadThresh_<slides>.png is saved
    '''
    df_slide = df_img_all if df_mi_full is None else df_mi_full
    ls_slides = sorted(set(df_slide.slide))
    df_plot = df_img_all[(df_img_all.marker==s_thresh)].loc[:,['threshold_li']]
    fig,ax=plt.subplots(figsize=(4,3.5))
    sns.stripplot(data=df_plot, ax=ax)
    sns.barplot(data=df_plot, alpha=0.5, ax=ax)
    fig.tight_layout()
    fig.savefig(f'{qcdir}/QC_EcadThresh_{".".join(ls_slides)}.png')
    #the figure is only needed on disk
    plt.close(fig)
+
def fill_cellpose_nas(df_mi_full,ls_marker_cyto,s_thresh='Ecad',man_thresh=1000):
    '''
    some nuclei don't have a cytoplasm: for cells whose {s_thresh}_cytoplasm is
    not above man_thresh (missing values included), copy {marker}_perinuc5 into
    {marker}_cytoplasm for every marker in ls_marker_cyto
    returns a copy with an added boolean column {s_thresh}_negative
    '''
    s_seg = f'{s_thresh}_cytoplasm'
    df_filled = df_mi_full.copy(deep=True)
    # segmentation was run on the threshold marker, so its threshold defines the cells with cytoplasm
    print(f'Finding {s_thresh} positive cells')
    b_positive = df_mi_full.loc[:,s_seg] > man_thresh
    ls_negative = df_mi_full.index[~b_positive].tolist()
    print('')
    # cells without cytoplasm get the perinuc5 value
    print(f'For cells that are {s_thresh} negative:')
    for s_marker in ls_marker_cyto:
        s_cyto = f'{s_marker}_cytoplasm'
        s_ring = f'{s_marker}_perinuc5'
        print(f'Replace {s_marker}_cytoplasm nas')
        df_filled.loc[ls_negative,s_cyto] = df_filled.loc[ls_negative,s_ring]
        print(f'with {s_marker}_perinuc5')
    df_filled[f'{s_thresh}_negative'] = df_filled.index.isin(ls_negative)
    return(df_filled)
+
def shrink_seg_regions(df_mi_full,s_thresh,ls_celline=[],ls_shrunk=[]):
    '''
    For markers with stromal to tumor bleedthrough, use shrunken segmentation region:
    in tumor cells, {marker}_perinuc5 is overwritten with the column named in ls_shrunk
    tumor cells = cells that are not {s_thresh}_negative, plus every cell whose
    slide_scene is listed in ls_celline
    df_mi_full is modified in place and returned
    '''
    print('')
    #enforce cell lines as tumor
    if len(ls_celline) > 0:
        print([f'Enforce {item} as tumor' for item in ls_celline])
    b_thresh_pos = ~df_mi_full.loc[:,f'{s_thresh}_negative']
    idx_thresh_pos = df_mi_full[b_thresh_pos].index
    b_tumor = (df_mi_full.index.isin(idx_thresh_pos)) | (df_mi_full.slide_scene.isin(ls_celline))
    idx_tumor = df_mi_full[b_tumor].index
    #replacing tumor cell values with the shrunken area only helps bleed through a little
    print('For markers with stromal to tumor bleedthrough, use shrunken segmentation region:')
    for s_marker in ls_shrunk:
        s_target = f'{s_marker.split("_")[0]}_perinuc5'
        print(f'Replace {s_marker.split("_")[0]}_perinuc5 in tumor cells with')
        df_mi_full.loc[idx_tumor,s_target] = df_mi_full.loc[idx_tumor,f'{s_marker}']
        print(f'with {s_marker}')
    print('')
    return(df_mi_full)
+
def fill_membrane_nas(df_mi_full, df_mi_mem,s_thresh='Ecad',ls_membrane=['HER2']):
    '''
    fill cell membrane values with the expanded nuclei membrane values:
    for {s_thresh}_negative cells, and for any cell whose {marker}_cellmem25 is
    missing, {marker}_cellmem25 is replaced with {marker}_exp5nucmembrane25
    df_mi_mem is modified in place; returns df_mi_full merged with df_mi_mem on the index
    '''
    b_shared = df_mi_full.index.isin(df_mi_mem.index)
    idx_negative = df_mi_full[(df_mi_full.loc[:,f'{s_thresh}_negative']) & (b_shared)].index
    for s_membrane in ls_membrane:
        s_cell = f'{s_membrane}_cellmem25'
        s_nuclei = f'{s_membrane}_exp5nucmembrane25'
        print(f'Replace {s_membrane}_cellmem25 nas \n with {s_membrane}_exp5nucmembrane25')
        df_mi_mem.loc[idx_negative,s_cell] = df_mi_mem.loc[idx_negative,s_nuclei]
        #whatever is still missing after the negative cells were filled
        idx_missing = df_mi_mem.loc[df_mi_mem.loc[:,s_cell].isna(),:].index
        df_mi_mem.loc[idx_missing,s_cell] = df_mi_mem.loc[idx_missing,s_nuclei]
    df_merge = df_mi_full.merge(df_mi_mem, left_index=True, right_index=True)
    print('')
    return(df_merge)
+
def fill_bright_nas(ls_membrane,s_sample,s_thresh,df_mi_filled,segdir):
    '''
    if membrane markers are given, load {segdir}/features_{s_sample}_BrightMeanIntensity.csv
    and merge it into df_mi_filled through fill_membrane_nas;
    with an empty ls_membrane, df_mi_filled is returned unchanged
    '''
    if len(ls_membrane) == 0:
        return(df_mi_filled)
    print(f'Loading features_{s_sample}_BrightMeanIntensity.csv')
    df_bright = pd.read_csv(f'{segdir}/features_{s_sample}_BrightMeanIntensity.csv',index_col=0)
    return(fill_membrane_nas(df_mi_filled, df_bright,s_thresh=s_thresh,ls_membrane=ls_membrane))
+
def auto_threshold(df_mi,df_img_all):
    '''
    Auto threshold: make positive dataframe to check thresholds.
    For every slide_scene of df_mi and every image of that scene in df_img_all,
    cells are called positive when their value is >= the image's threshold_li.
    A marker is only thresholded if exactly one df_mi column name contains '<marker>_'.
    Input: df_mi = feature table with a 'slide_scene' column
           df_img_all = image table with slide, scene, slide_scene, marker and threshold_li
    Output: (df_pos_auto, d_thresh_record) = positivity table indexed like df_mi
            (scenes stacked in sorted order), and {'<slide>_scene<scene>_<marker>': threshold}
    '''
    ls_scene = sorted(set(df_mi.slide_scene))

    ls_df_scene = []
    d_thresh_record= {}

    for s_slide_scene in ls_scene:
        print(f'Thresholding {s_slide_scene}')
        ls_index = df_mi[df_mi.slide_scene==s_slide_scene].index
        df_scene = pd.DataFrame(index=ls_index)
        df_img_scene = df_img_all[df_img_all.slide_scene==s_slide_scene]

        for s_index in df_img_scene.index:
            s_scene =f"{df_img_all.loc[s_index,'slide']}_scene{df_img_all.loc[s_index,'scene']}"
            s_marker = df_img_all.loc[s_index,'marker']
            s_columns = df_mi.columns[df_mi.columns.str.contains(f"{s_marker}_")]
            if len(s_columns) != 1:
                #no feature, or an ambiguous match: skip this image
                continue
            s_marker_loc = s_columns[0]
            i_thresh = df_img_all.loc[s_index,'threshold_li']
            d_thresh_record.update({f'{s_scene}_{s_marker}':i_thresh})
            df_scene.loc[ls_index,s_marker_loc] = df_mi.loc[ls_index,s_marker_loc] >= i_thresh
        ls_df_scene.append(df_scene)
    #single concat instead of DataFrame.append (removed in pandas 2)
    df_pos_auto = pd.concat(ls_df_scene) if len(ls_df_scene) > 0 else pd.DataFrame()
    return(df_pos_auto,d_thresh_record)
+
def positive_scatterplots(df_pos_auto,d_thresh_record,df_xy,ls_color,qcdir='.'):
    '''
    for marker in ls_color, plot positive cells location in tissue
    One figure per slide_scene of df_xy is saved as
    {qcdir}/QC_<markers>_<scene>_auto.png
    Input: df_pos_auto = positivity table (cells x features); missing values count as negative
           d_thresh_record = {'<scene>_<marker>': threshold}, shown in the panel titles
           df_xy = cell coordinates with DAPI_X, DAPI_Y and slide_scene columns
           ls_color = feature columns of df_pos_auto to plot (marker = text before first '_')
           qcdir = output folder
    '''
    ls_scene = sorted(set(df_xy.slide_scene))
    ls_save = [item.split('_')[0] for item in ls_color]

    for s_scene in ls_scene:
        print(f'Plotting {s_scene}')
        #negative cells = all cells even before dapi filtering
        df_neg = df_xy[(df_xy.slide_scene==s_scene)]
        #plot
        fig, ax = plt.subplots(2, ((len(ls_color))+1)//2, figsize=(18,12), squeeze=False)
        ax = ax.ravel()
        for ax_num, s_color in enumerate(ls_color):
            s_marker = s_color.split('_')[0]
            s_min = d_thresh_record[f"{s_scene}_{s_marker}"]
            #positive cells = positive cells based on threshold
            #'== True' on purpose: scenes lacking this marker hold NaN, which a plain
            #boolean mask rejects; the comparison turns NaN into False
            b_pos = (df_pos_auto.loc[:,s_color] == True).values
            ls_pos_index = df_pos_auto.index[b_pos]
            df_color_pos = df_neg[df_neg.index.isin(ls_pos_index)]
            if len(df_color_pos)>=1:
                #plot negative cells
                ax[ax_num].scatter(data=df_neg,x='DAPI_X',y='DAPI_Y',color='silver',s=1)
                #plot positive cells
                ax[ax_num].scatter(data=df_color_pos, x='DAPI_X',y='DAPI_Y',color='DarkBlue',s=.5)

                ax[ax_num].axis('equal')
                #image convention: y axis pointing down
                ax[ax_num].set_ylim(ax[ax_num].get_ylim()[::-1])
            ax[ax_num].set_title(f'{s_marker} min={int(s_min)}')
        fig.suptitle(s_scene)
        fig.savefig(f'{qcdir}/QC_{".".join(ls_save)}_{s_scene}_auto.png')
        #one figure per scene: release it so many scenes do not pile up memory
        plt.close(fig)
+
def plot_thresh_results(df_img_all,df_pos_auto,d_thresh_record,df_xy,i_max,s_thresh,qcdir):
    '''
    plot positive cells for the DAPI of every round up to i_max (taken from the
    first slide_scene in df_img_all) plus {s_thresh}_cytoplasm
    returns the list of DAPI nuclei feature names
    '''
    s_first_scene = df_img_all.slide_scene.unique()[0]
    b_dapi = (df_img_all.round_ord<=i_max) & (df_img_all.slide_scene==s_first_scene) & (df_img_all.marker.str.contains('DAPI'))
    ls_color = [s_marker + '_nuclei' for s_marker in df_img_all[b_dapi].marker.tolist()]
    ls_plot = ls_color + [f'{s_thresh}_cytoplasm']
    positive_scatterplots(df_pos_auto,d_thresh_record,df_xy,ls_plot,qcdir)
    return(ls_color)
+
def filter_dapi_cellpose(df_pos_auto,ls_color,df_mi,ls_filter,qcdir='.'):
    '''
    filter by cell positive for DAPI autothresholding, in rounds specified in ls_filter
    Also saves a line plot of the DAPI positive cell count per round, for each
    scene scaled to the scene's maximum, as {qcdir}/QC_DAPIRounds_lineplot_<slides>.png
    Input: df_pos_auto = positivity table; 'slide_scene' and 'slide' columns are
               added in place (parsed from the cell index)
           ls_color = DAPI feature columns to plot and to check for missing signal
           df_mi = feature table to filter
           ls_filter = DAPI feature columns a cell must be positive for
           qcdir = output folder for the plot
    Output: df_mi restricted to cells positive in all ls_filter columns and with
            no ls_color value below 1
    '''
    df_pos_auto['slide_scene'] = [item.split('_cell')[0] for item in df_pos_auto.index]
    df_pos_auto['slide'] = [item.split('_')[0] for item in df_pos_auto.index]
    #plot dapi thresholds
    df_count = df_pos_auto.loc[:,ls_color+['slide_scene']].groupby('slide_scene').sum()
    df_scenes = df_count.T/df_count.max(axis=1)
    fig,ax=plt.subplots(figsize=(10,5))
    df_scenes.plot(ax=ax,colormap='tab20')
    ax.set_xticks(np.arange(0,(len(df_scenes.index)),1))
    ax.set_xticklabels([item.split('_')[0] for item in df_scenes.index])
    ax.set_ylim(0.5,1.1)
    ax.legend(loc=3)
    fig.tight_layout()
    ls_slides = sorted(set(df_pos_auto.slide))
    fig.savefig(f'{qcdir}/QC_DAPIRounds_lineplot_{".".join(ls_slides)}.png')
    #the figure is only needed on disk
    plt.close(fig)
    #filter by the dapi rounds in ls_filter
    ls_dapi_index = df_pos_auto[df_pos_auto.loc[:,ls_filter].all(axis=1)].index
    #also filter by any dapi less than 1 in mean intensity
    ls_dapi_missing = df_mi[(df_mi.loc[:,ls_color] < 1).sum(axis=1) > 0].index.tolist()
    es_dapi_index = set(ls_dapi_index) - set(ls_dapi_missing)
    print(f'number of cells before DAPI filter = {len(df_mi)}')
    df_mi_filter = df_mi.loc[df_mi.index.isin(es_dapi_index),:]
    for item in ls_filter:
        print(f'filtering by {item}')
    print(f'number of cells after DAPI filter = {len(df_mi_filter)}')
    return(df_mi_filter)
+
def load_li_thresh(ls_sample, segdir):
    '''
    load li thresholds
    Changes the working directory to segdir and reads thresh_{sample}_ThresholdLi.csv
    for every sample. The row index holds image file names that are split on '_':
    field 0 is 'Registered-<round>', field 1 the '.'-separated marker list,
    field 2 '<slide>-Scene-<scene>' and the second to last field the channel.
    Added columns: rounds, color, slide, scene, slide_scene, marker
    (c1 -> DAPI<round number>, c2-c5 -> marker 1-4, anything else -> 'error')
    Output: combined dataframe of all samples
    '''
    d_position = {'c2':0,'c3':1,'c4':2,'c5':3}
    os.chdir(segdir)
    ls_df = []
    for s_sample in ls_sample:
        df_img = pd.read_csv(f'thresh_{s_sample}_ThresholdLi.csv', index_col=0)
        df_img['rounds'] = [item.split('_')[0].split('Registered-')[1] for item in df_img.index]
        df_img['color'] = [item.split('_')[-2] for item in df_img.index]
        df_img['slide'] = [item.split('_')[2].split('-Scene-')[0] for item in df_img.index]
        df_img['scene'] = [item.split('_')[2].split('-Scene-')[1] for item in df_img.index]
        df_img['slide_scene'] = df_img.slide + '_scene' + df_img.scene
        #parse file name for biomarker
        ls_marker = []
        for s_index, s_round, s_color in zip(df_img.index, df_img.rounds, df_img.color):
            if s_color == 'c1':
                s_marker = f"DAPI{s_round.split('R')[1]}"
            elif s_color in d_position:
                s_marker = s_index.split('_')[1].split('.')[d_position[s_color]]
            else:
                #explicit placeholder instead of silently reusing the previous row's marker
                print('Error')
                s_marker = 'error'
            ls_marker.append(s_marker)
        df_img['marker'] = ls_marker
        ls_df.append(df_img)
    #single concat instead of DataFrame.append (removed in pandas 2)
    df_img_all = pd.concat(ls_df) if len(ls_df) > 0 else pd.DataFrame()
    return(df_img_all)
+
def filter_standard(df_mi,d_channel,s_dapi):
    """
    keep the columns whose biomarker_location name is in the hard coded
    standard set, plus the background channels needed for af subtraction
    and the dapi column
    Input:
    df_mi = mean intensity dataframe
    d_channel = dictionary of channel: [ring background marker, nuclei background marker]
    s_dapi = name of the dapi column, used as given
    Output:
    df_filtered_mi = df_mi restricted to the selected columns, sorted by name
    es_standard = the hard coded set of standard biomarker_location names
    """
    es_standard = {'PDL1_Ring','pERK_Nuclei','CK19_Ring','pHH3_Nuclei','CK14_Ring','Ki67_Nuclei','Ki67r_Nuclei','Ecad_Ring','PCNA_Nuclei','HER2_Ring','ER_Nuclei','CD44_Ring',
        'aSMA_Ring','AR_Nuclei','pAKT_Ring','LamAC_Nuclei','CK5_Ring','EGFR_Ring','pRb_Nuclei','FoxP3_Nuclei','CK7_Ring','PDPN_Ring','CD4_Ring','PgR_Nuclei','Vim_Ring',
        'CD8_Ring','CD31_Ring','CD45_Ring','panCK_Ring','CD68_Ring','PD1_Ring','CD20_Ring','CK8_Ring','cPARP_Nuclei','ColIV_Ring','ColI_Ring','CK17_Ring',
        'H3K4_Nuclei','gH2AX_Nuclei','CD3_Ring','H3K27_Nuclei','53BP1_Nuclei','BCL2_Ring','GRNZB_Nuclei','LamB1_Nuclei','pS6RP_Ring','BAX_Nuclei','RAD51_Nuclei',
        'Glut1_Ring','CoxIV_Ring','LamB2_Nuclei','S100_Ring','BMP4_Ring','PgRc4_Nuclei','pRB_Nuclei','p63_Nuclei','p63_Ring','CGA_Ring','SYP_Ring','pS62MYC_Nuclei', 'HIF1a_Nuclei',
        'PDGFRa_Ring', 'BMP2_Ring','PgRb_Nuclei','MUC1_Ring','CSF1R_Ring','CAV1_Ring','CCND1_Nuclei','CC3_Nuclei' } #PgRb is second PgR in dataset
    # columns kept regardless of the standard set: background channels and dapi
    es_keep = set()
    for ls_channel in d_channel.values():
        es_keep.add(f'{ls_channel[0]}_Ring')
        es_keep.add(f'{ls_channel[1]}_Nuclei')
    es_keep.add(f'{s_dapi}')
    # add the standard markers that are actually present in the data
    es_keep.update(set(df_mi.columns.tolist()).intersection(es_standard))
    df_filtered_mi = df_mi.loc[:,sorted(es_keep)]
    return(df_filtered_mi, es_standard)
+
def filter_loc_cellpose(df_mi_filled, ls_marker_cyto, ls_custom,filter_na=True):
    '''
    select one subcellular location per marker (nuclei, perinuc5 or cytoplasm),
    using the standard locations returned by filter_standard
    Input:
    df_mi_filled = mean intensity dataframe, columns named marker_location
    ls_marker_cyto = markers to take from the cytoplasm column
    ls_custom = list of full column names to keep as given
    filter_na = drop rows that contain NAs in the selected columns
    Output:
    df_filter = dataframe with the selected columns plus slide_scene
    es_standard = set of standard biomarker_location names
    '''
    # dummy call, only the set of standard locations is needed
    _, es_standard = filter_standard(pd.DataFrame(columns=['filter_standard']),{},'filter_standard')
    # markers present in the data: float columns whose name does not contain '25'
    b_float = df_mi_filled.dtypes=='float64'
    b_no25 = ~df_mi_filled.columns.str.contains('25')
    ls_marker = sorted(set([item.split('_')[0] for item in df_mi_filled.columns[b_float & b_no25]]))
    ls_marker = [item for item in ls_marker if item != 'mean']
    es_marker = set(ls_marker)
    es_dapi = set([item.split('_')[0] for item in df_mi_filled.columns[df_mi_filled.columns.str.contains('DAPI')]])
    # standard nuclear markers found in the data
    es_nuc = set([item.split('_')[0] for item in es_standard if '_Nuclei' in item])
    es_nuc_select = es_nuc.intersection(es_marker)
    print('Nuclear markers:')
    print(es_nuc_select)
    # standard ring markers found in the data, minus the cytoplasm ones
    es_ring = set([item.split('_')[0] for item in es_standard if '_Ring' in item])
    es_cyto = set(ls_marker_cyto) #set([item.split('_')[0] for item in ls_marker_cyto])
    es_ring_only = es_ring.intersection(es_marker) - es_cyto
    print('Ring markers:')
    print(es_ring_only)
    print('Cytoplasm markers:')
    print(es_cyto)
    es_cust = set([item.split('_')[0] for item in ls_custom])
    # whatever is left has no known location
    es_left = es_marker - es_ring_only - es_cyto - es_nuc_select - es_dapi - es_cust
    print('Custom markers:')
    print(es_cust)
    print('Markers with Nuclei or Cyto not specified: take both nuclei and ring')
    print(es_left)
    ls_all = (
        ls_custom
        + [item + '_perinuc5' for item in sorted(es_left | es_ring_only)]
        + [item + '_cytoplasm' for item in sorted(es_cyto)]
        + [item + '_nuclei' for item in sorted(es_left | es_nuc_select | es_dapi)]
        + ['slide_scene']
    )
    print(f'Missing {set(ls_all) - set(df_mi_filled.columns)}')
    df_filter = df_mi_filled.loc[:,ls_all]
    print('')
    if filter_na:
        print(f' NAs filtered: {len(df_filter) - len(df_filter.dropna())}')
        df_filter = df_filter.dropna()
        print('')
    return(df_filter,es_standard)
+
def marker_table(df_img_all,qcdir):
    '''
    make a nice rounds/channels/markers table and save it as
    {qcdir}/MarkerTable.csv (one row per round, one column per channel)
    Input:
    df_img_all = image dataframe with columns rounds, color, marker and
        slide_scene; the columns round_ord and round are added to it in place
    qcdir = output folder
    Only the rows of the first slide_scene are used for the table.
    '''
    # quenched rounds (Q) sort as .5 and repeat rounds (r) as .25 after their round
    ls_round_ord = [re.sub('r','.25', re.sub('Q','.5', item)) for item in df_img_all.rounds]
    df_img_all['round_ord'] = ls_round_ord
    df_img_all['round'] = [float(re.sub('[^0-9.]','', item)) for item in ls_round_ord]
    b_first_scene = df_img_all.slide_scene==df_img_all.slide_scene.unique()[0]
    # pivot takes keyword arguments only in current pandas
    df_marker = df_img_all.loc[b_first_scene,['marker','round','color']].pivot(index='round',columns='color')
    df_marker.index.name = None
    df_marker.to_csv(f'{qcdir}/MarkerTable.csv',header=False)
+
def filter_cellpose_df(s_sample,segdir,qcdir,s_thresh,ls_membrane,ls_marker_cyto,ls_custom,ls_filter,ls_shrunk,man_thresh = 900):
    '''
    run the cellpose feature filtering pipeline for one sample: load
    thresholds and features, fill and select locations, threshold, filter by
    dapi, and write the filtered mean intensity and centroid csv files to segdir
    example arguments:
    s_thresh='Ecad'
    ls_membrane = ['HER2']
    ls_marker_cyto = ['CK14','CK5','CK17','CK19','CK7','CK8','Ecad','HER2','EGFR']
    ls_custom = ['HER2_cellmem25']
    ls_filter = ['DAPI9_nuclei','DAPI2_nuclei']
    ls_shrunk = ['CD44_nucadj2','Vim_nucadj2']
    man_thresh = 900
    returns the membrane-filled (unfiltered) dataframe and the threshold dataframe
    '''
    os.chdir(segdir)
    # load thresholds, features and centroids
    df_thresh = load_li([s_sample],s_thresh, man_thresh)
    df_full = load_cellpose_df([s_sample], segdir)
    df_centroid = filter_cellpose_xy(df_full)
    df_full, i_max = drop_last_rounds(df_thresh,ls_filter,df_full)
    # fill missing values and adjust segmentation regions
    df_filled = fill_cellpose_nas(df_full,ls_marker_cyto,s_thresh=s_thresh,man_thresh=man_thresh)
    df_filled = shrink_seg_regions(df_filled,s_thresh,ls_celline=[],ls_shrunk=ls_shrunk)
    df_mem_fill = fill_bright_nas(ls_membrane,s_sample,s_thresh,df_filled,segdir)
    # one location per marker, then threshold and dapi filter
    df_selected,es_standard = filter_loc_cellpose(df_mem_fill, ls_marker_cyto, ls_custom)
    df_positive,d_record = auto_threshold(df_selected,df_thresh)
    ls_color = plot_thresh_results(df_thresh,df_positive,d_record,df_centroid,i_max,s_thresh,qcdir)
    df_kept = filter_dapi_cellpose(df_positive,ls_color,df_selected,ls_filter,qcdir)
    # save results
    df_kept.to_csv(f'{segdir}/features_{s_sample}_FilteredMeanIntensity_{"_".join([item.split("_")[0] for item in ls_filter])}.csv')
    df_centroid.to_csv(f'{segdir}/features_{s_sample}_CentroidXY.csv')
    return(df_mem_fill,df_thresh)
+
def filter_cellpose_background(df_mi_filled, es_standard):
    '''
    for each standard biomarker take the opposite subcellular location
    (cytoplasm for nuclear markers, nuclei for ring markers) and summarize
    its mean intensity per scene
    input: df_mi_filled = mean intensity dataframe with all biomarker locations
        and a slide_scene column
        es_standard = set of biomarker_Ring / biomarker_Nuclei names
    return: dataframe indexed by slide_scene with the quantiles 0, 0.1 ... 0.9
        of each selected column, named {column}_0 ... {column}_9
    '''
    ls_opposite = [
        item.replace('Nuclei','cytoplasm').replace('Ring','nuclei')
        for item in sorted(es_standard)
    ]
    es_wanted = set(ls_opposite + ['slide_scene'])
    ls_present = sorted(set(df_mi_filled.columns).intersection(es_wanted))
    #quantiles
    df_bg = df_mi_filled.loc[:,ls_present].groupby('slide_scene').quantile(0)
    df_bg.columns = [f'{item}' for item in df_bg.columns]
    for i_decile, q in enumerate(np.arange(0,1,.1)):
        df_quantile = df_mi_filled.loc[:,ls_present].groupby('slide_scene').quantile(q)
        # the un-suffixed columns stay on the left, each decile is appended on the right
        df_bg = df_bg.merge(df_quantile,left_index=True, right_index=True, suffixes=('',f'_{i_decile}'))
    #drop the un-suffixed seed columns
    ls_marker_only = [item for item in ls_present if item != 'slide_scene']
    df_bg = df_bg.loc[:,~df_bg.columns.isin(ls_marker_only)]
    return(df_bg)
+
def filter_cellpose_df_old(df_mi_full):
    '''
    old version: keep a hard coded list of marker_location columns plus every
    column containing '_negative'
    input: df_mi_full = mean intensity dataframe with all locations
    return: df_mi = selected columns with all rows containing NAs dropped
        df_mi_nas = selected columns before dropping NAs
    '''
    ls_nuclei = [
        'DAPI1_nuclei', 'DAPI2_nuclei', 'DAPI3_nuclei', 'DAPI4_nuclei','DAPI5_nuclei', 'DAPI5Q_nuclei',
        'DAPI6_nuclei', 'DAPI7_nuclei','DAPI8_nuclei', 'DAPI9_nuclei',
        'DAPI10_nuclei', 'DAPI11_nuclei','DAPI12_nuclei','DAPI12Q_nuclei',
        'ER_nuclei','AR_nuclei','PgR_nuclei',
        'Ki67_nuclei', 'pRB_nuclei','PCNA_nuclei', 'pHH3_nuclei',
        'FoxP3_nuclei', 'GRNZB_nuclei',
        'H3K27_nuclei', 'H3K4_nuclei',
        'LamAC_nuclei', 'LamB1_nuclei', 'LamB2_nuclei',
        'HIF1a_nuclei', 'pERK_nuclei', 'cPARP_nuclei', 'gH2AX_nuclei',
    ]
    ls_perinuc = [
        'CD44_perinuc5',
        'CD20_perinuc5', 'CD31_perinuc5',
        'CD3_perinuc5', 'CD45_perinuc5', 'CD4_perinuc5',
        'CD68_perinuc5', 'CD8_perinuc5','pS6RP_perinuc5',
        'ColIV_perinuc5', 'ColI_perinuc5', 'CoxIV_perinuc5',
        'PD1_perinuc5', 'PDPN_perinuc5','PDGFRa_perinuc5',
        'Vim_perinuc5', 'aSMA_perinuc5','BMP2_perinuc5',
    ]
    ls_cytoplasm = [
        'CK14_cytoplasm','CK5_cytoplasm','CK17_cytoplasm',
        'CK19_cytoplasm','CK7_cytoplasm','CK8_cytoplasm',
        'Ecad_cytoplasm','HER2_cytoplasm','EGFR_cytoplasm',
    ]
    # negative columns are whatever the dataframe provides
    ls_negative = df_mi_full.columns[df_mi_full.columns.str.contains('_negative')].tolist()
    ls_select = ls_nuclei + ls_perinuc + ls_cytoplasm + ['slide_scene'] + ls_negative

    b_selected = df_mi_full.columns.isin(ls_select)
    df_mi_nas = df_mi_full.loc[:,b_selected]
    print(f'Selected makers that were missing from mean intensity {set(ls_select) - set(df_mi_nas.columns)}')
    #filter out nas
    print(f'Number on df_mi nas = {df_mi_nas.isna().sum().max()}')
    df_mi = df_mi_nas.dropna(axis=0,how='any')
    return(df_mi,df_mi_nas)
+
+###### below: functions for guillaumes features ########
+
def load_mi(s_sample, s_path='./', b_set_index=True):
    """
    input:
        s_sample: string with sample name
        s_path: folder holding the data, default is current folder
            (a trailing path separator is optional)
        b_set_index: if True, prefix every index label with '{s_sample}_'

    output:
        df_mi: dataframe with mean intensity, read from
            features_{s_sample}_MeanIntensity.tsv (tab separated, first
            column used as index)

    description:
        load the mean intensity dataframe
    """
    s_file = f'features_{s_sample}_MeanIntensity.tsv'
    print(s_file)
    # join instead of string concatenation so s_path works with or without a trailing slash
    df_mi = pd.read_csv(
        os.path.join(s_path, s_file),
        sep='\t',
        index_col=0
    )
    if b_set_index:
        df_mi = df_mi.set_index(f'{s_sample}_' + df_mi.index.astype(str))
    return(df_mi)
+
def load_xy(s_sample, s_path='./', b_set_index=True):
    """
    input:
        s_sample: string with sample name
        s_path: folder holding the data, default is current folder
        b_set_index: if True, prefix every index label with '{s_sample}_'

    output:
        df_xy: dataframe with the centroid coordinates; columns that exist
            in both files get the suffixes _X and _Y

    description:
        load features_{s_sample}_CentroidX.tsv and
        features_{s_sample}_CentroidY.tsv and merge them on the index
    """
    d_centroid = {}
    for s_axis in ('Y', 'X'):
        s_file = f'features_{s_sample}_Centroid{s_axis}.tsv'
        print(s_file)
        # s_path used to be ignored here; files are now read from that folder
        df_axis = pd.read_csv(
            os.path.join(s_path, s_file),
            sep='\t',
            index_col=0
        )
        if b_set_index:
            df_axis = df_axis.set_index(f'{s_sample}_' + df_axis.index.astype(str))
        d_centroid[s_axis] = df_axis
    #merge the x and y dataframes
    df_xy = pd.merge(d_centroid['X'],d_centroid['Y'],left_index=True,right_index=True,suffixes=('_X', '_Y'))
    return(df_xy)
+
def add_scene(df,i_scene_index=1,s_group='scene'):
    """
    description: add a column with a grouping to a dataframe that has the
    grouping in its index
    input:
        df: dataframe whose index labels are '_' separated strings
        i_scene_index: position of the grouping within the split index label
        s_group: name of the column to add
    output:
        the same dataframe object, modified in place
    """
    df[s_group] = [ls_part[i_scene_index] for ls_part in df.index.str.split('_')]
    return(df)
+
def filter_dapi(df_mi,df_xy,s_dapi='DAPI11_Nuclei',dapi_thresh=1000,b_images=False,t_figsize=(8,8)):
    """
    description: return a dataframe where all cells have DAPI brighter than a threshold
    input:
        df_mi: mean intensity dataframe containing the column s_dapi
        df_xy: centroid dataframe with the columns scene, DAPI_X and DAPI_Y
            (only used when b_images is True)
        s_dapi: name of the dapi column to threshold
        dapi_thresh: cells are kept when their dapi value is greater than this
        b_images: if True, save one scatterplot per scene showing the kept cells
        t_figsize: figure size of the scatterplots
    output:
        df_filtered_mi: copy of the kept rows, index named UNIQID
    """
    df_filtered_mi = df_mi[df_mi.loc[:,s_dapi]>dapi_thresh].copy(deep=True)
    print(f'Cells before DAPI filter = {len(df_mi)}')
    print(f'Cells after DAPI filter = {len(df_filtered_mi)}')
    # rename builds a new index object, the input index is left untouched
    df_filtered_mi.index = df_filtered_mi.index.rename('UNIQID')
    # nothing to plot for an empty input, and no tissue id to read from its index
    if b_images and len(df_mi) > 0:
        #get tissue id from the dataframe
        s_tissue = df_mi.index[0].split('_')[0]
        # positive cells are the same for every scene, look them up once
        df_pos = df_xy.loc[df_filtered_mi.index.tolist()]
        for s_scene in sorted(set(df_xy.scene)):
            df_pos_scene = df_pos[df_pos.scene==s_scene]
            if len(df_pos_scene) >= 1:
                df_xy_scene = df_xy[df_xy.scene==s_scene]
                fig,ax=plt.subplots(figsize=t_figsize)
                ax.scatter(x=df_xy_scene.loc[:,'DAPI_X'], y=df_xy_scene.loc[:,'DAPI_Y'], color='silver',label='DAPI neg', s=2)
                ax.scatter(x=df_pos_scene.loc[:,'DAPI_X'], y=df_pos_scene.loc[:,'DAPI_Y'], color='DarkBlue',label='DAPI pos',s=2)
                ax.axis('equal')
                # flip the y axis to image orientation
                ax.set_ylim(ax.get_ylim()[::-1])
                ax.set_title(f'{s_scene}_DAPI')
                ax.legend(markerscale=3)
                fig.savefig(f'{s_tissue}_{s_scene}_{s_dapi}{dapi_thresh}.png')
                # release the figure, otherwise every scene stays in memory
                plt.close(fig)
    return(df_filtered_mi)
+
def load_meta(s_sample, s_path='./',type='csv'):
    """
    load rounds cycles table
    make sure to specify location for use with downstream functions
    make sure to add rows for any biomarkers used for analysis or processing
    input:
        s_sample: string with sample name
        s_path: folder holding the metadata file, default is current folder
        type: selects file and layout
            'Location'    metadata_{s_sample}_RoundsCyclesTable_location.txt,
                          whitespace delimited, no header, marker column first
            'csv'         metadata_{s_sample}_RoundsCyclesTable.csv,
                          header row replaced, first column used as index
            'LocationCsv' metadata_{s_sample}_RoundsCyclesTable_location.csv,
                          header row replaced, marker column first
            anything else metadata_{s_sample}_RoundsCyclesTable.txt,
                          whitespace delimited, no header
    output:
        df_t: metadata dataframe with 'Nucleus' replaced by 'Nuclei'; for the
            two location types the index is marker_location
    """
    ls_names = ['rounds','color','minimum', 'maximum', 'exposure', 'refexp','location']
    b_marker_loc = type in ('Location', 'LocationCsv')
    #tab or space delimited
    if type == 'Location':
        s_file = f'metadata_{s_sample}_RoundsCyclesTable_location.txt'
        # sep=r'\s+' is the documented replacement of the deprecated delim_whitespace=True
        d_read = dict(sep=r'\s+', header=None, index_col=False, names=['marker'] + ls_names)
    elif type == 'csv':
        s_file = f'metadata_{s_sample}_RoundsCyclesTable.csv'
        d_read = dict(header=0, index_col=0, names=ls_names)
    elif type == 'LocationCsv':
        s_file = f'metadata_{s_sample}_RoundsCyclesTable_location.csv'
        d_read = dict(header=0, index_col=False, names=['marker'] + ls_names)
    else:
        s_file = f'metadata_{s_sample}_RoundsCyclesTable.txt'
        d_read = dict(sep=r'\s+', header=None, index_col=False, names=ls_names)
    print(s_file)
    # s_path used to be ignored; the file is now read from that folder
    df_t = pd.read_csv(os.path.join(s_path, s_file), **d_read)
    if type != 'csv':
        df_t = df_t.set_index(f'{s_sample}_' + df_t.index.astype(str))
    df_t = df_t.replace({'Nucleus':'Nuclei'})
    if b_marker_loc:
        df_t['marker_loc'] = df_t.marker + '_' + df_t.location
        df_t = df_t.set_index(keys='marker_loc')
    return(df_t)
+
def add_exposure_roundscyles(df_tc, df_expc,es_standard,ls_dapi = ['DAPI12_Nuclei']):
    """
    fill exposure times and standard locations into the rounds cycles metadata
    input:
        df_tc = metadata dataframe indexed by marker, with the columns
            rounds, colors, exposure and refexp
        df_expc = dataframe of exposure times with columns [0, 1,2,3,4]
            and index with czi image names (the round is the text before
            the first '_')
        es_standard = set of standard biomarker_Ring / biomarker_Nuclei names
        ls_dapi = dapi marker_location names whose marker rows are dropped
    output:
        df_t = copy of df_tc with exposure, refexp and location filled in;
            neither input dataframe is modified
    """
    df_t = df_tc.copy()
    df_exp = df_expc.copy()
    df_t['location'] = ''
    df_t.drop([item.split('_')[0] for item in ls_dapi], inplace=True)
    # exposure columns 0..4 correspond to channels c1..c5
    df_exp.columns = ['c' + str(int(item)+1) for item in df_exp.columns]
    df_exp['rounds'] = [item.split('_')[0] for item in df_exp.index]
    for s_index in df_t.index:
        s_channel = df_t.loc[s_index,'colors']
        s_round = df_t.loc[s_index, 'rounds']
        print(s_round)
        #look up exposure time for marker in metadata
        df_t_image = df_exp[(df_exp.rounds==s_round)]
        if len(df_t_image) > 0:
            # first image of the round; positional access because the index holds image names
            i_exposure = df_t_image.loc[:,s_channel].iloc[0]
            df_t.loc[s_index,'exposure'] = i_exposure
            df_t.loc[s_index,'refexp'] = i_exposure
        else:
            print(f'{s_index} has no recorded exposure time')
        # location is only set when exactly one standard location is known
        ls_loc = sorted(es_standard.intersection({s_index + '_Ring',s_index + '_Nuclei'}))
        if len(ls_loc) == 1:
            df_t.loc[s_index,'location'] = ls_loc[0].split('_')[1]
    return(df_t)
+
def filter_loc(df_mi,df_t):
    """
    keep only the columns of df_mi that are listed in the index of the
    metadata_location table df_t, in the order of that index
    """
    return(df_mi.loc[:,df_t.index.tolist()])
+
+#R0c2 R0c3 R0c4 R0c5 panCK CK14 Ki67 CK19 R1rc2 R1rc3 Ki67r R1rc5 PCNA HER2 ER Ecad aSMA AR pAKT
+#CD44 CK5 EGFR pRB LamAC pHH3 PDPN pERK FoxP3 R5Qc2 R5Qc3 R5Qc4 R5Qc5 CK7 CD68 PD1 CD45 Vim CD8 CD4 PgR CK8 cPARP ColIV CD20 CK17
+#H3K4 gH2AX ColI H3K27 pS6RP CD31 GRNZB LamB1 CoxIV HIF1a CD3 Glut1 PDGFRa LamB2 BMP2 R12Qc2 R12Qc3 R12Qc4 R12Qc5 DAPI12
+
def filter_background(df_mi, es_standard):
    '''
    for each standard biomarker take the opposite subcellular location
    (Ring for nuclear markers, Nuclei for ring markers) and summarize its
    mean intensity per scene
    input: df_mi = mean intensity dataframe with all biomarker locations;
        a scene column is added to it by add_scene
        es_standard = set of biomarker_Ring / biomarker_Nuclei names
    return: dataframe indexed by scene with the quantiles 0, 0.1 ... 0.9
        ({column}_0 ... {column}_9) and the mean ({column}_mean) of each
        selected column
    '''
    # swap Ring and Nuclei, with Rim as the temporary placeholder
    ls_swapped = [
        item.replace('Nuclei','Rim').replace('Ring','Nuclei').replace('Rim','Ring')
        for item in sorted(es_standard)
    ]
    df_scene = add_scene(df_mi)
    ls_present = sorted(set(df_scene.columns).intersection(set(ls_swapped + ['scene'])))
    #quantiles
    df_bg = df_scene.loc[:,ls_present].groupby('scene').quantile(0)
    df_bg.columns = [f'{item}' for item in df_bg.columns]
    for i_decile, q in enumerate(np.arange(0,1,.1)):
        df_quantile = df_scene.loc[:,ls_present].groupby('scene').quantile(q)
        df_bg = df_bg.merge(df_quantile,left_index=True, right_index=True, suffixes=('',f'_{i_decile}'))
        print(q)
        print(f'_{i_decile}')
    #mean
    df_mean = df_scene.loc[:,ls_present].groupby('scene').mean()
    df_bg = df_bg.merge(df_mean,left_index=True, right_index=True, suffixes=('','_mean'))
    #drop the un-suffixed seed columns
    ls_marker_only = [item for item in ls_present if item != 'scene']
    df_bg = df_bg.loc[:,~df_bg.columns.isin(ls_marker_only)]
    return(df_bg)
+
def exposure_norm(df_mi,df_t,d_factor={'c1':10,'c2':30,'c3':200,'c4':500,'c5':500}):
    """
    normalizes to standard exposure times
    input:
        df_mi = mean intensity dataframe, columns named marker_location
        df_t = metadata table indexed by marker, with exposure and colors columns
        d_factor = standard exposure time of each channel
    output:
        df_norm = mean intensity divided by the marker's exposure time and
            multiplied by the standard exposure of its channel
    """
    df_norm = pd.DataFrame()
    for s_marker_loc in df_mi.columns.tolist():
        # metadata is looked up by the marker name, without the location
        s_marker = s_marker_loc.split('_')[0]
        i_exp = df_t.loc[s_marker,'exposure']
        print(f'Processing exposure time for {s_marker}: {i_exp}')
        print(f'Processing mean intensity {s_marker_loc}')
        i_factor = d_factor[df_t.loc[s_marker,'colors']]
        df_norm[s_marker_loc] = df_mi.loc[:,s_marker_loc]/i_exp*i_factor
    return(df_norm)
+
def af_subtract(df_norm,df_t,d_channel={'c2':['L488','L488'],'c3':['L555','L555'],'c4':['L647','L647'],'c5':['L750','L750']},ls_exclude=[]):
    """
    given an exposure normalized dataframe, metadata indexed by marker, and a dictionary of background channels, subtracts
    correct background intensity from each cell
    input:
        df_norm = exposure normalized dataframe, every column named marker_Ring or marker_Nuclei
        df_t = metadata indexed by marker with a colors column
        d_channel = dictionary, key is color i.e. 'c2', value is list of [ring background marker, nuclei background marker]
        ls_exclude = markers to not subtract
    output:
        df_mi_sub = subtracted columns (c1 columns copied unchanged), followed by the excluded columns
        ls_sub = sorted list of the columns that were processed
        ls_record = one text line per subtracted or excluded column
    raises:
        ValueError if a column to subtract has a location other than Ring or Nuclei
    """
    #generate set of background markers needed for subtraction
    es_background = set()
    for ls_channel in d_channel.values():
        es_background.add(f'{ls_channel[0]}_Ring')
        es_background.add(f'{ls_channel[1]}_Nuclei')
    es_all = set(df_norm.columns.tolist())
    es_exclude = set([item + '_Ring' for item in ls_exclude] + [item + '_Nuclei' for item in ls_exclude]).intersection(es_all)
    # sorted so the column order does not depend on set hashing
    ls_sub = sorted(es_all - es_background - es_exclude)

    #subtract AF channels
    d_sub = {}
    ls_record = []
    es_subtracted = set()
    for s_marker_loc in ls_sub:
        print(s_marker_loc)
        s_marker = s_marker_loc.split('_')[0]
        s_loc = s_marker_loc.split('_')[1]
        s_channel = df_t.loc[s_marker,'colors']
        if s_channel == 'c1':
            # dapi channel, nothing to subtract
            d_sub[s_marker_loc] = df_norm.loc[:,s_marker_loc]
            continue
        if s_loc =='Nuclei':
            s_AF = d_channel[s_channel][1]
        elif s_loc == 'Ring':
            s_AF = d_channel[s_channel][0]
        else:
            # stop here instead of subtracting the previous marker's background
            raise ValueError(f'Error: location must be Ring or Nucleus, got {s_marker_loc}')
        s_AF_loc = s_AF + '_' + s_loc
        d_sub[s_marker_loc] = df_norm.loc[:,s_marker_loc] - df_norm.loc[:,s_AF_loc]
        es_subtracted.add(s_marker)
        print(f'From {s_marker_loc} subtracting {s_AF_loc}')
        ls_record.append(f'From {s_marker_loc} subtracting {s_AF_loc}\n')
    for s_marker_loc in sorted(es_exclude):
        ls_record.append(f'From {s_marker_loc} subtracting None\n')
        d_sub[s_marker_loc] = df_norm.loc[:,s_marker_loc]
    # build the dataframe once instead of inserting column by column
    df_mi_sub = pd.DataFrame(d_sub)
    #error check: df_t is indexed by marker, so compare marker names
    print('AF subtraction not performed for the following markers:')
    print(set(df_t.index) - es_subtracted)

    return(df_mi_sub,ls_sub,ls_record)
+
def plot_subtraction(df_norm,df_sub,ls_scene=None):
    """
    makes scatterplots of each marker, subtracted versus original meanintensity per cell, to judge subtraction effectiveness
    input:
        df_norm = original dataframe with a scene column
        df_sub = subtracted dataframe with a scene column
        ls_scene = scenes to plot, default is every scene of df_norm
            (the list passed in is not modified)
    output:
        one {marker}_NegativevsOriginal.png per marker, saved in the current folder
    """
    if ls_scene is None:
        ls_scene = list(set(df_norm.scene))
    # sorted copy, the caller's list keeps its order
    ls_scene = sorted(ls_scene)
    ls_marker = [item for item in df_sub.columns.tolist() if item != 'scene']
    for s_marker in ls_marker:
        print(f'Plotting {s_marker}')
        # two rows of panels, one panel per scene
        fig, ax = plt.subplots(2,(len(ls_scene)+1)//2, figsize = (12,4))
        ax = ax.ravel()
        for ax_num, s_scene in enumerate(ls_scene):
            df_subtracted = df_sub[df_sub.scene==s_scene]
            df_original = df_norm[df_norm.scene==s_scene]
            ax[ax_num].scatter(x=df_original.loc[:,s_marker],y=df_subtracted.loc[:,s_marker],s=1,alpha=0.8)
            ax[ax_num].set_title(s_scene,{'fontsize': 10,'verticalalignment': 'center'})
        fig.text(0.5, 0.01, s_marker, ha='center')
        fig.text(0.6, 0.01, 'Original', ha='center')
        fig.text(0.01, 0.6, 'Subtracted', va='center', rotation='vertical')
        plt.tight_layout()
        fig.savefig(f'{s_marker}_NegativevsOriginal.png')
        # release the figure, one is created per marker
        plt.close(fig)
+
def output_subtract(df_sub,df_t,d_factor={'c1':10,'c2':30,'c3':200,'c4':500,'c5':500}):
    """
    this un-normalizes by exposure time to output a new dataframe of AF subtracted cells for analysis
    input:
        df_sub = AF subtracted dataframe, columns named marker_location, optionally with a scene column
        df_t = metadata table indexed by marker, with exposure and colors columns
        d_factor = standard exposure time of each channel
    output:
        df_mi_factor = negative values set to zero, then each column multiplied
            by exposure / standard exposure of its channel (scene column dropped)
    """
    ls_sub = df_sub.columns.tolist()
    if 'scene' in ls_sub:
        ls_sub.remove('scene')
        df_sub = df_sub.drop(columns='scene')
    else:
        print('no scene column')
    # negative intensities after subtraction are set to zero
    df_mi_zero = df_sub.clip(lower = 0)
    df_mi_factor = pd.DataFrame()
    for s_sub in ls_sub:
        s_marker = s_sub.split('_')[0]
        i_exposure = df_t.loc[s_marker,'exposure']
        i_standard = d_factor[df_t.loc[s_marker,'colors']]
        i_reverse_factor = i_exposure/i_standard
        df_mi_factor[s_sub] = df_mi_zero.loc[:,s_sub]*i_reverse_factor
    return df_mi_factor
+
def af_subtract_images(df_t,d_channel={'c2':['L488','L488'],'c3':['L555','L555'],'c4':['L647','L647'],'c5':['L750','L750']},s_dapi='DAPI11_Nuclei',b_mkdir=True):
    """
    This code loads grayscale tiffs from the current folder, performs AF subtraction of channels/rounds defined by the user,
    and outputs 8 bit AF subtracted tiffs for visualization in the subfolder 8bit.
    The data required is:
    1. df_t, the rounds cycles table indexed by marker_location (Ring or Nuclei, not All), with the columns
       rounds, color and exposure (real exposure times); an image column is added to it in place
    2. tiff images whose names contain '_ORG.tif', 'Registered-{round}_' and 'Scene-..._{color}_'
    input:
        d_channel = dictionary, key is color i.e. 'c2', value is list of [ring background marker, nuclei background marker]
        s_dapi = not used by this function, kept for backward compatibility
        b_mkdir = create the 8bit folder (an existing folder is reused)
    """
    #generate list of markers needing subtraction
    es_background = set()
    for ls_channel in d_channel.values():
        es_background.add(f'{ls_channel[0]}_Ring')
        es_background.add(f'{ls_channel[1]}_Nuclei')
    ls_sub = sorted(set(df_t.index) - es_background)
    if b_mkdir:
        # exist_ok so a second run does not fail on the existing folder
        os.makedirs('8bit', exist_ok=True)
    #all original images in the folder, and the slides/scenes they belong to
    ls_image_org = sorted(s_image for s_image in os.listdir() if s_image.find('_ORG.tif')>-1)
    ls_slide = sorted(set(s_image.split('_')[2] for s_image in ls_image_org))
    #process each slide in the folder
    for s_slide in ls_slide:
        print(f'Processing {s_slide}')
        df_t['image'] = 'NA'
        ls_dapi = []
        for s_image in ls_image_org:
            #grab all original images with slide/scene name
            if s_image.find(s_slide) > -1:
                #add matching image name to df_t (for specific slide/scene, dapi not included)
                s_round = s_image.split('Registered-')[1].split('_')[0]
                s_color = s_image.split('Scene-')[1].split('_')[1]
                s_index = df_t[(df_t.rounds==s_round) & (df_t.color==s_color)].index
                df_t.loc[s_index,'image'] = s_image
                if s_color == 'c1':
                    ls_dapi.append(s_image)
        #subtract images
        ls_record = []
        for s_marker_loc in ls_sub:
            s_marker = s_marker_loc.split('_')[0]
            s_loc = s_marker_loc.split('_')[1]
            s_rounds= df_t.loc[s_marker_loc,'rounds']
            s_channel = df_t.loc[s_marker_loc,'color']
            if s_channel == 'c1':
                print(f'{s_marker_loc} is DAPI')
                continue
            elif s_loc =='Nuclei':
                s_AF = d_channel[s_channel][1]
            elif s_loc == 'Ring':
                s_AF = d_channel[s_channel][0]
            else:
                # skip instead of subtracting the previous marker's background image
                print('Error: location must be Ring or Nucleus')
                continue
            s_AF_loc = s_AF + '_' + s_loc
            print(f'From {s_marker_loc} subtracting {s_AF_loc}')
            s_image = df_t.loc[s_marker_loc,'image']
            s_background = df_t.loc[s_AF_loc,'image']
            a_img = skimage.io.imread(s_image)
            a_AF = skimage.io.imread(s_background)
            #divide each image by exposure time
            #subtract 1 ms AF from 1 ms signal
            #multiply by original image exposure time
            a_sub = (a_img/df_t.loc[s_marker_loc,'exposure'] - a_AF/df_t.loc[s_AF_loc,'exposure'])*df_t.loc[s_marker_loc,'exposure']
            ls_record.append(f'From {s_marker_loc} subtracting {s_AF_loc}\n')
            #make all negative numbers into zero
            a_zero = a_sub.clip(min=0,max=a_sub.max())
            a_zero_8bit = (a_zero/256).astype(np.uint8)
            s_fname = f"8bit/{s_rounds}_{s_marker}_{s_slide}_{s_channel}_8bit.tif"
            skimage.io.imsave(s_fname,a_zero_8bit)
        # context manager closes the record file even if writing fails
        with open("8bit/AFsubtractionImages.txt", "w") as f:
            f.writelines(ls_record)
        #save 8 bit dapis
        for s_dapi_image in ls_dapi:
            a_img = skimage.io.imread(s_dapi_image)
            a_zero_8bit = (a_img/256).astype(np.uint8)
            s_round = s_dapi_image.split('Registered-')[1].split('_')[0]
            s_fname = f"8bit/{s_round}_DAPI_{s_slide}_c1_8bit.tif"
            skimage.io.imsave(s_fname,a_zero_8bit)
+
def round_overlays():
    """
    output multipage tiffs with five channels per round
    reads every file containing '8bit.tif' in the subfolder ./8bit (names
    formatted {round}_{marker}_{slide}_{color}_8bit.tif) and writes one
    {round}_{markers}_{slide}_overlay.tiff per slide and round into that
    folder; the working directory is restored afterwards, also on error
    """
    s_cwd = os.getcwd()
    os.chdir('./8bit')
    try:
        ls_image_org = sorted(s_image for s_image in os.listdir() if s_image.find('8bit.tif') > -1)
        #lists of slides/scenes and of rounds
        ls_slide = sorted(set(s_image.split('_')[2] for s_image in ls_image_org))
        ls_round = sorted(set(s_image.split('_')[0] for s_image in ls_image_org))
        for s_slide in ls_slide:
            print(f'Processing {s_slide}')
            for s_round in ls_round:
                d_overlay = {}
                for s_image in ls_image_org:
                    if s_image.find(s_slide) > -1 and s_image.find(f'{s_round}_') == 0:
                        s_color = s_image.split('_')[3]
                        d_overlay.update({s_color:s_image})
                if len(d_overlay) == 0:
                    # round not available for this slide, nothing to overlay
                    continue
                ls_color = sorted(d_overlay.keys())
                # any image of the round gives the page size
                a_size = skimage.io.imread(d_overlay[ls_color[0]])
                a_overlay = np.zeros((len(d_overlay),a_size.shape[0],a_size.shape[1]),dtype=np.uint8)
                ls_biomarker = []
                for i, s_color in enumerate(ls_color):
                    s_overlay= d_overlay[s_color]
                    ls_biomarker.append(s_overlay.split('_')[1])
                    a_overlay[i,:,:] = skimage.io.imread(s_overlay)
                s_biomarker_all = '.'.join(ls_biomarker)
                #this works. Open in image j. use Image/Color/Make Composite. Then use
                #Image/Color/Channels Tool to turn on and off channels
                #use Image/Adjust/Brightness/Contrast to adjust
                # NOTE(review): skimage.external is absent from recent scikit-image releases - confirm the pinned version
                with skimage.external.tifffile.TiffWriter(f'{s_round}_{s_biomarker_all}_{s_slide}_overlay.tiff', imagej=True) as tif:
                    for i in range(a_overlay.shape[0]):
                        tif.save(a_overlay[i])
    finally:
        os.chdir(s_cwd)
+
def custom_overlays(d_combos, df_img, df_dapi):
    """
    output custom multi page tiffs according to dictionary, with dapi as channel 1 in each overlay
    BUG with 53BP1
    input:
        d_combos = dictionary of overlay name: set of markers, e.g.
            {'Immune':{'CD45', 'PD1', 'CD8', 'CD4', 'CD68', 'FoxP3','GRNZB','CD20','CD3'},
            'Stromal':{'Vim', 'aSMA', 'PDPN', 'CD31', 'ColIV','ColI'},
            'Differentiation':{'CK19', 'CK7','CK5', 'CK14', 'CK17','CK8'},
            'Tumor':{'HER2', 'Ecad', 'ER', 'PgR','Ki67','PCNA'},
            'Proliferation':{'EGFR','CD44','AR','pHH3','pRB'},
            'Functional':{'pS6RP','H3K27','H3K4','cPARP','gH2AX','pAKT','pERK'},
            'Lamins':{'LamB1','LamAC', 'LamB2'}}
        df_img = dataframe indexed by image file name, with the columns scene, marker and imagetype
        df_dapi = dataframe indexed by dapi image file name, with the columns scene and marker
    output:
        one {name}_{dapi marker}.{markers}_{scene}_overlay.tiff per scene and
        overlay name, saved in the current folder
    """
    #os.chdir('./AFSubtracted')

    # the image type decides the 8 bit conversion and is the same for every scene
    ls_imagetype = list(set(df_img.imagetype))
    if len(ls_imagetype)==1:
        s_imagetype = ls_imagetype[0]
        print(s_imagetype)
    else:
        print('Error: more than one image type)')
        return
    #now make overlays
    for s_slide in list(set(df_img.scene)):
        print(f'Processing {s_slide}')
        df_slide = df_img[df_img.scene==s_slide]
        df_dapi_slide = df_dapi[df_dapi.scene == s_slide]
        # check before indexing, so a missing dapi skips the scene instead of raising
        if len(df_dapi_slide.index) == 0:
            print('Error: dapi not found')
            continue
        s_image_round = df_dapi_slide.index[0]
        if len(df_dapi_slide.index) > 1:
            print('Error: too many dapi images found')
        else:
            print(s_image_round)
        #exclude any missing biomarkers
        es_all = set(df_slide.marker)
        for s_type in d_combos:
            d_overlay = {}
            es_combos_shared = d_combos[s_type].intersection(es_all)
            for s_combo in sorted(es_combos_shared):
                ls_filename = df_slide[df_slide.marker==s_combo].index
                if len(ls_filename) == 0:
                    print('Error: marker not found')
                    continue
                s_filename = ls_filename[0]
                if len(ls_filename) > 1:
                    # NOTE(review): the first image is used, as the original appears to do - confirm
                    print('Error: too many marker images found')
                else:
                    print(s_filename)
                d_overlay.update({s_combo:s_filename})
            # this key puts dapi first when the keys are sorted
            d_overlay.update({'1AAADAPI':s_image_round})
            a_size = skimage.io.imread(s_image_round)
            a_overlay = np.zeros((len(d_overlay),a_size.shape[0],a_size.shape[1]),dtype=np.uint8)
            s_biomarker_all = ''
            for i, s_color in enumerate(sorted(d_overlay.keys())):
                s_overlay= d_overlay[s_color]
                s_biomarker = s_color.split('1AAA')[0] + '.'
                s_biomarker_all = s_biomarker_all + s_biomarker
                a_channel = skimage.io.imread(s_overlay)
                if s_imagetype=='ORG':
                    a_channel = (a_channel/256).astype(np.uint8)
                    print('covert to 8 bit')
                a_overlay[i,:,:] = a_channel
            # drop the leading '.' left by the dapi key and the trailing '.'
            s_biomarker_all = s_biomarker_all[1:-1]
            #this works. Open in image j. use Image/Color/Make Composite. Then use
            #Image/Color/Channels Tool to turn on and off channels
            #use Image/Adjust/Brightness/Contrast to adjust
            # NOTE(review): skimage.external is absent from recent scikit-image releases - confirm the pinned version
            with skimage.external.tifffile.TiffWriter(f'./{s_type}_{df_dapi_slide.marker.iloc[0]}.{s_biomarker_all}_{s_slide}_overlay.tiff', imagej=True) as tif:
                for i in range(a_overlay.shape[0]):
                    tif.save(a_overlay[i])
            print(f'saved {s_type}')
+
def custom_crop_overlays(d_combos,d_crop, df_img,s_dapi, tu_dim=(1000,1000)): #df_dapi,
    """
    output custom cropped multi page tiffs according to dictionary, with s_dapi as channel 1 in each overlay
    BUG with 53BP1 (noted by the original author)
    Input:
    d_combos: {overlay name: set of marker names}, for example
      d_combos = {'Immune':{'CD45', 'PD1', 'CD8', 'CD4', 'CD68', 'FoxP3','GRNZB','CD20','CD3'},
        'Stromal':{'Vim', 'aSMA', 'PDPN', 'CD31', 'ColIV','ColI'},
        'Differentiation':{'CK19', 'CK7','CK5', 'CK14', 'CK17','CK8'},
        'Tumor':{'HER2', 'Ecad', 'ER', 'PgR','Ki67','PCNA'},
        'Proliferation':{'EGFR','CD44','AR','pHH3','pRB'},
        'Functional':{'pS6RP','H3K27','H3K4','cPARP','gH2AX','pAKT','pERK'},
        'Lamins':{'LamB1','LamAC', 'LamB2'}}
    d_crop: {slide_scene : (x,y)} coordinate where the crop starts
    df_img: dataframe with the image file names in the index and
      scene, marker and imagetype columns
    s_dapi: dapi marker name; only the text before the first '_' is compared to df_img.marker
    tu_dim = (width, height) of the crop
    Output:
    None. One ImageJ multi page tiff per slide and overlay name is written to the working directory.
    """
    #os.chdir('./AFSubtracted')

    s_dapi_marker = s_dapi.split('_')[0]
    #now make overlays
    for s_slide, xy_cropcoor in d_crop.items():
        print(f'Processing {s_slide}')
        df_slide = df_img[df_img.scene==s_slide]
        # check that a dapi image exists BEFORE taking the first one;
        # indexing first raised IndexError and made the error message unreachable
        ls_dapi_file = df_slide[df_slide.marker==s_dapi_marker].index
        if len(ls_dapi_file) == 0:
            print('Error: dapi not found')
            continue
        s_image_round = ls_dapi_file[0]
        if len(ls_dapi_file) > 1:
            print('Error: too many dapi images found')
        else:
            print(s_image_round)
        #exclude any missing biomarkers
        es_all = set(df_slide.marker)
        # NOTE(review): when df_img holds more than one image type, s_imagetype is never
        # assigned and the 8 bit conversion check below fails with a NameError.
        if len(list(set(df_img.imagetype)))==1:
            s_imagetype = list(set(df_img.imagetype))[0]
            print(s_imagetype)
        else:
            print('Error: more than one image type)')
        # crop window: rows are y, columns are x
        sl_row = slice(xy_cropcoor[1], xy_cropcoor[1]+tu_dim[1])
        sl_col = slice(xy_cropcoor[0], xy_cropcoor[0]+tu_dim[0])
        for s_type, es_combos in d_combos.items():
            d_overlay = {}
            es_combos_shared = es_combos.intersection(es_all)
            for s_combo in sorted(es_combos_shared):
                ls_marker_file = df_slide[df_slide.marker==s_combo].index
                if len(ls_marker_file) == 0:
                    print('Error: marker not found')
                    continue
                s_filename = ls_marker_file[0]
                if len(ls_marker_file) > 1:
                    # the first image found is used
                    print('Error: too many marker images found')
                else:
                    print(s_filename)
                d_overlay.update({s_combo:s_filename})
            # the '1AAA' prefix makes dapi sort first, i.e. become channel 1
            d_overlay.update({'1AAADAPI':s_image_round})
            a_size = skimage.io.imread(s_image_round)
            #crop
            a_crop = a_size[sl_row, sl_col]
            a_overlay = np.zeros((len(d_overlay),a_crop.shape[0],a_crop.shape[1]),dtype=np.uint8)
            s_biomarker_all = ''
            for i, s_color in enumerate(sorted(d_overlay.keys())):
                s_overlay= d_overlay[s_color]
                s_biomarker = s_color.split('1AAA')[0] + '.'
                s_biomarker_all = s_biomarker_all + s_biomarker
                a_size = skimage.io.imread(s_overlay)
                #crop
                a_channel = a_size[sl_row, sl_col]
                if s_imagetype=='ORG':
                    a_channel = (a_channel/256).astype(np.uint8)
                    print('covert to 8 bit')
                a_overlay[i,:,:] = a_channel
            # drop the leading '.' left by the dapi key and the trailing '.'
            s_biomarker_all = s_biomarker_all[1:-1]
            #this works. Open in image j. use Image/Color/Make Composite. Then use
            #Image/Color/Channels Tool to turn on and off channels
            #use Image/Adjust/Brightness/Contrast to adjust
            with skimage.external.tifffile.TiffWriter(f'./{s_type}_{s_dapi_marker}.{s_biomarker_all}_{s_slide}_x{xy_cropcoor[0]}y{xy_cropcoor[1]}_overlay.tiff', imagej=True) as tif:
                for i in range(a_overlay.shape[0]):
                    tif.save(a_overlay[i])
            print(f'saved {s_type}')
+
def make_thresh_df(df_out,ls_drop=None):
    """
    build an empty thresholding table for df_out
    Input:
    df_out: dataframe with a scene column; every other column is treated as a biomarker
    ls_drop: optional list of column names to leave out of the table
    Output:
    df_thresh: dataframe of NaN, index = sorted scenes plus 'global_manual',
      columns = sorted '<text before first _ of the biomarker>_manual'
    """
    ls_scene = sorted(list(set(df_out.scene)) + ['global_manual'])
    ls_biomarker = df_out.columns.tolist()
    ls_biomarker.remove('scene')
    for s_drop in (ls_drop if ls_drop is not None else []):
        ls_biomarker.remove(s_drop)
    ls_manual = sorted(s_biomarker.split('_')[0] + '_manual' for s_biomarker in ls_biomarker)
    #df_thresh_t = df_thresh.transpose()
    return pd.DataFrame(index=ls_scene,columns=ls_manual)
+
def check_seg(s_sample= 'sampleID',ls_find=['Cell Segmentation Full Color'], i_rows=2, t_figsize=(20,10)):
    """
    This script makes overviews of the specified segmentation images found in the
    working directory (slides prepared for segmentation for example)
    Input: s_sample = sample name, used in the keys of the result
      ls_find = list of strings; files whose name contains the string are viewed
      i_rows = number of rows in figure
      t_figsize = (x, y) in inches size of figure
    Output: dictionary with {'<s_sample>_<s_find>.png': matplotlib figure}
      each figure shows one downscaled image per scene; the scene is the
      file name text before the first '-'
    """
    d_result = {}
    for s_find in ls_find:
        #find all matching images
        ls_dapis = sorted(s_dir for s_dir in os.listdir() if s_dir.find(s_find) > -1)

        #find all unique scenes, remember the first image of each scene
        # (indexing ls_dapis by scene position showed the wrong image as soon as
        # a scene had more than one matching file)
        d_scene = {}
        for s_dapi in ls_dapis:
            d_scene.setdefault(s_dapi.split('-')[0], s_dapi)
        ls_scene = sorted(d_scene)
        if len(ls_scene) == 0:
            # plt.subplots cannot make a grid with zero columns
            print(f'No images found for {s_find}')
            continue
        fig,ax = plt.subplots(i_rows,(len(ls_scene)+(i_rows-1))//i_rows, figsize = t_figsize, squeeze=False)
        ax = ax.ravel()
        for idx, s_scene in enumerate(ls_scene):
            print(f'Processing {s_scene}')
            im_low = skimage.io.imread(d_scene[s_scene])#,plugin='simpleitk'
            # stretch contrast between the 2% quantile and 1.5 times the 98% quantile
            f_high = np.quantile(im_low,0.98)
            im = skimage.exposure.rescale_intensity(im_low,in_range=(np.quantile(im_low,0.02),f_high+f_high/2))
            im = skimage.transform.rescale(im, 0.25, anti_aliasing=False)
            ax[idx].imshow(im) #, cmap='gray'
            ax[idx].set_title(s_scene,{'fontsize':12})
        plt.tight_layout()
        #fig.savefig(f'../Check_Registration/{s_sample}_{s_find}.png')
        d_result.update({f'{s_sample}_{s_find}.png':fig})
    return(d_result)
diff --git a/mplex_image/register.py b/mplex_image/register.py
new file mode 100755
index 0000000..b963866
--- /dev/null
+++ b/mplex_image/register.py
@@ -0,0 +1,105 @@
+import numpy as np
+from PIL import Image
+from matplotlib import pyplot as plt
+from skimage import transform, util
+from skimage import data, img_as_float
+from skimage.util import img_as_ubyte
+import cv2
+import sys
+
+# code adapted from Chandler Gatenbee and Brian White
+# https://github.com/IAWG-CSBC-PSON/registration-challenge
+
def match_keypoints(moving, target, feature_detector):
    '''
    Detect features in both images, brute-force match their descriptors and keep
    the matches that RANSAC marks as inliers of a homography.

    :param moving: image that is to be warped to align with target image
    :param target: image to which the moving image will be aligned
    :param feature_detector: a feature detector from opencv
    :return: (moving points, target points), float32 coordinates of the inlier matches
    '''
    kp_moving, desc_moving = feature_detector.detectAndCompute(moving, None)
    kp_target, desc_target = feature_detector.detectAndCompute(target, None)

    bf_matcher = cv2.BFMatcher(normType=cv2.NORM_L2, crossCheck=True)
    matches = bf_matcher.match(desc_moving, desc_target)

    src_points = np.float32([kp_moving[match.queryIdx].pt for match in matches])
    dst_points = np.float32([kp_target[match.trainIdx].pt for match in matches])

    # only the inlier mask is used; the homography itself is discarded
    H, mask = cv2.findHomography(src_points, dst_points, cv2.RANSAC, ransacReprojThreshold=10)

    inliers = [match for match, flag in zip(matches, mask) if flag == [1]]

    filtered_src_points = np.float32([kp_moving[match.queryIdx].pt for match in inliers])
    filtered_dst_points = np.float32([kp_target[match.trainIdx].pt for match in inliers])

    return filtered_src_points, filtered_dst_points
+
def apply_transform(moving, target, moving_pts, target_pts, transformer, output_shape_rc=None):
    '''
    Estimate the transformation from matched points, warp the moving image onto
    the target and transform the moving points.

    :param moving: image to warp
    :param target: reference image; only its shape is used, when output_shape_rc is None
    :param moving_pts: matched point coordinates in the moving image
    :param target_pts: matched point coordinates in the target image
    :param transformer: transformer object from skimage. See https://scikit-image.org/docs/dev/api/skimage.transform.html for different transformations
    :param output_shape_rc: shape of warped image (row, col). If None, uses shape of target image
    :return: (warped image, warped moving points)
    '''
    if output_shape_rc is None:
        output_shape_rc = target.shape[:2]

    # The original test compared str(transformer.__class__) with an empty string,
    # which is never true, so this branch could not run. It is the branch that
    # avoids transformer.inverse: the transform is fitted target -> moving for the
    # image warp and refitted moving -> target for the points.
    if isinstance(transformer, transform.PolynomialTransform):
        transformer.estimate(target_pts, moving_pts)
        warped_img = transform.warp(moving, transformer, output_shape=output_shape_rc)

        ### Reestimate to warp points
        transformer.estimate(moving_pts, target_pts)
        warped_pts = transformer(moving_pts)
    else:
        transformer.estimate(moving_pts, target_pts)
        warped_img = transform.warp(moving, transformer.inverse, output_shape=output_shape_rc)
        warped_pts = transformer(moving_pts)

    return warped_img, warped_pts
+
def keypoint_distance(moving_pts, target_pts, img_h, img_w):
    '''
    mean euclidean distance between paired points, expressed as a
    fraction of the image diagonal sqrt(img_h**2 + img_w**2)
    '''
    diagonal = np.sqrt(img_h**2 + img_w**2)
    squared_offset = np.sum((moving_pts - target_pts)**2, axis=1)
    relative_offset = np.sqrt(squared_offset) / diagonal
    return np.mean(relative_offset)
+
+
+
+
def register(target_file,moving_file, b_plot=False):
    '''
    align the moving image to the target image with AKAZE keypoints and a
    similarity transform
    Input:
    target_file: file name of the reference image
    moving_file: file name of the image to align; the round and sample names are
      the first and third '_' separated parts of the name
    b_plot: if True, save a before/after figure to ../../QC/RegistrationPlots
    Output:
    (moving_pts, target_pts, transformer): the matched keypoints and the estimated transform
    '''
    ls_part = moving_file.split('_')
    s_round = ls_part[0]
    s_sample = ls_part[2]
    print(s_round)
    target = img_as_ubyte(img_as_float(Image.open(target_file)))
    moving = img_as_ubyte(img_as_float(Image.open(moving_file)))

    #fd = cv2.KAZE_create(extended=True)
    moving_pts, target_pts = match_keypoints(moving, target, feature_detector=cv2.AKAZE_create())

    transformer = transform.SimilarityTransform()
    warped_img, warped_pts = apply_transform(moving, target, moving_pts, target_pts, transformer=transformer)
    warped_img = img_as_ubyte(warped_img)

    i_height, i_width = moving.shape[0], moving.shape[1]
    print("Unaligned offset:", keypoint_distance(moving_pts, target_pts, i_height, i_width))
    print("Aligned offset:", keypoint_distance(warped_pts, target_pts, i_height, i_width))
    if b_plot:
        fig, ax = plt.subplots(2,2, figsize=(10,10))
        # left column: before alignment, right column: after alignment
        for i_col, (a_img, a_pts) in enumerate([(moving, moving_pts), (warped_img, warped_pts)]):
            ax[0][i_col].imshow(target)
            ax[0][i_col].imshow(a_img, alpha=0.5)
            ax[1][i_col].scatter(target_pts[:,0], -target_pts[:,1])
            ax[1][i_col].scatter(a_pts[:,0], -a_pts[:,1])
        plt.savefig(f"../../QC/RegistrationPlots/{s_sample}_{s_round}_rigid_align.png", format="PNG")
    return(moving_pts, target_pts, transformer)
diff --git a/mplex_image/segment.py b/mplex_image/segment.py
new file mode 100755
index 0000000..972742a
--- /dev/null
+++ b/mplex_image/segment.py
@@ -0,0 +1,717 @@
+####
+# title: segment.py
+#
+# language: Python3.7
+# date: 2020-06-00
+# license: GPL>=v3
+# author: Jenny
+#
+# description:
+# python3 script for cell segmentation
+####
+import time
+import cellpose
+from cellpose import models
+from PIL import Image
+Image.MAX_IMAGE_PIXELS = 1000000000
+
+import os
+import skimage
+import pandas as pd
+import numpy as np
+import sys
+import scipy
+from scipy import stats
+from scipy import ndimage as ndi
+from skimage import io, filters
+from skimage import measure, segmentation, morphology
+from numba import jit, types
+from numba.extending import overload
+from numba.experimental import jitclass
+import numba
+import mxnet as mx
+import stat
+from mxnet import nd
+from mplex_image import preprocess
+
+#set src path (CHANGE ME)
+s_src_path = '/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF'
+
+#functions
+
def gpu_device():
    '''
    return the mxnet gpu context if a small array can be allocated on it, else None
    '''
    try:
        # allocation raises MXNetError when no usable gpu is present
        mx.nd.array([1, 2, 3], ctx=mx.gpu())
        device = mx.gpu()
    except mx.MXNetError:
        device = None
    return device
+
def cellpose_nuc(key,dapi,diameter=30):
    '''
    segment nuclei in a dapi image with the cellpose 'nuclei' model
    smallest nuclei are about 9 pixels, lymphocyte is 15 pixels, tumor is 25 pixels
    using 20 can capture large tumor cells, without sacrificing smaller cells,
    Input:
    key: image name; the text before ' - Z' is kept in the output key
    dapi: image passed to the model
    diameter: nuclear diameter given to cellpose; also sets the minimum object size
    Output:
    {'<name> nuclei<diameter>': masks}
    '''
    # use the gpu when mxnet can allocate on it, otherwise fall back to the default device
    try:
        nd_array = mx.nd.array([1, 2, 3], ctx=mx.gpu())
    except mx.MXNetError:
        print('Mxnet error')
        mx_gpu = None
    else:
        print(nd_array)
        mx_gpu = mx.gpu()
    model = models.Cellpose(model_type='nuclei',device=mx_gpu)
    newkey = f"{key.split(' - Z')[0]} nuclei{diameter}"
    print(f"modelling {newkey}")
    i_min_size = int(np.pi*(diameter/10)**2)
    print(f'Minimum nuclei size = {i_min_size}')
    masks, _flows, _styles, _diams = model.eval(dapi, diameter=diameter, channels=[0,0],flow_threshold=0,min_size= i_min_size)
    return({newkey:masks})
+
def cellpose_cell(key,zdh,diameter=25):
    '''
    segment cells with the cellpose 'cyto' model
    big tumor cell is 30 pixels, lymphocyte about 18 pixels, small fibroblast 12 pixels
    Input:
    key: image name; the text before ' - Z' is kept in the output key
    zdh: image passed to the model; channels 2 and 3 are handed to cellpose
    diameter: cell diameter given to cellpose; also sets the minimum object size
    Output:
    {'<name> cell<diameter>': masks}
    '''
    # use the gpu when mxnet can allocate on it, otherwise fall back to the default device
    try:
        mx.nd.array([1, 2, 3], ctx=mx.gpu())
    except mx.MXNetError:
        mx_gpu = None
    else:
        mx_gpu = mx.gpu()
    model = models.Cellpose(model_type='cyto',device=mx_gpu)
    newkey = f"{key.split(' - Z')[0]} cell{diameter}"
    print(f"modelling {newkey}")
    i_min_size = int(np.pi*(diameter/5)**2)
    print(f'Minimum cell size = {i_min_size}')
    masks, _flows, _styles, _diams = model.eval(zdh, diameter=diameter, channels=[2,3],flow_threshold=0.6,cellprob_threshold=0.0, min_size= i_min_size)
    return({newkey:masks})
+
def parse_org(s_end = "ORG.tif",s_start='R'):
    """
    This function will parse images following koei's naming convention
    Example: Registered-R1_PCNA.CD8.PD1.CK19_Her2B-K157-Scene-002_c1_ORG.tif
    Input:
    s_end: only files in the working directory ending with this string are parsed
    s_start: only files starting with this string are parsed
    Output:
    dataframe with image filename in index
    and rounds, color, slide, marker_string, scene, path and marker in the columns.
    scene is '001' for every image if any slide name lacks '-Scene-'.
    marker is 'DAPI' for c1, the matching entry of the '.' separated marker string
    for c2-c7, and missing (None) for any other color.
    """
    s_path = os.getcwd()
    ls_file = [s_file for s_file in os.listdir() if s_file.endswith(s_end) and s_file.startswith(s_start)]
    df_img = pd.DataFrame(index=ls_file)
    df_img['rounds'] = [item.split('_')[0].split('Registered-')[1] for item in df_img.index]
    df_img['color'] = [item.split('_')[-2] for item in df_img.index]
    df_img['slide'] = [item.split('_')[2] for item in df_img.index]
    df_img['marker_string'] = [item.split('_')[1] for item in df_img.index]
    try:
        df_img['scene'] = [item.split('-Scene-')[1] for item in df_img.slide]
    except IndexError:
        # a slide name without '-Scene-': treat everything as a single scene
        df_img['scene'] = '001'
    df_img['path'] = [f"{s_path}/{item}" for item in df_img.index]
    #parse file name for biomarker
    # position of the color's biomarker inside the marker string
    #c6 and c7 are only included in sardana shading corrected images
    d_position = {'c2':0, 'c3':1, 'c4':2, 'c5':3, 'c6':2, 'c7':3}
    ls_marker = []
    for s_index, s_color in zip(df_img.index, df_img.color):
        if s_color == 'c1':
            s_marker = 'DAPI'
        elif s_color in d_position:
            s_marker = s_index.split('_')[1].split('.')[d_position[s_color]]
        else:
            # unknown color: record a missing marker instead of reusing the
            # previous image's marker (or failing on the first image)
            print('Error')
            s_marker = None
        ls_marker.append(s_marker)
    # assigned in one go so the column also exists when no file matched
    df_img['marker'] = ls_marker
    return(df_img)
+
def cmif_mkdir(ls_dir):
    '''
    make every directory in ls_dir (including parents) that does not exist yet
    '''
    for s_dir in ls_dir:
        if os.path.exists(s_dir):
            continue
        os.makedirs(s_dir)
+
def load_single(s_find, s_scene):
    '''
    load the image(s) in the working directory whose name contains s_find and
    rescale intensity between the 3% quantile and 1.5 times the 99.99% quantile
    return {s_scene: scaled image}; when several files match, the last one read is kept
    '''
    d_img = {}
    ls_match = [s_file for s_file in os.listdir() if s_find in s_file]
    for s_file in ls_match:
        a_img = io.imread(s_file)
        t_range = (np.quantile(a_img,0.03),1.5*np.quantile(a_img,0.9999))
        #d_img.update({f"{os.path.splitext(s_file)[0]}":a_scale})
        d_img[f"{s_scene}"] = skimage.exposure.rescale_intensity(a_img,in_range=t_range)
    print(f'Number of images = {len(d_img)}')
    return(d_img)
+
def load_stack(df_img,s_find,s_scene,ls_markers,ls_rare):
    '''
    build the cell segmentation input for one scene
    Input:
    df_img: image dataframe (must have "path" and "marker")
    s_find: string identifying the dapi image in the working directory
    s_scene: scene name used in the output key
    ls_markers: markers to include in the projection
    ls_rare: markers in ls_markers that are scaled with a higher upper quantile
    Output:
    {'<s_scene>_<markers joined by .>': uint16 stack of (zeros, max projection of the scaled markers, scaled dapi)}
    '''
    def scale(a_raw, f_quantile):
        # stretch between the 3% quantile and 1.5 times the upper quantile
        return skimage.exposure.rescale_intensity(a_raw,in_range=(np.quantile(a_raw,0.03),1.5*np.quantile(a_raw,f_quantile)))

    d_img = {}
    # the last file containing s_find is used as dapi
    for s_file in os.listdir():
        if s_find in s_file:
            dapi = scale(io.imread(s_file), 0.9999)

    #images
    b_marker = df_img.marker.isin(ls_markers)
    b_rare = df_img.marker.isin(ls_rare)
    df_common = df_img[b_marker & ~b_rare]
    df_rare = df_img[b_marker & b_rare]
    imgs = [scale(io.imread(s_path), 0.9999) for s_path in df_common.path]
    imgs.extend(scale(io.imread(s_path), 0.99999) for s_path in df_rare.path)
    mip = np.stack(imgs).max(axis=0)
    zdh = np.dstack((np.zeros(mip.shape),mip,dapi)).astype('uint16')
    # name
    s_name = f'{s_scene}_{".".join(ls_markers)}'
    d_img[s_name] = zdh
    print(f'Number of projection images = ({len(d_img)}')
    return(d_img)
+
def load_img(subdir,s_find,s_sample,s_scene,ls_seg_markers,ls_rare):
    '''
    load dapi round and cell segmentation images
    Input:
    subdir: folder holding one sub folder of images per slide
      (it is chdir'ed into twice; presumably an absolute path - TODO confirm)
    s_find: pattern identifying the dapi image; it must match exactly one file in a slide folder
    s_sample: only sub folders whose name contains this string are parsed
    s_scene: scene to make the cytoplasm projection for
    ls_seg_markers: markers used for the cytoplasm projection
    ls_rare: markers scaled with a higher upper quantile
    Output:
    d_dapi: {'<s_sample>-<s_scene>': scaled dapi image}
    d_cyto: {'<s_sample>-<s_scene>': uint16 stack of (zeros, max projection of scaled markers, unscaled dapi)}
    '''
    #image dataframe
    os.chdir(subdir)
    ls_df_markers = []
    for s_dir in os.listdir():
        if s_dir.find(s_sample)>-1:
            os.chdir(s_dir)
            df_img = parse_org()
            # copy so that adding the path column does not write into a slice of df_img
            df_markers = df_img[df_img.marker.isin(ls_seg_markers)].copy()
            df_markers['path'] = [f'{subdir}/{s_dir}/{item}' for item in df_markers.index]
            if df_img.index.str.contains(s_find).sum()==1:
                dapi = io.imread(df_img[df_img.index.str.contains(s_find)].index[0])
            os.chdir('..')
            ls_df_markers.append(df_markers)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    df_seg = pd.concat(ls_df_markers) if len(ls_df_markers) > 0 else pd.DataFrame()

    #load z_projection DAPIs
    os.chdir(subdir)
    d_dapi = {}
    d_cyto = {}

    dapi_scale = skimage.exposure.rescale_intensity(dapi,in_range=(np.quantile(dapi,0.03),1.5*np.quantile(dapi,0.9999)))
    d_dapi.update({f"{s_sample}-{s_scene}":dapi_scale})
    imgs = []
    #images
    df_common = df_seg[(df_seg.scene==s_scene) & (~df_seg.marker.isin(ls_rare))]
    df_rare = df_seg[(df_seg.scene==s_scene) & (df_seg.marker.isin(ls_rare))]
    for s_path in df_common.path:
        print(s_path)
        img = io.imread(s_path)
        img_scale = skimage.exposure.rescale_intensity(img,in_range=(np.quantile(img,0.03),1.5*np.quantile(img,0.9999)))
        imgs.append(img_scale)
    for s_path in df_rare.path:
        img = io.imread(s_path)
        img_scale = skimage.exposure.rescale_intensity(img,in_range=(np.quantile(img,0.03),1.5*np.quantile(img,0.99999)))
        imgs.append(img_scale)
    mip = np.stack(imgs).max(axis=0)
    zdh = np.dstack((np.zeros(mip.shape),mip,dapi)).astype('uint16')
    d_cyto.update({f"{s_sample}-{s_scene}":zdh})
    print(f'Number of images = {len(d_dapi)} dapi projections ({len(d_cyto)} cytoplasm projections) ')

    return(d_dapi,d_cyto)
+
def cellpose_segment_job(s_sample='SampleName',s_slide_scene="SceneName",s_find="FindDAPIString",segdir='PathtoSegmentation',imgdir='PathtoImages',nuc_diam='30',cell_diam='30',s_type='cell_or_nuclei',s_seg_markers="['Ecad']",s_rare="[]",s_match='both',s_data='cmIF',s_job='cpu'):
    """
    fill in the template python and shell scripts found in {s_src_path}/src and
    save them in segdir as cellpose_{s_type}_{s_slide_scene}.py / .sh
    (the shell script is made executable)
    Input:
    the placeholder strings in the templates are the default values of the arguments
    s_match: 'match' or 'seg' replaces the #MATCHONLY or #SEGONLY tags by triple quotes
    s_data: 'cmIF' or 'codex' selects the python template
    s_job: 'long' or 'gpu' selects the shell template, anything else uses the default one
    """
    #find template, open ,edit
    os.chdir(f'{s_src_path}/src')
    if s_data == 'cmIF':
        with open('cellpose_template.py') as f:
            s_file = f.read()
    elif s_data == 'codex':
        with open('cellpose_template_codex.py') as f:
            s_file = f.read()
    # placeholders are replaced in this order
    lt_replace = [
        ('SampleName',s_sample),
        ('SceneName',s_slide_scene),
        ('FindDAPIString',s_find),
        ('nuc_diam=int',f'nuc_diam={str(nuc_diam)}'),
        ('cell_diam=int',f'cell_diam={str(cell_diam)}'),
        ('cell_or_nuclei',s_type),
        ("['Ecad']",s_seg_markers),
        ("ls_rare = []",f"ls_rare = {s_rare}"),
        ('PathtoSegmentation',segdir),
        ('PathtoImages',imgdir),
    ]
    if s_match == 'match':
        lt_replace.append(('#MATCHONLY',"'''"))
    elif s_match == 'seg':
        lt_replace.append(('#SEGONLY',"'''"))
    for s_old, s_new in lt_replace:
        s_file = s_file.replace(s_old,s_new)

    if s_job == 'long':
        s_shell_template = 'cellpose_template_long.sh'
    elif s_job == 'gpu':
        s_shell_template = 'cellpose_template_gpu.sh'
    else:
        s_shell_template = 'cellpose_template.sh'
    with open(s_shell_template) as f:
        s_shell = f.read()
    if s_job == 'gpu':
        s_file = s_file.replace('#gpu#','')
        s_file = s_file.replace('#SEGONLY',"'''")
    s_script = f'cellpose_{s_type}_{s_slide_scene}'
    s_shell = s_shell.replace("PythonScripName",f'{s_script}.py')

    #save edited .py file
    if s_sample.find("-Scene") > -1:
        s_sample = s_sample.split("-Scene")[0]
        print(s_sample)
    os.chdir(f'{segdir}')
    with open(f'{s_script}.py', 'w') as f:
        f.write(s_file)

    with open(f'{s_script}.sh', 'w') as f:
        f.write(s_shell)
    # add the owner execute bit
    st = os.stat(f'{s_script}.sh')
    os.chmod(f'{s_script}.sh', st.st_mode | stat.S_IEXEC)
+
def segment_spawner(s_sample,segdir,regdir,nuc_diam=30,cell_diam=30,s_type='nuclei',s_seg_markers="['Ecad']",s_job='short',s_match='both'):
    '''
    spawns cellpose segmentation jobs by modifying a python and bash script, saving them and calling with os.system
    one job is submitted with sbatch per scene found in the folders of regdir whose name contains s_sample
    s_job='gpu' or 'long' (default = 'short')
    s_match= 'seg' or 'match' (default = 'both')
    '''
    s_segpath = f'{segdir}/{s_sample}Cellpose_Segmentation'
    preprocess.cmif_mkdir([s_segpath])
    os.chdir(f'{regdir}')
    for s_file in os.listdir():
        if s_file.find(s_sample) == -1:
            continue
        os.chdir(f'{regdir}/{s_file}')
        print(f'Processing {s_file}')
        df_img = parse_org()
        for s_scene in sorted(set(df_img.scene)):
            s_slide_scene= f'{s_sample}-Scene-{s_scene}'
            # the round 1 dapi image of the scene
            b_dapi = (df_img.rounds=='R1') & (df_img.color=='c1') & (df_img.scene==s_scene)
            s_find = df_img[b_dapi].index[0]
            # images are either in a folder per scene or in one folder per sample
            ls_imgdir = [s_dir for s_dir in (f'{regdir}/{s_slide_scene}', f'{regdir}/{s_sample}') if os.path.exists(s_dir)]
            if len(ls_imgdir) > 0:
                cellpose_segment_job(s_file,s_slide_scene,s_find,s_segpath,ls_imgdir[0],nuc_diam,cell_diam,s_type,s_seg_markers,s_job=s_job, s_match=s_match)
            os.chdir(s_segpath)
            os.system(f'sbatch cellpose_{s_type}_{s_slide_scene}.sh')
            time.sleep(4)
            print('Next')
+
def save_seg(processed_list,segdir,s_type='nuclei'):
    '''
    save the segmentation basins
    Input:
    processed_list: list of {name: mask} dictionaries
    segdir: folder to save in
    s_type: 'nuclei' or 'cell'; anything else saves nothing
    '''
    for d_mask in processed_list:
        for newkey in d_mask:
            mask = d_mask[newkey]
            print(f"saving {newkey.split(' - ')[0]} {s_type} Basins")
            if s_type=='nuclei':
                s_out = f"{segdir}/{newkey} - Nuclei Segmentation Basins.tif" #Scene 002 - Nuclei Segmentation Basins.tif
            elif s_type=='cell':
                s_out = f"{segdir}/{newkey} - Cell Segmentation Basins.tif" #Scene 002 - Cell Segmentation Basins.tif
            else:
                continue
            io.imsave(s_out, mask)
+
def save_img(d_img, segdir,s_type='nuclei',ls_seg_markers=[]):
    '''
    save the images that were used as segmentation input
    Input:
    d_img: {name: image}
    segdir: folder to save in
    s_type: 'nuclei' saves each image as '<name> - DAPI.png';
      'cell' saves an 8 bit version as '<name before " - "> - <markers joined by .>_CytoProj.png'
    ls_seg_markers: marker names used in the cell file name (not modified)
    '''
    #save dapi or save the cyto projection
    if s_type=='nuclei':
        for key,dapi in d_img.items():
            print('saving DAPI')
            print(key)
            io.imsave(f"{segdir}/{key} - DAPI.png",dapi)
    elif s_type=='cell':
        for key,zdh in d_img.items():
            print('saving Cyto Projection')
            # 16 bit values divided by 255 reach 257, which overflows uint8
            # (the brightest pixels wrapped around to dark); clip before casting
            a_8bit = np.clip(zdh/255, 0, 255).astype('uint8')
            io.imsave(f"{segdir}/{key.split(' - ')[0]} - {'.'.join(ls_seg_markers)}_CytoProj.png",a_8bit)

    else:
        print('choose nuceli or cell')
+
# numba functions
# (key type, value type) of the typed dictionary held by ContainerHolder
kv_ty = (types.int64, types.int64)

# numba jitclass holding a typed dict (int64 -> int64) in attribute d and a
# typed list of float64 in attribute l.
# In this module d is filled with old label -> new label pairs and read by
# relabel_numba; l is created but not used in the visible code.
@jitclass([('d', types.DictType(*kv_ty)),
           ('l', types.ListType(types.float64))])
class ContainerHolder(object):
    def __init__(self):
        # initialize the containers
        self.d = numba.typed.Dict.empty(*kv_ty)
        self.l = numba.typed.List.empty_list(types.float64)
+
# numba overload of np.array: when the argument is typed as an array,
# np.array(x) inside jitted code is implemented as np.copy(x).
# For any other argument type no implementation is returned (None).
@overload(np.array)
def np_array_ol(x):
    if isinstance(x, types.Array):
        def impl(x):
            return np.copy(x)
        return impl
+
# jitted helper that only calls np.array on its argument; the result is
# discarded and nothing is returned.
# NOTE(review): presumably a smoke test for the np.array overload - confirm.
@numba.njit
def test(a):
    b = np.array(a)
+
+# numba function
+ '''
+ use numba to quickly iterate over each label and replace pixels with new pixel values
+ Input:
+ container = numba container class, with key-value pairs of old-new cell IDs
+ labels: numpy array with labels to rename
+ #cell_labels = np.where(np.array(cell_labels,dtype=np.int64)==key, value, np.array(labels,dtype=np.int64))
+ '''
+
@jit(nopython=True)
def relabel_numba(container,cell_labels):
    '''
    faster; replace pixels according to dictionary (i.e. numba container)
    key is original cell label, value is replaced label
    Input:
    container: object whose attribute d holds the old label - new label pairs
    cell_labels: array with labels to rename
    Output:
    relabeled array; the input array is copied first
    '''
    # np.array on an array is implemented as np.copy by the overload in this module
    cell_labels = np.array(cell_labels)
    # replacements are applied one after the other in dictionary order, so a pixel
    # whose new value is also a later key is replaced again
    for key, value in container.d.items():
        cell_labels = np.where(cell_labels==key, value, cell_labels)
    print('done matching')
    return(cell_labels)
+
def relabel_numpy(d_replace,cell_labels):
    '''
    slow; replace pixels according to dictionary
    key is original cell label, value is replaced label
    replacements are applied one after the other in dictionary order
    '''
    a_relabel = cell_labels
    for i_old in d_replace:
        i_new = d_replace[i_old]
        b_old = a_relabel==i_old
        a_relabel = np.where(b_old, i_new, a_relabel)
    print('done matching')
    return(a_relabel)
+
def relabel_gpu(d_replace,cell_labels):
    '''
    gpu version not implemented yet; currently the same numpy replacement as relabel_numpy
    key is original cell label, value is replaced label
    '''
    a_relabel = cell_labels
    for i_old, i_new in list(d_replace.items()):
        b_old = a_relabel==i_old
        a_relabel = np.where(b_old, i_new, a_relabel)
    print('done mathcing')
    return(a_relabel)
+
def nuc_to_cell_new(labels,cell_labels):
    '''
    associate the largest nucleus contained in each cell segmentation,
    using regionprops instead of a full image comparison per cell
    (the original author noted: still not giving same result as original function)
    Input:
    labels: nuclear labels
    cell_labels: cell labels that need to be matched
    Output:
    container: numba container of key-value pairs of old-new cell IDs
    d_replace: python dictionary with the same pairs
    df_prop: dataframe of the region properties (intensity_image, image, label) per cell
    '''
    start = time.time()
    #dominant nuclei
    props = measure.regionprops_table(cell_labels,labels, properties=(['intensity_image','image','label']))
    df_prop = pd.DataFrame(props)
    d_replace = {}
    for idx in df_prop.index[::-1]:
        label_id = df_prop.loc[idx,'label']
        intensity_image = df_prop.loc[idx,'intensity_image']
        image = df_prop.loc[idx,'image']
        # nuclear labels of the pixels inside the cell. The parentheses matter:
        # "image & intensity_image!=0" is evaluated as "(image & intensity_image) != 0",
        # a bitwise and that drops every even nucleus id.
        nuc_labels = intensity_image[image & (intensity_image != 0)]
        if len(nuc_labels) == 0:
            d_replace.update({label_id:0})
        else:
            #for multiple nuclei, choose largest (most common pixels, i.e. mode)
            # np.unique sorts the ids, so ties go to the smallest id like scipy.stats.mode;
            # indexing the scipy result with [0][0] fails on scipy >= 1.11
            a_id, a_count = np.unique(nuc_labels, return_counts=True)
            d_replace.update({label_id:a_id[np.argmax(a_count)]})

    #convert to numba container
    container = ContainerHolder()
    for key, value in d_replace.items():
        container.d[key] = value
    end = time.time()
    print(end - start)
    return(container,d_replace, df_prop)
+
def nuc_to_cell(labels,cell_labels):
    '''
    associate the largest nucleus contained in each cell segmentation
    Input:
    labels: nuclear labels
    cell_labels: cell labels that need to be matched
    Output:
    container: numba container of key-value pairs of old-new cell IDs
    d_replace: python dictionary with the same pairs, sorted by new ID, largest first;
      cells without a nucleus get new ID 0
    '''
    start = time.time()
    #dominant nuclei
    d_replace = {}
    for idx in np.unique(cell_labels)[::-1]:
        if idx == 0:
            continue
        #iterate over each cell label, find all non-zero values contained within that mask
        cell_array = labels[cell_labels == idx]
        cell_array = cell_array[cell_array != 0]
        if len(cell_array) == 0:
            d_replace.update({idx:0})
        else:
            #for multiple nuclei, choose largest (most common pixels, i.e. mode)
            # np.unique sorts the ids, so ties go to the smallest id like scipy.stats.mode;
            # indexing the scipy result with [0][0] fails on scipy >= 1.11
            a_id, a_count = np.unique(cell_array, return_counts=True)
            d_replace.update({idx:a_id[np.argmax(a_count)]})
    #fix matching bug
    # the pairs are applied one after the other when relabeling, so their order matters
    d_replace = {item[0]:item[1] for item in sorted(d_replace.items(), key=lambda x: x[1], reverse=True)}
    #convert to numba container
    container = ContainerHolder()
    for key, value in d_replace.items():
        container.d[key] = value
    end = time.time()
    print(end - start)
    return(container,d_replace)
+
+########## OLD ##############
+
def zero_background(cells_relabel):
    '''
    in a labelled cell image, set the background to zero
    the background label is taken to be the most frequent value in the first
    column of the 2D image (smallest value on ties)
    Input:
    cells_relabel: 2D numpy array of labels (not modified)
    Output:
    copy of cells_relabel with every pixel equal to the background label set to 0
    '''
    # same value the old "stats.mode(cells_relabel,axis=0)[0][0][0]" produced, computed
    # with numpy because that indexing fails on scipy >= 1.11
    a_value, a_count = np.unique(cells_relabel[:, 0], return_counts=True)
    mode = a_value[np.argmax(a_count)]
    black = cells_relabel.copy()
    black[black==mode] = 0
    return(black)
+
def nuc_to_cell_watershed(labels,cell_labels,i_small=200):
    '''
    associate the largest nucleus contained in each cell segmentation, after
    shrinking the cells so they do not touch, removing small objects and
    expanding the nuclei by watershed within the cleaned cell foreground
    Input:
    labels: nuclear labels
    cell_labels: cell labels that need to be matched
    i_small: size passed to remove_small_objects for cells and nuclei
    Output:
    container: numba container of key-value pairs of old-new cell IDs
    '''
    #cells
    cell_boundaries = segmentation.find_boundaries(cell_labels,mode='outer')
    shrunk_cells = cell_labels.copy()
    shrunk_cells[cell_boundaries] = 0
    foreground = shrunk_cells != 0
    foreground_cleaned = morphology.remove_small_objects(foreground, i_small)
    background = ~foreground_cleaned
    shrunk_cells[background] = 0
    #problem when we filter
    #new_cell_labels = measure.label(foreground_cleaned, background=0)

    #nuclei
    cut_labels = labels.copy()
    cut_labels[background] = 0
    labels_in = morphology.remove_small_objects(cut_labels, i_small)
    distance = ndi.distance_transform_edt(foreground_cleaned)
    labels_out = segmentation.watershed(-distance, labels_in, mask=foreground_cleaned)

    #dominant nuclei
    d_replace = {}
    # iterate over the cell labels themselves; the previous loop ran over the row
    # numbers of a regionprops table (0 .. n-1), which are not the label values
    for idx in np.unique(shrunk_cells)[::-1]:
        if idx == 0:
            continue
        #find all values of watershed expansion contained within that cell mask
        cell_array = labels_out[shrunk_cells == idx]
        # most common value, smallest on ties (indexing scipy.stats.mode with
        # [0][0] fails on scipy >= 1.11)
        a_id, a_count = np.unique(cell_array, return_counts=True)
        d_replace.update({idx:a_id[np.argmax(a_count)]})
    #convert to numba container
    container = ContainerHolder()
    for key, value in d_replace.items():
        container.d[key] = value

    return(container)
+
def save_seg_z(processed_list,segdir,s_type='nuclei'):
    '''
    save the segmentation basins (z projection naming)
    Input:
    processed_list: list of {name: mask} dictionaries
    segdir: folder to save in
    s_type: 'nuclei' or 'cell'; anything else saves nothing
    '''
    for d_mask in processed_list:
        for newkey in d_mask:
            mask = d_mask[newkey]
            print(f"saving {newkey.split(' - Z')[0]} {s_type} Basins")
            if s_type=='nuclei':
                s_out = f"{segdir}/{newkey} - Nuclei Segmentation Basins.tif" #Scene 002 - Nuclei Segmentation Basins.tif
            elif s_type=='cell':
                s_out = f"{segdir}/{newkey} - Cell Segmentation Basins.tif" #Scene 002 - Cell Segmentation Basins.tif
            else:
                continue
            io.imsave(s_out, mask)
+
def cellpose_segment_parallel(d_img,s_type='nuclei'):
    '''
    Dont use/ segment nuclei or cell
    Input:
    d_img: {name: image}
    s_type: 'nuclei' or 'cell'
    Output:
    processed_list: one result of cellpose_nuc / cellpose_cell per image

    NOTE(review): as written this cannot work when the module is imported:
    - the segmentation only runs if __name__ == "__main__"; otherwise
      processed_list is never assigned and the return statement fails
    - Parallel and delayed are not among the imports visible in this module
    - nuc_diam and cell_diam are neither arguments nor defined in this function
    '''
    if s_type=='nuclei':
        print('segmenting nuclei')
        if __name__ == "__main__":
            # one job per image
            processed_list = Parallel(n_jobs=len(d_img))(delayed(cellpose_nuc)(key,img,diameter=nuc_diam) for key,img in d_img.items())

    elif s_type=='cell':
        print('segmenting cells')
        if __name__ == "__main__":
            # one job per image
            processed_list = Parallel(n_jobs=len(d_img))(delayed(cellpose_cell)(key,img,diameter=cell_diam) for key,img in d_img.items())

    else:
        print('choose nuceli or cell')
    return(processed_list)
+
def save_img_z(d_img, segdir,s_type='nuclei',ls_seg_markers=[]):
    '''
    save the images that were used as segmentation input (z projection naming)
    Input:
    d_img: {name: image}
    segdir: folder to save in
    s_type: 'nuclei' saves each image under its key as file name;
      'cell' saves an 8 bit version as '<name before " - Z"> - <markers joined by .>_CytoProj.png'
    ls_seg_markers: marker names used in the cell file name (not modified)
    '''
    #save dapi or save the cyto projection
    if s_type=='nuclei':
        for key,dapi in d_img.items():
            print('saving DAPI')
            io.imsave(f"{segdir}/{key}",dapi)
    elif s_type=='cell':
        for key,zdh in d_img.items():
            print('saving Cyto Projection')
            # 16 bit values divided by 255 reach 257, which overflows uint8
            # (the brightest pixels wrapped around to dark); clip before casting
            a_8bit = np.clip(zdh/255, 0, 255).astype('uint8')
            io.imsave(f"{segdir}/{key.split(' - Z')[0]} - {'.'.join(ls_seg_markers)}_CytoProj.png",a_8bit)

    else:
        print('choose nuceli or cell')
+
def cellpose_segment_job_z(s_sample='SampleName',s_scene="SceneName",nuc_diam='20',cell_diam='25',s_type='cell_or_nuclei',s_seg_markers="['Ecad']",s_rare="[]",codedir='PathtoCode'):
    """
    fill in the z projection template python and shell scripts found in {s_src_path}/src
    and save them in {codedir}/Segmentation/{s_sample}Cellpose_Segmentation
    Input:
    the placeholder strings in the templates are the default values of the arguments
    the script names use s_scene with spaces replaced by '-' and cut at the first '_'
    """
    #find template, open ,edit
    os.chdir(f'{s_src_path}/src')
    with open('cellpose_template_z.py') as f:
        s_file = f.read()
    # placeholders are replaced in this order
    lt_replace = [
        ('SampleName',s_sample),
        ('SceneName',s_scene),
        ('nuc_diam=int',f'nuc_diam={str(nuc_diam)}'),
        ('cell_diam=int',f'cell_diam={str(cell_diam)}'),
        ('cell_or_nuclei',s_type),
        ("['Ecad']",s_seg_markers),
        ("ls_rare = []",f"ls_rare = {s_rare}"),
        ('PathtoCode',codedir),
    ]
    for s_old, s_new in lt_replace:
        s_file = s_file.replace(s_old,s_new)

    with open('cellpose_template_z.sh') as f:
        s_shell = f.read()
    s_script = f'cellpose_{s_type}_{s_scene.replace(" ","-").split("_")[0]}'
    s_shell = s_shell.replace("PythonScripName",f'{s_script}.py')

    #save edited .py file
    os.chdir(f'{codedir}/Segmentation/{s_sample}Cellpose_Segmentation')
    with open(f'{s_script}.py', 'w') as f:
        f.write(s_file)

    with open(f'{s_script}.sh', 'w') as f:
        f.write(s_shell)
+
def load_scene_z(subdir,dapidir,s_sample,s_scene,ls_seg_markers,ls_rare):
    '''
    load dapi projection and cell segmentation images
    Input:
    subdir: folder holding one sub folder of images per slide
    dapidir: folder with the '<s_scene> - ZProjectionDAPI.png' images
    s_sample: only sub folders of subdir whose name contains this string are parsed
    s_scene: scene name as used in the dapi file name; its second space separated
      word (cut at the first '_') is compared to the scene column
    ls_seg_markers: markers used for the cytoplasm projection
    ls_rare: markers scaled with a higher upper quantile
    Output:
    d_dapi: {dapi file name: scaled dapi projection}
    d_cyto: {dapi file name: uint16 stack of (zeros, max projection of scaled markers, unscaled dapi)}
    '''
    #image dataframe
    os.chdir(subdir)
    ls_df_markers = []
    for s_dir in os.listdir():
        if s_dir.find(s_sample)>-1:
            os.chdir(s_dir)
            df_img = parse_org()
            # copy so that adding the path column does not write into a slice of df_img
            df_markers = df_img[df_img.marker.isin(ls_seg_markers)].copy()
            df_markers['path'] = [f'{subdir}/{s_dir}/{item}' for item in df_markers.index]
            os.chdir('..')
            ls_df_markers.append(df_markers)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    df_seg = pd.concat(ls_df_markers) if len(ls_df_markers) > 0 else pd.DataFrame()

    #load z_projection DAPIs
    os.chdir(dapidir)
    d_dapi = {}
    d_cyto = {}
    # computed once: the scene name used to be overwritten inside the loop,
    # which changed the file name searched for in later iterations
    s_dapi_name = f'{s_scene} - ZProjectionDAPI.png'
    for s_file in sorted(os.listdir()):
        #print(s_file)
        if s_file.find(s_dapi_name)>-1:
            dapi = io.imread(s_file)
            dapi_scale = skimage.exposure.rescale_intensity(dapi,in_range=(np.quantile(dapi,0.03),1.5*np.quantile(dapi,0.9999)))
            d_dapi.update({s_file:dapi_scale})
            s_scene_id = s_scene.split(' ')[1].split('_')[0]
            print(s_scene_id)
            imgs = []
            #images
            # masks are built from df_seg itself; they used df_markers (only the last
            # folder parsed), which does not line up with df_seg for several folders
            df_common = df_seg[(df_seg.scene==s_scene_id) & (~df_seg.marker.isin(ls_rare))]
            df_rare = df_seg[(df_seg.scene==s_scene_id) & (df_seg.marker.isin(ls_rare))]
            for s_path in df_common.path:
                img = io.imread(s_path)
                img_scale = skimage.exposure.rescale_intensity(img,in_range=(np.quantile(img,0.03),1.5*np.quantile(img,0.9999)))
                imgs.append(img_scale)
            for s_path in df_rare.path:
                img = io.imread(s_path)
                img_scale = skimage.exposure.rescale_intensity(img,in_range=(np.quantile(img,0.03),1.5*np.quantile(img,0.999999)))
                imgs.append(img_scale)
            mip = np.stack(imgs).max(axis=0)
            zdh = np.dstack((np.zeros(mip.shape),mip,dapi)).astype('uint16')
            d_cyto.update({s_file:zdh})
    print(f'Number of images = {len(d_dapi)} dapi projections ({len(d_cyto)} cytoplasm projections) ')

    return(d_dapi,d_cyto)
+
+#test code
+'''
+import napari
+#os.chdir('./Desktop/BR1506')
+labels = io.imread('Scene 059 nuclei20 - Nuclei Segmentation Basins.tif')
+cell_labels = io.imread('Scene 059 cell25 - Cell Segmentation Basins.tif')
+cyto_img = io.imread('Scene 059 - CytoProj.png')
+dapi_img = io.imread('Scene 059 - ZProjectionDAPI.png')
+viewer = napari.Viewer()
+viewer.add_labels(labels,blending='additive')
+viewer.add_labels(cell_labels,blending='additive')
+viewer.add_image(cyto_img,blending='additive')
+viewer.add_image(dapi_img,blending='additive',colormap='blue')
+#cell_boundaries = segmentation.find_boundaries(cell_labels,mode='outer')
+#viewer.add_labels(cell_boundaries,blending='additive')
+#nuclear_boundaries = segmentation.find_boundaries(labels,mode='outer')
+#viewer.add_labels(nuclear_boundaries,blending='additive',num_colors=2)
+closing = skimage.morphology.closing(cell_labels)
+viewer.add_labels(closing,blending='additive')
+container = nuc_to_cell(labels,closing)#cell_labels)
+
+#matched cell labels
+cells_relabel = relabel_numba(container[0],closing)
+#remove background
+mode = stats.mode(cells_relabel,axis=0)[0][0][0]
+black = cells_relabel.copy()
+black[black==mode] = 0
+viewer.add_labels(black,blending='additive')
+cell_boundaries = segmentation.find_boundaries(cells_relabel,mode='outer')
+viewer.add_labels(cell_boundaries,blending='additive')
+#ring
+overlap = black==labels
+viewer.add_labels(overlap, blending='additive')
+#cytoplasm
+ring_rep = black.copy()
+ring_rep[overlap] = 0
+viewer.add_labels(ring_rep, blending='additive')
+#membrane
+rim_labels = contract_membrane(black)
+viewer.add_labels(rim_labels, blending='additive')
+
+#expanded nucleus
+__,__,peri_nuc = expand_nuc(labels,distance=3)
+viewer.add_labels(peri_nuc, blending='additive')
+'''
\ No newline at end of file
diff --git a/mplex_image/visualize.py b/mplex_image/visualize.py
new file mode 100755
index 0000000..3cbdf35
--- /dev/null
+++ b/mplex_image/visualize.py
@@ -0,0 +1,387 @@
+####
+# title: visualize.py
+#
+# language: Python3.6
+# date: 2019-05-00
+# license: GPL>=v3
+# author: Jenny
+#
+# description:
+# python3 library to visualize cyclic data and analysis
+####
+
+#load libraries
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import pandas as pd
+import numpy as np
+import os
+import skimage
+from skimage import io, segmentation
+import tifffile
+import copy
+import napari
+import seaborn as sns
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import scale
+import random
+import copy
+from scipy.ndimage import distance_transform_edt
+
+#napari
+def load_crops(viewer,s_crop,s_tissue):
+ '''
+ Add the channels of matching OME-TIFF crops, plus the cell segmentation, to a viewer.
+
+ Scans os.listdir() (the current working directory) for file names that contain
+ s_tissue and s_crop. Each plane of a matching 'ome.tif' file is added as a hidden,
+ additively blended image layer named after the channel name parsed from the OME-XML.
+ A matching 'SegmentationBasins' file is added as a 'cell_seg' labels layer together
+ with a second labels layer holding its outer boundaries.
+
+ viewer = object providing add_image() and add_labels() (presumably a napari.Viewer)
+ s_crop, s_tissue = substrings that a file name must contain
+ returns label_image = the array read from the segmentation file, or an empty array
+ when the else branch below was the last one to assign it
+ '''
+ # one colormap name per channel index; indexed with the plane number below
+ ls_color = ['blue','green','yellow','red','cyan','magenta','gray','green','yellow','red','cyan','magenta','gray',
+ 'green','yellow','red','cyan','magenta','gray','gray','gray','gray','gray','gray','gray','gray']
+ print(s_crop)
+ #viewer = napari.Viewer()
+ for s_file in os.listdir():
+ if s_file.find(s_tissue)>-1:
+ if s_file.find(s_crop) > -1:
+ if s_file.find('ome.tif') > -1:
+ with tifffile.TiffFile(s_file) as tif:
+ array = tif.asarray()
+ omexml_string = tif.ome_metadata
+ # iterate over the first axis of the stack (one plane per channel)
+ for idx in range(array.shape[0]):
+ img = array[idx]
+ # locate this channel's Name="..." attribute in the raw OME-XML text
+ i_begin = omexml_string.find(f'Channel ID="Channel:0:{idx}" Name="')
+ i_end = omexml_string[i_begin:].find('" SamplesPerPixel')
+ # 31 is the length of the search prefix when idx has a single digit
+ # NOTE(review): for idx >= 10 the prefix is 32 characters long, so the slice
+ # would start one character early (on the opening quote) - confirm and fix
+ s_marker = omexml_string[i_begin + 31:i_begin + i_end]
+ # NOTE(review): presumably catches the case where the channel tag was not found
+ # and the slice landed on the XML declaration's encoding - TODO confirm
+ if s_marker.find('utf-8') == 0:
+ s_marker = 'DAPI1'
+ print(s_marker)
+ # contrast limits: minimum pixel value up to 1.5 x (99.99th percentile + 1)
+ viewer.add_image(img,name=s_marker,rgb=False,visible=False,blending='additive',colormap=ls_color[idx],contrast_limits = (np.quantile(img,0),(np.quantile(img,0.9999)+1)*1.5))
+ elif s_file.find('SegmentationBasins') > -1:
+ label_image = io.imread(s_file)
+ viewer.add_labels(label_image, name='cell_seg',blending='additive',visible=False)
+ cell_boundaries = segmentation.find_boundaries(label_image,mode='outer')
+ viewer.add_labels(cell_boundaries,blending='additive',visible=False)
+ # NOTE(review): which `if` this else pairs with decides when label_image is reset
+ # to an empty array - confirm against the intended nesting
+ else:
+ label_image = np.array([])
+ print('')
+ # NOTE(review): label_image is only assigned in the two branches above; if neither
+ # ever runs, this return raises UnboundLocalError
+ return(label_image)
+
+def load_marker(viewer,s_crop,s_tissue,ls_marker=[]):
+ '''
+ Add only the requested markers of matching OME-TIFF crops to a viewer.
+
+ Scans os.listdir() (the current working directory) for file names that contain
+ s_tissue and s_crop. For a matching 'ome.tif' file every plane is stored in a dict
+ keyed by the channel name parsed from the OME-XML, and each marker of ls_marker found
+ in that dict is added as a visible, additively blended image layer. A matching
+ 'SegmentationBasins' file is read but not added to the viewer.
+
+ viewer = object providing add_image() (presumably a napari.Viewer)
+ s_crop, s_tissue = substrings that a file name must contain
+ ls_marker = marker names to display; its position in the list selects the colormap
+ returns (d_result, label_image) = {marker name: plane} of the last OME-TIFF read,
+ and the segmentation array read by io.imread
+ '''
+ # one colormap name per requested marker; indexed with the marker's position in ls_marker
+ ls_color = ['blue','green','yellow','red','cyan','magenta','gray','green','yellow','red','cyan','magenta',
+ 'gray','gray','gray','gray','gray','gray','gray','gray']
+ print(s_crop)
+ # working copy of the markers still to be shown; the default list itself is never mutated
+ ls_marker_all = copy.copy(ls_marker)
+ for s_file in os.listdir():
+ if s_file.find(s_tissue)>-1:
+ if s_file.find(s_crop) > -1:
+ if s_file.find('ome.tif') > -1:
+ with tifffile.TiffFile(s_file) as tif:
+ array = tif.asarray()
+ omexml_string = tif.ome_metadata
+ d_result = {}
+ # iterate over the first axis of the stack (one plane per channel)
+ for idx in range(array.shape[0]):
+ img = array[idx]
+ # locate this channel's Name="..." attribute in the raw OME-XML text
+ i_begin = omexml_string.find(f'Channel ID="Channel:0:{idx}" Name="')
+ i_end = omexml_string[i_begin:].find('" SamplesPerPixel')
+ # 31 is the length of the search prefix when idx has a single digit
+ # NOTE(review): for idx >= 10 the prefix is 32 characters long, so the slice
+ # would start one character early (on the opening quote) - confirm and fix
+ s_marker_idx = omexml_string[i_begin + 31:i_begin + i_end]
+ # NOTE(review): presumably catches the case where the channel tag was not found
+ # and the slice landed on the XML declaration's encoding - TODO confirm
+ if s_marker_idx.find('utf-8') == 0:
+ s_marker_idx = 'DAPI1'
+ d_result.update({s_marker_idx:img})
+ for idxs, s_marker in enumerate(ls_marker):
+ # true when s_marker is a channel of this file and has not been added yet
+ if len(set(d_result.keys()).intersection(set([s_marker])).intersection(set(ls_marker_all))) > 0:
+ img = d_result[s_marker]
+ # contrast limits: minimum pixel value up to 1.5 x (99.99th percentile + 1)
+ viewer.add_image(img,name=s_marker,rgb=False,visible=True,blending='additive',colormap=ls_color[idxs],contrast_limits = (np.quantile(img,0),(np.quantile(img,0.9999)+1)*1.5))
+ # drop it from the pending list so that it is not added a second time
+ ls_marker_all.remove(s_marker)
+ elif s_file.find('SegmentationBasins') > -1:
+ label_image = io.imread(s_file)
+ # NOTE(review): which `if` this else pairs with is not obvious - confirm against the
+ # intended nesting; ome_array is assigned here but never read in this function
+ else:
+ ome_array = np.array([])
+ print('')
+ # NOTE(review): d_result and label_image are only assigned inside the branches above;
+ # if no OME-TIFF or no segmentation file matched, this return raises UnboundLocalError
+ return(d_result,label_image)
+
+def pos_label(viewer,df_pos,label_image,s_cell):
+ '''
+ df_pos = boolean dataframe, s_cell = marker name
+ '''
+ #s_cell = df_pos.columns[df_pos.columns.str.contains(f'{s_cell}_')][0]
+ #get rid of extra cells (filtered by DAPI, etc)
+ li_index = [int(item.split('_')[-1].split('cell')[1]) for item in df_pos.index]
+ label_image_cell = copy.deepcopy(label_image)
+ label_image_cell[~np.isin(label_image_cell, li_index)] = 0
+ li_index_cell = [int(item.split('_')[-1].split('cell')[1]) for item in df_pos[df_pos.loc[:,s_cell]==True].index]
+ label_image_cell[~np.isin(label_image_cell,li_index_cell )] = 0
+ viewer.add_labels(label_image_cell, name=f'{s_cell.split("_")[0]}_seg',blending='additive',visible=False)
+ return(label_image_cell)
+
+def expand_labels(label_image, distance=1):
+ """Expand labels in label image by ``distance`` pixels without overlapping.
+ Given a label image, ``expand_labels`` grows label regions (connected components)
+ outwards by up to ``distance`` pixels without overflowing into neighboring regions.
+ More specifically, each background pixel that is within Euclidean distance
+ of <= ``distance`` pixels of a connected component is assigned the label of that
+ connected component.
+ Where multiple connected components are within ``distance`` pixels of a background
+ pixel, the label value of the closest connected component will be assigned (see
+ Notes for the case of multiple labels at equal distance).
+
+ Parameters
+ ----------
+ label_image : ndarray of dtype int
+ label image
+ distance : float
+ Euclidean distance in pixels by which to grow the labels. Default is one.
+ Returns
+ -------
+ enlarged_labels : ndarray of dtype int
+ Labeled array, where all connected regions have been enlarged
+ """
+ distances, nearest_label_coords = distance_transform_edt(
+ label_image == 0, return_indices=True
+ )
+ labels_out = np.zeros_like(label_image)
+ dilate_mask = distances <= distance
+ # build the coordinates to find nearest labels,
+ # in contrast to [1] this implementation supports label arrays
+ # of any dimension
+ masked_nearest_label_coords = [
+ dimension_indices[dilate_mask]
+ for dimension_indices in nearest_label_coords
+ ]
+ nearest_labels = label_image[tuple(masked_nearest_label_coords)]
+ labels_out[dilate_mask] = nearest_labels
+ return labels_out
+
+def pos_boundary(viewer,df_pos,label_image,s_cell,seed=0.82,s_type='thick'):
+ '''
+ df_pos = boolean dataframe, s_cell = marker name
+ '''
+ #s_cell = df_pos.columns[df_pos.columns.str.contains(f'{s_cell}_')][0]
+ #get rid of extra cells (filtered by DAPI, etc)
+ li_index = [int(item.split('_')[-1].split('cell')[1]) for item in df_pos.index]
+ label_image_cell = copy.deepcopy(label_image)
+ label_image_cell[~np.isin(label_image_cell, li_index)] = 0
+ li_index_cell = [int(item.split('_')[-1].split('cell')[1]) for item in df_pos[df_pos.loc[:,s_cell]==True].index]
+ label_image_cell[~np.isin(label_image_cell,li_index_cell )] = 0
+ cell_boundaries = segmentation.find_boundaries(label_image_cell,mode='thick')
+ if s_type == 'thick':
+ cell_boundaries_big = segmentation.find_boundaries(expand_labels(label_image_cell, distance=2),mode='thick')
+ viewer.add_labels(cell_boundaries + cell_boundaries_big, name=f'{s_cell}_seg',blending='additive',visible=False,seed=seed)
+ else:
+ viewer.add_labels(cell_boundaries, name=f'{s_cell}_seg',blending='additive',visible=False,seed=seed)
+ cell_boundaries_big = []
+ return(cell_boundaries, cell_boundaries_big)
+
+#jupyter notebook
+#load manual thresholds
+def new_thresh_csv(df_mi,d_combos):
+ #make thresh csv's
+ df_man = pd.DataFrame(index= ['global']+ sorted(set(df_mi.slide_scene)))
+ for s_type, es_marker in d_combos.items():
+ for s_marker in sorted(es_marker):
+ df_man[s_marker] = ''
+ return(df_man)
+
+def load_thresh_csv(s_sample):
+ #load
+ df_man = pd.read_csv(f'thresh_JE_{s_sample}.csv',header=0,index_col = 0)
+ #reformat the thresholds data and covert to 16 bit
+ ls_index = df_man.index.tolist()
+ ls_index.remove('global')
+ df_thresh = pd.DataFrame(index = ls_index)
+ ls_marker = df_man.columns.tolist()
+ for s_marker in ls_marker:
+ df_thresh[f'{s_marker}_global'] = df_man[df_man.index=='global'].loc['global',f'{s_marker}']*256
+ df_thresh[f'{s_marker}_local'] = df_man[df_man.index!='global'].loc[:,f'{s_marker}']*256
+
+ df_thresh.replace(to_replace=0, value = 12, inplace=True)
+ return(df_thresh)
+
+def threshold_postive(df_thresh,df_mi):
+ '''
+ #make positive dataframe to check threhsolds #start with local, and if its not there, inesrt the global threshold
+ #note, this will break if there are two biomarker locations #
+ '''
+ ls_scene = sorted(df_thresh.index.tolist())
+ ls_sub = df_mi.columns[df_mi.dtypes=='float64'].tolist()
+ ls_other = []
+ df_pos= pd.DataFrame()
+ d_thresh_record= {}
+ for s_scene in ls_scene:
+ ls_index = df_mi[df_mi.slide_scene==s_scene].index
+ df_scene = pd.DataFrame(index=ls_index)
+ for s_marker_loc in ls_sub:
+ s_marker = s_marker_loc.split('_')[0]
+ # only threshold markers in .csv
+ if len(set([item.split('_')[0] for item in df_thresh.columns]).intersection({s_marker})) != 0:
+ #first check if local threshold exists
+ if df_thresh[df_thresh.index==s_scene].isna().loc[s_scene,f'{s_marker}_local']==False:
+ #local
+ i_thresh = df_thresh.loc[s_scene,f'{s_marker}_local']
+ df_scene.loc[ls_index,s_marker_loc] = df_mi.loc[ls_index,s_marker_loc] >= i_thresh
+ #otherwise use global
+ elif df_thresh[df_thresh.index==s_scene].isna().loc[s_scene,f'{s_marker}_global']==False:
+ i_thresh = df_thresh.loc[s_scene,f'{s_marker}_global']
+ df_scene.loc[ls_index,s_marker_loc] = df_mi.loc[ls_index,s_marker_loc] >= i_thresh
+ else:
+ ls_other = ls_other + [s_marker]
+ i_thresh = np.NaN
+ d_thresh_record.update({f'{s_scene}_{s_marker}':i_thresh})
+ else:
+ ls_other = ls_other + [s_marker]
+ df_pos = df_pos.append(df_scene)
+ print(f'Did not threshold {set(ls_other)}')
+ return(d_thresh_record,df_pos)
+
+def plot_positive(s_type,d_combos,df_pos,d_thresh_record,df_xy,b_save=True):
+ ls_color = sorted(d_combos[s_type])
+ ls_bool = [len(set([item.split('_')[0]]).intersection(set(ls_color)))==1 for item in df_pos.columns]
+ ls_color = df_pos.columns[ls_bool].tolist()
+ ls_scene = sorted(set(df_xy.slide_scene))
+ ls_fig = []
+ for s_scene in ls_scene:
+ #negative cells = all cells even before dapi filtering
+ df_neg = df_xy[(df_xy.slide_scene==s_scene)]
+ #plot
+ fig, ax = plt.subplots(2, ((len(ls_color))+1)//2, figsize=(18,12)) #figsize=(18,12)
+ ax = ax.ravel()
+ for ax_num, s_color in enumerate(ls_color):
+ s_marker = s_color.split('_')[0]
+ s_min = d_thresh_record[f"{s_scene}_{s_marker}"]
+ #positive cells = positive cells based on threshold
+ ls_pos_index = (df_pos[df_pos.loc[:,s_color]]).index
+ df_color_pos = df_neg[df_neg.index.isin(ls_pos_index)]
+ if len(df_color_pos)>=1:
+ #plot negative cells
+ ax[ax_num].scatter(data=df_neg,x='DAPI_X',y='DAPI_Y',color='silver',s=1)
+ #plot positive cells
+ ax[ax_num].scatter(data=df_color_pos, x='DAPI_X',y='DAPI_Y',color='DarkBlue',s=.5)
+
+ ax[ax_num].axis('equal')
+ ax[ax_num].set_ylim(ax[ax_num].get_ylim()[::-1])
+ ax[ax_num].set_title(f'{s_marker} min={int(s_min)} ({len(df_color_pos)} cells)')
+ else:
+ ax[ax_num].set_title(f'{s_marker} min={(s_min)} ({(0)} cells')
+ fig.suptitle(s_scene)
+ ls_fig.append(fig)
+ if b_save:
+ fig.savefig(f'./SpatialPlots/{s_scene}_{s_type}_manual.png')
+ return(ls_fig)
+
+#gating analysis
+def prop_positive(df_data,s_cell,s_grouper):
+ #df_data['countme'] = True
+ df_cell = df_data.loc[:,[s_cell,s_grouper,'countme']].dropna()
+ df_prop = (df_cell.groupby([s_cell,s_grouper]).countme.count()/df_cell.groupby([s_grouper]).countme.count()).unstack().T
+ return(df_prop)
+
+def prop_clustermap(df_prop,df_annot,i_thresh,lut,figsize=(10,5)):
+ for s_index in df_prop.index:
+ s_subtype = df_annot.loc[s_index,'ID'] #
+ df_prop.loc[s_index, 'ID'] = s_subtype
+ species = df_prop.pop("ID")
+ row_colors = species.map(lut)
+
+ #clustermap plot wihtout the low values -drop less than i_threh % of total
+ df_plot = df_prop.fillna(0)
+ if i_thresh > 0:
+ df_plot_less = df_plot.loc[:,df_plot.sum()/len(df_plot) > i_thresh]
+ i_len = len(df_prop)
+ i_width = len(df_plot_less.columns)
+ g = sns.clustermap(df_plot_less,figsize=figsize,cmap='viridis',row_colors=row_colors)
+ return(g,df_plot_less)
+
+def prop_barplot(df_plot_less,s_cell,colormap="Spectral",figsize=(10,5),b_sort=True):
+ i_len = len(df_plot_less)
+ i_width = len(df_plot_less.columns)
+ fig,ax = plt.subplots(figsize=figsize)
+ if b_sort:
+ df_plot_less = df_plot_less.sort_index(ascending=False)
+ df_plot_less.plot(kind='barh',stacked=True,width=.9, ax=ax,colormap=colormap)
+ ax.set_title(s_cell)
+ ax.set_xlabel('Fraction Positive')
+ ax.legend(bbox_to_anchor=(1.01, 1))
+ plt.tight_layout()
+ return(fig)
+
+def plot_color_leg(lut,figsize = (2.3,3)):
+ #colors
+ series = pd.Series(lut)
+ df_color = pd.DataFrame(index=range(len(series)),columns=['subtype','color'])
+
+ series.sort_values()
+ df_color['subtype'] = series.index
+ df_color['value'] = 1
+ df_color['color'] = series.values
+
+ fig,ax = plt.subplots(figsize = figsize,dpi=100)
+ df_color.plot(kind='barh',x='subtype',y='value',width=1,legend=False,color=df_color.color,ax=ax)
+ ax.set_xticks([])
+ ax.set_ylabel('')
+ ax.set_title(f'subtype')
+ plt.tight_layout()
+ return(fig)
+
+#cluster analysis
+
+def cluster_kmeans(df_mi,ls_columns,k,b_sil=False):
+ '''
+ log2 transform, zscore and kmens cluster
+ '''
+ df_cluster_norm = df_mi.loc[:,ls_columns]
+ df_cluster_norm_one = df_cluster_norm + 1
+ df_cluster = np.log2(df_cluster_norm_one)
+
+ #select figure size
+ i_len = k
+ i_width = len(df_cluster.columns)
+
+ #scale date
+ df_scale = scale(df_cluster)
+
+ #kmeans cluster
+ kmeans = KMeans(n_clusters=k, random_state=0).fit(df_scale)
+ df_cluster.columns = [item.split('_')[0] for item in df_cluster.columns]
+ df_cluster[f'K{k}'] = list(kmeans.labels_)
+ g = sns.clustermap(df_cluster.groupby(f'K{k}').mean(),cmap="RdYlGn_r",z_score=1,figsize=(3+i_width/3,3+i_len/3))
+ if b_sil:
+ score = silhouette_score(X = df_scale, labels=list(kmeans.labels_))
+ else:
+ score = np.nan
+ return(g,df_cluster,score)
+
+def plot_clusters(df_cluster,df_xy,s_num='many'):
+ '''
+ Scatter plot, per scene and cluster, of the cells that belong to the cluster.
+
+ df_cluster = dataframe with a slide_scene column; its first int64 column is taken
+ as the cluster label
+ df_xy = dataframe with slide_scene, DAPI_X and DAPI_Y columns for all cells
+ s_num = 'many' lays the panels out on a 3-row grid; any other value uses 2 rows x 1 column
+ returns d_fig = {scene: figure}
+ '''
+ # the cluster column is the first column with dtype int64
+ s_type = df_cluster.columns[df_cluster.dtypes=='int64'][0]
+ print(s_type)
+ ls_scene = sorted(set(df_cluster.slide_scene))
+ ls_color = sorted(set(df_cluster.loc[:,s_type].dropna()))
+ d_fig = {}
+ for s_scene in ls_scene:
+ #negative cells = all cells even before dapi filtering
+ df_neg = df_xy[(df_xy.slide_scene==s_scene)]
+ #plot
+ if s_num == 'many':
+ fig, ax = plt.subplots(3, ((len(ls_color))+2)//3, figsize=(18,12),dpi=200)
+ else:
+ # NOTE(review): only two axes are created here, yet ax is indexed with one
+ # position per cluster below - confirm this mode is only used with <= 2 clusters
+ fig, ax = plt.subplots(2, 1, figsize=(7,4),dpi=200)
+ ax = ax.ravel()
+ for ax_num, s_color in enumerate(ls_color):
+ # s_marker is assigned but not read again in this function
+ s_marker = s_color
+ #positive cells = cells assigned to this cluster
+ ls_pos_index = (df_cluster[df_cluster.loc[:,s_type]==s_color]).index
+ df_color_pos = df_neg[df_neg.index.isin(ls_pos_index)]
+ if len(df_color_pos)>=1:
+ #plot negative cells
+ ax[ax_num].scatter(data=df_neg,x='DAPI_X',y='DAPI_Y',color='silver',s=1)
+ #plot positive cells
+ ax[ax_num].scatter(data=df_color_pos, x='DAPI_X',y='DAPI_Y',color='DarkBlue',s=.5)
+
+ ax[ax_num].axis('equal')
+ # flip the y axis
+ ax[ax_num].set_ylim(ax[ax_num].get_ylim()[::-1])
+ if s_num == 'many':
+ ax[ax_num].set_xticklabels('')
+ ax[ax_num].set_yticklabels('')
+ else:
+ ax[0].set_xticklabels('')
+ # NOTE(review): presumably this title applies to both layouts (i.e. it sits after
+ # the inner if/else rather than inside its else) - confirm
+ ax[ax_num].set_title(f'{s_color} ({len(df_color_pos)} cells)')
+ else:
+ ax[ax_num].set_xticklabels('')
+ ax[ax_num].set_yticklabels('')
+ ax[ax_num].set_title(f'{s_color} ({(0)} cells')
+
+ fig.suptitle(s_scene)
+ d_fig.update({s_scene:fig})
+ return(d_fig)