diff --git a/Collagen_Bx2-4.ipynb b/Collagen_Bx2-4.ipynb new file mode 100755 index 0000000..a30d41c --- /dev/null +++ b/Collagen_Bx2-4.ipynb @@ -0,0 +1,506 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#load libraries\n", + "\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import os\n", + "import copy\n", + "import seaborn as sns\n", + "import importlib\n", + "import scipy\n", + "\n", + "import scanpy as sc\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.preprocessing import scale, minmax_scale\n", + "from sklearn.metrics import silhouette_score\n", + "import matplotlib as mpl\n", + "mpl.rc('figure', max_open_warning = 0)\n", + "#mpl.font_manager._rebuild()\n", + "mpl.rcParams['mathtext.fontset'] = 'custom'\n", + "mpl.rcParams['mathtext.it'] = 'Arial:italic'\n", + "mpl.rcParams['mathtext.rm'] = 'Arial'\n", + "mpl.rcParams['font.sans-serif'] = \"Arial\"\n", + "mpl.rcParams['font.family'] = \"sans-serif\"\n", + "mpl.rc('font', serif='Arial') \n", + "codedir = os.getcwd()\n", + "#load cmif libraries\n", + "#os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF')\n", + "from mplex_image import visualize as viz, process, preprocess, normalize" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.chdir(codedir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(222)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Table of contents \n", + "1. [Load Data](#load)\n", + "2. [Normalize](#norm)\n", + "6. [Visualize Normalization](#normviz)\n", + "[leiden for cell typing](#clusterlei)\n", + "7. [Cluster K means](#cluster)\n", + "8. 
[Leiden cluster](#clust1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#load data\n", + "os.chdir(f'{codedir}/paper_data')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s_date = '20210402'\n", + "if not os.path.exists(s_date):\n", + " os.mkdir(s_date)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load Data \n", + "\n", + "2.\tAs Ki67 is not continuous antigen, can you count positive cells (Proliferative cluster) by distance (<25, 25-50, 50-75, >75) from collagen I in each Bx?\n", + "\n", + "3.\tCould you map cells by distance (<25, 25-50, 50-75, >75) from collagen I in each Bx? If you can add a distance column (1-4) in the cluster csv, I can make it in Qi.\n", + "\n", + "4.\tCould you try to see the correlation between ER/PCNA and (VIM+aSMA+CD31)? – not necessary to show significance. (see attached image from Bx1 Scene-003)\n", + "\n", + "[contents](#contents)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### not normalized" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_mi = pd.read_csv('20210324_SMTBx1-4_JE-TMA-43_60_62_FilteredMeanIntensity.csv',index_col=0) \n", + "df_mi['slide'] = [item.split('_')[0] for item in df_mi.index]\n", + "df_mi['slide_scene'] = [item.split('_cell')[0] for item in df_mi.index]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for s_file in os.listdir():\n", + " if s_file.find('MaskDistances') > -1:\n", + " print(s_file)\n", + "df_mask = pd.DataFrame()\n", + "for s_sample in ['SMT101Bx1-16','SMTBx2-5','SMTBx3','SMTBx4-3','HTA-33']: #'SMT101Bx4-3',\n", + " df_mask = pd.concat([df_mask, pd.read_csv(f'features_{s_sample}_MaskDistances.csv',index_col=0)]) #DataFrame.append was removed in pandas 2.0" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_mask.columns\n", + "ls_target = ['Vim_dist','CD31_dist', 'PDPN_dist', 'aSMA_dist', 'CD68_dist','ColI_dist', 'ColIV_dist']\n", + "ls_marker = ['ER_nuclei','Ki67_nuclei','PCNA_nuclei']\n", + "ls_drop = ['HTA-33_scene001','SMTBx1-16_scene001'#,'SMT101Bx4-3_scene001','SMT101Bx4-3_scene002'\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = df_mi.merge(df_mask.loc[:,ls_target],left_index=True,right_index=True)\n", + "df = df[(~df.Vim_dist.isna()) & (~df.slide_scene.isin(ls_drop))]\n", + "df.loc[:,ls_target] = df.loc[:,ls_target]*.325" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "#fit\n", + "data = df.loc[:,ls_marker].T\n", + "batch = df.slide\n", + "bayesdata = normalize.combat(data, batch)\n", + "df_norm = bayesdata.T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_norm['slide'] = df.slide\n", + "df_norm.groupby('slide').mean()\n", + "df_norm.groupby('slide').std()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df['Vim-CD31-aSMA_dist'] = df.loc[:,['Vim_dist','CD31_dist','aSMA_dist']].min(axis=1)\n", + "ls_target = ls_target + ['Vim-CD31-aSMA_dist']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "mpl.rcParams['pdf.fonttype'] = 42\n", + "mpl.rcParams['ps.fonttype'] = 42\n", + "%matplotlib inline\n", + "#by tissue no Bx1\n", + "sns.set(style='white')\n", + "import matplotlib.ticker as tic\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "tot = 0\n", + "ls_dist = [25, 50, 75]\n", + "i_diff = 25\n", + "ls_slide = ['SMTBx2-5', 'SMTBx3','SMTBx4-3'] #'\n", + "d_slide = 
{'SMTBx1-16':'Bx1', 'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3','HTA-33':'Bx4-HTAN','SMTBx4-3':'Bx4'}\n", + "for s_target in ['ColI_dist', 'ColIV_dist','Vim-CD31-aSMA_dist']:\n", + " print(s_target)\n", + " fig, ax = plt.subplots(3,2, figsize=(4.5,4),sharex=True,dpi=300)\n", + " for idxc, s_slide in enumerate(ls_slide):\n", + " print(s_slide)\n", + " df_slide = df[df.slide==s_slide]\n", + " for idx, s_marker in enumerate(['ER_nuclei', 'PCNA_nuclei']): #,'Ki67_nuclei']):\n", + " print(s_marker)\n", + " df_result = pd.DataFrame(index=df_slide.index)\n", + " for s_dist in ls_dist:\n", + " b_bool = (df_slide.loc[:,s_target] < s_dist) & (df_slide.loc[:,s_target] >= s_dist - i_diff)\n", + " df_result.loc[b_bool,f'{s_marker}_{s_dist}'] = df_slide.loc[b_bool,s_marker]\n", + " for s_col in df_result.columns:\n", + " sns.kdeplot(df_result.loc[:,s_col].dropna(), ax=ax[idxc,idx],\n", + " label=f\"< {s_col.split('_')[2]}\"#,fill=True, alpha=0.3\n", + " )\n", + " if df_result.mean().fillna(0)[2] == 0:\n", + " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna())\n", + " print(len(df_result.iloc[:,0].dropna()))\n", + " print(len(df_result.iloc[:,1].dropna()))\n", + " else:\n", + " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna(),df_result.iloc[:,2].dropna())\n", + " print(len(df_result.iloc[:,0].dropna()))\n", + " print(len(df_result.iloc[:,1].dropna()))\n", + " print('over75')\n", + " print(len(df_result.iloc[:,2].dropna()))\n", + " ax[idxc,idx].set_xlabel(f\"{s_col.split('_')[0]} Intensity\",fontname=\"Arial\",fontsize=18)\n", + " ax[idxc,idx].set_ylabel(f\"\")\n", + " ax[idxc,idx].set_title(f\"\")\n", + " temp = tic.MaxNLocator(3)\n", + " ax[idxc,idx].set_yticklabels(())\n", + " ax[idxc,idx].xaxis.set_major_locator(temp)\n", + " tot+=1\n", + " if pvalue < 0.001: # 0.05/30: #bonferoni correction\n", + " ax[idxc,idx].text(0.42, 0.87, '*',\n", + " horizontalalignment='center',\n", + " 
verticalalignment='center',\n", + " transform=ax[idxc,idx].transAxes)\n", + " ax[idxc,idx].set_xlim(-1000,5500)\n", + " ax[idxc,idx].spines['right'].set_visible(False)\n", + " ax[idxc,idx].spines['left'].set_visible(False)\n", + " ax[idxc,idx].spines['top'].set_visible(False)\n", + " #print(ax[idxc,idx].get_xticklabels())\n", + " #ax[idxc,idx].set_xticklabels(ax[idxc,idx].get_xticklabels(),{'fontsize':16})\n", + " ax[idxc,0].set_ylabel(f\"{d_slide[s_slide]}\",fontname=\"Arial\",fontsize=18)\n", + " ax[2,1].legend(title=r'$\\mu$m',borderpad=.3,labelspacing=.3,loc=4,fontsize=14)\n", + " plt.subplots_adjust(wspace=.001,hspace=.001)\n", + " plt.suptitle(f\"Distance to {s_target.split('_')[0]}\",y=.93,fontname=\"Arial\",fontsize=24)\n", + " plt.tight_layout()\n", + " fig.savefig(f'./{s_date}/IntensityvsDistance_{i_diff}s_{s_target}_by_slide_noBx1.png',dpi=300)\n", + " #fig.savefig(f'./{s_date}/IntensityvsDistance_{i_diff}s_{s_target}_by_slide_noBx1.pdf',dpi=200)\n", + " #break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + " 0.05/30" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from matplotlib import gridspec\n", + "ax_objs = []\n", + "ls_slide = ['SMTBx2-5', 'SMTBx3','SMTBx4-3'] #'\n", + "d_slide = {'SMTBx1-16':'Bx1', 'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3','HTA-33':'Bx4-HTAN','SMTBx4-3':'Bx4'}\n", + "for s_target in ['ColI_dist', 'ColIV_dist','Vim-CD31-aSMA_dist']:\n", + " fig = plt.figure(figsize=(5.5,3.5),dpi=300)\n", + " gs = gridspec.GridSpec(nrows=3, ncols=2,figure=fig, \n", + " wspace=0.1, hspace=0.05,left=0.1, right=.75\n", + " )\n", + " for idxc, s_slide in enumerate(ls_slide):\n", + " df_slide = df[df.slide==s_slide]\n", + " for idx, s_marker in enumerate(['ER_nuclei', 
'PCNA_nuclei']):\n", + " ax_objs.append(fig.add_subplot(gs[idxc,idx]))\n", + " df_result = pd.DataFrame(index=df_slide.index)\n", + " for s_dist in ls_dist:\n", + " b_bool = (df_slide.loc[:,s_target] < s_dist) & (df_slide.loc[:,s_target] >= s_dist - i_diff)\n", + " df_result.loc[b_bool,f'{s_marker}_{s_dist}'] = df_slide.loc[b_bool,s_marker]\n", + " for s_col in df_result.columns:\n", + " g =sns.kdeplot(df_result.loc[:,s_col].dropna(), ax=ax_objs[-1],\n", + " label=f\"< {s_col.split('_')[2]}\"#,fill=True,alpha=0.5\n", + " )\n", + " if df_result.mean().fillna(0)[2] == 0:\n", + " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna())\n", + " #print(pvalue)\n", + " else:\n", + " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna(),df_result.iloc[:,2].dropna())\n", + " ax_objs[-1].set_ylabel(f\"\")\n", + " ax_objs[-1].set_title(f\"\")\n", + " temp = tic.MaxNLocator(3)\n", + " ax_objs[-1].set_yticklabels(())\n", + " ax_objs[-1].xaxis.set_major_locator(temp)\n", + " tot+=1\n", + " if pvalue < 0.001: # 0.05/30: #bonferoni correction\n", + " ax_objs[-1].text(0.55, 0.65, '*',\n", + " horizontalalignment='center',\n", + " verticalalignment='center',\n", + " transform=ax_objs[-1].transAxes)\n", + " ax_objs[-1].set_xlim(-1000,5500)\n", + " ax_objs[-1].spines['right'].set_visible(False)\n", + " ax_objs[-1].spines['left'].set_visible(False)\n", + " ax_objs[-1].spines['top'].set_visible(False)\n", + " #ax_objs[-1].spines['bottom'].set_visible(False)\n", + " ax_objs[-1].set_xlabel('')\n", + " rect = ax_objs[-1].patch\n", + " rect.set_alpha(0)\n", + " if idx == 0:\n", + " ax_objs[-1].set_ylabel(f\"{d_slide[s_slide]}\",fontsize=18)\n", + " if idx==1:\n", + " if idxc == 2:\n", + " ax_objs[-1].legend(title='$\\mu$m',borderpad=.3,labelspacing=.3,fontsize=12,loc='upper left', bbox_to_anchor=(1.05, 1.5))\n", + " if idxc ==2:\n", + " ax_objs[-1].set_xlabel(f\"{s_col.split('_')[0]} 
Intensity\",fontsize=18)\n", + " else:\n", + " ax_objs[-1].set_xticklabels([]) \n", + " plt.suptitle(f\"Distance to {s_target.split('_')[0]}\",x=.45,y=.95,fontsize=20)\n", + " gs.update(bottom = 0.2)\n", + " fig.savefig(f'./{s_date}/IntensityvsDistance_{i_diff}s_{s_target}_by_slide_noBx1_bigger.png',dpi=200)\n", + " #break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#by tissue w bx1; dict.fromkeys below de-duplicates targets (ls_target may already hold the combined distance)\n", + "%matplotlib inline\n", + "sns.set(style='white')\n", + "import matplotlib.ticker as tic\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "tot = 0\n", + "ls_dist = [25, 50, 75]\n", + "i_diff = 25\n", + "ls_slide = ['SMTBx1-16','SMTBx2-5', 'SMTBx3','SMTBx4-3'] #'\n", + "d_slide = {'SMTBx1-16':'Bx1', 'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3','HTA-33':'Bx4-HTAN','SMTBx4-3':'Bx4'}\n", + "for s_target in dict.fromkeys(ls_target + ['Vim-CD31-aSMA_dist']): #['CD68_dist','ColI_dist', 'ColIV_dist']:\n", + " fig, ax = plt.subplots(4,3, figsize=(7,5),sharex=True,dpi=300)\n", + " for idxc, s_slide in enumerate(ls_slide):\n", + " df_slide = df[df.slide==s_slide]\n", + " for idx, s_marker in enumerate(ls_marker):\n", + " df_result = pd.DataFrame(index=df_slide.index)\n", + " for s_dist in ls_dist:\n", + " b_bool = (df_slide.loc[:,s_target] < s_dist) & (df_slide.loc[:,s_target] >= s_dist - i_diff)\n", + " df_result.loc[b_bool,f'{s_marker}_{s_dist}'] = df_slide.loc[b_bool,s_marker]\n", + " for s_col in df_result.columns:\n", + " sns.kdeplot(df_result.loc[:,s_col].dropna(), ax=ax[idxc,idx], label=f\"< {s_col.split('_')[2]}\")\n", + " if df_result.mean().fillna(0)[2] == 0:\n", + " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna())\n", + " #print(pvalue)\n", + " else:\n", + " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna(),df_result.iloc[:,2].dropna())\n", + " ax[idxc,idx].set_xlabel(f\"{s_col.split('_')[0]} 
Intensity\",fontsize=18)\n", + " ax[idxc,idx].set_ylabel(f\"\")\n", + " ax[idxc,idx].set_title(f\"\")\n", + " temp = tic.MaxNLocator(3)\n", + " ax[idxc,idx].set_yticklabels(())\n", + " ax[idxc,idx].xaxis.set_major_locator(temp)\n", + " tot+=1\n", + " if pvalue < 0.001: # 0.05/30: #bonferroni correction\n", + " ax[idxc,idx].text(0.5, 0.8, '*',\n", + " horizontalalignment='center',\n", + " verticalalignment='center',\n", + " transform=ax[idxc,idx].transAxes)\n", + " ax[idxc,idx].set_xlim(-1500,7000)\n", + " ax[idxc,idx].spines['right'].set_visible(False)\n", + " ax[idxc,idx].spines['left'].set_visible(False)\n", + " ax[idxc,idx].spines['top'].set_visible(False)\n", + " ax[idxc,0].set_ylabel(f\"{d_slide[s_slide]}\",fontsize=18)\n", + " ax[0,2].legend(title=r'$\\mu$m')\n", + " plt.subplots_adjust(wspace=.001,hspace=.001)\n", + " plt.suptitle(f\"Distance to {s_target.split('_')[0]}\",fontsize=20)\n", + " plt.tight_layout()\n", + " fig.savefig(f'./{s_date}/IntensityvsDistance_25s_{s_target}_by_slide.png',dpi=300)\n", + " #break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#by tissue, no Bx1 (all three markers)\n", + "%matplotlib inline\n", + "sns.set(style='white')\n", + "import matplotlib.ticker as tic\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "tot = 0\n", + "ls_dist = [25, 50, 75]\n", + "i_diff = 25\n", + "ls_slide = ['SMTBx2-5', 'SMTBx3','SMTBx4-3'] #'SMTBx1-16',\n", + "d_slide = {'SMTBx1-16':'Bx1', 'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3','HTA-33':'Bx4-HTAN','SMTBx4-3':'Bx4'}\n", + "for s_target in ['ColI_dist', 'ColIV_dist']:\n", + " fig, ax = plt.subplots(3,3, figsize=(7,4),sharex=True)\n", + " for idxc, s_slide in enumerate(ls_slide):\n", + " df_slide = df[df.slide==s_slide]\n", + " for idx, s_marker in enumerate(ls_marker):\n", + " df_result = pd.DataFrame(index=df_slide.index)\n", + " for 
s_dist in ls_dist:\n", + " b_bool = (df_slide.loc[:,s_target] < s_dist) & (df_slide.loc[:,s_target] >= s_dist - i_diff)\n", + " df_result.loc[b_bool,f'{s_marker}_{s_dist}'] = df_slide.loc[b_bool,s_marker]\n", + " for s_col in df_result.columns:\n", + " sns.kdeplot(df_result.loc[:,s_col].dropna(), ax=ax[idxc,idx], label=f\"< {s_col.split('_')[2]}\")\n", + " if df_result.mean().fillna(0)[2] == 0:\n", + " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna())\n", + " #print(pvalue)\n", + " else:\n", + " statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna(),df_result.iloc[:,2].dropna())\n", + " ax[idxc,idx].set_xlabel(f\"{s_col.split('_')[0]} Intensity\")\n", + " ax[idxc,idx].set_ylabel(f\"\")\n", + " ax[idxc,idx].set_title(f\"\")\n", + " temp = tic.MaxNLocator(3)\n", + " ax[idxc,idx].set_yticklabels(())\n", + " ax[idxc,idx].xaxis.set_major_locator(temp)\n", + " tot+=1\n", + " if pvalue < 0.001: # 0.05/30: #bonferoni correction\n", + " ax[idxc,idx].text(0.5, 0.8, '*',\n", + " horizontalalignment='center',\n", + " verticalalignment='center',\n", + " transform=ax[idxc,idx].transAxes)\n", + " ax[idxc,idx].set_xlim(-1500,7000)\n", + " ax[idxc,idx].spines['right'].set_visible(False)\n", + " ax[idxc,idx].spines['left'].set_visible(False)\n", + " ax[idxc,idx].spines['top'].set_visible(False)\n", + " ax[idxc,0].set_ylabel(f\"{d_slide[s_slide]}\")\n", + " ax[0,2].legend(title='$\\mu$m')\n", + " plt.subplots_adjust(wspace=.001,hspace=.001)\n", + " plt.suptitle(f\"Distance to {s_target.split('_')[0]}\")\n", + " plt.tight_layout()\n", + " fig.savefig(f'./{s_date}/IntensityvsDistance_25s_{s_target}_by_slide.png',dpi=200)\n", + " #break" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python3.9.5", + "language": "python", + "name": "python3.9.5" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/GateCellTypes.ipynb b/GateCellTypes.ipynb new file mode 100755 index 0000000..bba7702 --- /dev/null +++ b/GateCellTypes.ipynb @@ -0,0 +1,573 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#load libraries\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import os\n", + "import copy\n", + "import seaborn as sns\n", + "import importlib\n", + "from matplotlib import cm\n", + "import matplotlib as mpl\n", + "mpl.rc('figure', max_open_warning = 0)\n", + "mpl.rcParams['pdf.fonttype'] = 42\n", + "mpl.rcParams['ps.fonttype'] = 42\n", + "mpl.rcParams['mathtext.fontset'] = 'custom'\n", + "mpl.rcParams['mathtext.it'] = 'Arial:italic'\n", + "mpl.rcParams['mathtext.rm'] = 'Arial'\n", + "codedir = os.getcwd()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#load cmif libraries\n", + "#os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF')\n", + "from mplex_image import visualize as viz, process, preprocess, gating" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.chdir(codedir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Notes\n", + "\n", + "use CD45 to gate immune (CD3 more artifact)\n", + "\n", + "update 20200402: add SMT-Bx2-5 and HTA-33, simplified gating." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#set location of files\n", + "#load data\n", + "rootdir = f'{codedir}/paper_data'\n", + "# go to location of files\n", + "os.chdir(rootdir)\n", + "preprocess.cmif_mkdir(['GatingPlots'])\n", + "#os.listdir()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 3 define samples to work with/ image combos\n", + "ls_sample = ['20210402_SMT']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_data = pd.DataFrame()\n", + "for s_sample in ls_sample:\n", + " df_data = pd.concat([df_data, pd.read_csv(f'{s_sample}_ManualPositive.csv',index_col=0)]) #DataFrame.append was removed in pandas 2.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d_rename = {'CD4':'CD4_Ring','CD8':'CD8_Ring',\n", + " #'HER2':'HER2_Ring','ER':'ER_Nuclei'\n", + " }\n", + "df_data = df_data.rename(d_rename, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Specify Gating Strategy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#parameters\n", + "\n", + "# cell types\n", + "ls_endothelial = ['CD31']\n", + "ls_immune = ['CD45','CD68'] \n", + "ls_tumor = ['CK7','CK19','Ecad'] \n", + "ls_prolif = ['Ki67']\n", + "\n", + "#tcell/myeloid\n", + "s_tcell = 'CD45' \n", + "s_bcell = 'CD20'\n", + "s_myeloid = 'CD68'\n", + "ls_immune_functional = ['PD1','CD44','prolif'] # not in dataset: 'FoxP3_Nuclei','GRNZB_Nuclei',\n", + "\n", + "#luminal/basal/mesenchymal\n", + "ls_luminal = ['CK19','CK7'] # not in dataset 'CK8_Ring'\n", + 
"ls_basal = ['CK5','CK14'] \n", + "ls_mes = ['CD44', 'Vim'] \n", + "ls_tumor_plus = ['Ecad'] + ['Lum']\n", + "ls_stromal_function = ['Vim','aSMA','PDPN']\n", + "ls_tumor_prolif = ['PCNA','Ki67','pHH3'] \n", + "\n", + "#index of cell line samples (i.e. 100% tumor)\n", + "ls_cellline_index = []\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#custom gating\n", + "df_data = gating.main_celltypes(df_data,ls_endothelial,ls_immune,ls_tumor,ls_cellline_index)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#add normal liver\n", + "df_data.loc[(~df_data.loc[:,ls_luminal].any(axis=1) & df_data.loc[:,'Ecad'] & df_data.loc[:,'tumor']),'celltype'] = 'epithelial'\n", + "df_data.loc[df_data.celltype == 'epithelial','tumor'] = False\n", + "df_data.loc[df_data.celltype == 'epithelial','epithelial'] = True\n", + "df_data.loc[df_data.celltype != 'epithelial','epithelial'] = False\n", + "df_data.epithelial = df_data.epithelial.astype('bool')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "importlib.reload(gating)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Perform Gating" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "#simple gating\n", + "df_data = gating.proliferation(df_data,ls_prolif)\n", + "df_data = gating.immune_types(df_data,s_myeloid,s_bcell,s_tcell)\n", + "df_data = gating.cell_prolif(df_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "#cutom gating (skip)\n", + "'''\n", + "df_data = 
gating.immune_functional(df_data,ls_immune_functional)\n", + "df_data = gating.diff_hr_state(df_data,ls_luminal,ls_basal,ls_mes)\n", + "df_data = gating.celltype_gates(df_data,ls_tumor_prolif,s_new_name='TumorProlif',s_celltype='tumor')\n", + "#df_data = gating.celltype_gates(df_data,ls_tumor_plus,s_new_name='TumorDiffPlus',s_celltype='tumor')\n", + "df_data = gating.celltype_gates(df_data,ls_stromal_function,s_new_name='StromalType',s_celltype='stromal')\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_data = gating.non_tumor(df_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Output Gating Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#check\n", + "ls_drop = ['ColI', 'ColIV', 'CD20', 'CD3', 'CD44', 'CK14',\n", + " 'CK5', 'ER', 'HER2', 'LamAC', 'PCNA', 'PD1', 'pHH3']\n", + "df_data.loc[:,df_data.dtypes=='object'].drop(ls_drop,axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#drop extra colums\n", + "df_gate = df_data.loc[:,df_data.dtypes!='bool'].drop(ls_drop,axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#handcrafted stromal populations (skip)\n", + "'''\n", + "d_rename_stroma = {'stromal_Vim_aSMA':'myofibroblast', 'stromal_aSMA':'myofibroblast', 'stromal___':'stromal', 'stromal_Vim':'fibroblast',\n", + " 'stromal_PDPN_Vim_aSMA':'myofibroblast', 'stromal_PDPN_Vim':'fibroblast', 'stromal_PDPN':'lymphatic',\n", + " 'stromal_PDPN_aSMA':'myofibroblast'}\n", + "df_gate.NonTumor = df_gate.NonTumor.replace(d_rename_stroma)\n", + "df_gate['FinalCell'] = df_gate.NonTumor.fillna(df_gate.CellProlif).fillna(df_gate.celltype)\n", + "df_gate.FinalCell = 
df_gate.FinalCell.replace({'tumor_nonprolif':'tumor','liver_nonprolif':'liver','liver_prolif':'liver'})\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_gate.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s_out = '20210402_SMT'\n", + "if not os.path.exists(f'{s_out}_GatedPositiveCellNames.csv'):\n", + " print('saving new csv')\n", + " df_gate.to_csv(f'{s_out}_GatedPositiveCellNames.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plot\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#importlib.reload(viz)\n", + "s_out = '20210402_SMT'\n", + "f'{s_out}_GatedPositiveCellNames.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_data = pd.read_csv(f'{s_out}_GatedPositiveCellNames.csv',index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#df_data['Stromal'] = df_data.StromalType.replace(d_rename_stroma)\n", + "#df_data['NonTumor'] = df_data.NonTumor.replace(d_rename_stroma)\n", + "#df_data['NonTumorFunc'] = df_data.NonTumorFunc.replace(d_rename_stroma)\n", + "#handcrafted stromal populations\n", + "#d_rename_stroma = {'stromal_Vim_aSMA':'myofibroblast', 'stromal_aSMA':'myofibroblast', 'stromal___':'stromal', 'stromal_Vim':'fibroblast',\n", + "# 'stromal_PDPN_Vim_aSMA':'myofibroblast', 'stromal_PDPN_Vim':'fibroblast', 'stromal_PDPN':'lymphatic',\n", + "# 'stromal_PDPN_aSMA':'myofibroblast'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(df_data.columns == 'FinalCell').any()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#combined 
cell type (run once)\n", + "if not (df_data.columns == 'FinalCell').any():\n", + " df_data.loc[df_data.celltype == 'tumor','FinalCell'] = df_data.loc[df_data.celltype == 'tumor','CellProlif']\n", + " df_data.loc[df_data.celltype != 'tumor','FinalCell'] = df_data.loc[df_data.celltype != 'tumor','celltype']\n", + " df_data.loc[df_data.celltype == 'immune','FinalCell'] = df_data.loc[df_data.celltype == 'immune','ImmuneType']\n", + "\n", + "#df_data.FinalCell.unique()\n", + "#df_data.to_csv(f'{s_out}_GatedPositiveCellNames.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ls_drop = df_data.loc[((df_data.index.str.contains('HTA')) & (df_data.FinalCell=='epithelial'))].index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# relabel epithelial cells as stromal,\n", + "# then drop the cells in ls_drop (epithelial cells whose index contains 'HTA')\n", + "df_data['FinalCell'] = df_data.FinalCell.replace({'epithelial':'stromal'})\n", + "df_data = df_data.drop(ls_drop)\n", + "df_data['countme'] = True\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "s_grouper='slide_scene'\n", + "\n", + "#calculate proportions\n", + "for s_cell in df_data.columns[(df_data.dtypes=='object') & ~(df_data.columns.isin([s_grouper]))].tolist():\n", + " df_prop = viz.prop_positive(df_data,s_cell=s_cell,s_grouper=s_grouper)\n", + " # make annotations\n", + " df_annot=pd.DataFrame(data={'ID': df_prop.index.tolist()},index=df_prop.index)\n", + " lut = dict(zip(sorted(df_annot.ID.unique()),cm.tab10.colors))\n", + " g, df_plot_less = viz.prop_clustermap(df_prop,df_annot,i_thresh =.01,lut=lut)\n", + " g.savefig(f'./GatingPlots/{s_cell}_clustermap.png',dpi=150)\n", + " plt.close()\n", + " fig = viz.prop_barplot(df_plot_less,s_cell,colormap=\"Spectral\")\n", + " fig.savefig(f'./GatingPlots/{s_cell}_bar.png',dpi=200)\n", + " break" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "b_drop = df_data.slide_scene.isin(['HTA-33_scene001','SMTBx1-16_scene001']) #flag scenes to exclude BEFORE the scene suffix is stripped\n", + "df_data['slide_scene'] = [item.split('_')[0] for item in df_data.slide_scene] #group by tissue\n", + "df_data_select = df_data.loc[~b_drop,:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#by tissue\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "s_grouper='slide_scene'\n", + "mpl.rcParams['pdf.fonttype'] = 42\n", + "mpl.rcParams['ps.fonttype'] = 42\n", + "\n", + "#calculate proportions\n", + "for s_cell in df_data.columns[(df_data.dtypes=='object') & ~(df_data.columns.isin([s_grouper]))].tolist():\n", + " df_prop = viz.prop_positive(df_data_select,s_cell=s_cell,s_grouper=s_grouper)\n", + " # make annotations\n", + " df_prop.to_csv(f'ManualGating_SMT_proportions_{s_cell}.csv')\n", + " df_annot=pd.DataFrame(data={'ID': df_prop.index.tolist()},index=df_prop.index)\n", + " lut = dict(zip(sorted(df_annot.ID.unique()),cm.tab10.colors))\n", + " g, df_plot_less = viz.prop_clustermap(df_prop,df_annot,i_thresh =.001,lut=lut)\n", + " g.savefig(f'./GatingPlots/{s_cell}_clustermap_tissue.pdf',dpi=150)\n", + " plt.close()\n", + " if df_plot_less.shape[1] < 8:\n", + " cmap = \"Spectral\"\n", + " elif df_plot_less.shape[1] < 11:\n", + " cmap = \"Paired\"\n", + " else:\n", + " cmap = \"tab20\"\n", + " fig = viz.prop_barplot(df_plot_less,s_cell,colormap=cmap)\n", + " fig.savefig(f'./GatingPlots/{s_cell}_bar_tissue.pdf',dpi=200)\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s_date = '20210402'\n", + "d_crop = {'SMTBx2-5_scene001': (2000,9000),\n", + " 'SMTBx3_scene004': (20000,16000),\n", + " 'HTA-33_scene002': (3271, 607),\n", + " 'SMTBx1-16_scene003': (2440,220),\n", + " }\n", + "df_result = pd.DataFrame()\n", + "for s_tissue, tu_crop in 
d_crop.items():\n", + " df_scene = df_data.loc[df_data.index.str.contains(s_tissue)]\n", + " ls_index = df_scene.loc[((df_scene.DAPI_X > tu_crop[0]) & (df_scene.DAPI_X < tu_crop[0]+2500)) & (df_scene.DAPI_Y > tu_crop[1]) & (df_scene.DAPI_Y < tu_crop[1]+2500)].index\n", + " df_result = pd.concat([df_result, df_data.loc[ls_index]]) #DataFrame.append was removed in pandas 2.0\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#by tissue\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "s_grouper='slide_scene'\n", + "mpl.rcParams['pdf.fonttype'] = 42\n", + "mpl.rcParams['ps.fonttype'] = 42\n", + "d_rename = {'HTA-33':'Bx4', 'SMTBx1-16':'Bx1', 'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3'}\n", + "\n", + "#calculate proportions\n", + "for s_cell in df_data.columns[(df_data.dtypes=='object') & ~(df_data.columns.isin([s_grouper]))].tolist():\n", + " df_prop = viz.prop_positive(df_result,s_cell=s_cell,s_grouper=s_grouper)\n", + " # make annotations\n", + " #df_prop.to_csv(f'ManualGating_SMT101_proportions_{s_cell}.csv')\n", + " df_annot=pd.DataFrame(data={'ID': df_prop.index.tolist()},index=df_prop.index)\n", + " lut = dict(zip(sorted(df_annot.ID.unique()),cm.tab10.colors))\n", + " g, df_plot_less = viz.prop_clustermap(df_prop,df_annot,i_thresh =.001,lut=lut)\n", + " g.savefig(f'./GatingPlots/{s_cell}_clustermap_tissue3.pdf',dpi=150)\n", + " plt.close()\n", + " if df_plot_less.shape[1] < 8:\n", + " cmap = \"Spectral\"\n", + " elif df_plot_less.shape[1] < 11:\n", + " cmap = \"Paired\"\n", + " else:\n", + " cmap = \"tab20\"\n", + " fig = viz.prop_barplot(df_plot_less.rename(d_rename),s_cell,colormap=cmap)\n", + " fig.set_size_inches(4.5, 2.3)\n", + " ax_list = fig.axes\n", + " ax_list[0].set_ylabel('')\n", + " ax_list[0].set_xlabel('Fraction of Cells')\n", + " ax_list[0].set_title('')\n", + " fig.suptitle('Gating Composition: Biopsies',x=0.5,y=0.9,fontsize=14)\n", + " plt.tight_layout()\n", + " 
fig.savefig(f'./GatingPlots/{s_cell}_bar_tissue3.png',dpi=200)\n", + " #fig.savefig(f'./{s_date}/{s_cell}_bar_tissue3.pdf',dpi=200)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s_date" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python3.9.5", + "language": "python", + "name": "python3.9.5" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Normalize_Bx2-4.ipynb b/Normalize_Bx2-4.ipynb new file mode 100755 index 0000000..45a5f00 --- /dev/null +++ b/Normalize_Bx2-4.ipynb @@ -0,0 +1,1198 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#load libraries\n", + "\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import numpy as np\n", + "import os\n", + "import copy\n", + "import seaborn as sns\n", + "import importlib\n", + "from scipy.signal import argrelmax, find_peaks, peak_widths\n", + "import scanpy as sc\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.preprocessing import scale, minmax_scale\n", + "from sklearn.metrics import silhouette_score\n", + "import matplotlib as mpl\n", + "mpl.rc('figure', max_open_warning = 0)\n", + "mpl.rcParams['pdf.fonttype'] = 42\n", + "mpl.rcParams['ps.fonttype'] = 42\n", + "mpl.rcParams['mathtext.it'] = 'Arial:italic'\n", + "mpl.rcParams['mathtext.rm'] = 'Arial'\n", + "codedir = os.getcwd()\n", + "#load cmif libraries\n", + "#os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF')\n", + "from mplex_image import visualize as viz, process, 
preprocess, normalize" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.chdir(codedir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(222)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Table of contents \n", + "1. [Load Data](#load)\n", + "2. [Normalize](#norm)\n", + "3. [Visualize Normalization](#normviz)\n", + "4. [leiden for cell typing](#clusterlei)\n", + "5. [Leiden cluster](#clust1)\n", + "\n", + "\n", + "note:\n", + "\n", + " Could you make composite fraction bar graph only in following regions?\n", + "\n", + " Bx2: SMTBx2-5-Scene-001_ROI1-2000-9000-2500-2500\n", + " Bx3: SMTBx3-Scene-004_ROI2-20900-15494-2500-2500\n", + " Bx4: HTA-33-Scene-002_ROI1-3271-607-2500-2500\n", + "\n", + " If we can have it in Bx1\n", + " Bx: SMTBx1-Scene-003_ROI1-2440-220-2500-2500\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#load data\n", + "os.chdir(f'{codedir}/paper_data')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s_date = '20210402'\n", + "if not os.path.exists(s_date):\n", + " os.mkdir(s_date)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load Data \n", + "\n", + "[contents](#contents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.chdir(f'{codedir}/paper_data')\n", + "df_file = pd.DataFrame(index=os.listdir())\n", + "df_file = df_file[df_file.index.str.contains('FilteredMeanIntensity_DAPI')]\n", + "df_file['tissue'] = [item.split('_')[1] for item in df_file.index]\n", + "df_file['dapi'] = ['DAPI' + item.split('y_DAPI')[1].split('.')[0] for item in df_file.index]\n", + "ls_sample = df_file.tissue.tolist()\n", + "d_dapi = 
dict(zip(df_file.tissue.tolist(),df_file.dapi.tolist()))\n", + "d_dapi.update({'JE-TMA-60': 'DAPI10_DAPI2'})\n", + "df_mi = pd.DataFrame()\n", + "df_xy = pd.DataFrame()\n", + "df_edge = pd.DataFrame()\n", + "\n", + "for s_sample in sorted(set(ls_sample)):\n", + " #if not s_sample.find('HTA')>-1:\n", + " print(f'loading {s_sample}')\n", + " df_mi = df_mi.append(pd.read_csv(f'{codedir}/paper_data/features_{s_sample}_FilteredMeanIntensity_{d_dapi[s_sample]}.csv', index_col=0))\n", + " df_xy = df_xy.append(pd.read_csv(f'{codedir}/paper_data/features_{s_sample}_CentroidXY.csv',index_col=0))\n", + " if os.path.exists(f'{codedir}/paper_data/features_{s_sample}_EdgeCells153pixels_CentroidXY.csv'):\n", + " df_edge = df_edge.append(pd.read_csv(f'{codedir}/paper_data/features_{s_sample}_EdgeCells153pixels_CentroidXY.csv',index_col=0))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#sorted(df_mi.columns[df_mi[~df_mi.index.str.contains('JE-TMA-60')].isna().sum() != 0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ls_marker = ['AR_nuclei', 'CD20_perinuc5', 'CD31_perinuc5', 'CD3_perinuc5', 'CD44_perinuc5', 'CD45_perinuc5',#'CD44_nucadj2',\n", + " 'CD4_perinuc5', 'CD68_perinuc5','CD8_perinuc5', 'CK14_cytoplasm', 'CK17_cytoplasm', 'CK19_cytoplasm', 'CK5_cytoplasm',\n", + " 'CK7_cytoplasm', 'CK8_cytoplasm', 'ColI_perinuc5', 'ColIV_perinuc5','CoxIV_perinuc5','EGFR_cytoplasm', 'ER_nuclei',\n", + " 'Ecad_cytoplasm', 'FoxP3_nuclei', 'GRNZB_nuclei', 'H3K27_nuclei','H3K4_nuclei', 'HER2_cellmem25','Ki67_nuclei',\n", + " 'LamAC_nuclei', 'PCNA_nuclei', 'PD1_perinuc5', 'PDPN_perinuc5','DAPI2_nuclei', # 'ER_nuclei25','HER2_cytoplasm','PgR_nuclei','Vim_nucadj2'\n", + " 'Vim_perinuc5', 'aSMA_perinuc5', 'pHH3_nuclei', 'pRB_nuclei', 'pS6RP_perinuc5','slide_scene',\n", + " ] # CD8R bad, 'gH2AX_nuclei' in R11 Bx3 not included\n", + "\n", + "df_mi = 
df_mi.loc[:,ls_marker]\n", + " \n", + "# old \n", + "#df_mi = df_mi.loc[:,['HER2_cellmem25', 'DAPI2_nuclei',# 'CD44_nucadj2', 'Vim_nucadj2','ER_nuclei25','HER2_cytoplasm',\n", + "# 'CD20_perinuc5', 'CD3_perinuc5', 'CD31_perinuc5', 'CD4_perinuc5','CD44_perinuc5', 'CD45_perinuc5', 'CD68_perinuc5', 'CD8_perinuc5',\n", + "# 'PD1_perinuc5', 'PDPN_perinuc5', 'Vim_perinuc5', 'aSMA_perinuc5','CK14_cytoplasm', 'CK17_cytoplasm', 'CK19_cytoplasm', 'CK5_cytoplasm',\n", + "# 'CK7_cytoplasm', 'Ecad_cytoplasm', 'ER_nuclei', 'Ki67_nuclei', 'LamAC_nuclei','PCNA_nuclei', 'pHH3_nuclei', 'slide_scene']]\n", + "\n", + "\n", + "df_mi['batch'] = [item.split('_')[0] for item in df_mi.index]\n", + "#df_mi['scene'] = [item.split('_')[1] for item in df_mi.index]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deal with JE-TMA-60" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# markers in JE-TMA-60\n", + "#'JE-TMA-60_scene06', 'JE-TMA-60_scene08', 'JE-TMA-60_scene09', 'JE-TMA-60_scene10', 'JE-TMA-60_scene11', 'JE-TMA-60_scene13'\n", + "# R5 is CK17.PDPN.CD45.FoxP3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_R5 = pd.read_csv(f'{codedir}/paper_data/features_JE-TMA-60_FilteredMeanIntensity_DAPI5_DAPI2.csv',index_col=0)\n", + "df_R4 = pd.read_csv(f'{codedir}/paper_data/features_JE-TMA-60_FilteredMeanIntensity_DAPI4_DAPI2.csv',index_col=0)\n", + "df_R10 = df_mi[df_mi.batch=='JE-TMA-60']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ls_scene = set(df_R10.slide_scene)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ls_na = set([item.split('_cell')[0] for item in df_R5.index]) - set([item.split('_cell')[0] for item in df_R10.index])" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "#select markers, scenes for normalization (based on JE-TMA-60 tissue loss)\n", + "ls_pos = ['HER2_cellmem25','CK19_cytoplasm','CK7_cytoplasm','CK8_cytoplasm','Ecad_cytoplasm','ER_nuclei','Ki67_nuclei','LamAC_nuclei',\n", + " 'PCNA_nuclei','pHH3_nuclei','Vim_perinuc5','DAPI2_nuclei','H3K27_nuclei','H3K4_nuclei', 'pRB_nuclei','pS6RP_perinuc5',\n", + " 'CoxIV_perinuc5','EGFR_cytoplasm']\n", + "ls_R5 = ['CK17_cytoplasm','PDPN_perinuc5','CD45_perinuc5','FoxP3_nuclei'] #\n", + "ls_R4 = ['pHH3_nuclei','CK14_cytoplasm','Ki67_nuclei','CK19_cytoplasm','CK5_cytoplasm','HER2_cellmem25',\n", + " 'Ecad_cytoplasm', 'ER_nuclei','CD44_perinuc5', 'PCNA_nuclei','aSMA_perinuc5','CD3_perinuc5','EGFR_cytoplasm']\n", + "ls_bad = ['CD20_perinuc5', 'CD31_perinuc5', 'CD4_perinuc5', 'CD68_perinuc5', 'CD8_perinuc5','PD1_perinuc5',\n", + " 'ColI_perinuc5', 'ColIV_perinuc5'] #'CK7_cytoplasm', #'LamAC_nuclei',\n", + "#ls_good = ['CK7_cytoplasm','Vim_perinuc5','LamAC_nuclei']\n", + "\n", + "#R4\n", + "df = df_mi[df_mi.batch!='JE-TMA-60']\n", + "df = df.append(df_R4.loc[:,ls_R4])\n", + "#R5\n", + "ls_index = df_R5.loc[df_R5.index.isin(df_R4.index)].index\n", + "df.loc[ls_index,ls_R5] = df_R5.loc[ls_index,ls_R5]\n", + "\n", + "#fill R6-8\n", + "ls_index = df_mi.loc[(df_mi.slide_scene.isin(ls_scene)) & (df_mi.index.isin(df_R4.index))].index\n", + "df.loc[ls_index,ls_pos] = df_R10.loc[ls_index,ls_pos]\n", + "\n", + "#\n", + "df['batch'] = [item.split('_')[0] for item in df.index]\n", + "#df['scene'] = [item.split('_')[1] for item in df.index]\n", + "df['slide_scene'] = [item.split('_cell')[0] for item in df.index]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## filter edge cells" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#filter out unwanted cells\n", + "d_filter = {#41 (not used)\n", + " 'JE-TMA-41_scene01':(df_xy.DAPI_Y > 
5000),'JE-TMA-41_scene03':(df_xy.DAPI_Y > 5000),\n", + " 'JE-TMA-41_scene04':(df_xy.DAPI_Y < 1500),'JE-TMA-41_scene05':(df_xy.DAPI_Y > 5000),\n", + " 'JE-TMA-41_scene06':(df_xy.DAPI_Y < 1500),'JE-TMA-41_scene08':(df_xy.DAPI_Y < 1500),\n", + " 'JE-TMA-41_scene09':(df_xy.DAPI_Y > 5000),'JE-TMA-41_scene11':(df_xy.DAPI_Y < 1500),\n", + " #43\n", + " 'JE-TMA-43_scene09':(df_xy.DAPI_Y < 1200),'JE-TMA-43_scene14':(df_xy.DAPI_Y < 1200),\n", + " #60\n", + " 'JE-TMA-60_scene02':(df_xy.DAPI_X < 1500),'JE-TMA-60_scene05':(df_xy.DAPI_X < 1500),\n", + " 'JE-TMA-60_scene11':(df_xy.DAPI_Y < 1500),'JE-TMA-60_scene14':(df_xy.DAPI_X < 1500),\n", + " 'JE-TMA-60_scene06':(df_xy.DAPI_Y < 1500),'JE-TMA-60_scene08':(df_xy.DAPI_Y > 5000),\n", + " 'JE-TMA-60_scene10':(df_xy.DAPI_Y < 1500),\n", + " #62\n", + " 'JE-TMA-62_scene01':(df_xy.DAPI_Y > 5000),\n", + " 'JE-TMA-62_scene02':(df_xy.DAPI_X > 5000),'JE-TMA-62_scene03':(df_xy.DAPI_X < 1000),\n", + " 'JE-TMA-62_scene04':(df_xy.DAPI_Y < 1500),'JE-TMA-62_scene06':(df_xy.DAPI_X < 1000),\n", + " 'JE-TMA-62_scene08':(df_xy.DAPI_Y > 5000),'JE-TMA-62_scene10':(df_xy.DAPI_Y < 1500),\n", + " #'SMTBx1-16_scene001':(df_xy.DAPI_Y > 1), #keep scene 1 for manual thresholding\n", + " 'SMTBx2-3_scene002':(df_xy.DAPI_Y > 5000),'SMTBx3_scene004':(df_xy.DAPI_X <11000),\n", + " 'SMTBx3_scene005':(df_xy.DAPI_X > 0),'SMTBx4-3_scene001':(df_xy.DAPI_Y < 2400),\n", + " 'SMTBx2-5_scene002':(df_xy.DAPI_Y > 5000),'HTA-33_scene003':(df_xy.DAPI_Y > 9000)}\n", + "d_filter2 = {'JE-TMA-60_scene02':(df_xy.DAPI_Y > 4500)}\n", + "ls_filter_all = []\n", + "for s_scene, filtercon in d_filter.items():\n", + " ls_filter = df_xy[(df_xy.slide_scene==s_scene) & filtercon].index.tolist()\n", + " ls_filter_all = ls_filter_all + ls_filter\n", + "for s_scene, filtercon in d_filter2.items():\n", + " ls_filter = df_xy[(df_xy.slide_scene==s_scene) & filtercon].index.tolist()\n", + " ls_filter_all = ls_filter_all + ls_filter\n", + "#filter edge\n", + "ls_filter_all = ls_filter_all + 
df_edge.index.tolist()\n", + "df_filter_mi = df[(~df.index.isin(ls_filter_all))]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_cluster = df_filter_mi.loc[:,['HER2_cellmem25','slide_scene']]\n", + "df_cluster['cluster'] = 1\n", + "df_cluster.drop('HER2_cellmem25',axis=1,inplace=True)\n", + "import importlib\n", + "importlib.reload(viz)\n", + "%matplotlib inline\n", + "viz.plot_clusters(df_cluster,df_xy,s_num='few')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#match controls to biopsies\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "d_replace = {'BC44290-146': 'JE-TMA-41',\n", + " 'SMTBx2-3': 'JE-TMA-41',\n", + " 'SMTBx2-5':'JE-TMA-43',\n", + " 'SMTBx3':'JE-TMA-60',\n", + " 'SMTBx4-3':'JE-TMA-62'}\n", + "df_filter_mi.loc[:,'batch'] = df_filter_mi.batch.replace(d_replace)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#standardize the scenes\n", + "d_replace = { 'JE-TMA-41_scene13':'JE-TMA-41_scene14',\n", + " 'JE-TMA-41_scene12':'JE-TMA-41_scene13',\n", + " 'JE-TMA-62_scene13':'JE-TMA-62_scene14',\n", + " 'JE-TMA-62_scene12':'JE-TMA-62_scene13'}\n", + "df_filter_mi.loc[:,'scene'] = df_filter_mi.slide_scene.replace(d_replace)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "df_filter_mi.merge(df_xy.loc[:,['DAPI_X', 'DAPI_Y', 'nuclei_area', 'nuclei_eccentricity']],left_index=True,right_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_out = df_filter_mi.merge(df_xy.loc[:,['DAPI_X', 'DAPI_Y', 'nuclei_area', 'nuclei_eccentricity']],left_index=True,right_index=True)\n", + "len(df_out)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "#2-23 contains NAs\n", + "#2-22 the NAs were filled with random gaussian data\n", + "# 0302 include scene 1 Bx1\n", + "# 0318 just Bx2 - 4, (Bx2-5)\n", + "# 20210324 has HTA9-1-33\n", + "if not os.path.exists('20210324_SMTBx1-4_JE-TMA-43_60_62_FilteredMeanIntensity.csv'):\n", + " print('saving csv')\n", + " #df_out.to_csv('20210223_SMTBx1-4_JE-TMA-41_60_62_BC44290-146.csv')\n", + " df_out.to_csv('20210324_SMTBx1-4_JE-TMA-43_60_62_FilteredMeanIntensity.csv') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#2-23 contains NAs\n", + "#2-22 the NAs were filled with random gaussian data\n", + "# 0302 include scene 1 Bx1\n", + "# 0318 just Bx2 - 4, (Bx2-5)\n", + "if not os.path.exists('20210320_SMTBx2-4_JE-TMA-43_60_62_FilteredMeanIntensity.csv'):\n", + " print('saving csv')\n", + " #df_out.to_csv('20210223_SMTBx1-4_JE-TMA-41_60_62_BC44290-146.csv')\n", + " df_out.to_csv('20210320_SMTBx2-4_JE-TMA-43_60_62_FilteredMeanIntensity.csv') " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Normalization \n", + "\n", + "use ComBat.\n", + "\n", + "[contents](#contents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_mi = pd.read_csv('20210320_SMTBx2-4_JE-TMA-43_60_62_FilteredMeanIntensity.csv',index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_mi.scene.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ls_pos = ['HER2_cellmem25','CK19_cytoplasm','CK7_cytoplasm','CK8_cytoplasm','Ecad_cytoplasm','ER_nuclei','Ki67_nuclei','LamAC_nuclei',\n", + " 'PCNA_nuclei','pHH3_nuclei','Vim_perinuc5','DAPI2_nuclei','H3K27_nuclei','H3K4_nuclei', 'pRB_nuclei','pS6RP_perinuc5',\n", + " 'CoxIV_perinuc5','EGFR_cytoplasm']\n", + "ls_R5 = 
['CK17_cytoplasm','PDPN_perinuc5','CD45_perinuc5','FoxP3_nuclei'] #\n", + "ls_R4 = ['pHH3_nuclei','CK14_cytoplasm','Ki67_nuclei','CK19_cytoplasm','CK5_cytoplasm','HER2_cellmem25',\n", + " 'Ecad_cytoplasm', 'ER_nuclei','CD44_perinuc5', 'PCNA_nuclei','aSMA_perinuc5','CD3_perinuc5','EGFR_cytoplasm']\n", + "ls_bad = ['CD20_perinuc5', 'CD31_perinuc5', 'CD4_perinuc5', 'CD68_perinuc5', 'CD8_perinuc5','PD1_perinuc5',\n", + " 'ColI_perinuc5', 'ColIV_perinuc5']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#select normalization scenes\n", + "ls_R10_scene = ['scene06', 'scene08', 'scene09', 'scene10', 'scene11', 'scene13']\n", + "ls_R10 = ['HER2_cellmem25', 'CK19_cytoplasm', 'CK7_cytoplasm', 'Ecad_cytoplasm', 'ER_nuclei', 'Ki67_nuclei', 'LamAC_nuclei',\n", + " 'PCNA_nuclei','pHH3_nuclei', 'Vim_perinuc5','CD44_perinuc5','DAPI2_nuclei', #adding following:\n", + " 'CK8_cytoplasm','CoxIV_perinuc5', 'EGFR_cytoplasm', 'H3K27_nuclei', 'H3K4_nuclei', 'pRB_nuclei', 'pS6RP_perinuc5']\n", + "#note: CK17 may have quenching artifact; PDPN not good in Bx1, so just CD45 important\n", + "#'CK17_cytoplasm','PDPN_perinuc5', #'FoxP3_nuclei' not in full set\n", + "ls_R5 = ['PDPN_perinuc5','CD45_perinuc5','FoxP3_nuclei', 'aSMA_perinuc5','CD3_perinuc5'] # aSMA because N breast, scene 01 better than 07 for immune\n", + "ls_R5_scene = ['scene01','scene03','scene04']\n", + "#old ls_R4 = ['pHH3_nuclei','CK14_cytoplasm','Ki67_nuclei','CK19_cytoplasm','CK5_cytoplasm','HER2_cellmem25',\n", + "# 'Ecad_cytoplasm', 'ER_nuclei','CD44_perinuc5', 'PCNA_nuclei','aSMA_perinuc5','CD3_perinuc5','DAPI2_nuclei']\n", + "#can scene 7 be good control for CD3 and CK14 and CK5?, yes. 
R1 doesn't add much\n", + "ls_R4 = [ 'CK14_cytoplasm', 'CK5_cytoplasm','CK17_cytoplasm'] #'CD3_perinuc5',\n", + "ls_R4_scene = ['scene02','scene07']\n", + "ls_bad = ['CD20_perinuc5', 'CD31_perinuc5', 'CD4_perinuc5', 'CD68_perinuc5', 'CD8_perinuc5','PD1_perinuc5']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "set(df_mi.batch)\n", + "#df_mi = df_mi.loc[df_mi.batch!='JE-TMA-60']\n", + "df_mi['slide_scene'] = df_mi.scene\n", + "df_mi['scene'] = [item.split('_')[1] for item in df_mi.slide_scene]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#dropped 60\n", + "df_norm_all=pd.DataFrame(index=df_mi.dropna().index)\n", + "\n", + "#not dropped 60\n", + "df_norm_all=pd.DataFrame(index=df_mi.index)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#1 fit on scenes that are good through round 10 and markers that are positive on those scenes \"pos\"\n", + "for s_type in ['R4','R5','R10']:\n", + " if s_type == 'R10':\n", + " ls_pos = ls_R10\n", + " ls_scene = ls_R10_scene\n", + "\n", + " #2 fit on scenes that are good until R4, and R1-4 markers\n", + " if s_type == 'R4':\n", + " ls_pos = ls_R4\n", + " ls_scene = ls_R4_scene # + ls_R5_scene + ls_R10_scene \n", + "\n", + " #3 fit on scene that are good until R5, and R5 markers\n", + " if s_type == 'R5':\n", + " ls_pos = ls_R5\n", + " ls_scene = ls_R5_scene\n", + "\n", + " #fit\n", + " b_control = ((df_mi.index.str.contains('JE-TMA')) & (df_mi.scene.isin(ls_scene)) & (df_mi.loc[:,ls_pos].isna().sum(axis=1)==0))\n", + " data = df_mi.loc[b_control,ls_pos].T\n", + " batch = df_mi.loc[b_control,'batch']\n", + " gamma_star, delta_star, stand_mean, var_pooled = normalize.combat_fit(data, batch)\n", + " #transform\n", + " #data = df_mi.loc[df_mi.batch!='SMTBx1-16',df_mi.dtypes=='float64'].drop(['DAPI_X','DAPI_Y'],axis=1).T\n", + " data = 
df_mi.loc[df_mi.batch!='SMTBx1-16',ls_pos].T\n", + " batch = df_mi.loc[df_mi.batch!='SMTBx1-16','batch']\n", + " bayesdata = normalize.combat_transform(data,batch,gamma_star,delta_star,stand_mean, var_pooled)\n", + " df_norm = bayesdata.T\n", + " df_norm_all = df_norm_all.merge(df_norm,left_index=True,right_index=True,how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_norm_all.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# run after #1, 2 and 3\n", + "df_norm_all = df_norm_all.merge(df_mi.loc[:,['batch','DAPI_X','DAPI_Y','scene','nuclei_area','nuclei_eccentricity']],left_index=True,right_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#old check\n", + "df_norm = df_norm.merge(df_mi.loc[:,['batch','DAPI_X','DAPI_Y','scene','nuclei_area','nuclei_eccentricity']],left_index=True,right_index=True)\n", + "#df_mi.loc[b_control,:].drop(['DAPI_X','DAPI_Y'],axis=1).groupby('batch').mean()\n", + "#df_mi[df_mi.index.str.contains('JE-TMA')].drop(['DAPI_X','DAPI_Y'],axis=1).groupby('batch').std()\n", + "#check\n", + "df_norm.loc[b_control,:].drop(['DAPI_X','DAPI_Y'],axis=1).groupby('batch').mean()\n", + "#df_norm[df_norm.index.str.contains('JE-TMA')].drop(['DAPI_X','DAPI_Y'],axis=1).groupby('batch').std()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#df_norm_all.to_csv('20210320_SMTBx2-4_JE-TMA-43_60_62_normalized.csv')\n", + "#df_norm_all.to_csv('20210325_SMTBx2-4_JE-TMA-43_60_62_normalized.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Umap Visualize Normalization \n", + "\n", + "[contents](#contents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#s_sample = 
'20210320_SMTBx2-4_JE-TMA-43_60_62'\n", + "s_sample = '20210325_SMTBx2-4_JE-TMA-43_60_62'\n", + "df_norm_all = pd.read_csv(f'{s_sample}_normalized.csv',index_col=0)\n", + "df_norm_all.rename({'nuclei_area':'area','nuclei_eccentricity':'eccentricity','DAPI_X':'DAPIX',\n", + " 'DAPI_Y':\"DAPIY\"},axis=1, inplace=True)\n", + "df_norm_all.columns = [item.split('_')[0] for item in df_norm_all.columns]\n", + "df_norm_all['slide'] = [item.split('_')[0] for item in df_norm_all.index]\n", + "df_norm_all['scene'] = [item.split('_')[1] for item in df_norm_all.index]\n", + "df_norm_all['slide_scene'] = [item.split('_cell')[0] for item in df_norm_all.index]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_norm_all = df_norm_all.loc[~df_norm_all.slide_scene.isin(['JE-TMA-43_scene01','JE-TMA-62_scene01'])]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# visualize\n", + "%matplotlib inline\n", + "s_type = 'w-60_no01'\n", + "#adata = sc.AnnData(df_norm_all.loc[:,df_norm_all.dtypes=='float64'].drop(['DAPIX','DAPIY'],axis=1)) \n", + "ls_drop = ['DAPIX','DAPIY','DAPI2','LamAC','pHH3','FoxP3','CoxIV',\n", + " 'H3K27','H3K4','pRB','pS6RP','aSMA','PDPN'] #aSMA, PDPN not well norm\n", + "adata = sc.AnnData(df_norm_all.dropna().loc[:,df_norm_all.dtypes=='float64'].drop(ls_drop,axis=1))\n", + "adata.obs['batch'] = df_norm_all.dropna().loc[:,'batch']\n", + "adata.obs['scene'] = df_norm_all.dropna().loc[:,'scene'].replace({'scene001':'Bx', 'scene002':'Bx','scene003':'Bx', 'scene004':'Bx', 'scene005':'Bx'})\n", + "adata.obs['tissue'] = df_norm_all.dropna().loc[:,'slide']\n", + "# reduce dimensionality (PCA)\n", + "adata.raw = adata\n", + "#reduce dimensionality\n", + "sc.tl.pca(adata, svd_solver='auto')\n", + "#sc.pl.pca(adata)\n", + "sc.pl.pca_variance_ratio(adata, log=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "# calculate neighbors\n", + "n_neighbors = 31\n", + "n_pcs=len(adata.var.index) - 1\n", + "results_file = f'{s_sample}_{n_neighbors}neighbors_{n_pcs}pcs_{len(adata.var.index)}markers.h5ad'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d_celline = {'scene02':'HCC1143',\n", + " 'scene03':'HCC3153',\n", + " 'scene04':'N.Breast',\n", + " 'scene05':'T47D',\n", + " 'scene06':'T47D',\n", + " 'scene07':'Tonsil',\n", + " 'scene08':'BT474',\n", + " 'scene09':'BT474',\n", + " 'scene10':'AU565',\n", + " 'scene11':'AU565',\n", + " 'scene12':'MDAMB436',\n", + " 'scene13':'MDAMB436',\n", + " 'scene14':'MDAMB436'}\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "\n", + "# calculate neighbors\n", + "if os.path.exists(results_file):\n", + " adata = sc.read_h5ad(results_file)\n", + " print('loading umap')\n", + "else:\n", + " # calculate neighbors \n", + " print('calculating umap')\n", + " sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)\n", + " sc.tl.umap(adata)\n", + " #save results\n", + " if not os.path.exists(results_file):\n", + " adata.write(results_file)\n", + "\n", + "# umap plus scenes\n", + "fig,ax = plt.subplots(figsize=(3,2.5),dpi=600)\n", + "figname = f'UmapScene_{s_type}_{n_pcs+1}markers.png'\n", + "sc.pl.umap(adata, color='scene',save=figname,title=f'TMA Core',ax=ax)\n", + "\n", + "\n", + "# umap plus tissue\n", + "fig,ax = plt.subplots(figsize=(3,2.5),dpi=600)\n", + "figname = f'UmapTissue_{s_type}_{n_pcs+1}markers.png'\n", + "adata.obs['Tissue'] = adata.obs['tissue'].replace({'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3','SMTBx4-3':'Bx4'})\n", + "sc.pl.umap(adata, color='Tissue',save=figname,title=f'Tissue',ax=ax)\n", 
+ "\n", + "\n", + "# umap plus cell line\n", + "adata.obs['Subtype'] = adata.obs.scene.replace(d_celline)\n", + "fig,ax = plt.subplots(figsize=(3,2.5),dpi=600)\n", + "figname = f'UmapSubtype_{s_type}_{n_pcs+1}markers.png'\n", + "sc.pl.umap(adata, color='Subtype',save=figname,title=f'Subtype',ax=ax)\n", + "\n", + "\n", + "#umap plot\n", + "ls_marker = adata.var.index.tolist()\n", + "figname = f\"Umap_{s_type}_{n_pcs+1}markers.png\"\n", + "axes = sc.pl.umap(adata, color=ls_marker,wspace=.25,save=figname,vmin='p1.5',vmax='p98.5',ncols=3,show=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "#umap plot\n", + "ls_marker = adata.var.index.tolist()\n", + "figname = f\"Umap_{s_type}_{n_pcs+1}markers.png\"\n", + "fig = sc.pl.umap(adata, color=ls_marker,wspace=.25,vmin='p1.5',vmax='p98.5',ncols=3,show=False,return_fig=True)\n", + "ax_list = fig.axes\n", + "for ax in ax_list:\n", + " ax.set_title(ax.get_title(),fontsize=28)\n", + "fig.savefig(f'figures/{figname}',dpi=600)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## cluster leiden \n", + "\n", + "[contents](#contents)\n", + "\n", + "cluster on the markers that are normalized well" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "resolution = 0.45\n", + "results_file = f'{s_sample}_{n_neighbors}neighbors_{n_pcs}pcs_{len(adata.var.index)}markers_leiden{resolution}.h5ad'\n", + "#save\n", + "if not os.path.exists(results_file):\n", + " sc.tl.leiden(adata,resolution=resolution)\n", + "else:\n", + " adata = sc.read_h5ad(results_file)\n", + " print('loading leiden') \n", + "fig,ax = plt.subplots(figsize=(3,2.5),dpi=600)\n", + "figname=f'leiden_{resolution}.png'\n", + "sc.pl.umap(adata, color='leiden',ax=ax,save=figname)\n", + "#seaborn clustermap\n", + "df_p = 
pd.DataFrame(data=adata.raw.X,index=adata.obs.index,columns=adata.var.index)\n", + "df_p['leiden'] = adata.obs['leiden']\n", + "g = sns.clustermap(df_p.groupby('leiden').mean(),z_score=1,figsize=(4,4),cmap='viridis',\n", + " vmin=-1.5,vmax=1.5) \n", + "#g.savefig(f'./figures/clustermap_leiden.png',dpi=200)\n", + "marker_genes = df_p.groupby('leiden').mean().iloc[:,g.dendrogram_col.reordered_ind].columns.tolist()\n", + "categories_order = df_p.groupby('leiden').mean().iloc[g.dendrogram_row.reordered_ind,:].index.tolist()\n", + "#scanpy matrixplot\n", + "fig,ax = plt.subplots(figsize=(5,5), dpi=200)\n", + "figname=f'Matrixplot_leiden_{resolution}.png'\n", + "sc.pl.matrixplot(adata, var_names=marker_genes, groupby=f'leiden',title='',categories_order=categories_order,\n", + " ax=ax,save=figname,standard_scale='var',colorbar_title='Relative\\nintensity',\n", + " #var_group_positions=[(3,23),(24,31),(32,42),(43,51)],\n", + " #var_group_labels=['tumor','T-cell','muscle\\n +AF','immune\\n+stroma'],\n", + " #var_group_rotation=0\n", + " )\n", + "\n", + "#save\n", + "if not os.path.exists(results_file):\n", + " adata.write(results_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Leiden barplots \n", + "\n", + "\n", + "[contents](#contents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ls_order = [\n", + " 'Bx2','Bx3','Bx4',#'JE-TMA-43_scene01','JE-TMA-62_scene01',\n", + " 'JE-TMA-43_scene02', 'JE-TMA-62_scene02',\n", + " 'JE-TMA-43_scene03', 'JE-TMA-62_scene03', 'JE-TMA-43_scene04',\n", + " 'JE-TMA-62_scene04', 'JE-TMA-43_scene05', 'JE-TMA-62_scene05',\n", + " 'JE-TMA-43_scene06','JE-TMA-60_scene06', 'JE-TMA-62_scene06', 'JE-TMA-43_scene07',\n", + " 'JE-TMA-62_scene07', 'JE-TMA-43_scene08','JE-TMA-60_scene08', 'JE-TMA-62_scene08',\n", + " 'JE-TMA-43_scene09','JE-TMA-60_scene09', 'JE-TMA-62_scene09','JE-TMA-43_scene10', 'JE-TMA-62_scene10','JE-TMA-60_scene10',\n", + " 
'JE-TMA-43_scene11', 'JE-TMA-60_scene11', 'JE-TMA-62_scene11', 'JE-TMA-43_scene13',\n", + " 'JE-TMA-62_scene12', 'JE-TMA-43_scene14','JE-TMA-60_scene13', 'JE-TMA-62_scene13'] " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ls_order_r = ls_order[::-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#load original\n", + "'''\n", + "s_sample = '20210320_SMTBx2-4_JE-TMA-43_60_62'\n", + "n_neighbors = 30\n", + "n_pcs = 19\n", + "n_markers = n_pcs+1\n", + "resolution = 0.5\n", + "results_file = f'{s_sample}_{n_neighbors}neighbors_{n_pcs}pcs_{n_markers}markers_leiden{resolution}.h5ad'\n", + "adata1 = sc.read_h5ad(results_file) \n", + "\n", + "d_cluster = {'14': '14: Basal',\n", + "'5': '5: T cell',\n", + "'12': '12: T cell',\n", + "'10': '10: Myoepithelial',\n", + "'1': '1: Mesenchymal',\n", + "'16': '16: Prolif.',\n", + "'15': '15: Vim+ FB (Bx3)',\n", + "'11': '11: Vim+ FB (Bx4)',\n", + "'13': '13: Vim+ FB (Bx2)',\n", + "'7': '7: HER2++',\n", + "'9': '9: EGFR+ Basal',\n", + "'3': '3: HER2+',\n", + "'8': '8: HER2++, Ecad-',\n", + "'0': '0: ER+ (Bx4)',\n", + "'2': '2: ER+, PCNA+ ',\n", + "'4': '4: ER+, EGFR+ (Bx3)',\n", + "'6': '6: ER+ (Bx2)'}\n", + "d_clust_names = dict(zip([item[0] for item in d_cluster.items()],[item[1].split(': ')[1] for item in d_cluster.items()]))\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#load\n", + "s_sample = '20210325_SMTBx2-4_JE-TMA-43_60_62'\n", + "n_neighbors = 31\n", + "n_pcs = 17\n", + "n_markers = n_pcs+1\n", + "resolution = 0.45\n", + "results_file = f'{s_sample}_{n_neighbors}neighbors_{n_pcs}pcs_{n_markers}markers_leiden{resolution}.h5ad'\n", + "adata1 = sc.read_h5ad(results_file) \n", + "print(results_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"if resolution == 0.5:\n", + " d_cluster = {'14': '14: Basal','12': '12: T cell','16': '16: Prolif.','7': '7: ER+ (Bx2)','13': '13: Luminal (N.Breast)',\n", + " '1': '1: ER+ PCNA+ (T47D)','0': '0: ER+ (Bx4)','15': '15: ER+ CK8++ (Bx4)','4': '4: ER+, EGFR+ (Bx3)','18': '18: ER+, EGFR+ (Bx3)',\n", + " '17': '17: (Bx3)','10': '10: FB (Bx4)','11': '11: FB (Bx2)','3': '3: CD44+','9': '9: CD44+', '8': '8: EGFR+ Basal',\n", + " '5': '5: HER2++','6': '6: HER2+','2': '2: HER2++, Ecad-',}\n", + "elif resolution == 0.45:\n", + " d_cluster = {'15':'15: Basal',\n", + " '12':'12: T cell',\n", + " '16': '16: prolif.',\n", + " '5':'5: ER+, EGFR+ (Bx3)',\n", + " '0':'0: ER+ (Bx4)',\n", + " '1':'1: ER+, PCNA+',\n", + " '7':'7: ER- (Bx2)',\n", + " '9':'9: ER+ (Bx2)',\n", + " '8':'8: EGFR+ Basal',\n", + " '4':'4: HER2+',\n", + " '3':'3: HER2+',\n", + " '6':'6: HER2+, Ecad-',\n", + " '2':'2: Mesenchymal',\n", + " '10':'10: Mesenchymal',\n", + " '14':'14: fibroblast',\n", + " '11':'11: fibroblast',\n", + " '13':'13: fibroblast'}\n", + "d_clust_names = dict(zip([item[0] for item in d_cluster.items()],[item[1].split(': ')[1] for item in d_cluster.items()]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "mpl.rcParams['pdf.fonttype'] = 42\n", + "mpl.rcParams['ps.fonttype'] = 42\n", + "#sns.set(font_scale=1.19)\n", + "#seaborn clustermap\n", + "df_p = pd.DataFrame(data=adata1.raw.X,index=adata1.obs.index,columns=adata1.var.index)\n", + "df_p['leiden'] = adata1.obs['leiden']\n", + "g = sns.clustermap(df_p.groupby('leiden').mean().rename({'eccentricity':'eccen.'},axis=1).rename(d_cluster, axis=0),\n", + " z_score=1,figsize=(6.2,6),cmap='viridis',\n", + " vmin=-2,vmax=2,cbar_pos=(.05, .89, .10, .05),cbar_kws={'orientation': 'horizontal','label':'Z-score'}) #(left, bottom, width, height),\n", + "g.savefig(f'./{s_date}/clustermap_leiden_{resolution}_{n_markers}.pdf',dpi=300)\n", + 
"g.savefig(f'./{s_date}/clustermap_leiden_{resolution}_{n_markers}.png',dpi=300)\n", + "marker_genes = df_p.groupby('leiden').mean().iloc[:,g.dendrogram_col.reordered_ind].columns.tolist()\n", + "categories_order = df_p.groupby('leiden').mean().iloc[g.dendrogram_row.reordered_ind,:].index.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# stacked bar vertical\n", + "\n", + "df = pd.DataFrame(data=adata1.raw.X,index=adata1.obs.index,columns=adata1.var.index)\n", + "df[f'leiden'] = [int(item) for item in adata1.obs.leiden]\n", + "s_markers = n_markers\n", + "k=resolution\n", + "\n", + "df['slide'] = [item.split('_')[0] for item in df.index]\n", + "df['slide_scene'] = [item.split('_cell')[0] for item in df.index]\n", + "df['slide_scene'] = df.slide_scene.replace({'SMTBx2-5_scene001':'Bx2', 'SMTBx2-5_scene002':'Bx2',\n", + " 'SMTBx3_scene004':'Bx3', 'SMTBx4-3_scene001':'Bx4',\n", + " 'SMTBx4-3_scene002':'Bx4'})#.replace(d_order)\n", + "df['scene'] = [item.split('_')[1] for item in df.index]\n", + "df_prop = (df.groupby([f'leiden','slide_scene']).PCNA.count())/(df.groupby(['slide_scene']).PCNA.count())\n", + "df_prop = df_prop.unstack().fillna(value=0).T\n", + "\n", + "fig,ax=plt.subplots(figsize=(5,6), dpi=200)\n", + "df_prop['slide'] =[item.split('_')[0] for item in df_prop.index]\n", + "#df_prop['scene'] =[item.split('_')[1] for item in df_prop.index]\n", + "df_prop = df_prop.loc[ls_order_r]\n", + "df_prop.columns = [str(item) for item in df_prop.columns]\n", + "#df_prop.rename(d_order).rename(d_cluster,axis=1).plot(kind='barh',stacked=True,ax=ax,legend=True,cmap='tab20',width=0.9)\n", + "df_prop.plot(kind='barh',stacked=True,ax=ax,legend=True,cmap='tab20',width=0.9)\n", + "ax.legend(bbox_to_anchor=(1.05, 1.00),ncol=1, fancybox=True,title='Cluster ID')\n", + "ax.set_xlabel('Fraction of Cells')\n", + "ax.set_ylabel('Tissue')\n", + "ax.set_title('')\n", + 
"plt.tight_layout()\n", + "fig.savefig(f'./{s_date}/StackedBar_{s_markers}markers_{k}Clusters_vertical.pdf')\n", + "fig.savefig(f'./{s_date}/StackedBar_{s_markers}markers_{k}Clusters_vertical.png')\n", + "#plt.close()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#save the cluster ID, not hte annotation\n", + "#df_prop.to_csv(f'{s_sample}_{n_markers}markers_leiden{resolution}_frac_pos.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import matplotlib.ticker as tic\n", + "#SMT\n", + "fig,ax=plt.subplots(figsize=(2.8,3.2),dpi=200)\n", + "df_plot = df_prop.loc[['Bx2','Bx3','Bx4'],df_prop.dtypes=='float64'].T[::-1]\n", + "df_plot.plot(kind='barh',ax=ax,legend=True,width=.9)\n", + "ax.legend(title='Bx', loc='upper left',fancybox=True,borderpad=.2,bbox_to_anchor=(1.05, 1.05))\n", + "ax.set_xlabel('Fraction of Cells')\n", + "ax.set_ylabel('')\n", + "fig.suptitle(f'Cluster Composition: Biopsies',x=.5, y=.92)\n", + "for tick in ax.yaxis.get_major_ticks():\n", + " tick.tick1line.set_markersize(0)\n", + " tick.tick2line.set_markersize(0)\n", + "temp = tic.LinearLocator(numticks=18)\n", + "ax.yaxis.set_minor_locator(temp)\n", + "plt.grid(b=True, which='minor', axis='y')\n", + "plt.tight_layout()\n", + "fig.savefig(f'./{s_date}/Barplot_SMT{s_markers}_K{k}.pdf')\n", + "fig.savefig(f'./{s_date}/Barplot_SMT{s_markers}_K{k}.png')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ls_order = ['Bx2', 'Bx3', 'Bx4','AU565-2','AU565-3', 'AU565-4', 'BT474-2','BT474-3', 'BT474-4', \n", + " 'HCC1143-2', 'HCC1143-4', 'HCC3153-2', 'HCC3153-4', #'JE-TMA-43_scene01','JE-TMA-62_scene01', 'JE-TMA-43_scene10',\n", + " 'MDAMB-436-2','MDAMB-436-3', 'MDAMB-436-4', 'T47D-2','T47D-3', 'T47D-4',\n", + " 'N.Breast-2', 'N.Breast-4', 'tonsil-2', 'tonsil-4']\n", + "d_order = 
{#'\n", + " 'JE-TMA-43_scene02':'HCC1143-2', 'JE-TMA-62_scene02':'HCC1143-4',\n", + " 'JE-TMA-43_scene03':'HCC3153-2', 'JE-TMA-62_scene03':'HCC3153-4', 'JE-TMA-43_scene04':'N.Breast-2',\n", + " 'JE-TMA-62_scene04':'N.Breast-4', 'JE-TMA-43_scene05':'T47D-2', 'JE-TMA-62_scene05':'T47D-4',\n", + " 'JE-TMA-43_scene06':'T47D-2', 'JE-TMA-62_scene06':'T47D-4', 'JE-TMA-43_scene07':'tonsil-2',\n", + " 'JE-TMA-62_scene07':'tonsil-4', 'JE-TMA-43_scene08':'BT474-2', 'JE-TMA-62_scene08':'BT474-4',\n", + " 'JE-TMA-43_scene09':'BT474-2', 'JE-TMA-62_scene09':'BT474-4', 'JE-TMA-43_scene10':'AU565-2','JE-TMA-62_scene10':'AU565-4',\n", + " 'JE-TMA-43_scene11':'AU565-2', 'JE-TMA-62_scene11':'AU565-4', 'JE-TMA-43_scene13':'MDAMB-436-2',\n", + " 'JE-TMA-62_scene12':'MDAMB-436-4', 'JE-TMA-43_scene14':'MDAMB-436-2', 'JE-TMA-62_scene13':'MDAMB-436-4',\n", + " 'JE-TMA-60_scene13':'MDAMB-436-3', 'JE-TMA-60_scene11':'AU565-3', 'JE-TMA-60_scene10':'AU565-3',\n", + " 'JE-TMA-60_scene09':'BT474-3', 'JE-TMA-60_scene08':'BT474-3', 'JE-TMA-60_scene06':'T47D-3'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "#stacked bar vertical tissue\n", + "df['coreID'] = df.slide_scene.replace(d_order)\n", + "df['celltype'] = df.leiden.astype('str').replace(d_clust_names)\n", + "df_prop = (df.groupby([f'celltype','coreID']).PCNA.count())/(df.groupby(['coreID']).PCNA.count())\n", + "df_prop = df_prop.unstack().fillna(value=0).T\n", + "\n", + "fig,ax=plt.subplots(figsize=(5,3.7), dpi=200)\n", + "df_prop['slide'] =[item.split('_')[0] for item in df_prop.index]\n", + "ls_order_r = ls_order[::-1]\n", + "df_prop = df_prop.loc[ls_order_r]\n", + "df_prop.columns = [str(item) for item in df_prop.columns]\n", + "df_prop.plot(kind='barh',stacked=True,ax=ax,legend=True,cmap='tab20',width=0.9) #.rename(d_order).rename(d_clust_names,axis=1)\n", + "ax.legend(loc='upper left', bbox_to_anchor=(1.1,1.02),ncol=1, fancybox=True,title='Cluster 
Annotation')\n", + "ax.set_xlabel('Fraction of Cells')\n", + "ax.set_ylabel('Tissue')\n", + "ax.set_title('Cluster Composition: Biopsies Plus Controls')\n", + "plt.tight_layout()\n", + "fig.savefig(f'./{s_date}/StackedBar_{s_markers}markers_{k}Clusters_withcontrols_vert.pdf')\n", + "fig.savefig(f'./{s_date}/StackedBar_{s_markers}markers_{k}Clusters_withcontrols_vert.png')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#stacked bar horizontal\n", + "df['coreID'] = df.slide_scene.replace(d_order)\n", + "df['celltype'] = df.leiden.astype('str').replace(d_clust_names)\n", + "df_prop = (df.groupby([f'celltype','coreID']).PCNA.count())/(df.groupby(['coreID']).PCNA.count())\n", + "df_prop = df_prop.unstack().fillna(value=0).T\n", + "\n", + "fig,ax=plt.subplots(figsize=(10,2.5), dpi=200)\n", + "df_prop['slide'] =[item.split('_')[0] for item in df_prop.index]\n", + "#df_prop['scene'] =[item.split('_')[1] for item in df_prop.index]\n", + "df_prop = df_prop.loc[ls_order]\n", + "df_prop.columns = [str(item) for item in df_prop.columns]\n", + "df_prop.plot(kind='bar',stacked=True,ax=ax,legend=True,cmap='tab20',width=0.9) #.rename(d_order).rename(d_clust_names,axis=1)\n", + "ax.legend(loc='upper center', bbox_to_anchor=(1.5, 1.05),ncol=2, fancybox=True,title='Cluster Annotation')\n", + "ax.set_ylabel('Fraction of Cells')\n", + "ax.set_xlabel('Tissue')\n", + "ax.set_title('')\n", + "plt.tight_layout()\n", + "fig.savefig(f'./{s_date}/StackedBar_{s_markers}markers_{k}Clusters_withcontrols.pdf')\n", + "fig.savefig(f'./{s_date}/StackedBar_{s_markers}markers_{k}Clusters_withcontrols.png')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#plot all groups spatially \n", + "from matplotlib.colors import ListedColormap, LinearSegmentedColormap\n", + "newcmap = ListedColormap(mpl.cm.tab20.colors)#ListedColormap(mpl.cm.tab20b.colors + 
mpl.cm.tab20c.colors)\n", + "from mplex_image import analyze\n", + "df_pos = analyze.celltype_to_bool(df_p,'leiden')\n", + "df_xy = df_mi.loc[df_pos.index]\n", + "ls_scene = ['SMTBx2-5_scene001', 'SMTBx3_scene004', 'SMTBx4-3_scene001', 'SMTBx4-3_scene002']\n", + "#ls_scene = ['JE-TMA-62_scene04', 'JE-TMA-43_scene04','JE-TMA-62_scene07','JE-TMA-43_scene07']\n", + "for s_slide in ls_scene:\n", + " fig,ax = plt.subplots(figsize=(10,10),dpi=200) #10,10\n", + " #plot negative cells\n", + " df_scene = df_xy[df_xy.index.str.contains(s_slide)]\n", + " ax.scatter(data=df_scene,x='DAPI_X',y='DAPI_Y',color='silver',s=0.1,label=f'')\n", + " for idxs, s_color_int in enumerate(range(len(df_pos.columns))):\n", + " s_color = str(s_color_int)\n", + " if len(df_xy[(df_xy.slide_scene==s_slide) & (df_pos.loc[:,s_color])])>=1:\n", + " #plot positive cells\n", + " ax.scatter(data=df_xy[(df_xy.slide_scene==s_slide) & (df_pos.loc[:,s_color])],x='DAPI_X',y='DAPI_Y',\n", + " label=f'{s_color}',s=0.1,color=newcmap.colors[idxs])\n", + " #break\n", + " ax.set_title(f\"{s_slide}\", fontsize=16)\n", + " ax.axis('equal')\n", + " ax.set_ylim(ax.get_ylim()[::-1])\n", + " #ax.set_xticklabels('')\n", + " #ax.set_yticklabels('')\n", + " #break\n", + " plt.legend(markerscale=10) \n", + " fig.savefig(f'{codedir}/paper_data/GatingPlots/{s_slide}_clustering_scatterplot.png')\n", + " #break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(f'{s_sample}_{n_markers}markers_leiden{resolution}.csv'):\n", + " print('saving csv')\n", + " df.to_csv(f'{s_sample}_{n_markers}markers_leiden{resolution}.csv')\n", + " df_prop.to_csv(f'{s_sample}_{n_markers}markers_leiden{resolution}_frac_pos.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "f'{s_sample}_{n_markers}markers_leiden{resolution}.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
#napari
def load_crops(viewer,s_crop,s_tissue):
    '''
    add the channels of the cropped ome.tif files and the segmentation labels
    found in the current working directory to a viewer
    viewer = object with add_image and add_labels methods (e.g. a napari viewer)
    s_crop = substring of the file name that selects the crop
    s_tissue = substring of the file name that selects the tissue
    returns the last SegmentationBasins label image that was loaded,
    or an empty array if no matching segmentation file was found
    '''
    ls_color = ['blue','green','yellow','red','cyan','magenta','gray','green','yellow','red','cyan','magenta',
     'gray','gray','gray','gray','gray','gray','gray','gray']
    print(s_crop)
    #default result: always bound, and never reset by an unrelated file
    label_image = np.array([])
    for s_file in os.listdir():
        if s_file.find(s_tissue) == -1 or s_file.find(s_crop) == -1:
            continue
        if s_file.find('ome.tif') > -1:
            with tifffile.TiffFile(s_file) as tif:
                array = tif.asarray()
                omexml_string = tif.ome_metadata or ''
            for idx in range(array.shape[0]):
                img = array[idx]
                #measure the prefix instead of assuming 31 characters,
                #the prefix is one character longer for every extra digit of idx
                s_prefix = f'Channel ID="Channel:0:{idx}" Name="'
                i_begin = omexml_string.find(s_prefix)
                s_marker = f'channel{idx}'
                if i_begin > -1:
                    i_name = i_begin + len(s_prefix)
                    i_end = omexml_string.find('" SamplesPerPixel',i_name)
                    if i_end > -1:
                        s_marker = omexml_string[i_name:i_end]
                #channels beyond the color list are shown in gray, like the tail of the list
                s_colormap = ls_color[idx] if idx < len(ls_color) else 'gray'
                viewer.add_image(img,name=s_marker,rgb=False,visible=False,blending='additive',colormap=s_colormap,contrast_limits = (np.quantile(img,0),(np.quantile(img,0.9999)+1)*1.5))
        elif s_file.find('SegmentationBasins') > -1:
            label_image = io.imread(s_file)
            viewer.add_labels(label_image, name='cell_seg',blending='additive',visible=False)
            cell_boundaries = segmentation.find_boundaries(label_image,mode='outer')
            viewer.add_labels(cell_boundaries,blending='additive')
    return(label_image)

def pos_label(viewer,df_pos,label_image,s_cell):
    '''
    add a label layer that only contains the cells positive for s_cell
    viewer = object with an add_labels method
    df_pos = boolean dataframe, index entries end in _cell<integer label>
    label_image = integer label array (cell segmentation)
    s_cell = marker name (column of df_pos)
    returns a copy of label_image with every other label set to 0
    '''
    #cells missing from df_pos (filtered by DAPI, etc) can not be positive,
    #so keeping only the positive labels also removes them
    li_index_cell = [int(item.split('_')[-1].split('cell')[1]) for item in df_pos[df_pos.loc[:,s_cell]==True].index]
    label_image_cell = copy.deepcopy(label_image)
    label_image_cell[~np.isin(label_image_cell,li_index_cell)] = 0
    viewer.add_labels(label_image_cell, name=f'{s_cell.split("_")[0]}_seg',blending='additive',visible=False)
    return(label_image_cell)

#jupyter notebook
#load manual thresholds
def new_thresh_csv(df_mi,d_combos):
    '''
    make an empty manual threshold table
    df_mi = dataframe with a slide_scene column
    d_combos = dictionary of {cell type: collection of marker names}
    returns a dataframe with one row named global plus one row per scene,
    and one empty-string column per marker
    '''
    df_man = pd.DataFrame(index= ['global']+ sorted(set(df_mi.slide_scene)))
    for es_marker in d_combos.values():
        for s_marker in sorted(es_marker):
            df_man[s_marker] = ''
    return(df_man)

def load_thresh_csv(s_sample):
    '''
    load thresh_JE_{s_sample}.csv from the current working directory
    returns one row per scene with a {marker}_global and a {marker}_local
    column for every marker in the csv
    raises ValueError if the csv has no row named global
    '''
    #load
    df_man = pd.read_csv(f'thresh_JE_{s_sample}.csv',header=0,index_col = 0)
    #reformat the thresholds data; values are multiplied by 256
    #(described by the original author as conversion to 16 bit)
    ls_index = df_man.index.tolist()
    ls_index.remove('global')
    df_thresh = pd.DataFrame(index = ls_index)
    ls_marker = df_man.columns.tolist()
    for s_marker in ls_marker:
        #the global value is a scalar, broadcast to every scene
        df_thresh[f'{s_marker}_global'] = df_man[df_man.index=='global'].loc['global',f'{s_marker}']*256
        #the local values align on the scene index
        df_thresh[f'{s_marker}_local'] = df_man[df_man.index!='global'].loc[:,f'{s_marker}']*256

    #a threshold of 0 is replaced by 12
    df_thresh.replace(to_replace=0, value = 12, inplace=True)
    return(df_thresh)

def threshold_postive(df_thresh,df_mi):
    '''
    make positive dataframe to check thresholds
    start with local, and if its not there, insert the global threshold
    note, the threshold record is keyed by marker only, so it will break
    if there are two biomarker locations
    df_thresh = one row per scene, {marker}_local and {marker}_global columns
    df_mi = mean intensity dataframe with a slide_scene column;
        only float64 columns named {marker}_{location} are thresholded
    returns (d_thresh_record,df_pos)
        d_thresh_record = {f'{scene}_{marker}': threshold used, NaN if none}
        df_pos = boolean dataframe (intensity >= threshold)
    '''
    ls_scene = sorted(df_thresh.index.tolist())
    ls_sub = df_mi.columns[df_mi.dtypes=='float64'].tolist()
    #markers present in the threshold csv, computed once
    es_thresh_marker = {item.split('_')[0] for item in df_thresh.columns}
    ls_other = []
    ls_df_scene = []
    d_thresh_record= {}
    for s_scene in ls_scene:
        b_scene = (df_mi.slide_scene==s_scene).to_numpy()
        df_scene = pd.DataFrame(index=df_mi.index[b_scene])
        for s_marker_loc in ls_sub:
            s_marker = s_marker_loc.split('_')[0]
            # only threshold markers in .csv
            if s_marker not in es_thresh_marker:
                ls_other.append(s_marker)
                continue
            #first check if local threshold exists, otherwise use global
            i_thresh = np.nan
            for s_kind in ('local','global'):
                s_col = f'{s_marker}_{s_kind}'
                if s_col in df_thresh.columns and pd.notna(df_thresh.loc[s_scene,s_col]):
                    i_thresh = df_thresh.loc[s_scene,s_col]
                    break
            if pd.isna(i_thresh):
                ls_other.append(s_marker)
            else:
                df_scene[s_marker_loc] = (df_mi.loc[b_scene,s_marker_loc] >= i_thresh).to_numpy()
            d_thresh_record.update({f'{s_scene}_{s_marker}':i_thresh})
        ls_df_scene.append(df_scene)
    #DataFrame.append no longer exists in pandas 2, concatenate once instead
    if len(ls_df_scene) > 0:
        df_pos = pd.concat(ls_df_scene)
    else:
        df_pos = pd.DataFrame()
    print(f'Did not threshold {set(ls_other)}')
    return(d_thresh_record,df_pos)

def plot_positive(s_type,d_combos,df_pos,d_thresh_record,df_xy,b_save=True):
    '''
    scatter plot of the positive cells of every marker of one cell type, per scene
    s_type = key of d_combos
    d_combos = dictionary of {cell type: collection of marker names}
    df_pos = boolean dataframe from threshold_postive
    d_thresh_record = threshold record from threshold_postive
    df_xy = dataframe with slide_scene, DAPI_X and DAPI_Y columns
    b_save = save each figure to ./SpatialPlots/{scene}_{s_type}_manual.png
    returns the list of figures, one per scene
    '''
    es_marker = set(d_combos[s_type])
    ls_color = [item for item in df_pos.columns if item.split('_')[0] in es_marker]
    ls_scene = sorted(set(df_xy.slide_scene))
    ls_fig = []
    if b_save:
        #savefig does not create the folder
        os.makedirs('./SpatialPlots',exist_ok=True)
    for s_scene in ls_scene:
        #negative cells = all cells even before dapi filtering
        df_neg = df_xy[(df_xy.slide_scene==s_scene)]
        #plot (at least one column, so an empty marker list does not crash)
        fig, ax = plt.subplots(2, max(1,((len(ls_color))+1)//2), figsize=(18,12)) #figsize=(18,12)
        ax = ax.ravel()
        for ax_num, s_color in enumerate(ls_color):
            s_marker = s_color.split('_')[0]
            s_min = d_thresh_record[f"{s_scene}_{s_marker}"]
            #positive cells = positive cells based on threshold
            #(== True also copes with NaN in scenes that were not thresholded)
            ls_pos_index = (df_pos[df_pos.loc[:,s_color]==True]).index
            df_color_pos = df_neg[df_neg.index.isin(ls_pos_index)]
            if len(df_color_pos)>=1:
                #plot negative cells
                ax[ax_num].scatter(data=df_neg,x='DAPI_X',y='DAPI_Y',color='silver',s=1)
                #plot positive cells
                ax[ax_num].scatter(data=df_color_pos, x='DAPI_X',y='DAPI_Y',color='DarkBlue',s=.5)

                ax[ax_num].axis('equal')
                #image coordinates: y axis points down
                ax[ax_num].set_ylim(ax[ax_num].get_ylim()[::-1])
                ax[ax_num].set_title(f'{s_marker} min={int(s_min)} ({len(df_color_pos)} cells)')
            else:
                ax[ax_num].set_title(f'{s_marker} min={(s_min)} ({(0)} cells)')
        fig.suptitle(s_scene)
        ls_fig.append(fig)
        if b_save:
            fig.savefig(f'./SpatialPlots/{s_scene}_{s_type}_manual.png')
    return(ls_fig)

#gating analysis
def prop_positive(df_data,s_cell,s_grouper):
    '''
    fraction of each s_cell value within each s_grouper group
    df_data = dataframe with s_cell and s_grouper columns; rows with missing
        values are dropped (a countme column, if present, is included in that
        check, as before)
    returns a dataframe with one row per group and one column per s_cell value,
        NaN where a value does not occur in a group
    '''
    ls_column = [s_cell,s_grouper]
    if 'countme' in df_data.columns:
        ls_column.append('countme')
    df_cell = df_data.loc[:,ls_column].dropna()
    #count rows directly, so the caller does not have to add a countme column
    df_count = df_cell.groupby([s_grouper,s_cell]).size().unstack()
    df_prop = df_count.div(df_cell.groupby(s_grouper).size(),axis=0)
    return(df_prop)

def prop_clustermap(df_prop,df_annot,i_thresh,lut,figsize=(10,5)):
    '''
    clustermap of the proportions, with row colors from the annotation
    df_prop = proportions, one row per sample
    df_annot = dataframe with an ID column, indexed like df_prop
    i_thresh = drop columns whose mean (after filling NaN with 0) is not
        above this value; no column is dropped if i_thresh <= 0
    lut = dictionary of {ID: color}
    returns (g,df_plot_less) = seaborn clustermap and the plotted dataframe
    '''
    df_prop['ID'] = [df_annot.loc[s_index,'ID'] for s_index in df_prop.index]
    species = df_prop.pop("ID")
    row_colors = species.map(lut)

    #clustermap plot without the low values - drop less than i_thresh of total
    df_plot = df_prop.fillna(0)
    if i_thresh > 0:
        df_plot_less = df_plot.loc[:,df_plot.sum()/len(df_plot) > i_thresh]
    else:
        #previously unbound in this case
        df_plot_less = df_plot
    g = sns.clustermap(df_plot_less,figsize=figsize,cmap='viridis',row_colors=row_colors)
    return(g,df_plot_less)

def prop_barplot(df_plot_less,s_cell,colormap="Spectral",figsize=(10,5),b_sort=True):
    '''
    horizontal stacked bar plot of the proportions
    df_plot_less = one row per bar, one column per stacked segment
    s_cell = plot title
    b_sort = sort the rows by index, descending, before plotting
    returns the figure
    '''
    fig,ax = plt.subplots(figsize=figsize)
    if b_sort:
        df_plot_less = df_plot_less.sort_index(ascending=False)
    df_plot_less.plot(kind='barh',stacked=True,width=.9, ax=ax,colormap=colormap)
    ax.set_title(s_cell)
    ax.set_xlabel('Fraction Positive')
    ax.legend(bbox_to_anchor=(1.01, 1))
    plt.tight_layout()
    return(fig)

def plot_color_leg(lut,figsize = (2.3,3)):
    '''
    plot a color legend: one full width bar per entry of lut
    lut = dictionary of {subtype: color}, drawn in dictionary order
    returns the figure
    '''
    #colors
    series = pd.Series(lut)
    #the original called series.sort_values() and discarded the result,
    #so the order was never changed; that call is dropped here
    df_color = pd.DataFrame({'subtype':series.index,'color':series.values,'value':1})

    fig,ax = plt.subplots(figsize = figsize,dpi=100)
    df_color.plot(kind='barh',x='subtype',y='value',width=1,legend=False,color=df_color.color,ax=ax)
    ax.set_xticks([])
    ax.set_ylabel('')
    ax.set_title(f'subtype')
    plt.tight_layout()
    return(fig)

#cluster analysis

def cluster_kmeans(df_mi,ls_columns,k,b_sil=False):
    '''
    log2 transform, zscore and kmeans cluster
    df_mi = mean intensity dataframe
    ls_columns = columns of df_mi to cluster on
    k = number of clusters
    b_sil = also calculate the silhouette score
    returns (g,df_cluster,score)
        g = seaborn clustermap of the mean log2 intensity per cluster
        df_cluster = log2(x+1) data, columns renamed to the text before the
            first underscore, plus a K{k} column with the cluster labels
        score = silhouette score, or NaN if b_sil is False
    '''
    df_cluster = np.log2(df_mi.loc[:,ls_columns] + 1)

    #select figure size
    i_len = k
    i_width = len(df_cluster.columns)

    #scale data
    df_scale = scale(df_cluster)

    #kmeans cluster
    kmeans = KMeans(n_clusters=k, random_state=0).fit(df_scale)
    df_cluster.columns = [item.split('_')[0] for item in df_cluster.columns]
    df_cluster[f'K{k}'] = list(kmeans.labels_)
    g = sns.clustermap(df_cluster.groupby(f'K{k}').mean(),cmap="RdYlGn_r",z_score=1,figsize=(3+i_width/3,3+i_len/3))
    if b_sil:
        #not imported at the top of this module, import where it is needed
        from sklearn.metrics import silhouette_score
        score = silhouette_score(X = df_scale, labels=list(kmeans.labels_))
    else:
        score = np.nan
    return(g,df_cluster,score)

def plot_clusters(df_cluster,df_xy,s_num='many'):
    '''
    scatter plot of the cells of every cluster, one figure per scene
    df_cluster = dataframe with a slide_scene column and an integer cluster
        column (the first int64 column, else the first integer column)
    df_xy = dataframe with slide_scene, DAPI_X and DAPI_Y columns
    s_num = 'many' for a 3 row grid of panels, anything else for one column
    returns a dictionary of {scene: figure}
    raises IndexError if df_cluster has no integer column
    '''
    ls_type = df_cluster.columns[df_cluster.dtypes=='int64'].tolist()
    if len(ls_type) == 0:
        #cluster labels are not int64 on every platform
        ls_type = [s_col for s_col in df_cluster.columns if pd.api.types.is_integer_dtype(df_cluster[s_col])]
    if len(ls_type) == 0:
        raise IndexError('plot_clusters: df_cluster has no integer cluster column')
    s_type = ls_type[0]
    print(s_type)
    ls_scene = sorted(set(df_cluster.slide_scene))
    ls_color = sorted(set(df_cluster.loc[:,s_type].dropna()))
    d_fig = {}
    for s_scene in ls_scene:
        #negative cells = all cells even before dapi filtering
        df_neg = df_xy[(df_xy.slide_scene==s_scene)]
        #plot
        if s_num == 'many':
            fig, ax = plt.subplots(3, max(1,((len(ls_color))+2)//3), figsize=(18,12),dpi=200)
        else:
            #two panels as before, more only if there are more clusters than panels
            fig, ax = plt.subplots(max(2,len(ls_color)), 1, figsize=(7,4),dpi=200)
        ax = ax.ravel()
        for ax_num, s_color in enumerate(ls_color):
            #positive cells = cells assigned to this cluster
            ls_pos_index = (df_cluster[df_cluster.loc[:,s_type]==s_color]).index
            df_color_pos = df_neg[df_neg.index.isin(ls_pos_index)]
            if len(df_color_pos)>=1:
                #plot negative cells
                ax[ax_num].scatter(data=df_neg,x='DAPI_X',y='DAPI_Y',color='silver',s=1)
                #plot positive cells
                ax[ax_num].scatter(data=df_color_pos, x='DAPI_X',y='DAPI_Y',color='DarkBlue',s=.5)

                ax[ax_num].axis('equal')
                #image coordinates: y axis points down
                ax[ax_num].set_ylim(ax[ax_num].get_ylim()[::-1])
                if s_num == 'many':
                    ax[ax_num].set_xticklabels('')
                    ax[ax_num].set_yticklabels('')
                ax[ax_num].set_title(f'{s_color} ({len(df_color_pos)} cells)')
            else:
                ax[ax_num].set_xticklabels('')
                ax[ax_num].set_yticklabels('')
                ax[ax_num].set_title(f'{s_color} ({(0)} cells)')

        fig.suptitle(s_scene)
        d_fig.update({s_scene:fig})
    return(d_fig)
a/mplex_image/__pycache__/__init__.cpython-38.pyc b/mplex_image/__pycache__/__init__.cpython-38.pyc new file mode 100755 index 0000000..95b1ebc Binary files /dev/null and b/mplex_image/__pycache__/__init__.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/__init__.cpython-39.pyc b/mplex_image/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..b8859ba Binary files /dev/null and b/mplex_image/__pycache__/__init__.cpython-39.pyc differ diff --git a/mplex_image/__pycache__/analyze.cpython-37.pyc b/mplex_image/__pycache__/analyze.cpython-37.pyc new file mode 100755 index 0000000..2c8fcb1 Binary files /dev/null and b/mplex_image/__pycache__/analyze.cpython-37.pyc differ diff --git a/mplex_image/__pycache__/analyze.cpython-38.pyc b/mplex_image/__pycache__/analyze.cpython-38.pyc new file mode 100755 index 0000000..ff95f60 Binary files /dev/null and b/mplex_image/__pycache__/analyze.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/analyze.cpython-39.pyc b/mplex_image/__pycache__/analyze.cpython-39.pyc new file mode 100644 index 0000000..842d212 Binary files /dev/null and b/mplex_image/__pycache__/analyze.cpython-39.pyc differ diff --git a/mplex_image/__pycache__/cmif.cpython-37.pyc b/mplex_image/__pycache__/cmif.cpython-37.pyc new file mode 100755 index 0000000..5e4ca2b Binary files /dev/null and b/mplex_image/__pycache__/cmif.cpython-37.pyc differ diff --git a/mplex_image/__pycache__/cmif.cpython-38.pyc b/mplex_image/__pycache__/cmif.cpython-38.pyc new file mode 100755 index 0000000..571f31b Binary files /dev/null and b/mplex_image/__pycache__/cmif.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/cmif.cpython-39.pyc b/mplex_image/__pycache__/cmif.cpython-39.pyc new file mode 100755 index 0000000..3742d85 Binary files /dev/null and b/mplex_image/__pycache__/cmif.cpython-39.pyc differ diff --git a/mplex_image/__pycache__/codex.cpython-37.pyc b/mplex_image/__pycache__/codex.cpython-37.pyc new file mode 100755 index 
0000000..6438d19 Binary files /dev/null and b/mplex_image/__pycache__/codex.cpython-37.pyc differ diff --git a/mplex_image/__pycache__/codex.cpython-38.pyc b/mplex_image/__pycache__/codex.cpython-38.pyc new file mode 100755 index 0000000..0010b93 Binary files /dev/null and b/mplex_image/__pycache__/codex.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/features.cpython-37.pyc b/mplex_image/__pycache__/features.cpython-37.pyc new file mode 100755 index 0000000..c9df747 Binary files /dev/null and b/mplex_image/__pycache__/features.cpython-37.pyc differ diff --git a/mplex_image/__pycache__/features.cpython-38.pyc b/mplex_image/__pycache__/features.cpython-38.pyc new file mode 100755 index 0000000..c869dfe Binary files /dev/null and b/mplex_image/__pycache__/features.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/features.cpython-39.pyc b/mplex_image/__pycache__/features.cpython-39.pyc new file mode 100755 index 0000000..ed790ee Binary files /dev/null and b/mplex_image/__pycache__/features.cpython-39.pyc differ diff --git a/mplex_image/__pycache__/gating.cpython-38.pyc b/mplex_image/__pycache__/gating.cpython-38.pyc new file mode 100755 index 0000000..93c662f Binary files /dev/null and b/mplex_image/__pycache__/gating.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/gating.cpython-39.pyc b/mplex_image/__pycache__/gating.cpython-39.pyc new file mode 100644 index 0000000..88ac253 Binary files /dev/null and b/mplex_image/__pycache__/gating.cpython-39.pyc differ diff --git a/mplex_image/__pycache__/getdata.cpython-37.pyc b/mplex_image/__pycache__/getdata.cpython-37.pyc new file mode 100755 index 0000000..59ac9ef Binary files /dev/null and b/mplex_image/__pycache__/getdata.cpython-37.pyc differ diff --git a/mplex_image/__pycache__/getdata.cpython-38.pyc b/mplex_image/__pycache__/getdata.cpython-38.pyc new file mode 100755 index 0000000..83ae205 Binary files /dev/null and b/mplex_image/__pycache__/getdata.cpython-38.pyc differ diff --git 
a/mplex_image/__pycache__/getdata.cpython-39.pyc b/mplex_image/__pycache__/getdata.cpython-39.pyc new file mode 100755 index 0000000..d77f944 Binary files /dev/null and b/mplex_image/__pycache__/getdata.cpython-39.pyc differ diff --git a/mplex_image/__pycache__/imagine.cpython-37.pyc b/mplex_image/__pycache__/imagine.cpython-37.pyc new file mode 100755 index 0000000..306fa6b Binary files /dev/null and b/mplex_image/__pycache__/imagine.cpython-37.pyc differ diff --git a/mplex_image/__pycache__/imagine.cpython-38.pyc b/mplex_image/__pycache__/imagine.cpython-38.pyc new file mode 100755 index 0000000..49741f4 Binary files /dev/null and b/mplex_image/__pycache__/imagine.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/metadata.cpython-37.pyc b/mplex_image/__pycache__/metadata.cpython-37.pyc new file mode 100755 index 0000000..ec53895 Binary files /dev/null and b/mplex_image/__pycache__/metadata.cpython-37.pyc differ diff --git a/mplex_image/__pycache__/metadata.cpython-38.pyc b/mplex_image/__pycache__/metadata.cpython-38.pyc new file mode 100755 index 0000000..862b8f4 Binary files /dev/null and b/mplex_image/__pycache__/metadata.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/mics.cpython-38.pyc b/mplex_image/__pycache__/mics.cpython-38.pyc new file mode 100755 index 0000000..2b21d7a Binary files /dev/null and b/mplex_image/__pycache__/mics.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/mics.cpython-39.pyc b/mplex_image/__pycache__/mics.cpython-39.pyc new file mode 100755 index 0000000..68abfea Binary files /dev/null and b/mplex_image/__pycache__/mics.cpython-39.pyc differ diff --git a/mplex_image/__pycache__/mpimage.cpython-37.pyc b/mplex_image/__pycache__/mpimage.cpython-37.pyc new file mode 100755 index 0000000..7694f6a Binary files /dev/null and b/mplex_image/__pycache__/mpimage.cpython-37.pyc differ diff --git a/mplex_image/__pycache__/mpimage.cpython-38.pyc b/mplex_image/__pycache__/mpimage.cpython-38.pyc new file mode 100755 
index 0000000..25b868a Binary files /dev/null and b/mplex_image/__pycache__/mpimage.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/mpimage.cpython-39.pyc b/mplex_image/__pycache__/mpimage.cpython-39.pyc new file mode 100755 index 0000000..93be7a7 Binary files /dev/null and b/mplex_image/__pycache__/mpimage.cpython-39.pyc differ diff --git a/mplex_image/__pycache__/normalize.cpython-38.pyc b/mplex_image/__pycache__/normalize.cpython-38.pyc new file mode 100755 index 0000000..432c2cd Binary files /dev/null and b/mplex_image/__pycache__/normalize.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/normalize.cpython-39.pyc b/mplex_image/__pycache__/normalize.cpython-39.pyc new file mode 100755 index 0000000..376a0fc Binary files /dev/null and b/mplex_image/__pycache__/normalize.cpython-39.pyc differ diff --git a/mplex_image/__pycache__/ometiff.cpython-37.pyc b/mplex_image/__pycache__/ometiff.cpython-37.pyc new file mode 100755 index 0000000..575debe Binary files /dev/null and b/mplex_image/__pycache__/ometiff.cpython-37.pyc differ diff --git a/mplex_image/__pycache__/ometiff.cpython-38.pyc b/mplex_image/__pycache__/ometiff.cpython-38.pyc new file mode 100755 index 0000000..b3dbb77 Binary files /dev/null and b/mplex_image/__pycache__/ometiff.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/ometiff.cpython-39.pyc b/mplex_image/__pycache__/ometiff.cpython-39.pyc new file mode 100755 index 0000000..789526e Binary files /dev/null and b/mplex_image/__pycache__/ometiff.cpython-39.pyc differ diff --git a/mplex_image/__pycache__/preprocess.cpython-37.pyc b/mplex_image/__pycache__/preprocess.cpython-37.pyc new file mode 100755 index 0000000..61224ba Binary files /dev/null and b/mplex_image/__pycache__/preprocess.cpython-37.pyc differ diff --git a/mplex_image/__pycache__/preprocess.cpython-38.pyc b/mplex_image/__pycache__/preprocess.cpython-38.pyc new file mode 100755 index 0000000..14db79b Binary files /dev/null and 
b/mplex_image/__pycache__/preprocess.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/preprocess.cpython-39.pyc b/mplex_image/__pycache__/preprocess.cpython-39.pyc new file mode 100755 index 0000000..a629aca Binary files /dev/null and b/mplex_image/__pycache__/preprocess.cpython-39.pyc differ diff --git a/mplex_image/__pycache__/process.cpython-37.pyc b/mplex_image/__pycache__/process.cpython-37.pyc new file mode 100755 index 0000000..a2ab185 Binary files /dev/null and b/mplex_image/__pycache__/process.cpython-37.pyc differ diff --git a/mplex_image/__pycache__/process.cpython-38.pyc b/mplex_image/__pycache__/process.cpython-38.pyc new file mode 100755 index 0000000..18d3893 Binary files /dev/null and b/mplex_image/__pycache__/process.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/process.cpython-39.pyc b/mplex_image/__pycache__/process.cpython-39.pyc new file mode 100755 index 0000000..5a6c4e6 Binary files /dev/null and b/mplex_image/__pycache__/process.cpython-39.pyc differ diff --git a/mplex_image/__pycache__/register.cpython-37.pyc b/mplex_image/__pycache__/register.cpython-37.pyc new file mode 100755 index 0000000..6b120c4 Binary files /dev/null and b/mplex_image/__pycache__/register.cpython-37.pyc differ diff --git a/mplex_image/__pycache__/register.cpython-38.pyc b/mplex_image/__pycache__/register.cpython-38.pyc new file mode 100755 index 0000000..1590041 Binary files /dev/null and b/mplex_image/__pycache__/register.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/register.cpython-39.pyc b/mplex_image/__pycache__/register.cpython-39.pyc new file mode 100755 index 0000000..d5a71c0 Binary files /dev/null and b/mplex_image/__pycache__/register.cpython-39.pyc differ diff --git a/mplex_image/__pycache__/segment.cpython-37.pyc b/mplex_image/__pycache__/segment.cpython-37.pyc new file mode 100755 index 0000000..3204988 Binary files /dev/null and b/mplex_image/__pycache__/segment.cpython-37.pyc differ diff --git 
a/mplex_image/__pycache__/segment.cpython-38.pyc b/mplex_image/__pycache__/segment.cpython-38.pyc new file mode 100755 index 0000000..d6e2cbc Binary files /dev/null and b/mplex_image/__pycache__/segment.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/segment.cpython-39.pyc b/mplex_image/__pycache__/segment.cpython-39.pyc new file mode 100755 index 0000000..9015372 Binary files /dev/null and b/mplex_image/__pycache__/segment.cpython-39.pyc differ diff --git a/mplex_image/__pycache__/visualize.cpython-37.pyc b/mplex_image/__pycache__/visualize.cpython-37.pyc new file mode 100755 index 0000000..77489bc Binary files /dev/null and b/mplex_image/__pycache__/visualize.cpython-37.pyc differ diff --git a/mplex_image/__pycache__/visualize.cpython-38.pyc b/mplex_image/__pycache__/visualize.cpython-38.pyc new file mode 100755 index 0000000..4f6e116 Binary files /dev/null and b/mplex_image/__pycache__/visualize.cpython-38.pyc differ diff --git a/mplex_image/__pycache__/visualize.cpython-39.pyc b/mplex_image/__pycache__/visualize.cpython-39.pyc new file mode 100755 index 0000000..d1843c1 Binary files /dev/null and b/mplex_image/__pycache__/visualize.cpython-39.pyc differ diff --git a/mplex_image/_version.py b/mplex_image/_version.py new file mode 100755 index 0000000..6526deb --- /dev/null +++ b/mplex_image/_version.py @@ -0,0 +1 @@ +__version__ = "0.0.7" diff --git a/mplex_image/analyze.py b/mplex_image/analyze.py new file mode 100755 index 0000000..2887c41 --- /dev/null +++ b/mplex_image/analyze.py @@ -0,0 +1,300 @@ +#### +# title: analyze.py +# +# language: Python3.6 +# date: 2019-05-00 +# license: GPL>=v3 +# author: Jenny +# +# description: +# python3 library to analyze cyclic data and images after manual thresholding +#### + +#load libraries +import matplotlib as mpl +mpl.use('agg') +import pandas as pd +import numpy as np +import os +import skimage +from skimage import io +import json +from biotransistor import imagine +import itertools + +#functions +# import 
def combinations(df_tn_tumor,ls_marker=['CK19_Ring','CK7_Ring','CK5_Ring','CK14_Ring','CD44_Ring','Vim_Ring']):
    '''
    get all combinations of the markers (can be overlapping)

    df_tn_tumor = dataframe with one positive/negative column per marker
    ls_marker = marker columns to combine (not modified)

    returns a boolean dataframe on the index of df_tn_tumor with one column
    per non-empty marker combination (marker names joined with "_").
    A cell is True in the column whose markers are exactly its positive
    markers; the column "__" flags cells that are negative for all markers.
    '''
    # a list, not a set: recent pandas raises TypeError for a set used as a
    # .loc indexer
    ls_all = list(set(ls_marker))
    # number of positive markers per cell, computed once for all combinations
    se_n_pos = df_tn_tumor.loc[:,ls_all].sum(axis=1)

    # collect the columns first and build the dataframe in one step
    d_counts = {}
    for i_size in range(1,len(ls_marker)+1):
        for tu_combo in itertools.combinations(ls_marker,i_size):
            print(tu_combo)
            #positive for every marker of the combination ...
            se_pos = df_tn_tumor.loc[:,list(tu_combo)].sum(axis=1) == i_size
            #... and for no other marker
            se_only = se_n_pos == i_size
            d_counts['_'.join(tu_combo)] = (se_pos & se_only).values
    df_tn_counts = pd.DataFrame(d_counts,index=df_tn_tumor.index)

    #other cells (negative for all)
    df_tn_counts['__'] = df_tn_counts.sum(axis=1) == 0
    # every cell must fall in exactly one column
    if (df_tn_counts.sum(axis=1) != 1).any():
        print('error in analyze.combinations')

    return(df_tn_counts)
def add_celltype(df_data, ls_cell_names, s_type_name):
    '''
    add gated cell type to data frame, and save the possible cell types and cell type name in a json
    df_data = data frame with the cell types (boolean), modified in place
    ls_cell_names = list of the cell names (boolean columns of df_data)
    s_type_name = the cell category (name of the column that is written)

    The record {s_type_name: ls_cell_names} is merged into
    Gating_Record.json in the current working directory.
    Returns None.
    '''
    #check cell types' exclusivity (only reported; later names overwrite earlier ones)
    if (df_data.loc[:,ls_cell_names].sum(axis=1) > 1).any():
        print(f'Error in exclusive cell types: {s_type_name}')

    #make cell type object columns
    for s_marker in ls_cell_names:
        df_data.loc[df_data.loc[:,s_marker],s_type_name] = s_marker

    #append the record json: load what is there (if anything), update, write back
    s_record = 'Gating_Record.json'
    d_current = {}
    if os.path.exists(s_record):
        with open(s_record,'r') as f:
            d_current = json.load(f)
    d_current.update({s_type_name:ls_cell_names})
    with open(s_record,'w') as f:
        json.dump(d_current, f, indent=4, sort_keys=True)
def mask_meanint(df_img, a_mask):
    '''
    mean intensity inside a mask, for every image of a dataframe

    df_img = dataframe whose index holds the image file names
    a_mask = mask used to index each loaded image

    writes the mean of the selected pixels to the column 'result'
    and returns df_img
    '''
    for s_image in df_img.index:
        # keep only the pixels selected by the mask
        a_selected = skimage.io.imread(s_image)[a_mask]
        df_img.loc[s_image,'result'] = a_selected.sum()/a_selected.size
    return(df_img)
len(df_color_pos[(df_color_pos.scene==s_scene)])>=1: + ls_index = df_color_pos[(df_color_pos.scene==s_scene)].index.tolist() + es_cell_positive = set([int(s_index.split('cell')[-1]) for s_index in ls_index]) + + # erase all non positive basins + es_cell_negative = d_flatten[s_scene].difference(es_cell_positive) + a_pos = np.copy(a_img) + a_pos[np.isin(a_img, list(es_cell_negative))] = 0 # bue: this have to be a list, else it will not work! + + # get cell border (a_pos_border) + a_pos_border = imagine.get_border(a_pos) # border has value 1 + a_pos_border = np.uint16(a_pos_border * 65000) # border will have value 255 + #filename hack + print('saving image') + io.imsave(f'{savedir}/Registered-R{idx+100}_{s_color.replace("_",".")}.border.border.border_{df_color_pos.index[0].split("_")[0]}-{s_scene.replace("scene","Scene-")}_c2_ORG.tif',a_pos_border) + else: + print(len(df_color_pos[(df_color_pos.scene==s_scene)])) + #from elmar (reformat cells touching dictionary and save + + ddes_image = {} + for s_image, dei_image in dd_touch.items(): + des_cell = {} + for i_cell, ei_touch in dei_image.items(): + des_cell.update({str(i_cell): [str(i_touch) for i_touch in sorted(ei_touch)]}) + ddes_image.update({s_image:des_cell}) + + #save dd_touch as json file + with open(f'result_{s_sample}_cellstouching_dictionary.json','w') as f: + json.dump(ddes_image, f) + return(ddes_image) + +def make_border_all(s_sample,df_pos,segmentdir,savedir,b_images=True): + """ + load positive cells dataframe, and segmentation basins + output the borders od positive cells and the cells touching dictionary + """ + #Specify which images to save + #ls_color = df_pos.columns.tolist() + #ls_color.remove('DAPI_X') + #ls_color.remove('DAPI_Y') + #ls_color.remove('scene') + + #load segmentation basins + #flattens ids into a set (stored in d_flatten) + os.chdir(segmentdir) + ls_file = os.listdir() + ls_cellseg = [] + d_files = {} + #dictionary of file to scene ID , and a list of Basin files + for s_file in 
ls_file: + if s_file.find('Cell Segmentation Basins.tif')>-1: + if s_file.find(s_sample)>-1: + ls_cellseg.append(s_file) + s_scene_num = s_file.split(' ')[1] + d_files.update({f'scene{s_scene_num}':s_file}) + + d_flatten = {} + dd_touch = {} + + for s_file in ls_cellseg: + s_scene_num = s_file.split(' ')[1] + print(s_file) + a_img = skimage.io.imread(s_file) + # get all cell ids that exist in the images + es_cell = set(a_img.flatten()) + es_cell.remove(0) + s_scene = f'scene{s_scene_num}' + d_flatten.update({f'scene{s_scene_num}':es_cell}) + + #get a cell touching dictionary (only do this one (faster)) + dd_touch.update({f'{s_sample}_{s_scene}':imagine.touching_cells(a_img, i_border_width=0)}) + + #s_type = 'Manual' + if b_images: + idx=0 + #save png of all cell borders (single tiffs) + #for idx, s_color in enumerate(ls_color): + # print(f'Processing {s_color}') + #positive cells = positive cells based on thresholds + #dataframe of all the positive cells + df_color_pos = df_pos #[df_pos.loc[:,s_color]] + ls_index = df_color_pos.index.tolist() + + if len(df_color_pos[(df_color_pos.scene==s_scene)])>=1: + ls_index = df_color_pos[(df_color_pos.scene==s_scene)].index.tolist() + es_cell_positive = set([int(s_index.split('cell')[-1]) for s_index in ls_index]) + + # erase all non positive basins + es_cell_negative = d_flatten[s_scene].difference(es_cell_positive) + a_pos = np.copy(a_img) + a_pos[np.isin(a_img, list(es_cell_negative))] = 0 # bue: this have to be a list, else it will not work! 
def celltype_to_bool(df_data, s_column):
    """
    Input a dataframe and column name of cell types
    Output a new boolean dataframe with each col as a cell type

    The output has the index of df_data and one column per distinct value
    of df_data[s_column] (sorted); a cell is True in the column of its own
    cell type and False everywhere else.
    """
    se_type = df_data.loc[:,s_column]
    # compare once per cell type; this gives real bool columns instead of
    # relying on fillna() to downcast object columns (deprecated in pandas)
    d_bool = {celltype:(se_type==celltype).values for celltype in sorted(set(se_type))}
    df_bool = pd.DataFrame(d_bool,index=df_data.index).astype(bool)
    # NOTE(review): this renames the columns of the *input* frame to strings;
    # kept because callers may rely on it, but it looks like df_bool was meant
    df_data.columns = [str(item) for item in df_data.columns]
    return(df_bool)
def count_images(df_img):
    """
    count and list slides, scenes, rounds

    df_img = dataframe of images with the columns slide, scene and rounds

    For each slide (sorted) prints the slide name, the number of images per
    scene, the total number of images and the number of images per round.
    Returns None.
    """
    for s_sample in sorted(set(df_img.slide)):
        print(s_sample)
        df_img_slide = df_img[df_img.slide==s_sample]
        print('scene names')
        # plain loops: the prints are side effects, not list elements
        for item in sorted(set(df_img_slide.scene)):
            print(f'{item}: {(df_img_slide.scene==item).sum()}')
        print(f'Number of images = {len(df_img_slide)}')
        print('Rounds:')
        for item in sorted(set(df_img_slide.rounds)):
            print(f'{item}: {(df_img_slide.rounds==item).sum()}')
        print('\n')
def run_registration_matlab(d_register, ls_order, tiffdir, regdir, N_colors='5'):
    """
    run registration on server with or without cropping

    d_register = {sample: d_crop}; a non-empty d_crop selects the large
        (cropped) registration, an empty one the regular registration
    ls_order, N_colors = passed through to the preprocess registration helpers
    tiffdir = folder with the raw tiffs; becomes the working directory
    regdir = folder that receives the registered images

    Copies wrapper.sh from the source tree into tiffdir and submits it with
    sbatch once per sample. The submission is not waited on.
    """
    os.chdir(tiffdir)
    shutil.copyfile(f'{s_src_path}/src/wrapper.sh', './wrapper.sh')
    for s_sample, d_crop in d_register.items():
        if len(d_crop) > 0:
            print(f'Large registration {s_sample}')
            s_slide = s_sample.split("-Scene")[0]
            for key in d_crop:
                # zero-pad the crop key to three digits; keys that already
                # have three or more digits are used as they are (they used
                # to get no output folder at all)
                preprocess.cmif_mkdir([f'{regdir}/{s_slide}-Scene-{str(key).zfill(3)}'])
            preprocess.large_registration_matlab(N_smpl='10000',N_colors=N_colors,s_rootdir=tiffdir, s_subdirname=regdir,
                d_crop_regions=d_crop, s_ref_id='./R1_*_c1_ORG.tif', ls_order=ls_order)
        #regular registration
        else:
            print(f'Regular registration {s_sample}')
            df_img = mpimage.parse_org(s_end = "ORG.tif",type='raw')
            df_img['slide_scene'] = df_img.slide + '-Scene-' + df_img.scene
            preprocess.cmif_mkdir([(f'{regdir}/{item}') for item in sorted(set(df_img.slide_scene))]) #this will break with diff slides
            preprocess.registration_matlab(N_smpl='10000',N_colors=N_colors,s_rootdir=tiffdir, s_subdirname=f'{regdir}/',
                s_ref_id='./R1_*_c1_ORG.tif',ls_order =ls_order)
        # both branches submit the same wrapper script (fire and forget)
        MyOut = subprocess.Popen(['sbatch', 'wrapper.sh'], #the script runs fine
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)
def autofluorescence_subtract_dir(regdir,codedir,d_channel,ls_exclude,subdir,d_early=None):
    '''
    AF subtract images

    regdir = folder with one sub folder of registered images per scene
    codedir = folder holding the {sample}_ExposureTimes.csv files
    d_channel, ls_exclude, subdir = passed through to the mpimage subtraction
    d_early = optional; when non-empty the scaled subtraction is used, with
        d_channel passed as d_late

    Returns df_markers of the last sub folder processed, or None when
    regdir contains no entries.
    '''
    d_early = {} if d_early is None else d_early
    # stays None for an empty regdir instead of failing at the return
    df_markers = None
    os.chdir(regdir)
    for s_dir in sorted(os.listdir()):
        print(s_dir)
        s_path = f'{regdir}/{s_dir}'
        os.chdir(s_path)
        #preprocess.cmif_mkdir([f'{s_path}/AFSubtracted'])
        s_sample = s_dir.split('-Scene')[0]
        df_img = mpimage.parse_org(s_end = "ORG.tif",type='reg')
        #load exposure times csv
        df_exp = pd.read_csv(f'{codedir}/{s_sample}_ExposureTimes.csv',index_col=0,header=0)
        #AF subtract images
        df_img_exp = mpimage.add_exposure(df_img,df_exp,type='czi')
        if len(d_early)>0:
            df_markers, _ = mpimage.subtract_scaled_images(df_img_exp,d_late=d_channel,
                d_early=d_early, ls_exclude=ls_exclude,subdir=subdir,b_8bit=False)
        else:
            df_markers, _ = mpimage.subtract_images(df_img_exp,d_channel=d_channel,
                ls_exclude=ls_exclude,subdir=subdir,b_8bit=False)

    return(df_markers)
def visualize_multicolor_overlay(s_scene,subdir,qcdir,d_overlay,d_crop,es_bright,high_thresh):
    """
    save multicolor overlay pngs of one scene for quality control

    s_scene = scene name; the part before '-Scene' is used as the sample name
    subdir = folder holding the images, in a per-sample or per-scene sub folder
    qcdir = output folder; the pngs go to {qcdir}/{sample}
    d_overlay = {round: list of markers}; one mpimage.multicolor_png call per item
    d_crop, es_bright, high_thresh = passed through to mpimage.multicolor_png

    Raises FileNotFoundError when neither image folder exists.
    """
    s_sample = s_scene.split('-Scene')[0]
    preprocess.cmif_mkdir([f'{qcdir}/{s_sample}'])
    # per-sample folder first, per-scene folder as fallback
    for s_candidate in (f'{subdir}/{s_sample}', f'{subdir}/{s_scene}'):
        if os.path.exists(s_candidate):
            s_path = s_candidate
            break
    else:
        # previously failed later with an unbound s_path
        raise FileNotFoundError(f'no image folder for {s_scene} in {subdir}')
    os.chdir(s_path)
    df_img = mpimage.parse_org()
    df_img['path'] = [f'{s_path}/{item}' for item in df_img.index]
    df_dapi_round = df_img[(df_img.color=='c1')&(df_img.scene==s_scene) & (df_img.rounds=='R2')]
    df_scene = df_img[(df_img.color!='c1') & (df_img.scene==s_scene)]
    for s_round,ls_marker in d_overlay.items():
        print(f'Generating multicolor overlay {list(ls_marker)}')
        df_round = df_scene[df_scene.marker.isin(ls_marker)]
        # the caller's high_thresh is used; it was overwritten with 0.999 here
        d_result = mpimage.multicolor_png(df_round,df_dapi_round,s_scene=s_scene,d_overlay={s_round:ls_marker},d_crop=d_crop,es_dim={'nada'},es_bright=es_bright,low_thresh=2000,high_thresh=high_thresh)
        for key, tu_result in d_result.items():
            io.imsave(f'{qcdir}/{s_sample}/ColorArray_{s_scene}_{key}_{".".join(tu_result[0])}.png',tu_result[1])
def crop_registered(s_scene,bigdir,regdir,d_crop):
    '''
    crop a stack of tiffs to the specified coordinates
    s_scene = scene name; images are read from {bigdir}/{s_scene}
    regdir = output folder; each crop goes to {regdir}/{sample}-Scene-{crop id}
    d_crop: integer crop id:(xmin, y_min, xmax, ymax)
    '''
    s_sample = s_scene.split('-Scene')[0]
    print(s_scene)
    os.chdir(f'{bigdir}/{s_scene}')
    df_img = mpimage.parse_org()
    df_scene = df_img[df_img.scene==s_scene]
    # crop window and output scene name are the same for every image
    ls_crop = [(xy_cropcoor, f'{s_sample}-Scene-{idx:03}') for idx, xy_cropcoor in d_crop.items()]
    # create the output folders once instead of once per image and crop
    if len(df_scene.index) > 0 and len(ls_crop) > 0:
        preprocess.cmif_mkdir([f'{regdir}/{s_crop_scene}' for _, s_crop_scene in ls_crop])
    for s_image in df_scene.index:
        a_image = io.imread(s_image)
        for xy_cropcoor, s_crop_scene in ls_crop:
            #crop (rows are y, columns are x)
            a_crop = a_image[xy_cropcoor[1]:xy_cropcoor[3],xy_cropcoor[0]:xy_cropcoor[2]]
            io.imsave(f'{regdir}/{s_crop_scene}/{s_image.replace(s_scene,s_crop_scene)}',a_crop,check_contrast=False)
def crop_basins(d_crop,tu_dim,segdir,cropdir,s_type='Cell'):
    """
    crop the segmentation basins (cell or nuclei) to same coord as images for viewing in Napari

    d_crop = {scene name: (x, y) of the upper left corner of the crop}
    tu_dim = (width, height) of the crop
    segdir = folder holding the {sample}_Segmentation folders
    cropdir = output folder
    s_type = 'Cell' or 'Nuclei'; selects '{s_type} Segmentation Basins.tif' files

    The working directory is restored after every scene, also when reading
    or saving an image fails.
    """
    cwd = os.getcwd()
    for s_scene, xy_cropcoor in d_crop.items():
        print(s_scene)
        s_sample, s_scene_num = s_scene.split('-Scene-')[0:2]
        os.chdir(f'{segdir}/{s_sample}_Segmentation/')
        try:
            for s_file in os.listdir():
                # basin file of the requested type that belongs to this scene
                if f'{s_type} Segmentation Basins.tif' not in s_file:
                    continue
                if s_scene_num not in s_file:
                    continue
                a_seg = skimage.io.imread(s_file)
                a_crop = a_seg[(xy_cropcoor[1]):(xy_cropcoor[1]+tu_dim[1]),(xy_cropcoor[0]):(xy_cropcoor[0]+tu_dim[0])]
                s_coor = f'x{xy_cropcoor[0]}y{xy_cropcoor[1]}.tif'
                #crop file
                s_file_new = f'{cropdir}/{s_sample}-{s_file.replace(" - ","_").replace(" ","").replace("Scene","Scene-").replace(".tif",s_coor)}'
                print(s_file_new)
                skimage.io.imsave(s_file_new,a_crop)
        finally:
            os.chdir(cwd)
def crop_labels(d_crop,d_label,tu_dim,cropdir,s_name='Nuclei Segmentation Basins'):
    """
    crop the segmentation basins (cell or nuclei) to same coord as images for viewing in Napari

    d_crop = {scene name: (x, y) of the upper left corner of the crop}
    d_label = {scene name: label image (array)}
    tu_dim = (width, height) of the crop
    cropdir = output folder
    s_name = base of the output file name (spaces are removed).
        If it contains '.tif', that is replaced by 'x{x}y{y}.tif' as before.
        Otherwise the file is named {scene}_{name}x{x}y{y}.tif; such names
        used to lack coordinates and extension, so every scene went to the
        same path.
    """
    s_base = s_name.replace(" ","")
    for s_scene, xy_cropcoor in d_crop.items():
        print(s_scene)
        a_seg = d_label[s_scene]
        a_crop = a_seg[(xy_cropcoor[1]):(xy_cropcoor[1]+tu_dim[1]),(xy_cropcoor[0]):(xy_cropcoor[0]+tu_dim[0])]
        s_coor = f'x{xy_cropcoor[0]}y{xy_cropcoor[1]}.tif'
        #crop file
        if '.tif' in s_base:
            s_file_new = f'{cropdir}/{s_base.replace(".tif",s_coor)}'
        else:
            s_file_new = f'{cropdir}/{s_scene}_{s_base}{s_coor}'
        print(s_file_new)
        skimage.io.imsave(s_file_new,a_crop)
def segmentation_thresholds(regdir,qcdir, d_segment):
    """
    save figures of the binary masks given by the segmentation thresholds

    regdir = folder with one sub folder of registered images per scene
    qcdir = output folder; figures go to {qcdir}/Segmentation
    d_segment = passed to preprocess.check_seg_markers
    """
    preprocess.cmif_mkdir([f'{qcdir}/Segmentation'])
    os.chdir(regdir)
    for s_folder in sorted(os.listdir()):
        os.chdir(f'{regdir}/{s_folder}')
        df_img = mpimage.parse_org(s_end = "ORG.tif",type='reg')
        print(s_folder.split('-Scene')[0])
        # rows of the figure grow with the number of scenes
        i_scenes = len(set(df_img.scene))
        if i_scenes < 3:
            i_rows = 1 #few scenes
        elif i_scenes > 8:
            i_rows = 3 #more scenes
        else:
            i_rows = 2
        d_fig = preprocess.check_seg_markers(df_img,d_segment, i_rows=i_rows, t_figsize=(10,6))
        for s_key, o_fig in d_fig.items():
            o_fig.savefig(f'{qcdir}/Segmentation/{s_folder}_{s_key}_segmentation.png')
def metadata_table(regdir,segdir):
    """
    output channel/marker mapping

    regdir = folder with one sub folder of registered images per scene
    segdir = output folder for metadata_{sample}_RoundsCyclesTable.csv

    One table is written per sub folder, with one row per non-c1 image
    (marker) and fixed minimum/maximum/exposure/refexp/location values.
    """
    # values that are the same for every marker
    d_fixed = {'minimum':1003,'maximum':65535,'exposure':100,'refexp':100,'location':'All'}
    os.chdir(regdir)
    for s_folder in sorted(os.listdir()):
        os.chdir(f'{regdir}/{s_folder}')
        df_img = mpimage.parse_org(s_end = "ORG.tif",type='reg')
        ls_scene = sorted(set(df_img.scene))
        if len(ls_scene) > 1:
            # several scenes in one folder: the second scene is used
            df_img = df_img[df_img.scene==ls_scene[1]]
            s_sample = s_folder
        else:
            s_sample = s_folder.split('-Scene')[0]
        print(s_sample)
        #all channels but c1, ordered by round and color
        df_marker = df_img[df_img.color!='c1'].sort_values(['rounds','color'])
        df_table = pd.DataFrame(index = [df_marker.marker.tolist()],columns=['rounds','colors','minimum','maximum','exposure','refexp','location'])
        df_table['rounds'] = df_marker.loc[:,['rounds']].values
        df_table['colors'] = df_marker.loc[:,['color']].values
        for s_column, o_value in d_fixed.items():
            df_table[s_column] = o_value
        df_table.to_csv(f'{segdir}/metadata_{s_sample}_RoundsCyclesTable.csv',header=True)
os.chdir(f'{s_work_path}/exacloud/JE{idx}') #exacloud + #shutil.copyfile(f'{s_src_path}/src/javawrapper.sh', './javawrapper.sh') + print(f'JE{idx + i_counter}') + subprocess.run(["make"]) + subprocess.run(["make", "slurm"]) + +def prepare_dataframe(s_sample,ls_dapi,dapi_thresh,d_channel,ls_exclude,segdir,codedir,s_af='none', b_afsub=False): + ''' + filter data by last dapi, standard location, subtract AF, output treshold csv + ls_dapi[0] becomes s_dapi + ''' + + os.chdir(f'{segdir}') + #load data + df_mi = process.load_mi(s_sample) + df_xy = process.load_xy(s_sample) + #drop extra centroid columns,add scene column + df_xy = df_xy.loc[:,['DAPI_X','DAPI_Y']] + df_xy = process.add_scene(df_xy) + df_xy.to_csv(f'features_{s_sample}_CentroidXY.csv') + #filter by last DAPI + df_dapi_mi = process.filter_dapi(df_mi,df_xy,ls_dapi[0],dapi_thresh,b_images=True) + + #filter mean intensity by biomarker location in metadata + df_filter_mi, es_standard = process.filter_standard(df_dapi_mi,d_channel,s_dapi=ls_dapi[0]) + + df_filter_mi.to_csv(f'features_{s_sample}_FilteredMeanIntensity_{ls_dapi[0]}{dapi_thresh}.csv') + #background qunatiles + ''' + df_bg = process.filter_background(df_mi, es_standard) + df_bg.to_csv(f'features_{s_sample}_BackgroundQuantiles.csv') + df_bg = process.filter_background(df_dapi_mi, es_standard) + df_bg.to_csv(f'features_{s_sample}_FilteredBackgroundQuantiles.csv') + + df_t = pd.read_csv(f'metadata_{s_sample}_RoundsCyclesTable.csv',index_col=0,header=0) + df_exp = pd.read_csv(f'{codedir}/{s_sample}_ExposureTimes.csv',index_col=0,header=0) + df_tt = process.add_exposure_roundscyles(df_t, df_exp,es_standard, ls_dapi = ls_dapi) + df_tt.to_csv(f'metadata_{s_sample}_RoundsCyclesTable_ExposureTimes.csv') + if b_afsub: + #load metadata + df_t = pd.read_csv(f'metadata_{s_sample}_RoundsCyclesTable_ExposureTimes.csv',index_col=0,header=0) + #normalize by exposure time, and save to csv + lb_columns = [len(set([item]).intersection(set(df_t.index)))>0 for item in 
def fetch_celllabel(s_sampleset, s_slide, s_ipath, s_opath = './', es_scene = None, es_filename_endswith ={'Cell Segmentation Basins.tif', 'Nuclei Segmentation Basins.tif'}, s_sep = ' - ', b_test=True):
    '''
    input:
        s_sampleset: sample set name. e.g. jptma
        s_slide: slide name. e.g. jp-tma1-1
        es_scene: set of scenes of interest. The scenes have to be written in the same way as in the basin file name.
            if None, all scenes are of interest. default is None.
        s_ipath: absolute or relative path where the basin files can be found (with or without trailing slash).
        s_opath: path prefix to where the fetched basin files should be output.
            a folder, based on the s_sampleset, will be generated (if it does not already exist), where the basin files will be placed.
            the folder name is appended directly to s_opath, so s_opath should end with a path separator.
        es_filename_endswith: set of patterns that define the endings of the files of interest.
        s_sep: separator to separate slide and scenes in the file name.
        b_test: test flag. if True no files will be copied, it is just a simulation mode.

    output:
        folder with basin files. placed at {s_opath}{s_sampleset}_segmentation_basin/
        each copied file is named {s_slide}{s_sep}{original file name}.

    description:
        fetches basin (cell label) files from the segmentation pipeline output
        and copies them into a folder at s_opath, named according to s_sampleset name.
        exits via sys.exit if es_scene is given and the number of matching files
        differs from len(es_scene) * len(es_filename_endswith).
    '''
    # generate output directory (also in test mode, as before)
    s_outdir = '{}{}_segmentation_basin/'.format(s_opath, s_sampleset)
    os.makedirs(s_outdir, exist_ok=True)
    # processing
    if es_scene is None:
        i_total = 'all'
        es_sanity_scene = set()
    else:
        i_total = len(es_scene) * len(es_filename_endswith)
        # scenes still without a file; found scenes are discarded below
        es_sanity_scene = set(es_scene)
    t_endswith = tuple(es_filename_endswith)
    i = 0
    for s_file in sorted(os.listdir(s_ipath)):
        # check for file of interest
        if not s_file.endswith(t_endswith):
            continue
        if es_scene is not None:
            s_found = next((s_scene for s_scene in es_scene if s_file.startswith(s_scene + s_sep)), None)
            if s_found is None:
                continue
            es_sanity_scene.discard(s_found)
        # copy file
        i += 1
        print('copy {}/{}: {}{}{} ...'.format(i, i_total, s_slide, s_sep, s_file))
        if not b_test:
            # os.path.join so that s_ipath works with or without a trailing slash
            shutil.copyfile(src=os.path.join(s_ipath, s_file), dst='{}{}{}{}'.format(s_outdir, s_slide, s_sep, s_file))
    # sanity check
    if es_scene is not None and i != i_total:
        sys.exit('Error: no file found for es_scene specified scene {}'.format(sorted(es_sanity_scene)))
def load_li(ls_sample):
    '''
    load threshold on the segmentation marker images acquired during feature extraction

    for each sample in ls_sample, reads thresh_{sample}_ThresholdLi.csv from the
    current working directory (first column is the index, holding image file names)
    and adds the columns rounds, color, slide, scene, marker and slide_scene,
    parsed from the underscore separated file name:
        part 0: reg{scene}, part 1: cyc{rounds}, part 3: color,
        last part (up to the first dot): marker
    returns the row-wise concatenation of all samples
    (an empty dataframe if ls_sample is empty)
    '''
    ldf_img = []
    for s_sample in ls_sample:
        df_img = pd.read_csv(f'thresh_{s_sample}_ThresholdLi.csv', index_col=0)
        lls_part = [item.split('_') for item in df_img.index]
        df_img['rounds'] = [ls_part[1].split('cyc')[1] for ls_part in lls_part]
        df_img['color'] = [ls_part[3] for ls_part in lls_part]
        df_img['slide'] = s_sample
        df_img['scene'] = [ls_part[0].split('reg')[1] for ls_part in lls_part]
        df_img['marker'] = [ls_part[-1].split('.')[0] for ls_part in lls_part] #parse file name for biomarker
        df_img['slide_scene'] = df_img.slide + '_scene' + df_img.scene
        ldf_img.append(df_img)
    # DataFrame.append no longer exists in pandas >= 2.0; concatenate once instead
    if not ldf_img:
        return(pd.DataFrame())
    return(pd.concat(ldf_img))
df_img_all['slide_scene'] = df_img_all.slide + '_scene' + df_img_all.scene + return(df_mi_full,df_img_all) + +def extract_cellpose_features(s_sample, segdir, subdir, ls_seg_markers, nuc_diam, cell_diam,s_scene='reg001'): + ''' + load the segmentation results, the input images, and the channels images + extract mean intensity from each image, and centroid, area and eccentricity for + ''' + + df_sample = pd.DataFrame() + df_thresh = pd.DataFrame() + if os.path.exists(f'{segdir}/{s_scene}Cellpose_Segmentation'): + os.chdir(f'{segdir}/{s_scene}Cellpose_Segmentation') + else: + os.chdir(f'{segdir}') + ls_scene = [] + d_match = {} + for s_file in os.listdir(): + if s_file.find(f'{".".join(ls_seg_markers)} matchedcell{cell_diam} - Cell Segmentation Basins')>-1: + ls_scene.append(s_file.split('_')[0]) + d_match.update({s_file.split('_')[0]:s_file}) + elif s_file.find(f'{".".join(ls_seg_markers)} nuc{nuc_diam} matchedcell{cell_diam} - Cell Segmentation Basins')>-1: + ls_scene.append(s_file.split('_')[0]) + d_match.update({s_file.split('_')[0]:s_file}) + for s_scene in ['reg001']: #ls_scene: #one scene + print(f'processing {s_scene}') + for s_file in os.listdir(): + if s_file.find(s_scene) > -1: + if s_file.find("DAPI.png") > -1: + s_dapi = s_file + dapi = io.imread(s_dapi) + print(f'loading {s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif') + labels = io.imread(f'{s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif') + print(f'loading {d_match[s_scene]}') + cell_labels = io.imread(d_match[s_scene]) + #nuclear features + df_feat = features.extract_feat(labels,dapi, properties=(['mean_intensity'])) + df_feat.columns = [f'{item}_segmented-nuclei' for item in df_feat.columns] + df_feat.index = [f'{s_sample}_scene{s_scene.split("reg")[1]}_cell{item}' for item in df_feat.index] + + #get subcellular regions + cyto = features.label_difference(labels,cell_labels) + d_loc_nuc = features.subcellular_regions(labels, distance_short=2, distance_long=4) + d_loc_cell = 
features.subcellular_regions(cell_labels, distance_short=2, distance_long=4) + d_loc = {'nuclei':labels,'cell':cell_labels,'cytoplasm':cyto, + 'nucmem':d_loc_nuc['membrane'][0],'cellmem':d_loc_cell['membrane'][0], + 'perinuc4':d_loc_nuc['ring'][1],'exp4':d_loc_nuc['grown'][1], + 'nucadj2':d_loc_nuc['straddle'][0],'celladj2':d_loc_cell['straddle'][0]} + #subdir organized by slide or scene + if os.path.exists(f'{subdir}/{s_sample}'): + os.chdir(f'{subdir}/{s_sample}') + elif os.path.exists(f'{subdir}/{s_scene}'): + os.chdir(f'{subdir}/{s_scene}') + else: + os.chdir(f'{subdir}') + df_img = parse_img() + df_img['round_int'] = [int(re.sub('[^0-9]','', item)) for item in df_img.rounds] + df_img = df_img[df_img.round_int < 90] + df_img = df_img.sort_values('round_int') + df_scene = df_img# one scene [df_img.scene==s_scene.split("-Scene-")[1].split("_")[0]] + + #load each image + for s_index in df_scene.index: + intensity_image = io.imread(s_index) + df_thresh.loc[s_index,'threshold_li'] = filters.threshold_li(intensity_image) + if intensity_image.mean() > 0: + df_thresh.loc[s_index,'threshold_otsu'] = filters.threshold_otsu(intensity_image) + df_thresh.loc[s_index,'threshold_triangle'] = filters.threshold_triangle(intensity_image) + s_marker = df_scene.loc[s_index,'marker'] + print(f'extracting features {s_marker}') + #if s_marker == 'DAPI': + # s_marker = s_marker + f'{df_scene.loc[s_index,"rounds"].split("cyc")[1]}' + for s_loc, a_loc in d_loc.items(): + if s_loc == 'nuclei': + df_marker_loc = features.extract_feat(a_loc,intensity_image, properties=(['mean_intensity','centroid','area','eccentricity'])) + df_marker_loc.columns = [f'{s_marker}_{s_loc}',f'{s_marker}_{s_loc}_centroid-0',f'{s_marker}_{s_loc}_centroid-1',f'{s_marker}_{s_loc}_area',f'{s_marker}_{s_loc}_eccentricity'] + elif s_loc == 'cell': + df_marker_loc = features.extract_feat(a_loc,intensity_image, properties=(['mean_intensity','euler_number','area','eccentricity'])) + df_marker_loc.columns = 
def convert_tif(regdir,b_mkdir=True):
    '''
    convert codex tif to standard tif

    every file whose name contains '.tif' inside a regdir sub-folder starting
    with 'reg' is read and written again to {regdir}/converted_{sub-folder}/.
    b_mkdir: create the converted_ folder if needed; if False the folder
        has to exist already.
    the current working directory is left unchanged.
    '''
    for s_dir in sorted(os.listdir(regdir)):
        if s_dir.find('reg') != 0:
            continue
        s_indir = os.path.join(regdir, s_dir)
        s_outdir = f'{regdir}/converted_{s_dir}'
        ls_file = [s_file for s_file in sorted(os.listdir(s_indir)) if s_file.find('.tif') > -1]
        # create the folder once per scene (and only if there is something to convert);
        # os.makedirs is used because preprocess is not imported in this module
        if b_mkdir and ls_file:
            os.makedirs(s_outdir, exist_ok=True)
        for s_file in ls_file:
            a_img = skimage.io.imread(os.path.join(s_indir, s_file))
            # skimage.external.tifffile was removed from scikit-image;
            # imsave writes .tif files through the tifffile backend
            skimage.io.imsave(f'{s_outdir}/{s_file}', a_img)
def rename_files(d_rename,dir,b_test=True):
    """
    change file names

    for every folder in dir whose name starts with 'converted', change into
    that folder and call preprocess.dchange_fname(d_rename, b_test=...).
    d_rename: passed through to preprocess.dchange_fname
    b_test: True for a dry run, False to really rename;
        any other falsy value (e.g. None) does nothing.
    the working directory is restored before returning.
    """
    # imported here because the module level import of preprocess is commented out
    from mplex_image import preprocess
    cwd = os.getcwd()
    os.chdir(dir)
    try:
        for s_dir in sorted(os.listdir()):
            if s_dir.find('converted') != 0:
                continue
            os.chdir(f'{dir}/{s_dir}')
            print(s_dir)
            if b_test:
                print('This is a test')
                preprocess.dchange_fname(d_rename,b_test=True)
            elif b_test==False:
                print('Changing name - not a test')
                preprocess.dchange_fname(d_rename,b_test=False)
    finally:
        os.chdir(cwd)
def segmentation_thresholds(regdir,qcdir, d_segment):
    """
    visualize binary mask of segmentation thresholds

    for every folder in regdir starting with 'converted': parse the .tif file
    names (R{round}_{scene}_{marker}_{color}_...), let
    preprocess.check_seg_markers draw the thresholded markers and save each
    figure to {qcdir}/Segmentation.
    returns the file name dataframe of the last processed folder,
    or None if no 'converted' folder was found.
    """
    # imported here because the module level import of these modules is commented out
    from mplex_image import mpimage, preprocess
    preprocess.cmif_mkdir([f'{qcdir}/Segmentation'])
    os.chdir(regdir)
    df_img = None
    for s_dir in sorted(os.listdir()):
        if s_dir.find('converted') != 0:
            continue
        os.chdir(f'{regdir}/{s_dir}')
        df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='R',s_split='_')
        df_img.rename({'data':'rounds'},axis=1,inplace=True)
        lls_part = [item.split('_') for item in df_img.index]
        df_img['scene'] = [ls_part[1] for ls_part in lls_part]
        df_img['color'] = [ls_part[3] for ls_part in lls_part]
        df_img['marker'] = [ls_part[2].split('.')[0] for ls_part in lls_part]
        print(s_dir)
        d_seg = preprocess.check_seg_markers(df_img,d_segment, i_rows=1, t_figsize=(6,6)) #few scenes
        for key, fig in d_seg.items():
            fig.savefig(f'{qcdir}/Segmentation/{s_dir}_{key}_segmentation.png')
    return(df_img)
def segmentation_inputs(s_sample,regdir,segdir,d_segment,b_start=False):
    """
    make inputs for the segmentation pipeline

    for every folder in regdir starting with 'convert':
    write RoundsCyclesTable.txt and metadata_{s_sample}_RoundsCyclesTable.csv
    into that folder (one row per non-c1 image, minimum overridden per marker
    by d_segment) and create the cluster.java job via preprocess.cluster_java.
    b_start: if True, change into the job folder JE{idx} and run make_sh.
    NOTE(review): the job folder path is hard coded to one user's work area.
    """
    # imported here because the module level import of these modules is commented out
    from mplex_image import mpimage, preprocess
    os.chdir(regdir)
    # idx counts all entries of regdir, not only the converted folders
    for idx, s_dir in enumerate(sorted(os.listdir())):
        if s_dir.find('convert') != 0:
            continue
        s_path = f'{regdir}/{s_dir}'
        os.chdir(s_path)
        df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='R',s_split='_')
        df_img.rename({'data':'rounds'},axis=1,inplace=True)
        lls_part = [item.split('_') for item in df_img.index]
        df_img['color'] = [ls_part[3] for ls_part in lls_part]
        df_img['marker'] = [ls_part[2] for ls_part in lls_part]
        print(s_sample)
        df_marker = df_img[df_img.color!='c1']
        df_marker = df_marker.sort_values(['rounds','color'])
        df_dapi = pd.DataFrame(index = [df_marker.marker.tolist()],columns=['rounds','colors','minimum','maximum','exposure','refexp','location'])
        df_dapi['rounds'] = df_marker.loc[:,['rounds']].values
        df_dapi['colors'] = df_marker.loc[:,['color']].values
        df_dapi['minimum'] = 1003
        df_dapi['maximum'] = 65535
        df_dapi['exposure'] = 100
        df_dapi['refexp'] = 100
        df_dapi['location'] = 'All'
        # marker specific minimum from d_segment
        for s_key,i_item in d_segment.items():
            df_dapi.loc[s_key,'minimum'] = i_item
        df_dapi.to_csv('RoundsCyclesTable.txt',sep=' ',header=False)
        df_dapi.to_csv(f'metadata_{s_sample}_RoundsCyclesTable.csv',header=True)
        #create cluster.java file
        preprocess.cluster_java(s_dir=f'JE{idx}',s_sample=s_sample,imagedir=f'{s_path}',segmentdir=segdir,type='exacloud',b_segment=True,b_TMA=False)
        if b_start:
            os.chdir(f'/home/groups/graylab_share/Chin_Lab/ChinData/Work/engje/exacloud/JE{idx}') #exacloud
            print(f'JE{idx}')
            os.system('make_sh')
def multipage_tiff(d_combos,s_dapi,regdir):
    '''
    make custom overlays, either original or AF subtracted, save at 8 bit for size, and thresholding

    for every folder in regdir starting with 'convert': parse the .tif file
    names (R{round}_{scene}_{marker}_{color}_{imagetype}...), print the markers
    that are not part of any combination in d_combos and call
    process.custom_overlays with the non-DAPI images and the DAPI images
    whose marker contains s_dapi (up to its first underscore).
    d_combos: values are collections of marker names
    '''
    # imported here because the module level import of these modules is commented out
    from mplex_image import mpimage, process
    os.chdir(regdir)
    for s_dir in sorted(os.listdir()):
        if s_dir.find('convert') != 0:
            continue
        os.chdir(f'{regdir}/{s_dir}')
        df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='R',s_split='_')
        df_img.rename({'data':'rounds'},axis=1,inplace=True)
        lls_part = [item.split('_') for item in df_img.index]
        df_img['color'] = [ls_part[3] for ls_part in lls_part]
        df_img['marker'] = [ls_part[2] for ls_part in lls_part]
        df_img['scene'] = [ls_part[1] for ls_part in lls_part]
        df_img['imagetype'] = [ls_part[4].split('.')[0] for ls_part in lls_part]
        df_dapi = df_img[df_img.marker.str.contains(s_dapi.split('_')[0])]
        df_img_stain = df_img[(~df_img.marker.str.contains('DAPI'))]
        #check: markers without any overlay combination
        es_test = set().union(*d_combos.values())
        print(set(df_img_stain.marker) - es_test)
        process.custom_overlays(d_combos, df_img_stain, df_dapi)
#functions
def extract_feat(labels,intensity_image, properties=('centroid','mean_intensity','area','eccentricity')):
    '''
    given labels and intensity image, extract features to dataframe
    (one row per label, columns as returned by skimage.measure.regionprops_table)
    '''
    props = measure.regionprops_table(labels,intensity_image, properties=properties)
    df_prop = pd.DataFrame(props)
    return(df_prop)

def expand_label(labels,distance=3):
    '''
    expand the nuclear labels by a fixed number of pixels
    returns (ring_labels, grown_labels):
        grown_labels: labels grown into background pixels within distance
        ring_labels: grown_labels minus the labels with their boundaries removed
    '''
    boundaries = segmentation.find_boundaries(labels,mode='outer') #thick
    shrunk_labels = labels.copy()
    shrunk_labels[boundaries] = 0
    background = shrunk_labels == 0
    # i, j: coordinates of the nearest labelled (non background) pixel
    distances, (i, j) = scipy.ndimage.distance_transform_edt(
        background, return_indices=True
    )

    grown_labels = labels.copy()
    mask = background & (distances <= distance)
    grown_labels[mask] = shrunk_labels[i[mask], j[mask]]
    ring_labels = grown_labels - shrunk_labels

    return(ring_labels, grown_labels) #shrunk_labels, grown_labels,

def contract_label(labels,distance=3):
    '''
    contract labels by a fixed number of pixels
    returns the rim: labels minus the contracted labels
    '''
    boundaries = segmentation.find_boundaries(labels,mode='outer')
    shrunk_labels = labels.copy()
    shrunk_labels[boundaries] = 0
    foreground = shrunk_labels != 0
    # i, j: coordinates of the nearest background pixel (value 0)
    distances, (i, j) = scipy.ndimage.distance_transform_edt(
        foreground, return_indices=True
    )

    mask = foreground & (distances <= distance)
    shrunk_labels[mask] = shrunk_labels[i[mask], j[mask]]
    rim_labels = labels - shrunk_labels
    return(rim_labels)

def straddle_label(labels,distance=3):
    '''
    expand and contract labels by a fixed number of pixels
    returns (membrane_labels, grown_labels, shrunk_labels) with
    membrane_labels = grown_labels - shrunk_labels
    '''
    boundaries = segmentation.find_boundaries(labels,mode='outer') #outer
    shrunk_labels = labels.copy()
    grown_labels = labels.copy()
    shrunk_labels[boundaries] = 0
    foreground = shrunk_labels != 0
    background = shrunk_labels == 0
    # only the distances of the foreground transform are needed
    distances_f = scipy.ndimage.distance_transform_edt(foreground)
    # i, j: coordinates of the nearest labelled pixel, used for growing
    distances_b, (i, j) = scipy.ndimage.distance_transform_edt(
        background, return_indices=True
    )
    mask_f = foreground & (distances_f <= distance)
    mask_b = background & (distances_b <= distance + 1)
    shrunk_labels[mask_f] = 0
    grown_labels[mask_b] = grown_labels[i[mask_b], j[mask_b]]
    membrane_labels = grown_labels - shrunk_labels
    return(membrane_labels, grown_labels, shrunk_labels)

def label_difference(labels,cell_labels):
    '''
    given matched nuclear and cell label IDs,return cell_labels minus labels
    (a copy of cell_labels set to 0 wherever it equals labels)
    '''
    overlap = cell_labels==labels
    ring_rep = cell_labels.copy()
    ring_rep[overlap] = 0
    return(ring_rep)

def get_mip(ls_img):
    '''
    maximum intensity projection of images (input list of filenames)
    '''
    imgs = [io.imread(s_img) for s_img in ls_img]
    mip = np.stack(imgs).max(axis=0)
    return(mip)

def thresh_li(img,area_threshold=100,low_thresh=1000):
    '''
    threshold an image with Li's iterative Minimum Cross Entropy method
    holes smaller than area_threshold are filled
    pixels with intensity below low_thresh are never part of the mask
    (guards against a too low Li threshold)
    returns a boolean mask
    '''
    mask = img >= filters.threshold_li(img)
    mask = morphology.remove_small_holes(mask, area_threshold=area_threshold)
    # compare the image (not the boolean mask) to low_thresh;
    # comparing the mask itself cleared every pixel for any low_thresh > 1
    mask[img < low_thresh] = False
    return(mask)

def mask_border(mask,type='inner',pixel_distance = 50):
    '''
    border of a boolean mask, from the distance transform inside the mask
    type: 'inner' or 'outer'
        NOTE(review): both values currently give the same (inner) result
    returns (mask_out, shrunk_mask, maskdist, distances)
        distances: distance of every mask pixel to the nearest non mask pixel
        maskdist: mask pixels within pixel_distance of the mask edge
        shrunk_mask: mask without the maskdist pixels
        mask_out: mask pixels that are not in shrunk_mask
    raises ValueError for any other type
    '''
    if type not in ('inner','outer'):
        raise ValueError(f"type must be 'inner' or 'outer', got {type!r}")
    shrunk_mask = mask.copy()
    # i, j: coordinates of the nearest pixel outside the mask
    distances, (i, j) = scipy.ndimage.distance_transform_edt(
        mask, return_indices=True
    )
    maskdist = mask & (distances <= pixel_distance)
    shrunk_mask[maskdist] = shrunk_mask[i[maskdist], j[maskdist]]
    mask_out = np.logical_and(mask,np.logical_not(shrunk_mask))
    return(mask_out,shrunk_mask,maskdist,distances)

def mask_labels(mask,labels):
    '''
    return the labels that fall within the mask
    (sorted unique values of labels at the True pixels of mask)
    '''
    selected_array = labels[mask]
    a_unique = np.unique(selected_array)
    return(a_unique)
def parse_org(s_end = "ORG.tif",s_start='R'):
    """
    This function will parse images following koei's naming convention
    Example: Registered-R1_PCNA.CD8.PD1.CK19_Her2B-K157-Scene-002_c1_ORG.tif
    The output is a dataframe with image filename in index
    And rounds, color, slide, scene (/tissue), and marker in the columns

    Only files in the current working directory whose name starts with
    s_start and ends with s_end are listed.
    Colour c1 is always DAPI; c2-c5 take the 1st-4th dot separated name of
    the second underscore field.
    NOTE(review): c6 and c7 read the 3rd and 4th name, the same positions as
    c4 and c5 - kept as found, confirm against the naming convention.
    Any other colour prints 'Error' and gets a missing marker instead of
    silently inheriting the marker of the previously parsed file.
    """
    ls_file = [s_file for s_file in os.listdir()
               if s_file.endswith(s_end) and s_file.startswith(s_start)]
    df_img = pd.DataFrame(index=ls_file)
    df_img['rounds'] = [item.split('_')[0].split('Registered-')[1] for item in df_img.index]
    df_img['color'] = [item.split('_')[-2] for item in df_img.index]
    df_img['slide'] = [item.split('_')[2] for item in df_img.index]
    df_img['scene'] = [item.split('-Scene-')[1] for item in df_img.slide]
    #parse file name for biomarker
    d_position = {'c2':0,'c3':1,'c4':2,'c5':3,'c6':2,'c7':3}
    ls_marker = []
    for s_index, s_color in zip(df_img.index, df_img.color):
        if s_color == 'c1':
            s_marker = 'DAPI'
        elif s_color in d_position:
            s_marker = s_index.split('_')[1].split('.')[d_position[s_color]]
        else:
            print('Error')
            s_marker = np.nan
        ls_marker.append(s_marker)
    # assigned as a whole column so it exists even when no file matched
    df_img['marker'] = ls_marker
    return(df_img)
[f'{item}_segmented-nuclei' for item in df_feat.columns] + df_feat.index = [f'{s_sample}_scene{s_scene.split("-Scene-")[1].split("_")[0]}_cell{item}' for item in df_feat.loc[:,'label_segmented-nuclei']] + + #get subcellular regions + cyto = label_difference(labels,cell_labels) + d_loc_nuc = subcellular_regions(labels, distance_short=2, distance_long=5) + d_loc_cell = subcellular_regions(cell_labels, distance_short=2, distance_long=5) + d_loc = {'nuclei':labels,'cell':cell_labels,'cytoplasm':cyto, + 'nucmem':d_loc_nuc['membrane'][0],'cellmem':d_loc_cell['membrane'][0], + 'perinuc5':d_loc_nuc['ring'][1],'exp5':d_loc_nuc['grown'][1], + 'nucadj2':d_loc_nuc['straddle'][0],'celladj2':d_loc_cell['straddle'][0]} + + #subdir organized by slide or scene + if os.path.exists(f'{subdir}/{s_sample}'): + os.chdir(f'{subdir}/{s_sample}') + elif os.path.exists(f'{subdir}/{s_scene}'): + os.chdir(f'{subdir}/{s_scene}') + else: + os.chdir(f'{subdir}') + df_img = parse_org() + df_img['round_int'] = [int(re.sub('[^0-9]','', item)) for item in df_img.rounds] + df_img = df_img[df_img.round_int < 90] + df_img = df_img.sort_values('round_int') + df_scene = df_img[df_img.scene==s_scene.split("-Scene-")[1].split("_")[0]] + + #load each image + for s_index in df_scene.index: + intensity_image = io.imread(s_index) + df_thresh.loc[s_index,'threshold_li'] = filters.threshold_li(intensity_image) + if intensity_image.mean() > 0: + df_thresh.loc[s_index,'threshold_otsu'] = filters.threshold_otsu(intensity_image) + df_thresh.loc[s_index,'threshold_triangle'] = filters.threshold_triangle(intensity_image) + #if b_thresh: + # break + s_marker = df_scene.loc[s_index,'marker'] + print(f'extracting features {s_marker}') + if s_marker == 'DAPI': + s_marker = s_marker + f'{df_scene.loc[s_index,"rounds"].split("R")[1]}' + for s_loc, a_loc in d_loc.items(): + if s_loc == 'nuclei': + df_marker_loc = extract_feat(a_loc,intensity_image, properties=(['mean_intensity','centroid','area','eccentricity','label'])) + 
df_marker_loc.columns = [f'{s_marker}_{s_loc}',f'{s_marker}_{s_loc}_centroid-0',f'{s_marker}_{s_loc}_centroid-1',f'{s_marker}_{s_loc}_area',f'{s_marker}_{s_loc}_eccentricity',f'{s_marker}_{s_loc}_label'] + elif s_loc == 'cell': + df_marker_loc = extract_feat(a_loc,intensity_image, properties=(['mean_intensity','euler_number','area','eccentricity','label'])) + df_marker_loc.columns = [f'{s_marker}_{s_loc}',f'{s_marker}_{s_loc}_euler',f'{s_marker}_{s_loc}_area',f'{s_marker}_{s_loc}_eccentricity',f'{s_marker}_{s_loc}_label'] + else: + df_marker_loc = extract_feat(a_loc,intensity_image, properties=(['mean_intensity','label'])) + df_marker_loc.columns = [f'{s_marker}_{s_loc}',f'{s_marker}_{s_loc}_label'] + #drop zero from array, set array ids as index + #old df_marker_loc.index = sorted(np.unique(a_loc)[1::]) + df_marker_loc.index = df_marker_loc.loc[:,f'{s_marker}_{s_loc}_label'] + df_marker_loc.index = [f'{s_sample}_scene{s_scene.split("-Scene-")[1].split("_")[0]}_cell{item}' for item in df_marker_loc.index] + df_feat = df_feat.merge(df_marker_loc, left_index=True,right_index=True,how='left',suffixes=('',f'{s_marker}_{s_loc}')) + if b_big: + df_feat.to_csv(f'{segdir}/{s_sample}Cellpose_Segmentation/features_{s_sample}-{s_scene}.csv') + df_sample = df_sample.append(df_feat) + return(df_sample, df_thresh) + +def extract_bright_features(s_sample, segdir, subdir, ls_seg_markers, nuc_diam, cell_diam,ls_membrane): + ''' + load the features, segmentation results, the input images, and the channels images + extract mean intensity of the top 25% of pixel in from each label region + ''' + df_sample = pd.DataFrame() + os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation') + ls_scene = [] + d_match = {} + for s_file in os.listdir(): + if s_file.find(f'{".".join(ls_seg_markers)} matchedcell{cell_diam} - Cell Segmentation Basins')>-1: + ls_scene.append(s_file.split('_')[0]) + d_match.update({s_file.split('_')[0]:s_file}) + elif s_file.find(f'{".".join(ls_seg_markers)} nuc{nuc_diam} 
matchedcell{cell_diam} - Cell Segmentation Basins')>-1: + ls_scene.append(s_file.split('_')[0]) + d_match.update({s_file.split('_')[0]:s_file}) + for s_scene in ls_scene: + os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation') + print(f'processing {s_scene}') + for s_file in os.listdir(): + if s_file.find(s_scene) > -1: + if s_file.find("DAPI.png") > -1: + s_dapi = s_file + dapi = io.imread(f'{segdir}/{s_sample}Cellpose_Segmentation/{s_dapi}') + print(f'loading {s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif') + labels = io.imread(f'{s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif') + print(labels.shape) + cell_labels = io.imread(f'{segdir}/{s_sample}Cellpose_Segmentation/{d_match[s_scene]}') + print(cell_labels.shape) + print(f'loading {d_match[s_scene]}') + #nuclear features + df_feat = extract_feat(labels,dapi, properties=(['label'])) + df_feat.columns = [f'{item}_segmented-nuclei' for item in df_feat.columns] + df_feat.index = [f'{s_sample}_scene{s_scene.split("-Scene-")[1].split("_")[0]}_cell{item}' for item in df_feat.loc[:,'label_segmented-nuclei']] + + #get subcellular regions + d_loc_nuc = subcellular_regions(labels, distance_short=2, distance_long=5) + d_loc_cell = subcellular_regions(cell_labels, distance_short=2, distance_long=5) + d_loc = {'nucmem25':d_loc_nuc['membrane'][0],'exp5nucmembrane25':d_loc_nuc['grown'][1], + 'cellmem25':d_loc_cell['membrane'][0],'nuclei25':labels} + + #subdir organized by slide or scene + if os.path.exists(f'{subdir}/{s_sample}'): + os.chdir(f'{subdir}/{s_sample}') + elif os.path.exists(f'{subdir}/{s_scene}'): + os.chdir(f'{subdir}/{s_scene}') + else: + os.chdir(f'{subdir}') + df_img = parse_org() + df_img['round_int'] = [int(re.sub('[^0-9]','', item)) for item in df_img.rounds] + df_img = df_img[df_img.round_int < 90] + df_img = df_img.sort_values('round_int') + df_scene = df_img[df_img.scene==s_scene.split("-Scene-")[1].split("_")[0]] + df_marker = df_scene[df_scene.marker.isin(ls_membrane)] + #load 
def subcellular_regions(labels, distance_short=2, distance_long=5):
    '''
    calculate subcellular segmentation regions from segmentation mask

    each region is derived twice, first with distance_short and then with
    distance_long, and returned as a (short, long) tuple under the keys
    'membrane', 'ring', 'straddle', 'grown' and 'shrunk'
    '''
    t_distance = (distance_short, distance_long)
    t_membrane = tuple(contract_label(labels,distance=i_dist) for i_dist in t_distance)
    # expand_label yields (ring, grown) for each distance
    t_ring, t_grown = zip(*[expand_label(labels,distance=i_dist) for i_dist in t_distance])
    # straddle_label yields (straddle, grown, shrunk); its grown part is not used
    t_straddle, __, t_shrunk = zip(*[straddle_label(labels,distance=i_dist) for i_dist in t_distance])
    d_loc_sl = {
        'membrane':t_membrane,
        'ring':t_ring,
        'straddle':t_straddle,
        'grown':t_grown,
        'shrunk':t_shrunk,
    }
    return(d_loc_sl)
matchedcell{cell_diam} - Cell Segmentation Basins.tif') + elif os.path.exists(f'{s_scene}_{".".join(ls_seg_markers)} nuc{nuc_diam} matchedcell{cell_diam} - Cell Segmentation Basins.tif'): + cell_labels = io.imread(f'{s_scene}_{".".join(ls_seg_markers)} nuc{nuc_diam} matchedcell{cell_diam} - Cell Segmentation Basins.tif') + else: + print('no cell labels found') + #set non-ecad cell labels to zero + a_zeros = np.array([int(item.split('_cell')[1]) for item in se_neg_scene[se_neg_scene].index]).astype('int64') + mask = np.isin(cell_labels, a_zeros) + cell_labels_copy = cell_labels.copy() + cell_labels_copy[mask] = 0 + #make the nuclei under cells zero + labels_copy = labels.copy() + distance = 5 + perinuc5, labels_exp = expand_label(labels,distance=distance) + labels_exp[cell_labels_copy > 0] = 0 + #combine calls and expanded nuclei + combine = (labels_exp + cell_labels_copy) + if s_scene.find('Scene') == 0: + io.imsave(f'{s_sample}_{s_scene.replace("Scene ","scene")}_cell{cell_diam}_nuc{nuc_diam}_CombinedSegmentationBasins.tif',combine) + else: + io.imsave(f'{s_scene}_{".".join(ls_seg_markers)}-cell{cell_diam}_exp{distance}_CellSegmentationBasins.tif',combine) + #figure out the covered cells...labels + combined + not_zero_pixels = np.array([labels.ravel() !=0,combine.ravel() !=0]).all(axis=0) + a_tups = np.array([combine.ravel()[not_zero_pixels],labels.ravel()[not_zero_pixels]]).T #combined over nuclei + unique_rows = np.unique(a_tups, axis=0) + new_dict = {} + for key, value in unique_rows: + if key == value: + continue + else: + if key in new_dict: + new_dict[key].append(value) + else: + new_dict[key] = [value] + #from elmar (reformat cells touching dictionary and save + d_result = {} + for i_cell, li_touch in new_dict.items(): + d_result.update({str(i_cell): [str(i_touch) for i_touch in li_touch]}) + dd_result.update({f'{s_sample}_{s_scene.replace("Scene ","scene")}':d_result}) + #save dd_touch as json file + with 
def check_basins(cell_labels, cell_diam):
    '''
    list cell ids whose relative x or y extent exceeds 10 * cell_diam

    cell_labels: label image handed to imagine.membrane_px
    cell_diam: cell diameter that scales the cutoff
    returns the sorted list of offending cell ids
    NOTE(review): relies on membrane_px returning the columns x_relative,
    y_relative and cell - confirm against the imagine module
    '''
    dai_value = {'a':cell_labels}
    df = imagine.membrane_px(cell_labels,dai_value)
    i_cutoff = 10*cell_diam
    ls_bad = sorted(set(df[df.x_relative > i_cutoff].cell) | set(df[df.y_relative > i_cutoff].cell))
    return(ls_bad)


def check_combined(segdir,s_sample,cell_diam,ls_seg_markers):
    '''
    run check_basins on the combined segmentation of every scene

    changes the working directory to {segdir}/{s_sample}Cellpose_Segmentation
    when it exists, otherwise to segdir; scenes are taken from the files whose
    name contains ' - DAPI.png'
    returns a dataframe indexed by cell name with one all-True column
    'bad_match'; it is empty when nothing was flagged or loaded

    frames are collected in a list and concatenated once, because
    DataFrame.append is not available in pandas 2.0 and later
    '''
    s_segdir = f'{segdir}/{s_sample}Cellpose_Segmentation'
    os.chdir(s_segdir if os.path.exists(s_segdir) else segdir)
    s_dapi = ' - DAPI.png'
    ls_scene = sorted(s_file.split(s_dapi)[0] for s_file in os.listdir() if s_dapi in s_file)
    ls_df = []
    for s_scene in ls_scene:
        print(s_scene)
        s_combined = f'{s_scene}_{".".join(ls_seg_markers)}-cell{cell_diam}_exp5_CellSegmentationBasins.tif'
        if not os.path.exists(s_combined):
            print('no combined cell labels found')
            continue
        cell_labels = io.imread(s_combined)
        print(f'Loaded {s_combined}')
        ls_bad = check_basins(cell_labels, cell_diam)
        ls_bad_cells = [f"{s_scene.replace('-Scene-','_scene')}_cell{item}" for item in ls_bad]
        ls_df.append(pd.DataFrame(index=ls_bad_cells,columns=['bad_match'],data=[True]*len(ls_bad_cells)))
    if not ls_df:
        return(pd.DataFrame())
    return(pd.concat(ls_df))
def edge_hull(s_sample,segdir,subdir,i_pixel=154, dapi_thresh=350,i_fill=50000,i_close=40,i_small=30000):
    '''
    find edge of the tissue.
    first, find tissue by thresholding DAPI R1 (pixels above dapi_thresh)
    then, mask all pixels within i_pixel distance of tissue border
    saves one binary mask per scene as
    {segdir}/TissueEdgeMask{i_pixel}_{s_sample}_scene{s_scene}.png
    (255 inside the border band, 0 elsewhere); nothing is returned

    s_sample: slide name, passed to process.load_li
    segdir: output folder; also becomes the working directory
    subdir: folder that holds the registered images, per slide or per scene
    i_close: size of the octagon used for binary closing
    i_fill: passed to remove_small_holes
    i_small: minimum object size kept before the convex hull is taken

    NOTE(review): mask_hull is computed but never used, so the saved mask
    does not depend on the hull or on i_small - confirm whether the hull was
    meant to be passed to mask_border
    '''
    os.chdir(segdir)
    df_img = process.load_li([s_sample],s_thresh='', man_thresh=100)
    for s_scene in sorted(set(df_img.scene)):
        print(f'Calculating tissue edge mask for Scene {s_scene}')
        # file name of the round 1, channel 1 image of this scene
        s_index = df_img[(df_img.scene == s_scene) & (df_img.rounds == 'R1') & (df_img.color =='c1')].index[0]
        # images are looked for in a per-slide folder first, then per-scene
        if os.path.exists(f'{subdir}/{s_sample}/{s_index}'):
            img_dapi = io.imread(f'{subdir}/{s_sample}/{s_index}')
        elif os.path.exists(f'{subdir}/{s_sample}-Scene-{s_scene}/{s_index}'):
            img_dapi = io.imread(f'{subdir}/{s_sample}-Scene-{s_scene}/{s_index}')
        else:
            print('no DAPI found')
            # placeholder image so the remaining steps still run
            img_dapi = np.zeros([2,2])
        mask = img_dapi > dapi_thresh
        mask_small = morphology.remove_small_objects(mask, min_size=100)
        mask_closed = morphology.binary_closing(mask_small, morphology.octagon(i_close,i_close//2))
        mask_filled = morphology.remove_small_holes(mask_closed, i_fill)
        mask_smaller = morphology.remove_small_objects(mask, min_size=i_small)
        mask_hull = morphology.convex_hull_image(mask_smaller)
        # only the first and last of the four values returned by mask_border are bound to names
        border_mask, __, __,distances = mask_border(mask_filled,type='inner',pixel_distance = i_pixel)
        img = np.zeros(border_mask.shape,dtype='uint8')
        img[border_mask] = 255
        io.imsave(f"{segdir}/TissueEdgeMask{i_pixel}_{s_sample}_scene{s_scene}.png", img)
def cell_distances(df_xy,s_scene,distances):
    '''
    look up a per-pixel distance value at the centroid of every cell of a scene

    df_xy: dataframe indexed by cell name with columns DAPI_X and DAPI_Y
    s_scene: scene name; '-Scene-' is replaced with '_scene' before it is
             matched as a literal substring of the index
    distances: 2D array indexed as distances[DAPI_Y, DAPI_X]
    returns a copy of the rows of that scene with DAPI_X and DAPI_Y cast to
    int64 and a new column pixel_dist

    the cast is done on the returned copy only; earlier the columns of the
    caller's df_xy were truncated to integers as a side effect
    '''
    print(f'Calculating distances for Scene {s_scene}')
    s_match = s_scene.replace('-Scene-','_scene')
    # regex=False: scene names are plain text, not patterns
    df_scene = df_xy[df_xy.index.str.contains(s_match, regex=False)].copy()
    df_scene['DAPI_Y'] = df_scene.DAPI_Y.astype('int64')
    df_scene['DAPI_X'] = df_scene.DAPI_X.astype('int64')
    df_scene['pixel_dist'] = distances[df_scene.DAPI_Y.to_numpy(),df_scene.DAPI_X.to_numpy()]
    return(df_scene)


def cell_coords():
    '''
    TBD: find cell coordinate within a mask

    NOTE(review): unfinished placeholder - ls_scene, points and coords are
    not defined inside this function, so a call fails unless they exist as
    module level names; confirm before use
    '''
    for s_scene in ls_scene:
        #old (use if you have coordinates, not labels)
        #mask_gray = mask#[:,:,0]
        #contour = skimage.measure.find_contours(mask_gray,0)
        #coords = skimage.measure.approximate_polygon(contour[0], tolerance=5)
        #fig,ax=plt.subplots()
        #ax.imshow(mask_gray)
        #ax.plot(coords[:, 1], coords[:, 0], '-r', linewidth=2)
        #fig.savefig(f'TissueEdgeMask_{s_sample}_Scene-{s_scene}_polygon.png')
        #x = np.array(df_scene.DAPI_X.astype('int').values)
        #y = np.array(df_scene.DAPI_Y.astype('int').values)
        #points = np.array((y,x)).T
        mask = skimage.measure.points_in_poly(points, coords)
def proliferation(df_data,ls_prolif):
    '''
    flag proliferating cells and add a 'proliferation' cell type column

    df_data: dataframe with one row per cell; modified in place
    ls_prolif: names of the columns that mark proliferation
    a cell is 'prolif' when any of the ls_prolif columns is true and
    'nonprolif' otherwise; analyze.add_celltype then adds 'proliferation'
    returns df_data
    '''
    #proliferation
    se_any = df_data.loc[:,ls_prolif].any(axis=1)
    df_data['prolif'] = se_any
    df_data['nonprolif'] = ~se_any
    #add proliferation
    analyze.add_celltype(df_data, ['prolif','nonprolif'], 'proliferation')
    return(df_data)
def immune_functional(df_data,ls_immune_functional):
    '''
    add immune functional state combinations and the 'FuncImmune' column

    df_data: dataframe with one row per cell and an ImmuneType column; the
             ls_immune_functional columns are renamed in place to the text
             before their first underscore
    ls_immune_functional: names of the functional marker columns
    returns a new dataframe: df_data merged with the marker combinations and
    with the combinations gated on each non-missing ImmuneType
    '''
    #Immune functional states
    ls_short = [item.split('_')[0] for item in ls_immune_functional]
    df_data.rename(dict(zip(ls_immune_functional,ls_short)),axis=1,inplace=True)
    df_func = analyze.combinations(df_data,ls_short)
    df_data = df_data.merge(df_func,how='left', left_index=True, right_index=True, suffixes = ('_all',''))
    #gated combinations: immune type plus fuctional status
    ls_gate = sorted(df_data[~df_data.ImmuneType.isna()].loc[:,'ImmuneType'].unique())
    ls_marker = df_func.columns.tolist()
    df_gate_counts = analyze.gated_combinations(df_data,ls_gate,ls_marker)
    df_data = df_data.merge(df_gate_counts, how='left', left_index=True, right_index=True,suffixes = ('_all',''))
    #add FuncImmune
    ls_cell_names = df_gate_counts.columns.tolist()
    s_type_name ='FuncImmune'
    analyze.add_celltype(df_data, ls_cell_names, s_type_name)
    return(df_data)

########################################
#CellProlif combinations, main cell types and proliferation
######################################
def cell_prolif(df_data, s_gate='celltype',ls_combo=None):
    '''
    combine a gating column with proliferation status into 'CellProlif'

    df_data: dataframe with one row per cell
    s_gate: column whose unique values are used as gates
    ls_combo: columns combined with every gate; None (the default) stands
              for ['prolif','nonprolif'], built fresh on each call so the
              default list is never shared between calls
    returns df_data merged with the gated combination columns
    '''
    if ls_combo is None:
        ls_combo = ['prolif','nonprolif']
    ls_gate = df_data.loc[:,s_gate].unique().tolist()
    df_gate_counts2 = analyze.gated_combinations(df_data,ls_gate,ls_combo)
    df_data = df_data.merge(df_gate_counts2, how='left', left_index=True, right_index=True,suffixes = ('_all',''))
    #add CellProlif
    # names come from the gated table; the hard-coded list that used to
    # precede this line was overwritten before use and has been dropped
    ls_cell_names = df_gate_counts2.columns.tolist()
    s_type_name = 'CellProlif'
    analyze.add_celltype(df_data, ls_cell_names, s_type_name)
    return(df_data)
def celltype_gates(df_data,ls_gate,s_new_name,s_celltype):
    '''
    multipurpose gating of marker combinations within one cell type

    df_data: dataframe with one row per cell and a celltype column; the
             selected gate columns are renamed in place to the text before
             their first underscore
    ls_gate: marker columns; only boolean columns whose name is in ls_gate,
             or equals the part of a ls_gate entry before its first
             underscore, are used
    s_new_name: name of the column that add_celltype creates
    s_celltype: cell type to keep; all other rows get NaN in s_new_name and
                the kept values are prefixed with '{s_celltype}_'
    returns df_data merged with the combination columns
    '''
    ls_short_all = [item.split('_')[0] for item in ls_gate]
    ls_gate = df_data.columns[(df_data.dtypes=='bool') & (df_data.columns.isin(ls_gate) | df_data.columns.isin(ls_short_all))].tolist()
    #rename
    ls_marker = [item.split('_')[0] for item in ls_gate]
    df_data.rename(dict(zip(ls_gate,ls_marker)),axis=1,inplace=True)
    #functional states (don't forget to merge!)
    df_func = analyze.combinations(df_data,ls_marker)
    df_data = df_data.merge(df_func,how='left', left_index=True, right_index=True, suffixes = ('_all',''))
    ls_cell_names = df_func.columns.tolist()
    analyze.add_celltype(df_data, ls_cell_names, s_new_name)
    #change cells of any other type to NA
    df_data.loc[df_data[df_data.celltype != s_celltype].index,s_new_name] = np.nan
    df_data[s_new_name] = df_data.loc[:,s_new_name].replace(dict(zip(ls_cell_names,[f'{s_celltype}_{item}' for item in ls_cell_names])))
    return(df_data)


def non_tumor(df_data):
    '''
    add label columns that describe all non-tumor cells

    df_data: dataframe with one row per cell and a celltype column; modified
             in place and returned
    NonTumor is added when ImmuneType and StromalType both exist:
      'endothelial' for endothelial cells, ImmuneType for immune cells,
      StromalType for stromal cells, NaN for tumor cells
    NonTumorFunc is added when FuncImmune, CellProlif and StromalType all
    exist: CellProlif for endothelial cells, FuncImmune for immune cells,
      StromalType for stromal cells, NaN for tumor cells
    (StromalType was read without being checked before, which raised a
    KeyError when it was missing)
    '''
    a_celltype = df_data.celltype.to_numpy()
    b_endothelial = a_celltype == 'endothelial'
    b_immune = a_celltype == 'immune'
    b_stroma = a_celltype == 'stromal'
    b_tumor = a_celltype == 'tumor'

    def start_column(s_name):
        # object array so text labels never hit a float column; rows of any
        # other cell type keep an existing value or stay NaN
        if s_name in df_data.columns:
            return(df_data[s_name].to_numpy(dtype=object).copy())
        return(np.full(len(df_data), np.nan, dtype=object))

    def values_of(s_column, b_rows):
        return(df_data[s_column].to_numpy(dtype=object)[b_rows])

    if df_data.columns.isin(['ImmuneType','StromalType']).sum() == 2:
        #fewer cell tpyes
        a_label = start_column('NonTumor')
        a_label[b_endothelial] = 'endothelial'
        a_label[b_immune] = values_of('ImmuneType', b_immune)
        a_label[b_stroma] = values_of('StromalType', b_stroma)
        a_label[b_tumor] = np.nan
        df_data['NonTumor'] = a_label

    if df_data.columns.isin(['FuncImmune','CellProlif','StromalType']).sum() == 3:
        #more cell types
        a_label = start_column('NonTumorFunc')
        a_label[b_endothelial] = values_of('CellProlif', b_endothelial)
        a_label[b_immune] = values_of('FuncImmune', b_immune)
        a_label[b_stroma] = values_of('StromalType', b_stroma)
        a_label[b_tumor] = np.nan
        df_data['NonTumorFunc'] = a_label
    return(df_data)
def get_df(
        s_folder_regex="^SlideName.*_Features$",
        es_value_label = {"MeanIntensity","CentroidX","CentroidY"},
    ):
    '''
    input:
        s_folder_regex: regular expression; every entry of the current
            working directory whose name matches is treated as one run folder.
        es_value_label: set of column labels (features of interest) to
            extract from the segmentation files, e.g. "MeanIntensity".

        the segmentation files have in the "Label" column the
        "cell serial number" (cell) and in other columns the features.
        inside a run folder, only files whose name starts with "Scene"
        are read, e.g.:

        {run folder}
          |+ Scene 000 - Nuclei - CD32.txt (scene, location and protein)
          |+ Scene 000 - Location - ProteinName.txt

    output:
        one tab separated value file per run and value label, written
        next to the run folder:
            features_{run}_{value label}.tsv
        rows: scene_cell, columns: protein_location.

        dd_run: dictionary of dictionaries (scene_cell -> protein -> value)
            of the last run and value label processed.
            empty dictionary if no folder matched.

    run:
        import getdata
        getdata.get_df(s_folder_regex='^BM-Her2N75.*_Features$', es_value_label={'MeanIntensity'})

    description:
        function to extract dataframe like files of features of interest
        from segmentation files.
        the function works relative to the current working directory and
        changes into each run folder while processing it.
    '''
    # sys is only needed for the error exits; imported here because the
    # module level imports do not provide it.
    import sys

    # defined up front so that the return works when no folder matches
    dd_run = {}

    # for each value label of interest (such as MeanIntensity)
    for s_value_label in es_value_label:

        # for each run (such as folder BM-Her2N75-15_2017-08-07_Features)
        for s_dir in os.listdir():
            if not re.search(s_folder_regex, s_dir):
                continue
            print(f"\nprocess {s_value_label} run: {s_dir}")
            # extract run label from dir name
            s_run = f"features_{s_dir.split('_')[0]}"
            # get empty run dictionary
            dd_run = {}
            # enter the run directory
            os.chdir(s_dir)
            try:
                # for each data file
                for s_file in os.listdir():
                    if not re.search("^Scene", s_file):
                        continue
                    print(f"process {s_value_label} file: {s_file} ...")
                    # extract scene from file name
                    ls_file = [s_splinter.strip() for s_splinter in s_file.split("-")]
                    s_scene = re.sub("[^0-9a-zA-Z]", "", ls_file[0].lower())  # keep alphanumeric characters only
                    # extract protein from file name
                    if (len(ls_file) < 3):
                        s_protein = f"{ls_file[1].split('.')[0]}"  # this is dapi
                    else:
                        s_protein = f"{ls_file[2].split('.')[0]}_{ls_file[1]}"  # others

                    # for each datarow in file
                    b_header = False  # header row inside file not yet found
                    with open(s_file, newline='') as f_csv:
                        o_reader = csv.reader(f_csv, delimiter=' ', quotechar='"')
                        for ls_row in o_reader:
                            if not ls_row:
                                # blank line (e.g. at the end of the file)
                                continue
                            if not b_header:
                                # extract column position of cell label and value of interest
                                i_xcell = ls_row.index("Label")
                                i_xvalue = ls_row.index(s_value_label)
                                b_header = True
                                continue
                            # extract cell label (zero padded to 5 digits) and data value
                            s_cell = ls_row[i_xcell].rjust(5, '0')
                            o_value = ls_row[i_xvalue]
                            # one scene_cell dictionary per dataframe row
                            s_scene_cell = f"{s_scene}_cell{s_cell}"
                            d_scene_cell = dd_run.setdefault(s_scene_cell, {})
                            # each row and column combination may be populated only once
                            if s_protein in d_scene_cell:
                                o_there = d_scene_cell[s_protein]
                                sys.exit(f"Error @ getDataframe : in run {s_run} code tries to populate dataframe row {s_scene_cell} column {s_protein} with a secound time (there:{o_there} new:{o_value}). this should never happen. code is messed up.")
                            d_scene_cell[s_protein] = o_value

                # write run dictionary of dictionaries into dataframe like file
                b_header = False
                s_file_output = f"../{s_run}_{s_value_label}.tsv"
                print(f"write file: {s_file_output}")
                with open(s_file_output, 'w', newline='') as f:
                    for s_scene_cell in sorted(dd_run):
                        ls_datarow = [s_scene_cell]
                        # handle protein column label row
                        # (columns are taken from the first row written)
                        if not (b_header):
                            ls_protein = sorted(dd_run[s_scene_cell])
                            print(ls_protein)
                            f.write("\t" + "\t".join(ls_protein) + "\n")
                            b_header = True
                        # handle data row
                        for s_protein in ls_protein:
                            o_value = dd_run[s_scene_cell][s_protein]
                            ls_datarow.append(o_value)
                        f.write("\t".join(ls_datarow) + "\n")
                        # sanity check
                        if (len(ls_protein) != (len(ls_datarow) -1)):
                            sys.exit(f"Error @ getDataframe : at {s_scene_cell} there are {len(ls_datarow) - len(ls_protein) -1} more proteins then in the aready writen rows")
            finally:
                # jump back to the data path, also when an error occurred
                os.chdir("..")

    return(dd_run)
def slide_up(a):
    """
    input:
        a: numpy array

    output:
        a: input numpy array shifted one row up.
            top row gets deleted,
            bottom row of zeros is inserted.

    description:
        inspired by np.roll function, though elements that roll
        beyond the last position are not re-introduced at the first.
    """
    # insert position is the row count: np.insert(a, -1, ...) would put
    # the zeros before the last row and leave the last row in place.
    a = np.delete(np.insert(a, np.shape(a)[0], 0, axis=0), 0, axis=0)
    return(a)


def slide_down(a):
    """
    input:
        a: numpy array

    output:
        a: input numpy array shifted one row down.
            top row of zeros is inserted,
            bottom row gets deleted.

    description:
        inspired by np.roll function, though elements that roll
        beyond the last position are not re-introduced at the first.
    """
    a = np.delete(np.insert(a, 0, 0, axis=0), -1, axis=0)
    return(a)


def slide_left(a):
    """
    input:
        a: numpy array

    output:
        a: input numpy array shifted one column left.
            left most column gets deleted,
            right most a column of zeros is inserted.

    description:
        inspired by np.roll function, though elements that roll
        beyond the last position are not re-introduced at the first.
    """
    # insert position is the column count, see slide_up for the reason.
    a = np.delete(np.insert(a, np.shape(a)[1], 0, axis=1), 0, axis=1)
    return(a)


def slide_right(a):
    """
    input:
        a: numpy array

    output:
        a: input numpy array shifted one column right.
            left most a column of zeros is inserted,
            right most column gets deleted.

    description:
        inspired by np.roll function, though elements that roll
        beyond the last position are not re-introduced at the first.
    """
    a = np.delete(np.insert(a, 0, 0, axis=1), -1, axis=1)
    return(a)


def slide_upleft(a):
    """
    input:
        a: numpy array

    output:
        a: input numpy array shifted one row up and one column left.

    description:
        inspired by np.roll function.
    """
    a = slide_left(slide_up(a))
    return(a)


def slide_upright(a):
    """
    input:
        a: numpy array

    output:
        a: input numpy array shifted one row up and one column right.

    description:
        inspired by np.roll function.
    """
    a = slide_right(slide_up(a))
    return(a)


def slide_downleft(a):
    """
    input:
        a: numpy array

    output:
        a: input numpy array shifted one row down and one column left.

    description:
        inspired by np.roll function.
    """
    a = slide_left(slide_down(a))
    return(a)


def slide_downright(a):
    """
    input:
        a: numpy array

    output:
        a: input numpy array shifted one row down and one column right.

    description:
        inspired by np.roll function.
    """
    a = slide_right(slide_down(a))
    return(a)
def collision(ai_basin, i_step_size=1):
    """
    input:
        ai_basin: numpy array representing a cells basin file.
            it is assumed that basin borders are represented by 0 values,
            and basins are represented with any values different from 0.
            ai_basin = skimage.io.imread("cells_basins.tif")

        i_step_size: integer that specifies the distance from a basin
            where collisions with other basins are detected.
            increasing the step size beyond 1 will result in faster processing
            but less certain results. step size < 1 makes no sense.
            default step size is 1.

    output:
        eti_collision: a set of tuples representing colliding basins.

    description:
        algorithm to detect which basins collide a given step size away.
        the basin array is shifted i_step_size pixels in each of the eight
        directions; wherever a shifted non-zero label lands on a different
        non-zero label, the pair (shifted label, resident label) is recorded.
    """
    eti_collision = set()
    # the union of all directions is independent of the iteration order
    for o_slide in (slide_up, slide_down, slide_left, slide_right, slide_upleft, slide_upright, slide_downleft, slide_downright):
        ai_walk = ai_basin.copy()
        for _ in range(i_step_size):
            ai_walk = o_slide(ai_walk)
        # pixels where both the resident and the shifted array carry a label
        ab_overlap = (ai_basin != 0) & (ai_walk != 0)
        ai_alice = ai_walk[ab_overlap]
        ai_bob = ai_basin[ab_overlap]
        ab_differ = ai_alice != ai_bob
        eti_collision.update(zip(ai_alice[ab_differ], ai_bob[ab_differ]))
    # return
    return(eti_collision)


def grow(ai_basin, i_step=1):
    """
    input:
        ai_basin: numpy array representing a cells basin file.
            it is assumed that basin borders are represented by 0 values,
            and basins are represented with any values different from 0.
            ai_basin = skimage.io.imread("cells_basins.tif")

        i_step: integer which specifies how many pixels the basin
            to each direction should grow

    output:
        ai_grown: numpy array with the grown basins

    description:
        algorithm to grow the basins in a given basin numpy array.
        growing happens counterclockwise: when several basins could claim
        the same empty pixel, the first direction in the order
        up, upleft, left, downleft, down, downright, right, upright wins.
        pixels that already carry a label are never overwritten.
    """
    ai_grown = ai_basin.copy()
    for _ in range(i_step):
        # every step starts from the result of the previous step;
        # starting from ai_basin each time made i_step > 1 a no-op.
        ai_base = ai_grown.copy()
        # a tuple, not a set: the order decides which basin claims a contested pixel
        for o_slide in (slide_up, slide_upleft, slide_left, slide_downleft, slide_down, slide_downright, slide_right, slide_upright):
            ai_evolve = o_slide(ai_base)
            # claim only pixels that are still empty
            ab_claim = (ai_grown == 0) & (ai_evolve != 0)
            ai_grown[ab_claim] = ai_evolve[ab_claim]
    # output
    return(ai_grown)
def detouch2df(deo_abc, ls_column=["cell_center","cell_touch"]):
    """
    input:
        deo_abc: touching_cells generated dictionary
            (key: basin, value: set of touching basins)
        ls_column: future dictionary_key dictionary_value column name
            (the default list is only read, never modified)

    output:
        df_touch: dataframe which contains the same information
            as the input dictionary.

    description:
        transforms the touching dictionary into a two column dataframe.
        one row per key and touching value pair;
        keys with an empty value set get one row with the value 0.
    """
    lo_key_total = []
    lo_value_total = []
    for o_key, eo_value in deo_abc.items():
        # sort numerically when possible, otherwise by natural order
        try:
            lo_value = sorted(eo_value, key=int)
        except ValueError:
            lo_value = sorted(eo_value)
        # extract from dictionary
        if lo_value:
            lo_key_total.extend([o_key] * len(lo_value))
            lo_value_total.extend(lo_value)
        else:
            lo_key_total.append(o_key)
            lo_value_total.append(0)
    # generate dataframe
    df_touch = pd.DataFrame([lo_key_total,lo_value_total], index=ls_column).T
    return(df_touch)


def imgfuse(laaai_in):
    """
    input:
        laaai_in: list of 3 channel (RGB) images as numpy arrays,
            all of the same shape.

    output:
        aaai_out: fused 3 channel image (integer numpy array)

    description:
        code to fuse many RGB images into one.
        each output value is the mean over the non-zero input values
        at that position, truncated to an integer;
        positions where every input is 0 stay 0.
        exits with an error message if the input shapes differ.
    """
    # sys is only needed for the error exit; imported here because the
    # module level imports do not provide it.
    import sys

    if (len(laaai_in) == 0):
        raise ValueError("imgfuse needs at least one input image.")

    # check shape
    ti_shape = None
    for aaai_in in laaai_in:
        if (ti_shape is None):
            ti_shape = aaai_in.shape
        elif (aaai_in.shape != ti_shape):
            sys.exit(f"Error: input images have not the same shape. {aaai_in.shape} != {ti_shape}.")

    # fuse images: mean of the non-zero values per position
    a_stack = np.stack(laaai_in)
    ab_signal = a_stack != 0
    ai_count = ab_signal.sum(axis=0)
    ar_sum = np.where(ab_signal, a_stack, 0).sum(axis=0, dtype=np.float64)
    # where the count is 0 the sum is 0 too, so dividing by 1 keeps the 0
    ar_mean = ar_sum / np.maximum(ai_count, 1)

    # output
    aaai_out = ar_mean.astype(int)
    return(aaai_out)
[0,0,0,0,3,3,3,0,2,2,2,0,0,0], + [0,0,0,0,3,3,3,0,2,2,2,0,0,0], + [0,0,0,0,3,3,3,0,0,0,0,0,0,0], + [0,0,0,0,0,0,0,0,0,0,0,0,0,0], + [0,0,0,0,0,0,0,0,0,0,0,0,0,0], + [0,0,0,0,0,0,0,0,0,0,0,0,0,0], + ]) + + b = np.array([ + [0,0,0,0,0,0,0,0,0,0,0], + [0,0,0,0,0,0,0,0,0,0,0], + [0,0,0,0,0,0,0,0,0,0,0], + [0,0,0,1,0,0,0,0,0,0,0], + [0,0,0,0,1,2,0,0,0,0,0], + [0,0,0,0,0,1,2,0,0,0,0], + [0,0,0,0,0,0,0,2,0,0,0], + [0,0,0,0,0,0,0,0,0,0,0], + [0,0,0,0,0,0,0,0,0,0,0], + [0,0,0,0,0,0,0,0,0,0,0], + ]) + + c = np.array([ + [0,0,0,0,0,0,0,0,0,0], + [0,0,0,0,0,0,0,0,0,0], + [0,0,0,0,0,0,0,0,0,0], + [0,0,0,0,0,0,0,0,0,0], + [0,0,0,0,1,0,0,0,0,0], + [0,0,0,0,0,1,0,0,0,0], + [0,0,0,0,0,0,0,0,0,0], + [0,0,0,0,0,0,0,0,0,0], + [0,0,0,0,0,0,0,0,0,0], + [0,0,0,0,0,0,0,0,0,0], + ]) + + # run get_border + print("\nborderwall_tm") + print(a) + print(get_border(a)) + #plt.imshow(get_border(a_tiff)) + + # run grow + ''' + print("\ngrow") + print(c) + print(grow(c)) + print(grow(grow(c))) + print(grow(c, i_step_size=2)) + print(b) + print(grow(b)) + print(grow(grow(b))) + print(grow(b, i_step_size=2)) + ''' + + # run collision + ''' + print("\ncollision") + print(c) + print(collision(c)) + print(b) + print(collision(b)) + print(c) + print(collision(c)) + ''' + + # run touching_cells + print("\ntouch") + #print(a) + print(touching_cells(a, i_border_width=0)) + print(touching_cells(a, i_border_width=1)) + print(touching_cells(a, i_border_width=2)) + print(touching_cells(a, i_border_width=3)) + print(touching_cells(a, i_border_width=4)) + print(touching_cells(a, i_border_width=4, i_step_size=2)) + #touching_cells(a_tiff, i_border_width=1) + + + # img fuse + aaai_1 = np.array([ + [[1,1,1],[2,2,2],[3,3,3]], + [[0,0,0,],[0,0,0],[0,0,0]], + [[0,0,0],[0,0,0],[0,0,0]], + ]) + aaai_2 = np.array([ + [[0,0,0,],[0,0,0],[0,0,0]], + [[1,1,1],[2,2,2],[3,3,3]], + [[0,0,0],[0,0,0],[0,0,0]], + ]) + aaai_3 = np.array([ + [[0,0,0,],[0,0,0],[0,0,0]], + [[0,0,0],[0,0,0],[0,0,0]], + [[1,1,1],[2,2,2],[3,3,3]], + ]) + 
def get_exposure_sample(s_sample,df_img):
    """
    return a dataframe with all exposure times for a sample (slide)

    input:
        s_sample: sample (slide) name; rows of df_img whose index contains
            this string (pandas str.contains, so regular expression
            syntax applies) are processed.
        df_img: dataframe whose index holds the image file names.

    output:
        df_exposure: one row per image (index: image file name),
            one column per exposure time value returned by get_exposure.
            empty dataframe if no image matches.
    """
    # make dataframe of exposure time metadata
    df_sample = df_img[df_img.index.str.contains(s_sample)]
    lse_times = []
    for s_image in df_sample.index:
        print(s_image)
        li_exposure, s_meta = get_exposure(s_image)
        lse_times.append(pd.Series(li_exposure,name=s_image))
    # build the dataframe in one go: DataFrame.append was removed in pandas 2.0
    df_exposure = pd.DataFrame(lse_times)
    return(df_exposure)
def exposure_times_slide(df_img,codedir,czidir):
    """
    export exposure times, choosing the export routine by scene count.

    input:
        df_img: dataframe with a 'scene' column.
        codedir, czidir: passed through unchanged to the export routine.

    description:
        exactly one unique scene: exposure_times is called.
        more than one unique scene: exposure_times_scenes is called
        with s_end='.czi'.
        no scene at all: nothing happens.
    """
    i_scene = len(df_img.scene.unique())
    if i_scene > 1:
        exposure_times_scenes(df_img,codedir,czidir,s_end='.czi')
    elif i_scene == 1:
        exposure_times(df_img,codedir,czidir)
b/mplex_image/mics.py new file mode 100755 index 0000000..d16b479 --- /dev/null +++ b/mplex_image/mics.py @@ -0,0 +1,581 @@ +# wrapper functions for codex image processing + +from mplex_image import preprocess, mpimage, getdata, process, analyze, cmif, features, ometiff +import os +import pandas as pd +import math +import skimage +from skimage import io, filters +import re +import numpy as np +import json +from skimage.util import img_as_uint +import tifffile + +def parse_processed(): + ''' + parse the file names of processed Macsima images + ''' + df_img = mpimage.filename_dataframe(s_end ="ome.tif",s_start='R',s_split='___') + #standardize dapi naming + ls_dapi_index = df_img[df_img.index.str.contains('DAPI')].index.tolist() + d_replace = dict(zip(ls_dapi_index, [item.replace('DAPIV0','DAPI__DAPIV0') for item in ls_dapi_index])) + df_img['data'] = df_img.data.replace(d_replace) + #standardize AF naming + ls_dapi_index = df_img[df_img.index.str.contains('autofluorescence')].index.tolist() + d_replace = dict(zip(ls_dapi_index, [item.replace('autofluorescence_FITC','autofluorescence-FITC__FITC') for item in ls_dapi_index])) + df_img['data'] = df_img.data.replace(d_replace) + d_replace = dict(zip(ls_dapi_index, [item.replace('autofluorescence_PE','autofluorescence-PE__PE') for item in ls_dapi_index])) + df_img['data'] = df_img.data.replace(d_replace) + #standardize empty naming + ls_dapi_index = df_img[df_img.index.str.contains('empty')].index.tolist() + d_replace = dict(zip(ls_dapi_index, [item.replace('empty','empty__empty') for item in ls_dapi_index])) + df_img['data'] = df_img.data.replace(d_replace) + df_img['marker'] = [item.split(f"{item.split('_')[3]}_")[-1].split('__')[0] for item in df_img.data] + df_img['cycle'] = [item.split('_')[3] for item in df_img.data] + df_img['rounds'] = [item.split('_')[3].replace('C-','R') for item in df_img.data] + df_img['clone'] = [item.split('__')[1].split('.')[0] for item in df_img.data] + #standardize marker naming + 
def parse_org():
    '''
    parse the file names of copied (name-standardized) Macsima images

    output:
        df_img: dataframe from mpimage.filename_dataframe for the current
            working directory, with the columns marker, cycle, rounds,
            clone, exposure, channel, color, rack, slide, scene,
            slide_scene and path added.
            every added column except path is derived from the 'data'
            column; path is the current working directory joined with
            the dataframe index.
    '''
    s_path = os.getcwd()
    df_img = mpimage.filename_dataframe(s_end ="tif",s_start='R',s_split='___')
    d_color = {'DAPI':'c1', 'FITC':'c2', 'PE':'c3', 'APC':'c4'}

    # one list per new column, in the order the columns are added
    dl_column = {
        'marker': [], 'cycle': [], 'rounds': [], 'clone': [], 'exposure': [],
        'channel': [], 'color': [], 'rack': [], 'slide': [], 'scene': [],
    }
    for s_data in df_img.data:
        ls_field = s_data.split('_')
        s_cycle = ls_field[3]
        # part behind the first double underscore
        s_tail = s_data.split('__')[1]
        s_channel = s_tail.split('_')[0].split('.')[1]
        dl_column['marker'].append(s_data.split(f"{s_cycle}_")[-1].split('__')[0])
        dl_column['cycle'].append(s_cycle)
        dl_column['rounds'].append(s_cycle.replace('C-','R'))
        dl_column['clone'].append(s_tail.split('.')[0])
        dl_column['exposure'].append(int(s_tail.split('_')[1].split('.')[0]))
        dl_column['channel'].append(s_channel)
        dl_column['color'].append(d_color[s_channel])
        dl_column['rack'].append(ls_field[0])
        dl_column['slide'].append(ls_field[1])
        dl_column['scene'].append(ls_field[2])
    for s_column, l_value in dl_column.items():
        df_img[s_column] = l_value

    df_img['slide_scene'] = df_img.slide + '_' + df_img.scene
    df_img['path'] = [f"{s_path}/{s_index}" for s_index in df_img.index]
    return(df_img)
matchedcell{cell_diam} - Cell Segmentation Basins')>-1: + ls_scene.append(s_file.split(f'_{".".join(ls_seg_markers)}')[0]) + d_match.update({s_file.split(f'_{".".join(ls_seg_markers)}')[0]:s_file}) + for s_scene in ls_scene: + os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation') + print(f'processing {s_scene}') + for s_file in os.listdir(): + if s_file.find(s_scene) > -1: + if s_file.find("DAPI.png") > -1: + s_dapi = s_file + dapi = io.imread(f'{segdir}/{s_sample}Cellpose_Segmentation/{s_dapi}') + print(f'loading {s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif') + labels = io.imread(f'{s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif') + cell_labels = io.imread(f'{segdir}/{s_sample}Cellpose_Segmentation/{d_match[s_scene]}') + print(f'loading {d_match[s_scene]}') + #nuclear features + df_feat = features.extract_feat(labels,dapi, properties=(['label'])) + df_feat.columns = [f'{item}_segmented-nuclei' for item in df_feat.columns] + df_feat.index = [f'{s_sample}_cell{item}' for item in df_feat.loc[:,'label_segmented-nuclei']] + + #get subcellular regions + cyto = features.label_difference(labels,cell_labels) + d_loc_nuc = features.subcellular_regions(labels, distance_short=2, distance_long=5) + d_loc_cell = features.subcellular_regions(cell_labels, distance_short=2, distance_long=5) + d_loc = {'nuclei':labels,'cell':cell_labels,'cytoplasm':cyto, + 'nucmem':d_loc_nuc['membrane'][0],'cellmem':d_loc_cell['membrane'][0], + 'perinuc5':d_loc_nuc['ring'][1],'exp5':d_loc_nuc['grown'][1], + 'nucadj2':d_loc_nuc['straddle'][0],'celladj2':d_loc_cell['straddle'][0]} + + #subdir organized by slide or scene + if os.path.exists(f'{regdir}/{s_sample}'): + os.chdir(f'{regdir}/{s_sample}') + elif os.path.exists(f'{regdir}/{s_scene}'): + os.chdir(f'{regdir}/{s_scene}') + else: + os.chdir(f'{regdir}') + df_img = parse_org() + df_img['round_int'] = [int(re.sub('[^0-9]','', item)) for item in df_img.rounds] + df_img = df_img[df_img.round_int < 90] + df_img = 
def combine_labels(s_sample,segdir, subdir, ls_seg_markers, nuc_diam, cell_diam, df_mi_full,s_thresh):
    '''
    - load cell labels; delete cells that were not used for cytoplasm (i.e. ecad neg)
    - nuc labels, expand to perinuc 5 and then cut out the cell labels
    - keep track of cells that are completely covered by another cell (or two or three: counts as touching).

    input:
        s_sample: slide name; selects the rows of df_mi_full and the
            {segdir}/{s_sample}Cellpose_Segmentation folder (falls back to segdir).
        segdir: segmentation directory.
        subdir: not used by this function.
        ls_seg_markers, nuc_diam, cell_diam: used to build the label file names.
        df_mi_full: dataframe with 'slide' and 'scene' columns and a boolean
            column named {s_thresh}_negative; index entries end in _cell{label}.
        s_thresh: name prefix of the negative column.

    output:
        labels: nuclei label array of the last processed scene (None if no scene was processed).
        combine: combined label array of the last processed scene (None if no scene was processed).
        dd_result: dictionary of dictionaries, per scene: combined label -> list of
            nuclei labels found under it that carry a different label.
        side effects: changes the working directory, writes one combined
        basin tif per scene and result_{s_sample}_cellsatop_dictionary.json.
    '''
    se_neg = df_mi_full[df_mi_full.slide == s_sample].loc[:,f'{s_thresh}_negative']
    dd_result = {}
    # defined up front so the return works when no scene is processed
    labels = None
    combine = None
    if os.path.exists(f'{segdir}/{s_sample}Cellpose_Segmentation'):
        os.chdir(f'{segdir}/{s_sample}Cellpose_Segmentation')
    else:
        os.chdir(segdir)
    # scenes present both on disk (DAPI png) and in the dataframe
    ls_scene = []
    for s_file in os.listdir():
        if s_file.find(' - DAPI.png') > -1:
            ls_scene.append(s_file.split(' - DAPI.png')[0])
    ls_scene = sorted(set(df_mi_full[df_mi_full.slide == s_sample].scene) & set(ls_scene))
    for s_scene in ls_scene:
        se_neg_scene = se_neg[se_neg.index.str.contains(s_scene)]

        print(f'Processing combined segmentaiton labels for {s_scene}')
        # a scene without label files is skipped: continuing would reuse
        # the arrays of the previous scene or fail with a NameError.
        if os.path.exists(f'{s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif'):
            scene_labels = io.imread(f'{s_scene} nuclei{nuc_diam} - Nuclei Segmentation Basins.tif')
        else:
            print('no nuclei labels found')
            continue
        if os.path.exists(f'{s_scene} matchedcell{cell_diam} - Cell Segmentation Basins.tif'):
            cell_labels = io.imread(f'{s_scene} matchedcell{cell_diam} - Cell Segmentation Basins.tif')
        elif os.path.exists(f'{s_scene}_{".".join(ls_seg_markers)} matchedcell{cell_diam} - Cell Segmentation Basins.tif'):
            cell_labels = io.imread(f'{s_scene}_{".".join(ls_seg_markers)} matchedcell{cell_diam} - Cell Segmentation Basins.tif')
        elif os.path.exists(f'{s_scene}_{".".join(ls_seg_markers)} nuc{nuc_diam} matchedcell{cell_diam} - Cell Segmentation Basins.tif'):
            cell_labels = io.imread(f'{s_scene}_{".".join(ls_seg_markers)} nuc{nuc_diam} matchedcell{cell_diam} - Cell Segmentation Basins.tif')
        else:
            print('no cell labels found')
            continue
        labels = scene_labels
        #set non-ecad cell labels to zero
        a_zeros = np.array([int(item.split('_cell')[1]) for item in se_neg_scene[se_neg_scene].index]).astype('int64')
        mask = np.isin(cell_labels, a_zeros)
        cell_labels_copy = cell_labels.copy()
        cell_labels_copy[mask] = 0
        #make the nuclei under cells zero
        distance = 5
        perinuc5, labels_exp = features.expand_label(labels,distance=distance)
        labels_exp[cell_labels_copy > 0] = 0
        #combine cells and expanded nuclei
        combine = (labels_exp + cell_labels_copy)
        if s_scene.find('Scene') == 0:
            io.imsave(f'{s_sample}_{s_scene.replace("Scene ","scene")}_cell{cell_diam}_nuc{nuc_diam}_CombinedSegmentationBasins.tif',combine)
        else:
            io.imsave(f'{s_scene}_{".".join(ls_seg_markers)}-cell{cell_diam}_exp{distance}_CellSegmentationBasins.tif',combine)
        #figure out the covered cells...labels + combined
        not_zero_pixels = np.array([labels.ravel() !=0,combine.ravel() !=0]).all(axis=0)
        a_tups = np.array([combine.ravel()[not_zero_pixels],labels.ravel()[not_zero_pixels]]).T #combined over nuclei
        unique_rows = np.unique(a_tups, axis=0)
        new_dict = {}
        for key, value in unique_rows:
            # same label in both arrays: the cell is not covered by another one
            if key == value:
                continue
            new_dict.setdefault(key, []).append(value)
        #from elmar (reformat cells touching dictionary and save
        d_result = {}
        for i_cell, li_touch in new_dict.items():
            d_result.update({str(i_cell): [str(i_touch) for i_touch in li_touch]})
        dd_result.update({f'{s_sample}_{s_scene.replace("Scene ","scene")}':d_result})
    #save dd_touch as json file
    with open(f'result_{s_sample}_cellsatop_dictionary.json','w') as f:
        json.dump(dd_result, f)
    print('')
    return(labels,combine,dd_result)
mpimage.overlay_crop(d_combos,d_crop_scene,df_img,s_dapi,tu_dim) + for s_crop, d_result in dd_result.items(): + for s_type, (ls_marker, array) in d_result.items(): + print(f'Generating multi-page ome-tiff {[item for item in ls_marker]}') + new_array = array[np.newaxis,np.newaxis,:] + s_xml = ometiff.gen_xml(new_array, ls_marker) + with tifffile.TiffWriter(f'{cropdir}/{s_crop}_{s_type}.ome.tif') as tif: + tif.save(new_array, photometric = "minisblack", description=s_xml, metadata = None) + + +#old +def convert_dapi(debugdir,regdir,b_mkdir=True): + ''' + convert dapi to tif, rename to match Guillaumes pipeline requirements + ''' + cwd = os.getcwd() + os.chdir(debugdir) + for s_dir in sorted(os.listdir()): + if s_dir.find('R-1_')== 0: + os.chdir(s_dir) + for s_file in sorted(os.listdir()): + if s_file.find('bleach')==-1: + s_round = s_file.split("Cycle(")[1].split(").ome.tif")[0] + print(f'stain {s_round}') + s_dir_new = s_dir.split('_')[2] + '-Scene-0' + s_dir.split('F-')[1] + s_tissue_dir = s_dir.split('_F-')[0] + if b_mkdir: + preprocess.cmif_mkdir([f'{regdir}/{s_tissue_dir}']) + a_dapi = skimage.io.imread(s_file) + #rename with standard name (no stain !!!!) 
+ with skimage.external.tifffile.TiffWriter(f'{regdir}/{s_tissue_dir}/{s_dir_new}_R{s_round}_DAPI_V0_c1_ORG_5.0.tif') as tif: + tif.save(a_dapi) + os.chdir('..') + os.chdir(cwd) + +def convert_channels(processdir, regdir, b_rename=True, testbool=True): + ''' + convert channels to tif, select one exposure time of three, rename to match Guillaumes pipeline requirements + ''' + cwd = os.getcwd() + os.chdir(processdir) + for s_dir in sorted(os.listdir()): + if s_dir.find('R-1_')== 0: + os.chdir(s_dir) + if b_rename: + d_rename = {'autofluorescencePE_P':'autofluorescencePE_V0_P', + 'autofluorescenceFITC_F':'autofluorescenceFITC_V0_F', + '000_DAPIi':'extra000_DAPIi', + '000_DAPIf':'extra000_DAPIf', + 'extraextraextra':'extra', + 'extraextra':'extra', + '_FITC_':'_c2_ORG_', + '_PE_':'_c3_ORG_',} + preprocess.dchange_fname(d_rename,b_test=testbool) + + #parse file names + else: + ls_column = ['rounds','marker','dilution','fluor','ORG','exposure','expdecimal','imagetype1','imagetype'] + df_img = mpimage.parse_img(s_end =".tif",s_start='0',s_sep1='_',s_sep2='.',ls_column=ls_column,b_test=False) + df_img['exposure'] = df_img.exposure.astype(dtype='int') + ls_marker = sorted(set(df_img.marker)) + for s_marker in ls_marker: + df_marker = df_img[df_img.marker==s_marker] + df_sort = df_marker.sort_values(by=['exposure'],ascending=False,inplace=False) + for idx in range(len(df_sort.index)): + s_index = df_sort.index[idx] + a_img = skimage.io.imread(s_index) + df_file = df_sort.loc[s_index,:] + print(a_img.max()) + if idx < len(df_sort.index) - 1: + if a_img.max() < 65535: + print(f'Selected {df_file.exposure} for {df_file.marker}') + s_dir_new = s_dir.split('_')[2] + '-Scene-0' + s_dir.split('F-')[1] + s_tissue_dir = s_dir.split('_F-')[0] + s_index_new = s_index.split(".ome.tif")[0] + with skimage.external.tifffile.TiffWriter(f'{regdir}/{s_tissue_dir}/{s_dir_new}_R{s_index_new}.tif') as tif: + tif.save(a_img) + break + else: + print('Try lower exposure time') + elif idx == 
len(df_sort.index) - 1: + print(f'Selected as the lowest exposure time {df_file.exposure} for {df_file.marker}') + s_dir_new = s_dir.split('_')[2] + '-Scene-0' + s_dir.split('F-')[1] + s_tissue_dir = s_dir.split('_F-')[0] + s_index_new = s_index.split(".ome.tif")[0] + with skimage.external.tifffile.TiffWriter(f'{regdir}/{s_tissue_dir}/{s_dir_new}_R{s_index_new}.tif') as tif: + tif.save(a_img) + else: + print('/n /n /n /n Error in finding exposure time') + + os.chdir('..') + +def parse_converted(regdir): + ''' + parse the converted miltenyi file names, + regdir contains the images + ''' + s_dir = os.getcwd() + df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='G',s_split='_') + df_img.rename({'data':'scene'},axis=1,inplace=True) + df_img['rounds'] = [item[1] for item in [item.split('_') for item in df_img.index]] + df_img['marker'] = [item[2] for item in [item.split('_') for item in df_img.index]] + df_img['dilution'] = [item[3] for item in [item.split('_') for item in df_img.index]] + df_img['color'] = [item[4] for item in [item.split('_') for item in df_img.index]] + df_img['scene_int'] = [item.split('Scene-')[1] for item in df_img.scene] + df_img['scene_int'] = df_img.scene_int.astype(dtype='int') + df_img['exposure'] = [item[6].split('.')[0] for item in [item.split('_') for item in df_img.index]] + df_img['path'] = [f'{regdir}/{s_dir}/{item}' for item in df_img.index] + df_img['tissue'] = s_dir + return(df_img) + +def parse_converted_dirs(regdir): + ''' + parse the converted miltenyi file names, + regdir is the master folder containing subfolders with ROIs/gates + ''' + os.chdir(regdir) + df_img_all = pd.DataFrame() + for idx, s_dir in enumerate(sorted(os.listdir())): + os.chdir(s_dir) + s_sample = s_dir + print(s_sample) + df_img = parse_converted(s_dir) + df_img_all = df_img_all.append(df_img) + os.chdir('..') + return(df_img_all) + +def count_images(df_img,b_tile_count=True): + """ + count and list slides, scenes, rounds + """ + df_count = 
pd.DataFrame(index=sorted(set(df_img.scene)),columns=sorted(set(df_img.color))) + for s_sample in sorted(set(df_img.tissue)): + print(f'ROI {s_sample}') + df_img_slide = df_img[df_img.tissue==s_sample] + print('tiles') + [print(item) for item in sorted(set(df_img_slide.scene))] + print(f'Number of images = {len(df_img_slide)}') + print(f'Rounds:') + [print(item) for item in sorted(set(df_img_slide.rounds))] + print('\n') + if b_tile_count: + for s_scene in sorted(set(df_img_slide.scene)): + df_img_scene = df_img_slide[df_img_slide.scene==s_scene] + for s_color in sorted(set(df_img_scene.color)): + print(f'{s_scene} {s_color} {len(df_img_scene[df_img_scene.color==s_color])}') + df_count.loc[s_scene,s_color] = len(df_img_scene[df_img_scene.color==s_color]) + return(df_count) + +def visualize_reg_images(regdir,qcdir,color='c1',tu_array=(3,2)): + """ + array registered images to check tissue identity, focus, etc. + """ + #check registration + preprocess.cmif_mkdir([f'{qcdir}/RegisteredImages']) + cwd = os.getcwd() + os.chdir(regdir) + for idx, s_dir in enumerate(sorted(os.listdir())): + os.chdir(s_dir) + s_sample = s_dir + print(s_sample) + df_img = parse_converted(s_dir) + ls_scene = sorted(set(df_img.scene)) + for s_scene in ls_scene: + print(s_scene) + df_img_scene = df_img[df_img.scene == s_scene] + df_img_stain = df_img_scene[df_img_scene.color==color] + df_img_sort = df_img_stain.sort_values(['rounds']) + i_sqrt = math.ceil(math.sqrt(len(df_img_sort))) + #array_img(df_img,s_xlabel='color',ls_ylabel=['rounds','exposure'],s_title='marker',tu_array=(2,4),tu_fig=(10,20)) + if color == 'c1': + fig = mpimage.array_img(df_img_sort,s_xlabel='marker',ls_ylabel=['rounds','exposure'],s_title='rounds',tu_array=tu_array,tu_fig=(16,14)) + else: + fig = mpimage.array_img(df_img_sort,s_xlabel='color',ls_ylabel=['rounds','exposure'],s_title='marker',tu_array=tu_array,tu_fig=(16,12)) + fig.savefig(f'{qcdir}/RegisteredImages/{s_scene}_registered_{color}.png') + os.chdir('..') + 
os.chdir(cwd) + #return(df_img) + +def rename_files(d_rename,dir,b_test=True): + """ + change file names + """ + cwd = os.getcwd() + os.chdir(dir) + for idx, s_dir in enumerate(sorted(os.listdir())): + s_path = f'{dir}/{s_dir}' + os.chdir(s_path) + print(s_dir) + df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='reg',s_split='_') + df_img.rename({'data':'scene'},axis=1,inplace=True) + df_img['rounds'] = [item[1] for item in [item.split('_') for item in df_img.index]] + df_img['color'] = [item[2] for item in [item.split('_') for item in df_img.index]] + df_img['marker'] = [item[3].split('.')[0] for item in [item.split('_') for item in df_img.index]] + if b_test: + print('This is a test') + preprocess.dchange_fname(d_rename,b_test=True) + elif b_test==False: + print('Changing name - not a test') + preprocess.dchange_fname(d_rename,b_test=False) + else: + pass + +def rename_fileorder(s_sample, dir, b_test=True): + """ + change file names + """ + cwd = os.getcwd() + os.chdir(dir) + for idx, s_dir in enumerate(sorted(os.listdir())): + s_path = f'{dir}/{s_dir}' + os.chdir(s_path) + print(s_dir) + df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='Scene',s_split='_') + df_img.rename({'data':'scene'},axis=1,inplace=True) + df_img['rounds'] = [item[1] for item in [item.split('_') for item in df_img.index]] + df_img['color'] = [item[2] for item in [item.split('_') for item in df_img.index]] + df_img['marker'] = [item[3].split('.')[0] for item in [item.split('_') for item in df_img.index]] + for s_index in df_img.index: + s_round = df_img.loc[s_index,'rounds'] + s_scene= f"{s_sample}-{df_img.loc[s_index,'scene']}" + s_marker = df_img.loc[s_index,'marker'] + s_color = df_img.loc[s_index,'color'] + s_index_rename = f'{s_round}_{s_scene}_{s_marker}_{s_color}_ORG.tif' + d_rename = {s_index:s_index_rename} + if b_test: + print('This is a test') + preprocess.dchange_fname(d_rename,b_test=True) + elif b_test==False: + print('Changing name - not a test') + 
preprocess.dchange_fname(d_rename,b_test=False) + else: + pass + + +def copy_files(dir,dapi_copy, marker_copy,testbool=True,type='codex'): + """ + copy and rename files if needed as dummies + need to edit + """ + os.chdir(dir) + for idx, s_dir in enumerate(sorted(os.listdir())): + s_path = f'{dir}/{s_dir}' + os.chdir(s_path) + #s_sample = s_dir.split('-Scene')[0] + df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='Scene',s_split='_') + df_img.rename({'data':'scene'},axis=1,inplace=True) + df_img['rounds'] = [item[1] for item in [item.split('_') for item in df_img.index]] + df_img['color'] = [item[2] for item in [item.split('_') for item in df_img.index]] + df_img['marker'] = [item[3].split('.')[0] for item in [item.split('_') for item in df_img.index]] + print(s_dir) + #if b_test: + for key, dapi_item in dapi_copy.items(): + df_dapi = df_img[(df_img.rounds== key.split('_')[1]) & (df_img.color=='c1')] + s_dapi = df_dapi.loc[:,'marker'][0] + preprocess.copy_dapis(s_r_old=key,s_r_new=f'_cyc{dapi_item}_',s_c_old='_c1_', + s_c_new='_c2_',s_find=f'_c1_{s_dapi}_ORG.tif',b_test=testbool,type=type) + i_count=0 + for idx,(key, item) in enumerate(marker_copy.items()): + preprocess.copy_markers(df_img, s_original=key, ls_copy = item, + i_last_round= dapi_item + i_count, b_test=testbool,type=type) + i_count=i_count + len(item) + +def segmentation_thresholds(regdir,qcdir, d_segment): + """ + visualize binary mask of segmentaiton threholds + need to edit + """ + preprocess.cmif_mkdir([f'{qcdir}/Segmentation']) + os.chdir(regdir) + for idx, s_dir in enumerate(sorted(os.listdir())): + s_path = f'{regdir}/{s_dir}' + os.chdir(s_path) + df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='Scene',s_split='_') + df_img.rename({'data':'scene'},axis=1,inplace=True) + df_img['rounds'] = [item[1] for item in [item.split('_') for item in df_img.index]] + df_img['color'] = [item[2] for item in [item.split('_') for item in df_img.index]] + df_img['marker'] = 
[item[3].split('.')[0] for item in [item.split('_') for item in df_img.index]] + s_sample = s_dir + print(s_sample) + d_seg = preprocess.check_seg_markers(df_img,d_segment, i_rows=1, t_figsize=(6,6)) #few scenes + for key, fig in d_seg.items(): + fig.savefig(f'{qcdir}/Segmentation/{s_dir}_{key}_segmentation.png') + + +def segmentation_inputs(s_sample,regdir,segdir,d_segment,b_start=False): + """ + make inputs for guillaumes segmentation + """ + os.chdir(regdir) + for idx, s_dir in enumerate(sorted(os.listdir())): + s_path = f'{regdir}/{s_dir}' + os.chdir(s_path) + df_img = mpimage.filename_dataframe(s_end = ".tif",s_start='R',s_split='_') + df_img.rename({'data':'rounds'},axis=1,inplace=True) + #df_img['rounds'] = [item[1] for item in [item.split('_') for item in df_img.index]] + df_img['color'] = [item[3] for item in [item.split('_') for item in df_img.index]] + df_img['marker'] = [item[2] for item in [item.split('_') for item in df_img.index]] + #s_sample = s_dir + #s_sample = s_dir.split('-Scene')[0] + print(s_sample) + df_marker = df_img[df_img.color!='c1'] + df_marker = df_marker.sort_values(['rounds','color']) + df_dapi = pd.DataFrame(index = [df_marker.marker.tolist()],columns=['rounds','colors','minimum','maximum','exposure','refexp','location']) + df_dapi['rounds'] = df_marker.loc[:,['rounds']].values + df_dapi['colors'] = df_marker.loc[:,['color']].values + df_dapi['minimum'] = 1003 + df_dapi['maximum'] = 65535 + df_dapi['exposure'] = 100 + df_dapi['refexp'] = 100 + df_dapi['location'] = 'All' + for s_key,i_item in d_segment.items(): + df_dapi.loc[s_key,'minimum'] = i_item + df_dapi.to_csv('RoundsCyclesTable.txt',sep=' ',header=False) + df_dapi.to_csv(f'metadata_{s_sample}_RoundsCyclesTable.csv',header=True) + #create cluster.java file + preprocess.cluster_java(s_dir=f'JE{idx}',s_sample=s_sample,imagedir=f'{s_path}',segmentdir=segdir,type='exacloud',b_segment=True,b_TMA=False) + if b_start: + 
os.chdir(f'/home/groups/graylab_share/Chin_Lab/ChinData/Work/engje/exacloud/JE{idx}') #exacloud + print(f'JE{idx}') + os.system('make_sh') diff --git a/mplex_image/mpimage.py b/mplex_image/mpimage.py new file mode 100755 index 0000000..86746e4 --- /dev/null +++ b/mplex_image/mpimage.py @@ -0,0 +1,817 @@ +#### +# title: mpimage.py +# +# language: Python3.6 +# date: 2019-05-00 +# license: GPL>=v3 +# author: Jenny +# +# description: +# python3 library to display, normalize and crop multiplex images +#### + +#libraries +import matplotlib as mpl +mpl.use('agg') +import matplotlib.pyplot as plt +import numpy as np +import os +import skimage +import pandas as pd +#import bioformats +import re +import shutil +from itertools import chain +import matplotlib.ticker as ticker + +#os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF/') +#from apeer_ometiff_library import omexmlClass + +#functions + + +def parse_img(s_end =".tif",s_start='',s_sep1='_',s_sep2='.',s_exclude='Gandalf',ls_column=['rounds','color','imagetype','scene'],b_test=True): + ''' + required columns: ['rounds','color','imagetype','scene'] + meta names names=['rounds','color','minimum', 'maximum', 'exposure', 'refexp','location'],#'marker', + return = df_img + ''' + ls_file = [] + for file in os.listdir(): + #find all filenames ending in s_end + if file.endswith(s_end): + if file.find(s_start)==0: + if file.find(s_exclude)==-1: + ls_file = ls_file + [file] + + print(f'test {int(1.1)}') + #make a list of list of file name items separated by s_sep + llls_split = [] + for items in [item.split(s_sep1)for item in ls_file]: + llls_split.append([item.split(s_sep2) for item in items]) + + lls_final = [] + for lls_split in llls_split: + lls_final.append(list(chain.from_iterable(lls_split))) + + #make a blank dataframe with the index being the filename + df_img = pd.DataFrame(index=ls_file, columns=ls_column) + if b_test: + print(lls_final[0]) + print(f'Length = {len(lls_final[0])}') + #add a column for 
each part of the name + else: + for fidx, ls_final in enumerate(lls_final): + for idx, s_name in enumerate(ls_final): + df_img.loc[ls_file[fidx],ls_column[idx]] = s_name + print('Mean number of items in file name') + print(np.asarray([(len(item)) for item in lls_final]).mean()) + if (np.asarray([(len(item)) for item in lls_final]).mean()).is_integer()==False: + print([(len(item)) for item in lls_final]) + i_right = np.asarray([(len(item)) for item in lls_final]).max() + for fidx, ls_final in enumerate(lls_final): + if len(ls_final) < i_right: + print(f' inconsitent name: {ls_file[fidx]}') + return(df_img) + +def parse_org(s_end = "ORG.tif",s_start='R',type='reg'): + """ + This function will parse images following koei's naming convention + Example: Registered-R1_PCNA.CD8.PD1.CK19_Her2B-K157-Scene-002_c1_ORG.tif + The output is a dataframe with image filename in index + And rounds, color, imagetype, scene (/tissue), and marker in the columns + type= 'reg' or 'raw' + """ + + ls_file = [] + for file in os.listdir(): + #find all filenames ending in s_end + if file.endswith(s_end): + if file.find(s_start)==0: + ls_file = ls_file + [file] + lls_name = [item.split('_') for item in ls_file] + df_img = pd.DataFrame(index=ls_file) + if type == 'raw': + lls_scene = [item.split('-Scene-') for item in ls_file] + elif type== 'noscenes': + ls_scene = ['Scene-001'] * len(ls_file) + if type == 'raw': + df_img['rounds'] = [item[0] for item in lls_name] + elif type== 'noscenes': + df_img['rounds'] = [item[0] for item in lls_name] + else: + df_img['rounds'] = [item[0].split('Registered-')[1] for item in lls_name] + df_img['color'] = [item[-2] for item in lls_name] + df_img['imagetype'] = [item[-1].split('.tif')[0] for item in lls_name] + if type == 'raw': + df_img['slide'] = [item[2] for item in lls_name] + try: + df_img['scene'] = [item[1].split('_')[0] for item in lls_scene] + except IndexError: + print(f"{set([item[0] for item in lls_scene])}") + elif type == 'noscenes': + 
df_img['slide'] = [item[2] for item in lls_name] + df_img['scene'] = ls_scene + else: + df_img['scene'] = [item[2] for item in lls_name] + df_img['round_ord'] = [re.sub('Q','.5', item) for item in df_img.rounds] + df_img['round_ord'] = [float(re.sub('[^0-9.]','', item)) for item in df_img.round_ord] + df_img = df_img.sort_values(['round_ord','rounds','color']) + for idx, s_round in enumerate(df_img.rounds.unique()): + df_img.loc[df_img.rounds==s_round, 'round_num'] = idx + #parse file name for biomarker + for s_index in df_img.index: + #print(s_index) + s_color = df_img.loc[s_index,'color'] + if s_color == 'c1': + s_marker = 'DAPI' + elif s_color == 'c2': + s_marker = s_index.split('_')[1].split('.')[0] + elif s_color == 'c3': + s_marker = s_index.split('_')[1].split('.')[1] + elif s_color == 'c4': + s_marker = s_index.split('_')[1].split('.')[2] + elif s_color == 'c5': + s_marker = s_index.split('_')[1].split('.')[3] + #these are only included in sardana shading corrected images + elif s_color == 'c6': + s_marker = s_index.split('_')[1].split('.')[2] + elif s_color == 'c7': + s_marker = s_index.split('_')[1].split('.')[3] + else: print('Error') + df_img.loc[s_index,'marker'] = s_marker + + return(df_img) #,lls_name) + +def filename_dataframe(s_end = ".czi",s_start='R',s_split='_'): + ''' + quick and dirty way to select files for dataframe. 
+ s_end = string at end of file names + s_start = string at beginning of filenames + s_split = character/string in all file names + ''' + ls_file = [] + for file in os.listdir(): + #find all filenames ending in 'ORG.tif' + if file.endswith(s_end): + if file.find(s_start)==0: + ls_file = ls_file + [file] + lls_name = [item.split(s_split) for item in ls_file] + df_img = pd.DataFrame(index=ls_file) + df_img['data'] = [item[0] for item in lls_name] + return(df_img) + +def underscore_to_dot(s_sample, s_end='ORG.tif', s_start='R',s_split='_'): + df = filename_dataframe(s_end,s_start,s_split) + ls_old = sorted(set([item.split(f'_{s_sample}')[0] for item in df.index])) + ls_new = sorted(set([item.split(f'_{s_sample}')[0].replace('_','.').replace(f"{df.loc[item,'data']}.",f"{df.loc[item,'data']}_") for item in df.index])) + d_replace = dict(zip(ls_old,ls_new)) + for key, item in d_replace.items(): + if key.split('_')[0] != item.split('_')[0]: + print(f' Error {key} mathced to {item}') + return(d_replace) + +def add_exposure(df_img,df_t,type='roundcycles'): + """ + df_img = dataframe of images with columns [ 'color', 'exposure', 'marker','sub_image','sub_exposure'] + and index with image names + df_t = metadata with dataframe with ['marker','exposure'] + """ + if type == 'roundscycles': + for s_index in df_img.index: + s_marker = df_img.loc[s_index,'marker'] + #look up exposure time for marker in metadata + df_t_image = df_t[(df_t.marker==s_marker)] + if len(df_t_image) > 0: + i_exposure = df_t_image.iloc[0].loc['exposure'] + df_img.loc[s_index,'exposure'] = i_exposure + else: + print(f'{s_marker} has no recorded exposure time') + elif type == 'czi': + #add exposure + df_t['rounds'] = [item.split('_')[0] for item in df_t.index] + #df_t['tissue'] = [item.split('_')[2].split('-Scene')[0] for item in df_t.index] #not cool with stiched + for s_index in df_img.index: + s_tissue = df_img.loc[s_index,'scene'].split('-Scene')[0] + s_color = 
str(int(df_img.loc[s_index,'color'].split('c')[1])-1) + s_round = df_img.loc[s_index,'rounds'] + print(s_index) + df_img.loc[s_index,'exposure'] = df_t[(df_t.index.str.contains(s_tissue)) & (df_t.rounds==s_round)].loc[:,s_color][0] + + return(df_img) + +def subtract_images(df_img,d_channel={'c2':'L488','c3':'L555','c4':'L647','c5':'L750'},ls_exclude=[],subdir='SubtractedRegisteredImages',b_8bit=True):#b_mkdir=True, + """ + This code loads 16 bit grayscale tiffs, performs AF subtraction of channels/rounds defined by the user, and outputs 8 bit AF subtracted tiffs for visualization. + The data required is: + 1. The RoundsCyclesTable with real exposure times + 2. dataframe of images to process (df_img); can be created with any custom parsing function + df_img = dataframe of images with columns [ 'color', 'exposure', 'marker'] + and index with image names + d_channel = dictionary mapping color to marker to subtract + ls_exclude = lost of markers not needing subtraction + """ + #generate dataframe of subtraction markers + es_subtract = set() + for s_key, s_value in d_channel.items(): + es_subtract.add(s_value) + print(f'Subtracting {s_value} for all {s_key}') + + df_subtract = pd.DataFrame() + for s_subtract in sorted(es_subtract): + se_subtract = df_img[df_img.marker==s_subtract] + df_subtract = df_subtract.append(se_subtract) + print(f'The background images {df_subtract.index.tolist}') + print(f'The background markers {df_subtract.marker.tolist}') + + #generate dataframe of how subtraction is set up + #set of markers minus the subtraction markers + es_markers = set(df_img.marker) - es_subtract + #dataframe of markers + df_markers = df_img[df_img.loc[:,'marker'].isin(sorted(es_markers))] + #minus dapi (color 1 or DAPI) + #df_markers = df_markers[df_markers.loc[:,'color']!='c1'] + #df_markers = df_markers[~df_markers.loc[:,'marker'].str.contains('DAPI')] + df_copy = df_img[df_img.marker.isin(ls_exclude)] + df_markers = df_markers[~df_markers.marker.isin(ls_exclude)] + + 
for s_file in df_copy.index.tolist(): + print(s_file) + #print(f'copied to ./AFSubtracted/{s_file}') + #shutil.copyfile(s_file,f'./AFSubtracted/{s_file}') + print(f'copied to {subdir}/{s_file}') + shutil.copyfile(s_file,f'{subdir}/{s_file}') + #ls_scene = sorted(set(df_img.scene)) + #add columns with mapping of proper subtracted image to dataframe + + for s_index in df_markers.index.tolist(): + print('add colums') + print(s_index) + s_scene = s_index.split('_')[2] + s_color = df_markers.loc[s_index,'color'] + if len(df_subtract[(df_subtract.color==s_color) & (df_subtract.scene==s_scene)])==0: + print(f'missing {s_color} in {s_scene}') + else: + df_markers.loc[s_index,'sub_image'] = df_subtract[(df_subtract.color==s_color) & (df_subtract.scene==s_scene)].index[0] + df_markers.loc[s_index,'sub_exposure'] = df_subtract[(df_subtract.color==s_color) & (df_subtract.scene==s_scene)].exposure[0] + + #loop to subtract + for s_index in df_markers.index.tolist(): + print(f'Processing {s_index}') + s_image = s_index + s_color = '_' + df_markers.loc[s_index,'color'] + '_' + s_background = df_markers.loc[s_index,'sub_image'] + print(f'From {s_image} subtracting \n {s_background}') + a_img = skimage.io.imread(s_image) + a_AF = skimage.io.imread(s_background) + #divide each image by exposure time + #subtract 1 ms AF from 1 ms signal + #multiply by original image exposure time + a_sub = (a_img/df_markers.loc[s_index,'exposure'] - a_AF/df_markers.loc[s_index,'sub_exposure'])*df_markers.loc[s_index,'exposure'] + a_zero = (a_sub.clip(min=0)).astype(int) #max=a_sub.max() #took out max parameter from np.clip, but it was fine in + if b_8bit: + #a_16bit = skimage.img_as_ubyte(a_zero) + #a_zero = a_sub.clip(min=0,max=a_sub.max()) + a_bit = (a_zero/256).astype(np.uint8) + else: + a_bit = skimage.img_as_uint(a_zero) + s_fname = f'{subdir}/{s_index.split(s_color)[0]}_Sub{df_subtract.loc[df_markers.loc[s_index,"sub_image"],"marker"]}{s_color}{s_index.split(s_color)[1]}' + 
def subtract_scaled_images(df_img, d_late=None, d_early=None, ls_exclude=None,
                           subdir='SubtractedRegisteredImages', b_8bit=False):
    """
    Perform round-scaled autofluorescence (AF) subtraction on grayscale tiffs.

    For every marker image, the early and late AF images of the same scene and
    color are each divided by their exposure time, blended according to where
    the marker's round sits between the early and late AF rounds, subtracted
    from the exposure-normalized marker image, and the result is scaled back to
    the marker's exposure and written to ``subdir``.

    df_img = dataframe of images with columns
        ['color', 'exposure', 'marker', 'round_num', 'scene'] and index with
        image file names
    d_late = dictionary mapping color to the late AF marker name
        (default: R5Qc2 ... R5Qc5)
    d_early = dictionary mapping color to the early AF marker name
        (default: R0c2 ... R0c5)
    ls_exclude = list of markers not needing subtraction; copied unchanged
    subdir = output directory (must already exist)
    b_8bit = if True, output is divided by 256 and saved as uint8

    Returns (df_markers, df_copy): the marker rows annotated with the AF images
    and ratios used, and the rows that were copied without subtraction.
    Markers whose early or late AF image is missing are reported and skipped.
    """
    # None sentinels: dict/list defaults would be shared between calls
    if d_late is None:
        d_late = {'c2': 'R5Qc2', 'c3': 'R5Qc3', 'c4': 'R5Qc4', 'c5': 'R5Qc5'}
    if d_early is None:
        d_early = {'c2': 'R0c2', 'c3': 'R0c3', 'c4': 'R0c4', 'c5': 'R0c5'}
    if ls_exclude is None:
        ls_exclude = []

    #markers minus the subtraction markers & excluded markers
    es_subtract = set(d_early.values()) | set(d_late.values())
    es_markers = set(df_img.marker) - es_subtract
    b_excluded = df_img.marker.isin(ls_exclude)
    df_copy = df_img[b_excluded]
    # explicit copy: the annotation columns below must not be written to a view of df_img
    df_markers = df_img[df_img.marker.isin(es_markers) & ~b_excluded].copy()

    #copy excluded markers
    for s_file in df_copy.index.tolist():
        print(s_file)
        print(f'copied to {subdir}/{s_file}')
        shutil.copyfile(s_file, f'{subdir}/{s_file}')

    #add columns with mapping of proper AF images to marker images
    ls_ready = []  # images for which both AF channels were found
    for s_index in df_markers.index.tolist():
        print('add colums')
        print(s_index)
        s_scene = df_markers.loc[s_index, 'scene']
        s_color = df_markers.loc[s_index, 'color']
        s_early = d_early[s_color]
        s_late = d_late[s_color]
        i_round = df_markers.loc[s_index, 'round_num']
        df_scene = df_img[df_img.scene == s_scene]
        df_early = df_scene[df_scene.marker == s_early]
        df_late = df_scene[df_scene.marker == s_late]
        if len(df_early) == 0:
            print(f' Missing early AF channel for {s_scene} {s_color}')
            continue
        if len(df_late) == 0:
            print(f' Missing late AF channel for {s_scene} {s_color}')
            continue
        # .iloc[0]: the index holds file names, so [0] would be a label lookup
        i_early = df_early.round_num.iloc[0]
        i_late = df_late.round_num.iloc[0]
        i_span = i_late - i_early
        # early and late AF from the same round: weight the early image fully
        r_position = (i_round - i_early) / i_span if i_span != 0 else 0.0
        df_markers.loc[s_index, 'sub_name'] = f'{s_early}{s_late}'
        df_markers.loc[s_index, 'sub_early'] = df_early.index[0]
        df_markers.loc[s_index, 'sub_early_exp'] = df_early.exposure.iloc[0]
        df_markers.loc[s_index, 'sub_late'] = df_late.index[0]
        df_markers.loc[s_index, 'sub_late_exp'] = df_late.exposure.iloc[0]
        df_markers.loc[s_index, 'sub_ratio_late'] = np.clip(r_position, 0, 1)
        df_markers.loc[s_index, 'sub_ratio_early'] = np.clip(1 - r_position, 0, 1)
        ls_ready.append(s_index)

    #loop to subtract
    for s_index in ls_ready:
        print(f'Processing {s_index}')
        s_color = '_' + df_markers.loc[s_index, 'color'] + '_'
        a_img = skimage.io.imread(s_index)
        a_early = skimage.io.imread(df_markers.loc[s_index, 'sub_early'])
        a_late = skimage.io.imread(df_markers.loc[s_index, 'sub_late'])
        #divide each image by exposure time
        a_img_exp = a_img / df_markers.loc[s_index, 'exposure']
        a_early_exp = a_early / df_markers.loc[s_index, 'sub_early_exp']
        a_late_exp = a_late / df_markers.loc[s_index, 'sub_late_exp']
        #combine early and late based on round_num
        a_early_exp = a_early_exp * df_markers.loc[s_index, 'sub_ratio_early']
        a_late_exp = a_late_exp * df_markers.loc[s_index, 'sub_ratio_late']
        #subtract 1 ms AF from 1 ms signal, multiply by original image exposure time
        a_sub = (a_img_exp - a_early_exp - a_late_exp) * df_markers.loc[s_index, 'exposure']
        a_zero = (a_sub.clip(min=0)).astype(int)
        if b_8bit:
            a_bit = (a_zero / 256).astype(np.uint8)
        else:
            # NOTE(review): img_as_uint rescales by dtype range; confirm this is
            # the intended conversion for the platform-int array produced above
            a_bit = skimage.img_as_uint(a_zero)
        ls_name = s_index.split(s_color)
        s_fname = f'{subdir}/{ls_name[0]}_Sub{df_markers.loc[s_index,"sub_name"]}{s_color}{ls_name[1]}'
        skimage.io.imsave(s_fname, a_bit)

    return(df_markers, df_copy)
def gen_xml(array, channel_names):
    '''
    Build OME-XML metadata for a 5-D image array (copied and modified from the
    apeer ome tiff code).

    array = image array; its axes are read as T, Z, C, Y, X in that order
    channel_names = sequence of channel names, at least one per C plane

    Returns the OME-XML document encoded as bytes.
    Raises ValueError if the array is not 5-D or there are too few channel names.
    '''
    shape = array.shape
    # validate before building anything; assert would be stripped under -O
    if len(shape) != 5:
        raise ValueError(f"Expected array of 5 dimensions, got {len(shape)}")
    if len(channel_names) < shape[2]:
        raise ValueError(
            f"Expected {shape[2]} channel names, got {len(channel_names)}")

    #Dimension order is assumed to be TZCYX
    dim_order = "TZCYX"

    metadata = omexmlClass.OMEXML()
    metadata.image().set_Name("IMAGE")
    metadata.image().set_ID("0")

    pixels = metadata.image().Pixels
    pixels.ome_uuid = metadata.uuidStr
    pixels.set_ID("0")

    pixels.channel_count = shape[2]

    pixels.set_SizeT(shape[0])
    pixels.set_SizeZ(shape[1])
    pixels.set_SizeC(shape[2])
    pixels.set_SizeY(shape[3])
    pixels.set_SizeX(shape[4])

    # the DimensionOrder attribute is written as the reverse of the array axis order
    pixels.set_DimensionOrder(dim_order[::-1])

    pixels.set_PixelType(omexmlClass.get_pixel_type(array.dtype))

    for i in range(pixels.SizeC):
        pixels.Channel(i).set_ID("Channel:0:" + str(i))
        pixels.Channel(i).set_Name(channel_names[i])
        pixels.Channel(i).set_SamplesPerPixel(1)

    pixels.populate_TiffData()

    return metadata.to_xml().encode()
def array_roi(df_img,s_column='color',s_row='rounds',s_label='marker',tu_crop=(0,0,100,100),tu_array=(2,4),tu_fig=(10,20), cmap='gray',b_min_label=True,tu_rescale=(0,0)):
    """
    create a grid of cropped images
    df_img = dataframe of images with columns having image attributes
        and index with image names
    s_column = attribute shown as x label (columns of grid)
    s_row = attribute shown as y label (rows of grid)
    s_label = attribute shown as each panel's title
    tu_crop = (upper left corner x, y, x length, y height)
    tu_array = (rows, columns) of the subplot grid
    tu_fig = figure size
    cmap = colormap used to display every panel
    b_min_label = if True, hide x tick labels and show the intensity range
        as the x label instead of s_column
    tu_rescale = (min, max) display range; (0,0) rescales each panel to
        0 - 1.5 x the 98th percentile of the whole image
    returns the matplotlib figure
    """
    # one figure only; the x axis is shared only when tick labels stay visible.
    # squeeze=False keeps ax 2-D so ravel() also works for a 1x1 grid.
    fig, ax = plt.subplots(tu_array[0], tu_array[1], figsize=tu_fig,
                           sharex=not b_min_label, sharey=True, squeeze=False)
    ax = ax.ravel()
    b_auto = tuple(tu_rescale) == (0, 0)
    for ax_num, s_index in enumerate(df_img.index):
        s_row_label = df_img.loc[s_index, s_row]
        s_col_label = df_img.loc[s_index, s_column]
        s_label_img = df_img.loc[s_index, s_label]
        #load image, crop, rescale
        a_image = skimage.io.imread(s_index)
        a_crop = a_image[(tu_crop[1]):(tu_crop[1]+tu_crop[3]), (tu_crop[0]):(tu_crop[0]+tu_crop[2])]
        if b_auto:
            # the upper limit comes from the full image, not the crop
            tu_max = (0, 1.5*np.quantile(a_image, 0.98))
            a_rescale = skimage.exposure.rescale_intensity(a_crop, in_range=tu_max)
            ax[ax_num].imshow(a_rescale, cmap=cmap)
        else:
            print(f'original {a_crop.min()},{a_crop.max()}')
            print(f'rescale to {tu_rescale}')
            a_rescale = skimage.exposure.rescale_intensity(a_crop, in_range=tu_rescale, out_range=tu_rescale)
            tu_max = tu_rescale
            ax[ax_num].imshow(a_rescale, cmap=cmap, vmin=0, vmax=tu_max[1])
        ax[ax_num].set_title(s_label_img)
        ax[ax_num].set_ylabel(s_row_label)
        ax[ax_num].set_xlabel(s_col_label)
        if b_min_label:
            ax[ax_num].set_xticklabels('')
            ax[ax_num].set_xlabel(f'{tu_max[0]} - {int(tu_max[1])}') #min/max
    plt.tight_layout()
    return(fig)
def load_labels(d_crop,segdir,s_find='Nuclei Segmentation Basins'):
    """
    load the segmentation basins (cell or nuclei)
    d_crop = dictionary keyed by scene name; only the keys are used here
    segdir = directory holding the segmentation label images
    s_find = substring identifying the label files, e.g.
        'exp5_CellSegmentationBasins' or 'Nuclei Segmentation Basins'
    returns {scene : label array}; scenes with no matching file are absent
    """
    d_label = {}
    for s_scene in d_crop:
        print(s_scene)
        s_sample = s_scene.split('-Scene-')[0]
        # list segdir directly instead of os.chdir, so the working directory
        # is never changed, even if reading an image raises
        for s_file in sorted(os.listdir(segdir)):
            # a file matches when it contains s_find and the scene name with the sample prefix removed
            if s_find in s_file and s_scene.split(s_sample)[1] in s_file:
                print(f'loading {s_file}')
                d_label[s_scene] = skimage.io.imread(os.path.join(segdir, s_file))
    return(d_label)

def crop_labels(d_crop,d_label,tu_dim,cropdir,s_name='Nuclei Segmentation Basins'):
    """
    crop the segmentation basins (cell or nuclei) to the same coordinates as
    the images, for viewing in Napari
    d_crop = {scene : (x, y)} upper left corner of each crop
    d_label = {scene : label array}, e.g. from load_labels
    tu_dim = (width, height) of the crop
    cropdir = output directory
    s_name = label type; used (without spaces) in the output file name
    """
    for s_scene, xy_cropcoor in d_crop.items():
        print(s_scene)
        i_x, i_y = xy_cropcoor[0], xy_cropcoor[1]
        a_seg = d_label[s_scene]
        # arrays are indexed [row, column], i.e. [y, x]
        a_crop = a_seg[i_y:(i_y + tu_dim[1]), i_x:(i_x + tu_dim[0])]
        s_coor = f'x{i_x}y{i_y}.tif'
        #crop file
        s_file_new = f'{cropdir}/{s_scene}_{s_name.replace(" ","")}{s_coor}'
        print(s_file_new)
        skimage.io.imsave(s_file_new, a_crop)


def fmt(x, pos):
    """
    format x as a one-digit mantissa times a power of ten, in mathtext
    pos = tick position; unused
    """
    a, b = '{:.0e}'.format(x).split('e')
    b = int(b)
    return r'${} \times 10^{{{}}}$'.format(a, b)
(requires 'exposure' column in dataframe + """ + cmap = mpl.colors.LinearSegmentedColormap.from_list('cmap', [(0,0,0),(0,1,0)], N=256, gamma=1.0) + fig, ax = plt.subplots(tu_array[0],tu_array[1],figsize=tu_fig,sharey=True, squeeze=False) # + ax = ax.ravel() + for ax_num, s_index in enumerate(df_img.index): + s_col_label = df_img.loc[s_index,s_label] + #load image, copr, rescale + a_image=skimage.io.imread(s_index) + a_dapi = skimage.io.imread((df_dapi).index[0])# & (df_dapi.rounds=='R1') + a_crop = a_image[(tu_crop[1]):(tu_crop[1]+tu_crop[3]),(tu_crop[0]):(tu_crop[0]+tu_crop[2])] + a_crop_dapi = a_dapi[(tu_crop[1]):(tu_crop[1]+tu_crop[3]),(tu_crop[0]):(tu_crop[0]+tu_crop[2])] + #a_crop_dapi = (a_crop_dapi/255).astype('int') + if i_expnorm > 0: + a_crop = a_crop/df_img.loc[s_index,'exposure']*i_expnorm + if tu_rescale==(0,0): + a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range=(np.quantile(a_crop,0.03),1.5*np.quantile(a_crop,0.998)),out_range=(0, 255)) + tu_max = (np.quantile(a_crop,0.03),1.5*np.quantile(a_crop,0.998)) + else: + #print(f'original {a_crop.min()},{a_crop.max()}') + #print(f'rescale to {tu_rescale}') + a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range = tu_rescale,out_range=(0,255)) + tu_max=tu_rescale + a_rescale_dapi = skimage.exposure.rescale_intensity(a_crop_dapi,in_range = (np.quantile(a_crop_dapi,0.03),2*np.quantile(a_crop_dapi,0.99)),out_range=(0,255)) + a_rescale_dapi = a_rescale_dapi.astype(np.uint8) + a_rescale = a_rescale.astype(np.uint8) + #2 color png + zdh = np.dstack((np.zeros_like(a_rescale), a_rescale, a_rescale_dapi)) + ax[ax_num].imshow(zdh) + ax[ax_num].set_title('') + ax[ax_num].set_ylabel('') + ax[ax_num].set_xlabel(s_col_label,fontsize = 'x-large') + if tu_rescale == (0,0): + if len(ax)>1: + ax[ax_num].set_xlabel(f'{s_col_label} ({int(np.quantile(a_crop,0.03))} - {int(1.5*np.quantile(a_crop,0.998))})') + ax[ax_num].set_xticklabels('') + #pixel to micron (apply after ax is returned) + 
def _overlay_rescale(a_crop, a_ref, s_marker, es_dim, es_bright, low_thresh, high_thresh):
    """Rescale one cropped channel to 0-255; a_ref supplies the quantiles that set the input range."""
    r_low = np.quantile(a_ref, 0.2)
    r_high = 1.5*np.quantile(a_ref, high_thresh)
    if r_high < low_thresh:
        # very low signal: fixed window so background is not stretched to full scale
        in_range = (low_thresh/2, low_thresh)
    elif s_marker in es_dim:
        # dim markers: drop the last two characters of the threshold (0.999 -> 0.9)
        in_range = (r_low, 1.5*np.quantile(a_ref, float(str(high_thresh)[:-2])))
    elif s_marker in es_bright:
        # bright markers: append two nines to the threshold (0.999 -> 0.99999)
        in_range = (r_low, 1.5*np.quantile(a_ref, float(str(high_thresh) + '99')))
    else:
        in_range = (r_low, r_high)
    return skimage.exposure.rescale_intensity(a_crop, in_range=in_range, out_range=(0, 255))

def multicolor_png(df_img,df_dapi,s_scene,d_overlay,d_crop,es_dim=None,es_bright=None,low_thresh=4000,high_thresh=0.999,i_size=800):
    '''
    create RGB images with DAPI (blue) plus up to four marker channels
    df_img = dataframe of marker images with columns ['scene','marker','path']
    df_dapi = dataframe of DAPI images with columns ['scene','path']
    s_scene = scene to render
    d_overlay = {overlay name : list of markers}; markers are colored by list
        position: green, yellow, red, cyan. Markers past the fourth are skipped.
    d_crop = {scene : (x, y)} upper left corner of the crop
    es_dim = markers rescaled with a lower upper quantile
        (default {'CD8','FoxP3','ER','AR'})
    es_bright = markers rescaled with a higher upper quantile
        (default {'Ki67','pHH3'})
    low_thresh = if 1.5 x the upper quantile is below this, a fixed window
        (low_thresh/2, low_thresh) is used instead
    high_thresh = upper quantile used for rescaling
    i_size = side length of the square crop, in pixels
    returns {overlay name : (list of markers, uint8 RGB array)}
    '''
    # None sentinels instead of mutable set defaults
    if es_dim is None:
        es_dim = {'CD8', 'FoxP3', 'ER', 'AR'}
    if es_bright is None:
        es_bright = {'Ki67', 'pHH3'}
    # (red, green, blue) membership for the 1st to 4th marker of an overlay
    ltu_rgb = [(False, True, False), (True, True, False),
               (True, False, False), (False, True, True)]

    d_result = {}
    tu_crop = d_crop[s_scene]
    df_slide = df_img[df_img.scene == s_scene]
    i_row = tu_crop[1]
    i_col = tu_crop[0]
    # .iloc[0]: first matching row regardless of the index labels
    img_dapi = skimage.io.imread(df_dapi[df_dapi.scene == s_scene].path.iloc[0])
    a_crop = img_dapi[i_row:i_row+i_size, i_col:i_col+i_size]
    # DAPI limits come from the whole image; marker limits (below) from the crop
    a_rescale_dapi = _overlay_rescale(a_crop, img_dapi, 'DAPI', es_dim, es_bright, low_thresh, high_thresh)
    a_blank = np.zeros_like(a_rescale_dapi)

    #RGB
    for s_type, ls_marker in d_overlay.items():
        zdh = np.dstack((a_blank, a_blank, a_rescale_dapi))
        for idx, s_marker in enumerate(ls_marker):
            if idx >= len(ltu_rgb):
                print(f'Warning: {s_marker} not drawn, only {len(ltu_rgb)} markers per overlay')
                continue
            s_index = df_slide[df_slide.marker == s_marker].index[0]
            img = skimage.io.imread(df_slide.loc[s_index, 'path'])
            a_crop = img[i_row:i_row+i_size, i_col:i_col+i_size]
            a_rescale = _overlay_rescale(a_crop, a_crop, s_marker, es_dim, es_bright, low_thresh, high_thresh)
            a_zero = np.zeros_like(a_rescale)
            zdh = zdh + np.dstack(tuple(a_rescale if b_on else a_zero for b_on in ltu_rgb[idx]))
        # overlapping channels can sum past 255
        zdh = zdh.clip(0, 255)
        zdh = zdh.astype('uint8')
        d_result.update({s_type: (ls_marker, zdh)})
    return(d_result)
print(f'rescale to {tu_rescale}') + a_rescale = skimage.exposure.rescale_intensity(a_crop,in_range = tu_rescale,out_range=(0,255)) + tu_max=tu_rescale + a_rescale_dapi = skimage.exposure.rescale_intensity(a_crop_dapi,in_range = (np.quantile(a_crop_dapi,0.03),2*np.quantile(a_crop_dapi,0.99)),out_range=(0,255)) + a_rescale_dapi = a_rescale_dapi.astype(np.uint8) + a_rescale = a_rescale.astype(np.uint8) + #white border + s_border_index = df_border[df_border.marker==(df_img.loc[s_index,'marker'])].index[0] + a_border = skimage.io.imread(s_border_index) + a_crop_border = a_border[(tu_crop[1]):(tu_crop[1]+tu_crop[3]),(tu_crop[0]):(tu_crop[0]+tu_crop[2])] + mask = a_crop_border > 250 + #2 color png + zdh = np.dstack((np.zeros_like(a_rescale), a_rescale, a_rescale_dapi)) + zdh[mask] = 255 + #zdh = zdh.clip(0,255) + #zdh = zdh.astype('uint8') + ax[ax_num].imshow(zdh) + ax[ax_num].set_title('') + ax[ax_num].set_ylabel('') + ax[ax_num].set_xlabel(s_col_label,fontsize = 'x-large') + if tu_rescale == (0,0): + if len(ax)>1: + ax[ax_num].set_xlabel(f'{s_col_label} ({int(np.quantile(a_crop,0.03))} - {int(1.5*np.quantile(a_crop,0.998))})') + ax[ax_num].set_xticklabels('') + #pixel to micron (apply after ax is returned) + #ax[0].set_yticklabels([str(int(re.sub(u"\u2212", "-", item.get_text()))*i_micron_per_pixel) for item in ax[0].get_yticklabels(minor=False)]) + plt.suptitle(s_title,y=0.93,size = 'xx-large',weight='bold') + plt.subplots_adjust(wspace=.05, hspace=.05) + # Now adding the colorbar + norm = mpl.colors.Normalize(vmin=tu_max[0],vmax=tu_max[1]) + sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm) + sm.set_array([]) + if len(ax) == 1: + cbaxes = fig.add_axes([.88, 0.125, 0.02, 0.75]) #[left, bottom, width, height] + plt.colorbar(sm, cax = cbaxes) + plt.figtext(0.47,0.03,s_label.replace('_',' '),fontsize = 'x-large', weight='bold') + elif tu_rescale != (0,0): + cbaxes = fig.add_axes([.92, 0.175, 0.02, 0.64]) #[left, bottom, width, height] + plt.colorbar(sm, cax = cbaxes) + 
def aprior(gamma_hat):
    """Return (2*s2 + m**2) / s2 for the mean m and variance s2 of the input."""
    m = gamma_hat.mean()
    s2 = gamma_hat.var()
    return (2 * s2 + m**2) / s2

def bprior(gamma_hat):
    """Return (m*s2 + m**3) / s2 for the mean m and variance s2 of the input."""
    m = gamma_hat.mean()
    s2 = gamma_hat.var()
    return (m*s2 + m**3) / s2

def it_sol(sdat, g_hat, d_hat, g_bar, t2, a, b, conv=0.0001):
    """
    Iterate the posterior mean/variance updates until they stop changing.

    sdat : (n_features, n_samples) DataFrame or array for one batch
    g_hat, d_hat : per-feature starting values for the mean and variance terms
    g_bar, t2, a, b : prior parameters passed on to postmean / postvar
    conv : stop once the largest relative change is no longer above this value

    Returns (g_new, d_new) from the last iteration. At least one iteration is
    always run.
    """
    # number of non-missing samples per feature
    n = (1 - np.isnan(sdat)).sum(axis=1)
    g_old = g_hat.copy()
    d_old = d_hat.copy()

    while True:
        g_new = postmean(g_hat, g_bar, n, d_old, t2)
        # np.asarray so both pandas and plain ndarray inputs work
        g_col = np.asarray(g_new).reshape((-1, 1))
        sum2 = ((sdat - np.dot(g_col, np.ones((1, sdat.shape[1])))) ** 2).sum(axis=1)
        d_new = postvar(sum2, n, a, b)

        change = max((abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max())
        g_old = g_new
        d_old = d_new
        # "not >" (rather than "<=") so a NaN change also ends the loop
        if not change > conv:
            break
    return (g_new, d_new)

def postmean(g_hat, g_bar, n, d_star, t2):
    """Return (t2*n*g_hat + d_star*g_bar) / (t2*n + d_star)."""
    return (t2*n*g_hat + d_star * g_bar) / (t2*n + d_star)

def postvar(sum2, n, a, b):
    """Return (0.5*sum2 + b) / (n/2 + a - 1)."""
    return (0.5 * sum2 + b) / (n / 2.0 + a - 1.0)
def combat(data, batch, model=None, numerical_covariates=None):
    """Correct for batch effects in a dataset
    Parameters
    ----------
    data : pandas.DataFrame
        A (n_features, n_samples) dataframe of the expression or methylation
        data to batch correct
    batch : pandas.Series
        A column corresponding to the batches in the data, with index same as
        the columns that appear in ``data``
    model : pandas.DataFrame, optional
        A model matrix describing metadata on the samples which could be
        causing batch effects. If not provided, then will attempt to coarsely
        correct just from the information provided in ``batch``.
        The caller's dataframe is not modified.
    numerical_covariates : list-like
        List of covariates in the model which are numerical, rather than
        categorical
    Returns
    -------
    corrected : pandas.DataFrame
        A (n_features, n_samples) dataframe of the batch-corrected data
    Raises
    ------
    ValueError
        If ``batch`` does not have one entry per column of ``data``.
    """
    if isinstance(numerical_covariates, str):
        numerical_covariates = [numerical_covariates]
    if numerical_covariates is None:
        numerical_covariates = []

    # fail early with a readable message instead of a shape error inside np.dot
    if len(batch) != data.shape[1]:
        raise ValueError(
            "batch has %i entries but data has %i samples (columns)"
            % (len(batch), data.shape[1]))

    if model is not None and isinstance(model, pd.DataFrame):
        # work on a copy so the caller's dataframe does not gain a 'batch' column
        model = model.copy()
        model["batch"] = list(batch)
    else:
        model = pd.DataFrame({'batch': batch})

    batch_items = model.groupby("batch").groups.items()
    batch_levels = [k for k, v in batch_items]
    batch_info = [v for k, v in batch_items]
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))

    # drop intercept (Series.items: iteritems was removed in pandas 2.0)
    drop_cols = [cname for cname, inter in ((model == 1).all()).items() if inter]
    model = model[[c for c in model.columns if c not in drop_cols]]
    numerical_covariates = [list(model.columns).index(c) if isinstance(c, str) else c
            for c in numerical_covariates if c not in drop_cols]

    design = design_mat(model, numerical_covariates, batch_levels)

    sys.stderr.write("Standardizing Data across genes.\n")
    B_hat = np.dot(np.dot(la.inv(np.dot(design.T, design)), design.T), data.T)
    grand_mean = np.dot((n_batches / n_array).T, B_hat[:n_batch,:])
    var_pooled = np.dot(((data - np.dot(design, B_hat).T)**2), np.ones((int(n_array), 1)) / int(n_array))

    stand_mean = np.dot(grand_mean.T.reshape((len(grand_mean), 1)), np.ones((1, int(n_array))))
    tmp = np.array(design.copy())
    tmp[:,:n_batch] = 0
    stand_mean += np.dot(tmp, B_hat).T

    s_data = ((data - stand_mean) / np.dot(np.sqrt(var_pooled), np.ones((1, int(n_array)))))

    sys.stderr.write("Fitting L/S model and finding priors\n")
    batch_design = design[design.columns[:n_batch]]
    gamma_hat = np.dot(np.dot(la.inv(np.dot(batch_design.T, batch_design)), batch_design.T), s_data.T)

    delta_hat = [s_data[batch_idxs].var(axis=1) for batch_idxs in batch_info]

    gamma_bar = gamma_hat.mean(axis=1)
    t2 = gamma_hat.var(axis=1)

    a_prior = list(map(aprior, delta_hat))
    b_prior = list(map(bprior, delta_hat))

    sys.stderr.write("Finding parametric adjustments\n")
    gamma_star, delta_star = [], []
    for i, batch_idxs in enumerate(batch_info):
        temp = it_sol(s_data[batch_idxs], gamma_hat[i],
                     delta_hat[i], gamma_bar[i], t2[i], a_prior[i], b_prior[i])

        gamma_star.append(temp[0])
        delta_star.append(temp[1])

    sys.stdout.write("Adjusting data\n")
    # bayesdata is the same object as s_data; it is adjusted in place per batch
    bayesdata = s_data
    gamma_star = np.array(gamma_star)
    delta_star = np.array(delta_star)

    for j, batch_idxs in enumerate(batch_info):
        dsq = np.sqrt(delta_star[j,:])
        dsq = dsq.reshape((len(dsq), 1))
        denom = np.dot(dsq, np.ones((1, n_batches[j])))
        numer = np.array(bayesdata[batch_idxs] - np.dot(batch_design.loc[batch_idxs], gamma_star).T)

        bayesdata[batch_idxs] = numer / denom

    # undo the standardization: multiply by pooled std and add the mean back
    vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1))
    bayesdata = bayesdata * np.dot(vpsq, np.ones((1, int(n_array)))) + stand_mean

    return bayesdata
the data, with index same as + the columns that appear in ``data`` + model : patsy.design_info.DesignMatrix, optional + A model matrix describing metadata on the samples which could be + causing batch effects. If not provided, then will attempt to coarsely + correct just from the information provided in ``batch`` + numerical_covariates : list-like + List of covariates in the model which are numerical, rather than + categorical + Returns + ------- + gamma_star : centering parameters from combat fitting + delta_star : scaling parameters from combat fitting + stand_mean: pooled mean of batches + var_pooled: pooled variance of batches + """ + if isinstance(numerical_covariates, str): + numerical_covariates = [numerical_covariates] + if numerical_covariates is None: + numerical_covariates = [] + + if model is not None and isinstance(model, pd.DataFrame): + model["batch"] = list(batch) + else: + model = pd.DataFrame({'batch': batch}) + + batch_items = model.groupby("batch").groups.items() + batch_levels = [k for k, v in batch_items] + batch_info = [v for k, v in batch_items] + n_batch = len(batch_info) + n_batches = np.array([len(v) for v in batch_info]) + n_array = float(sum(n_batches)) + + # drop intercept + drop_cols = [cname for cname, inter in ((model == 1).all()).iteritems() if inter == True] + drop_idxs = [list(model.columns).index(cdrop) for cdrop in drop_cols] + model = model[[c for c in model.columns if not c in drop_cols]] + numerical_covariates = [list(model.columns).index(c) if isinstance(c, str) else c + for c in numerical_covariates if not c in drop_cols] + + design = design_mat(model, numerical_covariates, batch_levels) + + sys.stderr.write("Standardizing Data across genes.\n") + B_hat = np.dot(np.dot(la.inv(np.dot(design.T, design)), design.T), data.T) + grand_mean = np.dot((n_batches / n_array).T, B_hat[:n_batch,:]) + var_pooled = np.dot(((data - np.dot(design, B_hat).T)**2), np.ones((int(n_array), 1)) / int(n_array)) + + stand_mean = 
def combat_transform(data, batch, gamma_star, delta_star, stand_mean, var_pooled,model=None, numerical_covariates=None):
    """Apply previously fitted batch-correction parameters to a dataset
    Parameters
    ----------
    data : pandas.DataFrame
        A (n_features, n_samples) dataframe of the expression or methylation
        data to batch correct
    batch : pandas.Series
        A column corresponding to the batches in the data, with index same as
        the columns that appear in ``data``
    gamma_star : centering parameters from combat fitting
    delta_star : scaling parameters from combat fitting
    stand_mean: pooled mean of batches (one value per feature)
    var_pooled: pooled variance of batches
    model : pandas.DataFrame, optional
        A model matrix describing metadata on the samples which could be
        causing batch effects. If not provided, then will attempt to coarsely
        correct just from the information provided in ``batch``.
        The caller's dataframe is not modified.
    numerical_covariates : list-like
        List of covariates in the model which are numerical, rather than
        categorical
    Returns
    -------
    corrected : pandas.DataFrame
        A (n_features, n_samples) dataframe of the batch-corrected data
    Raises
    ------
    ValueError
        If ``batch`` does not have one entry per column of ``data``.
    """
    #get design
    if isinstance(numerical_covariates, str):
        numerical_covariates = [numerical_covariates]
    if numerical_covariates is None:
        numerical_covariates = []

    # the code below sizes arrays from both data.shape[1] and the batch count,
    # so the two must agree
    if len(batch) != data.shape[1]:
        raise ValueError(
            "batch has %i entries but data has %i samples (columns)"
            % (len(batch), data.shape[1]))

    if model is not None and isinstance(model, pd.DataFrame):
        # work on a copy so the caller's dataframe does not gain a 'batch' column
        model = model.copy()
        model["batch"] = list(batch)
    else:
        model = pd.DataFrame({'batch': batch})
    # NOTE(review): rows of gamma_star/delta_star are matched to batches by
    # groupby order here; presumably the batches must match the fitted ones - confirm
    batch_items = model.groupby("batch").groups.items()
    batch_levels = [k for k, v in batch_items]
    batch_info = [v for k, v in batch_items]
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))
    # drop intercept (Series.items: iteritems was removed in pandas 2.0)
    drop_cols = [cname for cname, inter in ((model == 1).all()).items() if inter]
    model = model[[c for c in model.columns if c not in drop_cols]]
    numerical_covariates = [list(model.columns).index(c) if isinstance(c, str) else c
            for c in numerical_covariates if c not in drop_cols]
    design = design_mat(model, numerical_covariates, batch_levels)
    #standardize
    sys.stderr.write("Standardizing Data across genes.\n")

    #reshape stand mean: repeat the per-feature mean across all samples
    stand_mean = np.dot(stand_mean.T.reshape((len(stand_mean), 1)), np.ones((1, int(data.shape[1]))))
    s_data = ((data - stand_mean) / np.dot(np.sqrt(var_pooled), np.ones((1, int(n_array)))))
    batch_design = design[design.columns[:n_batch]]
    # adjust data
    sys.stdout.write("Adjusting data\n")
    # bayesdata is the same object as s_data; it is adjusted in place per batch
    bayesdata = s_data
    gamma_star = np.array(gamma_star)
    delta_star = np.array(delta_star)
    #for each batch
    for j, batch_idxs in enumerate(batch_info):
        dsq = np.sqrt(delta_star[j,:])
        dsq = dsq.reshape((len(dsq), 1))
        denom = np.dot(dsq, np.ones((1, n_batches[j]))) #divide by sqrt delta_star
        numer = np.array(bayesdata[batch_idxs] - np.dot(batch_design.loc[batch_idxs], gamma_star).T) #subtract gamma_star

        bayesdata[batch_idxs] = numer / denom
    #multiply by square root of variance and add mean
    vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1))
    bayesdata = bayesdata * np.dot(vpsq, np.ones((1, int(n_array)))) + stand_mean
    return bayesdata
def combat_transform_old(data, batch, gamma_star, delta_star,model=None, numerical_covariates=None):
    """Batch-correct ``data`` with fitted ComBat parameters (older variant).

    Unlike ``combat_transform``, this version recomputes the standardisation
    mean and pooled variance from ``data`` itself instead of receiving them.

    Parameters
    ----------
    data : pandas.DataFrame
        A (n_features, n_samples) dataframe of the expression or methylation
        data to batch correct
    batch : pandas.Series
        A column corresponding to the batches in the data, with index same as
        the columns that appear in ``data``
    gamma_star : centering parameters from combat fitting
    delta_star : scaling parameters from combat fitting
    model : pandas.DataFrame, optional
        A model matrix describing metadata on the samples which could be
        causing batch effects. If not provided, then will attempt to coarsely
        correct just from the information provided in ``batch``.
        The caller's frame is copied, not modified.
    numerical_covariates : str or list-like, optional
        Covariates in the model which are numerical, rather than categorical

    Returns
    -------
    corrected : pandas.DataFrame
        A (n_features, n_samples) dataframe of the batch-corrected data
    """
    #get design
    if isinstance(numerical_covariates, str):
        numerical_covariates = [numerical_covariates]
    if numerical_covariates is None:
        numerical_covariates = []

    if model is not None and isinstance(model, pd.DataFrame):
        # work on a copy so the caller's model does not gain a 'batch' column
        model = model.copy()
        model["batch"] = list(batch)
    else:
        model = pd.DataFrame({'batch': batch})
    batch_items = model.groupby("batch").groups.items()
    batch_levels = [k for k, v in batch_items]
    batch_info = [v for k, v in batch_items]
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))
    # drop intercept (columns that are 1 for every sample)
    # Series.items() replaces iteritems(), which is gone in pandas >= 2.0
    drop_cols = [cname for cname, inter in ((model == 1).all()).items() if inter]
    model = model[[c for c in model.columns if c not in drop_cols]]
    numerical_covariates = [list(model.columns).index(c) if isinstance(c, str) else c
            for c in numerical_covariates if c not in drop_cols]
    design = design_mat(model, numerical_covariates, batch_levels)
    #standardize
    sys.stderr.write("Standardizing Data across genes.\n")
    # least-squares coefficients of the design on the data
    B_hat = np.dot(np.dot(la.inv(np.dot(design.T, design)), design.T), data.T)
    # batch-size weighted mean of the batch coefficients
    grand_mean = np.dot((n_batches / n_array).T, B_hat[:n_batch,:])
    var_pooled = np.dot(((data - np.dot(design, B_hat).T)**2), np.ones((int(n_array), 1)) / int(n_array))

    stand_mean = np.dot(grand_mean.T.reshape((len(grand_mean), 1)), np.ones((1, int(n_array))))
    # add the covariate (non-batch) part of the fit back onto the mean
    tmp = np.array(design.copy())
    tmp[:,:n_batch] = 0
    stand_mean += np.dot(tmp, B_hat).T
    s_data = ((data - stand_mean) / np.dot(np.sqrt(var_pooled), np.ones((1, int(n_array)))))
    batch_design = design[design.columns[:n_batch]]
    # adjust data
    sys.stdout.write("Adjusting data\n")
    bayesdata = s_data
    gamma_star = np.array(gamma_star)
    delta_star = np.array(delta_star)
    #for each batch
    for j, batch_idxs in enumerate(batch_info):

        dsq = np.sqrt(delta_star[j,:])
        dsq = dsq.reshape((len(dsq), 1))
        denom = np.dot(dsq, np.ones((1, n_batches[j]))) #divide by sqrt delta_star
        numer = np.array(bayesdata[batch_idxs] - np.dot(batch_design.loc[batch_idxs], gamma_star).T) #subtract gamma_star

        bayesdata[batch_idxs] = numer / denom
    #multiply by square root of variance and add mean
    vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1))
    bayesdata = bayesdata * np.dot(vpsq, np.ones((1, int(n_array)))) + stand_mean
    return bayesdata
ax[1].set_title(f'{s_marker.split("_")[0]}: Combat') + ax[1].legend() + plt.tight_layout() + plt.close() + d_fig.update({s_marker:fig}) + return(d_fig) \ No newline at end of file diff --git a/mplex_image/ometiff.py b/mplex_image/ometiff.py new file mode 100755 index 0000000..9986c6d --- /dev/null +++ b/mplex_image/ometiff.py @@ -0,0 +1,76 @@ +#### +# title: mpimage.py +# +# language: Python3.6 +# date: 2019-05-00 +# license: GPL>=v3 +# author: Jenny +# +# description: +# python3 library to display, normalize and crop multiplex images +#### + +#libraries +import matplotlib as mpl +mpl.use('agg') +import matplotlib.pyplot as plt +import numpy as np +import os +import skimage +import pandas as pd +#import bioformats +import re +import shutil +from itertools import chain +import matplotlib.ticker as ticker + +os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF/') +from apeer_ometiff_library import omexmlClass + +#functions + +def gen_xml(array, channel_names): + ''' + copy and modify from apeer ome tiff + ls_marker + ''' + #for idx, s_marker in enumerate(ls_marker): + # old = bytes(f'Name="C:{idx}"','utf-8') + # new = bytes(f'Name="{s_marker}"','utf-8') + # s_xml = s_xml.replace(old,new,-1) + #Dimension order is assumed to be TZCYX + dim_order = "TZCYX" + + metadata = omexmlClass.OMEXML() + shape = array.shape + assert ( len(shape) == 5), "Expected array of 5 dimensions" + + metadata.image().set_Name("IMAGE") + metadata.image().set_ID("0") + + pixels = metadata.image().Pixels + pixels.ome_uuid = metadata.uuidStr + pixels.set_ID("0") + + pixels.channel_count = shape[2] + + pixels.set_SizeT(shape[0]) + pixels.set_SizeZ(shape[1]) + pixels.set_SizeC(shape[2]) + pixels.set_SizeY(shape[3]) + pixels.set_SizeX(shape[4]) + + pixels.set_DimensionOrder(dim_order[::-1]) + + pixels.set_PixelType(omexmlClass.get_pixel_type(array.dtype)) + + for i in range(pixels.SizeC): + pixels.Channel(i).set_ID("Channel:0:" + str(i)) + pixels.Channel(i).set_Name(channel_names[i]) 
def check_names(df_img,s_type='tiff'):
    """
    (CHANGE ME)
    Check the marker names found in an image table against the standard
    list of biomarker names, print both groups and return the unknown ones.
    Input:
    df_img = dataframe of images; for s_type 'tiff' the column `marker`
        holds one name per row, for 'czi' the column `markers` holds
        dot-separated names (e.g. 'CK19.CK5.CD45.Ecad')
    s_type = 'tiff' or 'czi'
    Output: set of names that are not in the standard list
    Raises: ValueError if s_type is neither 'tiff' nor 'czi'
    """
    if s_type == 'tiff':
        es_names = set(df_img.marker)
    elif s_type == 'czi':
        es_names = {s_marker for s_item in df_img.markers for s_marker in s_item.split('.')}
    else:
        # previously only printed and then failed on the unbound es_names
        raise ValueError(f"Unknown type {s_type!r}: expected 'tiff' or 'czi'")
    es_standard = {'DAPI','PDL1','pERK','CK19','pHH3','CK14','Ki67','Ecad','PCNA','HER2','ER','CD44',
        'aSMA','AR','pAKT','LamAC','CK5','EGFR','pRB','FoxP3','CK7','PDPN','CD4','PgR','Vim',
        'CD8','CD31','CD45','panCK','CD68','PD1','CD20','CK8','cPARP','ColIV','ColI','CK17',
        'H3K4','gH2AX','CD3','H3K27','53BP1','BCL2','GRNZB','LamB1','pS6RP','BAX','RAD51',
        'R0c2','R0c3','R0c4','R0c5','R5Qc2','R5Qc3','R5Qc4','R5Qc5','R11Qc2','R11Qc3','R11Qc4','R11Qc5',
        'R7Qc2','R7Qc3','R7Qc4','R7Qc5','PDL1ab','PDL1d','R14Qc2','R14Qc3','R14Qc4','R14Qc5',
        'R8Qc2','R8Qc3','R8Qc4','R8Qc5','R12Qc2','R12Qc3','R12Qc4','R12Qc5','PgRc4','R1c2','CCND1',
        'Glut1','CoxIV','LamB2','S100','BMP4','BMP2','BMP6','pS62MYC', 'CGA', 'p63', 'SYP','PDGFRa', 'HIF1a','CC3',
        'MUC1','CAV1','MSH2','CSF1R','R13Qc4', 'R13Qc5', 'R13Qc3', 'R13Qc2','R10Qc2','R10Qc3','R10Qc4','R10Qc5',
        'R6Qc2', 'R6Qc3','R6Qc4', 'R6Qc5', 'TUBB3', 'CD90', 'GATA3'}#,'PDGFRB'CD66b (Neutrophils)
    #HLA class II or CD21(Dendritic cells)
    #BMP4 Fibronectin, CD11b (dendritic, macrophage/monocyte/granulocyte) CD163 (macrophages)
    #CD83 (dendritic cells) FAP
    es_wrong = es_names - es_standard
    es_right = es_standard.intersection(es_names)
    print(f'Wrong names {es_wrong}')
    print(f' Right names {es_right}')
    return(es_wrong)
'_c1_ORG.tif' + b_test=True if testing only + """ + i_dapi = re.sub("[^0-9]", "", s_r_old) + ls_test = [] + for s_file in os.listdir(): + if s_file.find(s_find) > -1: + if s_file.find(s_r_old) > -1: + s_file_round = s_file.replace(s_r_old,s_r_new) + s_file_color = s_file_round.replace(s_c_old,s_c_new) + if type=='org': + s_file_dapi = s_file_color.replace(s_file_color.split("_")[1],f'DAPI{i_dapi}.DAPI{i_dapi}.DAPI{i_dapi}.DAPI{i_dapi}') + else: + s_file_dapi=s_file_color + ls_test = ls_test + [s_file] + if b_test: + print(f'copied file {s_file} \t and named {s_file_dapi}') + else: + print(f'copied file {s_file} \t and named {s_file_dapi}') + shutil.copyfile(s_file, s_file_dapi) + + print(f'total number of files changed is {len(ls_test)}') + +def copy_markers(df_img, s_original = 'panCK', ls_copy = ['CK19','CK5','CK7','CK14'],i_last_round = 97, b_test=True, type = 'org'): + """ + copy specified marker image, rename with new round and color (default c2) and marker name + Input: + s_original = marker to copy + df_img = dataframe with images + ls_copy = list of fake channels to make + + b_test=True if testing only + """ + df_copy = df_img[df_img.marker==s_original] + ls_test = [] + for s_index in df_copy.index: + s_round = df_img.loc[s_index,'rounds'] + for idx, s_copy in enumerate(ls_copy): + i_round = i_last_round + 1 + idx + s_round = df_img.loc[s_index,'rounds'] + s_roundnum = re.sub("[^0-9]", "", s_round) + s_round_pre = s_round.replace(s_roundnum,'') + s_file_round = s_index.replace(df_img.loc[s_index,'rounds'],f'{s_round_pre}{i_round}') + s_file_color = s_file_round.replace(f'_{s_round}_',f'_c{i_round}_') + if type == 'org': + s_file_dapi = s_file_color.replace(s_file_color.split("_")[1],f'{s_copy}.{s_copy}.{s_copy}.{s_copy}') + else: + s_file_dapi = s_file_color.replace(f'_{s_original}_',f'_{s_copy}_') + ls_test = ls_test + [s_index] + if b_test: + print(f'copied file {s_index} \t and named {s_file_dapi}') + else: + print(f'copied file {s_index} \t and named 
def dchange_fname(d_rename=None,b_test=True):
    """
    replace anything in the names of files in the current working directory,
    based on a dictionary of key = old substring, value = new substring
    Input:
    d_rename = {old: new} substrings; defaults to {'_oldstring_':'_newstring_'}
    b_test = True to only report what would be renamed, False to rename
    For each key, prints one example rename (the last matching file)
    and the number of matching files.
    """
    #d_rename = {'Registered-R11_CD34.AR.':'Registered-R11_CD34.ARcst.','FoxP3b':'FoxP3bio'}
    if d_rename is None:
        d_rename = {'_oldstring_':'_newstring_'}
    for s_old, s_new in d_rename.items():
        # listing is taken per key, so later keys see earlier renames
        ls_match = [s_file for s_file in os.listdir() if s_old in s_file]
        s_file_new = None
        for s_file in ls_match:
            s_file_new = s_file.replace(s_old,s_new)
            if not b_test:
                os.rename(s_file, s_file_new)
        if ls_match:
            print(f'changed file {ls_match[-1]}\tto {s_file_new}')
        print(f'total number of files changed is {len(ls_match)}')
+ if i_scene_len == 2: + df_scene = df_test.applymap('{:0>2}'.format) + elif i_scene_len == 3: + df_test.replace('nan','',inplace=True) + df_test.replace(to_replace = "\.0+$",value = "", regex = True,inplace=True) + df_scene = df_test.applymap('{:0>3}'.format) + else: + df_scene = df_test #.applymap('{:0>3}'.format) + #for each round with wrong names + for s_wrong in df_scene.columns[df_scene.columns.str.contains('wrong')]: + for s_file in os.listdir(): + #find files in that round + if s_file.find(f'R{s_wrong.split("_")[1]}_') > -1: + #print(s_file) + #for each scene + for s_index in df_scene.index: + s_wrong_scene = df_scene.loc[s_index,s_wrong] + if s_file.find(f'-Scene-{s_wrong_scene}') > -1: + s_correct = df_scene.loc[s_index,'correct'] + print(s_correct) + s_replace = s_file.replace(f'-Scene-{s_wrong_scene}', f'-Scene-{s_wrong_scene}+{s_correct}') + s_file_new = f"./Renamed/{s_replace}" + + if b_test: + print(f'changed file {s_file} to {s_file_new}') + else: + os.rename(s_file, s_file_new) + print(f'changed file {s_file} to {s_file_new}') + return(df_test) + +def check_seg_markers(df_img,d_segment = {'CK19':1002,'CK5':5002,'CD45':2002,'Ecad':802,'CD44':1202,'CK7':2002,'CK14':502}, i_rows=1, t_figsize=(20,10)): + """ + This script makes binarizedoverviews of all the specified segmentation markers + with specified thresholds, and outputs a rounds cycles table + Input: df_dapi: output of mpimage.parse_org() + d_segment: segmentation marker names and thresholds + i_rows = number or rows in figure + t_figsize = (x, y) in inches size of figure + Output: dictionary + """ + d_result = {} + for s_key,i_item in d_segment.items(): + #find all segmentation marker slides + df_img_seg = df_img[df_img.marker==s_key] + fig,ax = plt.subplots(i_rows,(len(df_img_seg)+(i_rows-1))//i_rows, figsize = t_figsize, squeeze=False) + ax = ax.ravel() + for idx,s_scene in enumerate(sorted(df_img_seg.index.tolist())): + print(f'Processing {s_scene}') + im_low = skimage.io.imread(s_scene) + 
def rounds_cycles(s_find='-Scene-001_c', d_segment = {'CK19':1002,'CK5':5002,'CD45':4502,'Ecad':802,'CD44':1202,'CK7':2002,'CK14':502}):
    """
    Based on filenames in the current working directory, makes a dataframe
    with Rounds Cycles Info (one row per non-c1 channel image).
    File names are split on '_': field 0 is 'Registered-<round>', field 1 is
    the dot-separated marker list, field 3 is the channel (c1..c5).
    Input:
    s_find = substring that selects the files of one scene
    d_segment = {marker: minimum} overrides written to the 'minimum' column
        (a marker not found in the files is added as a new row)
    Output: (df_dapi, ls_marker) = the dataframe and the list of markers
        in sorted-filename order
    """
    ls_col = ['rounds','colors','minimum','maximum','exposure','refexp','location']
    # channel -> position of its marker in the dot-separated marker field
    d_position = {'c2':0,'c3':1,'c4':2,'c5':3}
    ls_marker = []
    ls_frame = []
    for s_name in sorted(os.listdir()):
        if s_name.find(s_find) == -1:
            continue
        ls_part = s_name.split('_')
        s_color = ls_part[3]
        if s_color == 'c1':
            # c1 is skipped, it carries no marker
            continue
        if s_color in d_position:
            s_marker = ls_part[1].split('.')[d_position[s_color]]
        else:
            print('Error: unrecognized channel name')
            s_marker = 'error'
        ls_marker.append(s_marker)
        df_marker = pd.DataFrame(index = [s_marker],columns=ls_col)
        df_marker.loc[s_marker,'rounds'] = ls_part[0].split('Registered-')[1]
        df_marker.loc[s_marker,'colors'] = s_color
        df_marker.loc[s_marker,'minimum'] = 1003
        df_marker.loc[s_marker,'maximum'] = 65535
        df_marker.loc[s_marker,'exposure'] = 100
        df_marker.loc[s_marker,'refexp'] = 100
        df_marker.loc[s_marker,'location'] = 'All'
        ls_frame.append(df_marker)
    # DataFrame.append is gone in pandas >= 2.0; concatenate once instead
    df_dapi = pd.concat(ls_frame) if ls_frame else pd.DataFrame()
    for s_key,i_item in d_segment.items():
        df_dapi.loc[s_key,'minimum'] = i_item
    # report each repeated marker once
    es_seen = set()
    es_repeat = set()
    for s_marker in ls_marker:
        if s_marker in es_seen:
            es_repeat.add(s_marker)
        es_seen.add(s_marker)
    for s_marker in sorted(es_repeat):
        print('Repeated marker!\n')
        print(s_marker)

    return(df_dapi, ls_marker)
def registration_matlab(N_smpl='10000',N_colors='5',s_rootdir='PathtoImages',s_subdirname='RegisteredImages/',s_ref_id='./R1_*_c1_ORG.tif',
    ls_order = ['R1','R2','R3','R4','R5','R6','R7','R8','R9','R10','R11','R0','R11Q']):
    """
    fill in the multislide registration matlab template and save it as
    registration_py.m inside s_rootdir
    (changes the working directory: first to the template folder, then to s_rootdir)
    Input:
    N_smpl = replaces i_N_smpl; %number of features to detect in image (default = 10000)
    N_colors = replaces i_N_colors; %number of colors in R1 (default = 5)
    ls_order = replaces RoundOrderString; %list of names and order of rounds
    s_rootdir = replaces 'PathtoImages' %location of raw images in folder
    s_ref_id = replaces 'RefDapiUniqueID'; %shared unique identifier of reference dapi
    s_subdirname = replaces 'PathtoRegisteredImages' %location of folder where registered images will reside
    """
    #round names, each single quoted, comma separated
    s_round_order = ",".join(f"'{s_round}'" for s_round in ls_order)
    #placeholder/value pairs, substituted in this order
    lt_swap = [
        ('PathtoImages',s_rootdir),
        ('PathtoRegisteredImages',s_subdirname),
        ('i_N_smpl',N_smpl),
        ('i_N_colors',N_colors),
        ("RoundOrderString",s_round_order),
        ('RefDapiUniqueID',s_ref_id),
        ]
    #find template, open, edit
    os.chdir(f'{s_src_path}/src')
    with open('template_registration_server_multislide_roundorder_scenes_2019_11_11.m') as f_template:
        s_script = f_template.read()
    for s_token, s_value in lt_swap:
        s_script = s_script.replace(s_token,s_value)

    #save edited .m file
    os.chdir(s_rootdir)
    with open('registration_py.m', 'w') as f_out:
        f_out.write(s_script)
def cmif_mkdir(ls_dir):
    '''
    check if directories exist. if not, make them (including parent folders)
    Input:
    ls_dir = list of directory paths; a single path string is also accepted
    Note: raises FileExistsError if a path exists but is not a directory
    '''
    if isinstance(ls_dir, str):
        # a bare string would otherwise be iterated character by character
        ls_dir = [ls_dir]
    for s_dir in ls_dir:
        # exist_ok avoids the gap between checking and creating
        os.makedirs(s_dir, exist_ok=True)
def check_names_deprecated(s_find='-Scene-001_c',b_print=False):
    """
    Based on filenames in the current working directory,
    checks marker names against standard list of biomarkers
    returns a dataframe with Rounds Cycles Info, and sets of wrong and correct names
    File names are split on '_': field 0 is 'Registered-<round>', field 1 is
    the dot-separated marker list, field 3 is the channel (c1..c5).
    Input: s_find = string that will be unique to one scene to check in the folder
        b_print = True to print each file name that is processed
    Output: (df_dapi, es_wrong, es_right)
    """
    ls_col = ['rounds','colors','minimum','maximum','exposure','refexp','location']
    # channel -> position of its marker in the dot-separated marker field
    d_position = {'c2':0,'c3':1,'c4':2,'c5':3}
    ls_frame = []
    for s_name in sorted(os.listdir()):
        if s_name.find(s_find) == -1:
            continue
        ls_part = s_name.split('_')
        s_color = ls_part[3]
        if s_color == 'c1':
            # c1 is skipped, it carries no marker
            continue
        if b_print:
            print(s_name)
        if s_color in d_position:
            s_marker = ls_part[1].split('.')[d_position[s_color]]
        else:
            print('Error: unrecognized channel name')
            s_marker = 'error'
        df_marker = pd.DataFrame(index = [s_marker],columns=ls_col)
        df_marker.loc[s_marker,'rounds'] = ls_part[0].split('Registered-')[1]
        df_marker.loc[s_marker,'colors'] = s_color
        df_marker.loc[s_marker,'minimum'] = 1003
        df_marker.loc[s_marker,'maximum'] = 65535
        df_marker.loc[s_marker,'exposure'] = 100
        df_marker.loc[s_marker,'refexp'] = 100
        df_marker.loc[s_marker,'location'] = 'All'
        ls_frame.append(df_marker)
    # DataFrame.append is gone in pandas >= 2.0; concatenate once instead
    df_dapi = pd.concat(ls_frame) if ls_frame else pd.DataFrame()
    es_names = set(df_dapi.index)
    es_standard = {'PDL1','pERK','CK19','pHH3','CK14','Ki67','Ecad','PCNA','HER2','ER','CD44',
        'aSMA','AR','pAKT','LamAC','CK5','EGFR','pRB','FoxP3','CK7','PDPN','CD4','PgR','Vim',
        'CD8','CD31','CD45','panCK','CD68','PD1','CD20','CK8','cPARP','ColIV','ColI','CK17',
        'H3K4','gH2AX','CD3','H3K27','53BP1','BCL2','GRNZB','LamB1','pS6RP','BAX','RAD51',
        'R0c2','R0c3','R0c4','R0c5','R5Qc2','R5Qc3','R5Qc4','R5Qc5','R11Qc2','R11Qc3','R11Qc4','R11Qc5',
        'R7Qc2','R7Qc3','R7Qc4','R7Qc5','PDL1ab','PDL1d','R14Qc2','R14Qc3','R14Qc4','R14Qc5',
        'R8Qc2','R8Qc3','R8Qc4','R8Qc5','R12Qc2','R12Qc3','R12Qc4','R12Qc5','PgRc4',
        'Glut1','CoxIV','LamB2','S100','BMP4','BMP2','BMP6','pS62MYC', 'CGA', 'p63', 'SYP','PDGFRa', 'HIF1a'}#,'PDGFRB'CD66b (Neutrophils) HLA class II or CD21(Dendritic cells)
    #BMP4 Fibronectin, CD11b (dendritic, macrophage/monocyte/granulocyte) CD163 (macrophages)
    #CD83 (dendritic cells) FAP Muc1
    es_wrong = es_names - es_standard
    es_right = es_standard.intersection(es_names)
    print(f'Wrong names {es_wrong}')
    print(f' Right names {es_right}')
    return(df_dapi, es_wrong, es_right)
def change_fname(s_old='_oldstring_',s_new='_newstring_',b_test=True):
    """
    Replace a substring in the names of files in the current working directory.

    input:
        s_old: substring to look for in each file name
        s_new: replacement for every occurrence of s_old
        b_test: if True (default) only print what would be renamed (dry run);
            if False the files are actually renamed with os.rename

    output:
        None; one line is printed per matching file, followed by the total
    """
    # a single loop serves both modes; the only difference is the os.rename call
    ls_match = sorted(s_file for s_file in os.listdir() if s_file.find(s_old) > -1)
    for s_file in ls_match:
        s_file_new = s_file.replace(s_old,s_new)
        print(f'changed file {s_file}\tto {s_file_new}')
        if not b_test:
            os.rename(s_file, s_file_new)
    print(f'total number of files changed is {len(ls_match)}')
def check_reg_dirs(s_dir='SlideName',s_subdir='Registered-SlideName', i_rows=2, t_figsize=(20,10), b_mkdir=True):
    """
    Check registration when files are in subdirectories (such as with large
    tissues, i.e. NP005): for every registered-image folder, save an overview
    figure of all DAPI (c1) images, one panel per round.

    input:
        s_dir: substring identifying the slide directories inside the cwd
        s_subdir: substring identifying registered-image folders inside a slide directory
        i_rows: number of rows in the overview figure
        t_figsize: matplotlib figure size
        b_mkdir: create ./Check_Registration if True (an existing folder is fine)

    output:
        None; one png per registered folder is written to <cwd>/Check_Registration
    """
    rootdir = os.getcwd()
    s_outdir = os.path.join(rootdir, 'Check_Registration')
    if b_mkdir:
        os.makedirs(s_outdir, exist_ok=True)
    # locate slide directories; the loop variable must not reuse the name s_dir,
    # otherwise the s_dir filter compares each entry with itself and always matches
    for s_slide_dir in sorted(os.listdir(rootdir)):
        s_slide_path = os.path.join(rootdir, s_slide_dir)
        if s_slide_dir.find(s_dir) == -1 or not os.path.isdir(s_slide_path):
            continue
        # locate registered image folders
        for s_reg_dir in sorted(os.listdir(s_slide_path)):
            s_reg_path = os.path.join(s_slide_path, s_reg_dir)
            if s_reg_dir.find(s_subdir) == -1 or not os.path.isdir(s_reg_path):
                continue
            print(f'Processing {s_reg_dir}')
            ls_dapi = sorted(s_file for s_file in os.listdir(s_reg_path) if s_file.find('_c1_ORG.tif')>-1)
            if not ls_dapi:
                # nothing to plot; a zero-column subplot grid would raise
                print(f'No _c1_ORG.tif images found in {s_reg_dir}')
                continue
            i_cols = (len(ls_dapi)+(i_rows-1))//i_rows
            # squeeze=False keeps ax a 2-D array even for a 1x1 grid, so ravel() always works
            fig,ax = plt.subplots(i_rows,i_cols, figsize = t_figsize, squeeze=False)
            ax = ax.ravel()
            for x, s_dapi in enumerate(ls_dapi):
                im_low = skimage.io.imread(os.path.join(s_reg_path, s_dapi))
                i_high = np.quantile(im_low,0.98)
                # stretch contrast between the 2nd percentile and 1.5x the 98th percentile
                im = skimage.exposure.rescale_intensity(im_low,in_range=(np.quantile(im_low,0.02),i_high+i_high/2))
                ax[x].imshow(im, cmap='gray')
                s_round = s_dapi.split('_')[0].split('-')[1]
                s_scene = s_dapi.split('-Scene')[1].split('_')[0]
                ax[x].set_title(f'{s_round} Scene{s_scene}',{'fontsize':12})
            plt.tight_layout()
            # save figure in the rootdir/Check_Registration folder
            fig.savefig(os.path.join(s_outdir, f'{s_reg_dir}.png'))
            plt.close(fig)
def load_cellpose_df(ls_sample, segdir):
    '''
    load all full feature dataframes in sample list

    input:
        ls_sample: list of sample names; features_{s_sample}_MeanIntensity_Centroid_Shape.csv
            is read for each of them
        segdir: directory that holds the feature csv files

    output:
        df_mi_full: all samples stacked row-wise, columns sorted alphabetically,
            plus the columns slide, scene and slide_scene (slide + '_' + scene)
    '''
    ls_df = []
    for s_sample in ls_sample:
        print(f'Loading features_{s_sample}_MeanIntensity_Centroid_Shape.csv')
        df_tt = pd.read_csv(f'{segdir}/features_{s_sample}_MeanIntensity_Centroid_Shape.csv',index_col=0)
        df_tt['slide'] = s_sample.split('-Scene')[0]
        # the scene is the second '_'-separated field of the cell index
        df_tt['scene'] = [item.split('_')[1] for item in df_tt.index]
        ls_df.append(df_tt)
    if ls_df:
        # DataFrame.append was removed in pandas 2.0; concatenate once instead.
        # Sort the columns explicitly to keep the layout append(sort=True) produced.
        df_mi_full = pd.concat(ls_df,sort=True).sort_index(axis=1)
    else:
        df_mi_full = pd.DataFrame(columns=['scene','slide'])
    #add scene
    df_mi_full['slide_scene'] = df_mi_full.slide + '_' + df_mi_full.scene
    print('')
    return(df_mi_full)
def filter_cellpose_xy(df_mi_full,ls_centroid = ('DAPI2_nuclei_area', 'DAPI2_nuclei_centroid-0', 'DAPI2_nuclei_centroid-1','DAPI2_nuclei_eccentricity')):
    '''
    select the nuclei centroids, area, eccentricity from a marker
    default: use DAPI2

    input:
        df_mi_full: full feature dataframe, index = cell id containing '_cell'
        ls_centroid: four column names, in the order area, centroid-0 (y),
            centroid-1 (x), eccentricity

    output:
        df_xy: new dataframe with columns nuclei_area, DAPI_Y, DAPI_X,
            nuclei_eccentricity and slide_scene; rows with any NA are dropped
    '''
    ls_centroid = list(ls_centroid)
    ls_rename = ['nuclei_area','DAPI_Y','DAPI_X','nuclei_eccentricity']
    if len(ls_centroid) != len(ls_rename):
        # the columns are renamed by position, so the count has to match
        raise ValueError(f'ls_centroid must name exactly {len(ls_rename)} columns (area, centroid-0, centroid-1, eccentricity)')
    df_xy = df_mi_full.loc[:,ls_centroid]
    print('QC: make sure centroids dont have too many NAs')
    print(df_xy.isna().sum())
    print('')
    # copy so the column assignments below never write into a view of df_mi_full
    df_xy = df_xy.dropna(axis=0,how='any').copy()
    df_xy.columns = ls_rename
    df_xy['slide_scene'] = [item.split('_cell')[0] for item in df_xy.index]
    return(df_xy)
ls_filter])].sort_values('round_ord').iloc[-1].round_ord + print(f'Dropping markers after round {i_max}') + ls_drop_marker = [item + '_' for item in sorted(set(df_img_all[(df_img_all.round_ord>i_max)].marker))] + [print(item) for item in ls_drop_marker] + print('') + [df_mi_full.drop(df_mi_full.columns[df_mi_full.columns.str.contains(item)],axis=1,inplace=True) for item in ls_drop_marker] + return(df_mi_full,i_max) + +def plot_thresh(df_img_all,s_thresh): + ''' + tissues: plot threshold across all tissues + (negative scenes will drive down the mean + ''' + ls_slides = sorted(set(df_mi_full.slide)) + df_plot = df_img_all[(df_img_all.marker==s_thresh)].loc[:,['threshold_li']] + fig,ax=plt.subplots(figsize=(4,3.5)) + sns.stripplot(data=df_plot) + sns.barplot(data=df_plot, alpha=0.5) + labels = ax.get_xticklabels + plt.tight_layout() + fig.savefig(f'{qcdir}/QC_EcadThresh_{".".join(ls_slides)}.png') + +def fill_cellpose_nas(df_mi_full,ls_marker_cyto,s_thresh='Ecad',man_thresh=1000): + ''' + some nuclei don't have a cytoplasm, replace NA with perinuc5 + ''' + df = df_mi_full.copy(deep=True) + # since segmentation was run on ecad, use ecad threshold + print(f'Finding {s_thresh} positive cells') + ls_neg_cells = (df_mi_full[~(df_mi_full.loc[:,f'{s_thresh}_cytoplasm'] > man_thresh)]).index.tolist()#= ls_neg_cells + ls_neg_slide + print('') + # replace cells without cytoplasm (ecad) with perinuc 5 + print(f'For cells that are {s_thresh} negative:') + for s_marker in ls_marker_cyto: + print(f'Replace {s_marker}_cytoplasm nas') + df.loc[ls_neg_cells,f'{s_marker}_cytoplasm'] = df.loc[ls_neg_cells,f'{s_marker}_perinuc5'] + print(f'with {s_marker}_perinuc5') + df[f'{s_thresh}_negative'] = df.index.isin(ls_neg_cells) + return(df) + +def shrink_seg_regions(df_mi_full,s_thresh,ls_celline=[],ls_shrunk=[]): + ''' + For markers with stromal to tumor bleedthrough, use shrunken segmentation region + ''' + #enforce cell lines as tumor + print('') + if len(ls_celline) > 0: + 
def fill_membrane_nas(df_mi_full, df_mi_mem,s_thresh='Ecad',ls_membrane=('HER2',)):
    '''
    fill cell membrane NAs with the expanded-nuclei membrane values

    For every marker in ls_membrane, {marker}_cellmem25 is replaced by
    {marker}_exp5nucmembrane25 for (a) cells flagged {s_thresh}_negative in
    df_mi_full and (b) any cell whose {marker}_cellmem25 is still NA.

    input:
        df_mi_full: feature dataframe containing the boolean column {s_thresh}_negative
        df_mi_mem: membrane feature dataframe (not modified; a copy is filled)
        s_thresh: marker that was used to call cells negative
        ls_membrane: membrane markers to fill

    output:
        df_merge: df_mi_full merged on the index with the filled membrane dataframe
    '''
    # work on a copy so the caller's membrane dataframe is left untouched
    df_mem = df_mi_mem.copy()
    b_shared = df_mi_full.index.isin(df_mem.index)
    ls_neg = df_mi_full[(df_mi_full.loc[:,f'{s_thresh}_negative']) & b_shared].index
    for s_membrane in ls_membrane:
        s_cell = f'{s_membrane}_cellmem25'
        s_nuc = f'{s_membrane}_exp5nucmembrane25'
        print(f'Replace {s_membrane}_cellmem25 nas \n with {s_membrane}_exp5nucmembrane25')
        df_mem.loc[ls_neg,s_cell] = df_mem.loc[ls_neg,s_nuc]
        # second pass: anything still missing, regardless of the negative flag
        ls_na = df_mem.index[df_mem.loc[:,s_cell].isna()]
        df_mem.loc[ls_na,s_cell] = df_mem.loc[ls_na,s_nuc]
    df_merge = df_mi_full.merge(df_mem, left_index=True, right_index=True)
    print('')
    return(df_merge)
def auto_threshold(df_mi,df_img_all):
    '''
    call cells positive or negative per scene with the per-image Li thresholds

    input:
        df_mi: mean intensity dataframe with a slide_scene column and one
            {marker}_{location} column per marker
        df_img_all: threshold dataframe with columns slide, scene, slide_scene,
            marker and threshold_li (one row per image)

    output:
        df_pos_auto: dataframe (same cells as df_mi) of booleans,
            True where intensity >= threshold; NA where a scene has no threshold
        d_thresh_record: {'{slide}_scene{scene}_{marker}': threshold}
    '''
    ls_scene = sorted(set(df_mi.slide_scene))
    ls_df_scene = []
    d_thresh_record= {}

    for s_slide_scene in ls_scene:
        print(f'Thresholding {s_slide_scene}')
        ls_index = df_mi[df_mi.slide_scene==s_slide_scene].index
        df_scene = pd.DataFrame(index=ls_index)
        df_img_scene = df_img_all[df_img_all.slide_scene==s_slide_scene]

        for s_index in df_img_scene.index:
            s_scene =f"{df_img_all.loc[s_index,'slide']}_scene{df_img_all.loc[s_index,'scene']}"
            s_marker = df_img_all.loc[s_index,'marker']
            # literal match: marker names are plain text, not regex patterns
            s_columns = df_mi.columns[df_mi.columns.str.contains(f"{s_marker}_",regex=False)]
            if len(s_columns)!=1:
                # marker absent from df_mi, or ambiguous: no threshold is applied
                continue
            s_marker_loc = s_columns[0]
            i_thresh = df_img_all.loc[s_index,'threshold_li']
            d_thresh_record.update({f'{s_scene}_{s_marker}':i_thresh})
            df_scene.loc[ls_index,s_marker_loc] = df_mi.loc[ls_index,s_marker_loc] >= i_thresh
        ls_df_scene.append(df_scene)
    # DataFrame.append was removed in pandas 2.0 (and was quadratic in a loop)
    df_pos_auto = pd.concat(ls_df_scene) if ls_df_scene else pd.DataFrame()
    return(df_pos_auto,d_thresh_record)
def plot_thresh_results(df_img_all,df_pos_auto,d_thresh_record,df_xy,i_max,s_thresh,qcdir):
    '''
    scatter-plot the thresholded cells for every DAPI round up to i_max plus
    the {s_thresh}_cytoplasm marker (delegates to positive_scatterplots)

    input:
        df_img_all: threshold dataframe with columns round_ord, slide_scene, marker
        df_pos_auto, d_thresh_record, df_xy, qcdir: passed through to positive_scatterplots
        i_max: last round (round_ord value) to include
        s_thresh: marker plotted in addition to the DAPI rounds

    output:
        ls_color: the DAPI '{marker}_nuclei' column names that were plotted
    '''
    # the DAPI markers are read from the first scene listed in df_img_all
    s_first_scene = df_img_all.slide_scene.unique()[0]
    b_round = df_img_all.round_ord<=i_max
    b_scene = df_img_all.slide_scene==s_first_scene
    b_dapi = df_img_all.marker.str.contains('DAPI')
    ls_dapi = df_img_all[b_round & b_scene & b_dapi].marker.tolist()
    ls_color = [f'{s_dapi}_nuclei' for s_dapi in ls_dapi]
    ls_plot = ls_color + [f'{s_thresh}_cytoplasm']
    positive_scatterplots(df_pos_auto,d_thresh_record,df_xy,ls_plot,qcdir)
    return(ls_color)
def load_li_thresh(ls_sample, segdir):
    '''
    load li thresholds (thresh_{s_sample}_ThresholdLi.csv) for every sample

    The index of each csv is an image file name; round, channel, slide and
    scene are parsed from its '_'-separated fields and the biomarker name from
    the '.'-separated marker field.

    input:
        ls_sample: list of sample names
        segdir: directory with the threshold csv files; NOTE: the working
            directory is changed to segdir

    output:
        df_img_all: all samples stacked, with the added columns rounds, color,
            slide, scene, slide_scene and marker
    '''
    # position of each channel's biomarker in the '.'-separated marker field
    d_position = {'c2':0,'c3':1,'c4':2,'c5':3}
    os.chdir(segdir)
    ls_df = []
    for s_sample in ls_sample:
        df_img = pd.read_csv(f'thresh_{s_sample}_ThresholdLi.csv', index_col=0)
        df_img['rounds'] = [item.split('_')[0].split('Registered-')[1] for item in df_img.index]
        df_img['color'] = [item.split('_')[-2] for item in df_img.index]
        df_img['slide'] = [item.split('_')[2].split('-Scene-')[0] for item in df_img.index]
        df_img['scene'] = [item.split('_')[2].split('-Scene-')[1] for item in df_img.index]
        df_img['slide_scene'] = df_img.slide + '_scene' + df_img.scene
        #parse file name for biomarker
        ls_marker = []
        for s_index, s_round, s_color in zip(df_img.index, df_img.rounds, df_img.color):
            if s_color == 'c1':
                s_marker = f"DAPI{s_round.split('R')[1]}"
            elif s_color in d_position:
                s_marker = s_index.split('_')[1].split('.')[d_position[s_color]]
            else:
                # unknown channel: record a missing marker instead of silently
                # reusing the marker parsed for the previous image
                print(f'Error: unrecognized channel {s_color} in {s_index}')
                s_marker = None
            ls_marker.append(s_marker)
        df_img['marker'] = ls_marker
        ls_df.append(df_img)
    # DataFrame.append was removed in pandas 2.0
    df_img_all = pd.concat(ls_df) if ls_df else pd.DataFrame()
    return(df_img_all)
for af subtraction + Input: + df_mi= mean intensity dataframe + d_channel = dictionary of channel:background marker + """ + es_standard = {'PDL1_Ring','pERK_Nuclei','CK19_Ring','pHH3_Nuclei','CK14_Ring','Ki67_Nuclei','Ki67r_Nuclei','Ecad_Ring','PCNA_Nuclei','HER2_Ring','ER_Nuclei','CD44_Ring', + 'aSMA_Ring','AR_Nuclei','pAKT_Ring','LamAC_Nuclei','CK5_Ring','EGFR_Ring','pRb_Nuclei','FoxP3_Nuclei','CK7_Ring','PDPN_Ring','CD4_Ring','PgR_Nuclei','Vim_Ring', + 'CD8_Ring','CD31_Ring','CD45_Ring','panCK_Ring','CD68_Ring','PD1_Ring','CD20_Ring','CK8_Ring','cPARP_Nuclei','ColIV_Ring','ColI_Ring','CK17_Ring', + 'H3K4_Nuclei','gH2AX_Nuclei','CD3_Ring','H3K27_Nuclei','53BP1_Nuclei','BCL2_Ring','GRNZB_Nuclei','LamB1_Nuclei','pS6RP_Ring','BAX_Nuclei','RAD51_Nuclei', + 'Glut1_Ring','CoxIV_Ring','LamB2_Nuclei','S100_Ring','BMP4_Ring','PgRc4_Nuclei','pRB_Nuclei','p63_Nuclei','p63_Ring','CGA_Ring','SYP_Ring','pS62MYC_Nuclei', 'HIF1a_Nuclei', + 'PDGFRa_Ring', 'BMP2_Ring','PgRb_Nuclei','MUC1_Ring','CSF1R_Ring','CAV1_Ring','CCND1_Nuclei','CC3_Nuclei' } #PgRb is second PgR in dataset + #generate list of background markers needed for subtraction + lls_d_channel = [] + for s_key,ls_item in d_channel.items(): + lls_d_channel = lls_d_channel + [ls_item] + ls_background = [] + for ls_channel in lls_d_channel: + ls_background = ls_background + [f'{ls_channel[0]}_Ring'] + ls_background = ls_background + [f'{ls_channel[1]}_Nuclei'] + #ls_background.append(f'{s_dapi}_Nuclei') + ls_background.append(f'{s_dapi}') + se_background = set(ls_background) + es_common = set(df_mi.columns.tolist()).intersection(es_standard) | se_background + df_filtered_mi = df_mi.loc[:,sorted(es_common)] + return(df_filtered_mi, es_standard) + +def filter_loc_cellpose(df_mi_filled, ls_marker_cyto, ls_custom,filter_na=True): + ''' + get nuclei, perinuclei or cytoplasm, based on filter standard function + ''' + __ , es_standard = filter_standard(pd.DataFrame(columns=['filter_standard']),{},'filter_standard') + ls_marker = 
def marker_table(df_img_all,qcdir):
    '''
    make a nice rounds/channels/markers table

    One row per imaging round, one column per channel, taken from the first
    scene listed in df_img_all. Quenched rounds (e.g. R5Q) are ordered as x.5
    and repeated rounds (e.g. R1r) as x.25.

    input:
        df_img_all: threshold dataframe with columns rounds, color, marker, slide_scene
            (not modified)
        qcdir: folder the table is written to

    output:
        None; writes {qcdir}/MarkerTable.csv without a header row
    '''
    def round_number(s_round):
        # 'R5Q' -> 5.5, 'R1r' -> 1.25, 'R12' -> 12.0
        s_ord = re.sub('r','.25', re.sub('Q','.5', s_round))
        return float(re.sub('[^0-9.]','', s_ord))

    df_first = df_img_all[df_img_all.slide_scene==df_img_all.slide_scene.unique()[0]]
    # build the table on a local frame so the caller's round_ord column
    # is not overwritten with strings
    df_long = pd.DataFrame({
        'marker':df_first.marker,
        'round':[round_number(item) for item in df_first.rounds],
        'color':df_first.color,
    })
    # pivot arguments are keyword-only since pandas 2.0
    df_marker = df_long.pivot(index='round',columns='color')
    df_marker.index.name = None
    df_marker.to_csv(f'{qcdir}/MarkerTable.csv',header=False)
def filter_cellpose_background(df_mi_filled, es_standard):
    '''
    background estimate: for each standard biomarker take the opposite
    subcellular location (Nuclei -> cytoplasm, Ring -> nuclei) and summarise
    its intensity per scene by deciles

    input:
        df_mi_filled: mean intensity dataframe with all biomarker locations
            and a slide_scene column
        es_standard: set of '{marker}_Ring' / '{marker}_Nuclei' names

    output:
        df_bg: one row per slide_scene; columns '{marker}_{location}_{d}' for
            d = 0..9 holding the d/10 quantile of that column in the scene
    '''
    # swap each standard location for its opposite compartment
    es_opposite = {
        item.replace('Nuclei','cytoplasm').replace('Ring','nuclei')
        for item in es_standard
    }
    es_opposite.add('slide_scene')
    ls_select = sorted(es_opposite.intersection(df_mi_filled.columns))
    df_select = df_mi_filled.loc[:,ls_select]
    #quantiles: one block of columns per decile, suffixed with the decile number
    ls_block = []
    for q in np.arange(0,1,.1):
        df_quantile = df_select.groupby('slide_scene').quantile(q)
        ls_block.append(df_quantile.add_suffix(f'_{str(int(q*10))}'))
    df_bg = pd.concat(ls_block,axis=1)
    return(df_bg)
def load_xy(s_sample, s_path='./', b_set_index=True):
    """
    input:
        s_sample: string with sample name
        s_path: file path to data, default is current folder
        b_set_index: if True, prefix every cell index with '{s_sample}_'

    output:
        df_xy: dataframe with the centroid coordinates
            each row is a cell; the columns of the X and Y tables are merged on
            the cell index, overlapping column names get the suffixes _X and _Y

    description:
        load features_{s_sample}_CentroidX.tsv and features_{s_sample}_CentroidY.tsv
        from s_path and merge them
    """
    d_centroid = {}
    for s_axis in ('Y', 'X'):
        s_file = f'features_{s_sample}_Centroid{s_axis}.tsv'
        print(s_file)
        # honour s_path: the files used to be read from the cwd regardless of it
        df_axis = pd.read_csv(
            os.path.join(s_path, s_file),
            sep='\t',
            index_col=0
        )
        if b_set_index:
            df_axis = df_axis.set_index(f'{s_sample}_' + df_axis.index.astype(str))
        d_centroid[s_axis] = df_axis
    #merge the x and y dataframes
    df_xy = pd.merge(d_centroid['X'],d_centroid['Y'],left_index=True,right_index=True,suffixes=('_X', '_Y'))
    return(df_xy)
+ """ + df_filtered_mi = df_mi.copy(deep=True) + #get tissue id from the dataframe + s_tissue = df_mi.index[0].split('_')[0] + #DAPI filter + df_filtered_mi = df_filtered_mi[df_filtered_mi.loc[:,s_dapi]>dapi_thresh] + print(f'Cells before DAPI filter = {len(df_mi)}') + print(f'Cells after DAPI filter = {len(df_filtered_mi)}') + df_filtered_mi.index.name='UNIQID' + if b_images: + ls_scene=list(set(df_xy.scene)) + ls_scene.sort() + for s_scene in ls_scene: + df_pos = df_xy.loc[df_filtered_mi.index.tolist()] + df_pos_scene = df_pos[df_pos.scene==s_scene] + if len(df_pos_scene) >= 1: + fig,ax=plt.subplots(figsize=t_figsize) + ax.scatter(x=df_xy[df_xy.scene==s_scene].loc[:,'DAPI_X'], y=df_xy[df_xy.scene==s_scene].loc[:,'DAPI_Y'], color='silver',label='DAPI neg', s=2) + ax.scatter(x=df_pos_scene.loc[:,'DAPI_X'], y=df_pos_scene.loc[:,'DAPI_Y'], color='DarkBlue',label='DAPI pos',s=2) + ax.axis('equal') + ax.set_ylim(ax.get_ylim()[::-1]) + ax.set_title(f'{s_scene}_DAPI') + plt.legend(markerscale=3) + fig.savefig(f'{s_tissue}_{s_scene}_{s_dapi}{dapi_thresh}.png') + return(df_filtered_mi) + +def load_meta(s_sample, s_path='./',type='csv'): + """ + load rounds cycles table + make sure to specify location for use with downstream functions + make sure to add rows for any biomarkers used for analysis or processing + """ + #tab or space delimited + if type == 'Location': + print(f'metadata_{s_sample}_RoundsCyclesTable_location.txt') + df_t = pd.read_csv( + f'metadata_{s_sample}_RoundsCyclesTable_location.txt', + delim_whitespace=True, + header=None, + index_col=False, + names=['marker', 'rounds','color','minimum', 'maximum', 'exposure', 'refexp','location'], + ) + df_t = df_t.set_index(f'{s_sample}_' + df_t.index.astype(str)) + df_t.replace({'Nucleus':'Nuclei'},inplace=True) + df_t['marker_loc'] = df_t.marker + '_' + df_t.location + df_t.set_index(keys='marker_loc',inplace=True) + elif type == 'csv': + print(f'metadata_{s_sample}_RoundsCyclesTable.csv') + df_t = pd.read_csv( + 
def add_exposure_roundscyles(df_tc, df_expc,es_standard,ls_dapi = ('DAPI12_Nuclei',)):
    """
    Fill exposure times and subcellular locations into the rounds/cycles table.

    input:
        df_tc: metadata dataframe indexed by marker with columns
            ['rounds','colors','exposure','refexp', ...] (not modified)
        df_expc: dataframe of exposure times with columns [0,1,2,3,4]
            and index with czi image names starting with '{round}_' (not modified)
        es_standard: set of '{marker}_Ring' / '{marker}_Nuclei' names
        ls_dapi: '{marker}_{location}' names of DAPI rows to drop from the table

    output:
        df_t: copy of df_tc without the DAPI rows, with exposure and refexp set
            from the first image of the marker's round (left unchanged when the
            round has no exposure record) and location set to 'Ring' or 'Nuclei'
            when exactly one of the two is listed in es_standard ('' otherwise)
    """
    df_t = df_tc.copy()
    df_exp = df_expc.copy()
    df_t['location'] = ''
    df_t.drop([item.split('_')[0] for item in ls_dapi], inplace=True)
    # exposure columns 0..4 correspond to channels c1..c5
    df_exp.columns = ['c' + str(int(item)+1) for item in df_exp.columns]
    df_exp['rounds'] = [item.split('_')[0] for item in df_exp.index]
    for s_index in df_t.index:
        s_channel = df_t.loc[s_index,'colors']
        s_round = df_t.loc[s_index, 'rounds']
        print(s_round)
        #look up exposure time for marker in metadata
        df_t_image = df_exp[(df_exp.rounds==s_round)]
        if len(df_t_image) > 0:
            # positional access: the index holds image names, not integers
            i_exposure = df_t_image.loc[:,s_channel].iloc[0]
            df_t.loc[s_index,'exposure'] = i_exposure
            df_t.loc[s_index,'refexp'] = i_exposure
        else:
            print(f'{s_index} has no recorded exposure time')
        s_ring = s_index + '_Ring'
        s_nuc = s_index + '_Nuclei'
        ls_loc = sorted(es_standard.intersection({s_ring,s_nuc}))
        if len(ls_loc) == 1:
            df_t.loc[s_index,'location'] = ls_loc[0].split('_')[1]
    return(df_t)
def af_subtract(df_norm, df_t, d_channel=None, ls_exclude=None):
    """
    given an exposure normalized dataframe, metadata with biomarker location, and a
    dictionary of background channels, subtracts the background intensity from each cell
    input:
        df_norm = exposure-normalized intensities, columns named '<marker>_<location>'
        df_t = metadata indexed by marker name, with a 'colors' column
        d_channel = dictionary, key is color i.e. 'c2', value is the background
            marker list used for ['Ring','Nuclei']
            (default: L488/L555/L647/L750 for c2..c5)
        ls_exclude = markers to not subtract
    output:
        df_mi_sub,ls_sub,ls_record
        (ls_sub is sorted, so the column order is the same on every run)
    raises:
        ValueError if a non-c1 column has a location other than Ring or Nuclei
    """
    if d_channel is None:
        d_channel = {'c2': ['L488', 'L488'], 'c3': ['L555', 'L555'],
                     'c4': ['L647', 'L647'], 'c5': ['L750', 'L750']}
    if ls_exclude is None:
        ls_exclude = []

    # background columns are never subtracted from themselves
    se_background = set()
    for ls_channel in d_channel.values():
        se_background.add(f'{ls_channel[0]}_Ring')
        se_background.add(f'{ls_channel[1]}_Nuclei')
    se_all = set(df_norm.columns.tolist())
    se_exclude = {f'{item}_{s_loc}' for item in ls_exclude for s_loc in ('Ring', 'Nuclei')} & se_all
    ls_sub = sorted(se_all - se_background - se_exclude)
    ls_excluded = sorted(se_exclude)

    # subtract AF channels
    d_columns = {}
    ls_record = []
    for s_marker_loc in ls_sub:
        print(s_marker_loc)
        s_marker = s_marker_loc.split('_')[0]
        s_loc = s_marker_loc.split('_')[1]
        s_channel = df_t.loc[s_marker, 'colors']
        if s_channel == 'c1':
            # c1 columns are passed through unchanged
            d_columns[s_marker_loc] = df_norm.loc[:, s_marker_loc]
            continue
        if s_loc == 'Nuclei':
            s_AF = d_channel[s_channel][1]
        elif s_loc == 'Ring':
            s_AF = d_channel[s_channel][0]
        else:
            # previously this fell through and reused the background marker
            # from the previous loop iteration
            raise ValueError(f'Error: location must be Ring or Nucleus (got {s_marker_loc})')
        s_AF_loc = s_AF + '_' + s_loc
        d_columns[s_marker_loc] = df_norm.loc[:, s_marker_loc] - df_norm.loc[:, s_AF_loc]
        print(f'From {s_marker_loc} subtracting {s_AF_loc}')
        ls_record.append(f'From {s_marker_loc} subtracting {s_AF_loc}\n')
    for s_excluded in ls_excluded:
        ls_record.append(f'From {s_excluded} subtracting None\n')
        d_columns[s_excluded] = df_norm.loc[:, s_excluded]

    # assemble once instead of inserting one column at a time
    if d_columns:
        df_mi_sub = pd.concat(d_columns, axis=1)
    else:
        df_mi_sub = pd.DataFrame(index=df_norm.index)

    # error check: df_t is indexed by marker, so compare marker names
    print('AF subtraction not performed for the following markers:')
    print(set(df_t.index) - {item.split('_')[0] for item in ls_sub})

    return (df_mi_sub, ls_sub, ls_record)
def output_subtract(df_sub, df_t, d_factor=None):
    """
    this un-normalizes by exposure time to output a new dataframe of AF subtracted cells for analysis

    df_sub = AF-subtracted intensities, columns named '<marker>_<location>',
        optionally with a 'scene' column (dropped from the output)
    df_t = metadata indexed by marker name, with 'exposure' and 'colors' columns
    d_factor = colour -> standard exposure used during normalization
        (default: c1 10, c2 30, c3 200, c4 500, c5 500)
    return: dataframe with negatives clipped to zero and each column multiplied by
        exposure / standard exposure; the cell index of df_sub is kept
    """
    if d_factor is None:
        d_factor = {'c1': 10, 'c2': 30, 'c3': 200, 'c4': 500, 'c5': 500}
    if 'scene' in df_sub.columns:
        # drop returns a new frame, so the caller's dataframe is untouched
        df_sub = df_sub.drop(columns='scene')
    else:
        print('no scene column')
    # negative values after subtraction are set to zero
    df_mi_zero = df_sub.clip(lower=0)
    d_columns = {}
    for s_sub in df_mi_zero.columns.tolist():
        s_dft_index = s_sub.split('_')[0]
        i_reverse_factor = df_t.loc[s_dft_index, 'exposure'] / d_factor[df_t.loc[s_dft_index, 'colors']]
        d_columns[s_sub] = df_mi_zero.loc[:, s_sub] * i_reverse_factor
    if not d_columns:
        # nothing to rescale: still return a frame aligned to the input cells
        return pd.DataFrame(index=df_mi_zero.index)
    # assemble once instead of inserting one column at a time
    return pd.concat(d_columns, axis=1)
subtracted tiffs for visualization. + The data required is: + 1. The RoundsCyclesTable.txt with the location (Nucleus/Ring) specified (not All), and real expsure times + 2. 16 bit grayscale tiff images following Koei's naming convention (script processes the list of folders ls_folder) + Note: name of folder can be anything + """ + #generate list of markers needing subtraction + lls_d_channel = [] + for s_key in d_channel: + lls_d_channel = lls_d_channel + [d_channel[s_key]] + ls_background = [] + for ls_channel in lls_d_channel: + ls_background = ls_background + [f'{ls_channel[0]}_Ring'] + ls_background = ls_background + [f'{ls_channel[1]}_Nuclei'] + se_background = set(ls_background) + se_all = set(df_t.index) + se_sub = se_all - se_background + ls_sub = list(se_sub) + #ls_sub.remove(s_dapi) #don't need line if s_DAPI is c1 + #subtract images + #os.makedirs('8bit/', exist_ok=True) + if b_mkdir: + os.mkdir('8bit') + ls_image = os.listdir() + ls_slide = [] + ls_image_org = [] + for s_image in ls_image: + if s_image.find('_ORG.tif')>-1: + #make a list of slides/scenes in the folder + s_slide = s_image.split('_')[2] + ls_slide = ls_slide + [s_slide] + #make a list of all original images in the folder + ls_image_org = ls_image_org + [s_image] + ls_slide = list(set(ls_slide)) + #process each slide in the folder + for s_slide in ls_slide: + print(f'Processing {s_slide}') + df_t['image'] = 'NA' + ls_dapi = [] + + for s_image in ls_image_org: + + #grab all original images with slide/scene name + if s_image.find(s_slide) > -1: + + #add matching image name to df_t (fore specific slide/scene, dapi not included) + s_round = s_image.split('Registered-')[1].split('_')[0] + s_color = s_image.split('Scene-')[1].split('_')[1] + s_index = df_t[(df_t.rounds==s_round) & (df_t.color==s_color)].index + df_t.loc[s_index,'image'] = s_image + if s_color == 'c1': + ls_dapi = ls_dapi + [s_image] + #subtract images + ls_record = [] + for s_marker_loc in ls_sub: + s_marker = 
def round_overlays():
    """
    output multipage tiffs with five channels per round

    Reads every '*8bit.tif' image in ./8bit (named
    '<round>_<marker>_<slide>_<color>_8bit.tif') and, for each slide and round,
    writes '<round>_<markers>_<slide>_overlay.tiff' into ./8bit with one page
    per colour, in sorted colour order.
    The working directory is restored on exit, including after an error.
    """
    s_cwd = os.getcwd()
    os.chdir('./8bit')
    try:
        ls_image_org = sorted(s_image for s_image in os.listdir() if s_image.find('8bit.tif') > -1)
        # slides/scenes and rounds present in the folder
        ls_slide = sorted({s_image.split('_')[2] for s_image in ls_image_org})
        ls_round = sorted({s_image.split('_')[0] for s_image in ls_image_org})
        for s_slide in ls_slide:
            print(f'Processing {s_slide}')
            for s_round in ls_round:
                d_overlay = {}
                for s_image in ls_image_org:
                    if s_image.find(s_slide) > -1 and s_image.find(f'{s_round}_') == 0:
                        d_overlay[s_image.split('_')[3]] = s_image
                if not d_overlay:
                    # this slide has no images for this round; previously the
                    # image from an earlier round was reused to size the overlay
                    continue
                ls_color = sorted(d_overlay.keys())
                # any image of the round gives the page shape
                a_size = skimage.io.imread(d_overlay[ls_color[0]])
                a_overlay = np.zeros((len(d_overlay), a_size.shape[0], a_size.shape[1]), dtype=np.uint8)
                for i, s_color in enumerate(ls_color):
                    a_overlay[i, :, :] = skimage.io.imread(d_overlay[s_color])
                s_biomarker_all = '.'.join(d_overlay[s_color].split('_')[1] for s_color in ls_color)
                #this works. Open in image j. use Image/Color/Make Composite. Then use
                #Image/Color/Channels Tool to turn on and off channels
                #use Image/Adjust/Brightness/Contrast to adjust
                with skimage.external.tifffile.TiffWriter(f'{s_round}_{s_biomarker_all}_{s_slide}_overlay.tiff', imagej=True) as tif:
                    for i in range(a_overlay.shape[0]):
                        tif.save(a_overlay[i])
    finally:
        os.chdir(s_cwd)
len((df_slide[df_slide.marker==s_combo]).index) > 1: + print('Error: too many marker images found') + else: + print(s_filename) + d_overlay.update({s_combo:s_filename}) + d_overlay.update({'1AAADAPI':s_image_round}) + a_size = skimage.io.imread(s_image_round) + a_overlay = np.zeros((len(d_overlay),a_size.shape[0],a_size.shape[1]),dtype=np.uint8) + s_biomarker_all = '' + i = -1 + for s_color in sorted(d_overlay.keys()): + i = i + 1 + s_overlay= d_overlay[s_color] + s_biomarker = s_color.split('1AAA')[0] + '.' + s_biomarker_all = s_biomarker_all + s_biomarker + a_channel = skimage.io.imread(s_overlay) + if s_imagetype=='ORG': + a_channel = (a_channel/256).astype(np.uint8) + print('covert to 8 bit') + a_overlay[i,:,:] = a_channel + s_biomarker_all = s_biomarker_all[1:-1] + #this works. Open in image j. use Image/Color/Make Composite. Then use + #Image/Color/Channels Tool to turn on and off channels + #use Image/Adjust/Brightness/Contrast to adjust + with skimage.external.tifffile.TiffWriter(f'./{s_type}_{((df_dapi[df_dapi.scene==s_slide]).marker[0])}.{s_biomarker_all}_{s_slide}_overlay.tiff', imagej=True) as tif: + for i in range(a_overlay.shape[0]): + tif.save(a_overlay[i]) + print(f'saved {s_type}') + +def custom_crop_overlays(d_combos,d_crop, df_img,s_dapi, tu_dim=(1000,1000)): #df_dapi, + """ + output custon multi page tiffs according to dictionary, with s_dapi as channel 1 in each overlay + BUG with 53BP1 + d_crop : {slide_scene : (x,y) coord + tu_dim = (width, height) + d_combos = {'Immune':{'CD45', 'PD1', 'CD8', 'CD4', 'CD68', 'FoxP3','GRNZB','CD20','CD3'}, + 'Stromal':{'Vim', 'aSMA', 'PDPN', 'CD31', 'ColIV','ColI'}, + 'Differentiation':{'CK19', 'CK7','CK5', 'CK14', 'CK17','CK8'}, + 'Tumor':{'HER2', 'Ecad', 'ER', 'PgR','Ki67','PCNA'}, + 'Proliferation':{'EGFR','CD44','AR','pHH3','pRB'}, + 'Functional':{'pS6RP','H3K27','H3K4','cPARP','gH2AX','pAKT','pERK'}, + 'Lamins':{'LamB1','LamAC', 'LamB2'}} + """ + #os.chdir('./AFSubtracted') + + ls_slide = 
list(set(df_img.scene)) + #now make overlays + for s_slide, xy_cropcoor in d_crop.items(): + print(f'Processing {s_slide}') + df_slide = df_img[df_img.scene==s_slide] + s_image_round = df_slide[df_slide.marker==s_dapi.split('_')[0]].index[0] + if len(df_slide[df_slide.marker==s_dapi.split('_')[0]].index) == 0: + print('Error: dapi not found') + elif len(df_slide[df_slide.marker==s_dapi.split('_')[0]].index) > 1: + print('Error: too many dapi images found') + else: + print(s_image_round) + #exclude any missing biomarkers + es_all = set(df_slide.marker) + if len(list(set(df_img.imagetype)))==1: + s_imagetype = list(set(df_img.imagetype))[0] + print(s_imagetype) + else: + print('Error: more than one image type)') + for s_type, es_combos in d_combos.items(): + d_overlay = {} + es_combos_shared = es_combos.intersection(es_all) + for idx, s_combo in enumerate(sorted(es_combos_shared)): + s_filename = (df_slide[df_slide.marker==s_combo]).index[0] + if len((df_slide[df_slide.marker==s_combo]).index) == 0: + print('Error: marker not found') + elif len((df_slide[df_slide.marker==s_combo]).index) > 1: + print('Error: too many marker images found') + else: + print(s_filename) + d_overlay.update({s_combo:s_filename}) + d_overlay.update({'1AAADAPI':s_image_round}) + a_size = skimage.io.imread(s_image_round) + #crop + a_crop = a_size[(xy_cropcoor[1]):(xy_cropcoor[1]+tu_dim[1]),(xy_cropcoor[0]):(xy_cropcoor[0]+tu_dim[0])] + a_overlay = np.zeros((len(d_overlay),a_crop.shape[0],a_crop.shape[1]),dtype=np.uint8) + s_biomarker_all = '' + i = -1 + for s_color in sorted(d_overlay.keys()): + i = i + 1 + s_overlay= d_overlay[s_color] + s_biomarker = s_color.split('1AAA')[0] + '.' 
def make_thresh_df(df_out, ls_drop=None):
    """
    makes a thresholding csv matching the output dataframe (df_out)'s scenes and biomarkers

    df_out = dataframe with a 'scene' column and '<marker>_<location>' columns
    ls_drop = column names of df_out to leave out (each must be a column)
    return: empty (all-NaN) dataframe with one row per scene plus 'global_manual',
        and one '<marker>_manual' column per marker; a marker that appears at
        several locations gets a single column
    """
    ls_scene = sorted(set(df_out.scene) | {'global_manual'})
    ls_biomarker = df_out.columns.tolist()
    ls_biomarker.remove('scene')
    if ls_drop is not None:
        for s_drop in ls_drop:
            ls_biomarker.remove(s_drop)
    # a set collapses e.g. ER_Ring and ER_Nuclei into one ER_manual column
    ls_manual = sorted({s_biomarker.split('_')[0] + '_manual' for s_biomarker in ls_biomarker})
    df_thresh = pd.DataFrame(index=ls_scene, columns=ls_manual)
    return df_thresh
{slide_color:number of rounds found} + images of all rounds of a certain slide_color + """ + d_result = {} + #if b_mkdir: + # os.mkdir(f'./Check_Registration') + for s_find in ls_find: + #find all dapi slides + ls_dapis = [] + for s_dir in os.listdir(): + if s_dir.find(s_find) > -1: + ls_dapis = ls_dapis + [s_dir] + ls_dapis.sort() + + #find all unique scenes + ls_scene_long = [] + for s_dapi in ls_dapis: + ls_scene_long = ls_scene_long + [(s_dapi.split('-')[0])] + ls_scene = list(set(ls_scene_long)) + ls_scene.sort() + fig,ax = plt.subplots(i_rows,(len(ls_scene)+(i_rows-1))//i_rows, figsize = t_figsize, squeeze=False) + ax = ax.ravel() + for idx, s_scene in enumerate(ls_scene): + print(f'Processing {s_scene}') + im_low = skimage.io.imread(ls_dapis[idx])#,plugin='simpleitk' + im = skimage.exposure.rescale_intensity(im_low,in_range=(np.quantile(im_low,0.02),np.quantile(im_low,0.98)+np.quantile(im_low,0.98)/2)) + im = skimage.transform.rescale(im, 0.25, anti_aliasing=False) + ax[idx].imshow(im) #, cmap='gray' + ax[idx].set_title(s_scene,{'fontsize':12}) + plt.tight_layout() + #fig.savefig(f'../Check_Registration/{s_sample}_{s_find}.png') + d_result.update({f'{s_sample}_{s_find}.png':fig}) + return(d_result) diff --git a/mplex_image/register.py b/mplex_image/register.py new file mode 100755 index 0000000..b963866 --- /dev/null +++ b/mplex_image/register.py @@ -0,0 +1,105 @@ +import numpy as np +from PIL import Image +from matplotlib import pyplot as plt +from skimage import transform, util +from skimage import data, img_as_float +from skimage.util import img_as_ubyte +import cv2 +import sys + +# code from adapted chandler gatenbee and brian white +# https://github.com/IAWG-CSBC-PSON/registration-challenge + +def match_keypoints(moving, target, feature_detector): + ''' + :param moving: image that is to be warped to align with target image + :param target: image to which the moving image will be aligned + :param feature_detector: a feature detector from opencv + 
def apply_transform(moving, target, moving_pts, target_pts, transformer, output_shape_rc=None):
    '''
    Estimate `transformer` from matched keypoints and warp `moving` onto `target`.

    :param moving: image that is to be warped to align with target image
    :param target: image to which the moving image will be aligned
    :param moving_pts: keypoints in the moving image
    :param target_pts: matching keypoints in the target image
    :param transformer: transformer object from skimage. See https://scikit-image.org/docs/dev/api/skimage.transform.html for different transformations
    :param output_shape_rc: shape of warped image (row, col). If None, uses shape of target image
    :return: (warped image, moving keypoints mapped by the estimated transform)
    '''
    if output_shape_rc is None:
        output_shape_rc = target.shape[:2]

    # The original test compared str(transformer.__class__) with an empty string,
    # which is never true, so this branch could not run.
    if isinstance(transformer, transform.PolynomialTransform):
        # NOTE(review): the polynomial transform is fitted target -> moving and
        # passed to warp directly as the inverse map, presumably because it has
        # no usable .inverse; confirm against the scikit-image docs.
        transformer.estimate(target_pts, moving_pts)
        warped_img = transform.warp(moving, transformer, output_shape=output_shape_rc)

        ### Re-estimate in the forward direction to warp points
        transformer.estimate(moving_pts, target_pts)
        warped_pts = transformer(moving_pts)
    else:
        transformer.estimate(moving_pts, target_pts)
        warped_img = transform.warp(moving, transformer.inverse, output_shape=output_shape_rc)
        warped_pts = transformer(moving_pts)

    return warped_img, warped_pts
ax[1][1].scatter(warped_pts[:,0], -warped_pts[:,1]) + plt.savefig(f"../../QC/RegistrationPlots/{s_sample}_{s_round}_rigid_align.png", format="PNG") + return(moving_pts, target_pts, transformer) diff --git a/mplex_image/segment.py b/mplex_image/segment.py new file mode 100755 index 0000000..972742a --- /dev/null +++ b/mplex_image/segment.py @@ -0,0 +1,717 @@ +#### +# title: segment.py +# +# language: Python3.7 +# date: 2020-06-00 +# license: GPL>=v3 +# author: Jenny +# +# description: +# python3 script for cell segmentation +#### +import time +import cellpose +from cellpose import models +from PIL import Image +Image.MAX_IMAGE_PIXELS = 1000000000 + +import os +import skimage +import pandas as pd +import numpy as np +import sys +import scipy +from scipy import stats +from scipy import ndimage as ndi +from skimage import io, filters +from skimage import measure, segmentation, morphology +from numba import jit, types +from numba.extending import overload +from numba.experimental import jitclass +import numba +import mxnet as mx +import stat +from mxnet import nd +from mplex_image import preprocess + +#set src path (CHANGE ME) +s_src_path = '/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF' + +#functions + +def gpu_device(): + try: + _ = mx.nd.array([1, 2, 3], ctx=mx.gpu()) + mx_gpu = mx.gpu() + except mx.MXNetError: + return None + return mx_gpu + +def cellpose_nuc(key,dapi,diameter=30): + ''' + smallest nuclei are about 9 pixels, lymphocyte is 15 pixels, tumor is 25 pixels + using 20 can capture large tumor cells, without sacrificing smaller cells, + ''' + try: + nd_array = mx.nd.array([1, 2, 3], ctx=mx.gpu()) + print(nd_array) + mx_gpu = mx.gpu() + except mx.MXNetError: + print('Mxnet error') + mx_gpu = None + model = models.Cellpose(model_type='nuclei',device=mx_gpu) + newkey = f"{key.split(' - Z')[0]} nuclei{diameter}" + print(f"modelling {newkey}") + channels = [0,0] + print(f'Minimum nuclei size = {int(np.pi*(diameter/10)**2)}') + masks, flows, styles, 
def parse_org(s_end="ORG.tif", s_start='R'):
    """
    This function will parse images following koei's naming convention
    Example: Registered-R1_PCNA.CD8.PD1.CK19_Her2B-K157-Scene-002_c1_ORG.tif
    The output is a dataframe with image filename in index (sorted by name)
    And rounds, color, slide, marker_string, scene (/tissue), path and marker in the columns

    s_end = required filename ending
    s_start = required filename start
    Files are read from the current working directory.
    A file whose color is not c1..c7 gets marker None (and 'Error' is printed).
    """
    s_path = os.getcwd()
    # sorted so the row order does not depend on the directory listing order
    ls_file = sorted(s_file for s_file in os.listdir()
                     if s_file.endswith(s_end) and s_file.startswith(s_start))
    df_img = pd.DataFrame(index=ls_file)
    df_img['rounds'] = [item.split('_')[0].split('Registered-')[1] for item in df_img.index]
    df_img['color'] = [item.split('_')[-2] for item in df_img.index]
    df_img['slide'] = [item.split('_')[2] for item in df_img.index]
    df_img['marker_string'] = [item.split('_')[1] for item in df_img.index]
    try:
        df_img['scene'] = [item.split('-Scene-')[1] for item in df_img.slide]
    except IndexError:
        # at least one slide name has no '-Scene-' part: treat all as one scene
        df_img['scene'] = '001'
    df_img['path'] = [f"{s_path}/{item}" for item in df_img.index]
    # parse file name for biomarker: position of the marker in the
    # '.'-separated marker string for each color
    # c6 and c7 are only included in sardana shading corrected images
    d_position = {'c2': 0, 'c3': 1, 'c4': 2, 'c5': 3, 'c6': 2, 'c7': 3}
    ls_marker = []
    for s_index in df_img.index:
        s_color = df_img.loc[s_index, 'color']
        if s_color == 'c1':
            s_marker = 'DAPI'
        elif s_color in d_position:
            s_marker = s_index.split('_')[1].split('.')[d_position[s_color]]
        else:
            # previously the marker of the previous file was reused here
            print('Error')
            s_marker = None
        ls_marker.append(s_marker)
    df_img['marker'] = pd.Series(ls_marker, index=df_img.index, dtype=object)
    return df_img
def load_img(subdir, s_find, s_sample, s_scene, ls_seg_markers, ls_rare):
    '''
    load dapi round and cell segmentation images

    subdir = folder holding one sub-folder of registered images per slide
    s_find = text contained in the filename of the DAPI image to use
        (matched literally, not as a regular expression)
    s_sample = text identifying the sample's sub-folder(s)
    s_scene = scene to load marker images for
    ls_seg_markers = markers used for the cytoplasm projection
    ls_rare = markers rescaled with a higher upper quantile (0.99999)
    return: (d_dapi, d_cyto), each keyed by '<s_sample>-<s_scene>':
        the rescaled DAPI image, and a 3-channel stack of
        (zeros, max projection of the marker images, DAPI)
    raises: FileNotFoundError if no folder has exactly one image matching s_find
    The working directory is left at subdir.
    '''
    #image dataframe
    os.chdir(subdir)
    s_root = os.getcwd()
    ls_df_markers = []
    dapi = None
    for s_dir in os.listdir():
        if s_dir.find(s_sample) == -1 or not os.path.isdir(s_dir):
            continue
        os.chdir(s_dir)
        try:
            df_img = parse_org()
            # copy so the path column is written to an independent frame
            df_markers = df_img[df_img.marker.isin(ls_seg_markers)].copy()
            df_markers['path'] = [f'{subdir}/{s_dir}/{item}' for item in df_markers.index]
            b_dapi = df_img.index.str.contains(s_find, regex=False)
            if b_dapi.sum() == 1:
                dapi = io.imread(df_img.index[b_dapi][0])
        finally:
            os.chdir(s_root)
        ls_df_markers.append(df_markers)
    # DataFrame.append was removed in pandas 2.0; concat works on old and new versions
    df_seg = pd.concat(ls_df_markers) if ls_df_markers else pd.DataFrame()
    if dapi is None:
        raise FileNotFoundError(f'no unique DAPI image containing {s_find} found for {s_sample} in {subdir}')

    #load z_projection DAPIs
    d_dapi = {}
    d_cyto = {}

    dapi_scale = skimage.exposure.rescale_intensity(dapi,in_range=(np.quantile(dapi,0.03),1.5*np.quantile(dapi,0.9999)))
    d_dapi.update({f"{s_sample}-{s_scene}":dapi_scale})
    imgs = []
    #images
    df_common = df_seg[(df_seg.scene==s_scene) & (~df_seg.marker.isin(ls_rare))]
    df_rare = df_seg[(df_seg.scene==s_scene) & (df_seg.marker.isin(ls_rare))]
    for s_path in df_common.path:
        print(s_path)
        img = io.imread(s_path)
        img_scale = skimage.exposure.rescale_intensity(img,in_range=(np.quantile(img,0.03),1.5*np.quantile(img,0.9999)))
        imgs.append(img_scale)
    for s_path in df_rare.path:
        img = io.imread(s_path)
        img_scale = skimage.exposure.rescale_intensity(img,in_range=(np.quantile(img,0.03),1.5*np.quantile(img,0.99999)))
        imgs.append(img_scale)
    mip = np.stack(imgs).max(axis=0)
    # NOTE(review): the stack uses the raw dapi, not dapi_scale, whereas
    # load_stack stacks its rescaled DAPI; confirm which is intended
    zdh = np.dstack((np.zeros(mip.shape),mip,dapi)).astype('uint16')
    d_cyto.update({f"{s_sample}-{s_scene}":zdh})
    print(f'Number of images = {len(d_dapi)} dapi projections ({len(d_cyto)} cytoplasm projections) ')

    return (d_dapi, d_cyto)
def segment_spawner(s_sample,segdir,regdir,nuc_diam=30,cell_diam=30,s_type='nuclei',s_seg_markers="['Ecad']",s_job='short',s_match='both'):
    '''
    write and submit one cellpose segmentation job per scene of a sample
    for every folder in regdir whose name contains s_sample, the images are
    parsed with parse_org; for each scene the job scripts are written with
    cellpose_segment_job and the shell script is submitted with sbatch
    Input:
      s_sample: sample name
      segdir: segmentation directory; scripts go to {segdir}/{s_sample}Cellpose_Segmentation
      regdir: directory with the registered images
      nuc_diam, cell_diam, s_type, s_seg_markers: passed on to cellpose_segment_job
      s_job='gpu' or 'long' (default = 'short')
      s_match= 'seg' or 'match' (default = 'both')
    a scene is skipped (nothing submitted) when neither {regdir}/{s_sample}-Scene-{scene}
    nor {regdir}/{s_sample} exists, because no job script is written in that case
    Side effect: changes the working directory
    '''
    import subprocess  # local import: only needed to submit the job

    s_jobdir = f'{segdir}/{s_sample}Cellpose_Segmentation'
    preprocess.cmif_mkdir([s_jobdir])
    os.chdir(f'{regdir}')
    for s_file in os.listdir():
        if s_file.find(s_sample) == -1:
            continue
        os.chdir(f'{regdir}/{s_file}')
        print(f'Processing {s_file}')
        df_img = parse_org()
        for s_scene in sorted(set(df_img.scene)):
            s_slide_scene= f'{s_sample}-Scene-{s_scene}'
            #round 1, channel 1 image of the scene is handed to the job as s_find
            s_find = df_img[(df_img.rounds=='R1') & (df_img.color=='c1') & (df_img.scene==s_scene)].index[0]
            #images are either in a per scene folder or in a per sample folder
            if os.path.exists(f'{regdir}/{s_slide_scene}'):
                s_imgdir = f'{regdir}/{s_slide_scene}'
            elif os.path.exists(f'{regdir}/{s_sample}'):
                s_imgdir = f'{regdir}/{s_sample}'
            else:
                print(f'No image folder found for {s_slide_scene}, job not submitted')
                continue
            cellpose_segment_job(s_file,s_slide_scene,s_find,s_jobdir,s_imgdir,nuc_diam,cell_diam,s_type,s_seg_markers,s_job=s_job, s_match=s_match)
            os.chdir(s_jobdir)
            #argument list instead of a shell string, so names with spaces or
            #shell characters are passed to sbatch unchanged
            try:
                subprocess.run(['sbatch', f'cellpose_{s_type}_{s_slide_scene}.sh'], check=False)
            except FileNotFoundError:
                print('sbatch not found, job not submitted')
            time.sleep(4)
            print('Next')
def nuc_to_cell_new(labels,cell_labels):
    '''
    associate the largest nucleus contained in each cell segmentation
    (regionprops based version of nuc_to_cell)
    Input:
        labels: nuclear labels
        cell_labels: cell labels that need to be matched
    Ouput:
        container: numba container of key-value pairs of old-new cell IDs
        d_replace: python dictionary of the same key-value pairs
        df_prop: dataframe of the region properties (intensity_image, image, label)
    cells without any nucleus pixel are mapped to 0; for several nuclei the
    label covering the most pixels wins (smallest label on ties)
    '''
    start = time.time()
    #dominant nuclei
    props = measure.regionprops_table(cell_labels,labels, properties=(['intensity_image','image','label']))
    df_prop = pd.DataFrame(props)
    d_replace = {}
    for idx in df_prop.index[::-1]:
        label_id = df_prop.loc[idx,'label']
        intensity_image = df_prop.loc[idx,'intensity_image']
        image = df_prop.loc[idx,'image']
        #parentheses needed: "&" binds tighter than "!=", so without them the
        #mask was (image & intensity_image) != 0, which keeps only odd nucleus labels
        nuc_labels = intensity_image[image & (intensity_image != 0)]
        if len(nuc_labels) == 0:
            d_replace.update({label_id:0})
        else:
            #most common label; np.unique sorts, argmax takes the first maximum,
            #i.e. the smallest label on ties (same choice as scipy.stats.mode)
            a_id, a_count = np.unique(nuc_labels, return_counts=True)
            d_replace.update({label_id:a_id[np.argmax(a_count)]})
    #same replacement order as nuc_to_cell (descending new ID)
    d_replace = {item[0]:item[1] for item in sorted(d_replace.items(), key=lambda x: x[1], reverse=True)}

    #convert to numba container
    container = ContainerHolder()
    for key, value in d_replace.items():
        container.d[key] = value
    end = time.time()
    print(end - start)
    return(container,d_replace, df_prop)
def nuc_to_cell_watershed(labels,cell_labels,i_small=200):
    '''
    associate the largest nucleus contained in each cell segmentation
    cells are shrunk by their outer boundary and objects smaller than i_small
    are removed; nuclei are expanded by watershed inside the remaining cell
    foreground; each cell gets the most common expanded nucleus label
    Input:
        labels: nuclear labels
        cell_labels: cell labels that need to be matched
        i_small: minimum object size passed to remove_small_objects
    Ouput:
        container: numba container of key-value pairs of old-new cell IDs
    '''
    #cells
    cell_boundaries = segmentation.find_boundaries(cell_labels,mode='outer')
    shrunk_cells = cell_labels.copy()
    shrunk_cells[cell_boundaries] = 0
    foreground = shrunk_cells != 0
    foreground_cleaned = morphology.remove_small_objects(foreground, i_small)
    background = ~foreground_cleaned
    shrunk_cells[background] = 0

    #nuclei
    cut_labels = labels.copy()
    cut_labels[background] = 0
    labels_in = morphology.remove_small_objects(cut_labels, i_small)
    distance = ndi.distance_transform_edt(foreground_cleaned)
    labels_out = segmentation.watershed(-distance, labels_in, mask=foreground_cleaned)

    #dominant nuclei
    d_replace = {}
    #iterate over the cell labels themselves (highest first); iterating over
    #row positions of a property table tested label 0 and missed the highest labels
    for i_cell in np.unique(shrunk_cells)[::-1]:
        if i_cell == 0:
            continue
        #all watershed expanded nucleus labels inside this cell
        cell_array = labels_out[shrunk_cells == i_cell]
        a_id, a_count = np.unique(cell_array, return_counts=True)
        if len(a_id) == 0:
            d_replace.update({int(i_cell):0})
        else:
            #most common label, smallest label on ties
            d_replace.update({int(i_cell):int(a_id[np.argmax(a_count)])})
    #convert to numba container
    container = ContainerHolder()
    for key, value in d_replace.items():
        container.d[key] = value

    return(container)
def load_scene_z(subdir,dapidir,s_sample,s_scene,ls_seg_markers,ls_rare):
    '''
    load dapi projection and cell segmentation images
    Input:
      subdir: directory holding one image folder per sample; it is used with
        os.chdir and to build image paths, so an absolute path is expected
      dapidir: directory holding the "<s_scene> - ZProjectionDAPI.png" files
      s_sample: sample name; every folder in subdir whose name contains it is parsed
      s_scene: scene string as it appears in the DAPI filename; its second
        space separated token (cut at the first "_") is compared to the
        "scene" column returned by parse_org
      ls_seg_markers: markers combined into the cytoplasm projection
      ls_rare: markers that are rescaled with a higher upper quantile (0.999999)
    Output:
      d_dapi: {DAPI filename: rescaled DAPI projection}
      d_cyto: {DAPI filename: uint16 stack of (zeros, marker projection, DAPI)}
    Side effect: the working directory is left at dapidir
    '''
    #image dataframe
    os.chdir(subdir)
    ls_df = []
    for s_dir in os.listdir():
        if s_dir.find(s_sample) == -1:
            continue
        os.chdir(s_dir)
        df_img = parse_org()
        # .copy() so that adding the path column does not write into a slice of df_img
        df_markers = df_img[df_img.marker.isin(ls_seg_markers)].copy()
        df_markers['path'] = [f'{subdir}/{s_dir}/{item}' for item in df_markers.index]
        os.chdir('..')
        ls_df.append(df_markers)
    df_seg = pd.concat(ls_df) if ls_df else pd.DataFrame()

    #load z_projection DAPIs
    os.chdir(dapidir)
    d_dapi = {}
    d_cyto = {}
    s_dapi_name = f'{s_scene} - ZProjectionDAPI.png'
    for s_file in sorted(os.listdir()):
        if s_file.find(s_dapi_name) == -1:
            continue
        dapi = io.imread(s_file)
        dapi_scale = skimage.exposure.rescale_intensity(dapi,in_range=(np.quantile(dapi,0.03),1.5*np.quantile(dapi,0.9999)))
        d_dapi.update({s_file:dapi_scale})
        #scene ID kept in its own variable so the filename search above does
        #not change after the first match
        s_scene_id = s_scene.split(' ')[1].split('_')[0]
        print(s_scene_id)
        #both masks come from df_seg; masks taken from the last parsed folder
        #only do not line up with df_seg when several folders match
        b_scene = df_seg.scene == s_scene_id
        b_rare = df_seg.marker.isin(ls_rare)
        imgs = []
        for s_path in df_seg[b_scene & ~b_rare].path:
            img = io.imread(s_path)
            imgs.append(skimage.exposure.rescale_intensity(img,in_range=(np.quantile(img,0.03),1.5*np.quantile(img,0.9999))))
        for s_path in df_seg[b_scene & b_rare].path:
            img = io.imread(s_path)
            imgs.append(skimage.exposure.rescale_intensity(img,in_range=(np.quantile(img,0.03),1.5*np.quantile(img,0.999999))))
        if not imgs:
            raise ValueError(f'no images of markers {ls_seg_markers} found for scene {s_scene_id}')
        mip = np.stack(imgs).max(axis=0)
        # NOTE(review): the unscaled dapi (not dapi_scale) goes into the stack,
        # as in the original code - confirm that this is intended
        zdh = np.dstack((np.zeros(mip.shape),mip,dapi)).astype('uint16')
        d_cyto.update({s_file:zdh})
    print(f'Number of images = {len(d_dapi)} dapi projections ({len(d_cyto)} cytoplasm projections) ')

    return(d_dapi,d_cyto)
+viewer.add_labels(labels,blending='additive') +viewer.add_labels(cell_labels,blending='additive') +viewer.add_image(cyto_img,blending='additive') +viewer.add_image(dapi_img,blending='additive',colormap='blue') +#cell_boundaries = segmentation.find_boundaries(cell_labels,mode='outer') +#viewer.add_labels(cell_boundaries,blending='additive') +#nuclear_boundaries = segmentation.find_boundaries(labels,mode='outer') +#viewer.add_labels(nuclear_boundaries,blending='additive',num_colors=2) +closing = skimage.morphology.closing(cell_labels) +viewer.add_labels(closing,blending='additive') +container = nuc_to_cell(labels,closing)#cell_labels) + +#matched cell labels +cells_relabel = relabel_numba(container[0],closing) +#remove background +mode = stats.mode(cells_relabel,axis=0)[0][0][0] +black = cells_relabel.copy() +black[black==mode] = 0 +viewer.add_labels(black,blending='additive') +cell_boundaries = segmentation.find_boundaries(cells_relabel,mode='outer') +viewer.add_labels(cell_boundaries,blending='additive') +#ring +overlap = black==labels +viewer.add_labels(overlap, blending='additive') +#cytoplasm +ring_rep = black.copy() +ring_rep[overlap] = 0 +viewer.add_labels(ring_rep, blending='additive') +#membrane +rim_labels = contract_membrane(black) +viewer.add_labels(rim_labels, blending='additive') + +#expanded nucleus +__,__,peri_nuc = expand_nuc(labels,distance=3) +viewer.add_labels(peri_nuc, blending='additive') +''' \ No newline at end of file diff --git a/mplex_image/visualize.py b/mplex_image/visualize.py new file mode 100755 index 0000000..3cbdf35 --- /dev/null +++ b/mplex_image/visualize.py @@ -0,0 +1,387 @@ +#### +# title: analyze.py +# +# language: Python3.6 +# date: 2019-05-00 +# license: GPL>=v3 +# author: Jenny +# +# description: +# python3 library to visualize cyclic data and analysis +#### + +#load libraries +import matplotlib as mpl +import matplotlib.pyplot as plt +import pandas as pd +import numpy as np +import os +import skimage +from skimage import 
io, segmentation +import tifffile +import copy +import napari +import seaborn as sns +from sklearn.cluster import KMeans +from sklearn.preprocessing import scale +import random +import copy +from scipy.ndimage import distance_transform_edt + +#napari +def load_crops(viewer,s_crop,s_tissue): + ls_color = ['blue','green','yellow','red','cyan','magenta','gray','green','yellow','red','cyan','magenta','gray', + 'green','yellow','red','cyan','magenta','gray','gray','gray','gray','gray','gray','gray','gray'] + print(s_crop) + #viewer = napari.Viewer() + for s_file in os.listdir(): + if s_file.find(s_tissue)>-1: + if s_file.find(s_crop) > -1: + if s_file.find('ome.tif') > -1: + with tifffile.TiffFile(s_file) as tif: + array = tif.asarray() + omexml_string = tif.ome_metadata + for idx in range(array.shape[0]): + img = array[idx] + i_begin = omexml_string.find(f'Channel ID="Channel:0:{idx}" Name="') + i_end = omexml_string[i_begin:].find('" SamplesPerPixel') + s_marker = omexml_string[i_begin + 31:i_begin + i_end] + if s_marker.find('utf-8') == 0: + s_marker = 'DAPI1' + print(s_marker) + viewer.add_image(img,name=s_marker,rgb=False,visible=False,blending='additive',colormap=ls_color[idx],contrast_limits = (np.quantile(img,0),(np.quantile(img,0.9999)+1)*1.5)) + elif s_file.find('SegmentationBasins') > -1: + label_image = io.imread(s_file) + viewer.add_labels(label_image, name='cell_seg',blending='additive',visible=False) + cell_boundaries = segmentation.find_boundaries(label_image,mode='outer') + viewer.add_labels(cell_boundaries,blending='additive',visible=False) + else: + label_image = np.array([]) + print('') + return(label_image) + +def load_marker(viewer,s_crop,s_tissue,ls_marker=[]): + ls_color = ['blue','green','yellow','red','cyan','magenta','gray','green','yellow','red','cyan','magenta', + 'gray','gray','gray','gray','gray','gray','gray','gray'] + print(s_crop) + ls_marker_all = copy.copy(ls_marker) + for s_file in os.listdir(): + if s_file.find(s_tissue)>-1: + if 
def expand_labels(label_image, distance=1):
    """Grow the labels of a label image by up to ``distance`` pixels.

    Every background (zero) pixel whose Euclidean distance to the nearest
    labelled pixel is <= ``distance`` takes the label of that nearest pixel,
    so regions grow outwards without running into each other. Labelled pixels
    keep their label. Works for label arrays of any dimension.

    Parameters
    ----------
    label_image : ndarray of dtype int
        label image, 0 is background
    distance : float
        Euclidean distance in pixels by which to grow the labels. Default is one.
    Returns
    -------
    enlarged_labels : ndarray of dtype int
        array of the same shape and dtype as ``label_image`` with the grown labels
    """
    # distance to, and index of, the nearest labelled pixel for every pixel
    edt, nearest_idx = distance_transform_edt(label_image == 0, return_indices=True)
    within_reach = edt <= distance
    # one index array per image axis, restricted to the pixels that get a label
    source_coords = tuple(axis_idx[within_reach] for axis_idx in nearest_idx)
    expanded = np.zeros_like(label_image)
    expanded[within_reach] = label_image[source_coords]
    return expanded
def threshold_postive(df_thresh,df_mi):
    '''
    make a boolean (positive/negative) dataframe from manual thresholds
    for every scene in the index of df_thresh and every float64 column
    "<marker>_<location>" of df_mi, the local threshold ("<marker>_local") is
    used if it is not NaN, otherwise the global one ("<marker>_global");
    a cell is positive when its value is >= the threshold
    note: this will break if there are two biomarker locations
    Input:
      df_thresh: thresholds, index = scenes, columns = "<marker>_global" / "<marker>_local"
      df_mi: mean intensity per cell, with a "slide_scene" column
    Output:
      d_thresh_record: {"<scene>_<marker>": threshold used (NaN if none)}
      df_pos: boolean dataframe of the thresholded columns, scenes concatenated
    markers without a threshold are not added to df_pos and are printed
    '''
    ls_scene = sorted(df_thresh.index.tolist())
    ls_sub = df_mi.columns[df_mi.dtypes=='float64'].tolist()
    # only threshold markers in .csv
    es_thresh_marker = {item.split('_')[0] for item in df_thresh.columns}
    ls_other = []
    ls_df = []
    d_thresh_record= {}
    for s_scene in ls_scene:
        ls_index = df_mi[df_mi.slide_scene==s_scene].index
        df_scene = pd.DataFrame(index=ls_index)
        for s_marker_loc in ls_sub:
            s_marker = s_marker_loc.split('_')[0]
            if s_marker not in es_thresh_marker:
                ls_other.append(s_marker)
                continue
            #first check if local threshold exists, otherwise use global
            if not pd.isna(df_thresh.loc[s_scene,f'{s_marker}_local']):
                i_thresh = df_thresh.loc[s_scene,f'{s_marker}_local']
            elif not pd.isna(df_thresh.loc[s_scene,f'{s_marker}_global']):
                i_thresh = df_thresh.loc[s_scene,f'{s_marker}_global']
            else:
                ls_other.append(s_marker)
                i_thresh = np.nan
            if not pd.isna(i_thresh):
                df_scene[s_marker_loc] = df_mi.loc[ls_index,s_marker_loc] >= i_thresh
            d_thresh_record.update({f'{s_scene}_{s_marker}':i_thresh})
        ls_df.append(df_scene)
    #pd.concat instead of DataFrame.append (removed in pandas 2.0)
    df_pos = pd.concat(ls_df) if ls_df else pd.DataFrame()
    print(f'Did not threshold {set(ls_other)}')
    return(d_thresh_record,df_pos)
#plot negative cells + ax[ax_num].scatter(data=df_neg,x='DAPI_X',y='DAPI_Y',color='silver',s=1) + #plot positive cells + ax[ax_num].scatter(data=df_color_pos, x='DAPI_X',y='DAPI_Y',color='DarkBlue',s=.5) + + ax[ax_num].axis('equal') + ax[ax_num].set_ylim(ax[ax_num].get_ylim()[::-1]) + ax[ax_num].set_title(f'{s_marker} min={int(s_min)} ({len(df_color_pos)} cells)') + else: + ax[ax_num].set_title(f'{s_marker} min={(s_min)} ({(0)} cells') + fig.suptitle(s_scene) + ls_fig.append(fig) + if b_save: + fig.savefig(f'./SpatialPlots/{s_scene}_{s_type}_manual.png') + return(ls_fig) + +#gating analysis +def prop_positive(df_data,s_cell,s_grouper): + #df_data['countme'] = True + df_cell = df_data.loc[:,[s_cell,s_grouper,'countme']].dropna() + df_prop = (df_cell.groupby([s_cell,s_grouper]).countme.count()/df_cell.groupby([s_grouper]).countme.count()).unstack().T + return(df_prop) + +def prop_clustermap(df_prop,df_annot,i_thresh,lut,figsize=(10,5)): + for s_index in df_prop.index: + s_subtype = df_annot.loc[s_index,'ID'] # + df_prop.loc[s_index, 'ID'] = s_subtype + species = df_prop.pop("ID") + row_colors = species.map(lut) + + #clustermap plot wihtout the low values -drop less than i_threh % of total + df_plot = df_prop.fillna(0) + if i_thresh > 0: + df_plot_less = df_plot.loc[:,df_plot.sum()/len(df_plot) > i_thresh] + i_len = len(df_prop) + i_width = len(df_plot_less.columns) + g = sns.clustermap(df_plot_less,figsize=figsize,cmap='viridis',row_colors=row_colors) + return(g,df_plot_less) + +def prop_barplot(df_plot_less,s_cell,colormap="Spectral",figsize=(10,5),b_sort=True): + i_len = len(df_plot_less) + i_width = len(df_plot_less.columns) + fig,ax = plt.subplots(figsize=figsize) + if b_sort: + df_plot_less = df_plot_less.sort_index(ascending=False) + df_plot_less.plot(kind='barh',stacked=True,width=.9, ax=ax,colormap=colormap) + ax.set_title(s_cell) + ax.set_xlabel('Fraction Positive') + ax.legend(bbox_to_anchor=(1.01, 1)) + plt.tight_layout() + return(fig) + +def 
plot_color_leg(lut,figsize = (2.3,3)): + #colors + series = pd.Series(lut) + df_color = pd.DataFrame(index=range(len(series)),columns=['subtype','color']) + + series.sort_values() + df_color['subtype'] = series.index + df_color['value'] = 1 + df_color['color'] = series.values + + fig,ax = plt.subplots(figsize = figsize,dpi=100) + df_color.plot(kind='barh',x='subtype',y='value',width=1,legend=False,color=df_color.color,ax=ax) + ax.set_xticks([]) + ax.set_ylabel('') + ax.set_title(f'subtype') + plt.tight_layout() + return(fig) + +#cluster analysis + +def cluster_kmeans(df_mi,ls_columns,k,b_sil=False): + ''' + log2 transform, zscore and kmens cluster + ''' + df_cluster_norm = df_mi.loc[:,ls_columns] + df_cluster_norm_one = df_cluster_norm + 1 + df_cluster = np.log2(df_cluster_norm_one) + + #select figure size + i_len = k + i_width = len(df_cluster.columns) + + #scale date + df_scale = scale(df_cluster) + + #kmeans cluster + kmeans = KMeans(n_clusters=k, random_state=0).fit(df_scale) + df_cluster.columns = [item.split('_')[0] for item in df_cluster.columns] + df_cluster[f'K{k}'] = list(kmeans.labels_) + g = sns.clustermap(df_cluster.groupby(f'K{k}').mean(),cmap="RdYlGn_r",z_score=1,figsize=(3+i_width/3,3+i_len/3)) + if b_sil: + score = silhouette_score(X = df_scale, labels=list(kmeans.labels_)) + else: + score = np.nan + return(g,df_cluster,score) + +def plot_clusters(df_cluster,df_xy,s_num='many'): + s_type = df_cluster.columns[df_cluster.dtypes=='int64'][0] + print(s_type) + ls_scene = sorted(set(df_cluster.slide_scene)) + ls_color = sorted(set(df_cluster.loc[:,s_type].dropna())) + d_fig = {} + for s_scene in ls_scene: + #negative cells = all cells even before dapi filtering + df_neg = df_xy[(df_xy.slide_scene==s_scene)] + #plot + if s_num == 'many': + fig, ax = plt.subplots(3, ((len(ls_color))+2)//3, figsize=(18,12),dpi=200) + else: + fig, ax = plt.subplots(2, 1, figsize=(7,4),dpi=200) + ax = ax.ravel() + for ax_num, s_color in enumerate(ls_color): + s_marker = 
s_color + #positive cells = poitive cells based on threshold + ls_pos_index = (df_cluster[df_cluster.loc[:,s_type]==s_color]).index + df_color_pos = df_neg[df_neg.index.isin(ls_pos_index)] + if len(df_color_pos)>=1: + #plot negative cells + ax[ax_num].scatter(data=df_neg,x='DAPI_X',y='DAPI_Y',color='silver',s=1) + #plot positive cells + ax[ax_num].scatter(data=df_color_pos, x='DAPI_X',y='DAPI_Y',color='DarkBlue',s=.5) + + ax[ax_num].axis('equal') + ax[ax_num].set_ylim(ax[ax_num].get_ylim()[::-1]) + if s_num == 'many': + ax[ax_num].set_xticklabels('') + ax[ax_num].set_yticklabels('') + else: + ax[0].set_xticklabels('') + ax[ax_num].set_title(f'{s_color} ({len(df_color_pos)} cells)') + else: + ax[ax_num].set_xticklabels('') + ax[ax_num].set_yticklabels('') + ax[ax_num].set_title(f'{s_color} ({(0)} cells') + + fig.suptitle(s_scene) + d_fig.update({s_scene:fig}) + return(d_fig)