Merge pull request #1 from jjc2718/cross_validation

Implement and test single-cancer cross-validation
greenelab · Aug 3, 2020 · d2047f8 · d2047f8
2 parents 03fc3a5 + 276f032
commit d2047f8
Show file tree

Hide file tree

Showing 14 changed files with 11,637 additions and 30 deletions.
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -0,0 +1,26 @@
+name: Python tests
+
+on: [push, pull_request]
+
+jobs:
+    run-tests:
+        name: Run Python tests
+        runs-on: ubuntu-latest
+
+        steps:
+            - name: Check out Git repository
+              uses: actions/checkout@v2
+
+            - name: Set up Python
+              uses: actions/setup-python@v1
+              with:
+                python-version: 3.6
+
+            - name: Setup conda environment
+              run: |
+                $CONDA/bin/conda env update --file environment.yml --name base
+                $CONDA/bin/pip install -e .
+
+            - name: Run tests
+              run: |
+                $CONDA/bin/pytest tests/
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,6 @@
 **/__pycache__/*
 **/.ipynb_checkpoints/*
+*.egg-info*
 
-data/*
+data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2-v2.geneExp.tsv
+data/*.gz
diff --git a/download_data.ipynb → 00_download_data.ipynb b/download_data.ipynb → 00_download_data.ipynb
@@ -19,7 +19,7 @@
     "import pandas as pd\n",
     "from urllib.request import urlretrieve\n",
     "\n",
-    "import config as cfg"
+    "import pancancer_utilities.config as cfg"
    ]
   },
   {
@@ -752,7 +752,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "expr_file = os.path.join(cfg.data_dir, 'tcga_expression_matrix_processed.tsv')\n",
+    "expr_file = os.path.join(cfg.data_dir, 'tcga_expression_matrix_processed.tsv.gz')\n",
     "tcga_expr_df.to_csv(expr_file, sep='\\t', compression='gzip', float_format='%.3g')"
    ]
   },
@@ -1430,6 +1430,288 @@
     "cancertype_count_df"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Subsample expression dataframe for unit testing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(1106, 100)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>gene_id</th>\n",
+       "      <th>613037</th>\n",
+       "      <th>347148</th>\n",
+       "      <th>10868</th>\n",
+       "      <th>51233</th>\n",
+       "      <th>10782</th>\n",
+       "      <th>1804</th>\n",
+       "      <th>9553</th>\n",
+       "      <th>2893</th>\n",
+       "      <th>55827</th>\n",
+       "      <th>6165</th>\n",
+       "      <th>...</th>\n",
+       "      <th>203102</th>\n",
+       "      <th>51259</th>\n",
+       "      <th>127534</th>\n",
+       "      <th>23398</th>\n",
+       "      <th>167153</th>\n",
+       "      <th>10276</th>\n",
+       "      <th>84146</th>\n",
+       "      <th>3490</th>\n",
+       "      <th>53373</th>\n",
+       "      <th>79991</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>sample_id</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>TCGA-D8-A27E-01</th>\n",
+       "      <td>12.0596</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>714.707</td>\n",
+       "      <td>1.4398</td>\n",
+       "      <td>257.433</td>\n",
+       "      <td>1.1518</td>\n",
+       "      <td>496.724</td>\n",
+       "      <td>0.2880</td>\n",
+       "      <td>1313.080</td>\n",
+       "      <td>7585.92</td>\n",
+       "      <td>...</td>\n",
+       "      <td>16.4135</td>\n",
+       "      <td>192.931</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>298.6110</td>\n",
+       "      <td>1002.950</td>\n",
+       "      <td>3612.70</td>\n",
+       "      <td>834.4970</td>\n",
+       "      <td>3253.62</td>\n",
+       "      <td>1557.560</td>\n",
+       "      <td>1049.600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>TCGA-AF-A56N-01</th>\n",
+       "      <td>101.9000</td>\n",
+       "      <td>0.4044</td>\n",
+       "      <td>649.009</td>\n",
+       "      <td>0.8087</td>\n",
+       "      <td>252.325</td>\n",
+       "      <td>1.6175</td>\n",
+       "      <td>535.382</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>890.417</td>\n",
+       "      <td>10779.20</td>\n",
+       "      <td>...</td>\n",
+       "      <td>29.1144</td>\n",
+       "      <td>348.969</td>\n",
+       "      <td>92.6001</td>\n",
+       "      <td>410.8370</td>\n",
+       "      <td>912.252</td>\n",
+       "      <td>4926.81</td>\n",
+       "      <td>828.9530</td>\n",
+       "      <td>4164.98</td>\n",
+       "      <td>1630.410</td>\n",
+       "      <td>389.001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>TCGA-CN-6010-01</th>\n",
+       "      <td>3.4211</td>\n",
+       "      <td>1.1404</td>\n",
+       "      <td>677.373</td>\n",
+       "      <td>0.3801</td>\n",
+       "      <td>221.990</td>\n",
+       "      <td>2.6608</td>\n",
+       "      <td>327.283</td>\n",
+       "      <td>3.8012</td>\n",
+       "      <td>1851.560</td>\n",
+       "      <td>14198.20</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1.5205</td>\n",
+       "      <td>303.336</td>\n",
+       "      <td>117.4570</td>\n",
+       "      <td>302.5750</td>\n",
+       "      <td>560.677</td>\n",
+       "      <td>3481.90</td>\n",
+       "      <td>702.4610</td>\n",
+       "      <td>2825.05</td>\n",
+       "      <td>969.686</td>\n",
+       "      <td>522.284</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>TCGA-49-4512-01</th>\n",
+       "      <td>82.2444</td>\n",
+       "      <td>3.0746</td>\n",
+       "      <td>640.277</td>\n",
+       "      <td>1.5373</td>\n",
+       "      <td>566.487</td>\n",
+       "      <td>13.0669</td>\n",
+       "      <td>352.805</td>\n",
+       "      <td>3.0746</td>\n",
+       "      <td>1641.810</td>\n",
+       "      <td>7758.65</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.7686</td>\n",
+       "      <td>466.564</td>\n",
+       "      <td>6.1491</td>\n",
+       "      <td>326.6720</td>\n",
+       "      <td>820.907</td>\n",
+       "      <td>1753.27</td>\n",
+       "      <td>582.6290</td>\n",
+       "      <td>6768.64</td>\n",
+       "      <td>2366.640</td>\n",
+       "      <td>1169.870</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>TCGA-V4-A9E8-01</th>\n",
+       "      <td>2.9165</td>\n",
+       "      <td>4.9432</td>\n",
+       "      <td>1256.550</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>165.101</td>\n",
+       "      <td>119.6240</td>\n",
+       "      <td>453.781</td>\n",
+       "      <td>0.9886</td>\n",
+       "      <td>482.452</td>\n",
+       "      <td>4616.91</td>\n",
+       "      <td>...</td>\n",
+       "      <td>11.8636</td>\n",
+       "      <td>576.372</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>98.8631</td>\n",
+       "      <td>183.885</td>\n",
+       "      <td>72.17</td>\n",
+       "      <td>93.9199</td>\n",
+       "      <td>9853.68</td>\n",
+       "      <td>1715.270</td>\n",
+       "      <td>376.668</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 100 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "gene_id            613037  347148     10868   51233    10782      1804  \\\n",
+       "sample_id                                                                \n",
+       "TCGA-D8-A27E-01   12.0596  0.0000   714.707  1.4398  257.433    1.1518   \n",
+       "TCGA-AF-A56N-01  101.9000  0.4044   649.009  0.8087  252.325    1.6175   \n",
+       "TCGA-CN-6010-01    3.4211  1.1404   677.373  0.3801  221.990    2.6608   \n",
+       "TCGA-49-4512-01   82.2444  3.0746   640.277  1.5373  566.487   13.0669   \n",
+       "TCGA-V4-A9E8-01    2.9165  4.9432  1256.550  0.0000  165.101  119.6240   \n",
+       "\n",
+       "gene_id             9553    2893     55827      6165  ...   203102    51259  \\\n",
+       "sample_id                                             ...                     \n",
+       "TCGA-D8-A27E-01  496.724  0.2880  1313.080   7585.92  ...  16.4135  192.931   \n",
+       "TCGA-AF-A56N-01  535.382  0.0000   890.417  10779.20  ...  29.1144  348.969   \n",
+       "TCGA-CN-6010-01  327.283  3.8012  1851.560  14198.20  ...   1.5205  303.336   \n",
+       "TCGA-49-4512-01  352.805  3.0746  1641.810   7758.65  ...   0.7686  466.564   \n",
+       "TCGA-V4-A9E8-01  453.781  0.9886   482.452   4616.91  ...  11.8636  576.372   \n",
+       "\n",
+       "gene_id            127534     23398    167153    10276     84146     3490  \\\n",
+       "sample_id                                                                   \n",
+       "TCGA-D8-A27E-01    0.0000  298.6110  1002.950  3612.70  834.4970  3253.62   \n",
+       "TCGA-AF-A56N-01   92.6001  410.8370   912.252  4926.81  828.9530  4164.98   \n",
+       "TCGA-CN-6010-01  117.4570  302.5750   560.677  3481.90  702.4610  2825.05   \n",
+       "TCGA-49-4512-01    6.1491  326.6720   820.907  1753.27  582.6290  6768.64   \n",
+       "TCGA-V4-A9E8-01    0.0000   98.8631   183.885    72.17   93.9199  9853.68   \n",
+       "\n",
+       "gene_id             53373     79991  \n",
+       "sample_id                            \n",
+       "TCGA-D8-A27E-01  1557.560  1049.600  \n",
+       "TCGA-AF-A56N-01  1630.410   389.001  \n",
+       "TCGA-CN-6010-01   969.686   522.284  \n",
+       "TCGA-49-4512-01  2366.640  1169.870  \n",
+       "TCGA-V4-A9E8-01  1715.270   376.668  \n",
+       "\n",
+       "[5 rows x 100 columns]"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# We want the subsampled data to have representation of most cancer types\n",
+    "# We can use a stratified split (the stratify argument of train_test_split) to make sure of this\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "_, subsample_df = train_test_split(tcga_expr_df,\n",
+    "                                   test_size=0.1,\n",
+    "                                   random_state=cfg.default_seed,\n",
+    "                                   stratify=tcga_id.stratify_samples_count)\n",
+    "\n",
+    "# Also subsample genes, otherwise file size blows up\n",
+    "subsample_df = subsample_df.sample(n=100, axis=1, random_state=cfg.default_seed)\n",
+    "\n",
+    "print(subsample_df.shape)\n",
+    "subsample_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.exists(cfg.test_data_dir):\n",
+    "    os.makedirs(cfg.test_data_dir)\n",
+    "subsample_df.to_csv(cfg.test_expression, sep='\\t', \n",
+    "                    compression='gzip', float_format='%.3g')"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/README.md b/README.md
@@ -13,3 +13,26 @@ or alternatively whether its effect is specific to a particular cancer type.
 
 More to come.
 
+## Setup
+
+We recommend using the conda environment specified in the `environment.yml` file to run these analyses. To build and activate this environment, run:
+
+```shell
+# conda version 4.5.0
+conda env create --file environment.yml
+
+conda activate pancancer-evaluation
+```
+
+## Running tests
+
+Running the tests requires the `pytest` module (included in the specified
+conda environment). Once this module is installed, you can run the tests
+by executing the command
+
+```shell
+pytest tests/
+```
+
+from the repo root.
+
diff --git a/config.py b/config.py