Skip to content

Commit

Permalink
Merge pull request #1 from jjc2718/cross_validation
Browse files Browse the repository at this point in the history
Implement and test single-cancer cross-validation
  • Loading branch information
jjc2718 authored Aug 3, 2020
2 parents 03fc3a5 + 276f032 commit d2047f8
Show file tree
Hide file tree
Showing 14 changed files with 11,637 additions and 30 deletions.
26 changes: 26 additions & 0 deletions .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Continuous-integration workflow: runs the pytest suite under tests/ on
# every push and every pull request.
name: Python tests

on: [push, pull_request]

jobs:
  run-tests:
    name: Run Python tests
    runs-on: ubuntu-latest

    steps:
      - name: Check out Git repository
        uses: actions/checkout@v2

      # NOTE(review): the later steps call $CONDA/bin/... directly, so the
      # interpreter selected here does not appear to be the one that runs
      # the tests - confirm whether this step is still needed.
      - name: Set up Python
        uses: actions/setup-python@v1
        with:
          # Quoted so YAML keeps the version a string instead of a float
          python-version: '3.6'

      # Install the dependencies listed in environment.yml into the runner's
      # base conda environment, then install this repository in editable mode.
      - name: Setup conda environment
        run: |
          $CONDA/bin/conda env update --file environment.yml --name base
          $CONDA/bin/pip install -e .

      - name: Run tests
        run: |
          $CONDA/bin/pytest tests/
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
**/__pycache__/*
**/.ipynb_checkpoints/*
*.egg-info*

data/*
data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2-v2.geneExp.tsv
data/*.gz
286 changes: 284 additions & 2 deletions download_data.ipynb → 00_download_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"import pandas as pd\n",
"from urllib.request import urlretrieve\n",
"\n",
"import config as cfg"
"import pancancer_utilities.config as cfg"
]
},
{
Expand Down Expand Up @@ -752,7 +752,7 @@
"metadata": {},
"outputs": [],
"source": [
"expr_file = os.path.join(cfg.data_dir, 'tcga_expression_matrix_processed.tsv')\n",
"expr_file = os.path.join(cfg.data_dir, 'tcga_expression_matrix_processed.tsv.gz')\n",
"tcga_expr_df.to_csv(expr_file, sep='\\t', compression='gzip', float_format='%.3g')"
]
},
Expand Down Expand Up @@ -1430,6 +1430,288 @@
"cancertype_count_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Subsample expression dataframe for unit testing"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1106, 100)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>gene_id</th>\n",
" <th>613037</th>\n",
" <th>347148</th>\n",
" <th>10868</th>\n",
" <th>51233</th>\n",
" <th>10782</th>\n",
" <th>1804</th>\n",
" <th>9553</th>\n",
" <th>2893</th>\n",
" <th>55827</th>\n",
" <th>6165</th>\n",
" <th>...</th>\n",
" <th>203102</th>\n",
" <th>51259</th>\n",
" <th>127534</th>\n",
" <th>23398</th>\n",
" <th>167153</th>\n",
" <th>10276</th>\n",
" <th>84146</th>\n",
" <th>3490</th>\n",
" <th>53373</th>\n",
" <th>79991</th>\n",
" </tr>\n",
" <tr>\n",
" <th>sample_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>TCGA-D8-A27E-01</th>\n",
" <td>12.0596</td>\n",
" <td>0.0000</td>\n",
" <td>714.707</td>\n",
" <td>1.4398</td>\n",
" <td>257.433</td>\n",
" <td>1.1518</td>\n",
" <td>496.724</td>\n",
" <td>0.2880</td>\n",
" <td>1313.080</td>\n",
" <td>7585.92</td>\n",
" <td>...</td>\n",
" <td>16.4135</td>\n",
" <td>192.931</td>\n",
" <td>0.0000</td>\n",
" <td>298.6110</td>\n",
" <td>1002.950</td>\n",
" <td>3612.70</td>\n",
" <td>834.4970</td>\n",
" <td>3253.62</td>\n",
" <td>1557.560</td>\n",
" <td>1049.600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TCGA-AF-A56N-01</th>\n",
" <td>101.9000</td>\n",
" <td>0.4044</td>\n",
" <td>649.009</td>\n",
" <td>0.8087</td>\n",
" <td>252.325</td>\n",
" <td>1.6175</td>\n",
" <td>535.382</td>\n",
" <td>0.0000</td>\n",
" <td>890.417</td>\n",
" <td>10779.20</td>\n",
" <td>...</td>\n",
" <td>29.1144</td>\n",
" <td>348.969</td>\n",
" <td>92.6001</td>\n",
" <td>410.8370</td>\n",
" <td>912.252</td>\n",
" <td>4926.81</td>\n",
" <td>828.9530</td>\n",
" <td>4164.98</td>\n",
" <td>1630.410</td>\n",
" <td>389.001</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TCGA-CN-6010-01</th>\n",
" <td>3.4211</td>\n",
" <td>1.1404</td>\n",
" <td>677.373</td>\n",
" <td>0.3801</td>\n",
" <td>221.990</td>\n",
" <td>2.6608</td>\n",
" <td>327.283</td>\n",
" <td>3.8012</td>\n",
" <td>1851.560</td>\n",
" <td>14198.20</td>\n",
" <td>...</td>\n",
" <td>1.5205</td>\n",
" <td>303.336</td>\n",
" <td>117.4570</td>\n",
" <td>302.5750</td>\n",
" <td>560.677</td>\n",
" <td>3481.90</td>\n",
" <td>702.4610</td>\n",
" <td>2825.05</td>\n",
" <td>969.686</td>\n",
" <td>522.284</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TCGA-49-4512-01</th>\n",
" <td>82.2444</td>\n",
" <td>3.0746</td>\n",
" <td>640.277</td>\n",
" <td>1.5373</td>\n",
" <td>566.487</td>\n",
" <td>13.0669</td>\n",
" <td>352.805</td>\n",
" <td>3.0746</td>\n",
" <td>1641.810</td>\n",
" <td>7758.65</td>\n",
" <td>...</td>\n",
" <td>0.7686</td>\n",
" <td>466.564</td>\n",
" <td>6.1491</td>\n",
" <td>326.6720</td>\n",
" <td>820.907</td>\n",
" <td>1753.27</td>\n",
" <td>582.6290</td>\n",
" <td>6768.64</td>\n",
" <td>2366.640</td>\n",
" <td>1169.870</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TCGA-V4-A9E8-01</th>\n",
" <td>2.9165</td>\n",
" <td>4.9432</td>\n",
" <td>1256.550</td>\n",
" <td>0.0000</td>\n",
" <td>165.101</td>\n",
" <td>119.6240</td>\n",
" <td>453.781</td>\n",
" <td>0.9886</td>\n",
" <td>482.452</td>\n",
" <td>4616.91</td>\n",
" <td>...</td>\n",
" <td>11.8636</td>\n",
" <td>576.372</td>\n",
" <td>0.0000</td>\n",
" <td>98.8631</td>\n",
" <td>183.885</td>\n",
" <td>72.17</td>\n",
" <td>93.9199</td>\n",
" <td>9853.68</td>\n",
" <td>1715.270</td>\n",
" <td>376.668</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 100 columns</p>\n",
"</div>"
],
"text/plain": [
"gene_id 613037 347148 10868 51233 10782 1804 \\\n",
"sample_id \n",
"TCGA-D8-A27E-01 12.0596 0.0000 714.707 1.4398 257.433 1.1518 \n",
"TCGA-AF-A56N-01 101.9000 0.4044 649.009 0.8087 252.325 1.6175 \n",
"TCGA-CN-6010-01 3.4211 1.1404 677.373 0.3801 221.990 2.6608 \n",
"TCGA-49-4512-01 82.2444 3.0746 640.277 1.5373 566.487 13.0669 \n",
"TCGA-V4-A9E8-01 2.9165 4.9432 1256.550 0.0000 165.101 119.6240 \n",
"\n",
"gene_id 9553 2893 55827 6165 ... 203102 51259 \\\n",
"sample_id ... \n",
"TCGA-D8-A27E-01 496.724 0.2880 1313.080 7585.92 ... 16.4135 192.931 \n",
"TCGA-AF-A56N-01 535.382 0.0000 890.417 10779.20 ... 29.1144 348.969 \n",
"TCGA-CN-6010-01 327.283 3.8012 1851.560 14198.20 ... 1.5205 303.336 \n",
"TCGA-49-4512-01 352.805 3.0746 1641.810 7758.65 ... 0.7686 466.564 \n",
"TCGA-V4-A9E8-01 453.781 0.9886 482.452 4616.91 ... 11.8636 576.372 \n",
"\n",
"gene_id 127534 23398 167153 10276 84146 3490 \\\n",
"sample_id \n",
"TCGA-D8-A27E-01 0.0000 298.6110 1002.950 3612.70 834.4970 3253.62 \n",
"TCGA-AF-A56N-01 92.6001 410.8370 912.252 4926.81 828.9530 4164.98 \n",
"TCGA-CN-6010-01 117.4570 302.5750 560.677 3481.90 702.4610 2825.05 \n",
"TCGA-49-4512-01 6.1491 326.6720 820.907 1753.27 582.6290 6768.64 \n",
"TCGA-V4-A9E8-01 0.0000 98.8631 183.885 72.17 93.9199 9853.68 \n",
"\n",
"gene_id 53373 79991 \n",
"sample_id \n",
"TCGA-D8-A27E-01 1557.560 1049.600 \n",
"TCGA-AF-A56N-01 1630.410 389.001 \n",
"TCGA-CN-6010-01 969.686 522.284 \n",
"TCGA-49-4512-01 2366.640 1169.870 \n",
"TCGA-V4-A9E8-01 1715.270 376.668 \n",
"\n",
"[5 rows x 100 columns]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# We want the subsampled data to have representation of most cancer types\n",
"# We can use a stratified train/test split to make sure of this\n",
"from sklearn.model_selection import train_test_split\n",
"_, subsample_df = train_test_split(tcga_expr_df,\n",
" test_size=0.1,\n",
" random_state=cfg.default_seed,\n",
" stratify=tcga_id.stratify_samples_count)\n",
"\n",
"# Also subsample genes, otherwise file size blows up\n",
"subsample_df = subsample_df.sample(n=100, axis=1, random_state=cfg.default_seed)\n",
"\n",
"print(subsample_df.shape)\n",
"subsample_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Create the test data directory if it is missing; exist_ok=True avoids a\n",
"# separate existence check that could race with another process\n",
"os.makedirs(cfg.test_data_dir, exist_ok=True)\n",
"# Write the subsample as a gzip-compressed TSV with 3 significant digits\n",
"subsample_df.to_csv(cfg.test_expression, sep='\\t', \n",
"                    compression='gzip', float_format='%.3g')"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,26 @@ or alternatively whether its effect is specific to a particular cancer type.

More to come.

## Setup

We recommend using the conda environment specified in the `environment.yml` file to run these analyses. To build and activate this environment, run:

```shell
# conda version 4.5.0
conda env create --file environment.yml

conda activate pancancer-evaluation
```

## Running tests

Running the tests requires the `pytest` module (included in the specified
Conda environment). Once this module is installed, you can run the tests
by executing the command

```shell
pytest tests/
```

from the repo root.

27 changes: 0 additions & 27 deletions config.py

This file was deleted.

Loading

0 comments on commit d2047f8

Please sign in to comment.