From 822d57653f6e27c7a335c1dc2a413d00055d1a28 Mon Sep 17 00:00:00 2001 From: Roc Reguant Date: Tue, 5 Mar 2024 10:59:37 +1100 Subject: [PATCH] Run VS no hail Run VS no hail --- examples/notebooks/run_importance_chr22.ipynb | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 examples/notebooks/run_importance_chr22.ipynb diff --git a/examples/notebooks/run_importance_chr22.ipynb b/examples/notebooks/run_importance_chr22.ipynb new file mode 100644 index 00000000..3a9a04b9 --- /dev/null +++ b/examples/notebooks/run_importance_chr22.ipynb @@ -0,0 +1,157 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running importance analysis with Python API\n", + "=====================================\n", + "\n", + "This is an *VariantSpark* example notebook.\n", + "\n", + "\n", + "One of the main applications of VariantSpark is discovery of genomic variants correlated with a response variable (e.g. case vs control) using random forest gini importance.\n", + "\n", + "The `chr22_1000.vcf` is a very small sample of the chromosome 22 VCF file from the 1000 Genomes Project.\n", + "\n", + "`chr22-labels.csv` is a CSV file with sample response variables (labels). In fact the labels directly represent the number of alternative alleles for each sample at a specific genomic position. E.g.: column 22_16050408 has labels derived from variants in chromosome 22 position 16050408. We would expect then that position 22:16050408 in the VCF file is strongly correlated with the label 22_16050408.\n", + "\n", + "Both data sets are located in the `..\\data` directory.\n", + "\n", + "This notebook demonstrates how to run importance analysis on these data with *VariantSpark* Python API." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Step 1: Create a spark session with VariantSpark jar attached." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import varspark as vs\n", + "from pyspark.sql import SparkSession \n", + "spark = SparkSession.builder.config('spark.jars', vs.find_jar()).getOrCreate()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Step 2: Create a `VarsparkContext` using `SparkSession` object (here injected as `spark`):" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "vc = vs.VarsparkContext(spark, silent = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Step 3: Load the features `fs` and labels `ls` from data files." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "features = vc.import_vcf('../data/chr22_1000.vcf')\n", + "labels = vc.load_label('../data/chr22-labels.csv', '22_16050408')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Step 4: Run the importance analysis and retrieve top important variables:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "ia = features.importance_analysis(labels, seed = 13, n_trees=500, batch_size=20)\n", + "top_variables = ia.important_variables()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Step 5: Display the results." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variable\tImportance\n", + "22_16050408_T_C\t0.000875899681306875\n", + "22_16050678_C_T\t0.0008045828330856887\n", + "22_16053197_G_T\t0.0006258776975143016\n", + "22_16051882_C_T\t0.0005914004839298169\n", + "22_16051107_C_A\t0.0005911526429890821\n", + "22_16051480_T_C\t0.0005362221508961817\n", + "22_16052838_T_A\t0.0004994650434540958\n", + "22_16052656_T_C\t0.0004932212678113746\n", + "22_16053435_G_T\t0.00046980813216784275\n", + "22_16054283_C_T\t0.0004692021189492525\n" + ] + } + ], + "source": [ + "print(\"%s\\t%s\" % ('Variable', 'Importance'))\n", + "for var_and_imp in top_variables:\n", + " print(\"%s\\t%s\" % var_and_imp) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For more information on using *VariantSpark* and the Python API please visit the [documentation](http://variantspark.readthedocs.io/en/latest/)." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}