diff --git a/_includes/assignments.html b/_includes/assignments.html index 1dec06cbc..a9c40fa65 100644 --- a/_includes/assignments.html +++ b/_includes/assignments.html @@ -3,49 +3,12 @@ Week Topic - Reading - Questions - Slides Lab - Assignment {% for assignment in page.assignments %} - {% for readingpage in site.pages %} - {% if readingpage.title == assignment and readingpage.element == 'reading' %} - {% capture reading_link %}{{ otherpage.url | prepend: site.baseurl }}{% endcapture %} - {% break %} - {% endif %} - {% assign language = nil %} - {% assign reading_link = nil %} - {% endfor %} - - {% for lecturepage in site.pages %} - {% if lecturepage.title == assignment and lecturepage.element == 'lecture' %} - {% capture lecture_link %}{{ lecturepage.url | prepend: site.baseurl }}{% endcapture %} - {% break %} - {% endif %} - {% assign lecture_link = nil %} - {% endfor %} - - {% for readingpage in site.pages %} - {% if readingpage.title == assignment and readingpage.element == 'reading' %} - {% capture reading_link %}{{ readingpage.url | prepend: site.baseurl }}{% endcapture %} - {% break %} - {% endif %} - {% assign reading_link = nil %} - {% endfor %} - - {% for lessonpage in site.pages %} - {% if lessonpage.title == assignment and lessonpage.element == 'lesson' %} - {% capture lesson_link %}{{ lessonpage.url | prepend: site.baseurl }}{% endcapture %} - {% break %} - {% endif %} - {% assign lesson_link = nil %} - {% endfor %} - {% for labpage in site.pages %} {% if labpage.title == assignment and labpage.element == 'lab' %} {% capture lab_link %}{{ labpage.url | prepend: site.baseurl }}{% endcapture %} @@ -53,34 +16,13 @@ {% endif %} {% assign lab_link = nil %} {% endfor %} + {{ forloop.index }} {{ assignment }} - - {% if reading_link %} - - - {% else %} - - - {% endif %} - - {% if questions_link %} - - - {% else %} - - - {% endif %} - - {% if slides_link %} - - - {% else %} - - - {% endif %} + + {% if lab_link %} @@ -90,13 +32,7 @@ {% endif %} - {% if 
assignment_link %} - - - {% else %} - - - {% endif %} + {% endfor %} diff --git a/figs/lab_10.1.png b/figs/lab_10.1.png new file mode 100644 index 000000000..1dcaf4058 Binary files /dev/null and b/figs/lab_10.1.png differ diff --git a/figs/lab_10.2.png b/figs/lab_10.2.png new file mode 100644 index 000000000..ae7202b2d Binary files /dev/null and b/figs/lab_10.2.png differ diff --git a/figs/lab_10.3.png b/figs/lab_10.3.png new file mode 100644 index 000000000..dc373a74b Binary files /dev/null and b/figs/lab_10.3.png differ diff --git a/figs/lab_10.4.png b/figs/lab_10.4.png new file mode 100644 index 000000000..4a3f8f533 Binary files /dev/null and b/figs/lab_10.4.png differ diff --git a/figs/lab_10.5.png b/figs/lab_10.5.png new file mode 100644 index 000000000..5f62d8b1b Binary files /dev/null and b/figs/lab_10.5.png differ diff --git a/figs/lab_10.table.png b/figs/lab_10.table.png new file mode 100644 index 000000000..7d7e1f3cd Binary files /dev/null and b/figs/lab_10.table.png differ diff --git a/figs/lab_7.1.png b/figs/lab_7.1.png new file mode 100644 index 000000000..3778d6887 Binary files /dev/null and b/figs/lab_7.1.png differ diff --git a/figs/lab_8.1.png b/figs/lab_8.1.png new file mode 100644 index 000000000..2bc1fad3e Binary files /dev/null and b/figs/lab_8.1.png differ diff --git a/figs/lab_8.2.png b/figs/lab_8.2.png new file mode 100644 index 000000000..e518cd28e Binary files /dev/null and b/figs/lab_8.2.png differ diff --git a/figs/lab_8.3.png b/figs/lab_8.3.png new file mode 100644 index 000000000..9172060b2 Binary files /dev/null and b/figs/lab_8.3.png differ diff --git a/figs/lab_8.4.png b/figs/lab_8.4.png new file mode 100644 index 000000000..e58015ce3 Binary files /dev/null and b/figs/lab_8.4.png differ diff --git a/figs/lab_8.5.png b/figs/lab_8.5.png new file mode 100644 index 000000000..28da7ed56 Binary files /dev/null and b/figs/lab_8.5.png differ diff --git a/figs/lab_8.6.png b/figs/lab_8.6.png new file mode 100644 index 000000000..b467eb3a5 
Binary files /dev/null and b/figs/lab_8.6.png differ diff --git a/figs/lab_9.1.png b/figs/lab_9.1.png new file mode 100644 index 000000000..c591468f3 Binary files /dev/null and b/figs/lab_9.1.png differ diff --git a/figs/lab_9.2.png b/figs/lab_9.2.png new file mode 100644 index 000000000..dc9de70c4 Binary files /dev/null and b/figs/lab_9.2.png differ diff --git a/figs/lab_9.3.png b/figs/lab_9.3.png new file mode 100644 index 000000000..c0e4d310d Binary files /dev/null and b/figs/lab_9.3.png differ diff --git a/figs/lab_9.4.png b/figs/lab_9.4.png new file mode 100644 index 000000000..0bc75a344 Binary files /dev/null and b/figs/lab_9.4.png differ diff --git a/figs/lab_9.5.png b/figs/lab_9.5.png new file mode 100644 index 000000000..8714a452e Binary files /dev/null and b/figs/lab_9.5.png differ diff --git a/figs/lab_9.6.png b/figs/lab_9.6.png new file mode 100644 index 000000000..171246bb7 Binary files /dev/null and b/figs/lab_9.6.png differ diff --git a/figs/private_connection.png b/figs/private_connection.png new file mode 100644 index 000000000..5afd76560 Binary files /dev/null and b/figs/private_connection.png differ diff --git a/figs/rstudio_server.png b/figs/rstudio_server.png new file mode 100644 index 000000000..596a41d51 Binary files /dev/null and b/figs/rstudio_server.png differ diff --git a/labs/lab_1.md b/labs/lab_1.md index f75ba567f..7043330ba 100644 --- a/labs/lab_1.md +++ b/labs/lab_1.md @@ -4,8 +4,8 @@ element: lab layout: default --- -# Introduction -## objectives +# Section 1 +## Objectives - Describe key reasons for learning shell. - Navigate your file system using the command line. @@ -14,7 +14,7 @@ layout: default *** -## questions +## Questions - What is a command shell and why would I use one? - How can I move around on my computer? @@ -132,6 +132,25 @@ up until this point. *** +## Downloading files. +We first have to download the data files used in this lab. 
I have put them on a git repository +(which you will learn about next lecture) and you download it using this code. + +```bash +[grego@indri ~]$ git clone https://github.com/owensgl/shell_data.git +``` + +```output +Cloning into 'shell_data'... +remote: Enumerating objects: 8, done. +remote: Counting objects: 100% (8/8), done. +remote: Compressing objects: 100% (6/6), done. +remote: Total 8 (delta 0), reused 0 (delta 0), pack-reused 0 +Receiving objects: 100% (8/8), 3.71 MiB | 4.38 MiB/s, done. +``` + + + ## Navigating your file system The part of the operating system that manages files and directories @@ -359,9 +378,7 @@ using the command line shell enables us to make our workflow more efficient and - Tab completion can reduce errors from mistyping and make work more efficient in the shell. *** - -# Navigating files and directories - +# Section 2 ## Objectives @@ -590,7 +607,7 @@ what will `ls ../backup` display? 3. `2012-12-01/ 2013-01-08/ 2013-01-27/` 4. `original pnas_final pnas_sub` -![](figs/filesystem-challenge.svg){alt='File System for Challenge Questions'} +![](../figs/filesystem-challenge.svg) ### Navigational Shortcuts @@ -639,7 +656,8 @@ The commands `cd`, and `cd ~` are very useful for quickly navigating back to you *** -## objectives +# Section 3 +## Objectives - View, search within, copy, move, and rename files. Create new directories. - Use wildcards (`*`) to perform operations on multiple files. @@ -648,7 +666,7 @@ The commands `cd`, and `cd ~` are very useful for quickly navigating back to you *** -## questions +## Questions - How can I view and search file contents? - How can I create, copy and delete files and directories? @@ -715,7 +733,6 @@ each result starts with `/`. ## Challenge -## Exercise Do each of the following tasks from your current directory using a single `ls` command for each: @@ -731,11 +748,10 @@ Hint: The bonus question requires a Unix wildcard that we haven't talked about yet. 
Try searching the internet for information about Unix wildcards to find what you need to solve the bonus problem. -::::::::::::::: solution ## Challenge -## Exercise + `echo` is a built-in shell command that writes its arguments, like a line of text to standard output. The `echo` command can also be used with pattern matching characters, such as wildcard characters. @@ -800,7 +816,7 @@ For more information on advanced usage of `history`, read section 9.3 of ## Challenge -## Exercise + Find the line number in your history for the command that listed all the .sh files in `/usr/bin`. Rerun that command. @@ -838,7 +854,7 @@ $ cat bullkelp_001_R1.fastq -## Exercise +## Challenge 1. Print out the contents of the `~/shell_data/untrimmed_fastq/bullkelp_001_R1.fastq` file. What is the last line of the file? What command would only print the last lines of a file? @@ -847,12 +863,6 @@ $ cat bullkelp_001_R1.fastq the `~/shell_data/untrimmed_fastq` directory. -## Solution - -1. The last line of the file is `C:CCC::CCCCCCCC<8?6A:C28C<608'&&&,'$`. - `tail` prints the last lines of a file. -2. `cat ~/shell_data/untrimmed_fastq/*` - *** `cat` is a terrific program, but when the file is really big, it can @@ -896,7 +906,7 @@ forward to the next instance of this sequence motif. If you instead type `?` and return, you will search backwards and move up the file to previous examples of this motif. -## Exercise +## Challenge What are the next three nucleotides (characters) after the first instance of the sequence quoted above? @@ -1137,7 +1147,7 @@ characters relate to the permissions that the file owner has, the next three rel three characters specify what other users outside of your group can do with the file. We're going to concentrate on the three positions that deal with your permissions (as the file owner). -![](figs/rwx_figure.svg){alt='Permissions breakdown'} +![](../figs/rwx_figure.svg) Here the three positions that relate to the file owner are `rwx`. 
The `r` means that you have permission to read the file, the `w` indicates that you have permission to write to (i.e. make changes to) the file, and the third position is a `x`, indicating that you @@ -1190,7 +1200,7 @@ you will be asked whether you want to override your permission settings. Be extr or with variables (covered later). A mistaken `rm` command can delete things you don't want to be deleted. -## Exercise +## Challenge Starting in the `shell_data/untrimmed_fastq/` directory, do the following: @@ -1203,7 +1213,7 @@ Starting in the `shell_data/untrimmed_fastq/` directory, do the following: *** -## keypoints +## Keypoints - You can view file contents using `less`, `cat`, `head` or `tail`. - The commands `cp`, `mv`, and `mkdir` are useful for manipulating existing files and creating new directories. diff --git a/labs/lab_10.md b/labs/lab_10.md new file mode 100644 index 000000000..26244b371 --- /dev/null +++ b/labs/lab_10.md @@ -0,0 +1,344 @@ +--- +title: Genome assembly +element: lab +layout: default +--- +## Genome Assembly + +Before we start working on genome assembly, we need to clear space on the server. +Currently we're using almost all of the space in the home directory. Once +that fills up, no one will be able to use the server. Please follow +these steps to remove unnecessary files. Before you start working on +the assembly portion of this lab, you will need to delete enough files +so that your directory is less than 1 GB. After you have done that, +call me over so I can check your directory and approve it. + +1. Login to the server. Source the bash.rc file to use modules. +2. To check how much you are storing using the du command +```bash +du -h --max-depth=1 +``` +```output +4.0K ./.config +272K ./.local +702M ./R +0 ./.licenses +0 ./.mii +4.0K ./.ncbi +0 ./bin +28K ./.cache +64K ./.java +4.0K ./.ssh +109M ./lab_9 +811M +``` +This shows how much is stored in each folder. The R directory +contains all of your R packages, so you can leave that. 
You'll +probably have a fair amount in lab_* directories. Please delete those entirely +or just delete all the large files in them. You will probably also +have a large amount in the .local directory. This contains +the saved sessions from Rstudio server. Before we clear that, +we should open Rstudio and save everything you need. +3. Open Rstudio server. In the top left, you will have +your scripts. If you have an unsaved scripts, save them to your +home directory with appropriate names. +4. Clear all objects from your workspace using the broom icon (on the right). +5. Restart your session by going to "Session"->"Quit Session". When prompted, +DO NOT save your workspace. It will prompt you to start a new session which +should be entirely clean. +6. You've cleared you Rstudio, but there still might be files saved +from previous suspended sessions. Go into the .local directory and run +'du -h'. Look through the output for directories with > 1M. If you find any +call me (Greg) over to delete them (to make sure you don't delete something essential). +7. Check with Greg to make sure that you've cleared everything you need to. + +**** + +With that out of the way, lets work on genome assembly. +Return to your home directory and download some example data. +```bash +cd ~ +mkdir lab_10 +cd lab_10 +mkdir velvet +wget https://zenodo.org/api/records/582600/files-archive -O velvet_files.zip +unzip velvet_files.zip +``` + +We're going to run a de novo assembly of a simulated Staphylococcus aureus genome. +This species has a genome only 197,394 bp so its relatively simple. +The first step is to filter our reads. If we don't remove low quality reads or reads with +adapter sequences we can incorporate errors into our assembly. +```bash +module load fastp +fastp --in1 mutant_R1.fastq --in2 mutant_R2.fastq --stdout > mutant_bothreads.fastq +``` +You may notice we're outputting only a single file from fastp. 
In this case, +we want to generate an interleaved fastq file that has both R1 and R2 in the +same file. This is needed for velvet. +```bash +module load velvet +#This first step breaks our reads into kmers of length 29 +velveth mutant 29 -shortPaired -fastq -interleaved mutant_bothreads.fastq +#This second step assembles the kmers +velvetg mutant +``` +We've now assembled our reads! Hurray! The main output of this is the +contigs. +```bash +cd mutant +head contigs.fa +``` +```output +>NODE_1_length_29_cov_1.000000 +AGTCGGTTGCTGAAGTATCACAGGGCCATGTCCCTCAACTTCTCACAATAAGAAATA +>NODE_2_length_171_cov_26.801170 +GTTGGTAACATTTATACAGGATCATTATACTTAAGTTTAATTTCGTTATTACAGAACCAC +ACATTCCAACCAGAAGAGAAAGTATGTCTATTTAGTTATGGTTCAGGAGCAGTAGGAGAA +ATCTTTAGTGGTTCAATCGTTAAAGGATATGACAAAGCATTAGATAAAGAGAAACACTTA +AATATGCTAGAATCTAGAG +>NODE_5_length_29_cov_1.000000 +ATACTGTGCAACCGCTTCAATATCTTTAAGAAATTGACGGTCATATGTAATGGATGG +>NODE_6_length_40_cov_1.000000 +``` + +This is your assembly in fasta format. Each contig is labelled +with a node number (just so it has an ID number), the length +and the coverage. +We can also look at the stats table. I'm using a cut command +to only print out the first 7 columns, because the columns after that +are not useful. +```bash +head stats.txt | cut -f 1-7 +``` +```output +ID lgth out in long_cov short1_cov short1_Ocov short2_cov +1 29 1 1 0.000000 1.000000 1.000000 0.000000 +2 171 2 2 0.000000 26.801170 24.830409 0.000000 +3 9 2 1 0.000000 28.000000 27.000000 0.000000 +4 4 2 1 0.000000 26.250000 25.250000 0.000000 +5 29 1 1 0.000000 1.000000 1.000000 0.000000 +6 40 1 1 0.000000 1.000000 1.000000 0.000000 +7 29 1 1 0.000000 1.000000 1.000000 0.000000 +8 29 1 1 0.000000 9.103448 9.103448 0.000000 +9 29 1 1 0.000000 1.000000 1.000000 0.000000 +``` +This tells you the length of your contigs and the amount of coverage or the number +of times was sequenced. + +### Questions +- How many contigs are there in the contigs.fa file? 
How many are listed in the stats.txt file? Why are these numbers different? +- What is the total length of the contigs? How does that compare to the expected length of 197,394 bp? + + +We can also use another program to get more stats on the assembly. We're going to use +QUAST (Quality Assessment Tool). QUAST generates reports about the size of your contigs, but +additionally you can give it a reference genome and it will compare your assembly to the reference. This can show you if your genome assembly is the full size you expect it to be. We're going to give QUAST the wild type reference genome and see how our assembly compares. + +```bash +cd .. +module load StdEnv/2020 gcc/9.3.0 quast/5.2.0 +quast -o mutant_quast -r wildtype.fna mutant/contigs.fa +``` + +The output of QUAST is text files, but it also has HTML reports. Download the entire +mutant_quast directory to your desktop using scp, and then open icarus.html. +In the QUAST report section, it includes some important graphs. + +![cumulative plot](../figs/lab_10.1.png) +This graph shows the cumulative length of contigs starting with the longest and moving shorter, in the red line. The grey line is the total length of the reference genome. In an +ideal world, the red line would reach the grey line quickly, showing that the de novo assembly +is the same length as the reference genome and that most of that length is in only a few contigs. + +![Nx plot](../figs/lab_10.2.png) +This is Nx plot. Here we are again ordering our contigs from largest to smallest. The X axis is the contig percentile (i.e. longest to shortest), and the y axis is the length of the contig +in that position. We often report the N50 number, is defined by the length of the shortest contig for which longer and equal length contigs cover at least 50 % of the assembly. + +### Question +- What is N90 number for this assembly? +- Look at the contig alignment viewer. What is the most obvious difference between this +assembly and the reference? 
+- What is an L50, and what is the L50 for your assembly? +- How many mismatches are there between your assembly and the reference? +- Repeat the velvet genome assembly with a kmer size of 53 and 93. Of the three sizes, which +produces the best assembly? + +For our next section, we're going to use hifi data to assemble an E. coli genome. We're not going to do filtering for this set because the quality is already great and it doesn't actually remove anything, but you should run it for your own projects. +```bash +cd ~/lab_10 +cp /project/ctb-grego/sharing/lab_10/ecoli_hifi.fastq.gz ./ +module load StdEnv/2020 hifiasm + +hifiasm ecoli_hifi.fastq.gz -l0 -t 10 -o ecoli_hifi +awk '/^S/{print ">"$2;print $3}' ecoli_hifi.bp.p_ctg.gfa > ecoli_hifi.bp.p_ctg.fa +``` +Since the hifi reads are so long and accurate, it allows for the independent assembly +of both parental haplotypes. Since this is a bacteria, it is haploid so we've given, +it the -l0 option to prevent it from producing two assemblies. The -t 10 option +tells it to use 10 cores. + +You'll notice as hifiasm runs it produces a histogram of sequence depths. In most cases, +the mode of that distribution is the coverage of the sample. From that, we can see that we've +got roughly 15X coverage, similar to the illumina data we used earlier. The last awk command +converts the gfa (which contains extra information) into a normal fasta file. + +Lets use quast to look at genome quality, but first we should download a genome to compare it to. 
+```bash +module load StdEnv/2023 gcc/12.3 sra-toolkit/3.0.9 +curl -o ecoli_ref.zip 'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000005845.2/download?include_annotation_type=GENOME_FASTA&include_annotation_type=GENOME_GFF&include_annotation_type=RNA_FASTA&include_annotation_type=CDS_FASTA&include_annotation_type=PROT_FASTA&include_annotation_type=SEQUENCE_REPORT&hydrated=FULLY_HYDRATED' +unzip ecoli_ref.zip + +quast -o hifi_quast -r ncbi_dataset/data/GCF_000005845.2/GCF_000005845.2_ASM584v2_genomic.fna ecoli_hifi.bp.p_ctg.fa +``` +Download the hifi_quast directory to your desktop, open icarus.html. + +### Questions +- How many contigs are in your assembly? +- What is the N50 of the assembly? +- Would you say this is a better or worse genome assembly than the velvet assembly? + + +Now that we have a genome the next step is to see where genes are. To do this, we need to annotate the genome. +This is a challenging problem. There are several different approaches: +1) Sequencing mRNA and using that to guide what genomic regions are transcribed, including splicing patterns. +2) Using a model to look for gene-like sequence motifs. +3) Comparing to other known gene models. +4) Comparing between closely related species genomes and looking for homologous sequences. + +There are multiple different programs that take different combinations of the previous approaches. For +eukaryotes, MAKER is a common choice. For prokaryotes, the situation is simpler because the genomes are smaller, +more gene dense and don't contain introns. We're going to use PROKKA to annotate our new genome. + +```bash +module load StdEnv/2020 gcc/9.3.0 prokka +prokka ecoli_hifi.bp.p_ctg.fa +``` + +Prokka compares the sequences to a set of known prokaryotic genes in UniProt and it also +looks for more distant similarity to any gene in an annotated prokaryotic genome. Based on +this similarity, it labels genes by their likely function. 
+ +### Questions +- How many genes did prokka annotate? +- Find a gene annotated in both the reference genome and your new genome. Find a gene that is in your new +genome, but not the reference genome. + +**** + +In the last section, we're going to align two whole genome sequences. For short reads, +we used BWA-mem, but for longer sequences we need to use a different program, for example +minimap2. For this section, we're going to download two different genomes from the +Arabidopsis genus. + +```bash +cd ~/lab_10 +mkdir genome_comparison +cd genome_comparison/ +curl --output a_thaliana.zip 'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000001735.4/download?include_annotation_type=GENOME_FASTA&include_annotation_type=GENOME_GFF&include_annotation_type=RNA_FASTA&include_annotation_type=CDS_FASTA&include_annotation_type=PROT_FASTA&include_annotation_type=SEQUENCE_REPORT&hydrated=FULLY_HYDRATED' + +curl --output a_arenosa.zip 'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCA_905216605.1/download?include_annotation_type=GENOME_FASTA&include_annotation_type=GENOME_GFF&include_annotation_type=RNA_FASTA&include_annotation_type=CDS_FASTA&include_annotation_type=PROT_FASTA&include_annotation_type=SEQUENCE_REPORT&hydrated=FULLY_HYDRATED' +unzip a_arenosa.zip +unzip a_thaliana.zip +#Note, when asked if you want to overwrite, type "All" +``` + +```bash +module load minimap2 +minimap2 -cx asm20 \ + ncbi_dataset/data/GCA_905216605.1/GCA_905216605.1_AARE701a_genomic.fna \ + ncbi_dataset/data/GCF_000001735.4/GCF_000001735.4_TAIR10.1_genomic.fna \ + > arabidopsis_aligned.paf +head arabidopsis_aligned.paf | cut -f 1-12 +``` +```output +NC_003070.9 30427671 302984 342050 - LR999451.1 25224288 123630 170102 35580 47816 60 +NC_003070.9 30427671 25069147 25095539 + LR999452.1 14316399 7694740 7721381 24575 27066 60 +NC_003070.9 30427671 5562728 5595170 + LR999451.1 25224288 6335141 6371781 29511 37557 60 +NC_003070.9 30427671 701442 728409 + 
LR999451.1 25224288 820561 849097 25132 29082 60 +NC_003070.9 30427671 27917258 27951438 + LR999452.1 14316399 11374022 11403129 26332 35248 60 +NC_003070.9 30427671 5224185 5256648 + LR999451.1 25224288 5964211 5997757 29202 34577 60 +NC_003070.9 30427671 11246938 11276499 - LR999451.1 25224288 15608823 15639766 26174 32415 60 +NC_003070.9 30427671 17732524 17756114 + LR999451.1 25224288 18733693 18757375 21903 24129 60 +NC_003070.9 30427671 5733060 5756911 + LR999451.1 25224288 6524999 6548507 21628 24502 60 +NC_003070.9 30427671 29038801 29064917 + LR999452.1 14316399 12577980 12605368 23429 28506 60 +``` + +This produces a .paf file, or paired mapping format file. The alignment is in many small pieces, representing +small chunks of the genomes where they are aligned. +![table plot](../figs/lab_10.table.png) + + + +By default, it doesn't actually output the +aligned sequences (unlike SAM format), it just tells you were the two sequences are aligned. This +is useful for telling where orthologous sequences are between the two genomes. In almost any case, +you will also get extra alignment matches between other random chunks of the genome. To understand the +broad synteny patterns (i.e. which chromosome in A. arenosa is orthologous to which chromosome in A. thaliana) +you need to visualize the matches. + +We're going to the pafr package to show how the two genomes align. Open Rstudio and run the following: +```R +install.packages("pafr") +library(pafr) + +ali <- read_paf("/project/ctb-grego/owens/test/lab_10/genome_comparison/arabidopsis_aligned.paf") +ali +``` +```output +pafr object with 17766 alignments (73.9Mb) + 7 query seqs + 8 target seqs + 12 tags: NM, ms, AS, nn, tp, cm, s1, s2, de, zd, rl . +``` +The pafr package has some tools for visualizing the alignments. First is a dotplot, +which shows where alignments are relative to each genome. 
+```R +dotplot(ali, label_seqs=TRUE, order_by="qstart") + theme_bw() +``` + +![dot plot](../figs/lab_10.3.png) + +Another way of representing the alignment is a coverage plot. For this, +every spot of the target genome is colored by which part of the query genome +aligns to it. +```R +plot_coverage(ali, fill='qname') + + scale_fill_brewer(palette="Set1") +``` +![coverage plot](../figs/lab_10.4.png) + +We can see that contig LR999451.1 has blue sequences aligning to it, which correspond +to contig NC_003070.9 in the query genome. There is a fair emount of white spots in this +plot, suggesting there areas of the genome where nothing aligns. This is likely because these +two species are fairly divergent and it is challenging to align. An alternative program +for more divergent genomes is anchorwave. + +Lastly, since the paf data is being stored as a dataframe, we can use our normal filtering +and plotting skills. + +```R +library(dplyr) +ali %>% + filter(qname == "NC_003070.9") %>% + plot_coverage(., fill='qname') + + scale_fill_brewer(palette="Set1") +``` +![coverage plot 2](../figs/lab_10.5.png) + + + + + + + + + + + + + + + + + + diff --git a/labs/lab_2.md b/labs/lab_2.md index 54fc1895f..1a0cb8a58 100644 --- a/labs/lab_2.md +++ b/labs/lab_2.md @@ -6,19 +6,84 @@ layout: default # Section 1 -## objectives +## Objectives - Create a directory hierarchy that matches a given diagram. - Create files in that hierarchy using an editor or by copying and renaming existing files. - Delete, copy and move specified files and/or directories. -## questions +## Questions - How can I create, copy, and delete files and directories? - How can I edit files? +## How to access the terminal if you're on a windows PC +For mac and linux computers, the terminal program is pre-installed, but +for windows users you will need to install it. Follow this [link](https://carpentries.github.io/workshop-template/install_instructions/#shell) and click on the "windows" tab. 
+Then follow the instructions there to download and install +git for windows, which includes a bash shell. + +## How to access the remote server + +All students will be accessing the server using their UVic Netlink ID. You will use the `ssh` or `Secure Shell` +command to login to the remote server, while logged into the UVic VPN. The VPN is called +"Cisco AnyConnect Secure Mobile Client". Open it and fill in "vpn.uvic.ca" into the connection box. +It will then tell you to select a group, please select "3 - Student". This will then prompt you +to fill in your netlink ID and then request a Duo Push notification. + +```bash +ssh your_id@indri.rcs.uvic.ca +``` +You will be prompted to use Duo, two-factor authentication before you enter your password. After approving the authentication on your phone, +you will use your netlink ID to login. + +```output +(base) p165-072:~ gregoryowens$ ssh grego@indri.rcs.uvic.ca +* +* Acceptable Use of Electronic Information Resources +* +* Access to this system is prohibited except for authorized University of +* Victoria students, faculty, and staff. +* +* All activity may be logged/monitored and unauthorized access will be handled +* per the provisions of UVic's Policy IM7200: Acceptable Use of Electronic +* Information Resources. +* +* https://www.uvic.ca/universitysecretary/assets/docs/policies/IM7200_6030_.pdf +* +(grego@fossa.rcs.uvic.ca) Duo two-factor login for grego + +Enter a passcode or select one of the following options: + + 1. Duo Push to XXX-XXX-XXXX + +Passcode or option (1-1): +``` + + +After logging in, you will see a screen showing something like this: + +```output +To access CVMFS modules please source the appropriate profile. +For example: 'source /cvmfs/soft.computecanada.ca/config/profile/bash.sh' + +Last failed login: Thu Oct 5 10:44:07 PDT 2023 from 142.104.165.72 on ssh:notty +There was 1 failed login attempt since the last successful login. 
+Last login: Thu Oct 5 09:48:45 2023 from 142.104.165.72 +``` + +## Sourcing our programs + +We want to access a variety of programs on the server. To do that, +we are going to source a file to bring it into our profile. This will +tell the computer where all your programs are stored. NOTE: +You have to run this command whenever you login. + +```bash +source /cvmfs/soft.computecanada.ca/config/profile/bash.sh +``` ## Creating directories @@ -26,14 +91,20 @@ We now know how to explore files and directories, but how do we create them in the first place? In this lab we will learn about creating and moving files and directories, -using the `exercise-data/writing` directory as an example. +using the `exercise-data/writing` directory as an example. + +We can get this data by downloading it from a git repository: + +```bash +[grego@indri ~]$ git clone https://github.com/owensgl/shell_data.git +``` ### Step one: see where we are and what we already have First we should return to the home directory: ```bash -$ cd ~ +$ cd ~/shell_data ``` Then enter the `lab_2_data` directory: @@ -173,7 +244,7 @@ another directory the first time you 'Save As...' Let's type in a few lines of text. -![](figs/nano-screenshot.png){alt="screenshot of nano text editor in action with the text It's not publish or perish any more, it's share and thrive"} +![](../figs/nano-screenshot.png) Once we're happy with our text, we can press Ctrl\+O (press the Ctrl or Control key and, while @@ -275,10 +346,10 @@ attempt to open the `whale.mp3` file. 
## Moving files and directories -Returning to the `shell-lesson-data/exercise-data/writing` directory, +Returning to the `shell-data/lab_2_data/exercise-data/writing` directory, ```bash -$ cd ~/Desktop/shell-lesson-data/exercise-data/writing +$ cd ~shell_data/exercise-data/writing ``` In our `thesis` directory we have a file `draft.txt` @@ -469,7 +540,7 @@ $ ls ## Removing files and directories -Returning to the `shell-lesson-data/exercise-data/writing` directory, +Returning to the `shell-data/lab_2_data/exercise-data/writing` directory, let's tidy up this directory by removing the `quotes.txt` file we created. The Unix command we'll use for this is `rm` (short for 'remove'): @@ -541,7 +612,7 @@ or sets of characters when navigating the Unix file system. ## Challenge: Copy with Multiple Filenames -For this exercise, you can test the commands in the `shell-lesson-data/exercise-data` directory. +For this exercise, you can test the commands in the `shell-data/lab_2_data/exercise-data` directory. In the example below, what does `cp` do when given several filenames and a directory name? @@ -570,7 +641,7 @@ $ cp minotaur.dat unicorn.dat basilisk.dat ## Wildcards `*` is a **wildcard**, which represents zero or more other characters. -Let's consider the `shell-lesson-data/exercise-data/alkanes` directory: +Let's consider the `shell-data/lab_2_data/exercise-data/alkanes` directory: `*.pdb` represents `ethane.pdb`, `propane.pdb`, and every file that ends with '.pdb'. On the other hand, `p*.pdb` only represents `pentane.pdb` and `propane.pdb`, because the 'p' at the front can only @@ -786,26 +857,9 @@ $ mkdir data $ mkdir raw processed ``` -## Solution - -The first two sets of commands achieve this objective. -The first set uses relative paths to create the top-level directory before -the subdirectories. 
- -The third set of commands will give an error because the default behavior of `mkdir` -won't create a subdirectory of a non-existent directory: -the intermediate level folders must be created first. - -The fourth set of commands achieve this objective. Remember, the `-p` option, -followed by a path of one or more -directories, will cause `mkdir` to create any intermediate subdirectories as required. - -The final set of commands generates the 'raw' and 'processed' directories at the same level -as the 'data' directory. - *** -## keypoints +## Keypoints - `cp [old] [new]` copies a file. - `mkdir [path]` creates a new directory. @@ -825,14 +879,14 @@ as the 'data' directory. # Section 2 -## objectives +## Objectives - Redirect a command's output to a file. - Construct command pipelines with two or more stages. - Explain what usually happens if a program or pipeline isn't given any input to process. - Explain the advantage of linking commands with pipes and filters. -## questions +## Questions - How can I combine existing commands to do new things? @@ -841,7 +895,7 @@ as the 'data' directory. Now that we know a few basic commands, we can finally look at the shell's most powerful feature: the ease with which it lets us combine existing programs in new ways. -We'll start with the directory `shell-lesson-data/exercise-data/alkanes` +We'll start with the directory `shell-data/lab_2_data/exercise-data/alkanes` that contains six files describing some simple organic molecules. The `.pdb` extension indicates that these files are in Protein Data Bank format, a simple text format that specifies the type and position of each atom in the molecule. @@ -1000,7 +1054,7 @@ But first we'll do an exercise to learn a little about the sort command: ## Challenge: What Does `sort -n` Do? 
-The file `shell-lesson-data/exercise-data/numbers.txt` contains the following lines: +The file `shell-data/lab_2_data/exercise-data/numbers.txt` contains the following lines: ```source 10 @@ -1126,7 +1180,7 @@ Hint: Try executing each command twice in a row and then examining the output fi ## Challenge: Appending Data -Consider the file `shell-lesson-data/exercise-data/animal-counts/animals.csv`. +Consider the file `shell-data/lab_2_data/exercise-data/animal-counts/animals.csv`. After these commands, select the answer that corresponds to the file `animals-subset.csv`: @@ -1207,7 +1261,7 @@ the algorithm is 'head of sort of line count of `*.pdb`'. The redirection and pipes used in the last few commands are illustrated below: -![](figs/redirects-and-pipes.svg){alt='Redirects and Pipes of different commands: "wc -l \*.pdb" will direct theoutput to the shell. "wc -l \*.pdb > lengths" will direct output to the file"lengths". "wc -l \*.pdb | sort -n | head -n 1" will build a pipeline where theoutput of the "wc" command is the input to the "sort" command, the output ofthe "sort" command is the input to the "head" command and the output of the"head" command is directed to the shell'} +![](../figs/redirects-and-pipes.svg) ## Challenge: Piping Commands Together @@ -1247,7 +1301,7 @@ so that you and other people can put those programs into pipes to multiply their ## Challenge: Pipe Reading Comprehension -A file called `animals.csv` (in the `shell-lesson-data/exercise-data/animal-counts` folder) +A file called `animals.csv` (in the `shell-data/lab_2_data/exercise-data/animal-counts` folder) contains the following data: ```source @@ -1319,7 +1373,7 @@ The file `animals.csv` contains 8 lines of data formatted as follows: The `uniq` command has a `-c` option which gives a count of the number of times a line occurs in its input. 
Assuming your current -directory is `shell-lesson-data/exercise-data/animal-counts`, +directory is `shell-data/lab_2_data/exercise-data/animal-counts`, what command would you use to produce a table that shows the total count of each type of animal in the file? @@ -1335,7 +1389,7 @@ the total count of each type of animal in the file? Nelle has run her samples through the assay machines and created 17 files in the `north-pacific-gyre` directory described earlier. -As a quick check, starting from the `shell-lesson-data` directory, Nelle types: +As a quick check, starting from the `shell-data/lab_2_data` directory, Nelle types: ```bash $ cd north-pacific-gyre @@ -1432,7 +1486,7 @@ and *only* the processed data files? -## keypoints +## Keypoints - `wc` counts lines, words, and characters in its inputs. - `cat` displays the contents of its inputs. @@ -1449,7 +1503,7 @@ and *only* the processed data files? *** # Section 3 -## objectives +## Objectives - Write a loop that applies one or more commands separately to each file in a set of files. - Trace the values taken on by a loop variable during execution of the loop. @@ -1458,7 +1512,7 @@ and *only* the processed data files? - Demonstrate how to see what commands have recently been executed. - Re-run recently executed commands without retyping them. -## questions +## Questions - How can I perform the same actions on many different files? @@ -1625,7 +1679,7 @@ How would you write a loop that echoes all 10 numbers from 0 to 9? ## Challenge: Variables in Loops -This exercise refers to the `shell-lesson-data/exercise-data/alkanes` directory. +This exercise refers to the `shell-data/lab_2_data/exercise-data/alkanes` directory. `ls *.pdb` gives the following output: ```output @@ -1657,7 +1711,7 @@ Why do these two loops give different outputs? ## Challenge: Limiting Sets of Files What would be the output of running the following loop in the -`shell-lesson-data/exercise-data/alkanes` directory? 
+`shell-data/lab_2_data/exercise-data/alkanes` directory? ```bash $ for filename in c* @@ -1690,7 +1744,7 @@ $ for filename in *c* ## Challenge: Saving to a File in a Loop - Part One -In the `shell-lesson-data/exercise-data/alkanes` directory, what is the effect of this loop? +In the `shell-data/lab_2_data/exercise-data/alkanes` directory, what is the effect of this loop? ```bash for alkanes in *.pdb @@ -1713,7 +1767,7 @@ done ## Challenge: Saving to a File in a Loop - Part Two -Also in the `shell-lesson-data/exercise-data/alkanes` directory, +Also in the `shell-data/lab_2_data/exercise-data/alkanes` directory, what would be the output of the following loop? ```bash @@ -1733,7 +1787,7 @@ done *** -Let's continue with our example in the `shell-lesson-data/exercise-data/creatures` directory. +Let's continue with our example in the `shell-data/lab_2_data/exercise-data/creatures` directory. Here's a slightly more complicated loop: ```bash @@ -1831,7 +1885,7 @@ CAAGTGTTCC *** -We would like to modify each of the files in `shell-lesson-data/exercise-data/creatures`, +We would like to modify each of the files in `shell-data/lab_2_data/exercise-data/creatures`, but also save a version of the original files. We want to copy the original files to new files named `original-basilisk.dat` and `original-unicorn.dat`, for example. We can't use: @@ -1895,7 +1949,7 @@ The following diagram shows what happens when the modified loop is executed and demonstrates how the judicious use of `echo` is a good debugging technique. -![](fig/shell_script_for_loop_flow_chart.svg){alt='The for loop "for filename in .dat; do echo cp $filename original-$filename;done" will successively assign the names of all ".dat" files in your currentdirectory to the variable "$filename" and then execute the command. 
With thefiles "basilisk.dat", "minotaur.dat" and "unicorn.dat" in the current directorythe loop will successively call the echo command three times and print threelines: "cp basislisk.dat original-basilisk.dat", then "cp minotaur.datoriginal-minotaur.dat" and finally "cp unicorn.datoriginal-unicorn.dat"'} +![](../fig/shell_script_for_loop_flow_chart.svg) ## Nelle's Pipeline: Processing Files @@ -1914,7 +1968,7 @@ Moving to the `north-pacific-gyre` directory, Nelle types: ```bash $ cd -$ cd Desktop/shell-lesson-data/north-pacific-gyre +$ cd shell-data/lab_2_data/north-pacific-gyre $ for datafile in NENE*A.txt NENE*B.txt > do > echo $datafile diff --git a/labs/lab_3.md b/labs/lab_3.md index 6c19591d7..42f623758 100644 --- a/labs/lab_3.md +++ b/labs/lab_3.md @@ -4,13 +4,211 @@ element: lab layout: default --- -# Section 1 - -## objectives +## Objectives - Load Libraries in R studio -- Load data from a text file into R - Describe the structure of a dataframe -- +- Filter a dataframe +- Summarize and transform a dataframe + +## Activities + +### R and Rstudio + +R is a programming language, but is also the software that runs R code. +We are going to use Rstudio Server to run R code. This is an +integrated development environment, so it lets you write code, run the code, +and see the results all in one environment. We are using this on the servers, +which is why it is Rstudio Server, but you can also run this on your own +computer using Rstudio Desktop. The first step is to log into the UVic VPN. +Next open a web browser and go [here](https://indri.rcs.uvic.ca/). It will +give you a warning that the website is not secure because we didn't pay +for a valid security token. Ignore the security warning and proceed to the website. +On Safari, this involves clicking "Show Details" and then "visit this website". + +![](figs/private_connection.png) + +Next you will be prompted to "Sign in to Rstudio". Here the username and password +is your UVic Netlink ID and password. 
After entering that you will see the Rstudio +Server window which looks something like this: + +![](figs/rstudio_server.png) + +In class, I'll walk through what the different windows are used for now. + +### Setup and library installation +Many people using R have developed packages. These are sets of commands that extend +the capability of R. For example, there are packages for manipulating map data +or making map figures. Many of these packages are in CRAN and can be installed +directly from R itself. Once installed, you can load these packages and use them in R; +you don't need to reinstall them every time you use them. + +We're going to install a series of packages for handling data. +```R +install.packages("dplyr") +install.packages("ggplot2") +install.packages("nycflights13") +install.packages("forcats") +install.packages("stringr") +install.packages("readr") +install.packages("tibble") +install.packages("tidyr") +install.packages("purrr") +install.packages("stringr") + +``` + +Those packages are installed, but we also have to load them before they can be used. + +```R +library("dplyr") +library("ggplot2") +library("nycflights13") +library("forcats") +library("stringr") +library("readr") +library("tibble") +library("tidyr") +library("purrr") +library("stringr") +``` + +### Dataframes + +One of the ways of storing data in R is in a dataframe. This is essentially a table with column names +and rows of data. Importantly a dataframe is always rectangular, which means each column has the same +number of rows. We're going to make our own dataframe to start. + +```R +df <- data.frame( + Name = c("Alice", "Bob", "Charlie", "David"), + Age = c(25, 30, 22, 28), + Salary = c(50000, 60000, 45000, 55000) +) +``` + +This table has three columns: Name, Age and Salary. It is stored in the object "df", although this +is arbitrary, you could call this dataframe whatever you want. + +We can look at the dataframe by calling the object.
+ +```R +df +``` + +```output + Name Age Salary +1 Alice 25 50000 +2 Bob 30 60000 +3 Charlie 22 45000 +4 David 28 55000 +``` + +#### Challenge + +We want to change our dataframe and remove the row with Bob in it. In addition, +we want to add 72 year old Abe, who has a salary of 92000. Make the new changed dataframe +and call it df2. + + +### Accessing Data + +Once our data is in a dataframe, we will want to be access it, so lets learn about several ways you +can pull out specific parts of the data. + +First method is to access specific named columns using the "$". In this case, we are pulling it out based on +the name, it doesn't matter the relative order of columns. + +```R +names <- df$Name +names +``` + +```output +[1] "Alice" "Bob" "Charlie" "David" +``` + +We've printed out all the names in the name column, in their original order. +The '[1]' indicates the position number of the first item in that line. In this case, +there is only one line of items but this can be helpful if you print out very long +lists of items. + + +We can also access a column by using its relative position. The Name column is the first +column, so we're going to print out the first column. + +```R +column_1 <- df[,1] +column_1 +``` + +```output +[1] "Alice" "Bob" "Charlie" "David" +``` + +This is a general way of accessing data in R. It uses the format [Row_number,Column_number], for +2-dimensional data. For 1-dimensional (i.e. a list) you'd only need a single number. If you have more than +2-dimensions (i.e. complex matrixes), then you can also use this notation and each number will specify the position +in a dimension. +We didn't put anything in the "Row number" portion, so it printed out all rows. +This is a general feature, if you leave a dimension blank, it includes all values. +This same function can be used to pull out specific rows, or even single values by triangulating their row and column. + + +Lastly, we can access specific columns by using the "pull()" function. 
+ +```R +names <- pull(df, Name) +names +``` +```output +[1] "Alice" "Bob" "Charlie" "David" +``` + +For this command, we're telling it to pull out the "Name" column from the "df" dataframe. +We will see this command more when we start chaining together different commands. + +### Challenge + +Write two different sets of commands to extract the age of Charlie. +Hint: You can use the [n] notation on a list. In this case, you only need to give it +one number, since this is is 1-dimensional. + + +### Using the pipe function. + +When we used the pull function, we could also organize it using a pipe. This +is similar to using pipes in command line, where you're passing the output +of one command to the next. + +```R +df |> + pull(Name) + +``` +```output +[1] "Alice" "Bob" "Charlie" "David" +``` + +In this case, we're taking the dataframe "df" and passing it to the pull command. +You'll notice that we don't have to tell pull which dataframe to use, it knows that it is +using the dataframe passed to it from the previous part of the code. +You can also use %>% for a pipe, although |> works with base-R so is more general. + +### Transform data! + +We're going to be following along with the textbook "R for data science". We're working on +chapter 3, Data Transformation: [LINK](https://r4ds.hadley.nz/data-transform) + +Important: In the tutorial, it loads "tidyverse". We have already installed and loaded +almost all of the tidyverse packages, but due to some quirks in the server it's easier +to use it the way listed here. So, where it says "library(tidyverse)", skip that command. 
+ + + + + + + diff --git a/labs/lab_4.md b/labs/lab_4.md new file mode 100644 index 000000000..e1f9b0bc0 --- /dev/null +++ b/labs/lab_4.md @@ -0,0 +1,61 @@ +--- +title: Data visualization +element: lab +layout: default +--- + +## Objectives + +- Understand the grammar of ggplot +- Apply aesthetic changes to a plot +- Visualize a distribution +- Visualize a relationship between variables + + +## Questions + +- How can I plot my data? +- What type of plot should I use for my data? + + +Today we're going to learn how to plot in R using ggplot2. +This is a package designed to streamline the production +of complicated and aesthetically pleasing plots. You can also +make plots in R without ggplot2, but its easier to learn ggplot2 +and almost any plot can be constructed with it. + +We first have to login to the VPN and then the [Rstudio Server](https://indri.rcs.uvic.ca/) +in your browser. We already installed most of the tidyverse packages +last lab (go back to lab 3 if you didn't do that). Today we're going to install +a few more packages used in the tutorial + +```R +#This has example data for plotting +install.packages("palmerpenguins") +#This has different themes for designing your plots +install.packages("ggthemes") + +``` + +Then we want to load all the packages. We're doing this instead of loading +tidyverse (which has issues on our system) + +```R +library("dplyr") +library("ggplot2") +library("nycflights13") +library("forcats") +library("stringr") +library("readr") +library("tibble") +library("tidyr") +library("purrr") +library("stringr") +library("palmerpenguins") +library("ggthemes") +``` + +Now we're going to work through [Chapter 1 of R for Data Science](https://r4ds.hadley.nz/data-visualize). + +If you finish Chapter 1 early, [Chapter 9](https://r4ds.hadley.nz/layers) +extends into more complicated plots. 
diff --git a/labs/lab_5.md b/labs/lab_5.md new file mode 100644 index 000000000..552b00349 --- /dev/null +++ b/labs/lab_5.md @@ -0,0 +1,284 @@ +--- +title: Sequence data and QC +element: lab +layout: default +--- + +## Objectives + +- To learn how to download data from the SRA +- To learn how to filter a fastq file +- To understand the structure and format of genome files +- To learn tricks for manipulating a fastq file + +**** + +Today we're going to learn how to access sequence data and prepare it for analysis. Please login to +the "indri" server and go to your home directory to start. + +As a first step, we need to download some sequence data. The North American repository of sequence data +is the Sequence Read Archive (SRA). Whenever a researcher wants to publish a genomic +analysis, they need to upload their raw data to the SRA so that other +researchers can replicate their results or use it for other purposes. +The sample we're going to work with is identified as DRR053219. + +### Activity + +- What species is this sample from? +- What project was this sample sequenced for? +- Each data file in the SRA has multiple layers of metadata. Specifically Bioproject, Biosample, Experiment and Run. +Explain the difference between each of these layers. + +**** + +To download data we are going to use fasterq-dump, a program that can pull reads from the SRA +based on their ID number. + +```bash +#First we need to load the module system on the server, if you didn't modify your .bashrc in previous lessons. +source /cvmfs/soft.computecanada.ca/config/profile/bash.sh + +#Then we load the sra-toolkit, which has fasterq-dump +module load StdEnv/2023 gcc/12.3 sra-toolkit/3.0.9 + +#Then we download the sample, splitting up the data into two files for the forward and reverse reads +fasterq-dump --skip-technical -S DRR053219 + +``` + +We now have two read files for this sample. It's good practice to examine the quality of the data +using a program like FastQC.
This program will process your samples and output a variety of stats +about the data. + + +```bash +#First load the program +module load fastqc/0.12.1 +#Then run fastqc on our samples. +fastqc DRR053219_1.fastq DRR053219_2.fastq +``` +It will produce an html report for each sample titled DRR053219_1_fastqc.html and DRR053219_2_fastqc.html. +To view those, you'll need to download them to your desktop. Open a new terminal and run scp. +```bash +scp grego@indri.rcs.uvic.ca:/home/grego/DRR*html Desktop/ +``` + +Take a look at the output and answer the following questions +### Activity +- Which read has better base quality? +- What is an overrepresented sequence and what is the likely source? Think about how this sequence got there. +- Look at the "Per base sequence content" and compare it to an example in the FastQC manual (https://dnacore.missouri.edu/PDF/FastQC_Manual.pdf). +What is this result telling us about the sequencing library? If this were a normal whole genome sequencing library, would this be a +good or bad sign? + +**** + +If you don't see any red flags in fastqc, the next step is to trim the reads. This will do three different things: +1) Remove adapter sequences, which were added to your DNA but aren't from your target sample's genome. +2) Remove low quality reads. +3) Trim poor quality bases off the ends of reads. Base quality tends to decline so the ends of reads can be very low quality. + +```bash +#First load the program +module load trimmomatic +#Then run trimmomatic on our samples. 
+java -jar $EBROOTTRIMMOMATIC/trimmomatic-0.39.jar PE DRR053219_1.fastq DRR053219_2.fastq DRR053219_1.trim.fastq DRR053219_1.unpaired.fastq DRR053219_2.trim.fastq DRR053219_2.unpaired.fastq ILLUMINACLIP:$EBROOTTRIMMOMATIC/adapters/TruSeq3-PE.fa:2:30:10:2:True LEADING:3 TRAILING:3 MINLEN:36 +``` +### Activity +- Look at the manual for trimmomatic and figure out what the arguments in the trimmomatic call mean +- If you wanted to make your filtering more stringent, what arguments would you change? +- What is TruSeq3-PE? Would you use this for every library? +- How many reads were removed by the filtering? Can you calculate that just by comparing the files? + + +*** + +We also need a reference genome to compare our sample with. Reference genomes are often available +on the NCBI website. Here's a link to the yeast (S. cerevisiae) reference genome: https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000146045.2/ + +### Activity +- Using the NCBI website, find the GPX1 gene in the yeast reference genome. Where is it? What is the nucleotide sequence of the gene? What is the function of the gene? +- What are the genes next to GPX1 in the genome? +- Using BLAST, find where this gene is in the genome: +```bash +>Unknown gene +ATGGTCAAATTAACTTCAATCGCCGCTGGTGTCGCTGCCATCGCTGCTACTGCTTCTGCAACCACCACTC +TAGCTCAATCTGACGAAAGAGTCAACTTGGTGGAATTGGGTGTCTACGTCTCTGATATCAGAGCTCACTT +AGCCCAATACTACATGTTCCAAGCCGCCCACCCAACTGAAACCTACCCAGTCGAAGTTGCTGAAGCCGTT +TTCAACTACGGTGACTTCACCACCATGTTGACCGGTATTGCTCCAGACCAAGTGACCAGAATGATCACCG +GTGTTCCATGGTACTCCAGCAGATTAAAGCCAGCCATCTCCAGTGCTCTATCCAAGGACGGTATCTACAC +TATCGCAAACTAG +``` + +*** + +To work with the yeast reference genome, we have to download it to the server. +Go back to the first yeast genome page and click on the "Curl" button. It will show you a URL, which you should copy. 
+We can use that URL and the "curl" command to download the reference genome files + +```bash +curl -o yeast_genome.zip 'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000146045.2/download?include_annotation_type=GENOME_FASTA,GENOME_GFF,RNA_FASTA,CDS_FASTA,PROT_FASTA,SEQUENCE_REPORT' + +unzip yeast_genome.zip + +``` + +Now lets take a look at what we got. + +```bash +cd ncbi_dataset/data/GCF_000146045.2 +ls +``` + +```output +cds_from_genomic.fna GCF_000146045.2_R64_genomic.fna genomic.gff protein.faa rna.fna sequence_report.jsonl +``` + +First lets talk about "GCF_000146045.2_R64_genomic.fna". This is the genome sequence in fasta format. +It uses a .fna file suffix because it's a fasta with only nucleotide information (and not amino acids, for example). + +```bash +head GCF_000146045.2_R64_genomic.fna +``` +```output +>NC_001133.9 Saccharomyces cerevisiae S288C chromosome I, complete sequence +ccacaccacacccacacacccacacaccacaccacacaccacaccacacccacacacacacatCCTAACACTACCCTAAC +ACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTTACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCAT +TCAACCATACCACTCCGAACCACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATC +CAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATACTGTTCTTCTACCCACCATAT +TGAAACGCTAACAAATGATCGTAAATAACACACACGTGCTTACCCTACCACTTTATACCACCACCACATGCCATACTCAC +CCTCACTTGTATACTGATTTTACGTACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTCTCAGATTC +CACTTCACTCCATGGCCCATCTCTCACTGAATCAGTACCAAATGCACTCACATCATTATGCACGGCACTTGCCTCAGCGG +TCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATTTTGATATCTATATCTCATTCGGCGGTcccaaat +attgtataaCTGCCCTTAATACATACGTTATACCACTTTTGCACCATATACTTACCACTCCATTTATATACACTTATGTC +``` + +Each chromosome (or contig) will be represented by a different fasta entry. +You can see that this has an ID for the first chromosome "NC_001133.9" as well +as a description of that entry. Some of the bases are lowercase, while others are +uppercase. 
The lowercase bases have been identified as being repetitive and are +what we call "soft-masked". Some programs will ignore soft-masked regions of the genome. + +We can manipulate the reference fasta file using samtools. +```bash +#First we load the samtools module +module load samtools/1.18 + +#Then we need to index the fasta file. +samtools faidx GCF_000146045.2_R64_genomic.fna + +#This creates the index file "GCF_000146045.2_R64_genomic.fna.fai". +#Sometimes index files are only machine-readable, but in this case the index file is actually helpful +head GCF_000146045.2_R64_genomic.fna.fai +``` +```output +NC_001133.9 230218 76 80 81 +NC_001134.8 813184 233249 80 81 +NC_001135.5 316620 1056676 80 81 +NC_001136.10 1531933 1377332 80 81 +NC_001137.3 576874 2928491 80 81 +NC_001138.5 270161 3512653 80 81 +NC_001139.9 1090940 3786270 80 81 +NC_001140.6 562643 4890926 80 81 +NC_001141.2 439888 5460680 80 81 +NC_001142.9 745751 5906143 80 81 +``` +This shows all of the contigs in the fasta file (first column) and the second column shows the number +of basepairs in each contig. + +### Activity +- Sometimes you will want to extract the sequence from a specific region of the genome. +You can do this using samtools faidx. Figure out how to output the region NC_001135.5:10030-10130. +- Other times, you'll want to work on only a subset of chromosomes. Write a script that finds the two longest chromosomes +and outputs only those two chromosomes to a separate fasta file. + +*** + +The directory also contains information on the annotated genes in the genome. Specifically: +1) "cds_from_genomic.fna". This is the coding sequence for all genes +2) "rna.fna". This is the RNA sequence for all genes +3) "protein.faa" This is the translated amino acid sequence for all genes. 
+4) "genomic.gff" This includes the locations and descriptions of all features in the genome (not only genes) + +The cds and rna are very similar, but the cds only includes bases that code for amino acids while +the rna includes transcribed bases upstream of the start codon and downstream of the stop codon. + +Some genome assemblies have additional information files and you can find their definitions here: https://www.ncbi.nlm.nih.gov/genome/doc/ftpfaq/ + +Lets take a look a the gff file. + +```bash +#We're piping it to `less -S` to prevent line wrapping, because the lines on this +#file can be quite long +head -n 20 genomic.gff | less -S +``` +```output +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build R64 +#!genome-build-accession NCBI_Assembly:GCF_000146045.2 +#!annotation-source SGD R64-4-1 +##sequence-region NC_001133.9 1 230218 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=559292 +NC_001133.9 RefSeq region 1 230218 . + . ID=NC_001133.9:1..230218;Dbxref=taxon:559292;Na> +NC_001133.9 RefSeq telomere 1 801 . - . ID=id-NC_001133.9:1..801;Dbxref=SGD:S00> +NC_001133.9 RefSeq origin_of_replication 707 776 . + . ID=id-NC_001133.9:707..776;Dbxr> +NC_001133.9 RefSeq gene 1807 2169 . - . ID=gene-YAL068C;Dbxref=GeneID:851229;Name=PAU8;> +NC_001133.9 RefSeq mRNA 1807 2169 . - . ID=rna-NM_001180043.1;Parent=gene-YAL068C;Dbxre> +NC_001133.9 RefSeq exon 1807 2169 . - . ID=exon-NM_001180043.1-1;Parent=rna-NM_00118004> +NC_001133.9 RefSeq CDS 1807 2169 . - 0 ID=cds-NP_009332.1;Parent=rna-NM_001180043.1;Db> +NC_001133.9 RefSeq gene 2480 2707 . + . ID=gene-YAL067W-A;Dbxref=GeneID:1466426;Name=YA> +NC_001133.9 RefSeq mRNA 2480 2707 . + . ID=rna-NM_001184582.1;Parent=gene-YAL067W-A;Dbx> +NC_001133.9 RefSeq exon 2480 2707 . + . ID=exon-NM_001184582.1-1;Parent=rna-NM_00118458> +NC_001133.9 RefSeq CDS 2480 2707 . + 0 ID=cds-NP_878038.1;Parent=rna-NM_001184582.1;Db> +NC_001133.9 RefSeq gene 7235 9016 . - . 
ID=gene-YAL067C;Dbxref=GeneID:851230;Name=SEO1;> +``` + +The start of the gff is a header, marked by the "#". It tells you which specific gff version you have and where it came from. +The first few columns tell us: +1) Chromosome or contig +2) Source of the feature +3) What type of feature +4) The start of the feature +5) The end of the feature +6) The score for the feature (in this case blank) +7) The strand the feature is on +8) The phase of the feature (in this case, what codon frame the CDS is in) +9) Additional info +You can read more about how gff format works here: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md + +In many cases, you'll want to know which features are in a particular part of the genome. Since a gff file is +actually just text, we can use some basic unix tools to filter it. A convenient option is "awk". + +```bash +awk '$1 == "NC_001133.9" && $4 >= 10290 && $5 <= 27968' genomic.gff +``` +This filters for features on chromosome NC_001133.9 that start after 10290 and end before 27968. + +### Activity +- Find all the features on NC_001148.4 from 1 to 10000 bp. +- Find all the origins of replication on NC_001133.9. How many are there? +- The previous awk command can sometimes miss features that overlap with your window. Here's an +improved version of the command +```bash +chromosome="NC_001133.9" +start_position=10290 +end_position=27968 + +awk -v chrom="$chromosome" -v start="$start_position" -v end="$end_position" \ + '($1 == chrom && ($4 <= end && $5 >= start)) || \ + ($1 == chrom && $4 >= start && $4 <= end) || \ + ($1 == chrom && $5 >= start && $5 <= end) || \ + ($1 == chrom && $4 <= start && $5 >= end)' genomic.gff +``` +Generally speaking, which features would this command capture that the previous command would miss?
+ +*** + + + + + + + diff --git a/labs/lab_6.md b/labs/lab_6.md new file mode 100644 index 000000000..4508ff6cf --- /dev/null +++ b/labs/lab_6.md @@ -0,0 +1,364 @@ +--- +title: Alignment +element: lab +layout: default +--- + +## Objectives + +- To learn how to align sequence data to a reference. +- To learn how to interpret sam and bam files. +- To learn how to manipulate a bam file. + +**** + +In this lab we're going to learn how to align illumina data to a reference genome. +This is the first step in many genomic analyses. We start off with a large number +of anonymous reads from somewhere in the genome (or perhaps a different genome altogether) +and we need to know where they come from before we can say anything about genetic variation +or gene expression. Log into the server to get started, and source your bash profile. + +After logging in, let's make a new directory for this lab. + +```bash +#First we source our bash profile to make sure we have access to the modules. +source /cvmfs/soft.computecanada.ca/config/profile/bash.sh +#Next make your new directory +mkdir lab_6 +#And enter it +cd lab_6 +``` + +I've put the required files in a shared directory on the system. We'll +copy it over to our current directory. +```bash +cp /project/ctb-grego/sharing/lab6_data.tar.gz ./ +``` + +This is a tar.gz file. A tar file (originally named for tape archive) is +a way of putting multiple files into a single file. In our case, we are also +gzipping it, which compresses the size. Most programs you download will be +stored in something like a tar.gz file. Before we can access the files in the +tar.gz file, we have to open and unzip it. +```bash +tar -xzf lab6_data.tar.gz +``` + +It's hard to remember the specific letter flags to use when unzipping a tar.gz file, +so my trick is to picture someone with a thick german accent saying "Extract Zee Files!", +for X, Z, F + +```bash +ls +``` +```output +lab6_data.tar.gz data +``` +The files we need are in the data directory, let's take a look.
+ +```bash +cd data +ls +``` + +```output +SalmonReference.fasta SalmonSim.Stabilising.p10.i2.80000_R1.fastq.gz +SalmonSim.Stabilising.p10.i1.80000_R1.fastq.gz SalmonSim.Stabilising.p10.i2.80000_R2.fastq.gz +SalmonSim.Stabilising.p10.i1.80000_R2.fastq.gz +``` + +Here we have a miniature version of the chinook salmon reference genome, named "SalmonReference.fasta". +We also have sequencing data for two samples, SalmonSim.Stabilising.p10.i1.80000 and SalmonSim.Stabilising.p10.i2.80000. +Each of them have both forward and reverse reads. This name is fairly long and complicated because of +how I generated this sequence data, but this sort of complicated naming is often what you'll see. It's +better to have a long and descriptive name (e.g. "SpeciesX.Pop102.Sample53.Tissue39"), than a short non-descriptive +one ( e.g. "Sample_A"). When plotting the data, you can switch to easier to read sample names if necessary. + +Before we can start aligning, we need to index our reference genome. Last week we indexed our reference +genomes using samtools. The alignment program we're using also needs to index the reference genome. + +```bash +module load StdEnv/2023 +module load samtools/1.18 +module load bwa-mem2/2.2.1 + +samtools faidx SalmonReference.fasta +bwa-mem2 index SalmonReference.fasta +``` + +We now have index files for our reference genome. Take a look at them. In contrast to the .fai file we worked with +last lab, these files are not actually useful to us on their own. They're necessary for the alignment +program, but they're only machine readable. Regardless, lets align our first sample! + +```bash +bwa-mem2 mem -t 2 SalmonReference.fasta SalmonSim.Stabilising.p10.i1.80000_R1.fastq.gz SalmonSim.Stabilising.p10.i1.80000_R2.fastq.gz > SalmonSim.Stabilising.p10.i1.80000.sam +``` + +This will output a lot of text as it progresses. 
For most projects, this step will take minutes or hours, +but here we're working with a small dataset of simulated reads from a small genome, so it finishes +immediately. + +We now have our data in "sam" format. This stands for Sequence Alignment/Map format. In the sam file, +we have our base calls for each read, along with the quality score. This means a sam file has all the information +in our original fastq format, plus additional info! Specifically, the sam file contains information about +whether and where each read has mapped in the genome. + +```bash +head SalmonSim.Stabilising.p10.i1.80000.sam | less -S +``` + +```output +@SQ SN:chr_1 LN:5000000 +@SQ SN:chr_2 LN:5000000 +@PG ID:bwa-mem2 PN:bwa-mem2 VN:2.2.1 CL:bwa-mem2 mem -t 2 SalmonReference.fasta SalmonSim.Stabi> +chr_1_1_0_0 99 chr_1 4629813 60 126M = 4630544 857 CTCTGCCATGCTGTGTTGTCATGTGTTACTGTCA> +chr_1_1_0_0 147 chr_1 4630544 60 126M = 4629813 -857 TGGGGTCCAATAAAACCTAGTAGTATCTTATATT> +chr_1_1_1_0 99 chr_1 3307901 60 126M = 3308085 310 TTCACTATTTTGTGGTAGCTCTGAACTTGCTCAC> +chr_1_1_1_0 147 chr_1 3308085 60 126M = 3307901 -310 TGCTTGACGGATTACTTGGGGGGGGGGTTGACAC> +chr_1_1_2_0 99 chr_1 2893195 60 126M = 2894284 1215 CACCTGCCTTGTATTTCTCAGATTTCTATCTGAG> +chr_1_1_2_0 147 chr_1 2894284 60 126M = 2893195 -1215 TTGAGGCCTCATCTTCACTTTCACTTTCCAACCT> +chr_1_1_3_0 99 chr_1 4532319 60 126M = 4532871 678 TCAGTGCCTCTTTACTTGACATCATGGGAAAATC> +``` + +The first two lines are headers, which list all the contigs in the reference genome, along with their size. After that, +we have more header lines that specify all the programs and commands used to create this file. In this case, +it has the bwa-mem2 call (and specific information about the flags used for that command call). As we +modify the sam file, more lines will get added to the header recording the steps we did. This +is very helpful if you forget what you did exactly. + +Next, we have one line per read with a bunch of columns. +1. The query template name. 
This is a unique name for each read or read pair. In this case, it has "chr_1" +because it was simulated using chr_1 as a reference, but for real data this will be a random string of characters. +2. The bitwise flag. This number tells us about the read and its mapping. There are multiple different possible +things the read could have, and the total value of the number can then be translated into that information. For example, +if the read is paired, add + 1. If the read is unmapped, add + 4. Therefore, if the final number is 5, that means it +is paired and unmapped. The decoding of all possible numbers can be found [here](https://broadinstitute.github.io/picard/explain-flags.html). +3. The contig that the read is mapped to +4. The starting location where the read maps to, on the contig. +5. The mapping quality. 60 is the highest (generally) and means high confidence in where the read is mapped. +The lowest value is 0, which means it is equally likely to map elsewhere. +6. The CIGAR string describing how the read is aligned. +7. The ID of the read pair mate (in this case, the same as the original) +8. The mapping location of the read pair mate. +9. The distance between read map pairs (sort of) +10. The sequence of the read +11. The base quality of the read +12. Any extra flags. + +## Questions +1. For one read, you find a CIGAR string "12M1I113M". Refer to the [sam format manual](https://samtools.github.io/hts-specs/SAMv1.pdf) and figure out +what this means for how the read is aligned. +2. Find the read with the lowest mapping quality in your dataset using command line programs. +3. How are the reads ordered in this sam file? +4. What are three possible reasons why a read could have very low mapping quality? + +**** + +As you can tell, reads are not ordered based on where they are aligned in the genome. Most programs +require that reads be ordered by their position in the genome because it allows them to be +rapidly accessed.
Imagine if the reads were randomly ordered but we wanted to get all the reads +from the start of chromosome 1. The program would need to search through the entire file to find +those reads before it could use them (which can take a while). When reads are ordered, +programs can know exactly where to find all the reads for a particular region. + +Therefore, we are going to sort the sam file by read position. At the same time, we'll +convert it to a "bam" file. This is a binary version of the sam file, so the file size +is smaller, but we will need to use samtools to convert it to a human readable version if +we later want to look at it. + +```bash +samtools sort SalmonSim.Stabilising.p10.i1.80000.sam > SalmonSim.Stabilising.p10.i1.80000.sort.bam +``` +We've now changed the name to end with .sort.bam to remind us that this version of the file +is sorted and it is a bam file. To access the file now, using standard text viewing programs +won't work. We need to use samtools view. Lets try that. + + +```bash +#This will print weird characters, because its in binary +head SalmonSim.Stabilising.p10.i1.80000.sort.bam + +#This will convert the file to standard text and then print it to screen +samtools view -h SalmonSim.Stabilising.p10.i1.80000.sort.bam | head +``` + +Another important step in processing alignment files is to mark duplicates. These +are cases where we have sequenced the same molecule more than once. Since they +don't represent independent observations of the genome, we need to mark them so +that we only use one copy in our calculations. We'll use picardtools for this. + +```bash +module load picard/3.1.0 +java -jar $EBROOTPICARD/picard.jar MarkDuplicates I=SalmonSim.Stabilising.p10.i1.80000.sort.bam O=SalmonSim.Stabilising.p10.i1.80000.sort.markdup.bam M=SalmonSim.Stabilising.p10.i1.80000.dupmetrics.txt +``` +In cases where there are many options to a single command, it can be easier to put each option on its own line. 
+As long as we end each line with \ the system will treat it as if it was a single line. Be careful +not to have a space or anything after the \, because then it will think that the line really has ended (and your command will mess up). + +```bash +#Alternate formatting for command +java -jar $EBROOTPICARD/picard.jar MarkDuplicates \ + I=SalmonSim.Stabilising.p10.i1.80000.sort.bam \ + O=SalmonSim.Stabilising.p10.i1.80000.sort.markdup.bam \ + M=SalmonSim.Stabilising.p10.i1.80000.dupmetrics.txt +``` + +For this example data, there are no duplicates because this is simulated. In most cases, +you will have at least some duplicates. + +Lastly, we need to index the bam file. Many programs require the file to be indexed, +before it can be used. The index files are prefixed with .bai for bam index. + +```bash +samtools index SalmonSim.Stabilising.p10.i1.80000.sort.markdup.bam +``` + +We now have a nicely sorted and labelled bam file. Lets get some stats on it, +using samtools commands. + +```bash +samtools coverage SalmonSim.Stabilising.p10.i1.80000.sort.markdup.bam +``` +This shows you a summary of the depth and coverage for your sample. +Depth indicates how many reads are on each base (on average) and +coverage tells you how much of the reference sequence is covered by +at least one base. + +### Questions +1. Use samtools stats to find out how many reads had MAPQ less than 10 in sample "SalmonSim.Stabilising.p10.i1.80000". +2. Use samtools flagstats to find out what percent of reads are mapped to the genome. + + +We can also look at the alignment itself by using samtools tview. +```bash +samtools tview SalmonSim.Stabilising.p10.i1.80000.sort.markdup.bam SalmonReference.fasta +``` +Once you're inside, use the "?" key to bring up the help menu. Then use keys to move +and look at your alignment. 
For this sample, we have very little differences between +our sample and the reference, but when we start doing variant calling we can use +this to actually look at the reads and make sure that the variants are real. + +*** + +We often sequence many samples, and each of them needs to be processed individually. +We could make one long script that does each step for each sample individually, +but it's much more manageable to use loops to repeat the same steps for +many different samples. Lets go through a toy example to learn some tricks. + +First lets start with a list of fastq files. You can make this file in nano by +copying and pasting and name it "fastq_files.txt". You could imagine making +this file using the "ls" command with real data. +```bash +cat fastq_files.txt +``` +```output +sample01_R1.fastq.gz +sample01_R2.fastq.gz +sample02_R1.fastq.gz +sample02_R2.fastq.gz +sample29_R1.fastq.gz +sample29_R2.fastq.gz +``` + +First thing is that we have both R1 and R2 files, but we're going to be treating +those together as a single sample. So lets filter out the R2 lines using "grep" + +```bash +cat fastq_files.txt | grep -v R2.fastq.gz > fastq_files.R1.txt +``` +Now we have a list of files that we want to work on. Here are two ways +we can loop through each line in that file. + +Option 1: +```bash +for x in `cat fastq_files.R1.txt` +do + echo $x +done +``` +In this option, we're doing a creating our list of $x variables by using +a mini command call in the `` marks. We could modify it on the fly with additional filters. +For example +Option 1a: +```bash +for x in `cat fastq_files.R1.txt | grep -v sample29` +do + echo $x +done +``` +Here we've excluded sample29. + +In the second option, we use a "while" loop and the "read" command. +The file that it is reading is specified at the end of the while loop. 
+Option 2: +```bash +while read x; do + echo $x +done < fastq_files.R1.txt +``` + +A second challenge when making shell scripts is that we often want to +make a series of files with similar names but different suffixes. In this case +we start off with "sample29_R1.fastq.gz". The bam file we create from this should +be sample29.bam, not sample29_R1.fastq.gz.bam. + +Here are two options for removing suffixes. +Option 1: +```bash +x=sample01_R1.fastq.gz +y=$(basename $x _R1.fastq.gz) +echo $y +``` +The "basename" removes a suffix that you specify for a string. It +can also function to remove directories before a filename. +```bash +x=fastq/data/sample01_R1.fastq.gz +y=$(basename $x) +echo $y +``` + +A more general approach is to use sed (a.k.a. stream editor) +to modify a variable. Sed can be used to find and substitute +any character pattern in a string. It works like this: +s/character to find/character to replace it with/g +The "s" at the start is for subtitution. +The "g" at the end is for global (which means it finds all instances +of the replacement, not just the first). + +Take a look at these examples and see if you understand how they work: +```bash +echo "cats" | sed s/a/o/g + +echo "cats" | sed s/a//g + +echo "cats" | sed s/a/aaaaaa/g + +echo "cats" | sed s/ats/ats_are_funky/g +``` +We can use this to rename variables. +```bash +x=sample01_R1.fastq.gz +y=$(echo $x | sed s/_R1.fastq.gz//g) +echo $y +``` + +Lastly, don't forget you can build prefixes onto variable based names by +wrapping the variable in '{}'. +```bash +x=sample01_R1.fastq.gz +y=$(echo $x | sed s/_R1.fastq.gz//g) +echo $y +echo ${y}.fastq.gz +``` + +### Questions +1. Use multiple sed commands to change the first sentence into the second sentence. "My favorite city is Vancouver because Vancouver is much better than Victoria" into "My favorite city is Victoria because Victoria is much better than Vancouver". +2. 
Extract the sample name using a command from the file named "fastq/pool_1/Pool_1.sample902_R1.fastq.gz" + +For the lab questions, you'll be working on a small set of real data. Lets copy that into your directory +```bash +cd ~ +mkdir lab_6_questions +cd lab_6_questions +cp /project/ctb-grego/sharing/lab_6/* ./ +``` diff --git a/labs/lab_7.md b/labs/lab_7.md new file mode 100644 index 000000000..a62f775f8 --- /dev/null +++ b/labs/lab_7.md @@ -0,0 +1,378 @@ +--- +title: Variant calling +element: lab +layout: default +--- + +## Objectives + +- To learn how to call variants using bcftools and freebayes. +- To read a VCF file. +- To filter a VCF file. +- To visualize a VCF file. + +**** + +When we are resequencing a species with a reference genome, the ultimate goal is to +detect where your new sample differs from the reference genome and how multiple +samples differ from each other. To do this, we are going to use the aligned sequence +data to call variants. + +First thing, log into the server and remember to source your bash.sh file so that you have access to the modules. + +I've supplied you with aligned bam files and a reference genomes on the server at "/project/ctb-grego/sharing/lab_7". + +```bash +ls -R /project/ctb-grego/sharing/lab_7 +``` +```output +bam reference + +/project/ctb-grego/sharing/lab_7/bam: +sample_0.bam sample_2.bam.bai sample_5.bam sample_7.bam.bai +sample_0.bam.bai sample_3.bam sample_5.bam.bai sample_8.bam +sample_1.bam sample_3.bam.bai sample_6.bam sample_8.bam.bai +sample_1.bam.bai sample_4.bam sample_6.bam.bai sample_9.bam +sample_2.bam sample_4.bam.bai sample_7.bam sample_9.bam.bai + +/project/ctb-grego/sharing/lab_7/reference: +SalmonReference.fasta +``` +We could copy those files into your home directory, but that would mean there will be a different +copy for every student. When files are very large or there are many users that can be a problem. +Instead, we're going to create a symbolic link. 
This looks like a file on your system, and you can use it like a file, +but the orginal data stays in one spot and isn't copied. + +```bash +#Make a directory for the new bam file links to go +mkdir lab_7/bam +cd lab_7/bam + +#Link all the bam files +ln -s /project/ctb-grego/sharing/lab_7/bam/* ./ + +#Take a look and see if it worked. +ls +``` + +```output +sample_0.bam sample_2.bam.bai sample_5.bam sample_7.bam.bai +sample_0.bam.bai sample_3.bam sample_5.bam.bai sample_8.bam +sample_1.bam sample_3.bam.bai sample_6.bam sample_8.bam.bai +sample_1.bam.bai sample_4.bam sample_6.bam.bai sample_9.bam +sample_2.bam sample_4.bam.bai sample_7.bam sample_9.bam.bai +``` + +Now lets do the same for the reference genome +```bash +mkdir ../reference +cd ../reference +ln -s /project/ctb-grego/sharing/lab_7/reference/* ./ +``` + +We're first going to use bcftools mpileup to call genotypes. We need to create a file that tells bcftools +where all the bam files are and what their names are. We can do that using shell commands. +```bash +cd ~/lab_7 +ls bam | grep -v ".bai" | sed 's/^/bam\//g' > bamlist.txt +``` +### Question +1. Break down the command used to create bamlist.txt and explain what step is doing. + + +Now we're going to call the first step, bcftools mpileup. + +```bash +module load StdEnv/2023 gcc/12.3 bcftools/1.18 + +#We're using the -q 20 option here to require that reads have mapq >= 20. +bcftools mpileup -q 20 -f reference/SalmonReference.fasta -b bamlist.txt > lab_7.g.vcf +``` + +The output of this is like a vcf file but it includes information on all sites in the genome, +but hasn't actually called genotypes yet. This is an intermediate file format before we create +the final vcf. Lets take a look. 
+ +```bash +less lab_7.g.vcf +``` +```output +##FILTER= +##bcftoolsVersion=1.18+htslib-1.18 +##bcftoolsCommand=mpileup -q 20 -f reference/SalmonReference.fasta -b bamlist.txt +##reference=file://reference/SalmonReference.fasta +##contig= +##contig= +##ALT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT bam/sample_0.bam bam/sample_1.bam bam/sample_2.bam bam/sample_3.bam ba> +chr_1 1 . T <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,0,0 0,0,0 0,0,0 0,3,33 0,0,0 0,> +chr_1 2 . G <*> 0 . DP=1;I16=1,0,0,0,14,196,0,0,60,3600,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,0,0 0,0,0 0,0,0 0,3,14 0,0,0 0,> +chr_1 3 . G <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,0,0 0,0,0 0,0,0 0,3,33 0,0,0 0,> +chr_1 4 . G <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL 0,0,0 0,0,0 0,0,0 0,3,33 0,0,0 0,> +chr_1 5 . C <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,0,0 0,0,0 0,0,0 0,3,33 0,0,0 0,> +chr_1 6 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,0,0 0,0,0 0,0,0 0,3,37 0,0,0 0,> +chr_1 7 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,0,0 0,0,0 0,0,0 0,3,37 0,0,0 0,> +chr_1 8 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,0,0 0,0,0 0,0,0 0,3,37 0,0,0 0,> +chr_1 9 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,0,0 0,0,0 0,0,0 0,3,37 0,0,0 0,> +chr_1 10 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,0,0 0,0,0 0,0,0 0,3,37 0,0,0 0,> +``` + +The file has a header denotated by the ## at the start of each line. This tells you what the abbreviations +used in the file mean. Then there's the line that starts with #CHROM, which is the column names for the rest +of the file. 
We have 9 information columns and then 1 column per sample. Each row after that is for a single +position in the genome. The most important parts of this file are: +- The first column, which is the chromosome. +- The second column, which is the position on that chromosome. +- The fourth column, which is the reference allele (i.e. the allele in the reference genome). +- The fifth column, which is the alternate allele(s). A possible deletion of this base is labelled as <*>. +- The DP value in the info column. This tells you how many reads there are in total for this site. +- The 10th column onward shows the genotype likelihoods for each sample. The three values, separated +by ",", are the phred scaled likelihoods for 0/0, 0/1 and 1/1 respectively. A 0 phred scaled likelihood +is the most likely. Higher values are less likely. Ultimately, the program is going to pick a single +genotype value based on comparing these likelihoods. If there is more than one alternate allele, +then there will be more possible genotype likelihoods. + +You'll notice that in the row that includes sample names, the name is the bam file path including its directory. +This is not great, since it contains extra information and we'd really just want to include only +the actual name of the sample, not its bam file name. The way to do this is to add a "read group" +sample to the bam file. + +```bash +module load picard/3.1.0 + +java -jar $EBROOTPICARD/picard.jar AddOrReplaceReadGroups \ + I=bam/sample_0.bam \ + O=bam/sample_0.rg.bam \ + RGSM=sample_0 \ + RGLB=sample_0 \ + RGPL=Illumina \ + RGPU=NULL +``` + +## Questions +1. We've added read group to one sample. Make a shell script that adds the read group information to all samples, and additionally +indexes the resulting bam file. Run your script. +2. RGSM is the readgroup sample information. Using the internet, figure out what information RGLB, RGPL and RGPU store. +3. The read group information is being added to the bam file.
Take a look at your new .rg.bam file, and compare it to the original bam +to figure out where the readgroup information is stored. Hint: You need to use samtools view -h to see the header. +4. We want to run bcftools mpileup on our new set of bam files with readgroup info added. Make a new bamlist.txt file that has +those files and name it bamlist.rg.txt. + +Make sure you complete the four questions above before continuing. + +Let's remake our gvcf file, with the new readgroup added bams +```bash +bcftools mpileup -q 20 -f reference/SalmonReference.fasta -b bamlist.rg.txt > lab_7.g.vcf + +``` +If we take a look at the resulting file, we can see that our samples are now correctly named. +The next step is to use this gvcf to call genotypes and create a vcf file. + +```bash +#We use the -m tag to allow for multiple alternate alleles when variant calling. +#We use the -v tag to only print variable sites. +#We use 'bcftools +fill-tags' to fill in some extra information about our variants (not necessary). +bcftools call -mv lab_7.g.vcf | bcftools +fill-tags> lab_7.bcftools.vcf +``` + +Now with this new vcf file we can see that it only includes a subset of sites in the genome. It doesn't +print out sites where there isn't a SNP or an INDEL. Take a look at the file and I'll highlight a few +important points here: +- AC tag is the alternate count, which tells you how many alternate alleles have been called. Remember that +in most cases your samples are diploid, so for 10 samples, there are 20 allele calls. +- AN tag is the number of alleles called total. It is twice the number of genotyped samples at that site. +- AF tag is the alternate allele frequency. + +The genotype field looks something like this "1/1:200,24,0". The first part is your genotype call. +0 is the reference allele, 1 is the first alternate allele, 2 is the second alternate allele, etc. A +./. means that it isn't called since there were not enough reads to know.
After the genotype is the phred +scaled likelihood of each genotype (like in the gvcf). In other variant calling programs, there can +be other information here like read depth. + +If we look at this vcf file, you may notice that many sites have an AF of 1, meaning that +all the samples have the alternate allele. This is partially an artifact of how the data was +simulated, but in any case you will find allele like this in any dataset. Since these +sites are monomorphic within our dataset, they're not particularly useful. A common way of +filtering a vcf is using a minor allele frequency cut off. + +```bash +bcftools view -q 0.1:minor lab_7.bcftools.vcf > lab_7.bcftools.maf10.vcf +``` +Now that we've created our bcftools vcf, lets go back and delete the gvcf file +because it's fairly big and have limited space on the server. +```bash +rm lab_7.g.vcf +``` + +If we wanted to create the bcftools vcf again, we can actually pipe the two +bcftool commands together without saving the gvcf. +``` +#Example, you don't need to run this +#bcftools mpileup -q 20 -d 200 -f Sockeye/ReferenceGenome/SalmonReference.fasta -b bamlist.rg.txt | bcftools call -mv > Biol470.bcftools.vcf +``` + +*** + +Another program for calling variants is freebayes (https://github.com/freebayes/freebayes). +This program uses haplotypes (i.e. multiple bases together) instead of individual sites. +It is a bit more complicated than bcftools but does better when there are more complicated +changes like INDELS, which occurs in real data. + +Side Note: The most common program for variant calling +is GATK, which is a spiritual successor for freebayes but has grown into a huge set of programs. +If you use human data, GATK is awesome, but if you're working on other organisms the quirks of GATK +are sometimes not worth the effort to make it work. + +```bash +module load StdEnv/2020 freebayes/1.3.6 +#This will take a couple minutes. 
+freebayes -L bamlist.rg.txt -f reference/SalmonReference.fasta > lab_7.fb.vcf +``` + +The first thing we will notice is that the freebayes vcf is larger than the bcftools vcf. + +```bash +ls -thor +``` +```output +drwxr-x---. 2 grego 68 Feb 25 15:18 reference +drwxr-x---. 2 grego 4.0K Feb 25 16:27 bam +-rw-r-----. 1 grego 200 Feb 25 16:32 bamlist.txt +-rw-r-----. 1 grego 1.3M Feb 25 16:36 Biol470.vcf +-rw-r-----. 1 grego 8.3M Feb 25 16:39 lab_7.bcftools.vcf +-rw-r-----. 1 grego 44K Feb 25 16:50 lab_7.bcftools.maf10.vcf +-rw-r-----. 1 grego 274 Feb 25 17:02 add_rg.sh +-rw-r-----. 1 grego 17M Feb 25 17:18 lab_7.fb.vcf +``` + +## Question: +1. How many variants were identified with bcftools? How many with Freebayes? + +If we take a look at the two files, we can see that they do actually overlap but +freebayes tends to have a lot more SNPs. This is a general property of freebayes, it +is very lenient in what it thinks might be a variant. For example, the first SNP in +bcftools is at 73673 bases, but freebayes thinks there might actually be a bunch before that. +Since this is simulated data we know the truth, which is that there are no variants before 73673 ( +just based on how I set it up). + +The key column to note here is the QUAL column (Column #6). This is the phred scaled likelihood +that there really is a variant here in any of the samples. Larger values mean it is very very unlikely +that this variant is produced by base call errors in the sequencer. We can see that for the early SNPs in +freebayes they all have very low QUAL score. When using Freebayes, you should require at least a minimum +QUAL score of 20 to look at a SNP. + +```bash +bcftools view -i 'QUAL > 20' -q 0.1:minor lab_7.fb.vcf > lab_7.fb.Q20.vcf +``` + +You should note that QUAL is a minimum filter, and doesn't guarantee that your genotypes are correct +or that this is a real SNP. You can often have SNPs generated from misalignment of paralogs or +repetitive regions. 
When working with complicated genomes like plants, it's regular to remove more than 50% of SNPs +during filtering. QUAL scores scale with minor allele frequency, since at a higher minor allele frequency there +must be more reads with the alternate base. To get high quality calls, it's important to look at some +of the other stats and to remove outliers. + +In the last part of this tutorial, we're going to visualize our vcf directly using R. This +lets us see overall patterns in missing data or genotypes and can spot problems that you won't +notice otherwise. + +Open the Rstudio Server instance (https://indri.rcs.uvic.ca). + +```R +install.packages("vcfR") +library(tidyr) +library(dplyr) +library(vcfR) +library(ggplot2) + +``` + +We're going to the vcfR package to convert our vcf file, into a tidy format for easy manipulation +in R. + +```R +vcf_file <- "/home/grego/lab_7/lab_7.fb.Q20.vcf" +vcf <- read.vcfR(vcf_file) +#extract the genotypes +gt_data <- vcfR2tidy(vcf, format_fields = 'GT' ) +gt = gt_data$gt +names(gt) = c('Chr','Position','ID','Genotype','Allele') +gt +``` +```output +# A tibble: 620 × 5 + Chr Position ID Genotype Allele + + 1 1 1055444 sample_8 NA . + 2 1 1357852 sample_8 1/1 A/A + 3 1 1901255 sample_8 NA . + 4 1 2068429 sample_8 NA . + 5 1 2069422 sample_8 NA . + 6 1 2340711 sample_8 0/0 T/T + 7 1 2366756 sample_8 1/1 G/G + 8 1 2431471 sample_8 NA . + 9 1 2746213 sample_8 1/1 C/C +10 1 2896664 sample_8 0/0 C/C +# ℹ 610 more rows +# ℹ Use `print(n = ...)` to see more rows +``` + +We now have a dataframe where each row is a single genotype for a single individual. +This gives us a lot of flexibility to plot or manipulate the data using the tidyverse +skills we learned early in the course. Here are a couple examples. 
First lets just plot +all the genotypes + +```R +gt %>% + filter(Chr == 1) %>% + ggplot(aes(y = ID, x = as.factor(Position), fill = Genotype)) + + geom_tile() + + xlab("Individuals") + + scale_fill_brewer("Genotype",palette = "Set1") + + theme_minimal(base_size=10) + ylab('Sample')+xlab('Position') + + theme(axis.text.x = element_text(size=4,angle = 90, vjust = 0.5, hjust = 1), + panel.border = element_rect(colour = "black", fill=NA, size=1), + strip.text.y.right = element_text(angle = 0)) +``` +![](../figs/lab_7.1.png) + +This could show us if there are some samples with more missing data. + +We could also query that directly by calculating the proportion of missing +genotypes by sample. + +```R +gt %>% + group_by(ID) %>% + mutate(missing = case_when(is.na(Genotype) ~ "Yes", + TRUE ~ "No")) %>% + group_by(ID, missing) %>% + summarise(n = n()) %>% + mutate(freq = n / sum(n)) %>% + filter(missing == "Yes") +``` +While there are tools to do tasks like this (e.g. vcftools), being able +to code them yourself in R gives you flexibility to customize each analyses to your +particular requirements. + + + + + + + diff --git a/labs/lab_8.md b/labs/lab_8.md new file mode 100644 index 000000000..b12fd471e --- /dev/null +++ b/labs/lab_8.md @@ -0,0 +1,396 @@ +--- +title: Population genetics +element: lab +layout: default +--- + +## Objectives + +- To run a population structure analysis +- To run a PCA on a vcf file. +- To visualize the results of a PCA and population structure +- To calculate FST on a vcf file. +- To plot FST across a genome. + + +**** + +Today we're going to explore how to measure relatedness or genetic similarity between samples +using a vcf file. Log into the indri server and remember to source your bash.sh file +so that you can access the module system. First we're going to make some directories and +copy in an example vcf file. 
+ +```bash +cd ~ +mkdir lab_8 +cd lab_8 +mkdir vcf +cp /project/ctb-grego/sharing/lab_8/chinook.g5.m2.c5.d5.20miss.subsample.vcf.gz vcf/ +``` + +This contains a thinned set of SNPs for some Chinook salmon. + +## Question +- How many samples are in the vcf file? How many SNPs? +- Extract a list of the sample names in the file. +HINT: Sample names are in the header line starting with #CHROM. +HINT: You can convert tabs to newlines using the 'tr' command. +HINT: Alternate idea is to use bcftools query + +We want to group these samples into populations, so we're going to use the program "admixture". +To use it, we need to have our data in "bed" format. Bed is the binary version of ped format, +which basically organizes the data into a table with genotype calls. Bed format also has +accessory files "fam" and "bim" which have information about family structure and SNP identity. +We're going to use plink to convert the vcf to bed. Plink is a very large set of tools that do many +tasks in population genetics, although it's largely designed for humans. This means it expects +human chromosome labels, and you have to specify when it's not. + +```bash +module load plink +plink --vcf chinook.g5.m2.c5.d5.20miss.subsample.vcf.gz \ + --out chinook.g5.m2.c5.d5.20miss.subsample \ + --make-bed \ + --allow-extra-chr \ + --double-id +ls +``` +The options for this include: +"--out" which specifies the prefix of the outfile. +"--make-bed" which tells it to make a bed file output. +"--allow-extra-chr" which allows us to use non-human chromosome names. +"--double-id" which prevents it from interpreting the "_" in the file name as indicating familial relationships. + +Take a look at the files it's made, you should see a bed, fam, log, nosex and bim file. +The next step is to run admixture. We have to give it our bed file and also a k value. +The k value is the number of groups that admixture will try to place our individuals into. +We can start with k=1, as a baseline.
+ +```bash +module load admixture +admixture chinook.g5.m2.c5.d5.20miss.subsample.bed 1 +``` + +```output +[grego@indri vcf]$ admixture chinook.g5.m2.c5.d5.20miss.subsample.bed 1 +**** ADMIXTURE Version 1.3.0 **** +**** Copyright 2008-2015 **** +**** David Alexander, Suyash Shringarpure, **** +**** John Novembre, Ken Lange **** +**** **** +**** Please cite our paper! **** +**** Information at www.genetics.ucla.edu/software/admixture **** + +Random seed: 43 +Point estimation method: Block relaxation algorithm +Convergence acceleration algorithm: QuasiNewton, 3 secant conditions +Point estimation will terminate when objective function delta < 0.0001 +Estimation of standard errors disabled; will compute point estimates only. +Invalid chromosome code! Use integers. +``` +Uh oh, invalid chromosome code? This is another relic of programs being design for +human data. It expects that the chromosome names should be "1" instead of an arbitrary +series of letters, which is what ours are. The solution to this is to rename our chromosomes to +numbers. I don't know of a tool to do that, so I wrote one for the class. + +```bash +zcat chinook.g5.m2.c5.d5.20miss.subsample.vcf.gz | \ + perl /project/ctb-grego/sharing/vcf2numericchrs.pl \ + chinook.g5.m2.c5.d5.20miss.subsample.keyfile.txt > chinook.g5.m2.c5.d5.20miss.subsample.nchrs.vcf +``` +Now we've created a new vcf, where each chromososome is a number, and +a keyfile for later if we want to convert them back into the original names +(chinook.g5.m2.c5.d5.20miss.subsample.keyfile.txt). Lets repeat the same plink steps and +try admixture again. + +```bash +plink --vcf chinook.g5.m2.c5.d5.20miss.subsample.nchrs.vcf \ + --out chinook.g5.m2.c5.d5.20miss.subsample.nchrs \ + --make-bed \ + --allow-extra-chr \ + --double-id \ + --autosome-num 95 +``` +You can see, we've added an extra flag here "--autosome-num 95". Now that we have +converted the chromosomes to numbers, it expects that there should be a max of 26. 
+We're putting in 95 here, not because salmon have 95 chromosomes, but because it's the max value
+and we just want it to process this without giving more complaints. Since we're just converting
+the file, it doesn't really matter how many chromosomes there are.
+
+```bash
+admixture chinook.g5.m2.c5.d5.20miss.subsample.nchrs.bed 1
+```
+```output
+**** ADMIXTURE Version 1.3.0 ****
+**** Copyright 2008-2015 ****
+**** David Alexander, Suyash Shringarpure, ****
+**** John Novembre, Ken Lange ****
+**** ****
+**** Please cite our paper! ****
+**** Information at www.genetics.ucla.edu/software/admixture ****
+
+Random seed: 43
+Point estimation method: Block relaxation algorithm
+Convergence acceleration algorithm: QuasiNewton, 3 secant conditions
+Point estimation will terminate when objective function delta < 0.0001
+Estimation of standard errors disabled; will compute point estimates only.
+Size of G: 61x14346
+Performing five EM steps to prime main algorithm
+1 (EM) Elapsed: 0.015 Loglikelihood: -712273 (delta): 930843
+2 (EM) Elapsed: 0.015 Loglikelihood: -712273 (delta): 0
+3 (EM) Elapsed: 0.015 Loglikelihood: -712273 (delta): 0
+4 (EM) Elapsed: 0.015 Loglikelihood: -712273 (delta): 0
+5 (EM) Elapsed: 0.015 Loglikelihood: -712273 (delta): 0
+Initial loglikelihood: -712273
+Starting main algorithm
+1 (QN/Block) Elapsed: 0.052 Loglikelihood: -712273 (delta): 0
+Summary:
+Converged in 1 iterations (0.184 sec)
+Loglikelihood: -712272.711731
+Writing output files.
+```
+
+It has now produced two output files: chinook.g5.m2.c5.d5.20miss.subsample.nchrs.1.Q and
+chinook.g5.m2.c5.d5.20miss.subsample.nchrs.1.P. The Q file includes the ancestry of each
+sample for each group. In this case, there's only one group, but this will be more useful with
+higher K values. The P file estimates allele frequencies for each SNP in each population, which
+isn't useful for us now.
+
+Ultimately for this type of algorithm, we want to try multiple K values and find which one is best.
To
+do that we use a cross-validation method. In this method, they mask some genotype values, and try
+to predict what they are, based on the groupings. Lower cross-validation error means the model
+is a better fit. Let's run this for K from 1 to 10.
+
+```bash
+for K in `seq 10`
+  do
+  admixture --cv chinook.g5.m2.c5.d5.20miss.subsample.nchrs.bed $K -j3 | tee log${K}.out;
+done
+```
+The "tee" command allows us to save the output that is printed to the screen into a file.
+We need those files to compare the CV error for each. Let's look at all the CV error scores.
+
+```bash
+grep -h CV log*.out
+```
+
+```output
+CV error (K=10): 1.00561
+CV error (K=1): 0.46015
+CV error (K=2): 0.47496
+CV error (K=3): 0.53477
+CV error (K=4): 0.60377
+CV error (K=5): 0.66514
+CV error (K=6): 0.72932
+CV error (K=7): 0.79938
+CV error (K=8): 0.86884
+CV error (K=9): 0.94260
+```
+
+From this we can see that the lowest CV is with K=1, and K=2 is only slightly worse. This
+suggests that there isn't much population grouping in our dataset. It's always good to look at
+how your groups are distributed amongst your samples and see if it makes sense biologically. To
+do that, we have to plot the Q values for each K. We're going to use R for this, so open the R
+studio server window on indri.
+ +```R +library(ggplot2) +library(dplyr) +library(readr) +library(tidyr) +library(forcats) + + +#Get sample names in order from the .fam +sample_names <- read_table("lab_8/vcf/chinook.g5.m2.c5.d5.20miss.subsample.nchrs.fam",col_names = F) %>% + select(X1) %>% + rename(sample.id=X1) +all_data <- tibble() + +#Loop for each K value +for (k in 1:10){ + #Read the Q file + data <- read_delim(paste0("lab_8/vcf/chinook.g5.m2.c5.d5.20miss.subsample.nchrs.",k,".Q"), + col_names = paste0("Q",seq(1:k)), + delim=" ") + #Add in sample names + data$sample.id <- sample_names$sample.id + #Add in the K value + data$k <- k + + #Convert the wide table to a long table, which is easier to plot + data %>% gather(Q, value, -sample.id,-k) -> data + #Bind together all the outputs of each K value + all_data <- rbind(all_data,data) +} +``` +Before we do the plotting, lets get some information about the samples. Thankfully, +all the information is in the sample name. e.g. Otsh_Chilcotin_2018_10.merged.dedup.bam. +- Otsh <- species abbreviation +- Chilcotin <- population +- 2018 <- sampling year +- 10 <- sample number. + +We probably will want to plot by year and population, so lets extract that into our table. + +```R +all_data <- all_data %>% + separate(sample.id, c("species","pop","year","spacer"), "_", remove=F ) %>% + separate(spacer, c("sample_n"),remove=T) +``` +And now, lets plot for K=2 +```R +all_data %>% + filter(k == 2) %>% + ggplot(.,aes(x=fct_reorder(sample.id,pop),y=value,fill=factor(Q))) + + geom_bar(stat="identity",position="stack") + + theme(axis.text.x = element_text(angle = 60, hjust = 1)) +``` +![](../figs/lab_8.1.png) + +This looks interesting! We're sorting the x axis by the population label, so +samples are ordered by population. We can see that the first half samples are mostly group 1 +and the second half are mostly group 2. 
This is evidence that those populations reflect +actual genetically separated populations because admixture is rediscovering the groupings, +without being given the labels. We can also explore what the groupings look like at higher levels +of K. +![](../figs/lab_8.2.png) + +## Questions +- Make a nice version of the plot for K=2. Update the axis labels, colors, theme, title and sample names. +- Make a version of the plot with K=2, where samples are ordered by their ancestry proportion. +- Try running admixture with K=10 again and see if you get identical scores. What does this mean? + +Next, lets do a PCA on the samples to see how they are related. We're going to use plink for this, +so go back into the terminal. + +```bash +plink --vcf chinook.g5.m2.c5.d5.20miss.subsample.nchrs.vcf --out chinook.g5.m2.c5.d5.20miss.subsample.nchrs --pca --allow-extra-chr --double-id --autosome-num 95 +``` +This outputs a .eigenvec and a .eigenval file. The .eigenvec file has the scores for the +first 20 Principal Components, which we want to plot. We have to load this +into R to plot it. + +```R +pca_data <- read_table("lab_8/vcf/chinook.g5.m2.c5.d5.20miss.subsample.nchrs.eigenvec", + col_names = c("sample.id","spacer",paste0("PC",1:20))) +``` +In this case, the data file does not have a header, so we have to tell it what the columns are named. +The command "paste0("PC",1:20)", makes the names PC1, PC2 ... PC20. + +Now we want to make a scatter plot of these scores. +```R +pca_data %>% + ggplot(.,aes(x=PC1,y=PC2)) + + geom_point() +``` +![](../figs/lab_8.3.png) + +## Questions +- Make a version of this plot with population as point color and year as point shape. +- Add ellipses around the population groups to show their range of values. + +Lastly, we want to calculate FST between our populations. We're going to use vcftools to +do this. Go back to the terminal. 
+
+```bash
+#First make a list of all your samples
+bcftools query -l chinook.g5.m2.c5.d5.20miss.subsample.vcf.gz > chinook.g5.m2.c5.d5.20miss.subsample.samplenames.txt
+
+#Next we need to get a list of samples for each population, Chilko and Chilcotin. We can use grep to get that.
+bcftools query -l chinook.g5.m2.c5.d5.20miss.subsample.vcf.gz | grep Chilko > chilko.txt
+bcftools query -l chinook.g5.m2.c5.d5.20miss.subsample.vcf.gz | grep Chilcotin > chilcotin.txt
+#This only worked because we had our populations in the sample name, which is why it's useful to have that.
+```
+Next we calculate Weir and Cockerham's FST from our vcf
+```bash
+vcftools --gzvcf chinook.g5.m2.c5.d5.20miss.subsample.vcf.gz \
+--weir-fst-pop chilko.txt \
+--weir-fst-pop chilcotin.txt \
+--out chinook.g5.m2.c5.d5.20miss.subsample
+```
+This gives us a per-site FST value. It's important to note here that to get the average FST
+across the entire genome, we can't just average these scores. The correct way is to
+sum the numerator and denominator of FST (which vcftools doesn't report). You'll
+need to use a different program (like hierfstat) to get that.
+
+Anyway, we should visualize this data across the genome using a manhattan plot. Let's go back to R.
+```R
+fst <- read_tsv("lab_8/vcf/chinook.g5.m2.c5.d5.20miss.subsample.weir.fst")
+fst %>%
+  filter(CHROM == "CM031199.1") %>%
+  ggplot(.,aes(x=POS,y=WEIR_AND_COCKERHAM_FST)) +
+  geom_point()
+```
+![](../figs/lab_8.4.png)
+
+That looks pretty good, but what if we wanted to put all of the chromosomes in one figure? We
+can try using facet_wrap() to facet by chromosome.
+```R
+fst %>%
+  ggplot(.,aes(x=POS,y=WEIR_AND_COCKERHAM_FST)) +
+  geom_point() +
+  facet_wrap(~CHROM, nrow=1)
+```
+![](../figs/lab_8.5.png)
+
+That works, but is not great. It has a fair amount of wasted space between the facets. Ideally,
+we'd want to have all of the SNPs in one long axis representing the cumulative genome. We
+can do this!
+ +```R +#We need to create a table with the length of each chromosome. +chr_lengths <- fst %>% + group_by(CHROM) %>% + summarize(length=max(POS)) + +#Make cumulative +chr_lengths %>% + # Calculate cumulative position of each chromosome + mutate(total=cumsum(length)-length) %>% + # Remove the length column, we don't need it anymore. + select(-length) %>% + # Add this info to the initial dataset + left_join(fst, ., by=c("CHROM"="CHROM")) %>% + # Make sure that positions are still sorted + arrange(CHROM, POS) %>% + # Now create a new column with the cumulative position + mutate( cumulative_pos=POS+total) -> fst.cumulative + +#This is used to find the middle position of each chromosome for labelling +axisdf <- fst.cumulative %>% + group_by(CHROM) %>% + summarize(center=( max(cumulative_pos) + min(cumulative_pos) ) / 2 ) +``` +With this, we can then make a single plot that includes the entire genome. +```R +fst.cumulative %>% + ggplot(.) + + geom_point( aes(x=cumulative_pos, y=WEIR_AND_COCKERHAM_FST,color=as.factor(CHROM)), + alpha=0.8, size=1.3) + + # Alternate chromosome point colors + scale_color_manual(values = rep(c("grey", "skyblue"), nrow(chr_lengths) )) + + # Custom X-axis + scale_x_continuous( label = axisdf$CHROM, breaks= axisdf$center, + guide = guide_axis(n.dodge=2)) + + theme_bw() + + theme(legend.position="none", + panel.border = element_blank(), + panel.grid.major.x = element_blank(), + panel.grid.minor.x = element_blank()) + + xlab("Chr") + + ylab("Fst") +``` +![](../figs/lab_8.6.png) + +Hurray. We've learned how to do population genetic analyses with SNPs! 
+ + + + + + + + + + + diff --git a/labs/lab_9.md b/labs/lab_9.md new file mode 100644 index 000000000..87a158ee9 --- /dev/null +++ b/labs/lab_9.md @@ -0,0 +1,328 @@ +--- +title: GWAS +element: lab +layout: default +timing: Too short +--- + +## Objectives + +- To format data for a GWA analysis +- To run a GWA +- To plot a GWA analysis +- To run a single association test manually + +**** + +In today's lab we're going to run a genome-wide association +on some example data. Login to the server and remember to source +your bash.sh to have access to the module system. + +First we're going to organize our directory and create a symbolic link +with the data file. +```shell +cd ~ +mkdir lab_9 +mkdir lab_9/vcf +mkdir lab_9/pca +mkdir lab_9/gwas +mkdir lab_9/info +cd lab_9 +ln -s /project/ctb-grego/sharing/lab_9/chinook.gwas.vcf.gz vcf/ +``` +We've created a series of different directories where we can put our different +files to keep everything organized. Moving forward, remember to put data files and output +in the correct location. Organization is very important because for many projects, +you're going to be working on them for perhaps years and you will forget where you +put individual items. + +As a first step, we're going to run a PCA on our samples. + +```shell +module load plink +plink --vcf vcf/chinook.gwas.vcf.gz --out pca/chinook.gwas --pca --allow-extra-chr --double-id --autosome-num 95 +``` + +Lets take a look at this to make sure none of the samples are behaving badly. For example, if we accidentally had a sample of the wrong species, when we look at this PCA then it will be very far from all the other samples. Open Rstudio server. 
+ +```R + +library(ggplot2) +library(dplyr) +library(readr) +library(tidyr) +library(forcats) + +pca_data <- read_table("lab_9/pca/chinook.gwas.eigenvec", + col_names = c("family.id","sample.id",paste0("PC",1:20))) + +pca_data %>% + ggplot(.,aes(x=PC1, y=PC2)) + + geom_point() +``` +![PCA plot](../figs/lab_9.1.png) + +### Question +- Based on the sample names, color code the PCA plot by year. +Are the two years different in the first two PCs? What about PCs 3 and 4? + +We want to use the PCA values as covariates for our GWA. We're going to use +plink for the GWA so we have to follow its format. It wants the first two columns to +contain the family ID and sample ID. If you had related individuals, you could use this +to show which family each was in, but in our case each individual is a random fish +and we don't expect any to be close relatives. So for each sample we're using the same +value for family ID and sample ID. + +```R +pca_data %>% + select(family.id, sample.id, PC1, PC2, PC3, PC4, PC5) %>% + write_tsv("lab_9/info/chinook.gwas.pca.txt") + +``` +The phenotype file I've included in the shared directory, so we can copy that over +```shell +cp /project/ctb-grego/sharing/lab_9/chinook.date.pheno info/chinook.gwas.pheno.txt +``` + +Now we're ready to run our first GWAS. +```shell +plink --vcf vcf/chinook.gwas.vcf.gz \ + --out gwas/chinook.gwas \ + --linear \ + --covar info/chinook.gwas.pca.txt \ + --allow-extra-chr \ + --double-id \ + --pheno info/chinook.gwas.pheno.txt \ + --all-pheno \ + --allow-no-sex +``` +With this command, we're using a linear model to test the association between this trait +and the genotype (since its a quantitative trait). We're using the first 5 PCs as covariates +to control for population stratification. We're passing the phenotype values in a separate file +with the --pheno option. + +Now lets plot this in R. 
+```R
+gwas <- read_table("lab_9/gwas/chinook.gwas.P1.assoc.linear")
+head(gwas)
+```
+```output
+> head(gwas)
+# A tibble: 6 × 10
+ CHR SNP BP A1 TEST NMISS BETA STAT P X10
+
+1 CM031216.1 . 455 T ADD 95 -3.09 -1.67 9.79e- 2 NA
+2 CM031216.1 . 455 T COV1 95 40.9 2.86 5.25e- 3 NA
+3 CM031216.1 . 455 T COV2 95 -112 -7.34 9.92e-11 NA
+4 CM031216.1 . 455 T COV3 95 46.6 3.34 1.24e- 3 NA
+5 CM031216.1 . 455 T COV4 95 0.534 0.0378 9.70e- 1 NA
+6 CM031216.1 . 455 T COV5 95 20.8 1.40 1.64e- 1 NA
+```
+For each SNP in the genome, we have multiple lines. One of the lines is the association between the SNP and our phenotype (after controlling for covariates), while the others are the
+effect of the covariate on the SNP. We aren't particularly interested in the covariate
+effects, so we can filter those out.
+```R
+gwas <- gwas %>%
+  filter(TEST == "ADD")
+```
+Lastly, when plotting GWAS, it's easier to visualize -log10(p) values, so let's create that column.
+```R
+gwas <- gwas %>%
+  mutate(log_p = -log10(P))
+```
+When making GWA plots, we often have huge numbers of points. If you try to make a figure
+with millions of points, it often bogs down your computer or results in a pdf file that
+is way too big to handle. So, an easy workaround is to filter points to not plot
+SNPs that are completely unassociated.
+```R
+gwas %>%
+  filter(log_p > 1) %>%
+  ggplot(.,aes(x=BP,y=log_p)) +
+  geom_point()
+```
+![GWAS plot](../figs/lab_9.2.png)
+
+We see a pretty big peak at the right side of the graph, as well as a
+couple high points midway across the chromosome. Let's zoom in on these.
+
+```R
+gwas %>%
+  arrange(desc(log_p)) %>%
+  head()
+```
+```output
+# A tibble: 6 × 11
+ CHR SNP BP A1 TEST NMISS BETA STAT P X10 log_p
+
+1 CM031216.1 . 12211921 C ADD 114 -12.5 -5.18 0.00000106 NA 5.97
+2 CM031216.1 . 33100788 A ADD 114 -12.2 -5.18 0.00000107 NA 5.97
+3 CM031216.1 . 12224440 T ADD 114 -12.6 -5.13 0.00000130 NA 5.88
+4 CM031216.1 .
33049078 T ADD 114 -12.1 -5.10 0.00000146 NA 5.83
+5 CM031216.1 . 32303435 T ADD 114 -9.04 -5.04 0.00000191 NA 5.72
+6 CM031216.1 . 33037939 T ADD 113 -11.1 -5.04 0.00000195 NA 5.71
+```
+```R
+top_snps <- gwas %>%
+  arrange(desc(log_p)) %>%
+  head(n=10) %>%
+  pull(BP)
+
+plotting_range <- 200000
+gwas %>%
+  filter(BP > top_snps[1] - plotting_range,
+         BP < top_snps[1] + plotting_range) %>%
+  ggplot(.,aes(x=BP,y=log_p)) +
+  geom_point()
+
+gwas %>%
+  filter(BP > top_snps[2] - plotting_range,
+         BP < top_snps[2] + plotting_range) %>%
+  ggplot(.,aes(x=BP,y=log_p)) +
+  geom_point()
+```
+
+
+![GWAS zoom 1](../figs/lab_9.3.png)
+![GWAS zoom 2](../figs/lab_9.4.png)
+
+Now a big question we might ask is: what genes are under those peaks?
+We can take a look at that in R. I've organized a list of genes, for the
+chromosome we're working on.
+
+```R
+genes <- read_tsv("/project/ctb-grego/sharing/lab_9/chinook.genes.txt")
+
+plot_1 <- gwas %>%
+  filter(BP > top_snps[1] - plotting_range,
+         BP < top_snps[1] + plotting_range) %>%
+  ggplot(.,aes(x=BP,y=log_p)) +
+  geom_point()
+
+plot_1 <- plot_1 +
+  geom_segment(data = genes %>% filter( start > top_snps[1] - plotting_range,
+                                        end < top_snps[1] + plotting_range),
+               aes(x = start, y = -0.1, xend = end, yend = -0.1), size = 2, color = "blue")
+
+plot_1
+```
+![GWAS zoom 3](../figs/lab_9.5.png)
+
+### Questions
+- What are the gene(s) under the second GWAS peak (around 33 MBp)?
+- Choose one of the genes under the peak and BLAST it against the human genome. What is the
+best hit? HINT: The LOC# is used by NCBI to track genes.
+
+
+Now that we've done some GWAS calculations let's see if we can validate our
+p-values for one SNP. We're going to extract the genotypes for one of our
+top SNPs and run our own linear model.
+ +```R +library(vcfR) +#We have to tell it the path to the original file, not the symbolic link +vcf <- read.vcfR("/project/ctb-grego/sharing/lab_9/chinook.gwas.vcf.gz") +#extract the genotypes +gt_data <- vcfR2tidy(vcf, format_fields = 'GT' ) +gt = gt_data$gt + +#This step removes the gt_data, since it hogs memory +rm(gt_data) + +#Get the genotypes for your top SNP candidate +top_gt <- gt %>% + filter(POS == top_snps[1]) + +phenotypes <- read_tsv("lab_9/info/chinook.gwas.pheno.txt", + col_names = c("Indiv","spacer","phenotype")) + +top_gt %>% + inner_join(phenotypes) %>% + ggplot(aes(x=gt_GT, y=phenotype)) + + geom_boxplot() +``` + +![boxplot](../figs/lab_9.6.png) + +We can see from this that the alternate allele seems to cause a smaller +phenotype value. Based on this, we'd probably say that the alternate +allele is recessive. Our simple model assumes its additive, but the data +fits that model as well which is why the P-value is low. Plink is basically +just running a linear model, so lets try that ourselves. + +```R +linear_model <- top_gt %>% + inner_join(phenotypes) %>% + mutate(genotype_numeric = case_when(gt_GT == "0/0" ~ 0, + gt_GT == "0/1" ~ 1, + gt_GT == "1/1" ~ 2, + TRUE ~ NA)) %>% + lm(phenotype ~ genotype_numeric, data=.) + +summary(linear_model) +``` +```output +Call: +lm(formula = phenotype ~ genotype_numeric, data = .) + +Residuals: + Min 1Q Median 3Q Max +-41.965 -8.965 -1.965 5.785 53.035 + +Coefficients: + Estimate Std. Error t value Pr(>|t|) +(Intercept) 204.965 1.828 112.122 < 2e-16 *** +genotype_numeric -14.001 3.450 -4.058 9.2e-05 *** +--- +Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 + +Residual standard error: 17.29 on 112 degrees of freedom +Multiple R-squared: 0.1282, Adjusted R-squared: 0.1204 +F-statistic: 16.47 on 1 and 112 DF, p-value: 9.2e-05 +``` +The p-value for this version of the test is 9.2e-5, which is pretty significant. 
+One difference is that we didn't control for the PCAs, so let's try that for our linear model.
+
+```R
+pcas <- read_tsv("lab_9/info/chinook.gwas.pca.txt") %>%
+  rename(Indiv = family.id)
+
+linear_model_plus <- top_gt %>%
+  inner_join(phenotypes) %>%
+  inner_join(pcas) %>%
+  mutate(genotype_numeric = case_when(gt_GT == "0/0" ~ 0,
+                                      gt_GT == "0/1" ~ 1,
+                                      gt_GT == "1/1" ~ 2,
+                                      TRUE ~ NA)) %>%
+  lm(phenotype ~ PC1 + PC2 + PC3 + PC4 + PC5+ genotype_numeric, data=.)
+
+summary(linear_model_plus)
+```
+```output
+Call:
+lm(formula = phenotype ~ PC1 + PC2 + PC3 + PC4 + PC5 + genotype_numeric,
+ data = .)
+
+Residuals:
+ Min 1Q Median 3Q Max
+-38.636 -6.602 1.060 6.247 31.398
+
+Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) 204.620 1.242 164.699 < 2e-16 ***
+PC1 49.129 11.647 4.218 5.17e-05 ***
+PC2 -113.713 11.666 -9.747 < 2e-16 ***
+PC3 57.262 11.710 4.890 3.57e-06 ***
+PC4 16.984 11.705 1.451 0.150
+PC5 -11.305 12.002 -0.942 0.348
+genotype_numeric -12.542 2.422 -5.178 1.06e-06 ***
+---
+Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+
+Residual standard error: 11.65 on 107 degrees of freedom
+Multiple R-squared: 0.6223, Adjusted R-squared: 0.6011
+F-statistic: 29.38 on 6 and 107 DF, p-value: < 2.2e-16
+```
+
+After controlling for the PCs, the genotype effect is still
+significant, suggesting it isn't only caused by correlation
+with population stratification.
diff --git a/schedule.md b/schedule.md index e44294244..64f3a9daa 100644 --- a/schedule.md +++ b/schedule.md @@ -1,10 +1,10 @@ --- layout: page -title: Assignment Schedule +title: Lab Schedule assignments: ['Unix commandline', 'Shell scripting', 'R and data wrangling', 'Data visualization', 'Sequence data and QC', -'Alignment', 'Reading Break', 'Variant calling', -'Genome assembly', 'Population genetics', 'GWAS', 'Project work', +'Alignment', 'Variant calling', +'Population genetics', 'GWAS', 'Genome assembly', 'Project work', 'Project work'] --- diff --git a/slides/week_01 b/slides/week_01 new file mode 100644 index 000000000..f2289c688 --- /dev/null +++ b/slides/week_01 @@ -0,0 +1,7 @@ +--- +layout: page +element: slides +title: Unix commandline +--- + +[Week 1 slides](https://bright.uvic.ca/d2l/le/content/333189/viewContent/2607591/View) diff --git a/slides/week_02 b/slides/week_02 new file mode 100644 index 000000000..27aff6c65 --- /dev/null +++ b/slides/week_02 @@ -0,0 +1,7 @@ +--- +layout: page +element: slides +title: Shell scripting +--- + +[Week 2 slides](https://bright.uvic.ca/d2l/le/content/333189/viewContent/2616456/View) diff --git a/slides/week_03 b/slides/week_03 new file mode 100644 index 000000000..f6037a6c9 --- /dev/null +++ b/slides/week_03 @@ -0,0 +1,7 @@ +--- +layout: page +element: slides +title: R and data wrangling +--- + +[Week 3 slides](https://bright.uvic.ca/d2l/le/content/333189/viewContent/2616457/View)