Genotyping.Rmd

---
title: "Bullsharks in Texas bays & estuaries"
output:
  html_notebook:
    code_folding: hide
    df_print: paged
    highlight: kate
    theme: flatly
    toc: yes
---

```{r load libraries, message=FALSE, warning=FALSE}

source("scr/libraries.R")
source("scr/ggplot.R")
source("scr/VCFfilterstats.R")
source("scr/HaplotypR.R")
source("scr/xtrafunctions.R")
source("scr/genind.R")

```

# Receiving and demultiplexing libraries

## Demultiplex samples

Make sure index is in the file name of the downloaded sequences before demultiplexing; input file barcode then index; `demultiplex.txt` is in UNIX format.

### Demultiplex CLE-2

Demultiplex using only single enzyme.

```{bash demultiplex CLE-2, eval=FALSE, include=FALSE}

# create demultiplexed sequence folder
mkdir /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-2
cd /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-2

# demultiplex files
demultiplex.pl -i Demultiplex_CLE-2.txt -o Extract_CLE-2.sh -e1 sphI -p /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-2 -d /home/DATA/BULLSHARKS/Cle-2
chmod 755 Extract_CLE-2.sh
./Extract_CLE-2.sh

# delete unnecessary files generated during demultiplexing
rm sample*

```

### Demultiplex CLE-3

Demultiplex using only single enzyme.

```{bash demultiplex CLE-3, eval=FALSE, include=FALSE}

# create demultiplexed sequence folder
mkdir /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-3
cd /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-3

# demultiplex files
demultiplex.pl -i Demultiplex_CLE-3.txt -o Extract_CLE-3.sh -e1 sphI -p /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-3 -d /home/DATA/BULLSHARKS/Cle-3
chmod 755 Extract_CLE-3.sh
./Extract_CLE-3.sh

# delete unnecessary files generated during demultiplexing
rm sample*

```

## Quality control demultiplexed reads

Parse process radtags log file.

```{r parse process radtags, message=FALSE, warning=FALSE}

# if already run unattach list
rm(l)

# create empty list
l <- list()

# CLE-2 ----

# number of samples in index
index <- "CGATGT"
lib <- "CLE2"
n_samples <- 29

# read in summary per barcode
l[["CLE2-2"]] <- read_table2("data/SEQ/CLE2_CGATGT_radtags.log",
            skip = 13, n_max = n_samples,
            col_names = c("BARCODE", "TOTAL_READS", "AMBIG_READS", "LQ_READS", "RETAINED")) %>%
  mutate(PROP_RETAINED = RETAINED/TOTAL_READS,
         INDEX = index,
         LIBRARY = lib) %>%
  select(LIBRARY, INDEX, BARCODE, PROP_RETAINED, TOTAL_READS, RETAINED, AMBIG_READS, LQ_READS)

# number of samples in index
index <- "TGACCA"
lib <- "CLE2"
n_samples <- 29

# read in summary per barcode
l[["CLE2-4"]] <- read_table2("data/SEQ/CLE2_TGACCA_radtags.log",
            skip = 13, n_max = n_samples,
            col_names = c("BARCODE", "TOTAL_READS", "AMBIG_READS", "LQ_READS", "RETAINED")) %>%
  mutate(PROP_RETAINED = RETAINED/TOTAL_READS,
         INDEX = index,
         LIBRARY = lib) %>%
  select(LIBRARY, INDEX, BARCODE, PROP_RETAINED, TOTAL_READS, RETAINED, AMBIG_READS, LQ_READS)

# number of samples in index
index <- "CAGATC"
lib <- "CLE2"
n_samples <- 29

# read in summary per barcode
l[["CLE2-7"]] <- read_table2("data/SEQ/CLE2_CAGATC_radtags.log",
            skip = 13, n_max = n_samples,
            col_names = c("BARCODE", "TOTAL_READS", "AMBIG_READS", "LQ_READS", "RETAINED")) %>%
  mutate(PROP_RETAINED = RETAINED/TOTAL_READS,
         INDEX = index,
         LIBRARY = lib) %>%
  select(LIBRARY, INDEX, BARCODE, PROP_RETAINED, TOTAL_READS, RETAINED, AMBIG_READS, LQ_READS)

# number of samples in index
index <- "TAGCTT"
lib <- "CLE2"
n_samples <- 29

# read in summary per barcode
l[["CLE2-10"]] <- read_table2("data/SEQ/CLE2_TAGCTT_radtags.log",
            skip = 13, n_max = n_samples,
            col_names = c("BARCODE", "TOTAL_READS", "AMBIG_READS", "LQ_READS", "RETAINED")) %>%
  mutate(PROP_RETAINED = RETAINED/TOTAL_READS,
         INDEX = index,
         LIBRARY = lib) %>%
  select(LIBRARY, INDEX, BARCODE, PROP_RETAINED, TOTAL_READS, RETAINED, AMBIG_READS, LQ_READS)

# CLE-3 ----

# number of samples in index
index <- "CGATGT"
lib <- "CLE2"
n_samples <- 29

# read in summary per barcode
l[["CLE3-2"]] <- read_table2("data/SEQ/CLE3_CGATGT_radtags.log",
            skip = 13, n_max = n_samples,
            col_names = c("BARCODE", "TOTAL_READS", "AMBIG_READS", "LQ_READS", "RETAINED")) %>%
  mutate(PROP_RETAINED = RETAINED/TOTAL_READS,
         INDEX = index,
         LIBRARY = lib) %>%
  select(LIBRARY, INDEX, BARCODE, PROP_RETAINED, TOTAL_READS, RETAINED, AMBIG_READS, LQ_READS)


# number of samples in index
index <- "TGACCA"
lib <- "CLE3"
n_samples <- 29

# read in summary per barcode
l[["CLE3-4"]] <- read_table2("data/SEQ/CLE3_TGACCA_radtags.log",
            skip = 13, n_max = n_samples,
            col_names = c("BARCODE", "TOTAL_READS", "AMBIG_READS", "LQ_READS", "RETAINED")) %>%
  mutate(PROP_RETAINED = RETAINED/TOTAL_READS,
         INDEX = index,
         LIBRARY = lib) %>%
  select(LIBRARY, INDEX, BARCODE, PROP_RETAINED, TOTAL_READS, RETAINED, AMBIG_READS, LQ_READS)


# number of samples in index
index <- "CAGATC"
lib <- "CLE3"
n_samples <- 29

# read in summary per barcode
l[["CLE3-7"]] <- read_table2("data/SEQ/CLE3_CAGATC_radtags.log",
            skip = 13, n_max = n_samples,
            col_names = c("BARCODE", "TOTAL_READS", "AMBIG_READS", "LQ_READS", "RETAINED")) %>%
  mutate(PROP_RETAINED = RETAINED/TOTAL_READS,
         INDEX = index,
         LIBRARY = lib) %>%
  select(LIBRARY, INDEX, BARCODE, PROP_RETAINED, TOTAL_READS, RETAINED, AMBIG_READS, LQ_READS)

# number of samples in index
index <- "TAGCTT"
lib <- "CLE3"
n_samples <- 29

# read in summary per barcode
l[["CLE3-10"]] <- read_table2("data/SEQ/CLE3_TAGCTT_radtags.log",
            skip = 13, n_max = n_samples,
            col_names = c("BARCODE", "TOTAL_READS", "AMBIG_READS", "LQ_READS", "RETAINED")) %>%
  mutate(PROP_RETAINED = RETAINED/TOTAL_READS,
         INDEX = index,
         LIBRARY = lib) %>%
  select(LIBRARY, INDEX, BARCODE, PROP_RETAINED, TOTAL_READS, RETAINED, AMBIG_READS, LQ_READS)

# create single data frame
radtagslog <- ldply(l, data.frame) %>%
  select(-`.id`, LQ_READS) %>%
  unite(LIB_IDX, LIBRARY, INDEX, sep = "_", remove = FALSE)

write_delim(radtagslog, "results/all.radtags.log")

```

Compare demultiplexed reads per library & index.

Plot distributions of total and proportion of retained reads.

```{r plot reads, fig.height=18, fig.width=6, message=TRUE, warning=TRUE}

radtagslog %>%
  select(LIB_IDX, LIBRARY, INDEX, BARCODE, PROP_RETAINED, TOTAL_READS) %>%
  gather(key = STAT, value = READS, 5:6) %>%
  ggplot(aes(x = READS)) +
  geom_histogram(color = "black", fill = "darkorange") +
  labs(x = "reads") +
  facet_grid(LIB_IDX ~ STAT, scales = "free") +
  theme_standard

```

# Reference construction

Values chosen for MiSeq Reference:

* **c** = 0.8
* **K1** = 5
* **K2** = 6

# Read mapping

## Map reads using BWA (in **dDocent** pipeline)

Transfer copy of `reference.fasta` and associated files for K1 = 5 and K2 = 6 into each directory containing demultiplexed and quality trimmed sequences

Any renaming of files needs to happen before `fastq`-files are mapped. The population designation (before underscore) is used by `FreeBayes` to call SNPs so it is important that population designation make biological sense. All files should be named `POP_PLATE-WELL_SAMPLENAMES`.

Run `dDocent` from within each Library directory to map reads to `reference.fasta`.

```{bash run dDocent, eval=FALSE, include=FALSE}

cd /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-2
dDocent

cd /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-3
dDocent

```

## QA/QC read mapping

### Query mapping statistics

During the mapping stage, `dDocent` calls `BWA` to map reads from the individuals in the folder to the generated MiSeqReference and create a `-RG.bam`-file for each individual. The second column of a BAM (or SAM) file contains FLAGs with binary encoded information on mapping, pairedness etc. that can be used to compare the mapping efficiency of the generated MiSeq references.

Count number of reads and mapped reads using `samtools idxstats <aln-RG.bam>` which will retrieve and print stats in the bam-file. The output is TAB-delimited with each line consisting of reference sequence name, sequence length, # mapped reads and # unmapped (empty) reads. `samtools` can also be be used to query `samtools flagstat file.bam` which returns an output containing the number of reads for which each flag is true.

#### Run Flagstats

dDocent writes out a file called `bamlist.list` that contains all the bam files that were generated during read mapping in `dDocent` using `BWA`. Write script to gather flagstats from all `bam`-files.

```{r flagstats script}

# write script to gather flagstats
l <- list()

l[["CLE2"]] <- read_table2("data/SEQ/CLE-2/bamlist.list", col_names = "BAM") %>%
  mutate(PATH = "data/SEQ/CLE-2/",
         COMMAND = "samtools flagstat",
         OUT = ">> data/SEQ/CLE2.flagstats") %>%
  select( COMMAND, PATH, BAM, OUT) %>%
  unite(FILE, 2:3, sep = "")

l[["CLE3"]] <- read_table2("data/SEQ/CLE-3/bamlist.list", col_names = "BAM") %>%
  mutate(PATH = "data/SEQ/CLE-3/",
         COMMAND = "samtools flagstat",
         OUT = ">> data/SEQ/CLE3.flagstats") %>%
  select( COMMAND, PATH, BAM, OUT) %>%
  unite(FILE, 2:3, sep = "")

bam <- ldply(l, data.frame) %>%
  select(-`.id`)

write_delim(bam, "scr/flagstats.sh", delim = "\t", col_names = FALSE)

```

Run flagstats.

```{bash run flagstats, eval=FALSE, include=FALSE}

chmod 755 scr/flagstats.sh
./scr/flagstats.sh

```

#### Run idxstats

Write script to gather idxstats from all `bam`-files.

```{r script idxstats}

# write script to gather idxstats
l <- list()

l[["CLE2"]] <- read_table2("data/SEQ/CLE-2/bamlist.list", col_names = "BAM") %>%
  mutate(PATH = "data/SEQ/CLE-2/",
         COMMAND = "samtools idxstats",
         OUT = ">> data/SEQ/CLE2.idxstats") %>%
  select( COMMAND, PATH, BAM, OUT) %>%
  unite(FILE, 2:3, sep = "")

l[["CLE3"]] <- read_table2("data/SEQ/CLE-3/bamlist.list", col_names = "BAM") %>%
  mutate(PATH = "data/SEQ/CLE-3/",
         COMMAND = "samtools idxstats",
         OUT = ">> data/SEQ/CLE3.idxstats") %>%
  select( COMMAND, PATH, BAM, OUT) %>%
  unite(FILE, 2:3, sep = "")

bam <- ldply(l, data.frame) %>%
  select(-`.id`)

write_delim(bam, "scr/idxstats.sh", delim = "\t", col_names = FALSE)

```

Run indxstats.

```{bash run idxstats, eval=FALSE, include=FALSE}

chmod 755 scr/idxstats.sh
./scr/idxstats.sh

```

### Format stats files into tidy data sets

Appending the file results in the information per individual being printed in a new set of row being appended to the file, i.e. there will be as many rows for a given locus as individuals were mapped. The file can be re-formatted and summary statistics calculated using dplyr and tidyr.

Format idxstats:

```{r format idxstats}

# create vectors of files to be imported, reference codes, K1 and K2, dataframe names
filenames <- list.files(path = "data/SEQ", pattern = "*.idxstats")
names <- substr(filenames, 1, 8)
lib <- substr(filenames, 1, 4)

# import data
for (i in names){
  filepath <- file.path("data/SEQ", paste(i, 'stats', sep =""))
  assign(i, read.table(filepath, sep = "", header = FALSE,
                     col.names = c("Locus", "Length", "Reads_Mapped", 'blank')) %>%
           select(1:3))
  }

# # make sure to delete old list if rerunning the code
# rm(dflist_idx)
# rm(MapStats.idx)

# Create list of one dataframe per idxstats file and group by locus
dflist_idx <- lapply(ls(pattern = "*.idx"), get)

for (df in 1:length(dflist_idx)){
  x <- dflist_idx[[df]]
  x[['Locus']] <- as.character(x[['Locus']])
  x = x %>% group_by(Locus)
  dflist_idx[[df]] <- x
}

# Create new dataframes with summary stats per library and bind into final output/dataframe
MapStats.idx <- data.frame()

for (df in 1:length(dflist_idx)){
    
  x = summarize(dflist_idx[[df]],
                Length = mean(Length),
                Mean_Mapped = mean(Reads_Mapped),
                Sum_Mapped = sum(Reads_Mapped),
                Min_Mapped = min(Reads_Mapped),
                Max_Mapped = max(Reads_Mapped),
                SD_Mapped = sd(Reads_Mapped))
  x[x == 0] <- NA

  temp <- summarize(x, Mean_Mapped_Non0 = mean(Mean_Mapped, na.rm = TRUE)) %>%
    mutate(Lib = lib[df],
           Not_Mapped = nrow(filter(x, is.na(Sum_Mapped))),
           N_Loci_Ref = nrow(x)) %>%
    select(Lib, N_Loci_Ref, Not_Mapped, Mean_Mapped_Non0)

  MapStats.idx <- bind_rows(MapStats.idx, temp)
}

MapStats.idx <- MapStats.idx %>%
  mutate(PROP_EMPTY = round(Not_Mapped/N_Loci_Ref*100, digits = 2),
         CONTIGS_MAPPED = N_Loci_Ref - Not_Mapped)

write.table(MapStats.idx, "results/MapStats.idx", quote = FALSE)

# remove large (duplicate) files
rm(CLE2.idx)
rm(CLE3.idx)

MapStats.idx

```

Format flagstats

```{r format flagstats}

# Files to be imported
filenames <- list.files(path='data/SEQ', pattern = '*.flagstats')

# create vectors of files to be imported
names <- substr(filenames, 1, 9)
lib <- substr(filenames, 1, 4)

# import data
for (i in names){
  filepath <- file.path('data/SEQ', paste(i, 'stats', sep =""))
  assign(i, read.csv(filepath, sep = "+", header = FALSE,
                     col.names = c("N_Reads", "CAT"),
                     stringsAsFactors = FALSE) %>%
           select(1:2))
}

# Create list of one dataframe per flagstats file and create tidy data set
# should be 3 elements/libraries
rm(dflist_flag)

dflist_flag <- lapply(ls(pattern = "*flag"), get)

# Change N_Reads to numeric
for (df in 1:length(dflist_flag)){
  x <- dflist_flag[[df]]
  x[['N_Reads']] <- as.numeric(x[['N_Reads']])
  dflist_flag[[df]] <- x
}

for (df in 1:length(dflist_flag)){
  x <- dflist_flag[[df]]
  
  n <- nrow(x)/14

  x <- x %>%
    filter(grepl("0 mapped|properly paired|mapQ>=5", CAT)) %>%
    mutate(MAPSTAT = ifelse(grepl("mapQ>=5", CAT), "Mismatch",
                   ifelse(grepl("properly", CAT), "Prop_Paired", "Mapped"))) %>%
    mutate(Ind = c(rep(1:n, each = 3))) %>% 
    # not sure if extra individual in there somehow
    select(4, 3, 1) %>%
    spread(MAPSTAT, N_Reads)

  dflist_flag[[df]] <- x
}

# Create new dataframes with summary stats and add to main final data frame
MapStats.flag <- data.frame()

for (df in 1:length(dflist_flag)){
  x = summarize(dflist_flag[[df]], Sum_Mapped = sum(Mapped),
                             Reads_Mapped = mean(Mapped),
                             Sum_Paired = sum(Prop_Paired),
                             Mean_Paired = mean(Prop_Paired),
                             Sum_Mismatch = sum(Mismatch),
                             Mean_Mismatch = mean(Mismatch)) %>%
  mutate(Lib = lib[df]) %>%
  select(7, 1:6)
  MapStats.flag <- bind_rows(MapStats.flag, x)
}

# write to file
write.table(MapStats.flag, "results/MapStats.flag", quote = FALSE)

MapStats.flag

# combine files
mapstats <- left_join(MapStats.idx, MapStats.flag) %>%
  mutate(PROP_MISMATCH = Sum_Mismatch/Sum_Mapped)

# write summary stats file
write.table(mapstats, file = "results/BWA_mapping.stats", quote = FALSE, sep = " ")

```

### Evaluate & compare mapping results

Compare number of reference contigs for which no reads mapped to per library.

```{r plot mapstats, fig.height=4, fig.width=12}

# plot no of loci vs "empty" loci
p1 <- ggplot(mapstats, aes(x = Lib, y = PROP_EMPTY)) +
  geom_bar(stat = "identity", color = "black", fill = "darkorange") +
  scale_y_continuous(limits = c(0, 6)) +
  labs(x = "", y = "% contigs w/no reads mapped") +
  theme_standard

p2 <- ggplot(mapstats, aes(x = Lib, y = Reads_Mapped)) +
  geom_bar(stat = "identity", color = "black", fill = "darkorange") +
  labs(x = "library", y = "mean reads mapped per indv") +
  theme_standard

p3 <- ggplot(mapstats, aes(x = Lib, y = PROP_MISMATCH)) +
  geom_bar(stat = "identity", color = "black", fill = "darkorange") +
  labs(x = "",y = "% reads not mapped as pair") +
  theme_standard

multiplot(p1, p2, p3, cols = 3)

```

# SNP calling

Transfer copy of `reference.fasta` and associated files for K1 = 4 and K2 = 1 into SNP calling directory.

Create symlinks from all `fq`, `bam` and `bam.bai`-files for each separately mapped library in `SNP_Calling` folder.

```{bash softlink files, eval=FALSE, include=FALSE}

ln -s /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-2/*.fq.gz /home/soleary/BULLSHARKS/CLE_POPGEN/data/SNP_Calling
ln -s /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-2/*.bam* /home/soleary/BULLSHARKS/CLE_POPGEN/data/SNP_Calling

ln -s /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-3/*.fq.gz /home/soleary/BULLSHARKS/CLE_POPGEN/data/SNP_Calling
ln -s /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-3/*.bam* /home/soleary/BULLSHARKS/CLE_POPGEN/data/SNP_Calling

# remove unnecessary files
cd /home/soleary/BULLSHARKS/CLE_POPGEN/data/SNP_Calling
rm cat-RRG.bam*

```

Execute `dDocent` from within `SNP_Calling`-folder to call variants across all individuals (all libraries).

```{bash call SNPs, eval=FALSE, include=FALSE}

cd /home/soleary/BULLSHARKS/CLE_POPGEN/data/SNP_Calling
dDocent

```

File `TotalrawSNPs.vcf` contains all raw SNP/INDEL calls. Do not need to keep links of `fq.gz`-, `bam`-, `.bam.bai`-files after SNPs have been called. Copy `TotalRaSNPs.vcf` to `VCF_Filtering` for SNP filtering.

```{bash copy TotalRawSNPs for filtering, eval=FALSE, include=FALSE}

cd /home/soleary/BULLSHARKS/CLE_POPGEN/data/SNP_Calling
rm *fq.gz *bam*

cp /home/soleary/BULLSHARKS/CLE_POPGEN/data/SNP_Calling/TotalRawSNPs.vcf /home/soleary/BULLSHARKS/CLE_POPGEN/data/VCF/temp/

```

# SNP filtering

dDocent uses FreeBayes to call SNPs and write a VCF-file `TotalrawSNPs.vcf`. This data set was filtered to remove low quality and artefactual SNP sites, paralogs and low quality individuals based on levels of missing data, minimum/maximum read depth, genotype call rate, and minor allele frequencies. Contigs may contain more than one SNP; the script `rad_haplotyper.pl` was used to create haplotypes for each locus.

## Raw data 

### Individuals/Populations sampled

Generate a list of all individuals included in the SNP data set called by FreeBayes in the dDocent pipeline.

```{bash write raw indv}

vcfsamplenames data/VCF/temp/TotalRawSNPs.vcf > data/VCF/raw.ind

```

Use `Ind_raw` file to write text files of individuals in each library and in regional groupings. 

```{r create Ind files, message=FALSE, warning=FALSE}

# import individuals in raw data set ----
Ind_RAW <- read.table("data/VCF/raw.ind", 
                      header = FALSE, stringsAsFactors = FALSE,
                      col.names = "LIB_ID") %>%
  separate(LIB_ID, into = c("SP", "WELL", "SAMPLE_ID"), sep = "_", remove = FALSE, extra = "merge")

# View(Ind_RAW)

# identify duplicates
temp <- Ind_RAW %>%
  group_by(SAMPLE_ID) %>%
  count() %>%
  filter(n > 1) %>%
  select(SAMPLE_ID)

dup <- Ind_RAW %>%
  filter(SAMPLE_ID %in% temp$SAMPLE_ID) %>%
  select(LIB_ID)

# View(temp)

write.table(dup, "data/VCF/duplicate.ind",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

# create files with individuals by population ----

SampleInfo <- read_delim("data/POPGEN/SampleInfo.txt", delim = "\t")

Ind_RAW <- left_join(Ind_RAW, SampleInfo)

temp <- Ind_RAW %>%
  filter(POP == "ARA") %>%
  select(LIB_ID)

write.table(temp, "data/VCF/ARA.ind",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

temp <- Ind_RAW %>%
  filter(POP == "CC") %>%
  select(LIB_ID)

write.table(temp, "data/VCF/CC.ind",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

temp <- Ind_RAW %>%
  filter(POP == "GAL") %>%
  select(LIB_ID)

write.table(temp, "data/VCF/GAL.ind",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

temp <- Ind_RAW %>%
  filter(POP == "MAT") %>%
  select(LIB_ID)

write.table(temp, "data/VCF/MAT.ind",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

temp <- Ind_RAW %>%
  filter(POP == "SA") %>%
  select(LIB_ID)

write.table(temp, "data/VCF/SA.ind",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

temp <- Ind_RAW %>%
  filter(POP == "SL") %>%
  select(LIB_ID)

write.table(temp, "data/VCF/SL.ind",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

# create files with individuals per library ----
temp <- Ind_RAW %>%
  filter(grepl("2A", WELL) |
         grepl("2B", WELL)) %>%
  select(LIB_ID)

write.table(temp, "data/VCF/CLE2.ind",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

temp <- Ind_RAW %>%
  filter(grepl("3A", WELL) |
         grepl("3B", WELL)) %>%
  select(LIB_ID)

write.table(temp, "data/VCF/CLE3.ind",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

```

### Compare Individual & SNP stats

Use `VCFtools` to create stats files for depth, missing data, heterozygosity and site quality for the raw data.

```{bash query raw stats}

vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/CLE_raw --depth
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/CLE_raw --site-mean-depth
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/CLE_raw --site-quality
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/CLE_raw --missing-indv
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/CLE_raw --missing-site
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/CLE_raw --het
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/CLE_raw --singletons

```

Compare raw stats.
    
```{r stats raw, fig.height=20, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_raw <- read.ind.stats(dir = "data/VCF", vcf = "CLE_raw") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_raw <- read.loc.stats(dir = "data/VCF/", vcf = "CLE_raw")

# plot missing data per indv ----
p1 <- ggplot(ind_stats_raw, aes(x = MISS_CLE_raw)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_CLE_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.5),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot read depth per indv ----
p2 <- ggplot(ind_stats_raw, aes(x = MEAN_DEPTH_CLE_raw)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p3 <- ggplot(ind_stats_raw, aes(x = MEAN_DEPTH_CLE_raw, y = MISS_CLE_raw)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_CLE_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.5),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot Fis per indv ----
p4 <- ggplot(ind_stats_raw, aes(x = Fis_CLE_raw)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Fis_CLE_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv") +
  theme_standard

# plot Fis vs missing data per indv ----
p5 <- ggplot(ind_stats_raw, aes(x = Fis_CLE_raw, y = MISS_CLE_raw)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_CLE_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_CLE_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.5),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "% missing data") +
  theme_standard

# plot Fis vs mean depth per indv ----
p6 <- ggplot(ind_stats_raw, aes(x = Fis_CLE_raw, y = MEAN_DEPTH_CLE_raw)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_CLE_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MEAN_DEPTH_CLE_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "mean depth per indv") +
  theme_standard


# plot distribution missing data per locus ----
p7 <- ggplot(loc_stats_raw, aes(x = MISS_CLE_raw)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_CLE_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p8 <- ggplot(loc_stats_raw, aes(x = MEAN_DEPTH_CLE_raw)) +
  geom_histogram(binwidth = 20, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p9 <- ggplot(loc_stats_raw, aes(x = MEAN_DEPTH_CLE_raw, y = MISS_CLE_raw)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_CLE_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

# plot depth vs SNP quality ----
site_qual <- read.table("data/VCF/CLE_raw.lqual", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(PROB = 10^(-QUAL/10))

temp <- data.frame(loc_stats_raw$MEAN_DEPTH_CLE_raw, site_qual$QUAL) %>%
  rename(depth = loc_stats_raw.MEAN_DEPTH_CLE_raw, qual = site_qual.QUAL)

p10 <- ggplot(temp, aes(x = depth, y = qual)) +
  geom_point(size = 1) +
  geom_vline(aes(xintercept = mean(depth, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(qual, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "SNP quality") +
  theme_standard

# plot number of SNPs per contig vs. mean depth ----
temp <- loc_stats_raw %>%
  count(CHR)

p11 <- left_join(temp, loc_stats_raw) %>%
  ggplot() +
  geom_point(aes(x = n, y = MEAN_DEPTH_CLE_raw)) +
  labs(x = "number of SNPs per contig", y = "mean depth") +
  theme_standard

# plot no of SNPs per locus ----
p12 <- loc_stats_raw %>%
  count(CHR) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  labs(x = "number of SNPs per locus") +
  theme_standard

mraw <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, cols=2)

```

## Choose threshold values for quality score, coverage, missing data, minor alleles and mapping/variant calling artifacts

### Filter 0: Remove LQ individuals

Remove (known) low quality individuals from data set:

Identify low quality individuals to remove from the data set; defined as individuals with a mean coverage of three or less reads across all loci and more than 85% missing data:


```{r lq indv}

LQindv <- ind_stats_raw %>%
  filter(MEAN_DEPTH_CLE_raw < 5 | MISS_CLE_raw >= 0.75) %>%
  select(INDV)

# View(LQindv)

write_delim(LQindv, "data/VCF/LQ_raw.ind", delim = "\t")

```

Remove low quality individuals and decompose indels.

```{bash filter lq indv, eval=FALSE, include=FALSE}

# decompose indels
vcfallelicprimitives data/VCF/temp/TotalRawSNPs.vcf --keep-info --keep-geno > data/VCF/temp/CLE.prim.vcf

# retain only SNPs / remove LQ individuals
vcftools --vcf data/VCF/temp/CLE.prim.vcf --out data/VCF/temp/CLE.F0 --remove-indels --remove data/VCF/LQ_raw.ind --recode --recode-INFO-all

# get duplicates stats
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/F0 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/F0 --keep data/VCF/duplicate.ind --geno-depth
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/F0 --keep data/VCF/duplicate.ind --singletons

# query stats
vcftools --vcf data/VCF/temp/CLE.F0.recode.vcf --out data/VCF/CLE.F0 --depth
vcftools --vcf data/VCF/temp/CLE.F0.recode.vcf --out data/VCF/CLE.F0 --site-mean-depth
vcftools --vcf data/VCF/temp/CLE.F0.recode.vcf --out data/VCF/CLE.F0 --missing-indv
vcftools --vcf data/VCF/temp/CLE.F0.recode.vcf --out data/VCF/CLE.F0 --missing-site
vcftools --vcf data/VCF/temp/CLE.F0.recode.vcf --out data/VCF/CLE.F0 --het
vcftools --vcf data/VCF/temp/CLE.F0.recode.vcf --out data/VCF/CLE.F0 --singletons
vcftools --vcf data/VCF/temp/CLE.F0.recode.vcf --out data/VCF/CLE.F0 --geno-depth

```

Compare stats:

```{r stats F0, fig.height=20, fig.width=10, message=TRUE, warning=TRUE}

# load stats files ----
ind_stats_F0 <- read.ind.stats(dir = "data/VCF", vcf = "CLE.F0") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE, extra = "merge")

loc_stats_F0 <- read.loc.stats(dir = "data/VCF/", vcf = "CLE.F0")

```

Import singletons and genotype depth file to create list of loci to exclude.

```{r read singleton genotype depth, message=FALSE, warning=FALSE}

# number of individuals
n <- nrow(ind_stats_F0)+2

# read singletons file
singletons <- read_table2("data/VCF/CLE.F0.singletons") %>%
  mutate(VARIANT = ifelse(ALLELE %in% c("A", "T", "C", "G"), "SNP", "INDEL"))

chrom <- unique(singletons$CHROM)

# read genotype depth file and join with singletons
gdepth <- read_table2("data/VCF/CLE.F0.gdepth") %>%
  filter(CHROM %in% chrom) %>%
  gather(key = INDV, value = DEPTH, 3:n)

singletons <- left_join(singletons, gdepth)
  
```

Data set contains `nrow(singletons)` singletons.

```{r doubletons}

# filter doubletons
doubletons <- singletons %>%
  filter(`SINGLETON/DOUBLETON` == "D")

```

Of those `nrow(doubletons)` are called as a homozygote in one individual.

```{r singletons per contig, fig.height=3, fig.width=4}

# number of contigs in data set
contigs_total <- loc_stats_F0 %>%
  distinct(CHROM)

contigs_singletons <- singletons %>%
  distinct(CHROM)

contigs <- singletons %>%
  count(CHROM)

ggplot(contigs, aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  geom_vline(aes(xintercept = mean(n, na.rm = TRUE)),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "number of singletons per locus") +
  theme_standard

```

The data set contains `nrow(contigs_total)` contigs, `nrow(contigs_singletons)` (`2round( (nrow(contigs_singletons)/nrow(contigs_total)*100), digits = 2)`%) contain singletons.

```{r distribution singpletons per contig, fig.height=3, fig.width=6}

contigs <- singletons %>%
  group_by(`SINGLETON/DOUBLETON`) %>%
  count(CHROM)

ggplot(contigs, aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  facet_grid(. ~ `SINGLETON/DOUBLETON`) +
  labs(x = "number of doubletons/singeltons per locus") +
  theme_standard

```

Target is 20 reads per locus; identify number of singletons/doubletons with depth < 10 reads.

```{r}

count(singletons, DEPTH <= 5)

```

Distriubtion of genotype depth per singleton/doubleton.

```{r distribution geno depth per singleton, fig.height=3, fig.width=6}

ggplot(singletons, aes(x = DEPTH)) +
  geom_histogram(binwidth = 20, color = "black", fill = "darkorange") +
  facet_grid(. ~ `SINGLETON/DOUBLETON`, scales = "free") +
  labs(x = "read depth") +
  scale_y_sqrt() +
  theme_standard

```

Quantiles distribution of read depth for SNP loci called in only one individual.

```{r quantile singleton depth}

quantile(singletons$DEPTH, probs = c(.05, .25, .5, .75, .95, .99))

```

Distribution of doubletons (homzygous genotype for that individual) and singletons (heterozygote genotype).

```{r distribution no singletons per indv, fig.height=3, fig.width=6, message=FALSE, warning=FALSE}

Ind <- singletons %>%
  group_by(`SINGLETON/DOUBLETON`) %>%
  count(INDV)

Ind <- left_join(Ind, ind_stats_F0)

ggplot(Ind, aes(x = n)) +
  geom_histogram(binwidth = 50, color = "black", fill = "darkorange") + 
  facet_grid(. ~ `SINGLETON/DOUBLETON`) +
  labs(x = "number of singletons per indv") +
  theme_standard

```

Quantiles distribution number of singletons called in one individual.

```{r quantile singletons per indv}

quantile(Ind$n, probs = c(.01, .05, .25, .5, .75, .99))

```

Compare the number of SNPs vs complex variants (INDELs).

```{r singleton variant type}

count(singletons, VARIANT)

```

### Filter 1: Confidence in SNP call
    
The QUAL column of a VCF file is a phred based score indicating the probability that the variant shown in the ALT column is wrong. Given the Phred quality score (Q), and the probability that a base is incorrectly called (P), Q = -10(Log10P).

A quality score of 20 indicates, a 1 in 100 chance that the SNP site has been called incorrectly (i.e. 99% probability that correct call).

Filter loci with quality score < 20. Code genotypes with genotype call quality < 20 or  genotype depth < 5 as missing.

```{bash filter LQ SNP calls}

# filter LQ SNP calls
vcftools --vcf data/VCF/temp/CLE.F0.recode.vcf --out data/VCF/temp/CLE.F1 --minQ 20 --minGQ 20 --minDP 5 --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE.F1.recode.vcf --out data/VCF/F1 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F1.recode.vcf --out data/VCF/F1 --keep data/VCF/duplicate.ind --geno-depth

# query stats
vcftools --vcf data/VCF/temp/CLE.F1.recode.vcf --out data/VCF/CLE.F1 --depth
vcftools --vcf data/VCF/temp/CLE.F1.recode.vcf --out data/VCF/CLE.F1 --site-mean-depth
vcftools --vcf data/VCF/temp/CLE.F1.recode.vcf --out data/VCF/CLE.F1 --missing-indv
vcftools --vcf data/VCF/temp/CLE.F1.recode.vcf --out data/VCF/CLE.F1 --missing-site
vcftools --vcf data/VCF/temp/CLE.F1.recode.vcf --out data/VCF/CLE.F1 --het
vcftools --vcf data/VCF/temp/CLE.F1.recode.vcf --out data/VCF/CLE.F1 --geno-depth
vcftools --vcf data/VCF/temp/CLE.F1.recode.vcf --out data/VCF/CLE.F1 --singletons
vcftools --vcf data/VCF/temp/CLE.F1.recode.vcf --out data/VCF/CLE.F1 --indv-freq-burden
vcftools --vcf data/VCF/temp/CLE.F1.recode.vcf --out data/VCF/CLE.F1 --freq 

```

Compare stats post-filtering.

```{r stats F1, fig.height=12, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_F1 <- read.ind.stats(dir = "data/VCF", vcf = "CLE.F1") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_F1 <- read.loc.stats(dir = "data/VCF/", vcf = "CLE.F1")

```

Data set contains `r nrow(ind_stats_F1)` individuals and `r nrow(loc_stats_F1)` SNP sites.

Removing low confidence SNP loci and genotype calls results in a reduction in the number of the (maximum) SNPs per locus.

```{r N SNPs, fig.height=6, fig.width=5, message=FALSE, warning=FALSE}

p1 <- loc_stats_raw %>%
  count(CHR) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  geom_vline(aes(xintercept = mean(n, na.rm = TRUE)),
             color = "darkblue", linetype = "dashed", size = 1) +
  scale_x_continuous(limits = c(0, 100)) +
  labs(x = "number of SNPs per locus") +
  theme_standard

p2 <- loc_stats_F1 %>%
  count(CHR) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  geom_vline(aes(xintercept = mean(n, na.rm = TRUE)),
             color = "darkblue", linetype = "dashed", size = 1) +
  scale_x_continuous(limits = c(0, 100)) +
  labs(x = "number of SNPs per locus") +
  theme_standard

m1a <- multiplot(p1, p2, cols=1)

```

Coding genotypes with low read depth (< 3) as missing, results in an overall increase in missing data per locus and a shift in more individuals with more missing data - this is because individuals (and loci) with coverage issues are now characterized by higher missing data.

```{r missing data F1, fig.height=4, fig.width=8, message=FALSE, warning=FALSE}

iraw <- read.table("data/VCF/CLE_raw.imiss",
                    header = TRUE, stringsAsFactors = FALSE) %>%
  select(INDV, F_MISS) %>%
  rename(raw = F_MISS)

imiss <- read.table("data/VCF/CLE.F1.imiss",
                    header = TRUE, stringsAsFactors = FALSE) %>%
  select(INDV, F_MISS) %>%
  rename(F1 = F_MISS)

imiss <- left_join(imiss, iraw)

p1 <- ggplot(imiss, aes(x = raw, y = F1)) +
  geom_point(shape = 1) +
  geom_abline(slope = 1, color = "darkblue", linetype = "dashed", size = 1) +
  scale_x_continuous(limits = c(0, 1)) +
  scale_y_continuous(limits = c(0, 1)) +
  labs(x = "indv missing data raw", y = "indv missing data F1") +
  theme_standard

lraw <- read.table("data/VCF/CLE_raw.lmiss",
                    header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  rename(raw = F_MISS)

lmiss <- read.table("data/VCF/CLE.F1.lmiss",
                    header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  rename(F1 = F_MISS)

lmiss <- left_join(lmiss, lraw)

p2 <- ggplot(lmiss, aes(x = raw, y = F1)) +
  geom_point(shape = 1) +
  geom_abline(slope = 1, color = "darkblue", linetype = "dashed", size = 1) +
  scale_x_continuous(limits = c(0, 1)) +
  scale_y_continuous(limits = c(0, 1)) +
  labs(x = "loci missing data raw", y = "loci missing data F1") +
  theme_standard

m1b <- multiplot(p1, p2, cols=2)

```

Compare depth individuals after removing LQ SNP loci and genotypes.

```{r depth F1, fig.height=4, fig.width=5, message=FALSE, warning=FALSE}

iraw <- read.table("data/VCF/CLE_raw.idepth",
                    header = TRUE, stringsAsFactors = FALSE) %>%
  select(INDV, MEAN_DEPTH) %>%
  rename(raw = MEAN_DEPTH)

idepth <- read.table("data/VCF/CLE.F1.idepth",
                    header = TRUE, stringsAsFactors = FALSE) %>%
  select(INDV, MEAN_DEPTH) %>%
  rename(F1 = MEAN_DEPTH)

idepth <- left_join(iraw, idepth)

ggplot(idepth, aes(x = raw, y = F1)) +
  geom_point(shape = 1) +
  geom_abline(slope = 1, color = "darkblue", linetype = "dashed", size = 1) +
  scale_x_continuous(limits = c(0, 150)) +
  scale_y_continuous(limits = c(0, 150)) +
  labs(x = "mean depth ind raw", y = "mean depth ind F1") +
  theme_standard

```

### Filter 2: Genotype call rate and allowed missing data per indv

Remove loci with genotype call rate < 50% and mean depth < 20 reads across all individuals.

```{bash 2a geno < 50}

vcftools --vcf data/VCF/temp/CLE.F1.recode.vcf --out data/VCF/temp/CLE.F2a --max-missing 0.5 --min-meanDP 15 --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE.F2a.recode.vcf --out data/VCF/F2a --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F2a.recode.vcf --out data/VCF/F2a --keep data/VCF/duplicate.ind --geno-depth

# library CLE-4
vcftools --vcf data/VCF/temp/CLE.F2a.recode.vcf --out data/VCF/temp/CLE2 --keep data/VCF/CLE2.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE2.recode.vcf --out data/VCF/CLE2 --missing-site

# library CLE-5
vcftools --vcf data/VCF/temp/CLE.F2a.recode.vcf --out data/VCF/temp/CLE3 --keep data/VCF/CLE3.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE3.recode.vcf --out data/VCF/CLE3 --missing-site

# Aransas Bay
vcftools --vcf data/VCF/temp/CLE.F2a.recode.vcf --out data/VCF/temp/ARA --keep data/VCF/ARA.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/ARA.recode.vcf --out data/VCF/ARA --missing-site

# Corpus Christi Bay
vcftools --vcf data/VCF/temp/CLE.F2a.recode.vcf --out data/VCF/temp/CC --keep data/VCF/CC.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/CC.recode.vcf --out data/VCF/CC --missing-site

# Galveston Bay
vcftools --vcf data/VCF/temp/CLE.F2a.recode.vcf --out data/VCF/temp/GAL --keep data/VCF/GAL.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/GAL.recode.vcf --out data/VCF/GAL --missing-site

# Matagorda Bay
vcftools --vcf data/VCF/temp/CLE.F2a.recode.vcf --out data/VCF/temp/MAT --keep data/VCF/MAT.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/MAT.recode.vcf --out data/VCF/MAT --missing-site

# San Antionio
vcftools --vcf data/VCF/temp/CLE.F2a.recode.vcf --out data/VCF/temp/SA --keep data/VCF/SA.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SA.recode.vcf --out data/VCF/SA --missing-site

# Sabine Lake
vcftools --vcf data/VCF/temp/CLE.F2a.recode.vcf --out data/VCF/temp/SL --keep data/VCF/SL.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SL.recode.vcf --out data/VCF/SL --missing-site

```

Compare distribution of missing data per locus per library.

```{r lmiss per lib, fig.height=4, fig.width=3, message=FALSE, warning=FALSE}

# create empty list
loc_missing <- list()

# import missing data per locus
loc_missing[[1]] <- read.table("data/VCF/CLE2.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(LIB = "CLE2")

loc_missing[[2]] <- read.table("data/VCF/CLE3.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(LIB = "CLE3")

# create data frame with all information
loc_missing <- ldply(loc_missing, data.frame)

ggplot(loc_missing, aes(x = F_MISS)) +
  geom_histogram(binwidth = .1, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = 0.5),
             color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "missing data per locus") +
  facet_grid(LIB ~ .) +
  theme_standard

```

Flag loci that were not called in > 25% of individuals in a given library.

```{r}

# identify loci with high missing data in each library
SNPs <- filter(loc_missing, F_MISS > 0.5) %>%
  arrange(CHR, POS)

count(SNPs, LIB)

LQloci_lib <- SNPs %>%
  select(CHR, POS) %>%
  unique()

```

A total of `r LQloci_lib` loci were called in less than 50% of individuals in one or more libraries. Loci being inconsistently called between libraries can result in library effects.

Compare distribution of missing data per locus per sample location.

```{r lmiss per loc, fig.height=6, fig.width=3, message=FALSE, warning=FALSE}

# create empty list
loc_missing <- list()

# import missing data per locus
loc_missing[[1]] <- read.table("data/VCF/ARA.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(POP = "ARA")

loc_missing[[2]] <- read.table("data/VCF/CC.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(POP = "CC")

loc_missing[[3]] <- read.table("data/VCF/GAL.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(POP = "GAL")

loc_missing[[4]] <- read.table("data/VCF/MAT.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(POP = "MAT")

loc_missing[[5]] <- read.table("data/VCF/SA.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(POP = "SA")

loc_missing[[6]] <- read.table("data/VCF/SL.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(POP = "SL")

# create data frame with all information
loc_missing <- ldply(loc_missing, data.frame)

ggplot(loc_missing, aes(x = F_MISS)) +
  geom_histogram(binwidth = .025, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = 0.25),
             color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "missing data per locus") +
  facet_grid(POP ~ .) +
  scale_y_sqrt() +
  theme_standard

```

Flag loci that were not called in > 75% of individuals at a given sample location.

```{r}

# identify loci with high missing data in each library
SNPs <- filter(loc_missing, F_MISS > 0.25) %>%
  arrange(CHR, POS)

count(SNPs, POP)

LQloci_pop <- SNPs %>%
  select(CHR, POS) %>%
  unique()

LQloci <- bind_rows(LQloci_lib, LQloci_pop) %>%
  unique()

# Write contig/position to file
write.table(LQloci, file = "data/VCF/LQ_F2b.loci", 
            col.names= FALSE, row.names = FALSE, quote = FALSE)

```

Remove loci that were not consistently called across all libraries and sample locations.

```{bash filter missing by lib}

vcftools --vcf data/VCF/temp/CLE.F2a.recode.vcf --out data/VCF/temp/CLE.F2b --exclude-positions data/VCF/LQ_F2b.loci --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE.F2b.recode.vcf --out data/VCF/F2b --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F2b.recode.vcf --out data/VCF/F2b --keep data/VCF/duplicate.ind --geno-depth

vcftools --vcf data/VCF/temp/CLE.F2b.recode.vcf --out data/VCF/CLE.F2b --missing-indv

```

Identify individuals with > 50% missing data

```{r imiss > 40, fig.height=3, fig.width=4}

# determine cutoff
imiss <- read.table("data/VCF/CLE.F2b.imiss", 
           header = TRUE, stringsAsFactors = FALSE) 

ggplot(imiss, aes(x = F_MISS)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(xintercept = 0.25, color = "darkblue", linetype = "dashed", size = 1) +
  theme_standard

imiss <- imiss %>%
  filter(F_MISS > 0.25) %>%
  select(INDV)

View(imiss)

write.table(imiss, "data/VCF/F2b_LQ.indv",
            col.names = TRUE, row.names = FALSE, quote = FALSE)

```

Remove flagged individuals.

```{bash filter LQ indv}

vcftools --vcf data/VCF/temp/CLE.F2b.recode.vcf --out data/VCF/temp/CLE.F2 --remove data/VCF/F2b_LQ.indv --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE.F2.recode.vcf --out data/VCF/F2 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F2.recode.vcf --out data/VCF/F2 --keep data/VCF/duplicate.ind --geno-depth

vcftools --vcf data/VCF/temp/CLE.F2.recode.vcf --out data/VCF/CLE.F2 --depth
vcftools --vcf data/VCF/temp/CLE.F2.recode.vcf --out data/VCF/CLE.F2 --site-mean-depth
vcftools --vcf data/VCF/temp/CLE.F2.recode.vcf --out data/VCF/CLE.F2 --missing-indv
vcftools --vcf data/VCF/temp/CLE.F2.recode.vcf --out data/VCF/CLE.F2 --missing-site
vcftools --vcf data/VCF/temp/CLE.F2.recode.vcf --out data/VCF/CLE.F2 --het
vcftools --vcf data/VCF/temp/CLE.F2.recode.vcf --out data/VCF/CLE.F2 --geno-depth

```

Analyze stats post-filtering:

```{r stats F2, fig.height=20, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_F2 <- read.ind.stats(dir = "data/VCF", vcf = "CLE.F2") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_F2 <- read.loc.stats(dir = "data/VCF/", vcf = "CLE.F2")

```

Compare change in missing data of loci and individuals.

```{r missing data F2, fig.height=3.5, fig.width=8, message=FALSE, warning=FALSE}

# import imiss ----

# create empty list
imiss <- list()

# import missing data per locus
imiss[[1]] <- read.table("data/VCF/CLE_raw.imiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(INDV, F_MISS) %>%
  mutate(FIL = "raw")

imiss[[2]] <- read.table("data/VCF/CLE.F1.imiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(INDV, F_MISS) %>%
  mutate(FIL = "F1")

imiss[[3]] <- read.table("data/VCF/CLE.F2.imiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(INDV, F_MISS) %>%
  mutate(FIL = "F2")

# create data frame with all information
imiss <- ldply(imiss, data.frame) %>%
  spread(key = FIL, value = F_MISS)

# import lmiss ----

# create empty list
lmiss <- list()

# import missing data per locus
lmiss[[1]] <- read.table("data/VCF/CLE_raw.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(FIL = "raw")

lmiss[[2]] <- read.table("data/VCF/CLE.F1.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(FIL = "F1")

lmiss[[3]] <- read.table("data/VCF/CLE.F2.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(FIL = "F2")

# create data frame with all information
lmiss <- ldply(lmiss, data.frame) %>%
  spread(key = FIL, value = F_MISS)

# plot ----
p1 <- ggplot(imiss, aes(x = F2, y = raw)) +
  geom_point(shape = 1) +
  geom_abline(slope = 1, color = "darkblue", linetype = "dashed", size = 1) +
  scale_x_continuous(limits = c(0, 0.25)) +
  scale_y_continuous(limits = c(0, 0.25)) +
  labs(x = "imiss F2", y = "imiss raw") +
  theme_standard

p2 <- ggplot(lmiss, aes(x = F2, y = raw)) +
  geom_point(shape = 1) +
  geom_abline(slope = 1, color = "darkblue", linetype = "dashed", size = 1) +
  scale_x_continuous(limits = c(0, 0.25)) +
  scale_y_continuous(limits = c(0, 0.25)) +
  labs(x = "lmiss F2", y = "lmiss raw") +
  theme_standard

m2 <- multiplot(p1, p2, cols=2)

```

### Filter 3: Filter loci and individuals based on depth, variance in depth and genotype call rate

Determine mean depth and variance per locus per library.

```{bash depth data per lib}

# library CLE-4
vcftools --vcf data/VCF/temp/CLE.F2.recode.vcf --out data/VCF/temp/CLE2 --keep data/VCF/CLE2.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE2.recode.vcf --out data/VCF/CLE2 --site-mean-depth

# library CLE-5
vcftools --vcf data/VCF/temp/CLE.F2.recode.vcf --out data/VCF/temp/CLE3 --keep data/VCF/CLE3.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE3.recode.vcf --out data/VCF/CLE3 --site-mean-depth

```

Compare distribution of depth coverage per locus per library. Identify loci that do not have consistent coverage between libraries (can lead to library effects), and/or across individuals. 

```{r depth per locus per lib, fig.height=5, fig.width=4, message=FALSE, warning=FALSE}

# create empty list
loc_depth <- list()

# import depth data per locus
loc_depth[[1]] <- read.table("data/VCF/CLE2.ldepth.mean", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(LIB = "CLE2")

loc_depth[[2]] <- read.table("data/VCF/CLE3.ldepth.mean", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(LIB = "CLE3")

# create data frame with all information
loc_depth <- ldply(loc_depth, data.frame)

ggplot(loc_depth, aes(x = MEAN_DEPTH)) +
  geom_histogram(binwidth = 20, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH, na.rm = TRUE)),
             color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 20,
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus") +
  facet_grid(LIB ~ .) +
  theme_standard

```

Identify loci with large variance in mean depth across libraries and/or individuals by calculating the coefficient of variance (STD/MEAN).

```{r calculate variation in depth, message=FALSE, warning=FALSE}

# calculate mean depth weighted by library
depth_comp <- loc_depth %>%
  group_by(LIB) %>%
  distinct(CHROM, .keep_all = TRUE) %>%
  select(-POS) %>%
  ungroup() %>%
  group_by(CHROM) %>%
  summarise(MEAN = mean(MEAN_DEPTH),
            STD = sd(MEAN_DEPTH))

# mean depth across all individuals
temp <- read.table("data/VCF/CLE.F2.ldepth.mean", 
                   header = TRUE, stringsAsFactors = FALSE) %>%
  distinct(CHROM, .keep_all = TRUE) %>%
  select(-POS) %>%
  mutate(STD_DEPTH = sqrt(VAR_DEPTH))

# calculate coefficient of variation
depth_comp <- left_join(depth_comp, temp) %>%
  mutate(COEFF_VAR_LIB = STD/MEAN*100,
         COEFF_VAR_IND = STD_DEPTH/MEAN_DEPTH*100)

```

Compare mean across all individual to mean weighted by library and coefficient of variance of read depth across individuals and between libraries:

```{r mean std depth, fig.height=7, fig.width=8.5}

p1 <- ggplot(depth_comp, aes(x = MEAN_DEPTH, y = MEAN)) +
  geom_point(shape = 1) +
  labs(x = "mean depth per locus", y = "mean weighted by lib") +
  geom_abline(slope = 1, linetype = "dashed", color = "darkred", size = 1) +
  theme_standard

p2 <- ggplot(depth_comp, aes(x = COEFF_VAR_IND)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = quantile(COEFF_VAR_IND, .99)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "coeff var depth across all indv") +
  theme_standard

p3 <- ggplot(depth_comp, aes(x = COEFF_VAR_IND, y = COEFF_VAR_LIB)) +
  geom_point(shape = 1) +
  scale_x_continuous(limits = c(0, 200)) +
  scale_y_continuous(limits = c(0, 100)) +
  labs(x = "coeff var depth across all indv", y = "coeff var mean depth betw lib") +
  theme_standard

p4 <- ggplot(depth_comp, aes(x = COEFF_VAR_LIB)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = quantile(COEFF_VAR_LIB, .99)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "coeff var depth between lib") +
  theme_standard

m3a <- multiplot(p1, p2, p3, p4, cols=2)

```

If loci have consistent coverage across loci the mean read depth per locus across all individuals and weighted by library should fall on the red diagonal. 

Flag loci that have high variation in mean coverage between individuals and between libraries.

```{r LQ loci variation coverage}

# identify loci high variation in coverage
contigs_var <- depth_comp %>%
  filter(COEFF_VAR_LIB > 90 | COEFF_VAR_IND > 120)

SNPs_var <- filter(loc_depth, CHROM %in% contigs_var$CHROM) %>%
  distinct(CHROM, POS)

# Write contig/position to text file, use file with vcftools to remove positions from dataset 
write.table(SNPs_var, file = "data/VCF/LQ_F3.loci", 
            col.names= FALSE, row.names = FALSE, quote = FALSE)

```

Remove loci flagged for high variance in depth across all individuals and between libraries (removes library effects), filter loci with mean depth < 20 and genotype call rate < 75%

```{bash filter variance in coverage loci}

vcftools --vcf data/VCF/temp/CLE.F2.recode.vcf --out data/VCF/temp/CLE.F3a --exclude-positions data/VCF/LQ_F3.loci --min-meanDP 15 --max-missing 0.75 --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE.F3a.recode.vcf --out data/VCF/F3a --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F3a.recode.vcf --out data/VCF/F3a --keep data/VCF/duplicate.ind --geno-depth

vcftools --vcf data/VCF/temp/CLE.F3a.recode.vcf --out data/VCF/CLE.F3a --depth
vcftools --vcf data/VCF/temp/CLE.F3a.recode.vcf --out data/VCF/CLE.F3a --geno-depth

```

Compare mean depth and number of sites called per individual.

```{r idepth sites vs depth I, fig.height=3, fig.width=4.5, message=FALSE, warning=FALSE}

read_table2("data/VCF/CLE.F3a.idepth") %>%
  ggplot(aes(x = N_SITES, y = MEAN_DEPTH)) +
  geom_point(shape = 1, size = 1.5) +
  geom_hline(yintercept = 10, linetype = "dashed", color = "darkred", size = 0.75) +
  geom_hline(yintercept = 20, linetype = "dashed", color = "darkblue", size = 0.75) +
  labs(x = "number of sites", y = "mean read depth") +
  theme_standard

```

Use genotype depth file to identify individuals with high variance in read depth across loci.

```{r variance depth individual I}

# number of individuals
n <- nrow(ind_stats_F2)+1

# read genotype depth & code values < 5 as missing
gdepth <- read_table2("data/VCF/CLE.F3a.gdepth") %>%
  select(-POS) %>%
  distinct(CHROM, .keep_all = TRUE) %>%
  gather(key = INDV, value = DEPTH, 3:n) %>%
  mutate(DEPTH = as.numeric(gsub(-1, 0, DEPTH)),
         DEPTH = as.numeric(gsub("\\<1\\>", 0, DEPTH)),
         DEPTH = as.numeric(gsub("\\<2\\>", 0, DEPTH)),
         DEPTH = as.numeric(gsub("\\<3\\>", 0, DEPTH)),
         DEPTH = as.numeric(gsub("\\<4\\>", 0, DEPTH)))

# calculate summary statistics
idepth <- gdepth %>%
  group_by(INDV) %>%
  summarize(TOTAL = sum(DEPTH),
            MAX = max(DEPTH),
            MEAN_NON0 = mean(DEPTH[DEPTH > 0]),
            MEAN = mean(DEPTH),
            MEDIAN = median(DEPTH),
            VAR = var(DEPTH),
            STD = sd(DEPTH)) %>%
  mutate(COEFF_VAR = STD/MEAN,
         RATIO_MEAN_MEDIAN = MEAN/MEDIAN)

idepth <- left_join(idepth, ind_stats_F2)

```

Compare variability of depth within individuals.

```{r, fig.height=15, fig.width=10}

p1 <- ggplot(idepth, aes(x = MAX, y = TOTAL)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p2 <- ggplot(idepth, aes(x = TOTAL, y = VAR)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p3 <- ggplot(idepth, aes(MAX, MEDIAN)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p4 <- ggplot(idepth, aes(x = MEAN, y = MEDIAN)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p5 <- ggplot(idepth, aes(x = COEFF_VAR, y = RATIO_MEAN_MEDIAN)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p6 <- ggplot(idepth, aes(MEAN)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(xintercept = 15, color = "darkred", linetype = "dashed") +
  theme_standard

p7 <- ggplot(idepth, aes(MEDIAN)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(xintercept = 10, color = "darkred", linetype = "dashed") +
  theme_standard

p8 <- ggplot(idepth, aes(RATIO_MEAN_MEDIAN)) +
  geom_histogram(binwidth = 0.1, color = "black", fill = "darkorange") +
  theme_standard

p9 <- ggplot(idepth, aes(COEFF_VAR)) +
  geom_histogram(binwidth = 0.025, color = "black", fill = "darkorange") +
  theme_standard

p10 <- ggplot(idepth, aes(x = COEFF_VAR, y = MISS_CLE.F2)) +
  geom_point(shape = 1) +
  geom_hline(yintercept = 0.25, linetype = "dashed", color = "darkred", size = 0.75) +
  theme_standard


m3b <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, cols=2)

```

Flag LQ individuals.

```{r}

LQ_depth <- idepth %>%
 filter(MEAN <= 10 | MEDIAN <= 5) %>%
 select(INDV)

LQ_depth
 
write.table(LQ_depth, "data/VCF/F3_LQ.indv",
           col.names = TRUE, row.names = FALSE, quote = FALSE)

```

Remove flagged loci and individuals.

```{bash filter variance in coverage}

vcftools --vcf data/VCF/temp/CLE.F3a.recode.vcf --out data/VCF/temp/CLE.F3 --remove data/VCF/F3_LQ.indv --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE.F3.recode.vcf --out data/VCF/F3 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F3.recode.vcf --out data/VCF/F3 --keep data/VCF/duplicate.ind --geno-depth

vcftools --vcf data/VCF/temp/CLE.F3.recode.vcf --out data/VCF/CLE.F3 --depth
vcftools --vcf data/VCF/temp/CLE.F3.recode.vcf --out data/VCF/CLE.F3 --site-mean-depth
vcftools --vcf data/VCF/temp/CLE.F3.recode.vcf --out data/VCF/CLE.F3 --missing-indv
vcftools --vcf data/VCF/temp/CLE.F3.recode.vcf --out data/VCF/CLE.F3 --missing-site
vcftools --vcf data/VCF/temp/CLE.F3.recode.vcf --out data/VCF/CLE.F3 --het

```

Compare stats post-filtering:

```{r stats F3, fig.height=10, fig.width=8, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_F3 <- read.ind.stats(dir = "data/VCF", vcf = "CLE.F3") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_F3 <- read.loc.stats(dir = "data/VCF/", vcf = "CLE.F3")

# plot missing data per indv ----
p1 <- ggplot(ind_stats_F3, aes(x = MISS_CLE.F3)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_CLE.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.25),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot read depth per indv ----
p2 <- ggplot(ind_stats_F3, aes(x = MEAN_DEPTH_CLE.F3)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p3 <- ggplot(ind_stats_F3, aes(x = MEAN_DEPTH_CLE.F3, y = MISS_CLE.F3)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_CLE.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot distribution missing data per locus ----
p4 <- ggplot(loc_stats_F3, aes(x = MISS_CLE.F3)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_CLE.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p5 <- ggplot(loc_stats_F3, aes(x = MEAN_DEPTH_CLE.F3)) +
  geom_histogram(binwidth = 20, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p6 <- ggplot(loc_stats_F3, aes(x = MEAN_DEPTH_CLE.F3, y = MISS_CLE.F3)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_CLE.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

m3 <- multiplot(p1, p2, p3, p4, p5, p6, cols=2)

```

### Filter 4: Allele balance

AB: Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous

```{bash query INFO stats I, eval=FALSE, include=FALSE}

cut -f8 CLE.F3.recode.vcf | grep -oe "AB=[[:digit:]].[[:digit:]][[:digit:]][[:digit:]]" | sed -s 's/AB=//g' > CLE.F4.AB

```

Allele balance is the ratio of reads for reference allele to all reads, considering only reads from individuals called as heterozygous. Values range from 0 - 1; allele balance (for real loci) should be approx. 0.5. Filter contigs SNPs for which the with allele balance < 0.25 and > 0.75.

```{r plot AB, fig.height=4, fig.width=5}

read.table("data/VCF/temp/CLE.F4.AB",
           col.names = "AB", stringsAsFactors = FALSE) %>%
  ggplot(aes(x = AB)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(xintercept = 0.5, color = "red", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 0.25, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 0.75, color = "darkblue", linetype = "dashed", size = 1) +
  theme_standard

```

Filter contigs with SNP calls with AB > 0.25, AB > 0.75; retain loci very close to 0 (retain loci that are fixed variants). Remove genotypes if the quality sum of the reference or alternate allele was 0.

```{bash, eval=FALSE, include=FALSE}

# this works... not sure why cant write to file in the folder
vcffilter -s -f "AB > 0.25 & AB < 0.75 | AB < 0.01 | AB > 0.99" -s -g "QR > 0 | QA > 0" CLE.F3.recode.vcf > CLE.F4.vcf 

mawk '!/#/' CLE.F4.vcf | wc -l

```

Remaining SNPs: 65,837.


### Filter 5: quality/depth ratio

```{bash}

# site depth
cut -f8 CLE.F4.vcf | grep -oe "DP=[0-9]*" | sed -s 's/DP=//g' > CLE.4.DEPTH

# quality score
mawk '!/#/'  CLE.F4.vcf | cut -f1,2,6 > CLE.F4.loci.qual

```

Compare quality/depth ratio.

```{r fig.height=3, fig.width=4, message=FALSE, warning=FALSE}

# depth
depth <- read.table("data/VCF/temp/CLE.4.DEPTH",
                    col.names = "depth")

# quality score
qual <- read.table("data/VCF/temp/CLE.F4.loci.qual",
                   col.names = c("locus", "pos", "qual"))

temp <- bind_cols(qual, depth) %>%
  mutate(ratio = qual/depth)

ggplot(temp, aes(x = ratio)) +
  geom_histogram(binwidth = 0.25, color = "black", fill = "grey85") +
  geom_vline(xintercept = 0.2, color = "darkred", linetype = "dashed", size = 1) +
  theme_standard

```

Remove loci with quality/depth ratio < 0.2

```{bash}

vcffilter -s -f "QUAL / DP > 0.2" CLE.F4.vcf > CLE.F5.vcf 

mawk '!/#/' CLE.F5.vcf | wc -l

```

Remaining SNPs: 38,334

### Filter 6: mapping quality

```{bash query mapping quality, eval=FALSE, include=FALSE}

cut -f8 CLE.F5.vcf | grep -oe "MQM=[0-9]*" | sed -s 's/MQM=//g' > CLE.F5.MQM

cut -f8 CLE.F5.vcf | grep -oe "MQMR=[0-9]*" | sed -s 's/MQMR=//g' > CLE.F5.MQMR

```

Remove loci based on ratio of mapping quality for reference and alternate allele, i.e. sites that have a high discrepancy between the mapping qualities of two alleles.

```{r plot map qual ratios, fig.height=4, fig.width=5}

temp <- read.table("data/VCF/temp/CLE.F5.MQM", col.names = "MQM")

mapqual <- read.table("data/VCF/temp/CLE.F5.MQMR", col.names = "MQMR")

mapqual <- bind_cols(mapqual, temp) %>%
  mutate(ratio = MQM/MQMR)

filter <- mapqual %>%
  filter(ratio < 0.25 | ratio > 1.75)

ggplot(mapqual, aes(x = MQM, y = MQMR)) +
  geom_point(shape = 1, size = 1) + 
  geom_abline(intercept = 0, slope = 1, size = 1, color = "red", linetype = "dashed") +
  geom_abline(intercept = 0, slope = 4, size = 1, color = "darkblue", linetype = "dashed") +
  geom_abline(intercept = 0, slope = 0.571, size = 1, color = "darkblue", linetype = "dashed") +
  geom_point(data = filter, aes(x = MQM, y = MQMR), shape = 21, color = "black", fill = "red") +
  scale_x_continuous(limits = c(0, 65)) +
  scale_y_continuous(limits = c(0, 65)) +
  theme_standard

```

Filter loci with mapping quality ratio < 0.25 and > 1.75.

```{bash filter map qual ratio}

vcffilter -s -f "MQM / MQMR > 0.25 & MQM / MQMR < 1.75" CLE.F5.vcf > CLE.F6.vcf

mawk '!/#/' CLE.F6.vcf | wc -l

```

Remaining SNPs: 36,633.

### Filter 7: Strand balance

SRF: Number of reference observations on the forward strand
SAF: Number of alternate observations on the forward strand
SRR: Number of reference observations on the reverse strand
SAR: Number of alternate observations on the reverse strand

```{bash query strandedness, eval=FALSE, include=FALSE}

cut -f8 CLE.F6.vcf | grep -oe "SAF=[0-9]*" | sed -s 's/SAF=//g' > CLE.F6.SAF

cut -f8 CLE.F6.vcf | grep -oe "SAR=[0-9]*" | sed -s 's/SAR=//g' > CLE.F6.SAR

cut -f8 CLE.F6.vcf | grep -oe "SRF=[0-9]*" | sed -s 's/SRF=//g' > CLE.F6.SRF

cut -f8 CLE.F6.vcf | grep -oe "SRR=[0-9]*" | sed -s 's/SRR=//g' > CLE.F6.SRR

```

Paired end reads should not overlap, and a SNP site should only be covered by either the forward or reverse read (strand). 

```{r plot strandedness, fig.height=4, fig.width=5}

SAF <- read.table("data/VCF/temp/CLE.F6.SAF",
                  col.names = "SAF")

SAR <- read.table("data/VCF/temp/CLE.F6.SAR",
                  col.names = "SAR")

strands <- bind_cols(SAF, SAR)

SRF <- read.table("data/VCF/temp/CLE.F6.SRF",
                  col.names = "SRF")

strands <- bind_cols(strands, SRF)

SRR <- read.table("data/VCF/temp/CLE.F6.SRR",
                  col.names = "SRR")

strands <- bind_cols(strands, SRR) %>%
  mutate(ratioA = SAF/SAR, ratioR = SRF/SRR)

ggplot(strands, aes(x = SAF, y = SAR)) +
  geom_point(shape = 1) +
  geom_abline(intercept = 0, slope = 0.1, color = "darkblue", linetype = "dashed", size = 1) +
  geom_abline(intercept = 0, slope = 100, color = "darkblue", linetype = "dashed", size = 1) +
  theme_standard

```

Remove SNP sites that have > 100x more forward alternate reads than reverse alternate reads and > 100x more forward reverse reads than reverse alternate reads.

```{bash filter strandedness}

vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 100" -s CLE.F6.vcf > CLE.F7.vcf

mawk '!/#/' CLE.F7.vcf | wc -l

```

Number of SNPs remaining: 34,587.

### Filter 8: Properly paired status

PAIRED: Proportion of observed alternate alleles which are supported by properly paired read fragments
PAIREDR: Proportion of observed reference alleles which are supported by properly paired read fragments

Identify loci with only unpaired reads mapping to them - an artifact introduced due de novo reference assembly. Compare number of paired reads mapping the reference and the alternate alleles to identify discrepancy in the paired status for reads supporting reference and alternate alleles.

```{bash filter paired status, eval=FALSE, include=FALSE}

vcffilter -f "PAIRED > 0.05 & PAIREDR > 0.05 & PAIREDR / PAIRED < 1.75 & PAIREDR / PAIRED > 0.25 | PAIRED < 0.05 & PAIREDR < 0.05" -s CLE.F7.vcf > CLE.F8.vcf

mawk '!/#/' CLE.F8.vcf | wc -l

```

Number of SNPs in data set: 32,445.

### Filter 9: Maximum depth & Quality

Identify distribution of depth (based on original data set) to identify loci with excess coverage.

Original number of individuals in data set is `r nrow(ind_stats_raw)` (INFO flags in filtered data set are are based on original number of individuals in data set).

Create file with the original site depth and quality score for each site:

```{bash, eval=FALSE, include=FALSE}

# site depth
cut -f8 CLE.F8.vcf | grep -oe "DP=[0-9]*" | sed -s 's/DP=//g' > CLE.F8.DEPTH

# quality score
mawk '!/#/'  CLE.F8.vcf | cut -f1,2,6 > CLE.F8.loci.qual

```

Calculate average depth and standard deviation:

```{r plot depth vs qual, fig.height=3, fig.width=5}

# depth
depth <- read.table("data/VCF/temp/CLE.F8.DEPTH",
                    col.names = "depth")

# quality score
qual <- read.table("data/VCF/temp/CLE.F8.loci.qual",
                   col.names = c("locus", "pos", "qual"))

# mean depth
mean_depth <- mean(depth$depth)

# standard deviation
std <- sd(depth$depth)

# calculate cutoff
cutoff <- sum(mean_depth + (2*std))

# identify SNPs with excess (i.e. depth > mean depth + 1 standard deviation
# and quality score < 2x the depth at that site
df <- bind_cols(qual, depth) %>%
  mutate(qualcutoff = 2*depth)

removeloc <- df %>%
  filter(depth > cutoff) %>%
  filter(qual < 2*depth)

# plot
ggplot(df, aes(x = depth, y = qual)) +
  geom_point(shape = 1) +
  geom_point(data = removeloc, aes(x = depth, y = qual), shape = 21, color = "black", fill = "red") +
  geom_line(data = df, aes(x = depth, y = qualcutoff), color = "blue",  linetype = "dashed", size = 1) +
  geom_vline(xintercept = cutoff, color = "blue", linetype = "dashed", size = 1) +
  theme_standard

LQ <- removeloc %>%
  select(locus, pos)

write.table(LQ, "data/VCF/temp/LQ_F9.loci", 
            col.names = FALSE, row.names = FALSE, quote = FALSE)

```

Mean depth per locus (across all indivuals) is `r mean_depth` and the standard deviation is `r std`. 

Filter SNP site with depth > mean depth + 1 standard deviation = `r sum(mean_depth + 2*std)` and that have quality scores < 2x the depth at that site and output depth per site.

```{bash filter depth qual, eval=FALSE, include=FALSE}

vcftools --vcf  data/VCF/temp/CLE.F8.vcf --exclude-positions data/VCF/temp/LQ_F9.loci --recode --recode-INFO-all --out data/VCF/temp/CLE.F9a

vcftools --vcf data/VCF/temp/CLE.F4.vcf --out data/VCF/F4 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F5.vcf --out data/VCF/F5 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F6.vcf --out data/VCF/F6 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F7.vcf --out data/VCF/F7 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F8.vcf --out data/VCF/F8 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F9a.recode.vcf --out data/VCF/F9a --keep data/VCF/duplicate.ind --012

vcftools --vcf data/VCF/temp/CLE.F4.vcf --out data/VCF/F4 --keep data/VCF/duplicate.ind --geno-depth
vcftools --vcf data/VCF/temp/CLE.F5.vcf --out data/VCF/F5 --keep data/VCF/duplicate.ind --geno-depth
vcftools --vcf data/VCF/temp/CLE.F6.vcf --out data/VCF/F6 --keep data/VCF/duplicate.ind --geno-depth
vcftools --vcf data/VCF/temp/CLE.F7.vcf --out data/VCF/F7 --keep data/VCF/duplicate.ind --geno-depth
vcftools --vcf data/VCF/temp/CLE.F8.vcf --out data/VCF/F8 --keep data/VCF/duplicate.ind --geno-depth
vcftools --vcf data/VCF/temp/CLE.F9a.recode.vcf --out data/VCF/F9a --keep data/VCF/duplicate.ind --geno-depth

vcftools --vcf data/VCF/temp/CLE.F9a.recode.vcf --out data/VCF/CLE.F9a --site-mean-depth

```

Compare the distribution of mean depth per site averaged across individuals to determine cut-off value of sites with excessively high depth indicative of paralogs/multicopy loci.

```{r plot depth dist, fig.height=3, fig.width=5, message=FALSE, warning=FALSE}

# calculate mean depth per site (177 individuals)
read.table("data/VCF/CLE.F9a.ldepth.mean",
           header = TRUE, stringsAsFactors = FALSE) %>% 
  ggplot(aes(x = MEAN_DEPTH)) +
  geom_histogram(binwidth = 20, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = quantile(MEAN_DEPTH, .95)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = quantile(MEAN_DEPTH, .99)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  scale_y_sqrt() +
  labs(x = "mean depth per site") +
  theme_standard

```

Choose cut-off for maximum mean read depth = 250.

```{bash filter max depth}

vcftools --vcf  data/VCF/temp/CLE.F8.vcf --out data/VCF/temp/CLE.F9 --max-meanDP 180 --exclude-positions data/VCF/temp/LQ_F9.loci --recode --recode-INFO-all 

vcftools --vcf data/VCF/temp/CLE.F9.recode.vcf --out data/VCF/F9 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F9.recode.vcf --out data/VCF/F9 --keep data/VCF/duplicate.ind --geno-depth

vcftools --vcf data/VCF/temp/CLE.F9.recode.vcf --out data/VCF/CLE.F9 --depth
vcftools --vcf data/VCF/temp/CLE.F9.recode.vcf --out data/VCF/CLE.F9 --site-mean-depth
vcftools --vcf data/VCF/temp/CLE.F9.recode.vcf --out data/VCF/CLE.F9 --missing-indv
vcftools --vcf data/VCF/temp/CLE.F9.recode.vcf --out data/VCF/CLE.F9 --missing-site
vcftools --vcf data/VCF/temp/CLE.F9.recode.vcf --out data/VCF/CLE.F9 --het

```

Depth distribution per locus after filtering:

```{r fig.height=3, fig.width=4, message=FALSE, warning=FALSE}

read_table2("data/VCF/CLE.F9.ldepth.mean") %>%
  ggplot(aes(x = MEAN_DEPTH)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  theme_standard

```

Analyze stats post-filtering:

```{r stats F9, fig.height=20, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_F9 <- read.ind.stats(dir = "data/VCF", vcf = "CLE.F9") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_F9 <- read.loc.stats(dir = "data/VCF/", vcf = "CLE.F9")

```

Data set contains `r nrow(loc_stats_F9)` SNP sites and `r nrow(ind_stats_F9)` individuals.

### Filter 10: Remove Indels

Remove Indels from data set.

```{bash remove indels}

vcfallelicprimitives data/VCF/temp/CLE.F9.recode.vcf --keep-info --keep-geno > data/VCF/temp/CLE.prim.vcf

vcftools --vcf data/VCF/temp/CLE.prim.vcf --out data/VCF/temp/CLE.F10 --remove-indels --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE.F10.recode.vcf --out data/VCF/F10 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F10.recode.vcf --out data/VCF/F10 --keep data/VCF/duplicate.ind --geno-depth

vcftools --vcf data/VCF/temp/CLE.F10.recode.vcf --out data/VCF/CLE.F10 --depth
vcftools --vcf data/VCF/temp/CLE.F10.recode.vcf --out data/VCF/CLE.F10 --site-mean-depth
vcftools --vcf data/VCF/temp/CLE.F10.recode.vcf --out data/VCF/CLE.F10 --missing-indv
vcftools --vcf data/VCF/temp/CLE.F10.recode.vcf --out data/VCF/CLE.F10 --missing-site
vcftools --vcf data/VCF/temp/CLE.F10.recode.vcf --out data/VCF/CLE.F10 --het

```

Analyze stats post-filtering:

```{r stats F10, fig.height=12, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_F10 <- read.ind.stats(dir = "data/VCF", vcf = "CLE.F10") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_F10 <- read.loc.stats(dir = "data/VCF/", vcf = "CLE.F10")

# plot missing data per indv ----
p1 <- ggplot(ind_stats_F10, aes(x = MISS_CLE.F10)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_CLE.F10, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.25),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot read depth per indv ----
p2 <- ggplot(ind_stats_F10, aes(x = MEAN_DEPTH_CLE.F10)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.F10, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p3 <- ggplot(ind_stats_F10, aes(x = MEAN_DEPTH_CLE.F10, y = MISS_CLE.F10)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.F10, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_CLE.F10, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot distribution missing data per locus ----
p4 <- ggplot(loc_stats_F10, aes(x = MISS_CLE.F10)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_CLE.F10, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p5 <- ggplot(loc_stats_F10, aes(x = MEAN_DEPTH_CLE.F10)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.F10, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p6 <- ggplot(loc_stats_F10, aes(x = MEAN_DEPTH_CLE.F10, y = MISS_CLE.F10)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.F10, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_CLE.F10, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

m3 <- multiplot(p1, p2, p3, p4, p5, p6, cols=2)

```

Data set contains `r nrow(loc_stats_F10)` SNP sites and `r nrow(ind_stats_F10)` individuals.

### Filter 11: Missing data and minimum depth per locus

Assess individuals in terms of low coverage, high missing data, and frequency burden (i.e. distribution of the number of variants within each individual of a specific frequency).

```{bash geno 90 meanDP 15}

vcftools --vcf data/VCF/temp/CLE.F10.recode.vcf --out data/VCF/temp/CLE.F11 --min-meanDP 15 --max-missing 0.9 --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE.F11.recode.vcf --out data/VCF/F11 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F11.recode.vcf --out data/VCF/F11 --keep data/VCF/duplicate.ind --geno-depth

vcftools --vcf data/VCF/temp/CLE.F11.recode.vcf --out data/VCF/CLE.F11 --depth
vcftools --vcf data/VCF/temp/CLE.F11.recode.vcf --out data/VCF/CLE.F11 --site-mean-depth
vcftools --vcf data/VCF/temp/CLE.F11.recode.vcf --out data/VCF/CLE.F11 --missing-indv
vcftools --vcf data/VCF/temp/CLE.F11.recode.vcf --out data/VCF/CLE.F11 --missing-site
vcftools --vcf data/VCF/temp/CLE.F11.recode.vcf --out data/VCF/CLE.F11 --het
vcftools --vcf data/VCF/temp/CLE.F11.recode.vcf --out data/VCF/CLE.F11 --hardy

```

### Filter 12: Excess heterozygosity

Identify SNPs with more than 0.5 heterozygosity and significant.

```{r}

# calculate observed heterozygosity
hwe <- read.table("data/VCF/CLE.F11.hwe", 
                  stringsAsFactors = FALSE, header = TRUE) %>%
  select(-ChiSq_HWE) %>%
  separate(`OBS.HOM1.HET.HOM2.`, 
           into = c("obs_hom1", "obs_het", "obs_hom2"), 
           sep = "/", convert = TRUE) %>%
  separate(`E.HOM1.HET.HOM2.`, 
           into = c("exp_hom1", "exp_het", "exp_hom2"), 
           sep = "/", convert = TRUE) %>%   
  mutate(Ho = obs_het/(obs_hom1 + obs_hom2 + obs_het),
         He = exp_het/(exp_hom1 + exp_hom2 + exp_het)) %>%
  select(CHR, POS, obs_hom1, exp_hom1, obs_hom2, exp_hom2, P_HET_DEFICIT, obs_het, exp_het, P_HET_EXCESS, Ho, He, P_HWE)

```

Distribution of observed heterozygosity.

```{r fig.height=3, fig.width=4}

ggplot(hwe, aes(x = Ho)) + 
  geom_histogram(binwidth = 0.05, color = "black", fill = "darkorange") +
  geom_vline(xintercept = 0.5, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "observed heterozygosity") +
  theme_standard

```

Identify loci with heterozygosity > 0.5, then correct p-values for multiple comparisons using Benjamini-Hochberg method.

```{r}

# identify contigs with SNPs with Ho > 0.5 & significant
excess_hwe <- hwe %>%
  filter(Ho > 0.5) %>%
  mutate(p_adj = p.adjust(P_HET_EXCESS), method = "fdr") %>%
  filter(p_adj < 0.01)

contigs <- hwe %>%
  filter(CHR %in% excess_hwe$CHR) %>%
  select(CHR) %>%
  unique()

# identify all SNPs on contigs
SNPs <- hwe %>%
  filter(CHR %in% contigs$CHR) %>%
  select(CHR, POS)

# View(contigs)

# Write contig/position to text file, use file with vcftools to remove positions from dataset 
write.table(SNPs, file = "data/VCF/hetexcess.loci", 
            col.names= FALSE, row.names = FALSE, quote = FALSE)

```

Remove SNPs with excess heterozygosity and contigs with more than one SNP w/excess heterozygosity.

```{bash}

vcftools --vcf data/VCF/temp/CLE.F11.recode.vcf --out data/VCF/temp/CLE.F12 --exclude-positions data/VCF/hetexcess.loci --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE.F12.recode.vcf --out data/VCF/F12 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F12.recode.vcf --out data/VCF/F12 --keep data/VCF/duplicate.ind --geno-depth

vcftools --vcf data/VCF/temp/CLE.F12.recode.vcf --out data/VCF/CLE.F12 --depth
vcftools --vcf data/VCF/temp/CLE.F12.recode.vcf --out data/VCF/CLE.F12 --site-mean-depth
vcftools --vcf data/VCF/temp/CLE.F12.recode.vcf --out data/VCF/CLE.F12 --missing-indv
vcftools --vcf data/VCF/temp/CLE.F12.recode.vcf --out data/VCF/CLE.F12 --missing-site
vcftools --vcf data/VCF/temp/CLE.F12.recode.vcf --out data/VCF/CLE.F12 --het
vcftools --vcf data/VCF/temp/CLE.F12.recode.vcf --out data/VCF/CLE.F12 --geno-depth

```

Visualize stats:

```{r stats F12, fig.height=20, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_F12 <- read.ind.stats(dir = "data/VCF", vcf = "CLE.F12") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_F12 <- read.loc.stats(dir = "data/VCF/", vcf = "CLE.F12")

```

Data set contains `r nrow(loc_stats_F12)` SNP sites and `r nrow(ind_stats_F12)` individuals.

### Filter 13: Filter LQ individuals

Compare mean depth and number of sites called per individual.

```{r idepth sites vs depth, fig.height=3, fig.width=5, message=FALSE, warning=FALSE}

read_table2("data/VCF/CLE.F12.idepth") %>%
  ggplot(aes(x = N_SITES, y = MEAN_DEPTH)) +
  geom_point(shape = 1, size = 1) +
  labs(x = "number of sites", y = "mean read depth") +
  theme_standard

```

Use genotype depth file to identify individuals with high variance in read depth across loci.

```{r variance depth individual}

# number of individuals
n <- nrow(ind_stats_F12)+1

# read genotype depth & code values < 5 as missing, retain only one locus per contig
gdepth <- read_table2("data/VCF/CLE.F12.gdepth") %>%
  select(-POS) %>%
  distinct(CHROM, .keep_all = TRUE) %>%
  gather(key = INDV, value = GDEPTH, 2:n) %>%
  mutate(GDEPTH = as.numeric(gsub(-1, 0, GDEPTH))) %>%
  mutate(DEPTH = GDEPTH) %>%
  mutate(DEPTH = as.numeric(gsub("\\<1\\>", 0, DEPTH))) %>%
  mutate(DEPTH = as.numeric(gsub("\\<2\\>", 0, DEPTH))) %>%
  mutate(DEPTH = as.numeric(gsub("\\<3\\>", 0, DEPTH))) %>%
  mutate(DEPTH = as.numeric(gsub("\\<4\\>", 0, DEPTH))) %>%
  mutate(MISSING = ifelse(DEPTH == 0, "missing", "genotyped")) %>%
  mutate(GDEPTHBIN = cut(GDEPTH, 
                         breaks = c(0, 5, 10, 25, 50, 100, 500, max(GDEPTH)),
                         labels = c("<5", "3-10", "10-25", "25-50", "50-100", "100-500", ">500"))) %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE, extra = "merge") %>%
  mutate(LIB = str_replace_all(LIB, "2A.*", "CLE2"),
         LIB = str_replace_all(LIB, "2B.*", "CLE2"),
         LIB = str_replace_all(LIB, "3A.*", "CLE3"),
         LIB = str_replace_all(LIB, "3B.*", "CLE3"))

gdepth <- left_join(gdepth, SampleInfo, by = "SAMPLE_ID")

```

Compare distribution of depth.

```{r genotype depth heatmap pop, fig.height=20, fig.width=20}

# would need to join with Sample Info to get pop specifications
ggplot(gdepth, aes(x = INDV, y = CHROM)) +
  geom_tile(aes(fill = GDEPTHBIN)) +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = 'genotype depth',
                     discrete = TRUE) +
  facet_grid(. ~ POP, space = "free", scales = "free", drop = TRUE) +
  labs(x = "individual", y = "locus") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.position = "bottom")

```

Compare distribution of depth of individuals grouped by library.

```{r genotype depth heatmap lib, fig.height=20, fig.width=20}

ggplot(gdepth, aes(x = INDV, y = CHROM)) + 
  geom_tile(aes(fill = GDEPTHBIN)) + 
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = 'genotype depth',
                     discrete = TRUE) +
  facet_grid(. ~ LIB, space = "free", scales = "free", drop = TRUE) +
  labs(x = "individual", y = "locus") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.position = "bottom")

```

Compare distribution of genotype depths (across all individuals).

```{r distribution genotype depths, fig.height=3.5, fig.width=5.5}

temp <- count(gdepth, GDEPTHBIN)

ggplot(temp, aes(x = GDEPTHBIN, y = n)) +
  geom_bar(stat= "identity", color = "black", fill = "darkorange") +
  theme_standard

```

Identify variance in depth w/in individuals.

```{r, fig.height=10, fig.width=9}

# calculate summary statistics
idepth <- gdepth %>%
  group_by(INDV) %>%
  summarize(TOTAL = sum(DEPTH),
            MAX = max(DEPTH),
            MEAN = mean(DEPTH),
            MEDIAN = median(DEPTH),
            VAR = var(DEPTH),
            STD = sd(DEPTH)) %>%
  mutate(COEFF_VAR = STD/MEAN,
         RATIO_MEAN_MEDIAN = MEAN/MEDIAN)

p1 <- ggplot(idepth, aes(x = MEAN, y = MEDIAN)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p2 <- ggplot(idepth, aes(x = MAX, y = TOTAL)) +
  geom_smooth(method = "lm",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p3 <- ggplot(idepth, aes(x = TOTAL, y = VAR)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p4 <- ggplot(idepth, aes(MEDIAN)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  theme_standard

p5 <- ggplot(idepth, aes(RATIO_MEAN_MEDIAN)) +
  geom_histogram(binwidth = 0.1, color = "black", fill = "darkorange") +
  theme_standard

p6 <- ggplot(idepth, aes(COEFF_VAR)) +
  geom_histogram(binwidth = 0.05, color = "black", fill = "darkorange") +
  theme_standard

m14b <- multiplot(p1, p2, p3, p4, p5, p6, cols=2)

```

Compare depth, missing data and individual heterozygosity levels.

```{r istats, fig.height=18, fig.width=10, message=FALSE, warning=FALSE}

imiss <- read_table2("data/VCF/CLE.F12.imiss") %>%
  select(INDV, N_DATA, F_MISS)

istats <- left_join(idepth, imiss)

Fis <- read_table2("data/VCF/CLE.F12.het") %>%
  select(-N_SITES)

istats <- left_join(istats, Fis)

p1 <- ggplot(istats, aes(x = MEAN, y = F_MISS, fill = `F`)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = 1, option = "viridis",
                     name = 'Fis', discrete = FALSE) +
  theme_standard

p2 <- ggplot(istats, aes(x = COEFF_VAR, y = MEAN, fill = F_MISS)) +
 geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = '% missing', discrete = FALSE) +
  theme_standard

p3 <- ggplot(istats, aes(x = `O(HOM)`, y = MEAN, fill = F_MISS)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = 'mean read depth', discrete = FALSE) +
  theme_standard

p4 <- ggplot(istats, aes(x = MEAN)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  theme_standard

p5 <- ggplot(istats, aes(x = `F`, y = MEAN, fill = F_MISS)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = '% missing', discrete = FALSE) +
  theme_standard

p6 <- ggplot(istats, aes(x = `F`, y = COEFF_VAR, fill = F_MISS)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = '% missing', discrete = FALSE) +
  theme_standard

p7 <- ggplot(istats, aes(x = `F`, y = RATIO_MEAN_MEDIAN, fill = F_MISS)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = '% missing', discrete = FALSE) +
  scale_y_continuous(limits = c(0.8, 4)) +
  theme_standard

p8 <- ggplot(istats, aes(x = F_MISS)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  theme_standard


m14b <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, cols=2)

```

Flag low quality individuals

```{r}

# View(istats)

LQ_ind <- istats %>%
   filter(`F` > .9 | `F` < -.3) %>%
   select(INDV)

write.table(LQ_ind, "data/VCF/F13_LQ.indv",
            col.names = TRUE, row.names = FALSE, quote = FALSE)

```

### Filter 14: Missing data by sample location

```{bash missing by sample locat}

vcftools --vcf data/VCF/temp/CLE.F12.recode.vcf --out data/VCF/temp/CLE.F13 --remove data/VCF/F13_LQ.indv --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE.F13.recode.vcf --out data/VCF/F13 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F13.recode.vcf --out data/VCF/F13 --keep data/VCF/duplicate.ind --geno-depth

# Aransas Bay
vcftools --vcf data/VCF/temp/CLE.F13.recode.vcf --out data/VCF/temp/ARA --keep data/VCF/ARA.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/ARA.recode.vcf --out data/VCF/ARA --missing-site

# Corpus Christi Bay
vcftools --vcf data/VCF/temp/CLE.F13.recode.vcf --out data/VCF/temp/CC --keep data/VCF/CC.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/CC.recode.vcf --out data/VCF/CC --missing-site

# Galveston Bay
vcftools --vcf data/VCF/temp/CLE.F13.recode.vcf --out data/VCF/temp/GAL --keep data/VCF/GAL.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/GAL.recode.vcf --out data/VCF/GAL --missing-site

# Matagorda Bay
vcftools --vcf data/VCF/temp/CLE.F13.recode.vcf --out data/VCF/temp/MAT --keep data/VCF/MAT.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/MAT.recode.vcf --out data/VCF/MAT --missing-site

# San Antonio Bay
vcftools --vcf data/VCF/temp/CLE.F13.recode.vcf --out data/VCF/temp/SA --keep data/VCF/SA.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SA.recode.vcf --out data/VCF/SA --missing-site

# Sabine Lake 
vcftools --vcf data/VCF/temp/CLE.F13.recode.vcf --out data/VCF/temp/SL --keep data/VCF/SL.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SL.recode.vcf --out data/VCF/SL --missing-site

```

Compare missing data per sample region:

```{r compare missing by locat, fig.height=8, fig.width=3, message=FALSE, warning=FALSE}

# create empty list
loc_missing <- list()

# import missing data per locus
loc_missing[[1]] <- read.table("data/VCF/ARA.lmiss", 
                         header = TRUE, stringsAsFactors = FALSE) %>%
   select(CHR, POS, F_MISS) %>%
   mutate(POP = "ARA")
 
 loc_missing[[2]] <- read.table("data/VCF/CC.lmiss", 
                         header = TRUE, stringsAsFactors = FALSE) %>%
   select(CHR, POS, F_MISS) %>%
   mutate(POP = "CC")

loc_missing[[3]] <- read.table("data/VCF/GAL.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(POP = "GAL")

loc_missing[[4]] <- read.table("data/VCF/MAT.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(POP = "MAT")

loc_missing[[5]] <- read.table("data/VCF/SA.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(POP = "SA")

loc_missing[[6]] <- read.table("data/VCF/SL.lmiss", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  mutate(POP = "SL")

# create data frame with all information
loc_missing <- ldply(loc_missing, data.frame)

ggplot(loc_missing, aes(x = F_MISS)) +
  geom_histogram(binwidth = .025, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = 0.1),
             color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "missing data per locus") +
  scale_y_sqrt() +
  facet_grid(POP ~ .) +
  theme_standard

```

identify loci with high missing data in each sample location

```{r}
# identify loci with high missing data in each region
SNPs <- filter(loc_missing, F_MISS > 0.1) 

count(SNPs, POP)

contigs <- SNPs %>%
  distinct(CHR)

SNPs <- SNPs %>%
  select(CHR, POS)

# Write contig/position to text file, use file with vcftools to remove positions from dataset 
write.table(SNPs, file = "data/VCF/LQ_F13.loci", 
            col.names= FALSE, row.names = FALSE, quote = FALSE)
```

Remove loci from data set:

```{bash}

vcftools --vcf data/VCF/temp/CLE.F13.recode.vcf --out data/VCF/temp/CLE.F14 --exclude-positions data/VCF/LQ_F13.loci --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE.F14.recode.vcf --out data/VCF/F14 --keep data/VCF/duplicate.ind --012
vcftools --vcf data/VCF/temp/CLE.F14.recode.vcf --out data/VCF/F14 --keep data/VCF/duplicate.ind --geno-depth

vcftools --vcf data/VCF/temp/CLE.F14.recode.vcf --out data/VCF/CLE.F14 --site-quality
vcftools --vcf data/VCF/temp/CLE.F14.recode.vcf --out data/VCF/CLE.F14 --depth
vcftools --vcf data/VCF/temp/CLE.F14.recode.vcf --out data/VCF/CLE.F14 --site-mean-depth
vcftools --vcf data/VCF/temp/CLE.F14.recode.vcf --out data/VCF/CLE.F14 --missing-indv
vcftools --vcf data/VCF/temp/CLE.F14.recode.vcf --out data/VCF/CLE.F14 --missing-site
vcftools --vcf data/VCF/temp/CLE.F14.recode.vcf --out data/VCF/CLE.F14 --het
vcftools --vcf data/VCF/temp/CLE.F14.recode.vcf --out data/VCF/CLE.F14 --geno-depth
vcftools --vcf data/VCF/temp/CLE.F14.recode.vcf --out data/VCF/CLE.F14 --singletons
vcftools --vcf data/VCF/temp/CLE.F14.recode.vcf --out data/VCF/CLE.F14 --indv-freq-burden
vcftools --vcf data/VCF/temp/CLE.F14.recode.vcf --out data/VCF/CLE.F14 --freq

```

Compare stats:

```{r stats F14, fig.height=20, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_F14 <- read.ind.stats(dir = "data/VCF", vcf = "CLE.F14") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE, extra = "merge")

loc_stats_F14 <- read.loc.stats(dir = "data/VCF", vcf = "CLE.F14")

# plot missing data per indv ----
p1 <- ggplot(ind_stats_F14, aes(x = MISS_CLE.F14)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_CLE.F14, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot read depth per indv ----
p2 <- ggplot(ind_stats_F14, aes(x = MEAN_DEPTH_CLE.F14)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.F14, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p3 <- ggplot(ind_stats_F14, aes(x = MEAN_DEPTH_CLE.F14, y = MISS_CLE.F14)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.F14, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_CLE.F14, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot Fis per indv ----
p4 <- ggplot(ind_stats_F14, aes(x = Fis_CLE.F14)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Fis_CLE.F14, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv") +
  theme_standard

# plot Fis vs missing data per indv ----
p5 <- ggplot(ind_stats_F14, aes(x = Fis_CLE.F14, y = MISS_CLE.F14)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_CLE.F14, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_CLE.F14, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "% missing data") +
  theme_standard

# plot Fis vs mean depth per indv ----
p6 <- ggplot(ind_stats_F14, aes(x = Fis_CLE.F14, y = MEAN_DEPTH_CLE.F14)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_CLE.F14, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MEAN_DEPTH_CLE.F14, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "mean depth per indv") +
  theme_standard


# plot distribution missing data per locus ----
p7 <- ggplot(loc_stats_F14, aes(x = MISS_CLE.F14)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_CLE.F14, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p8 <- ggplot(loc_stats_F14, aes(x = MEAN_DEPTH_CLE.F14)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.F14, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p9 <- ggplot(loc_stats_F14, aes(x = MEAN_DEPTH_CLE.F14, y = MISS_CLE.F14)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.F14, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_CLE.F14, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

# plot depth vs SNP quality ----
site_qual <- read.table("data/VCF/CLE.F14.lqual", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(PROB = 10^(-QUAL/10))

temp <- data.frame(loc_stats_F14$MEAN_DEPTH_CLE.F14, site_qual$QUAL) %>%
  rename(depth = loc_stats_F14.MEAN_DEPTH_CLE.F14, qual = site_qual.QUAL)

p10 <- ggplot(temp, aes(x = depth, y = qual)) +
  geom_point(size = 1) +
  geom_vline(aes(xintercept = mean(depth, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(qual, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "SNP quality") +
  theme_standard

# plot number of SNPs per contig vs. mean depth ----
temp <- loc_stats_F14 %>%
  count(CHR)

p11 <- left_join(temp, loc_stats_F14) %>%
  ggplot() +
  geom_point(aes(x = n, y = MEAN_DEPTH_CLE.F14)) +
  labs(x = "number of SNPs per contig", y = "mean depth") +
  theme_standard

# plot no of SNPs per locus ----
p12 <- loc_stats_F14 %>%
  count(CHR) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  labs(x = "number of SNPs per locus") +
  theme_standard

m15 <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, cols=2)

```

Compare number of contigs vs number of SNPs.

```{r}

# number of SNPs
nrow(loc_stats_F14)

# number of contigs in data set
contigs <- loc_stats_F14 %>%
  distinct(CHR)

nrow(contigs)

```

Mean number of SNPs per contig is `r nrow(loc_stats_F14)/nrow(contigs)`.

```{r}

istats <- left_join(ind_stats_raw, ind_stats_F14)

```


## Final thresholds values for filtered data set CLE:

* minimum sequence quality (minQ): 20
* minimum genotype quality (minGQ): 20
* minimum genotype call rate per locus (geno): 90%
* minimum genotype depth (minD): 5
* maximum missing data per individual (missInd): 25%
* mean minimum depth per locus (m-minD): 15
* mean maximum depth (m-minD): 180
* dDocent filters run for allelic balance, mapping quality, strandedness and paired status, depth/quality ratio
* Indels removed from data set
* minimum genotype call rate by population: 90%

Data set contains `r nrow(loc_stats_F14)` SNP sites on `r nrow(contigs)` and `r nrow(ind_stats_F14)` individuals.

# Haplotyping

## Prep for haplotyping

Create list of individuals retained in final vcf file:

Change so that new file is printed into Haplotyping/temp folder

```{bash}

vcftools --vcf data/VCF/temp/CLE.F14.recode.vcf --out data/Haplotyping/temp/CLE --recode --recode-INFO-all

vcfsamplenames data/Haplotyping/temp/CLE.recode.vcf > data/Haplotyping/temp/CLE.individuals

```

Use `CLE-CLE.individuals` to create `popmap` as a tab-separated file of individuals and their population designation, with one individual per line (make sure UNIX format). This file is needed to write the genepop file, if not provided the script will run through the process but not write a genepop file and place into same folder `rad_haplotyper.pl` will be run from.

```{r}

popmap <- read.table("data/Haplotyping/temp/CLE.individuals", 
                     col.names = "INDV", stringsAsFactors = FALSE) %>%
  separate(INDV, into = c("POP", "LIB", "ID"), sep = "_", remove = FALSE, extra = "merge") %>%
  select(INDV, POP)

write.table(popmap, "data/Haplotyping/temp/popmap", 
            col.names = FALSE, row.names = FALSE, quote = FALSE)

```

Place all necessary files in `data/Haplotyping/temp` directory (bam, fastq, reference.fasta, vcf-file)

```{bash, eval=FALSE, include=FALSE}

ln -s /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-2/*.fq.gz /home/soleary/BULLSHARKS/CLE_POPGEN/data/Haplotyping/temp
ln -s /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-2/*.bam* /home/soleary/BULLSHARKS/CLE_POPGEN/data/Haplotyping/temp
ln -s /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-3/*.fq.gz /home/soleary/BULLSHARKS/CLE_POPGEN/data/Haplotyping/temp
ln -s /home/soleary/BULLSHARKS/CLE_POPGEN/data/SEQ/CLE-3/*.bam* /home/soleary/BULLSHARKS/CLE_POPGEN/data/Haplotyping/temp

```

## Run Haplotyper

```{bash, eval=FALSE, include=FALSE}

# difficulty running from R
nohup rad_haplotyper.pl -v CLE.recode.vcf -r reference.fasta -x 35 -d 10 -z 3 -m .25 -g CLE.gen -p popmap

```

Copy files for analysis into `data/Haplotyping/`-folder

```{bash}

cd /home/soleary/BULLSHARKS/CLE_POPGEN/data/Haplotyping/temp
rm *.fq.gz
rm *.bam*

cp /home/soleary/BULLSHARKS/CLE_POPGEN/data/Haplotyping/temp/ind_stats.out /home/soleary/BULLSHARKS/CLE_POPGEN/data/Haplotyping/

cp /home/soleary/BULLSHARKS/CLE_POPGEN/data/Haplotyping/temp/stats.out /home/soleary/BULLSHARKS/CLE_POPGEN/data/Haplotyping/

cp /home/soleary/BULLSHARKS/CLE_POPGEN/data/Haplotyping/temp/CLE.gen /home/soleary/BULLSHARKS/CLE_POPGEN/data/Haplotyping/

```

# Haplotype filtering

## Overview of haplotyping success

Comparison of the number of loci that were filtered due to excess number of missing data or suspected paralogs during the haplotyping process to those that passed. Haplotyer was run without any threshold values (other than default values). The genepop file only contains loci that passed haplotyping stage.

```{r import stats.out file, fig.height=3, fig.width=3, message=FALSE, warning=FALSE}

# import stats files generated by haplotyper
hap_stats <- read.hap.stats("data/Haplotyping/stats.out")


hap_ind_stats <- read.delim("data/Haplotyping/ind_stats.out",
                           stringsAsFactors = FALSE)

# view comparison of filtered vs. passed loci
plot.filter.status(hap_stats)

```

Remove filtered loci from data set.

```{r}

count(hap_stats, Status == "FILTERED")

hap_stats <- remove.filtered.haps(hap_stats)

```

`r nrow(hap_stats)` loci passed haplotyping process.

```{r plot stats.out file, fig.height=6, fig.width=9, message=FALSE, warning=FALSE}

plot.hap.stats(hap_stats)

```

Data set contains `r nrow(hap_ind_stats)` individuals:

```{r plot ind.stats.out file, fig.height=6, fig.width=9, message=FALSE, warning=FALSE}

plot.ind.hap.stats(hap_ind_stats)

```

## Identify threshold values to filter data set

### Proportion of individuals haplotyped

```{r prop indv haplotyped, fig.height=4, fig.width=13, message=FALSE, warning=FALSE}

# hap_stats <- read.hap.stats("data/Haplotyping/stats.out")

p1 <- ggplot(hap_stats, aes(x = Prop_Haplotyped)) +
  geom_histogram(binwidth = 0.025, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Prop_Haplotyped, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.9),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "proportion individuals haplotyped") +
  theme_standard

p2 <- ggplot(hap_stats, aes(x = Prop_Haplotyped, y = Poss_Paralog)) +
  geom_point(shape = 21, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Prop_Haplotyped, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "proportion individuals haplotyped", y = "possible paralogs") +
  theme_standard

p3 <- ggplot(hap_stats, aes(x = Prop_Haplotyped, y = `Low_Cov.Geno_Err`)) +
  geom_point(shape = 21, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Prop_Haplotyped, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "proportion individuals haplotyped", y = "low coverage error") +
  theme_standard

multiplot(p1, p2, p3, cols = 3)

```

Flag loci successfully haplotyped in < 90% of individuals.

```{r filter loci genotype call rate, message=FALSE, warning=FALSE}

# number of loci called in >95% individuals
count(hap_stats, Prop_Haplotyped < 0.9)

# create vector of loci to remove (choose cut-off)
Prop_Haplotyped <- filter.loci.prop_haplotyped(hap_stats, 0.9)

```

`r length(Prop_Haplotyped)` loci were flagged as genotype call rate < 0.9.

### Possible paralogs per locus

Loci are flagged as possible paralogs for an individuals when more than the expected number of haplotypes based on the SNP genotype call (homozygote, heterozygote) are detected.

```{r poss paralogs per locus, fig.height=3, fig.width=4, message=FALSE, warning=FALSE}

ggplot(hap_stats, aes(x = Poss_Paralog)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Poss_Paralog, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = (nrow(hap_ind_stats)*0.01)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "possible paralogs per locus") +
  theme_standard

```

1 % of individuals: `r nrow(hap_ind_stats)*0.01` 
5 % of individuals: `r nrow(hap_ind_stats)*0.05` 

Flag loci that are flagged as potential paralogs in 3 or more individuals.

```{r filter loci paralogs}

# number of loci with Poss_Paralogs
count(hap_stats, Poss_Paralog > 2)

# create vector of loci to remove (choose cut-off)
Poss_Paralogs <- filter.loci.paralogs(hap_stats, 2)

```

`r length(Poss_Paralogs)` loci were flagged as possible paralogs.

### Number of SNPs & Haplotypes per locus

Each locus varies in the number of SNPs detected which determines the number of haplotypes expected in that population. 

```{r no SNPs, fig.height=3, fig.width=4, message=FALSE, warning=FALSE}

ggplot(hap_stats, aes(x = Sites)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Sites, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "number of SNPs per contig") +
  theme_standard

```

Filtering loci based on number of SNPs contained on that locus could bias the data set as loci with high recombination may be removed. On the other hand, assuming an approximate length of 250 bp loci with more than `r 250*0.1` SNPs would mean that 10% of bases are a polymorphisms.

```{r sites vs paralogs, fig.height=3, fig.width=8}

p1 <- ggplot(hap_stats, aes(x = Haplotypes, y = Poss_Paralog)) +
  geom_point(shape = 21, color = "black", fill = "darkorange") +
  geom_smooth(method = "lm") +
  labs(x = "no. haplotypes", y = "possible paralogs") +
  theme_standard

p2 <- ggplot(hap_stats, aes(x = Sites, y = Poss_Paralog)) +
  geom_point(shape = 21, color = "black", fill = "darkorange") +
  geom_smooth(method = "lm") +
  labs(x = "no. of sites", y = " ") +
  theme_standard
  
multiplot(p1, p2, cols = 2)

```

Identify number of haplotypes loci with > 35 SNP sites.

```{r filter no of SNPs}

count(hap_stats, Sites > 35)

NoSNps <- filter.loci.noSNPs(hap_stats, 35)

```

`r length(NoSNps)` loci were flagged.

Assuming that mutation is the only mechanism resulting in new haplotypes, the maximum expected number of haplotypes per locus is number of SNPs N + 1.

```{r xtra haplotypes, fig.height=12, fig.width=10, message=FALSE, warning=FALSE}

p1 <- ggplot(hap_stats, aes(x = Haplotypes)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Haplotypes, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = quantile(Haplotypes, .99, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "# haplotypes per locus", y = "# loci") +
  theme_standard

temp <- hap_stats %>%
  select(Locus, Sites, Haplotypes, Prop_Haplotyped, Low_Cov.Geno_Err, Poss_Paralog) %>%
  mutate(exp_sites = Sites + 1) %>%
  mutate(xtra = Haplotypes - Sites)

p2 <- ggplot(temp, aes(x = Sites, y = Poss_Paralog)) +
  geom_point() +
  geom_hline(yintercept = 5, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# SNP sites per locus", y = "possible paralogs") +
  theme_standard

p3 <- ggplot(temp, aes(x = Sites, y = xtra)) +
  geom_point() +
  geom_hline(yintercept = 10, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# SNP sites per locus", y = "# extra haplotypes") +
  theme_standard

p4 <- ggplot(temp, aes(x = xtra, y = Prop_Haplotyped)) +
  geom_point(size = 1) +
  geom_hline(yintercept = 0.9, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# extra haplotypes", y = "proportion of indv haplotyped") +
  theme_standard

p5 <- ggplot(temp, aes(x = xtra, y = Low_Cov.Geno_Err)) +
  geom_point(size = 1) +
  geom_hline(yintercept = 5, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# extra haplotypes", y = "potential genotyping error") +
  theme_standard

p6 <- ggplot(temp, aes(x = xtra, y = Poss_Paralog)) +
  geom_point(size = 1) +
  geom_hline(yintercept = 5, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# extra haplotypes", y = "possible paralogs") +
  theme_standard

m21 <- multiplot(p1, p2, p3, p4, p5, p6, cols = 2)

```

Again, removing loci with unexpectedly high numbers of haplotypes may bias the data set, as loci with e.g. high recombination may be removed from the data set.

```{r filter no of haplotypes}

count(temp, xtra >= 50)

# create vector of loci to remove
NoHaps <- filter(temp, xtra >= 50) 

NoHaps <- NoHaps$Locus

```

`r length(NoHaps)` loci were flagged for excessive number of haplotypes compared to the number of SNPs at that locus.

### Low Coverage/Genotyping Error

After combining SNPs on the same contig during the haplotyping process, it is possible to observe fewer than the expected number of haplotypes based on the genotype call (heterozygote/homozygote). When this occurs, that genotype is coded as missing. For each locus the number of individuals for which is it flagged as a potential a potential genotyping error or suffering from null alleles due to low coverage detected for a given locus is recorded. 

```{r low cov genotypg error, fig.height=4, fig.width=5, message=FALSE, warning=FALSE}

ggplot(hap_stats, aes(x = Low_Cov.Geno_Err)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Low_Cov.Geno_Err, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = (nrow(hap_ind_stats)*0.05)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "potential genotyping error") +
  theme_standard

```

Loci with pronounced patterns of genotyping error likely due to low coverage will have low success rate for genotyping, i.e. they will be caught in missing data filters. Coverage issues are likely genotype specfic; previous filters have targeted loci and individuals with high variance in coverage and suspect genotpes have been coded as missing, i.e. this filter need not be very conservative, regardless, loci that are repeatedly being flagged as problematic should be removed.

```{r filter low coverage/genotyping error, message=FALSE, warning=FALSE}

# summary of count of possible genotyping errors
count(hap_stats, Low_Cov.Geno_Err > 10)

# create vector of loci to remove
Geno_Err <- filter.loci.geno_err(hap_stats, 10)

```

`r length(Geno_Err)` loci were flagged as potentially affected by genotyping error.

### Flag problematic individuals

Loci that are not successfully haplotyped in an individual due to missing data, complex locus, haplotyped genotype is higher/lower than SNP haplotype in a given individual are coded as missing for that individual. Problematic individual can be identified as having a high proportion of missing data.

**Individuals with low proportion of successfully haplotyped loci**

```{r missing indv, fig.height=4, fig.width=5, message=FALSE, warning=FALSE}

ggplot(hap_ind_stats, aes(x = Prop_Success)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Prop_Success, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.9),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "proportion loci successfully haplotyped") +
  theme_standard

```

Problem individuals can be identified by choosing a cut-off value for the minium proportion of sucessfully haplotyped loci and excluding individuals below that threshold value. Removing loci with low haplotyping success with decrease the amount of missing data. Therefore, final missing data cut-offs should be applied *after* removing those loci from the data set, though it is important to flag potentially problematic loci based on their haplotyping stats at this point.

```{r filter success haplotyped indv}

# count individuals above set threshold
count(hap_ind_stats, Prop_Success < .9)

# create vector of indv to remove (choose cut-off)
PropSuccessInd <- filter.ind.prop_success(hap_ind_stats, .9)

write.table(PropSuccessInd, "data/POPGEN/propSucc.indv", 
            col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")

# View(as.data.frame(PropSuccessInd))

```

**Possible paralogs per individual**

Individuals with high proportion of loci flagged as possible paralogs should be flagged as potential problem individuals. 

```{r poss paralogs per indv, fig.height=4, fig.width=5, message=FALSE, warning=FALSE}

ggplot(hap_ind_stats, aes(x = Poss_Paralogs)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Poss_Paralogs, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = (nrow(hap_stats)*0.01)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "no of loci potential paralogs") +
  theme_standard

```

Cut-off for individuals with loci flagged paralogs in > 5% of loci is `r nrow(hap_stats)*0.05`.

```{r filter paralogs indv}

# count individuals above set threshold
count(hap_ind_stats, Poss_Paralogs > 150)

# create vector of indv to remove (choose cut-off)
Poss_ParalogInd <- filter.ind.paralogs(hap_ind_stats, 150)

write.table(Poss_ParalogInd, "data/POPGEN/possParal.indv", 
            col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")

# View(as.data.frame(Poss_ParalogInd))

```

The highest number of flagged loci in an individuals is `r max(hap_ind_stats$Poss_Paralogs, na.rm = TRUE)`, which is equivalent to `r round(max(hap_ind_stats$Poss_Paralogs, na.rm = TRUE)/nrow(hap_stats), digits = 4)*100`% of loci.

**Individuals with high proportion of potential allele dropout/genotyping error**

```{r genotypg error per indv, fig.height=3, fig.width=4, message=FALSE, warning=FALSE}

ggplot(hap_ind_stats, aes(x = Low_Coverage.Errors)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Low_Coverage.Errors, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = (nrow(hap_stats)*0.01)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# loci potential genotyping error", y = "# indv") +
  theme_standard

```

Remove individuals with high proportion of loci that have been flagged as potential allele dropouts/genotyping error.

```{r filter genotyping error indv}

# count individuals above set threshold
count(hap_ind_stats, Low_Coverage.Errors > 150)

# create vector of indv to remove (choose cut-off)
GenoErrIndv <- filter.ind.geno_err(hap_ind_stats, 150)

write.table(GenoErrIndv, "data/POPGEN/GenoErr.indv", 
            col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")

# View(GenoErrIndv)

```

The highest number of flagged loci in an individuals is `r max(hap_ind_stats$Low_Coverage.Errors, na.rm = TRUE)`, which is equivalent to `r round(max(hap_ind_stats$Low_Coverage.Errors, na.rm = TRUE)/nrow(hap_stats), digits = 4)*100`% of loci.

## Filter flagged loci and individuals as necessary

Load genepop file with haplotyped loci.

```{r load genepop, message=FALSE, warning=FALSE}

# import genepop file
gen <- read.genepop(file = "data/Haplotyping/CLE.gen",
                    ncode = 3L, quiet = FALSE)

gen

```

Remove flagged loci and individuals.

```{r remove loci individuals, message=FALSE, warning=FALSE}

# remove flagged loci
removeloc <- c(Poss_Paralogs, NoSNps, NoHaps, Geno_Err, Prop_Haplotyped)

gen <- genind.rem.loci(gen, removeloc)

# remove flagged individuals
removeInd <- c(Poss_ParalogInd, PropSuccessInd, GenoErrIndv)
gen <- gen.ind.rem.Ind(gen, removeInd)

gen

```

# Compare duplicate individuals

```{r comp dulplicate indv, message=FALSE, warning=FALSE}

# extract genotype matrix
df <- genind2df(gen,
                usepop = TRUE,
                sep = ":", oneColPerAll = FALSE) %>%
  select(-pop) %>%
  rownames_to_column()

# create list of duplicates
pairs <- list()

# input each set of duplicates as a vector of a list
pairs[[1]] <- c("CLE_2A-A02_Cleu_Ara002", "CLE_3A-G07_Cleu_Ara002")
pairs[[2]] <- c("CLE_2A-A10_Cleu_Ara010", "CLE_2A-C07_Cleu_Ara010")
pairs[[3]] <- c("CLE_2A-B11_Cleu_Mat001", "CLE_3A-C07_Cleu_Mat001")
pairs[[4]] <- c("CLE_2A-C08_Cleu_Ara091", "CLE_2B-A01_Cleu_Ara091")
pairs[[5]] <- c("CLE_2A-F02_Cleu_San004", "CLE_3B-C07_Cleu_San004")
pairs[[6]] <- c("CLE_2A-G07_Cleu_Dic013", "CLE_3A-B03_Cleu_Dic013")
pairs[[7]] <- c("CLE_2A-G08_Cleu_Ara064", "CLE_3B-A03_Cleu_Ara064")
pairs[[8]] <- c("CLE_2B-B05_Cleu_Dic070", "CLE_3B-C08_Cleu_Dic070")
pairs[[9]] <- c("CLE_2B-B10_Cleu_Mat030", "CLE_3A-G08_Cleu_Mat030")
pairs[[10]] <- c("CLE_2B-F02_Cleu_Ara082", "CLE_3A-C08_Cleu_Ara082")
pairs[[11]] <- c("CLE_2B-G07_Cleu_Dic018", "CLE_3A-B08_Cleu_Dic018")
pairs[[12]] <- c("CLE_2B-G08_Cleu_Mat023", "CLE_3B-C03_Cleu_Mat023")
pairs[[13]] <- c("CLE_3B-G04_Cleu_Dic049", "CLE_3B-G08_Cleu_Dic049")
pairs[[14]] <- c("CLE_2A-B04_Cleu_Dic006", "CLE_3A-F06_Cleu_Ara055")
pairs[[15]] <- c("CLE_2A-E06_Cleu_Sab006", "CLE_3A-E05_Cleu_Sab015")
pairs[[16]] <- c("CLE_3B-F05_Cleu_Dic038", "CLE_3B-F09_Cleu_Dic042")
pairs[[17]] <- c("CLE_2A-B02_Cleu_Dic004", "CLE_3A-F04_Cleu_Ara053")
pairs[[18]] <- c("CLE_2A-E05_Cleu_Sab005", "CLE_3A-E04_Cleu_Sab014")
pairs[[19]] <- c("CLE_2A-A11_Cleu_Dic001", "CLE_3B-B11_Cleu_Dic033")
pairs[[20]] <- c("CLE_2A-B06_Cleu_Dic008", "CLE_3A-F07_Cleu_Ara056")
pairs[[21]] <- c("CLE_2A-B05_Cleu_Dic007", "CLE_3A-F05_Cleu_Ara054")
pairs[[22]] <- c("CLE_2A-E03_Cleu_Sab003", "CLE_3A-E02_Cleu_Sab012")
pairs[[23]] <- c("CLE_2A-B01_Cleu_Dic003", "CLE_3A-F03_Cleu_Ara052")
pairs[[24]] <- c("CLE_2A-B07_Cleu_Dic009", "CLE_3A-F08_Cleu_Ara057")
pairs[[25]] <- c("CLE_2A-E02_Cleu_Sab002", "CLE_3A-E01_Cleu_Sab011")
pairs[[26]] <- c("CLE_2A-E01_Cleu_Sab001", "CLE_3B-E12_Cleu_Sab034")
pairs[[27]] <- c("CLE_2A-E08_Cleu_Sab008", "CLE_3A-E07_Cleu_Sab017")
pairs[[28]] <- c("CLE_2A-B08_Cleu_Dic010", "CLE_3A-F09_Cleu_Ara058")
pairs[[29]] <- c("CLE_2A-E07_Cleu_Sab007", "CLE_3A-E06_Cleu_Sab016")
pairs[[30]] <- c("CLE_2A-F09_Cleu_Ara021", "CLE_2A-G05_Cleu_Ara029")

# create empty list for genotype error (discordant loci)
genoerror <- list()

# create empty vector for duplicates (retains first indv in pair)
dup <- character()

# identify discordant genotypes for each set of duplicates
for (p in pairs){

# select duplicates from genotype matrix
geno <- df %>%
  filter(rowname %in% p) %>%
  select(-rowname)

# compare genotypes
comp <- (t(geno))

contigs <- as.data.frame(comp) %>%
  rownames_to_column(var = "LOCUS") %>%
  mutate(V1 = as.character(V1),
         V2 = as.character(V2)) %>%
  filter(V1 != V2)

# write vector with first individual in pair
ind <- p[1]

dup <- c(dup, ind)

genoerror[[ind]] <- contigs

}

# if it throws error object V1 or V2 not found it means one or more of the samples names are not correct

# combine into one dataframe
genoerror <- ldply(genoerror, data.frame) %>%
  rename(DUP1 = `.id`,
         GENO_INDV1 = V1,
         GENO_INDV2 = V2)

write_delim(genoerror, "results/haplotyped.genoerror", delim = "\t")

```

Compare genotype depth for discordant loci:

```{r message=FALSE, warning=FALSE}

# import genotype depth keep one depth per contig
n <- nrow(ind_stats_F14) + 1

gdepth <- read_table2("data/VCF/CLE.F14.gdepth") %>%
  select(-POS) %>%
  distinct(CHROM, .keep_all = TRUE) %>%
  gather(key = INDV, value = GDEPTH, 2:n) %>%
  rename(LOCUS = CHROM)

# create dataframe with duplicates
dup_indv <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(dup_indv) <- c("DUP1", "DUP2")

for (p in pairs){

p <- as.data.frame(t(p))

colnames(p) <-  c("DUP1", "DUP2")

dup_indv <- bind_rows(dup_indv, p)
   
}

# add duplicate 2 to genoerror data frame
genoerror <- left_join(genoerror, dup_indv)

# add genotype depth information for duplicate 1
gdepth <- gdepth %>%
  rename(DUP1 = INDV)

genoerror <- left_join(genoerror, gdepth) %>%
  rename(GDEPTH1 = GDEPTH)

# add genotype depth information for duplicate 2
gdepth <- gdepth %>%
  rename(DUP2 = DUP1)

genoerror <- left_join(genoerror, gdepth) %>%
  rename(GDEPTH2 = GDEPTH)

# identify homozygote vs heterozygote genotypes
genoerror <- genoerror %>%
  separate(col = GENO_INDV1, into = c("GENO1a", "GENO1b"), sep = ":", remove = FALSE) %>%
  separate(col = GENO_INDV2, into = c("GENO2a", "GENO2b"), sep = ":", remove = FALSE) %>%
  mutate(GENOTYPE1 = ifelse(GENO1a == GENO1b, "HOMOZYGOTE", "HETEROZYGOTE"),
         GENOTYPE2 = ifelse(GENO2a == GENO2b, "HOMOZYGOTE", "HETEROZYGOTE"))

```

Determine number of times discordant genotype is due to heterozygote/homozygote.

```{r}

count(genoerror, GENOTYPE1 == GENOTYPE2)

```

Compare number of discordant genotype calls per duplicate set.

```{r, fig.height=3, fig.width=4}

total <- as.numeric(length(locNames(gen)))

# compar number of loci by pair
per_ind <- count(genoerror, DUP1) %>%
  mutate(freq = n/total*100)

ggplot(per_ind, aes(freq)) +
  geom_histogram(binwidth = 0.25, fill = "darkorange", color = "black") +
  geom_vline(aes(xintercept = mean(freq, na.rm = TRUE)),
             color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "% genotyping error per indv") +
  theme_standard

```

Identify loci consistently affected by genotyping error

```{r fig.height=3, fig.width=4, message=FALSE, warning=FALSE}

total <- as.numeric(length(pairs))

# compare loci affected by genotyping error
per_loc <- count(genoerror, LOCUS) %>%
  mutate(freq = n/total*100) %>%
  rename(Locus = LOCUS)

per_loc <- left_join(per_loc, hap_stats)

ggplot(per_loc, aes(x = n)) +
  geom_histogram(binwidth = 1, fill = "darkorange", color = "black") +
  geom_vline(aes(xintercept = mean(n, na.rm = TRUE)),
             color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "same locus affected in n indv") +
  theme_standard

```

```{r}

count(per_loc, n > 3)

```

Remove flagged loci and one individual per pair.

```{r}

# remove duplicates
removeInd <- dup

gen <- gen.ind.rem.Ind(gen, removeInd)

# remove flagged loci
removeloc <- filter(per_loc, n > 3)
gen <- genind.rem.loci(gen, removeloc)

gen

```

`r nrow(temp)` loci were removed as potentially affected by genotyping error.

# Relatedness & Inbreeding

Create input file fore `related` package.

```{r}

# need to change to R 32
library(related)

# create input file ----
df <- genind2df(gen, usepop = FALSE, oneColPerAll = TRUE) %>%
  rownames_to_column("INDV") %>%
  separate(INDV, into = c("POP", "LIB", "INDV"), sep = "_", extra = "merge") %>%
  select(-POP) %>%
  unite(IND, LIB, INDV, sep = "_")

# missing data must be 0
df[df=="NA"] <- 0

write.table(df, "scratch/related.input",
            row.names = FALSE, col.names = FALSE, sep = "\t", quote = FALSE)

# import input file as list (gdata, nloci, nalleles, ninds, freqs)
genotypedata <- readgenotypedata("scratch/related.input")

```

Calculate pairwise relatedness according to Lynch & Ritland 1999; uses inbreeding estimates for each individuals to estimate relatedness.

```{r lynchrd, fig.height=8, fig.width=7}

# pairwise relatedness (Lynch & Ritland 1999)
relatedness_lynchrd <- coancestry(genotypedata$gdata,
                                  lynchrd = 1)

relatedn <- relatedness_lynchrd$relatedness %>%
  select(pair.no, ind1.id, ind2.id, lynchrd)

write_delim(relatedn, "results/pairwise_relatedness")

View(ind_stats_F14)

inbreed <- relatedness_lynchrd$inbreeding %>%
  select(ind.id, L3, LH) %>%
  rename(INDV = ind.id)

write_delim(inbreed, "results/inbreeding")

```

Distribution of levels of pairwise relatedness:

```{r lynchrd distrib relatedn, fig.height=3, fig.width=8}

ggplot(relatedn, aes(x = lynchrd)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(lynchrd, na.rm = TRUE)),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = quantile(lynchrd, 0.95)),
             color = "darkred", linetype = "dashed", size = 1) +
  scale_y_sqrt() +
  labs(x = "relatedness", y = "number of pairs") +
  theme_standard

```

Distribution of levels of homozygosity:

```{r lynchrd inbreeding, fig.height=3, fig.width=4}

ggplot(inbreed, aes(x = L3)) +
  geom_histogram(binwidth = 0.005, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(LH, na.rm = TRUE)),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = quantile(LH, 0.95)),
             color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "inbreeding coefficient", y = "individuals") +
  theme_standard

```

# Heterozygosity and HWE

Generate summary statistics

```{r, message=FALSE, warning=FALSE}

SampleInfo <- read_delim("data/POPGEN/SampleInfo.txt", delim = "\t") %>%
  distinct(SAMPLE_ID, .keep_all = TRUE)

Inds <- as.data.frame(indNames(gen)) %>%
  rename(LIB_ID = `indNames(gen)`) %>%
  separate(LIB_ID, into = c("SP", "WELL", "SAMPLE_ID"), sep = "_", extra = "merge", remove = FALSE)

strata <- left_join(Inds, SampleInfo, by = "SAMPLE_ID") %>%
  distinct()

strata(gen) <- strata

# define groups using strata information - need to add POP for missing ones
setPop(gen) <- ~POP

popNames(gen)

# generate genetic diversity stats
GenDiv <- adegenet::summary(gen)

dat <- hierfstat:::.genind2hierfstat(gen)
GenDiv2 <- basic.stats(dat)

```

Compare observed (Ho) and expected (He) heterozygosity for all individuals across all populations.

```{r}

# observed heterozygosity per locus
Ho <- as.data.frame(GenDiv$Hobs) %>% 
  rownames_to_column("LOCUS") %>%
  rename(Ho = `GenDiv$Hobs`)

# expected heterozygosity per locus
Hs <- as.data.frame(GenDiv$Hexp) %>% 
  rownames_to_column("LOCUS") %>%
  rename(Hexp = `GenDiv$Hexp`)

# expected and observed heterozysity per locus
Het <- left_join(Ho, Hs, by = "LOCUS")

```

Identify loci that are now monomorphic and remove from data set:

```{r}

monomorphic <- filter(Ho, Ho == 0)

View(monomorphic)

monomorphic <- monomorphic$LOCUS

# remove flagged loci
removeloc <- c(monomorphic)
gen <- genind.rem.loci(gen, removeloc)

gen

```

Expected vs observed heterozygosity across all populations.

```{r fig.height=4, fig.width=4}

temp <- Het %>%
  filter(Ho != 0)

# plot Ho vs Hs across all individuals
ggplot(temp, aes(x = Ho, y = Hexp)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", size = 1) +
  xlim(0, 1) +
  ylim(0, 1) +
  labs(x = "observed heterozygosity Ho", y = "within cluster diversity Hs (Hexp)") +
  theme_standard

```

Compare distribution of observed heterozygosity (Ho) in each putative population:

```{r}

Ho_per <- as.data.frame(GenDiv2$Ho) %>%
  rownames_to_column("LOCUS") %>%
  rename(ARA = `1`, GAL = `2`, ULM = `3`, LMM = `4`, MAT = `5`, 
         SL = `6`, SA = `7`, CC = `8`) %>%
  select(LOCUS, ARA, GAL, ULM, LMM, MAT, SL, SA, CC) %>%
  gather(Group, Ho, 2:9)

# compare observed heterozygosity per locus and cluster
ggplot(Ho_per, aes(x = Group, y = Ho)) +
  geom_boxplot() +
  labs(x = "Group", y = "observed heterozygosity Ho") +
  theme_standard

```

Compare distribution of expected heterozygosity He, (within cluster diversity) in each putative population:

```{r}

# within cluster diversity/exp diversity per locus (rows) and cluster (columns)
Hs_per <- as.data.frame(GenDiv2$Hs) %>%
  rownames_to_column("LOCUS") %>%
  rename(ARA = `1`, GAL = `2`, ULM = `3`, LMM = `4`, MAT = `5`, 
         SL = `6`, SA = `7`, CC = `8`) %>%
  select(LOCUS, ARA, GAL, ULM, LMM, MAT, SL, SA, CC) %>%
  gather(Group, Hs, 2:9)

# compare expected heterozygosity per locus and cluster
ggplot(Hs_per, aes(x = Group, y = Hs)) +
  geom_boxplot() +
  labs(x = "Group", y = "expected heterozygosity Ho") +
  theme_standard

```

Comparison of oberved vs. heterozygosity within each population:

```{r, fig.height=8, fig.width=12, message=FALSE, warning=FALSE}

# join data frames
Het_per <- left_join(Hs_per, Ho_per)

# plot per cluster
ggplot(Het_per, aes(x = Ho, y = Hs)) +
  geom_point(shape = 19, size = 1) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", size = 1) +
  xlim(0, 1) +
  ylim(0, 1) +
  facet_grid(. ~ Group) +
  labs(x = "observed heterozygosity Ho", y = "within cluster diversity Hs (Hexp)") +
  theme_standard

```

Test for deviations from Hardy-Weinberg equilibrium within each group (POP level):

```{r}

# setPop(gen) <- ~POP
# 
# popNames(gen)
# 
# # should be able to use lapply
# HWE <- seppop(gen) %>% 
#    lapply(hw.test, B=1000)
#  
# HWE[["ALL"]] <- hw.test(gen, B = 1000)
#  
# # convert to data.frames
# hwe <- list()
#  
# for (m in names(HWE)) {
#    hwe[[m]] <- as.data.frame(HWE[[m]]) %>%
#    rename(pval = Pr.exact) %>%
#    rownames_to_column("LOCUS") %>%
#    select(LOCUS, pval)
# }
#  
# hwe <- ldply(hwe, data.frame) %>%
#    rename(POP = `.id`) %>%
#    mutate(HWE = ifelse(pval <= 0.05, "OUT", "IN"))
# 
# write_delim(hwe, "results/HWEbypop", delim = "\t")

hwe <- read_delim("results/HWEbypop", delim = "\t")

hwe_perpop <- hwe %>%
  group_by(POP) %>%
  count(HWE)

hwe_perloc <- hwe %>%
  filter(POP != "ALL") %>%
  group_by(LOCUS) %>%
  count(HWE) %>%
  filter(HWE == "OUT")

```

Format output and view results:

```{r HWE results, fig.height=3, fig.width=12, message=FALSE, warning=FALSE}

# plot data
ggplot(hwe_perpop, aes(x = HWE, y = n)) +
  geom_bar(stat = "identity", color = "black", fill = "darkorange") +
  labs(x = "significant deviation from HWE (p > 0.05)", y = "number of loci") +
  facet_grid( ~ POP) +
  theme_standard

```

Distribution of number of times loci are out of HWE

```{r fig.height=3, fig.width=4}

ggplot(hwe_perloc, aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
  labs(x = "no populations out of HWE", y = "no. loci") +
  theme_standard

```

Remove loci out of HWE in every population:

```{r}

# remove flagged loci
removeloc <- hwe_perloc %>%
  filter(n >= 5)

gen <- genind.rem.loci(gen, removeloc$LOCUS)

gen

```

# Final data set

Write file with filtered data set:

```{r write filtered data set}

# library(radiator)
# 
# setPop(gen) <- ~POP
# 
# # convert to tidy data set
# tidy <- genomic_converter(data = gen,
#                           output = c("genepop", "genind"), filename = "data/POPGEN/CLE")
# 
# tidy_gen <- tidy$tidy.data
# 
# write_delim(tidy_gen, "data/POPGEN/CLE.tidy.genotypyes", delim = "\t")
# 
# # import to convert and write out arlequin and genepop formatted files
# tidy <- read_delim("data/POPGEN/CLE.tidy.genotypyes", delim = "\t")
# 
# gen <- genomic_converter(data = tidy, output = c("genepop", "genind"), filename = "data/POPGEN/CLE")
# 
# # create genind object
# gen <- gen$genind.no.imputation
# 
# gen

df <- genind2df(gen, pop = NULL, sep = ":") %>%
  rownames_to_column("LIB_ID")

write_delim(df, "data/POPGEN/CLE.filtered.genotypes", delim = "\t")

```

Write files with individuals and Contigs still contained in data set and use to filter vcf file.

```{r}

keeploci <- as.data.frame(locNames(gen)) %>%
  rename(LOCUS = `locNames(gen)`)

keeploci <- filter(loc_stats_F14, CHR %in% keeploci$LOCUS) %>%
  select(CHR, POS)

write.table(keeploci, "data/VCF/filtered.loci",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

keepind <- as.data.frame(indNames(gen)) %>%
  rename(LIB_ID = `indNames(gen)`) %>%
  separate(LIB_ID, into = c("SP", "LIB", "WELL", "SAMPLE_ID"), sep = "-", extra = "merge") %>%
  separate(SAMPLE_ID, into = c("P1", "P2"), sep = "-", remove = TRUE) %>%
  unite(SAMPLE_ID, 4:5, sep = "_") %>%
  unite(LIBWELL, 2:3, sep = "-") %>%
  unite(LIB_ID, 1:3, sep = "_")
  
keepind <-filter(ind_stats_F14, INDV %in% keepind$LIB_ID) %>%
  select(INDV)

write.table(keepind, "data/VCF/filtered.ind",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

```

Write vcf file and 012 format:

```{bash}

vcftools --vcf data/VCF/temp/CLE.F14.recode.vcf --out data/VCF/temp/CLE --positions data/VCF/filtered.loci --keep data/VCF/filtered.ind --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CLE.recode.vcf --out data/POPGEN/CLE --012

vcftools --vcf data/VCF/temp/CLE.recode.vcf --out data/VCF/CLE.fil --depth
vcftools --vcf data/VCF/temp/CLE.recode.vcf --out data/VCF/CLE.fil --site-mean-depth
vcftools --vcf data/VCF/temp/CLE.recode.vcf --out data/VCF/CLE.fil --missing-indv
vcftools --vcf data/VCF/temp/CLE.recode.vcf --out data/VCF/CLE.fil --missing-site
vcftools --vcf data/VCF/temp/CLE.recode.vcf --out data/VCF/CLE.fil --het
vcftools --vcf data/VCF/temp/CLE.recode.vcf --out data/VCF/CLE.fil --hardy
vcftools --vcf data/VCF/temp/CLE.recode.vcf --out data/VCF/CLE.fil --site-quality
vcftools --vcf data/VCF/temp/CLE.recode.vcf --out data/VCF/CLE.fil --geno-depth

```

Compare stats for final filtered data set:
    
```{r stats final, fig.height=20, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_fil <- read.ind.stats(dir = "data/VCF", vcf = "CLE.fil") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_fil <- read.loc.stats(dir = "data/VCF/", vcf = "CLE.fil")

View(ind_stats_fil)
View(hap_ind_stats)

# plot missing data per indv ----
p1 <- ggplot(ind_stats_fil, aes(x = MISS_CLE.fil)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_CLE.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot Fis per indv ----
p2 <- ggplot(ind_stats_fil, aes(x = Fis_CLE.fil)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Fis_CLE.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv") +
  theme_standard

# plot read depth per indv ----
p3 <- ggplot(ind_stats_fil, aes(x = MEAN_DEPTH_CLE.fil)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p4 <- ggplot(ind_stats_fil, aes(x = MEAN_DEPTH_CLE.fil, y = MISS_CLE.fil)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_CLE.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot Fis vs missing data per indv ----
p5 <- ggplot(ind_stats_fil, aes(x = Fis_CLE.fil, y = MISS_CLE.fil)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_CLE.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_CLE.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "% missing data") +
  theme_standard

# plot Fis vs mean depth per indv ----
p6 <- ggplot(ind_stats_fil, aes(x = Fis_CLE.fil, y = MEAN_DEPTH_CLE.fil)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_CLE.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MEAN_DEPTH_CLE.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "mean depth per indv") +
  theme_standard

# plot distribution missing data per locus ----
p7 <- ggplot(ind_stats_fil, aes(x = MISS_CLE.fil)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_CLE.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p8 <- ggplot(loc_stats_fil, aes(x = MEAN_DEPTH_CLE.fil)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p9 <- ggplot(loc_stats_fil, aes(x = MEAN_DEPTH_CLE.fil, y = MISS_CLE.fil)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_CLE.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_CLE.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

# plot no of SNPs per locus ----
p10 <- loc_stats_fil %>%
  count(CHR) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  labs(x = "number of SNPs per locus") +
  theme_standard

temp <- loc_stats_fil %>%
  count(CHR)

# plot number of SNPs per contig vs. mean depth ----
p11 <- left_join(temp, loc_stats_fil) %>%
  ggplot() +
  geom_point(aes(x = n, y = MEAN_DEPTH_CLE.fil)) +
  labs(x = "number of SNPs per contig", y = "mean depth") +
  theme_standard

# plot depth vs SNP quality ----
site_qual <- read.table("data/VCF/CLE.fil.lqual", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(PROB = 10^(-QUAL/10))

temp <- data.frame(loc_stats_fil$MEAN_DEPTH_CLE.fil, site_qual$QUAL) %>%
  rename(depth = loc_stats_fil.MEAN_DEPTH_CLE.fil, qual = site_qual.QUAL)

p12 <- ggplot(temp, aes(x = depth, y = qual)) +
  geom_point(size = 1) +
  geom_vline(aes(xintercept = mean(depth, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(qual, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "SNP quality") +
  theme_standard

mfil <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, cols=2)

```