-
Notifications
You must be signed in to change notification settings - Fork 1
/
1.File.formatting.R
61 lines (48 loc) · 2.09 KB
/
1.File.formatting.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
## ################ ##
## File formatting ##
## Narinder Singh ##
## ################ ##
# load functions and required libraries
if (!require("pacman")) install.packages("pacman")
pacman::p_load('data.table', 'ape', 'phyclust', 'rrBLUP', 'scatterplot3d', 'rgl', 'ggplot2',
'corrplot', 'plyr', 'multtest', 'gplots', 'LDheatmap', 'genetics', 'compiler',
'mapdata', 'maps', 'ggmap', 'snpStats')
source('functions.R')
if(!file.exists('project.log')) file.create('project.log')
# read and format the data file
hap <- fread("data/Ae_tauschiiDiversity.hmp.txt", header = T, check.names = F, data.table = F)
table(hap$chrom)
hap <- hap[grep('NWV', hap$chrom, invert = T), ]
table(hap$chrom)
# update chromosome designation
chrom.designation <- read.table(file = 'data/pseudoChrom2chrom.txt', header = T)
for (i in 1:nrow(chrom.designation)) {
hap$chrom[hap$chrom == chrom.designation$pseudoChrom[i]] = chrom.designation$chrom[i]
}
table(hap$chrom)
# retain only biallelic SNP sites
biallelic = sapply(strsplit(as.character(hap$alleles), '/'), length) == 2
sum(!biallelic)
hap = hap[biallelic, ]
# replace het IUPAC codes with H and N
missAmbiguous <- c('0', '+', '-')
hetCodes <- c('R','Y','S','W','K','M','B','D','H','V')
hapgeno <- as.matrix(hap[, 12:ncol(hap)])
hapgeno[hapgeno %in% missAmbiguous] = 'N'
hapgeno[hapgeno %in% hetCodes] = 'H'
sort(unique(c(hapgeno)))
hap=cbind(hap[,1:11], hapgeno)
rm(biallelic, hapgeno, hetCodes, missAmbiguous)
# using empty columns
colnames(hap)[5:11]=c('alleleA', 'alleleB', 'het', 'het_prop', 'missing', 'propmiss', 'maf')
# removing known wrong sample
hap <- hap[, !colnames(hap) %in% remove.samples]
# writing hap file in text file format
write.table(hap, file="data/hapFile.txt" , col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE)
# write sample names to a file
if(!file.exists('output')) dir.create('output')
initial.sample.names <- cbind('TA'=sort(colnames(hap)[-c(1:11)]), 'Sample'=NA)
write.table(initial.sample.names, file = "output/initial.sample.names.txt", row.names = F, quote = F, sep = '\t')
#########
## END ##
#########