-
Notifications
You must be signed in to change notification settings - Fork 6
/
BRCA1_SGE2_global_20180111.Rmd
2772 lines (2116 loc) · 482 KB
/
BRCA1_SGE2_global_20180111.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "BRCA1_SGE2_global_20171119.Rmd"
author: "Greg Findlay"
date: "11/19/2017"
output: html_document
---
Updated global analysis scripts to apply filters and get final functional scores with updated pipeline for both SGE1 and SGE2, includes mixture modeling to define thresholds and filter off concordance,
(previously have run the XrL4_all_ggplot_removed scripts and the positional modeling v6 scripts)
```{r}
#MUST RUN POSITIONAL CORRECTION MARKDOWN SCRIPTS
#i.e. positional_modeling_SGE2_v6_20171012.Rmd
# tHDR_pos_merge_df are built with XyyrL41_prog_map_thresh5_df_rL42_int_ord_df -- prog_map means: all_hdr_variant == TRUE, !=WT ,POS!= REF, and > threshold of 0.00001
#add a position from PAM based on exon...
# merge df's built off the prog_map_thresh_5 data sets.
#previous change in code in sam_to_edits script dictated that 'hdr5' is always the cut-proximal site, and 'hdr3' is always the cut-distal site, so here even though X2, X5 and X15 are cut at the 3' PAM site, 'hdr5' is still used to indicate we want the data at the cut site.
require(ggplot2)
require(gridExtra)
require(mclust)
require(VennDiagram)
library(gam)
library(mixtools)
library(plotROC)
named_conseq_colors_old = c("STOP_GAINED" = "#F3001C","CANONICAL_SPLICE" = "#FD9100","INTRONIC" = "#0B3DB9", "NON_SYNONYMOUS" = "#00BA74", "SPLICE_SITE" = "#3885DB", "SYNONYMOUS" = "#456CCC","STOP_LOST" = "#9D0012","REGULATORY"="#B5A695","5PRIME_UTR"="#B5A695", "REF" = "#000000")
named_conseq_colors_full = c("STOP_GAINED" = "#F3001C","CANONICAL_SPLICE" = "#FD9100","INTRONIC" = "#0B3DB9", "NON_SYNONYMOUS" = "#00BA74", "SPLICE_SITE" = "#CF7700", "SYNONYMOUS" = "#456CCC","STOP_LOST" = "#9D0012","REGULATORY"="#B5A695","5PRIME_UTR"="#B5A695", "REF" = "#000000", "absent" = "#BFBCBC","Benign" = "#013DD3","Conflicting interpretations of pathogenicity" = "#FFCB00","Likely benign" = "#2360F9","Likely pathogenic" = "#FA2039","Pathogenic" = "#A40000","Uncertain significance" = "#00A54D" )
named_conseq_colors_full_simple = c("STOP_GAINED" = "#F3001C","CANONICAL_SPLICE" = "#FD9100","INTRONIC" = "#456CCC", "NON_SYNONYMOUS" = "#00BA74", "SPLICE_SITE" = "#456CCC", "SYNONYMOUS" = "#456CCC","STOP_LOST" = "#9D0012","REGULATORY"="#B5A695","5PRIME_UTR"="#B5A695", "REF" = "#000000", "absent" = "#BFBCBC","Benign" = "#013DD3","Conflicting interpretations of pathogenicity" = "#FFC927","Likely benign" = "#2360F9","Likely pathogenic" = "#FA2039","Pathogenic" = "#A40000","Uncertain significance" = "#31947B","Gray" = "#000000" )
named_conseq_colors = c("STOP_GAINED" = "#F3001C","CANONICAL_SPLICE" = "#FD9100","INTRONIC" = "#6793F8", "NON_SYNONYMOUS" = "#00BA74", "SPLICE_SITE" = "#002372", "SYNONYMOUS" = "#0C4EE5","STOP_LOST" = "#9D0012","REGULATORY"="#B5A695","5PRIME_UTR"="#B5A695", "REF" = "#000000")
X2rL4_tHDR_pos_merge_df$hdr_cut <- X2rL4_tHDR_pos_merge_df$hdr5
X3rL4_tHDR_pos_merge_df$hdr_cut <- X3rL4_tHDR_pos_merge_df$hdr5
X4rL4_tHDR_pos_merge_df$hdr_cut <- X4rL4_tHDR_pos_merge_df$hdr5
X5rL4_tHDR_pos_merge_df$hdr_cut <- X5rL4_tHDR_pos_merge_df$hdr5
X15rL4_tHDR_pos_merge_df$hdr_cut <- X15rL4_tHDR_pos_merge_df$hdr5
X16rL4_tHDR_pos_merge_df$hdr_cut <- X16rL4_tHDR_pos_merge_df$hdr5
X17rL4_tHDR_pos_merge_df$hdr_cut <- X17rL4_tHDR_pos_merge_df$hdr5
X18rL4_tHDR_pos_merge_df$hdr_cut <- X18rL4_tHDR_pos_merge_df$hdr5
X19rL4_tHDR_pos_merge_df$hdr_cut <- X19rL4_tHDR_pos_merge_df$hdr5
X20rL4_tHDR_pos_merge_df$hdr_cut <- X20rL4_tHDR_pos_merge_df$hdr5
X21rL4_tHDR_pos_merge_df$hdr_cut <- X21rL4_tHDR_pos_merge_df$hdr5
X22rL4_tHDR_pos_merge_df$hdr_cut <- X22rL4_tHDR_pos_merge_df$hdr5
X23rL4_tHDR_pos_merge_df$hdr_cut <- X23rL4_tHDR_pos_merge_df$hdr5
#calling these again in case old script re-assigned without the 0-indexing adjustment
X2_cut_pos <- 41276033.5+1
X3_cut_pos <- 41267763.5+1
X4_cut_pos <- 41258518.5+1
X5_cut_pos <- 41256892.5+1
X15_cut_pos <- 41222951.5+1
X16_cut_pos <- 41219676.5+1
X17_cut_pos <- 41215950.5+1
X18_cut_pos <- 41215377.5+1
X19_cut_pos <- 41209129.5+1
X20_cut_pos <- 41203127.5+1
X21_cut_pos <- 41201207.5+1
X22_cut_pos <- 41199706.5+1
X23_cut_pos <- 41197794.5+1
#binding all of the merge df's together...
XrL4_all_tHDR_pos_merge_master_df <- rbind(X2rL4_tHDR_pos_merge_df,X3rL4_tHDR_pos_merge_df,X4rL4_tHDR_pos_merge_df,X5rL4_tHDR_pos_merge_df,X15rL4_tHDR_pos_merge_df,X16rL4_tHDR_pos_merge_df,X17rL4_tHDR_pos_merge_df,X18rL4_tHDR_pos_merge_df,X19rL4_tHDR_pos_merge_df,X20rL4_tHDR_pos_merge_df,X21rL4_tHDR_pos_merge_df,X22rL4_tHDR_pos_merge_df,X23rL4_tHDR_pos_merge_df)
XrL4_all_tHDR_pos_merge_master_df$pos_alt <- paste(XrL4_all_tHDR_pos_merge_master_df$pos, XrL4_all_tHDR_pos_merge_master_df$alt, sep='')
#add a clinvar transcript annotation here
#add a clinvar mutation string i.e. c.
#add some sort of intronic indication (i.e. c. 5017 -3 C -> A)
write.table(XrL4_all_tHDR_pos_merge_master_df, "/mount/SGE/BRCA1/XrL4_all_tHDR_pos_merge_v6_df_20171015.txt", sep="\t")
XrL4_all_tHDR_pos_merge_master_df <- read.table("/mount/SGE/BRCA1/XrL4_all_tHDR_pos_merge_v6_df_20171015.txt", sep="\t", header=TRUE)
#import this table back in as a data frame now that all analysis is final... re-run from here.
#can everything be re-run from this df?
### Detect outliers in data set by comparing post-pre selection and pre-lib selection
#accounting for position again, but emphasizing outliers
ggplot(XrL4_all_tHDR_pos_merge_master_df, aes(x=log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm),y=rL41rL42_tHDR_pre_lib_loess_rL41rL42, color=conseq,size = rL41rL42_tHDR_post_lib_loess_rL41rL42))+geom_point(alpha=0.6)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('Pre/lib loess2 corr. log-2')+xlab('Post/pre log-2')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+scale_size_continuous(range = c(0.1, 2))+geom_text(aes(label=paste(exon,pos),alpha=abs(log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm)-rL41rL42_tHDR_pre_lib_loess_rL41rL42)),hjust=0.5, vjust=0.5, size = 1.4,colour='Black')
#same plot as above but fitting curves by conseq, as well as total fit
ggplot(XrL4_all_tHDR_pos_merge_master_df, aes(x=log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm),y=rL41rL42_tHDR_pre_lib_loess_rL41rL42, color=conseq,size = rL41rL42_tHDR_post_lib_loess_rL41rL42))+geom_point(alpha=0.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = 'None')+ylab('Pre/lib loess2 corr. log-2')+xlab('Post/pre log-2')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+scale_size_continuous(range = c(0.1, 2))+geom_smooth(aes(color = XrL4_all_tHDR_pos_merge_master_df$conseq), method = 'lm',se=FALSE)+geom_smooth(aes(color='REF'),method = 'lm')
#does post/lib give a different result than pre/lib+post/pre?
ggplot(XrL4_all_tHDR_pos_merge_master_df, aes(x=log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm) + rL41rL42_tHDR_pre_lib_loess_rL41rL42, y=rL41rL42_tHDR_post_lib_loess_rL41rL42,color=conseq))+geom_point(alpha=0.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('Post/lib corr. log-2')+xlab('Post/pre + pre/lib corr. log-2')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+scale_size_continuous(range = c(0.1, 2))
#does post/lib give a different result than pre/lib+post/pre (labelled with exon)
ggplot(XrL4_all_tHDR_pos_merge_master_df, aes(x=log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm) + rL41rL42_tHDR_pre_lib_loess_rL41rL42, y=rL41rL42_tHDR_post_lib_loess_rL41rL42,color=conseq))+geom_point(alpha=0.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('Post/lib corr. log-2')+xlab('Post/pre + pre/lib corr. log-2')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+scale_size_continuous(range = c(0.1, 2))+geom_text(aes(label=paste(exon,paste(pos,alt,sep=''))),hjust=0.5, vjust=0.5, size = 1.4,colour='Black')
#and with a density contour...
ggplot(XrL4_all_tHDR_pos_merge_master_df, aes(x=log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm),y=rL41rL42_tHDR_pre_lib_loess_rL41rL42))+geom_point(aes(color=conseq),alpha=0.3)+stat_density2d(aes(alpha=..level..), geom="polygon") +scale_alpha_continuous(limits=c(0.2,5), breaks=seq(0,0.05,by=0.002)) + theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = 'None')+ylab('Pre/lib loess2 corr. log-2')+xlab('Post/pre log-2')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#ID outliers with high pre/lib:
XrL4_all_tHDR_pos_merge_master_df[which(as.numeric(as.character(XrL4_all_tHDR_pos_merge_master_df$rL41rL42_tHDR_pre_lib_loess_rL41rL42)) > 1.2),c('exon','pos','conseq','rL41rL42_tHDR_pre_lib_loess_rL41rL42','tHDR_post_pre_ratio_rL41rL42_mean_synnorm')]
#ID outliers with lost pre/lib
XrL4_all_tHDR_pos_merge_master_df[which(as.numeric(as.character(XrL4_all_tHDR_pos_merge_master_df$rL41rL42_tHDR_pre_lib_loess_rL41rL42)) < -2 ),c('exon','pos','conseq','rL41rL42_tHDR_pre_lib_loess_rL41rL42','tHDR_post_pre_ratio_rL41rL42_mean_synnorm')]
#global plot vs. error:
ggplot(XrL4_all_tHDR_pos_merge_master_df, aes(x=corr_pre_pseudo/pre_pseudo,y=pre_pseudo_freq, color=conseq,size=corr_pre))+geom_point(alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('pre freq.2')+xlab('fraction reads est. from error')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#do any variants drop out completely? lib to pre
#with size being correction rate:
ggplot(XrL4_all_tHDR_pos_merge_master_df, aes(y=log10((pre_pseudo_freq+rL42_pre_pseudo_freq)/2),x=log10(lib_freq), color=conseq,size=(1-(corr_pre_pseudo/pre_pseudo+rL42_corr_pre_pseudo/rL42_pre_pseudo)/2)))+geom_point(alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+xlab('lib freq. log-10')+ylab('mean. pre freq. (cSNV) log-10')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+geom_text(aes(label=paste(exon,paste(pos,alt)),alpha=abs(log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm)-rL41rL42_tHDR_pre_lib_loess_rL41rL42)),hjust=0.5, vjust=0.5, size = 1.4,colour='Black')
#tHDR reads
ggplot(XrL4_all_tHDR_pos_merge_master_df, aes(y=log10((tHDR_pre_pseudo_freq+rL42_tHDR_pre_pseudo_freq)/2),x=log10(lib_freq), color=conseq,size=rL41rL42_tHDR_post_lib_loess_rL41rL42))+geom_point(alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+xlab('lib_freq')+ylab('tHDR_pre_lib_ratio_mean')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+geom_text(aes(label=paste(exon,paste(pos,alt)),alpha=abs(log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm)-rL41rL42_tHDR_pre_lib_loess_rL41rL42)),hjust=0.5, vjust=0.5, size = 1.4,colour='Black')
#do any variants drop out completely? pre to post
ggplot(XrL4_all_tHDR_pos_merge_master_df, aes(y=log10((post_pseudo_freq+rL42_post_pseudo_freq)/2),x=log10((pre_pseudo_freq+rL42_pre_pseudo_freq)/2), color=conseq,size=rL41rL42_tHDR_post_lib_loess_rL41rL42))+geom_point(alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+xlab('mean cSNV pre freq. log-10')+ylab('mean cSNV post freq. log-10')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+geom_text(aes(label=paste(exon,paste(pos,alt)),alpha=abs(log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm)-rL41rL42_tHDR_pre_lib_loess_rL41rL42)),hjust=0.5, vjust=0.5, size = 1.4,colour='Black')
#variant filtering:
#rL41 frequency already must be >= .00001
length(XrL4_all_tHDR_pos_merge_master_df$tHDR_lib_pseudo_freq)
length(which(XrL4_all_tHDR_pos_merge_master_df$tHDR_lib_pseudo_freq > 10^-4))
length(which(XrL4_all_tHDR_pos_merge_master_df$rL42_pre_freq >= 0.00001))
#NOTE -- DEMAND that only the one matching the exon's cut end is yes!! This will solve all NAG / NTG issues...
length(which(XrL4_all_tHDR_pos_merge_master_df$c_hdr_snv == 'True'))
length(which(XrL4_all_tHDR_pos_merge_master_df$hdr3 =='yes')) #distal edits
length(which(XrL4_all_tHDR_pos_merge_master_df$hdr5 =='yes')) #proximal edits
#want this one!!
length(which(XrL4_all_tHDR_pos_merge_master_df$hdr_cut =='yes' & XrL4_all_tHDR_pos_merge_master_df$tHDR_lib_pseudo_freq > 10^-4 ))
XrL4_all_filter_lib_hdr_cut_df <- XrL4_all_tHDR_pos_merge_master_df[which(XrL4_all_tHDR_pos_merge_master_df$hdr_cut =='yes'),]
#high sequencing error contribution -- average of two replicates must be > 0.5 in cSNV context
length(which((XrL4_all_tHDR_pos_merge_master_df$corr_pre_pseudo/XrL4_all_tHDR_pos_merge_master_df$pre_pseudo + XrL4_all_tHDR_pos_merge_master_df$rL42_corr_pre_pseudo/XrL4_all_tHDR_pos_merge_master_df$rL42_pre_pseudo)/2 <0.5))
length(which((XrL4_all_filter_lib_hdr_cut_df$corr_pre_pseudo/XrL4_all_filter_lib_hdr_cut_df$pre_pseudo + XrL4_all_filter_lib_hdr_cut_df$rL42_corr_pre_pseudo/XrL4_all_filter_lib_hdr_cut_df$rL42_pre_pseudo)/2 <0.5))
#which points are poorly represented in cSNV context compared to potential seq error? not worth using!
XrL4_all_filter_lib_hdr_cut_error_df <- XrL4_all_tHDR_pos_merge_master_df[which(XrL4_all_tHDR_pos_merge_master_df$tHDR_lib_pseudo_freq > 10^-4 & XrL4_all_tHDR_pos_merge_master_df$rL42_pre_freq >= 0.00001 & XrL4_all_tHDR_pos_merge_master_df$hdr_cut =='yes' & (XrL4_all_tHDR_pos_merge_master_df$corr_pre_pseudo/XrL4_all_tHDR_pos_merge_master_df$pre_pseudo + XrL4_all_tHDR_pos_merge_master_df$rL42_corr_pre_pseudo/XrL4_all_tHDR_pos_merge_master_df$rL42_pre_pseudo)/2 <0.5),]
#KEEP THIS!!!
XrL4_all_filter_lib_hdr_cut_df <- XrL4_all_tHDR_pos_merge_master_df[which(XrL4_all_tHDR_pos_merge_master_df$tHDR_lib_pseudo_freq > 10^-4 & XrL4_all_tHDR_pos_merge_master_df$rL42_pre_freq >= 0.00001 & XrL4_all_tHDR_pos_merge_master_df$hdr_cut =='yes'),]
#created a new column pasting the pos and alt rows together above -- all of these were manually identified as outliers in the pre/lib loess.2-adjsuted tHDR mean values.
#list of NCGG variants at cut PAMs by pos_alt:
exclude_NCGG_active <- c('41256887C','41222959G','41209124C','41199701C','41197789C')
#list of NCGG PAMs
exclude_NCGG_all <- c('41256887C','41222959G','41209124C','41199701C','41197789C','41267771G','41215385G','41203121C','41201202C')
#list of NXG variants at cut PAMs (already exlcuded by requirement of prox pam present):
exclude_NXG_all <- c('41276039A','41267769A','41267769T','41222957A','41222957T','41215383A','41215383T','41209126T','41209126A','41203123T','41203123A','41201204A','41201204T','41199703T','41199703A','41197791T','41197791A')
#list of X5 NAG variants at cut PAM and PAM+1 variants (poor fit):
exclude_X5_PAM <- c('41256890G','41256890C','41256890T','41256887C','41256887G','41256887A')
#list of other variants near cut site depleted heavily, including X2 pam, and X16 ps var's.
exclude_other_ps_pam <- c('41276038A','41219672G')
#list of variants that may have splice difference in context of PAM edit:
#ones with RNA evidence of depletion in edited PAM context (must go)
#these are filtered out later with SNVs not in CADD (i.e. not true SNVs)
#exclude_pam_splice_low_rna <- c('41256962A','41199702G')
#exclude_pam_splice_no_test_GGT_donor <- c('41258513A','41215945A','41203122A','41203067A','41201127A')
#exclude_pam_splice_no_test_HGT_donor <- c('41219675C','41209125A','41199702A','41197790A','41197769A')
#fall substantially above:
length(which(XrL4_all_tHDR_pos_merge_master_df$rL41rL42_tHDR_pre_lib_loess_rL41rL42 > 1))
exclude_high_pre_lib <- XrL4_all_tHDR_pos_merge_master_df[which(XrL4_all_tHDR_pos_merge_master_df$rL41rL42_tHDR_pre_lib_loess_rL41rL42 > 1),c('pos_alt')]
XrL4_all_exclude_list <- c(exclude_NCGG_all,exclude_NXG_all,exclude_X5_PAM,exclude_other_ps_pam,exclude_high_pre_lib)
#only removing re-editing outliers and inflated pre/lib's
XrL4_all_tHDR_pos_merge_exclude <- XrL4_all_tHDR_pos_merge_master_df[! XrL4_all_tHDR_pos_merge_master_df$pos_alt %in% XrL4_all_exclude_list,]
#removing all points flagged.
XrL4_all_tHDR_pos_merge_all_filters <- XrL4_all_filter_lib_hdr_cut_df[! XrL4_all_filter_lib_hdr_cut_df$pos_alt %in% XrL4_all_exclude_list,]
#filtered sets compared to full set: XrL4_all_tHDR_pos_merge_all_filters
ggplot(XrL4_all_tHDR_pos_merge_master_df, aes(x=log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm),y=rL41rL42_tHDR_pre_lib_loess_rL41rL42, color=conseq))+geom_point(alpha=0.5)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='bottom')+ylab('Pre / lib corrected (log-2)')+xlab('Post / pre (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(ylim = c(-6,2.5),xlim=c(-6,2.5))
ggplot(XrL4_all_tHDR_pos_merge_all_filters, aes(x=log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm),y=rL41rL42_tHDR_pre_lib_loess_rL41rL42, color=conseq,size = rL41rL42_tHDR_post_lib_loess_rL41rL42))+geom_point(alpha=0.6)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('Pre/lib loess2 corr. log-2')+xlab('Post/pre log-2')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+scale_size_continuous(range = c(0.1, 2))+geom_text(aes(label=paste(exon,paste(pos,alt,sep='')),alpha=abs(log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm)-rL41rL42_tHDR_pre_lib_loess_rL41rL42)),hjust=0.5, vjust=0.5, size = 1.4,colour='Black')+coord_cartesian(ylim = c(-5,2.5),xlim=c(-5,2.5))
#testing which points are drastically different between pre/lib+post/pre and post/lib:
#does post/lib give a different result than pre/lib+post/pre (labelled with exon)
grid.arrange(ggplot(XrL4_all_tHDR_pos_merge_master_df, aes(x=log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm) + rL41rL42_tHDR_pre_lib_loess_rL41rL42, y=rL41rL42_tHDR_post_lib_loess_rL41rL42,color=conseq))+geom_point(alpha=0.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('Post/lib corr. log-2')+xlab('Post/pre + pre/lib corr. log-2')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+scale_size_continuous(range = c(0.1, 2)),ggplot(XrL4_all_tHDR_pos_merge_all_filters, aes(x=log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm) + rL41rL42_tHDR_pre_lib_loess_rL41rL42, y=rL41rL42_tHDR_post_lib_loess_rL41rL42,color=conseq))+geom_point(alpha=0.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('Post/lib corr. log-2')+xlab('Post/pre + pre/lib corr. log-2')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+scale_size_continuous(range = c(0.1, 2)),ncol = 2)
#unfiltered vs. CADD:
cor(log2(XrL4_all_tHDR_pos_merge_master_df_CADD$tHDR_post_pre_ratio_rL41rL42_mean_synnorm) + XrL4_all_tHDR_pos_merge_master_df_CADD$rL41rL42_tHDR_pre_lib_loess_rL41rL42,as.numeric(as.character(XrL4_all_tHDR_pos_merge_master_df_CADD$CADD.phred)),method='spearman')
cor(XrL4_all_tHDR_pos_merge_master_df_CADD$rL41rL42_tHDR_post_lib_loess_rL41rL42,as.numeric(as.character(XrL4_all_tHDR_pos_merge_master_df_CADD$CADD.phred)),method='spearman')
cor(XrL4_all_tHDR_pos_merge_master_df_CADD$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,as.numeric(as.character(XrL4_all_tHDR_pos_merge_master_df_CADD$CADD.phred)),method='spearman')
#filtered vs. CADD (very slight improvement)
cor(log2(XrL4_all_tHDR_pos_merge_all_filters_CADD$tHDR_post_pre_ratio_rL41rL42_mean_synnorm) + XrL4_all_tHDR_pos_merge_all_filters_CADD$rL41rL42_tHDR_pre_lib_loess_rL41rL42,as.numeric(as.character(XrL4_all_tHDR_pos_merge_all_filters_CADD$CADD.phred)),method='spearman')
cor(XrL4_all_tHDR_pos_merge_all_filters_CADD$rL41rL42_tHDR_post_lib_loess_rL41rL42,as.numeric(as.character(XrL4_all_tHDR_pos_merge_all_filters_CADD$CADD.phred)),method='spearman')
cor(XrL4_all_tHDR_pos_merge_all_filters_CADD$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,as.numeric(as.character(XrL4_all_tHDR_pos_merge_all_filters_CADD$CADD.phred)),method='spearman')
#RNA analysis (excluding exon 18 for now):
XrL4_all_tHDR_pos_merge_all_filters_rna_syn <- XrL4_all_tHDR_pos_merge_all_filters[which((XrL4_all_tHDR_pos_merge_all_filters$conseq == 'SYNONYMOUS' & XrL4_all_tHDR_pos_merge_all_filters$c_hdr_snv == 'True' & XrL4_all_tHDR_pos_merge_all_filters$exon != 'X18' ) | (XrL4_all_tHDR_pos_merge_all_filters$conseq == 'SPLICE_SITE' & XrL4_all_tHDR_pos_merge_all_filters$c_hdr_snv == 'True' & XrL4_all_tHDR_pos_merge_all_filters$CDSpos != 'NA'& XrL4_all_tHDR_pos_merge_all_filters$exon != 'X18')),]
#missense and syn
XrL4_all_tHDR_pos_merge_all_filters_rna_syn_mis_spl <- XrL4_all_tHDR_pos_merge_all_filters[which((XrL4_all_tHDR_pos_merge_all_filters$conseq == 'SYNONYMOUS' & XrL4_all_tHDR_pos_merge_all_filters$c_hdr_snv == 'True' & XrL4_all_tHDR_pos_merge_all_filters$exon != 'X18') | (XrL4_all_tHDR_pos_merge_all_filters$conseq == 'NON_SYNONYMOUS' & XrL4_all_tHDR_pos_merge_all_filters$c_hdr_snv == 'True' & XrL4_all_tHDR_pos_merge_all_filters$exon != 'X18') | (XrL4_all_tHDR_pos_merge_all_filters$conseq == 'SPLICE_SITE' & XrL4_all_tHDR_pos_merge_all_filters$c_hdr_snv == 'True' & XrL4_all_tHDR_pos_merge_all_filters$CDSpos != 'NA' & XrL4_all_tHDR_pos_merge_all_filters$exon != 'X18')),]
RNA_intervals <- seq(-5,2,length.out=71)
LOF_SNV_fractions <- c()
SNVs_below_RNA_thresh_count <- c()
for (x in RNA_intervals){
snvs_below <- which(log2(XrL4_all_tHDR_pos_merge_all_filters_rna_syn_mis_spl$tHDR_rna_pre_ratio_rL41rL42_mean_synnorm) <= x)
count_snvs_below <- length(snvs_below)
SNVs_below_RNA_thresh_count <- append(SNVs_below_RNA_thresh_count,count_snvs_below)
low_fit_count <- length(which(XrL4_all_tHDR_pos_merge_all_filters_rna_syn_mis_spl[snvs_below,'rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean'] <= -1.25)) #threshold for fitness arbitrarily set here as rough bimodal point
LOF_SNV_fractions <- append(LOF_SNV_fractions,low_fit_count/count_snvs_below)
}
rna_fitness_df <- data.frame(fraction_snvs_LOF = LOF_SNV_fractions, log2_rna_expression_threshold = RNA_intervals, snvs_below_RNA_thresh = SNVs_below_RNA_thresh_count)
#plot showing percent SNVs tolerated below given RNA level
ggplot(rna_fitness_df)+geom_point(aes(x=log2_rna_expression_threshold,y=1-fraction_snvs_LOF,size=SNVs_below_RNA_thresh_count))
#comparison of different enrichment score metrics across all exons:
XrL4_all_tHDR_pos_merge_all_filters_mis_clinvar_p_b <- XrL4_all_tHDR_pos_merge_all_filters[which((XrL4_all_tHDR_pos_merge_all_filters$conseq == 'NON_SYNONYMOUS' & XrL4_all_tHDR_pos_merge_all_filters$clinvar_simple == 'Pathogenic') | (XrL4_all_tHDR_pos_merge_all_filters$conseq == 'NON_SYNONYMOUS' & XrL4_all_tHDR_pos_merge_all_filters$clinvar_simple == 'Benign')),]
ggplot(XrL4_all_tHDR_pos_merge_all_filters_mis_clinvar_p_b, aes(x=rL41rL42_tHDR_post_lib_loess_rL41rL42, fill=clinvar_simple))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
ggplot(XrL4_all_tHDR_pos_merge_all_filters_mis_clinvar_p_b, aes(x=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean, fill=clinvar_simple))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
XrL4_all_tHDR_pos_merge_all_filters_clinvar_p_b <- XrL4_all_tHDR_pos_merge_all_filters[which( XrL4_all_tHDR_pos_merge_all_filters$clinvar_simple == 'Pathogenic' | XrL4_all_tHDR_pos_merge_all_filters$conseq == 'NON_SYNONYMOUS' & XrL4_all_tHDR_pos_merge_all_filters$clinvar_simple == 'Benign'),]
ggplot(XrL4_all_tHDR_pos_merge_all_filters_clinvar_p_b, aes(x=rL41rL42_tHDR_post_lib_loess_rL41rL42, fill=clinvar_simple,color=conseq))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
ggplot(XrL4_all_tHDR_pos_merge_all_filters_clinvar_p_b, aes(x=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean, fill=clinvar_simple,color=conseq))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
XrL4_all_tHDR_pos_merge_all_filters_clinvar_p_b_lp_lb <- XrL4_all_tHDR_pos_merge_all_filters[which( XrL4_all_tHDR_pos_merge_all_filters$clinvar_simple == 'Pathogenic' | XrL4_all_tHDR_pos_merge_all_filters$clinvar_simple == 'Benign' | XrL4_all_tHDR_pos_merge_all_filters$clinvar_simple == 'Likely pathogenic' | XrL4_all_tHDR_pos_merge_all_filters$clinvar_simple == 'Likely benign'),]
clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters_clinvar_p_b_lp_lb <- XrL4_all_tHDR_pos_merge_all_filters_clinvar_p_b_lp_lb
clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters_clinvar_p_b_lp_lb$clinvar_simple <- factor(clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters_clinvar_p_b_lp_lb$clinvar_simple, levels = c("Pathogenic","Likely pathogenic","Likely benign","Benign","Uncertain significance","Conflicting interpretations of pathogenicity","absent","REF"))
ggplot(clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters_clinvar_p_b_lp_lb, aes(x=rL41rL42_tHDR_post_lib_loess_rL41rL42, fill=clinvar_simple))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
ggplot(clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters_clinvar_p_b_lp_lb, aes(x=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean, fill=clinvar_simple))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
ggplot(clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters_clinvar_p_b_lp_lb, aes(x=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean, fill=clinvar_simple,color=conseq))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
#all missense variants ordered by clinvar
XrL4_all_tHDR_pos_merge_all_filters_mis <- XrL4_all_tHDR_pos_merge_all_filters[which((XrL4_all_tHDR_pos_merge_all_filters$conseq == 'NON_SYNONYMOUS')),]
clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters_mis <- XrL4_all_tHDR_pos_merge_all_filters_mis
clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters_mis$clinvar_simple <- factor(clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters_mis$clinvar_simple, levels = c("Pathogenic","Likely pathogenic","Likely benign","Benign","Uncertain significance","Conflicting interpretations of pathogenicity","absent","REF"))
ggplot(clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters_mis, aes(x=rL41rL42_tHDR_post_lib_loess_rL41rL42, fill=clinvar_simple))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = 'None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
ggplot(clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters_mis, aes(x=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean, fill=clinvar_simple))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = 'None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
#all variants ordered by ClinVar annotation
clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters <- XrL4_all_tHDR_pos_merge_all_filters
clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters$clinvar_simple <- factor(clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters$clinvar_simple, levels = c("Pathogenic","Likely pathogenic","Likely benign","Benign","Uncertain significance","Conflicting interpretations of pathogenicity","absent","REF"))
ggplot(clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters, aes(x=rL41rL42_tHDR_post_lib_loess_rL41rL42, fill=clinvar_simple))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
ggplot(clinvar_ord_XrL4_all_tHDR_pos_merge_all_filters, aes(x=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean, fill=clinvar_simple))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
##NEED TO FILTER THESE CALL SETS IN FINAL ANALYSIS BASED ON GLOBAL FILTERS ABOVE, and perform global min/max normalization and scale each exon by median nonsense depletion across all.
X2rL4_final_df <- XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$exon == 'X2'),]
X3rL4_final_df <- XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$exon == 'X3'),]
X4rL4_final_df <- XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$exon == 'X4'),]
X5rL4_final_df <- XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$exon == 'X5'),]
X15rL4_final_df <- XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$exon == 'X15'),]
X16rL4_final_df <- XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$exon == 'X16'),]
X17rL4_final_df <- XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$exon == 'X17'),]
X18rL4_final_df <- XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$exon == 'X18'),]
X19rL4_final_df <- XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$exon == 'X19'),]
X20rL4_final_df <- XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$exon == 'X20'),]
X21rL4_final_df <- XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$exon == 'X21'),]
X22rL4_final_df <- XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$exon == 'X22'),]
X23rL4_final_df <- XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$exon == 'X23'),]
X2rL4_final_exon_df <- X2rL4_final_df[which(X2rL4_final_df[,'cDNApos'] !='REF' & X2rL4_final_df[,'cDNApos']!='NA'),]
X3rL4_final_exon_df <- X3rL4_final_df[which(X3rL4_final_df[,'cDNApos'] !='REF' & X3rL4_final_df[,'cDNApos']!='NA'),]
X4rL4_final_exon_df <- X4rL4_final_df[which(X4rL4_final_df[,'cDNApos'] !='REF' & X4rL4_final_df[,'cDNApos']!='NA'),]
X5rL4_final_exon_df <- X5rL4_final_df[which(X5rL4_final_df[,'cDNApos'] !='REF' & X5rL4_final_df[,'cDNApos']!='NA'),]
X15rL4_final_exon_df <- X15rL4_final_df[which(X15rL4_final_df[,'cDNApos'] !='REF' & X15rL4_final_df[,'cDNApos']!='NA'),]
X16rL4_final_exon_df <- X16rL4_final_df[which(X16rL4_final_df[,'cDNApos'] !='REF' & X16rL4_final_df[,'cDNApos']!='NA'),]
X17rL4_final_exon_df <- X17rL4_final_df[which(X17rL4_final_df[,'cDNApos'] !='REF' & X17rL4_final_df[,'cDNApos']!='NA'),]
X18rL4_final_exon_df <- X18rL4_final_df[which(X18rL4_final_df[,'cDNApos'] !='REF' & X18rL4_final_df[,'cDNApos']!='NA'),]
X19rL4_final_exon_df <- X19rL4_final_df[which(X19rL4_final_df[,'cDNApos'] !='REF' & X19rL4_final_df[,'cDNApos']!='NA'),]
X20rL4_final_exon_df <- X20rL4_final_df[which(X20rL4_final_df[,'cDNApos'] !='REF' & X20rL4_final_df[,'cDNApos']!='NA'),]
X21rL4_final_exon_df <- X21rL4_final_df[which(X21rL4_final_df[,'cDNApos'] !='REF' & X21rL4_final_df[,'cDNApos']!='NA'),]
X22rL4_final_exon_df <- X22rL4_final_df[which(X22rL4_final_df[,'cDNApos'] !='REF' & X22rL4_final_df[,'cDNApos']!='NA'),]
X23rL4_final_exon_df <- X23rL4_final_df[which(X23rL4_final_df[,'cDNApos'] !='REF' & X23rL4_final_df[,'cDNApos']!='NA'),]
#plots for RNA analysis!
require(gridExtra)
#X2
X2_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos <- ggplot(X2rL4_final_exon_df, aes(y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour=conseq),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white')) +theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 fitness score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_blank(), axis.title.x = element_blank(),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
X2_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos <- ggplot(X2rL4_final_exon_df, aes(y=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour='Gray',alpha=0.15),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 RNA')+xlab('CDS position')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
grid.arrange(X2_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos, X2_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos, nrow=2)
#same plots with cDNA position instead of CDSpos
X2_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_cDNA_pos <- ggplot(X2rL4_final_exon_df, aes(y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,x=as.numeric(as.character(cDNApos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,xend=as.numeric(as.character(cDNApos))+as.numeric(as.character(CDSpos_nudge)),colour=conseq),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white')) +theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 fitness score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_blank(), axis.title.x = element_blank(),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
X2_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_cDNA_pos <- ggplot(X2rL4_final_exon_df, aes(y=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),x=as.numeric(as.character(cDNApos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),xend=as.numeric(as.character(cDNApos))+as.numeric(as.character(CDSpos_nudge)),colour='Gray',alpha=0.15),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 RNA')+xlab('cDNA position')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
grid.arrange(X2_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_cDNA_pos, X2_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_cDNA_pos, nrow=2)
#rna_pre ratio across replicates:
ggplot(X2rL4_final_exon_df, aes(x=log2(tHDR_rna_pre_ratio_synnorm),y=log2(rL42_tHDR_rna_pre_ratio_synnorm)))+geom_point(aes(colour=conseq),size=1)+geom_abline(intercept=0,slope=1,lty=2)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Rep. 2 rna/pre')+xlab('Rep. 1 rna/pre')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL41
ggplot(X2rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq),y=log2(tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL42
ggplot(X2rL4_final_exon_df, aes(x=log2(rL42_tHDR_pre_pseudo_freq),y=log2(rL42_tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#average rna vs. average pre
ggplot(X2rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq/2+rL42_tHDR_pre_pseudo_freq/2),y=log2(tHDR_rna_pseudo_freq/2+rL42_tHDR_rna_pseudo_freq/2)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#which correlates best to pre-ratios?
cor(log2(X2rL4_final_exon_df$tHDR_pre_pseudo_freq),log2(X2rL4_final_exon_df$tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X2rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq),log2(X2rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X2rL4_final_exon_df$tHDR_pre_pseudo_freq/2+X2rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq/2),log2(X2rL4_final_exon_df$tHDR_rna_pseudo_freq/2+X2rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq/2),method='spearman')
#X3
X3_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos <- ggplot(X3rL4_final_exon_df, aes(y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour=conseq),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white')) +theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 fitness score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_blank(), axis.title.x = element_blank(),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
X3_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos <- ggplot(X3rL4_final_exon_df, aes(y=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour='Gray',alpha=0.15),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 RNA')+xlab('CDS position')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
grid.arrange(X3_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos, X3_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos, nrow=2)
#rna_pre ratio across replicates:
ggplot(X3rL4_final_exon_df, aes(x=log2(tHDR_rna_pre_ratio_synnorm),y=log2(rL42_tHDR_rna_pre_ratio_synnorm)))+geom_point(aes(colour=conseq),size=1)+geom_abline(intercept=0,slope=1,lty=2)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Rep. 2 rna/pre')+xlab('Rep. 1 rna/pre')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL41
ggplot(X3rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq),y=log2(tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL42
ggplot(X3rL4_final_exon_df, aes(x=log2(rL42_tHDR_pre_pseudo_freq),y=log2(rL42_tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#average rna vs. average pre
ggplot(X3rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq/2+rL42_tHDR_pre_pseudo_freq/2),y=log2(tHDR_rna_pseudo_freq/2+rL42_tHDR_rna_pseudo_freq/2)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#which correlates best to pre-ratios?
cor(log2(X3rL4_final_exon_df$tHDR_pre_pseudo_freq),log2(X3rL4_final_exon_df$tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X3rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq),log2(X3rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X3rL4_final_exon_df$tHDR_pre_pseudo_freq/2+X3rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq/2),log2(X3rL4_final_exon_df$tHDR_rna_pseudo_freq/2+X3rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq/2),method='spearman')
#X4
X4_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos <- ggplot(X4rL4_final_exon_df, aes(y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour=conseq),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white')) +theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 fitness score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_blank(), axis.title.x = element_blank(),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
X4_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos <- ggplot(X4rL4_final_exon_df, aes(y=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour='Gray',alpha=0.15),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 RNA')+xlab('CDS position')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
grid.arrange(X4_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos, X4_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos, nrow=2)
#rna_pre ratio across replicates:
ggplot(X4rL4_final_exon_df, aes(x=log2(tHDR_rna_pre_ratio_synnorm),y=log2(rL42_tHDR_rna_pre_ratio_synnorm)))+geom_point(aes(colour=conseq),size=1)+geom_abline(intercept=0,slope=1,lty=2)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Rep. 2 rna/pre')+xlab('Rep. 1 rna/pre')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL41
ggplot(X4rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq),y=log2(tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL42
ggplot(X4rL4_final_exon_df, aes(x=log2(rL42_tHDR_pre_pseudo_freq),y=log2(rL42_tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#average rna vs. average pre
ggplot(X4rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq/2+rL42_tHDR_pre_pseudo_freq/2),y=log2(tHDR_rna_pseudo_freq/2+rL42_tHDR_rna_pseudo_freq/2)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#which correlates best to pre-ratios?
cor(log2(X4rL4_final_exon_df$tHDR_pre_pseudo_freq),log2(X4rL4_final_exon_df$tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X4rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq),log2(X4rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X4rL4_final_exon_df$tHDR_pre_pseudo_freq/2+X4rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq/2),log2(X4rL4_final_exon_df$tHDR_rna_pseudo_freq/2+X4rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq/2),method='spearman')
#X5
X5_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos <- ggplot(X5rL4_final_exon_df, aes(y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour=conseq),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white')) +theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 fitness score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_blank(), axis.title.x = element_blank(),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
X5_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos <- ggplot(X5rL4_final_exon_df, aes(y=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour='Gray',alpha=0.15),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 RNA')+xlab('CDS position')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
grid.arrange(X5_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos, X5_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos, nrow=2)
#rna_pre ratio across replicates:
ggplot(X5rL4_final_exon_df, aes(x=log2(tHDR_rna_pre_ratio_synnorm),y=log2(rL42_tHDR_rna_pre_ratio_synnorm)))+geom_point(aes(colour=conseq),size=1)+geom_abline(intercept=0,slope=1,lty=2)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Rep. 2 rna/pre')+xlab('Rep. 1 rna/pre')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL41
ggplot(X5rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq),y=log2(tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL42
ggplot(X5rL4_final_exon_df, aes(x=log2(rL42_tHDR_pre_pseudo_freq),y=log2(rL42_tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#average rna vs. average pre
ggplot(X5rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq/2+rL42_tHDR_pre_pseudo_freq/2),y=log2(tHDR_rna_pseudo_freq/2+rL42_tHDR_rna_pseudo_freq/2)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#which correlates best to pre-ratios?
cor(log2(X5rL4_final_exon_df$tHDR_pre_pseudo_freq),log2(X5rL4_final_exon_df$tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X5rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq),log2(X5rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X5rL4_final_exon_df$tHDR_pre_pseudo_freq/2+X5rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq/2),log2(X5rL4_final_exon_df$tHDR_rna_pseudo_freq/2+X5rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq/2),method='spearman')
#X15
X15_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos <- ggplot(X15rL4_final_exon_df, aes(y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour=conseq),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white')) +theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 fitness score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_blank(), axis.title.x = element_blank(),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
X15_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos <- ggplot(X15rL4_final_exon_df, aes(y=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour='Gray',alpha=0.15),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 RNA')+xlab('CDS position')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
grid.arrange(X15_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos, X15_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos, nrow=2)
#rna_pre ratio across replicates:
ggplot(X15rL4_final_exon_df, aes(x=log2(tHDR_rna_pre_ratio_synnorm),y=log2(rL42_tHDR_rna_pre_ratio_synnorm)))+geom_point(aes(colour=conseq),size=1)+geom_abline(intercept=0,slope=1,lty=2)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Rep. 2 rna/pre')+xlab('Rep. 1 rna/pre')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL41
ggplot(X15rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq),y=log2(tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL42
ggplot(X15rL4_final_exon_df, aes(x=log2(rL42_tHDR_pre_pseudo_freq),y=log2(rL42_tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#average rna vs. average pre
ggplot(X15rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq/2+rL42_tHDR_pre_pseudo_freq/2),y=log2(tHDR_rna_pseudo_freq/2+rL42_tHDR_rna_pseudo_freq/2)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#which correlates best to pre-ratios?
cor(log2(X15rL4_final_exon_df$tHDR_pre_pseudo_freq),log2(X15rL4_final_exon_df$tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X15rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq),log2(X15rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X15rL4_final_exon_df$tHDR_pre_pseudo_freq/2+X15rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq/2),log2(X15rL4_final_exon_df$tHDR_rna_pseudo_freq/2+X15rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq/2),method='spearman')
#X16
X16_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos <- ggplot(X16rL4_final_exon_df, aes(y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour=conseq),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white')) +theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 fitness score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_blank(), axis.title.x = element_blank(),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')+scale_y_continuous(breaks=c(-2,-1,0))
X16_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos <- ggplot(X16rL4_final_exon_df, aes(y=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour='Gray',alpha=0.15),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 RNA')+xlab('CDS position')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
grid.arrange(X16_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos, X16_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos, nrow=2)
#rna_pre ratio across replicates:
ggplot(X16rL4_final_exon_df, aes(x=log2(tHDR_rna_pre_ratio_synnorm),y=log2(rL42_tHDR_rna_pre_ratio_synnorm)))+geom_point(aes(colour=conseq),size=1)+geom_abline(intercept=0,slope=1,lty=2)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Rep. 2 rna/pre')+xlab('Rep. 1 rna/pre')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL41
ggplot(X16rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq),y=log2(tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL42
ggplot(X16rL4_final_exon_df, aes(x=log2(rL42_tHDR_pre_pseudo_freq),y=log2(rL42_tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#average rna vs. average pre
ggplot(X16rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq/2+rL42_tHDR_pre_pseudo_freq/2),y=log2(tHDR_rna_pseudo_freq/2+rL42_tHDR_rna_pseudo_freq/2)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#which correlates best to pre-ratios?
cor(log2(X16rL4_final_exon_df$tHDR_pre_pseudo_freq),log2(X16rL4_final_exon_df$tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X16rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq),log2(X16rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X16rL4_final_exon_df$tHDR_pre_pseudo_freq/2+X16rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq/2),log2(X16rL4_final_exon_df$tHDR_rna_pseudo_freq/2+X16rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq/2),method='spearman')
#X17
X17_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos <- ggplot(X17rL4_final_exon_df, aes(y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour=conseq),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white')) +theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 fitness score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_blank(), axis.title.x = element_blank(),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
X17_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos <- ggplot(X17rL4_final_exon_df, aes(y=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour='Gray',alpha=0.15),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 RNA')+xlab('CDS position')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
grid.arrange(X17_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos, X17_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos, nrow=2)
#rna_pre ratio across replicates:
ggplot(X17rL4_final_exon_df, aes(x=log2(tHDR_rna_pre_ratio_synnorm),y=log2(rL42_tHDR_rna_pre_ratio_synnorm)))+geom_point(aes(colour=conseq),size=1)+geom_abline(intercept=0,slope=1,lty=2)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Rep. 2 rna/pre')+xlab('Rep. 1 rna/pre')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL41
ggplot(X17rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq),y=log2(tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL42
ggplot(X17rL4_final_exon_df, aes(x=log2(rL42_tHDR_pre_pseudo_freq),y=log2(rL42_tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#average rna vs. average pre
ggplot(X17rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq/2+rL42_tHDR_pre_pseudo_freq/2),y=log2(tHDR_rna_pseudo_freq/2+rL42_tHDR_rna_pseudo_freq/2)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#which correlates best to pre-ratios?
cor(log2(X17rL4_final_exon_df$tHDR_pre_pseudo_freq),log2(X17rL4_final_exon_df$tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X17rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq),log2(X17rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X17rL4_final_exon_df$tHDR_pre_pseudo_freq/2+X17rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq/2),log2(X17rL4_final_exon_df$tHDR_rna_pseudo_freq/2+X17rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq/2),method='spearman')
#X18
X18_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos <- ggplot(X18rL4_final_exon_df, aes(y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour=conseq),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white')) +theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 fitness score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_blank(), axis.title.x = element_blank(),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
X18_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos <- ggplot(X18rL4_final_exon_df, aes(y=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour='Gray',alpha=0.15),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 RNA')+xlab('CDS position')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
grid.arrange(X18_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos, X18_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos, nrow=2)
#rna_pre ratio across replicates:
ggplot(X18rL4_final_exon_df, aes(x=log2(tHDR_rna_pre_ratio_synnorm),y=log2(rL42_tHDR_rna_pre_ratio_synnorm)))+geom_point(aes(colour=conseq),size=1)+geom_abline(intercept=0,slope=1,lty=2)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Rep. 2 rna/pre')+xlab('Rep. 1 rna/pre')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL41
ggplot(X18rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq),y=log2(tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL42
ggplot(X18rL4_final_exon_df, aes(x=log2(rL42_tHDR_pre_pseudo_freq),y=log2(rL42_tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#average rna vs. average pre
ggplot(X18rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq/2+rL42_tHDR_pre_pseudo_freq/2),y=log2(tHDR_rna_pseudo_freq/2+rL42_tHDR_rna_pseudo_freq/2)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#which correlates best to pre-ratios?
cor(log2(X18rL4_final_exon_df$tHDR_pre_pseudo_freq),log2(X18rL4_final_exon_df$tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X18rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq),log2(X18rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X18rL4_final_exon_df$tHDR_pre_pseudo_freq/2+X18rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq/2),log2(X18rL4_final_exon_df$tHDR_rna_pseudo_freq/2+X18rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq/2),method='spearman')
#X19
X19_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos <- ggplot(X19rL4_final_exon_df, aes(y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour=conseq),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white')) +theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 fitness score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_blank(), axis.title.x = element_blank(),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
#modified code to scale limits -- excludes the PAM points
X19_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos <- ggplot(X19rL4_final_exon_df, aes(y=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour='Gray',alpha=0.15),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 RNA')+xlab('CDS position')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')+scale_y_continuous(breaks=c(-3,-2,-1,0,1), limits=c(-3,2))
grid.arrange(X19_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos, X19_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos, nrow=2)
#rna_pre ratio across replicates:
ggplot(X19rL4_final_exon_df, aes(x=log2(tHDR_rna_pre_ratio_synnorm),y=log2(rL42_tHDR_rna_pre_ratio_synnorm)))+geom_point(aes(colour=conseq),size=1)+geom_abline(intercept=0,slope=1,lty=2)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Rep. 2 rna/pre')+xlab('Rep. 1 rna/pre')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL41
ggplot(X19rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq),y=log2(tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL42
ggplot(X19rL4_final_exon_df, aes(x=log2(rL42_tHDR_pre_pseudo_freq),y=log2(rL42_tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#average rna vs. average pre
ggplot(X19rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq/2+rL42_tHDR_pre_pseudo_freq/2),y=log2(tHDR_rna_pseudo_freq/2+rL42_tHDR_rna_pseudo_freq/2)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#which correlates best to pre-ratios?
cor(log2(X19rL4_final_exon_df$tHDR_pre_pseudo_freq),log2(X19rL4_final_exon_df$tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X19rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq),log2(X19rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X19rL4_final_exon_df$tHDR_pre_pseudo_freq/2+X19rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq/2),log2(X19rL4_final_exon_df$tHDR_rna_pseudo_freq/2+X19rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq/2),method='spearman')
#X20
X20_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos <- ggplot(X20rL4_final_exon_df, aes(y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour=conseq),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white')) +theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 fitness score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_blank(), axis.title.x = element_blank(),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
X20_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos <- ggplot(X20rL4_final_exon_df, aes(y=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour='Gray',alpha=0.15),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 RNA')+xlab('CDS position')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
grid.arrange(X20_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos, X20_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos, nrow=2)
#rna_pre ratio across replicates:
ggplot(X20rL4_final_exon_df, aes(x=log2(tHDR_rna_pre_ratio_synnorm),y=log2(rL42_tHDR_rna_pre_ratio_synnorm)))+geom_point(aes(colour=conseq),size=1)+geom_abline(intercept=0,slope=1,lty=2)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Rep. 2 rna/pre')+xlab('Rep. 1 rna/pre')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL41
ggplot(X20rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq),y=log2(tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL42
ggplot(X20rL4_final_exon_df, aes(x=log2(rL42_tHDR_pre_pseudo_freq),y=log2(rL42_tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#average rna vs. average pre
ggplot(X20rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq/2+rL42_tHDR_pre_pseudo_freq/2),y=log2(tHDR_rna_pseudo_freq/2+rL42_tHDR_rna_pseudo_freq/2)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#which correlates best to pre-ratios?
cor(log2(X20rL4_final_exon_df$tHDR_pre_pseudo_freq),log2(X20rL4_final_exon_df$tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X20rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq),log2(X20rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X20rL4_final_exon_df$tHDR_pre_pseudo_freq/2+X20rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq/2),log2(X20rL4_final_exon_df$tHDR_rna_pseudo_freq/2+X20rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq/2),method='spearman')
#X21
X21_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos <- ggplot(X21rL4_final_exon_df, aes(y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour=conseq),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white')) +theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 fitness score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_blank(), axis.title.x = element_blank(),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
X21_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos <- ggplot(X21rL4_final_exon_df, aes(y=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour='Gray',alpha=0.15),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 RNA')+xlab('CDS position')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
grid.arrange(X21_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos, X21_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos, nrow=2)
#rna_pre ratio across replicates:
ggplot(X21rL4_final_exon_df, aes(x=log2(tHDR_rna_pre_ratio_synnorm),y=log2(rL42_tHDR_rna_pre_ratio_synnorm)))+geom_point(aes(colour=conseq),size=1)+geom_abline(intercept=0,slope=1,lty=2)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Rep. 2 rna/pre')+xlab('Rep. 1 rna/pre')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL41
ggplot(X21rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq),y=log2(tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL42
ggplot(X21rL4_final_exon_df, aes(x=log2(rL42_tHDR_pre_pseudo_freq),y=log2(rL42_tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#average rna vs. average pre
ggplot(X21rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq/2+rL42_tHDR_pre_pseudo_freq/2),y=log2(tHDR_rna_pseudo_freq/2+rL42_tHDR_rna_pseudo_freq/2)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#which correlates best to pre-ratios?
cor(log2(X21rL4_final_exon_df$tHDR_pre_pseudo_freq),log2(X21rL4_final_exon_df$tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X21rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq),log2(X21rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X21rL4_final_exon_df$tHDR_pre_pseudo_freq/2+X21rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq/2),log2(X21rL4_final_exon_df$tHDR_rna_pseudo_freq/2+X21rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq/2),method='spearman')
#X22
X22_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos <- ggplot(X22rL4_final_exon_df, aes(y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour=conseq),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white')) +theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 fitness score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_blank(), axis.title.x = element_blank(),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
X22_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos <- ggplot(X22rL4_final_exon_df, aes(y=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour='Gray',alpha=0.15),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 RNA')+xlab('CDS position')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
grid.arrange(X22_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos, X22_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos, nrow=2)
#rna_pre ratio across replicates:
ggplot(X22rL4_final_exon_df, aes(x=log2(tHDR_rna_pre_ratio_synnorm),y=log2(rL42_tHDR_rna_pre_ratio_synnorm)))+geom_point(aes(colour=conseq),size=1)+geom_abline(intercept=0,slope=1,lty=2)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Rep. 2 rna/pre')+xlab('Rep. 1 rna/pre')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL41
ggplot(X22rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq),y=log2(tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL42
ggplot(X22rL4_final_exon_df, aes(x=log2(rL42_tHDR_pre_pseudo_freq),y=log2(rL42_tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#average rna vs. average pre
ggplot(X22rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq/2+rL42_tHDR_pre_pseudo_freq/2),y=log2(tHDR_rna_pseudo_freq/2+rL42_tHDR_rna_pseudo_freq/2)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#which correlates best to pre-ratios?
cor(log2(X22rL4_final_exon_df$tHDR_pre_pseudo_freq),log2(X22rL4_final_exon_df$tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X22rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq),log2(X22rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X22rL4_final_exon_df$tHDR_pre_pseudo_freq/2+X22rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq/2),log2(X22rL4_final_exon_df$tHDR_rna_pseudo_freq/2+X22rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq/2),method='spearman')
#X23
X23_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos <- ggplot(X23rL4_final_exon_df, aes(y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour=conseq),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white')) +theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 fitness score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_blank(), axis.title.x = element_blank(),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
X23_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos <- ggplot(X23rL4_final_exon_df, aes(y=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),x=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge))))+geom_segment(aes(yend=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm),xend=as.numeric(as.character(CDSpos))+as.numeric(as.character(CDSpos_nudge)),colour='Gray',alpha=0.15),yend=0)+geom_point(aes(colour=conseq),alpha=1,size=1.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = "none")+ylab('Log-2 RNA')+xlab('CDS position')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)+geom_text(aes(label=rev_comp),hjust=0.5, vjust=0.5, size = 1.4,colour='White')
grid.arrange(X23_tHDR_post_lib_loess_e.sns_rL41rL42_mean_by_pos, X23_tHDR_rna_pre_ratio_rL41rL42_mean_synnorm_by_pos, nrow=2)
#rna_pre ratio across replicates:
ggplot(X23rL4_final_exon_df, aes(x=log2(tHDR_rna_pre_ratio_synnorm),y=log2(rL42_tHDR_rna_pre_ratio_synnorm)))+geom_point(aes(colour=conseq),size=1)+geom_abline(intercept=0,slope=1,lty=2)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Rep. 2 rna/pre')+xlab('Rep. 1 rna/pre')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL41
ggplot(X23rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq),y=log2(tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#rna vs. pre by replicate, rL42
ggplot(X23rL4_final_exon_df, aes(x=log2(rL42_tHDR_pre_pseudo_freq),y=log2(rL42_tHDR_rna_pseudo_freq)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#average rna vs. average pre
ggplot(X23rL4_final_exon_df, aes(x=log2(tHDR_pre_pseudo_freq/2+rL42_tHDR_pre_pseudo_freq/2),y=log2(tHDR_rna_pseudo_freq/2+rL42_tHDR_rna_pseudo_freq/2)))+geom_point(aes(colour=conseq,size=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('RNA freq. (log-2)')+xlab('DNA freq. D5 (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#which correlates best to pre-ratios?
cor(log2(X23rL4_final_exon_df$tHDR_pre_pseudo_freq),log2(X23rL4_final_exon_df$tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X23rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq),log2(X23rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq),method='spearman')
cor(log2(X23rL4_final_exon_df$tHDR_pre_pseudo_freq/2+X23rL4_final_exon_df$rL42_tHDR_pre_pseudo_freq/2),log2(X23rL4_final_exon_df$tHDR_rna_pseudo_freq/2+X23rL4_final_exon_df$rL42_tHDR_rna_pseudo_freq/2),method='spearman')
### end of RNA section
# add a new column to:
#compute the exonic position of each variant (make a string with negative or positive annotation instead of NA --> will have to order this to be able to plot (plot one order, and then add vector of strings for labels, maybe?)
# clinvar nucleotide positions / other categorical translations between data sets...
# conseq without 'splice' but either exonic or intronic
#FINAL NORMALIZATION SECTION (SHOULD COME BEFORE RNA PLOTS above)
# min/max normalization on filtered data...
#use conseq or conseq_simp to not include splice?
#general formula: b/(a-c)*(A-C) where b is snv, a is exon med syn, c is exon med nonsense, A is global med syn, C is global med nonsense.
#do this first for the repl. mean fits and scores :
XrL4_all_tHDR_post_lib_loess_rL41rL42_median_syn <- median(XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$conseq == 'SYNONYMOUS'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
XrL4_all_tHDR_post_lib_loess_rL41rL42_median_ns <- median(XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$conseq == 'STOP_GAINED'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X2_tHDR_post_lib_loess_rL41rL42_median_syn <- median(X2rL4_final_df[which(X2rL4_final_df$conseq == 'SYNONYMOUS'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X2_tHDR_post_lib_loess_rL41rL42_median_ns <- median(X2rL4_final_df[which(X2rL4_final_df$conseq == 'STOP_GAINED'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X2rL4_final_df$tHDR_post_lib_loess_rL41rL42_sns_norm <- (X2rL4_final_df$rL41rL42_tHDR_post_lib_loess_rL41rL42/(X2_tHDR_post_lib_loess_rL41rL42_median_syn-X2_tHDR_post_lib_loess_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_rL41rL42_median_ns)
X3_tHDR_post_lib_loess_rL41rL42_median_syn <- median(X3rL4_final_df[which(X3rL4_final_df$conseq == 'SYNONYMOUS'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X3_tHDR_post_lib_loess_rL41rL42_median_ns <- median(X3rL4_final_df[which(X3rL4_final_df$conseq == 'STOP_GAINED'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X3rL4_final_df$tHDR_post_lib_loess_rL41rL42_sns_norm <- (X3rL4_final_df$rL41rL42_tHDR_post_lib_loess_rL41rL42/(X3_tHDR_post_lib_loess_rL41rL42_median_syn-X3_tHDR_post_lib_loess_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_rL41rL42_median_ns)
X4_tHDR_post_lib_loess_rL41rL42_median_syn <- median(X4rL4_final_df[which(X4rL4_final_df$conseq == 'SYNONYMOUS'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X4_tHDR_post_lib_loess_rL41rL42_median_ns <- median(X4rL4_final_df[which(X4rL4_final_df$conseq == 'STOP_GAINED'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X4rL4_final_df$tHDR_post_lib_loess_rL41rL42_sns_norm <- (X4rL4_final_df$rL41rL42_tHDR_post_lib_loess_rL41rL42/(X4_tHDR_post_lib_loess_rL41rL42_median_syn-X4_tHDR_post_lib_loess_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_rL41rL42_median_ns)
X5_tHDR_post_lib_loess_rL41rL42_median_syn <- median(X5rL4_final_df[which(X5rL4_final_df$conseq == 'SYNONYMOUS'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X5_tHDR_post_lib_loess_rL41rL42_median_ns <- median(X5rL4_final_df[which(X5rL4_final_df$conseq == 'STOP_GAINED'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X5rL4_final_df$tHDR_post_lib_loess_rL41rL42_sns_norm <- (X5rL4_final_df$rL41rL42_tHDR_post_lib_loess_rL41rL42/(X5_tHDR_post_lib_loess_rL41rL42_median_syn-X5_tHDR_post_lib_loess_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_rL41rL42_median_ns)
X15_tHDR_post_lib_loess_rL41rL42_median_syn <- median(X15rL4_final_df[which(X15rL4_final_df$conseq == 'SYNONYMOUS'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X15_tHDR_post_lib_loess_rL41rL42_median_ns <- median(X15rL4_final_df[which(X15rL4_final_df$conseq == 'STOP_GAINED'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X15rL4_final_df$tHDR_post_lib_loess_rL41rL42_sns_norm <- (X15rL4_final_df$rL41rL42_tHDR_post_lib_loess_rL41rL42/(X15_tHDR_post_lib_loess_rL41rL42_median_syn-X15_tHDR_post_lib_loess_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_rL41rL42_median_ns)
X16_tHDR_post_lib_loess_rL41rL42_median_syn <- median(X16rL4_final_df[which(X16rL4_final_df$conseq == 'SYNONYMOUS'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X16_tHDR_post_lib_loess_rL41rL42_median_ns <- median(X16rL4_final_df[which(X16rL4_final_df$conseq == 'STOP_GAINED'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X16rL4_final_df$tHDR_post_lib_loess_rL41rL42_sns_norm <- (X16rL4_final_df$rL41rL42_tHDR_post_lib_loess_rL41rL42/(X16_tHDR_post_lib_loess_rL41rL42_median_syn-X16_tHDR_post_lib_loess_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_rL41rL42_median_ns)
X17_tHDR_post_lib_loess_rL41rL42_median_syn <- median(X17rL4_final_df[which(X17rL4_final_df$conseq == 'SYNONYMOUS'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X17_tHDR_post_lib_loess_rL41rL42_median_ns <- median(X17rL4_final_df[which(X17rL4_final_df$conseq == 'STOP_GAINED'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X17rL4_final_df$tHDR_post_lib_loess_rL41rL42_sns_norm <- (X17rL4_final_df$rL41rL42_tHDR_post_lib_loess_rL41rL42/(X17_tHDR_post_lib_loess_rL41rL42_median_syn-X17_tHDR_post_lib_loess_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_rL41rL42_median_ns)
X18_tHDR_post_lib_loess_rL41rL42_median_syn <- median(X18rL4_final_df[which(X18rL4_final_df$conseq == 'SYNONYMOUS'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X18_tHDR_post_lib_loess_rL41rL42_median_ns <- median(X18rL4_final_df[which(X18rL4_final_df$conseq == 'STOP_GAINED'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X18rL4_final_df$tHDR_post_lib_loess_rL41rL42_sns_norm <- (X18rL4_final_df$rL41rL42_tHDR_post_lib_loess_rL41rL42/(X18_tHDR_post_lib_loess_rL41rL42_median_syn-X18_tHDR_post_lib_loess_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_rL41rL42_median_ns)
X19_tHDR_post_lib_loess_rL41rL42_median_syn <- median(X19rL4_final_df[which(X19rL4_final_df$conseq == 'SYNONYMOUS'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X19_tHDR_post_lib_loess_rL41rL42_median_ns <- median(X19rL4_final_df[which(X19rL4_final_df$conseq == 'STOP_GAINED'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X19rL4_final_df$tHDR_post_lib_loess_rL41rL42_sns_norm <- (X19rL4_final_df$rL41rL42_tHDR_post_lib_loess_rL41rL42/(X19_tHDR_post_lib_loess_rL41rL42_median_syn-X19_tHDR_post_lib_loess_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_rL41rL42_median_ns)
X20_tHDR_post_lib_loess_rL41rL42_median_syn <- median(X20rL4_final_df[which(X20rL4_final_df$conseq == 'SYNONYMOUS'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X20_tHDR_post_lib_loess_rL41rL42_median_ns <- median(X20rL4_final_df[which(X20rL4_final_df$conseq == 'STOP_GAINED'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X20rL4_final_df$tHDR_post_lib_loess_rL41rL42_sns_norm <- (X20rL4_final_df$rL41rL42_tHDR_post_lib_loess_rL41rL42/(X20_tHDR_post_lib_loess_rL41rL42_median_syn-X20_tHDR_post_lib_loess_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_rL41rL42_median_ns)
X21_tHDR_post_lib_loess_rL41rL42_median_syn <- median(X21rL4_final_df[which(X21rL4_final_df$conseq == 'SYNONYMOUS'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X21_tHDR_post_lib_loess_rL41rL42_median_ns <- median(X21rL4_final_df[which(X21rL4_final_df$conseq == 'STOP_GAINED'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X21rL4_final_df$tHDR_post_lib_loess_rL41rL42_sns_norm <- (X21rL4_final_df$rL41rL42_tHDR_post_lib_loess_rL41rL42/(X21_tHDR_post_lib_loess_rL41rL42_median_syn-X21_tHDR_post_lib_loess_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_rL41rL42_median_ns)
X22_tHDR_post_lib_loess_rL41rL42_median_syn <- median(X22rL4_final_df[which(X22rL4_final_df$conseq == 'SYNONYMOUS'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X22_tHDR_post_lib_loess_rL41rL42_median_ns <- median(X22rL4_final_df[which(X22rL4_final_df$conseq == 'STOP_GAINED'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X22rL4_final_df$tHDR_post_lib_loess_rL41rL42_sns_norm <- (X22rL4_final_df$rL41rL42_tHDR_post_lib_loess_rL41rL42/(X22_tHDR_post_lib_loess_rL41rL42_median_syn-X22_tHDR_post_lib_loess_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_rL41rL42_median_ns)
X23_tHDR_post_lib_loess_rL41rL42_median_syn <- median(X23rL4_final_df[which(X23rL4_final_df$conseq == 'SYNONYMOUS'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X23_tHDR_post_lib_loess_rL41rL42_median_ns <- median(X23rL4_final_df[which(X23rL4_final_df$conseq == 'STOP_GAINED'),c('rL41rL42_tHDR_post_lib_loess_rL41rL42')])
X23rL4_final_df$tHDR_post_lib_loess_rL41rL42_sns_norm <- (X23rL4_final_df$rL41rL42_tHDR_post_lib_loess_rL41rL42/(X23_tHDR_post_lib_loess_rL41rL42_median_syn-X23_tHDR_post_lib_loess_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_rL41rL42_median_ns)
#repeat the above for the e.sns replicate means... what does this look like keeping the replicate normalized scored separate?
XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn <- median(XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$conseq == 'SYNONYMOUS'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns <- median(XrL4_all_tHDR_pos_merge_all_filters[which(XrL4_all_tHDR_pos_merge_all_filters$conseq == 'STOP_GAINED'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X2_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn <- median(X2rL4_final_df[which(X2rL4_final_df$conseq == 'SYNONYMOUS'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X2_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns <- median(X2rL4_final_df[which(X2rL4_final_df$conseq == 'STOP_GAINED'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X2rL4_final_df$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- (X2rL4_final_df$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean/(X2_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X2_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X3_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn <- median(X3rL4_final_df[which(X3rL4_final_df$conseq == 'SYNONYMOUS'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X3_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns <- median(X3rL4_final_df[which(X3rL4_final_df$conseq == 'STOP_GAINED'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X3rL4_final_df$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- (X3rL4_final_df$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean/(X3_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X3_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X4_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn <- median(X4rL4_final_df[which(X4rL4_final_df$conseq == 'SYNONYMOUS'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X4_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns <- median(X4rL4_final_df[which(X4rL4_final_df$conseq == 'STOP_GAINED'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X4rL4_final_df$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- (X4rL4_final_df$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean/(X4_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X4_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X5_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn <- median(X5rL4_final_df[which(X5rL4_final_df$conseq == 'SYNONYMOUS'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X5_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns <- median(X5rL4_final_df[which(X5rL4_final_df$conseq == 'STOP_GAINED'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X5rL4_final_df$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- (X5rL4_final_df$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean/(X5_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X5_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X15_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn <- median(X15rL4_final_df[which(X15rL4_final_df$conseq == 'SYNONYMOUS'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X15_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns <- median(X15rL4_final_df[which(X15rL4_final_df$conseq == 'STOP_GAINED'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X15rL4_final_df$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- (X15rL4_final_df$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean/(X15_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X15_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X16_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn <- median(X16rL4_final_df[which(X16rL4_final_df$conseq == 'SYNONYMOUS'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X16_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns <- median(X16rL4_final_df[which(X16rL4_final_df$conseq == 'STOP_GAINED'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X16rL4_final_df$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- (X16rL4_final_df$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean/(X16_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X16_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X17_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn <- median(X17rL4_final_df[which(X17rL4_final_df$conseq == 'SYNONYMOUS'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X17_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns <- median(X17rL4_final_df[which(X17rL4_final_df$conseq == 'STOP_GAINED'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X17rL4_final_df$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- (X17rL4_final_df$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean/(X17_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X17_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X18_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn <- median(X18rL4_final_df[which(X18rL4_final_df$conseq == 'SYNONYMOUS'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X18_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns <- median(X18rL4_final_df[which(X18rL4_final_df$conseq == 'STOP_GAINED'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X18rL4_final_df$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- (X18rL4_final_df$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean/(X18_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X18_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X19_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn <- median(X19rL4_final_df[which(X19rL4_final_df$conseq == 'SYNONYMOUS'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X19_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns <- median(X19rL4_final_df[which(X19rL4_final_df$conseq == 'STOP_GAINED'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X19rL4_final_df$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- (X19rL4_final_df$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean/(X19_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X19_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X20_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn <- median(X20rL4_final_df[which(X20rL4_final_df$conseq == 'SYNONYMOUS'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X20_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns <- median(X20rL4_final_df[which(X20rL4_final_df$conseq == 'STOP_GAINED'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X20rL4_final_df$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- (X20rL4_final_df$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean/(X20_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X20_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X21_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn <- median(X21rL4_final_df[which(X21rL4_final_df$conseq == 'SYNONYMOUS'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X21_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns <- median(X21rL4_final_df[which(X21rL4_final_df$conseq == 'STOP_GAINED'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X21rL4_final_df$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- (X21rL4_final_df$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean/(X21_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X21_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X22_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn <- median(X22rL4_final_df[which(X22rL4_final_df$conseq == 'SYNONYMOUS'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X22_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns <- median(X22rL4_final_df[which(X22rL4_final_df$conseq == 'STOP_GAINED'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X22rL4_final_df$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- (X22rL4_final_df$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean/(X22_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X22_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X23_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn <- median(X23rL4_final_df[which(X23rL4_final_df$conseq == 'SYNONYMOUS'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X23_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns <- median(X23rL4_final_df[which(X23rL4_final_df$conseq == 'STOP_GAINED'),c('rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean')])
X23rL4_final_df$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- (X23rL4_final_df$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean/(X23_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X23_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
#make the same calculations on a per replicate score basis...:
#keeping all scalers as medians of replicate averages (i.e. global syn/ns and per exon syn/ns will be called from medians, but )
X2rL4_final_df$tHDR_post_lib_loess_e.sns_rL41_sns_norm <- (X2rL4_final_df$rL41_tHDR_post_lib_loess_rL41_e.sns/(X2_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X2_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X2rL4_final_df$tHDR_post_lib_loess_e.sns_rL42_sns_norm <- (X2rL4_final_df$rL42_tHDR_post_lib_loess_rL42_e.sns/(X2_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X2_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X3rL4_final_df$tHDR_post_lib_loess_e.sns_rL41_sns_norm <- (X3rL4_final_df$rL41_tHDR_post_lib_loess_rL41_e.sns/(X3_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X3_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X3rL4_final_df$tHDR_post_lib_loess_e.sns_rL42_sns_norm <- (X3rL4_final_df$rL42_tHDR_post_lib_loess_rL42_e.sns/(X3_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X3_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X4rL4_final_df$tHDR_post_lib_loess_e.sns_rL41_sns_norm <- (X4rL4_final_df$rL41_tHDR_post_lib_loess_rL41_e.sns/(X4_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X4_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X4rL4_final_df$tHDR_post_lib_loess_e.sns_rL42_sns_norm <- (X4rL4_final_df$rL42_tHDR_post_lib_loess_rL42_e.sns/(X4_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X4_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X5rL4_final_df$tHDR_post_lib_loess_e.sns_rL41_sns_norm <- (X5rL4_final_df$rL41_tHDR_post_lib_loess_rL41_e.sns/(X5_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X5_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X5rL4_final_df$tHDR_post_lib_loess_e.sns_rL42_sns_norm <- (X5rL4_final_df$rL42_tHDR_post_lib_loess_rL42_e.sns/(X5_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X5_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X15rL4_final_df$tHDR_post_lib_loess_e.sns_rL41_sns_norm <- (X15rL4_final_df$rL41_tHDR_post_lib_loess_rL41_e.sns/(X15_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X15_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X15rL4_final_df$tHDR_post_lib_loess_e.sns_rL42_sns_norm <- (X15rL4_final_df$rL42_tHDR_post_lib_loess_rL42_e.sns/(X15_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X15_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X16rL4_final_df$tHDR_post_lib_loess_e.sns_rL41_sns_norm <- (X16rL4_final_df$rL41_tHDR_post_lib_loess_rL41_e.sns/(X16_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X16_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X16rL4_final_df$tHDR_post_lib_loess_e.sns_rL42_sns_norm <- (X16rL4_final_df$rL42_tHDR_post_lib_loess_rL42_e.sns/(X16_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X16_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X17rL4_final_df$tHDR_post_lib_loess_e.sns_rL41_sns_norm <- (X17rL4_final_df$rL41_tHDR_post_lib_loess_rL41_e.sns/(X17_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X17_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X17rL4_final_df$tHDR_post_lib_loess_e.sns_rL42_sns_norm <- (X17rL4_final_df$rL42_tHDR_post_lib_loess_rL42_e.sns/(X17_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X17_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X18rL4_final_df$tHDR_post_lib_loess_e.sns_rL41_sns_norm <- (X18rL4_final_df$rL41_tHDR_post_lib_loess_rL41_e.sns/(X18_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X18_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X18rL4_final_df$tHDR_post_lib_loess_e.sns_rL42_sns_norm <- (X18rL4_final_df$rL42_tHDR_post_lib_loess_rL42_e.sns/(X18_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X18_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X19rL4_final_df$tHDR_post_lib_loess_e.sns_rL41_sns_norm <- (X19rL4_final_df$rL41_tHDR_post_lib_loess_rL41_e.sns/(X19_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X19_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X19rL4_final_df$tHDR_post_lib_loess_e.sns_rL42_sns_norm <- (X19rL4_final_df$rL42_tHDR_post_lib_loess_rL42_e.sns/(X19_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X19_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X20rL4_final_df$tHDR_post_lib_loess_e.sns_rL41_sns_norm <- (X20rL4_final_df$rL41_tHDR_post_lib_loess_rL41_e.sns/(X20_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X20_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X20rL4_final_df$tHDR_post_lib_loess_e.sns_rL42_sns_norm <- (X20rL4_final_df$rL42_tHDR_post_lib_loess_rL42_e.sns/(X20_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X20_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X21rL4_final_df$tHDR_post_lib_loess_e.sns_rL41_sns_norm <- (X21rL4_final_df$rL41_tHDR_post_lib_loess_rL41_e.sns/(X21_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X21_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X21rL4_final_df$tHDR_post_lib_loess_e.sns_rL42_sns_norm <- (X21rL4_final_df$rL42_tHDR_post_lib_loess_rL42_e.sns/(X21_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X21_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X22rL4_final_df$tHDR_post_lib_loess_e.sns_rL41_sns_norm <- (X22rL4_final_df$rL41_tHDR_post_lib_loess_rL41_e.sns/(X22_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X22_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X22rL4_final_df$tHDR_post_lib_loess_e.sns_rL42_sns_norm <- (X22rL4_final_df$rL42_tHDR_post_lib_loess_rL42_e.sns/(X22_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X22_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X23rL4_final_df$tHDR_post_lib_loess_e.sns_rL41_sns_norm <- (X23rL4_final_df$rL41_tHDR_post_lib_loess_rL41_e.sns/(X23_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X23_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
X23rL4_final_df$tHDR_post_lib_loess_e.sns_rL42_sns_norm <- (X23rL4_final_df$rL42_tHDR_post_lib_loess_rL42_e.sns/(X23_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-X23_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns))*(XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_syn-XrL4_all_tHDR_post_lib_loess_e.sns_rL41rL42_median_ns)
#binding all of the sns-normalized df's together...
#this data frame has been variant filtered and then normalized!!!
XrL4_all_final_sns_norm <- rbind(X2rL4_final_df,X3rL4_final_df,X4rL4_final_df,X5rL4_final_df,X15rL4_final_df,X16rL4_final_df,X17rL4_final_df,X18rL4_final_df,X19rL4_final_df,X20rL4_final_df,X21rL4_final_df,X22rL4_final_df,X23rL4_final_df)
### calculate replicate dependent post-pre + lo.e.sns pre-lib scores for all points
#calculate a post-pre + pre-lib metric per replicate and compare to replicate reproducibility with other functional scores
XrL4_all_final_sns_norm$rL41_tHDR_pre_lib_lo_plus_post_pre <- XrL4_all_final_sns_norm$rL41_tHDR_pre_lib_loess_rL41+log2(XrL4_all_final_sns_norm$tHDR_post_pre_ratio_synnorm)
XrL4_all_final_sns_norm$rL42_tHDR_pre_lib_lo_plus_post_pre <- XrL4_all_final_sns_norm$rL42_tHDR_pre_lib_loess_rL41+log2(XrL4_all_final_sns_norm$rL42_tHDR_post_pre_ratio_synnorm)
XrL4_all_final_sns_norm$tHDR_pre_lib_lo_plus_post_pre_rL41rL42_mean <- (XrL4_all_final_sns_norm$rL42_tHDR_pre_lib_lo_plus_post_pre+XrL4_all_final_sns_norm$rL41_tHDR_pre_lib_lo_plus_post_pre)/2
#weighted by pre reads...
XrL4_all_final_sns_norm$tHDR_post_lib_loess_e.sns_rL41rL42w_sns_norm <- XrL4_all_final_sns_norm$tHDR_post_lib_loess_e.sns_rL41_sns_norm*XrL4_all_final_sns_norm$tHDR_pre_weight + XrL4_all_final_sns_norm$tHDR_post_lib_loess_e.sns_rL42_sns_norm*XrL4_all_final_sns_norm$rL42_tHDR_pre_weight
#write this out:
#write.table(XrL4_all_final_sns_norm, "/mount/SGE/BRCA1/XrL4_all_final_sns_norm_20171016.txt", sep="\t")
#XrL4_all_final_sns_norm <- read.table("/mount/SGE/BRCA1/XrL4_all_final_sns_norm_20171016.txt", sep="\t", header=TRUE)
conseq_ord_XrL4_all_final_sns_norm <- XrL4_all_final_sns_norm
conseq_ord_XrL4_all_final_sns_norm$conseq <- factor(conseq_ord_XrL4_all_final_sns_norm$conseq, levels = c('STOP_GAINED','CANONICAL_SPLICE','SYNONYMOUS','INTRONIC','SPLICE_SITE','5PRIME_UTR','NON_SYNONYMOUS'))
#all scoring metrics compared...
grid.arrange(ggplot(conseq_ord_XrL4_all_final_sns_norm, aes(x=log2(tHDR_post_pre_ratio_rL41rL42_mean_synnorm), fill=conseq))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-3,1)),ggplot(conseq_ord_XrL4_all_final_sns_norm, aes(x=log2(tHDR_post_lib_ratio_rL41rL42_mean_synnorm), fill=conseq))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-3,1)),ggplot(conseq_ord_XrL4_all_final_sns_norm, aes(x=rL41rL42_tHDR_post_lib_loess_rL41rL42, fill=conseq))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-3,1)),ggplot(conseq_ord_XrL4_all_final_sns_norm, aes(x=tHDR_post_lib_loess_rL41rL42_sns_norm, fill=conseq))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-3,1)),ggplot(conseq_ord_XrL4_all_final_sns_norm, aes(x=tHDR_post_lib_loess_e.sns_rL41rL42w_sns_norm, fill=conseq))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-3,1)),nrow=5)
XrL4_all_final_sns_norm_mis_clinvar_p_b <- XrL4_all_final_sns_norm[which((XrL4_all_final_sns_norm$conseq == 'NON_SYNONYMOUS' & XrL4_all_final_sns_norm$clinvar_simple == 'Pathogenic') | (XrL4_all_final_sns_norm$conseq == 'NON_SYNONYMOUS' & XrL4_all_final_sns_norm$clinvar_simple == 'Benign')),]
ggplot(XrL4_all_final_sns_norm_mis_clinvar_p_b, aes(x=tHDR_post_lib_loess_rL41rL42_sns_norm, fill=clinvar_simple))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
ggplot(XrL4_all_final_sns_norm_mis_clinvar_p_b, aes(x=tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm, fill=clinvar_simple))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
XrL4_all_final_sns_norm_clinvar_p_b <- XrL4_all_final_sns_norm[which( XrL4_all_final_sns_norm$clinvar_simple == 'Pathogenic' | XrL4_all_final_sns_norm$conseq == 'NON_SYNONYMOUS' & XrL4_all_final_sns_norm$clinvar_simple == 'Benign'),]
ggplot(XrL4_all_final_sns_norm_clinvar_p_b, aes(x=tHDR_post_lib_loess_rL41rL42_sns_norm, fill=clinvar_simple))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
ggplot(XrL4_all_final_sns_norm_clinvar_p_b, aes(x=tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm, fill=clinvar_simple))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
XrL4_all_final_sns_norm_clinvar_p_b_lp_lb <- XrL4_all_final_sns_norm[which( XrL4_all_final_sns_norm$clinvar_simple == 'Pathogenic' | XrL4_all_final_sns_norm$conseq == 'NON_SYNONYMOUS' & XrL4_all_final_sns_norm$clinvar_simple == 'Benign' | XrL4_all_final_sns_norm$clinvar_simple == 'Likely pathogenic' | XrL4_all_final_sns_norm$clinvar_simple == 'Likely benign'),]
clinvar_ord_XrL4_all_final_sns_norm_clinvar_p_b_lp_lb <- XrL4_all_final_sns_norm_clinvar_p_b_lp_lb
clinvar_ord_XrL4_all_final_sns_norm_clinvar_p_b_lp_lb$clinvar_simple <- factor(clinvar_ord_XrL4_all_final_sns_norm_clinvar_p_b_lp_lb$clinvar_simple, levels = c("Pathogenic","Likely pathogenic","Likely benign","Benign","Uncertain significance","Conflicting interpretations of pathogenicity","absent","REF"))
#with colored boundaries for conseq
grid.arrange(ggplot(clinvar_ord_XrL4_all_final_sns_norm_clinvar_p_b_lp_lb, aes(x=tHDR_post_lib_loess_rL41rL42_sns_norm, fill=clinvar_simple,color=conseq))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple),ggplot(clinvar_ord_XrL4_all_final_sns_norm_clinvar_p_b_lp_lb, aes(x=tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm, fill=clinvar_simple,color=conseq))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple),nrow=2)
#without colored boundaries for conseq
ggplot(clinvar_ord_XrL4_all_final_sns_norm_clinvar_p_b_lp_lb, aes(x=tHDR_post_lib_loess_rL41rL42_sns_norm, fill=clinvar_simple))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
ggplot(clinvar_ord_XrL4_all_final_sns_norm_clinvar_p_b_lp_lb, aes(x=tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm, fill=clinvar_simple))+geom_histogram(alpha=1,bins=20)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
#all missense variants ordered by clinvar
XrL4_all_final_sns_norm_mis <- XrL4_all_final_sns_norm[which((XrL4_all_final_sns_norm$conseq == 'NON_SYNONYMOUS')),]
clinvar_ord_XrL4_all_final_sns_norm_mis <- XrL4_all_final_sns_norm_mis
clinvar_ord_XrL4_all_final_sns_norm_mis$clinvar_simple <- factor(clinvar_ord_XrL4_all_final_sns_norm_mis$clinvar_simple, levels = c("Pathogenic","Likely pathogenic","Likely benign","Benign","Uncertain significance","Conflicting interpretations of pathogenicity","absent","REF"))
ggplot(clinvar_ord_XrL4_all_final_sns_norm_mis, aes(x=tHDR_post_lib_loess_rL41rL42_sns_norm, fill=clinvar_simple))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
ggplot(clinvar_ord_XrL4_all_final_sns_norm_mis, aes(x=tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm, fill=clinvar_simple))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = 'None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
#clinvar_ord_XrL4_all_final_sns_norm_mis filtered on p/lp b/lb
clinvar_ord_XrL4_all_final_sns_norm_mis_p_b_lp_lb <- clinvar_ord_XrL4_all_final_sns_norm_mis[which( clinvar_ord_XrL4_all_final_sns_norm_mis$clinvar_simple == 'Pathogenic' | clinvar_ord_XrL4_all_final_sns_norm_mis$clinvar_simple == 'Benign' | clinvar_ord_XrL4_all_final_sns_norm_mis$clinvar_simple == 'Likely pathogenic' | clinvar_ord_XrL4_all_final_sns_norm_mis$clinvar_simple == 'Likely benign'),]
#only missense
ggplot(clinvar_ord_XrL4_all_final_sns_norm_mis_p_b_lp_lb, aes(x=tHDR_post_lib_loess_rL41rL42_sns_norm, fill=clinvar_simple))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position = 'None')+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
#all variants ordered by ClinVar annotation
clinvar_ord_XrL4_all_final_sns_norm <- XrL4_all_final_sns_norm
clinvar_ord_XrL4_all_final_sns_norm$clinvar_simple <- factor(clinvar_ord_XrL4_all_final_sns_norm$clinvar_simple, levels = c("Pathogenic","Likely pathogenic","Likely benign","Benign","Uncertain significance","Conflicting interpretations of pathogenicity","absent","REF"))
ggplot(clinvar_ord_XrL4_all_final_sns_norm, aes(x=tHDR_post_lib_loess_rL41rL42_sns_norm, fill=clinvar_simple))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
ggplot(clinvar_ord_XrL4_all_final_sns_norm, aes(x=tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm, fill=clinvar_simple))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('SNVs')+xlab('Enrichment (Log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
#isolate all synonymous and missense variants and test how rna_pre_ratio correlates with survival (c_hdr_snv only, so RNA levels can be detected.
#only syn
XrL4_all_final_sns_norm_syn <- XrL4_all_final_sns_norm[which((XrL4_all_final_sns_norm$conseq == 'SYNONYMOUS' & XrL4_all_final_sns_norm$c_hdr_snv == 'True' & XrL4_all_final_sns_norm$exon != 'X18') | (XrL4_all_final_sns_norm$conseq == 'SPLICE_SITE' & XrL4_all_final_sns_norm$c_hdr_snv == 'True' & XrL4_all_final_sns_norm$CDSpos != 'NA' & XrL4_all_final_sns_norm$exon != 'X18')),]
XrL4_all_final_sns_norm_ns <- XrL4_all_final_sns_norm[which((XrL4_all_final_sns_norm$conseq == 'STOP_GAINED' & XrL4_all_final_sns_norm$c_hdr_snv == 'True')),]
XrL4_all_final_sns_norm_ns_no18 <- XrL4_all_final_sns_norm[which((XrL4_all_final_sns_norm$conseq == 'STOP_GAINED' & XrL4_all_final_sns_norm$c_hdr_snv == 'True' & XrL4_all_final_sns_norm$exon != 'X18')),]
#log2 median and sd for rna ratios
XrL4_all_final_sns_norm_syn_rna_median <- median(log2(XrL4_all_final_sns_norm_syn$tHDR_rna_pre_ratio_rL41rL42_mean_synnorm))
XrL4_all_final_sns_norm_syn_rna_sd <- sd(log2(XrL4_all_final_sns_norm_syn$tHDR_rna_pre_ratio_rL41rL42_mean_synnorm))
#defined as being within 1 SD from median (log2 taken here to compare
XrL4_all_final_sns_norm_syn_norm_exp <- XrL4_all_final_sns_norm_syn[which(XrL4_all_final_sns_norm_syn_rna_median + XrL4_all_final_sns_norm_syn_rna_sd >= log2(XrL4_all_final_sns_norm_syn$tHDR_rna_pre_ratio_rL41rL42_mean_synnorm) & log2(XrL4_all_final_sns_norm_syn$tHDR_rna_pre_ratio_rL41rL42_mean_synnorm) >= XrL4_all_final_sns_norm_syn_rna_median-XrL4_all_final_sns_norm_syn_rna_sd),]
ggplot(XrL4_all_final_sns_norm_syn, aes(y=tHDR_post_lib_loess_rL41rL42_sns_norm,x=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm), fill=conseq))+geom_point(aes(colour=conseq),alpha=0.7)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Fitness (D11/lib, log-2 corr.)')+xlab('D5 RNA expression (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
ggplot(XrL4_all_final_sns_norm_syn, aes(y=tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm,x=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm), fill=conseq))+geom_point(aes(colour=conseq),alpha=0.7)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Fitness (D11/lib, log-2 corr.)')+xlab('D5 RNA expression (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#with labels to understand outliers (post-lib discordancy label)... seems to be problem with one replicate's 'post' value?
ggplot(XrL4_all_final_sns_norm_syn, aes(y=tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm,x=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm), fill=conseq))+geom_point(aes(colour=conseq),alpha=0.7)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Fitness (D11/lib, log-2 corr.)')+xlab('D5 RNA expression (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+geom_text(aes(label=paste(paste(exon,pos),paste(tHDR_post_lib_loess_e.sns_rL41_sns_norm,tHDR_post_lib_loess_e.sns_rL42_sns_norm),' ') ,alpha=abs(tHDR_post_lib_loess_e.sns_rL41_sns_norm-tHDR_post_lib_loess_e.sns_rL42_sns_norm)),hjust=0.5, vjust=0.5, size = 1.4,colour='Black')
### global correlations for different scores, and global comparisons of 'final' scores
### this is still a hanger on call to y=tHDR_post_lib_loess_e.sns_rL41rL42
ggplot(XrL4_all_final_sns_norm, aes(x=tHDR_pre_lib_lo_plus_post_pre_rL41rL42_mean, y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean, fill=conseq))+geom_point(aes(colour=conseq),alpha=0.3,size=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('e.sns functional score')+xlab('pre.lib_plus_post.pre functional score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
cor(XrL4_all_final_sns_norm$tHDR_pre_lib_lo_plus_post_pre_rL41rL42_mean,XrL4_all_final_sns_norm$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,method='spearman')
ggplot(XrL4_all_final_sns_norm, aes(x=tHDR_pre_lib_lo_plus_post_pre_rL41rL42_mean, y=rL41rL42_tHDR_post_lib_loess_rL41rL42, fill=conseq))+geom_point(aes(colour=conseq),alpha=0.3,size=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('mean-fit score')+xlab('pre.lib_plus_post.pre score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
cor(XrL4_all_final_sns_norm$tHDR_pre_lib_lo_plus_post_pre_rL41rL42_mean,XrL4_all_final_sns_norm$rL41rL42_tHDR_post_lib_loess_rL41rL42,method='spearman')
ggplot(XrL4_all_final_sns_norm, aes(x=rL41rL42_tHDR_post_lib_loess_rL41rL42, y=rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean, fill=conseq))+geom_point(aes(colour=conseq),alpha=0.3,size=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('e.sns functional score')+xlab('mean-fit score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
cor(XrL4_all_final_sns_norm$rL4_tHDR_post_lib_loess_e.sns_rL41rL42_mean,XrL4_all_final_sns_norm$rL41rL42_tHDR_post_lib_loess_rL41rL42,method='spearman')
#normalized scores (two different ways).
ggplot(XrL4_all_final_sns_norm, aes(x=tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm, y=tHDR_post_lib_loess_rL41rL42_sns_norm, fill=conseq))+geom_point(aes(colour=conseq),alpha=0.3,size=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('mean-fit score')+xlab('pre.lib_plus_post.pre score')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors_full_simple)+scale_color_manual(values=named_conseq_colors_full_simple)
cor(XrL4_all_final_sns_norm$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm,XrL4_all_final_sns_norm$tHDR_post_lib_loess_rL41rL42_sns_norm,method='spearman')
### global correlations with CADD to determine optimal 'fitness score' from these candidates
#has annotation in CADD
XrL4_variants_with_CADD <- XrL4_all_tHDR_pos_merge_master_df[which(XrL4_all_tHDR_pos_merge_master_df$CADD.phred != 'NA' & XrL4_all_tHDR_pos_merge_master_df$CADD.phred != 'REF' ),c('pos_alt')]
#filter XrL4_all_final_sns_norm on CADD variants:
XrL4_all_final_sns_norm_CADD <- XrL4_all_final_sns_norm[XrL4_all_final_sns_norm$pos_alt %in% XrL4_variants_with_CADD,]
#compare correlations of top three metrics...
cor(XrL4_all_final_sns_norm_CADD$tHDR_post_lib_loess_rL41rL42_sns_norm,as.numeric(as.character(XrL4_all_final_sns_norm_CADD$CADD.phred)),method='spearman')
cor(XrL4_all_final_sns_norm_CADD$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm,as.numeric(as.character(XrL4_all_final_sns_norm_CADD$CADD.phred)),method='spearman')
cor(XrL4_all_final_sns_norm_CADD$rL41rL42_tHDR_post_lib_loess_rL41rL42,as.numeric(as.character(XrL4_all_final_sns_norm_CADD$CADD.phred)),method='spearman')
cor(XrL4_all_final_sns_norm_CADD$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm,as.numeric(as.character(XrL4_all_final_sns_norm_CADD$CADD.phred)),method='spearman')
cor(XrL4_all_final_sns_norm_CADD$tHDR_post_lib_loess_e.sns_rL41rL42w_sns_norm,as.numeric(as.character(XrL4_all_final_sns_norm_CADD$CADD.phred)),method='spearman')
#individual scores vs. cadd scores
cor(XrL4_all_final_sns_norm_CADD$rL41_tHDR_post_lib_loess_rL41_e.sns,as.numeric(as.character(XrL4_all_final_sns_norm_CADD$CADD.phred)),method='spearman')
cor(XrL4_all_final_sns_norm_CADD$rL42_tHDR_post_lib_loess_rL42_e.sns,as.numeric(as.character(XrL4_all_final_sns_norm_CADD$CADD.phred)),method='spearman')
### WORKING IN THESE UPDATES!
### final filtering strategy to reduce discordant calls between replicates?
### with remaining points, stats to get P-values
### global QC correlations:
#with labels to understand outliers (post-lib discordancy label)... seems to be problem with one replicate's 'post' value?
#replicate discrepancy plot w/o labels: which scoring system is more reproducible? about same
#pre sns normalization across exons...
grid.arrange(ggplot(XrL4_all_final_sns_norm, aes(x=rL41_tHDR_post_lib_loess_rL41_e.sns, y=rL42_tHDR_post_lib_loess_rL42_e.sns, fill=conseq))+geom_point(aes(colour=conseq),alpha=0.3,size=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Functional score rep. 2')+xlab('Functional score rep. 1')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-5,2),ylim=c(-5,2)),ggplot(XrL4_all_final_sns_norm, aes(x=rL41_tHDR_pre_lib_lo_plus_post_pre, y=rL42_tHDR_pre_lib_lo_plus_post_pre, fill=conseq))+geom_point(aes(colour=conseq),alpha=0.3,size=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Functional score rep. 2')+xlab('Functional score rep. 1')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-5,2),ylim=c(-5,2)),nrow=2)
cor(XrL4_all_final_sns_norm$rL41_tHDR_post_lib_loess_rL41_e.sns,XrL4_all_final_sns_norm$rL42_tHDR_post_lib_loess_rL42_e.sns,method='spearman')
cor(XrL4_all_final_sns_norm$rL41_tHDR_pre_lib_lo_plus_post_pre,XrL4_all_final_sns_norm$rL42_tHDR_pre_lib_lo_plus_post_pre,method='spearman')
#post sns normalization across exons...
ggplot(XrL4_all_final_sns_norm, aes(x=tHDR_post_lib_loess_e.sns_rL41_sns_norm,y=tHDR_post_lib_loess_e.sns_rL42_sns_norm, fill=conseq))+geom_point(aes(colour=conseq),alpha=0.3,size=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Functional score rep. 2')+xlab('Functional score rep. 1')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
cor(XrL4_all_final_sns_norm$tHDR_post_lib_loess_e.sns_rL41_sns_norm,XrL4_all_final_sns_norm$tHDR_post_lib_loess_e.sns_rL42_sns_norm,method='spearman')
#replicate discrepancy plot with labels:
ggplot(XrL4_all_final_sns_norm, aes(x=tHDR_post_lib_loess_e.sns_rL41_sns_norm,y=tHDR_post_lib_loess_e.sns_rL42_sns_norm, fill=conseq))+geom_point(aes(colour=conseq,size=pre_weight),alpha=0.3)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Functional score rep. 2')+xlab('Functional score rep. 1')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+geom_text(aes(label=paste(paste(exon,pos),paste(tHDR_post_lib_loess_e.sns_rL41_sns_norm,tHDR_post_lib_loess_e.sns_rL42_sns_norm),' ') ,alpha=abs(tHDR_post_lib_loess_e.sns_rL41_sns_norm-tHDR_post_lib_loess_e.sns_rL42_sns_norm)),hjust=0.5, vjust=0.5, size = 1.4,colour='Black')
#fitness scores of SNVs with with normal expression
ggplot(XrL4_all_final_sns_norm_syn_norm_exp, aes(y=tHDR_post_lib_loess_e.sns_rL41_sns_norm,x=log2(tHDR_rna_pre_ratio_rL41rL42_mean_synnorm), fill=conseq))+geom_point(aes(colour=conseq),alpha=0.7)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+ylab('Fitness (D11/lib, log-2 corr.)')+xlab('D5 RNA expression (log-2)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)
#histogram comparing snv's with normal expression values and those with all expression
null_compare_tHDR_post_lib_loess_rL41rL42_sns_norm <- grid.arrange(ggplot(XrL4_all_final_sns_norm_syn, aes(x=tHDR_post_lib_loess_rL41rL42_sns_norm, fill=conseq))+geom_histogram(aes(colour=conseq),alpha=0.7)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+xlab('Fitness (D11/lib, log-2 p.snsn)')+ylab('SNVs')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-3.2,1.2)),ggplot(XrL4_all_final_sns_norm_syn_norm_exp, aes(x=tHDR_post_lib_loess_rL41rL42_sns_norm, fill=conseq))+geom_histogram(aes(colour=conseq),alpha=0.7)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+xlab('Fitness (D11/lib, log-2 p.snsn)')+ylab('SNVs')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-3.2,1.2)),nrow=2)
null_compare_tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- grid.arrange(ggplot(XrL4_all_final_sns_norm_syn, aes(x=tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm, fill=conseq))+geom_histogram(aes(colour=conseq),alpha=0.7)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+xlab('Fitness (D11/lib, e.sns.mean log-2)')+ylab('SNVs')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-3.2,1.2)),ggplot(XrL4_all_final_sns_norm_syn_norm_exp, aes(x=tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm, fill=conseq))+geom_histogram(aes(colour=conseq),alpha=0.7)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+xlab('Fitness (D11/lib, e.sns.mean log-2)')+ylab('SNVs')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-3.2,1.2)),nrow=2)
grid.arrange(null_compare_tHDR_post_lib_loess_rL41rL42_sns_norm,null_compare_tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm,nrow=1)
mean(XrL4_all_final_sns_norm_syn_norm_exp$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)
sd(XrL4_all_final_sns_norm_syn_norm_exp$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)
mean(XrL4_all_final_sns_norm_ns$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)
sd(XrL4_all_final_sns_norm_ns$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)
mean(XrL4_all_final_sns_norm_syn_norm_exp$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)-2*sd(XrL4_all_final_sns_norm_syn_norm_exp$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)
mean(XrL4_all_final_sns_norm_ns$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)+2*sd(XrL4_all_final_sns_norm_ns$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)
mean(XrL4_all_final_sns_norm_syn_norm_exp$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)-3*sd(XrL4_all_final_sns_norm_syn_norm_exp$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)
mean(XrL4_all_final_sns_norm_ns$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)+3*sd(XrL4_all_final_sns_norm_ns$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)
#add nonsense variants:
XrL4_all_final_sns_norm_ns_syn_normex <- rbind(XrL4_all_final_sns_norm_ns,XrL4_all_final_sns_norm_syn_norm_exp)
#three ways to compare functional / non-functional split, all similar:
bimodal_compare_tHDR_pre_lib_lo_plus_post_pre_rL41rL42_mean <- ggplot(XrL4_all_final_sns_norm_ns_syn_normex, aes(x=tHDR_pre_lib_lo_plus_post_pre_rL41rL42_mean, fill=conseq))+geom_histogram(aes(fill=conseq),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+xlab('Functional score (pr.lib.lo+po.pr)')+ylab('SNVs')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-4,1.2))
bimodal_compare_tHDR_post_lib_loess_rL41rL42_sns_norm <- ggplot(XrL4_all_final_sns_norm_ns_syn_normex, aes(x=tHDR_post_lib_loess_rL41rL42_sns_norm, fill=conseq))+geom_histogram(aes(fill=conseq),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+xlab('Functional score (mean.p.sns)')+ylab('SNVs')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-4,1.2))
bimodal_compare_tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm <- ggplot(XrL4_all_final_sns_norm_ns_syn_normex, aes(x=tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm, fill=conseq))+geom_histogram(aes(fill=conseq),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+xlab('Functional score (e.sns.mean)')+ylab('SNVs')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-4,1.2))
#it makes virtually no difference if this is weighted by 'rL41 or rL42 pre frequency'
bimodal_compare_tHDR_post_lib_loess_e.sns_rL41rL42w_sns_norm <- ggplot(XrL4_all_final_sns_norm_ns_syn_normex, aes(x=tHDR_post_lib_loess_e.sns_rL41rL42w_sns_norm, fill=conseq))+geom_histogram(aes(fill=conseq),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank())+xlab('Functional score (e.sns.mean)')+ylab('SNVs')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-4,1.2))
grid.arrange(bimodal_compare_tHDR_pre_lib_lo_plus_post_pre_rL41rL42_mean,bimodal_compare_tHDR_post_lib_loess_rL41rL42_sns_norm,bimodal_compare_tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm,bimodal_compare_tHDR_post_lib_loess_e.sns_rL41rL42w_sns_norm,nrow=2)
#missense and syn
XrL4_all_final_sns_norm_syn_mis_spl_noX18 <- XrL4_all_final_sns_norm[which((XrL4_all_final_sns_norm$conseq == 'SYNONYMOUS' & XrL4_all_final_sns_norm$c_hdr_snv == 'True' & XrL4_all_final_sns_norm$exon != 'X18') | (XrL4_all_final_sns_norm$conseq == 'NON_SYNONYMOUS' & XrL4_all_final_sns_norm$c_hdr_snv == 'True' & XrL4_all_final_sns_norm$exon != 'X18') | (XrL4_all_final_sns_norm$conseq == 'SPLICE_SITE' & XrL4_all_final_sns_norm$c_hdr_snv == 'True' & XrL4_all_final_sns_norm$CDSpos != 'NA' & XrL4_all_final_sns_norm$exon != 'X18')),]
#### modeling section
library(gam)
#define 'lof' as a 1 or 0 based on nonsense SNVs / normal expression SYN SNVs.
assign_lof <- function(conseq){
if (conseq == 'STOP_GAINED') return(1) else return(0)
}
XrL4_all_final_sns_norm_ns_syn_normex_gam <- XrL4_all_final_sns_norm_ns_syn_normex
XrL4_all_final_sns_norm_ns_syn_normex_gam$lof <- sapply(XrL4_all_final_sns_norm_ns_syn_normex_gam$conseq,assign_lof)
#modeling with GAM -- use interacting terms and make this better? risk of overfitting
XrL4_all_gam_out <- gam(lof ~ tHDR_post_lib_loess_e.sns_rL41_sns_norm+tHDR_post_lib_loess_e.sns_rL42_sns_norm, data = XrL4_all_final_sns_norm_ns_syn_normex_gam, family = "binomial")
XrL4_all_gam_out$coefficients
XrL4_all_gam_pred <- predict(XrL4_all_gam_out, newdata = XrL4_all_final_sns_norm, se=TRUE)
XrL4_all_gam_pred$fit
XrL4_all_gam_out <- gam(lof ~ tHDR_post_lib_loess_e.sns_rL41_sns_norm+tHDR_post_lib_loess_e.sns_rL42_sns_norm, data = XrL4_all_final_sns_norm_ns_syn_normex_gam, family = "binomial")
XrL4_all_gam_out$coefficients
XrL4_all_gam_pred <- predict(XrL4_all_gam_out, newdata = XrL4_all_final_sns_norm, se=TRUE)
XrL4_all_gam_pred$fit
#MclustDA approach for Gaussian Mixture Modelling...
#using class from above... LOF = 1, F = 0
require(mclust)
XrL4_all_mclust.train <- MclustDA(data = XrL4_all_final_sns_norm_ns_syn_normex_gam[,c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')], class = XrL4_all_final_sns_norm_ns_syn_normex_gam$lof, G = 1, modelNames = c('V'))
summary(XrL4_all_mclust.train, parameters = TRUE)
XrL4_all_mclust_lof_out <- rep('all_snvs',dim(XrL4_all_final_sns_norm)[1])
summary(XrL4_all_mclust.train, newdata = XrL4_all_final_sns_norm[, c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')], newclass = XrL4_all_mclust_lof_out)
XrL4_all_mclust.predict <- predict.MclustDA(XrL4_all_mclust.train, newdata = XrL4_all_final_sns_norm[, c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')], newclass = XrL4_all_mclust_lof_out,prior = c(0.5,0.5))
XrL4_all_final_sns_norm$mclust_z_lof <- XrL4_all_mclust.predict$z[,2]
XrL4_all_final_sns_norm$mclust_class_lof <- XrL4_all_mclust.predict$classification
max(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$mclust_class_lof == 0),c('mclust_z_lof')])
min(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$mclust_class_lof == 0),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])
min(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$mclust_class_lof == 1),c('mclust_z_lof')])
max(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$mclust_class_lof == 1),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])
mclust_functional_thresh <- mean(c(min(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$mclust_class_lof == 0),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')]),max(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$mclust_class_lof == 1),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])))
mclust_lof_95 <- max(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$mclust_z_lof > 0.95),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])
mclust_lof_99 <- max(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$mclust_z_lof > 0.99),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])
mclust_f_05 <- min(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$mclust_z_lof < 0.05),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])
mclust_f_01 <- min(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$mclust_z_lof < 0.01),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])
min(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$mclust_z_lof < 0.001),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])
#How do these compare to the threshold using normalmixEM?
#frequentist, two normal distributions:
#redo null distributions with this...
#get all the P-values using syn_norm_exp
XrL4_test_p_syn <- pnorm(XrL4_all_final_sns_norm$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm,mean = mean(XrL4_all_final_sns_norm_syn_norm_exp$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm), sd = sd(XrL4_all_final_sns_norm_syn_norm_exp$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm))
#get all the P-values using ns
XrL4_test_p_ns <- pnorm(XrL4_all_final_sns_norm$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm,mean = mean(XrL4_all_final_sns_norm_ns$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm), sd = sd(XrL4_all_final_sns_norm_ns$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm), lower.tail = FALSE)
XrL4_ratio_ns_syn <- XrL4_test_p_ns/XrL4_test_p_syn
XrL4_log_ratio_ns_syn <- log10(XrL4_ratio_ns_syn)
XrL4_all_final_sns_norm$XrL4_test_p_syn <- XrL4_test_p_syn
XrL4_all_final_sns_norm$XrL4_test_p_ns <- XrL4_test_p_ns
XrL4_all_final_sns_norm$XrL4_ratio_ns_syn <- XrL4_ratio_ns_syn
XrL4_all_final_sns_norm$XrL4_log_ratio_ns_syn <- XrL4_log_ratio_ns_syn
XrL4_all_final_sns_norm$XrL4_bh_p_syn <- p.adjust(XrL4_all_final_sns_norm$XrL4_test_p_syn, method = 'BH')
XrL4_all_final_sns_norm$XrL4_bh_p_ns <- p.adjust(XrL4_all_final_sns_norm$XrL4_test_p_ns, method = 'BH')
#suggested thresholds...
#uncorrected threshold for less than functional...
max(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$XrL4_test_p_syn < 0.05),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])
#corrected threshold for less than functional...
XrL4_bh_p_syn_max05 <- max(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$XrL4_bh_p_syn < 0.05),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])
max(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$XrL4_bh_p_syn < 0.01),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])
#uncorrected threshold for greater than nonsense...
min(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$XrL4_test_p_ns < 0.05),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])
#corrected threshold for greater than nonsense...
XrL4_bh_p_ns_min05 <- min(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$XrL4_bh_p_ns < 0.05),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])
min(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$XrL4_bh_p_ns < 0.01),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])
#calculate the 'inflection point' threshold in the data... where P(ns) and P(syn) are equal...
#take the mean between the two points on either side of the threshold..
XrL4_bh_p_binary_cutoff <- mean(c(max(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$XrL4_bh_p_ns > XrL4_all_final_sns_norm$XrL4_bh_p_syn),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')]),min(XrL4_all_final_sns_norm[which(XrL4_all_final_sns_norm$XrL4_bh_p_ns < XrL4_all_final_sns_norm$XrL4_bh_p_syn),c('tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm')])))
#order the dataframe in descending order by P(SYN)
#histograms of training data and threshold for p-value FDR approach:
grid.arrange(ggplot(XrL4_all_final_sns_norm_ns_syn_normex, aes(x=tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm, fill=conseq))+geom_histogram(aes(fill=conseq),alpha=1)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+xlab('Functional score (e.sns.mean)')+ylab('SNVs')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-4,1.2))+geom_vline(xintercept=XrL4_bh_p_syn_max05 ,lty=2)+geom_vline(xintercept=XrL4_bh_p_ns_min05,lty=2),ggplot(conseq_ord_XrL4_all_final_sns_norm, aes(x=tHDR_post_lib_loess_rL41rL42_sns_norm, fill=conseq))+geom_histogram(alpha=1,bins=30)+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(legend.title=element_blank(),legend.key = element_blank(),legend.position='None')+ylab('SNVs')+xlab('Functional score (e.sns.mean)')+theme(panel.background = element_rect(fill = 'white', colour = 'white'))+theme(text = element_text(size=12),axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),strip.background = element_blank())+scale_fill_manual(values=named_conseq_colors)+scale_color_manual(values=named_conseq_colors)+coord_cartesian(xlim=c(-4,1.2))+geom_vline(xintercept=XrL4_bh_p_syn_max05 ,lty=2)+geom_vline(xintercept=XrL4_bh_p_ns_min05,lty=2),nrow=2)
#BAYSEIAN MODELING SECTION
library(mixtools)
library(plotROC)
##mixtools normalmixEM --> Basic fit on known distributions... re-try this using fixed
ns_syn_normex_mm.opt <- normalmixEM(x = XrL4_all_final_sns_norm_ns_syn_normex$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm, lambda = c(dim(XrL4_all_final_sns_norm_syn_norm_exp)[1]/dim(XrL4_all_final_sns_norm_ns_syn_normex)[1],dim(XrL4_all_final_sns_norm_ns)[1]/dim(XrL4_all_final_sns_norm_ns_syn_normex)[1]),mean.constr = c(mean(XrL4_all_final_sns_norm_syn_norm_exp$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm),mean(XrL4_all_final_sns_norm_ns$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)), sd.constr = c(sd(XrL4_all_final_sns_norm_syn_norm_exp$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm),sd(XrL4_all_final_sns_norm_ns$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)))
ns_syn_normex_mm.opt$mu
ns_syn_normex_mm.opt$sigma
#try calling on the complete data set --> means and sds start at defined locations, then move.
normalmixEM(x = XrL4_all_final_sns_norm$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm, mu = c(mean(XrL4_all_final_sns_norm_syn_norm_exp$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm),mean(XrL4_all_final_sns_norm_ns$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)),sigma = c(sd(XrL4_all_final_sns_norm_syn_norm_exp$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm),sd(XrL4_all_final_sns_norm_ns$tHDR_post_lib_loess_e.sns_rL41rL42_sns_norm)),lambda = c(0.5,0.5))
#try calling on the complete data set, using fixed mu and sigma...with lambda = c(0.5,0.5) (arbitrary) --> starting from 0.5, 0.5.