From 02f114f0313670840617f703acce621f1518496d Mon Sep 17 00:00:00 2001
From: Marc Sturm <marc.sturm@med.uni-tuebingen.de>
Date: Tue, 10 Sep 2024 08:21:45 +0200
Subject: [PATCH 1/2] updated db download scripts

---
 data/download_dbs.sh            | 64 ++++++++++++++++-----------------
 data/download_dbs_rna.sh        | 19 ++++++----
 data/download_dbs_validation.sh |  0
 data/misc/Makefile              |  7 ----
 4 files changed, 45 insertions(+), 45 deletions(-)
 mode change 100644 => 100755 data/download_dbs.sh
 mode change 100644 => 100755 data/download_dbs_rna.sh
 mode change 100644 => 100755 data/download_dbs_validation.sh

diff --git a/data/download_dbs.sh b/data/download_dbs.sh
old mode 100644
new mode 100755
index 2268af9dd..4ae9becd8
--- a/data/download_dbs.sh
+++ b/data/download_dbs.sh
@@ -29,7 +29,7 @@ cd $dbs
 mkdir -p cancerhotspots
 cd cancerhotspots
 wget https://www.cancerhotspots.org/files/hotspots_v2.xls
-wget http://download.cbioportal.org/cancerhotspots/cancerhotspots.v2.maf.gz
+wget https://cbioportal-download.s3.amazonaws.com/cancerhotspots.v2.maf.gz
 ssconvert -O 'separator="	" format=raw' -T Gnumeric_stf:stf_assistant -S hotspots_v2.xls hotspots.tsv
 php $src/Tools/db_converter_cancerhotspots.php -in hotspots.tsv.0 -maf cancerhotspots.v2.maf.gz -out cancerhotspots_snv.tsv
 rm hotspots_v2.xls
@@ -39,7 +39,7 @@ rm cancerhotspots.v2.maf.gz
 
 #Install ClinGen dosage sensitivity - http://ftp.ncbi.nlm.nih.gov/pub/dbVar/clingen
 cd $dbs
-mkdir ClinGen
+mkdir -p ClinGen
 cd ClinGen
 wget http://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh38.tsv
 cat ClinGen_gene_curation_list_GRCh38.tsv | php $src/Tools/db_converter_clingen_dosage.php > dosage_sensitive_disease_genes_GRCh38.bed
@@ -47,21 +47,21 @@ $ngsbits/BedSort -in dosage_sensitive_disease_genes_GRCh38.bed -out dosage_sensi
 
 #Install NCG7.1 - information about oncogenes and tumor suppressor genes
 cd $dbs
-mkdir NCG7.1
+mkdir -p NCG7.1
 cd NCG7.1
 curl 'http://network-cancer-genes.org/download.php' --silent -X POST --data-raw 'downloadcancergenes=Download' | sed -e "1s/^/#/" > ncg.tsv
 php $src/Tools/db_converter_ncg.php -in ncg.tsv -prefix NCG7.1 -outfolder "."
 
 #Install REPEATMASKER - http://www.repeatmasker.org/species/hg.html
 cd $dbs
-mkdir RepeatMasker
+mkdir -p RepeatMasker
 cd RepeatMasker
 wget -O - http://www.repeatmasker.org/genomes/hg38/RepeatMasker-rm405-db20140131/hg38.fa.out.gz | gunzip > hg38.fa.out
 cat hg38.fa.out | php $src/Tools/db_converter_repeatmasker.php | $ngsbits/BedSort > RepeatMasker_GRCh38.bed
 
 #Install ClinVar - https://www.ncbi.nlm.nih.gov/clinvar/
 cd $dbs
-mkdir ClinVar 
+mkdir -p ClinVar 
 cd ClinVar
 wget -O - http://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/archive_2.0/2024/clinvar_20240805.vcf.gz | gunzip | php $src/Tools/db_converter_clinvar.php | $ngsbits/VcfStreamSort | bgzip > clinvar_20240805_converted_GRCh38.vcf.gz
 tabix -C -m 9 -p vcf clinvar_20240805_converted_GRCh38.vcf.gz
@@ -72,14 +72,14 @@ $ngsbits/BedSort -with_name -in clinvar_cnvs_2024-08.bed -out clinvar_cnvs_2024-
 
 #Install HGNC - http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/
 cd $dbs
-mkdir HGNC
+mkdir -p HGNC
 cd HGNC
 wget -O - http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt > hgnc_complete_set.tsv
 wget -O - http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/withdrawn.txt > hgnc_withdrawn.tsv
 
 #Install gnomAD (genome data) - 
 cd $dbs
-mkdir gnomAD
+mkdir -p gnomAD
 cd gnomAD
 wget -O - https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr1.vcf.bgz | gunzip | $ngsbits/VcfLeftNormalize -stream -ref $genome | $ngsbits/VcfStreamSort | php $src/Tools/db_converter_gnomad.php -header > gnomAD_genome_v4.1_GRCh38.vcf
 wget -O - https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr2.vcf.bgz | gunzip | $ngsbits/VcfLeftNormalize -stream -ref $genome | $ngsbits/VcfStreamSort | php $src/Tools/db_converter_gnomad.php >> gnomAD_genome_v4.1_GRCh38.vcf
@@ -113,13 +113,13 @@ tabix -C -m 9 -p vcf gnomAD_genome_v3.1.mito_GRCh38.vcf.gz
 
 #Install phyloP
 cd $dbs
-mkdir phyloP
+mkdir -p phyloP
 cd phyloP
 wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP100way/hg38.phyloP100way.bw
 
 #Install CADD
 cd $dbs
-mkdir CADD
+mkdir -p CADD
 cd CADD
 wget -O - https://krishna.gs.washington.edu/download/CADD/v1.7/GRCh38/gnomad.genomes.r4.0.indel.tsv.gz > CADD_InDels_1.7_GRCh38.tsv.gz
 wget -O - https://krishna.gs.washington.edu/download/CADD/v1.7/GRCh38/whole_genome_SNVs.tsv.gz > CADD_SNVs_1.7_GRCh38.tsv.gz
@@ -132,7 +132,7 @@ $ngsbits/VcfCheck -in CADD_SNVs_1.7_GRCh38.vcf.gz -lines 1000 -ref $genome
 
 #download and convert REVEL - https://sites.google.com/site/revelgenomics/downloads
 cd $dbs
-mkdir REVEL
+mkdir -p REVEL
 cd REVEL
 wget https://zenodo.org/record/7072866/files/revel-v1.3_all_chromosomes.zip
 unzip -p revel-v1.3_all_chromosomes.zip | php $src/Tools/db_converter_revel.php > tmp.vcf
@@ -144,7 +144,7 @@ $ngsbits/VcfCheck -in REVEL_1.3.vcf.gz -lines 1000 -ref $genome
 
 #download and convert AlphaMissense - Attention: for non-commercial use only!
 cd $dbs
-mkdir AlphaMissense
+mkdir -p AlphaMissense
 cd AlphaMissense
 wget https://storage.googleapis.com/dm_alphamissense/AlphaMissense_hg38.tsv.gz
 php $src/Tools/db_converter_alphamissense.php AlphaMissense_hg38.tsv.gz > AlphaMissense_hg38.vcf
@@ -154,7 +154,7 @@ tabix -p vcf AlphaMissense_hg38.vcf.gz
 
 #download annotation file for SpliceAI
 cd $dbs
-mkdir SpliceAI
+mkdir -p SpliceAI
 cd SpliceAI
 wget https://download.imgag.de/public/splicing/spliceai_scores_2024_08_26_GRCh38.vcf.gz -O spliceai_scores_2024_08_26_GRCh38.vcf.gz
 tabix -C -m 9 -p vcf spliceai_scores_2024_08_26_GRCh38.vcf.gz
@@ -166,29 +166,29 @@ cd TandemRepeats
 wget https://github.com/PacificBiosciences/pbsv/raw/master/annotations/human_GRCh38_no_alt_analysis_set.trf.bed
 
 
-#install OMIM (you might need a license, only possible after ngs-bits is installed - including reference genome and NGSD setup)
-#cd $dbs
-#mkdir OMIM
-#cd OMIM
-#manual download of http://ftp.omim.org/OMIM/genemap2.txt
-#php $src/Tools/db_converter_omim.php | $ngsbits/BedSort -with_name > omim.bed
+# install OMIM (you might need a license, only possible after ngs-bits is installed - including reference genome and NGSD setup)
+# cd $dbs
+# mkdir -p OMIM
+# cd OMIM
+# manual download of http://ftp.omim.org/OMIM/genemap2.txt
+# php $src/Tools/db_converter_omim.php | $ngsbits/BedSort -with_name > omim.bed
 
-#Install HGMD (you need a license, only possible after ngs-bits is installed - including reference genome and NGSD setup)
-#manual download of files HGMD_Pro_2024.2_hg38.vcf.gz  and hgmd_pro-2024.2.dump.gz from https://apps.ingenuity.com/ingsso/login
-#zcat HGMD_Pro_2024.2_hg38.vcf.gz | php $src/Tools/db_converter_hgmd.php | bgzip > HGMD_PRO_2024_2_fixed.vcf.gz
-#tabix -p vcf HGMD_PRO_2024_2_fixed.vcf.gz
-##CNVs
-#zcat hgmd_pro-2024.2.dump.gz | php $src/Tools/db_converter_hgmd_cnvs.php > HGMD_CNVS_2024_2.bed
-#$ngsbits/BedSort -with_name -in HGMD_CNVS_2024_2.bed -out HGMD_CNVS_2024_2.bed
+# Install HGMD (you need a license, only possible after ngs-bits is installed - including reference genome and NGSD setup)
+# manual download of files HGMD_Pro_2024.2_hg38.vcf.gz  and hgmd_pro-2024.2.dump.gz from https://apps.ingenuity.com/ingsso/login
+# zcat HGMD_Pro_2024.2_hg38.vcf.gz | php $src/Tools/db_converter_hgmd.php | bgzip > HGMD_PRO_2024_2_fixed.vcf.gz
+# tabix -p vcf HGMD_PRO_2024_2_fixed.vcf.gz
+# #CNVs
+# zcat hgmd_pro-2024.2.dump.gz | php $src/Tools/db_converter_hgmd_cnvs.php > HGMD_CNVS_2024_2.bed
+# $ngsbits/BedSort -with_name -in HGMD_CNVS_2024_2.bed -out HGMD_CNVS_2024_2.bed
 
-#Install COSMIC Cancer Mutation Census CMC  (you need a license, the files have to be downloaded manually from https://apps.ingenuity.com/ingsso/login)
-#the necessary files are: CancerMutationCensus_AllData_Tsv_v99_GRCh38.tar, Cosmic_GenomeScreensMutant_Vcf_v99_GRCh38.tar, Cosmic_CompleteTargetedScreensMutant_Vcf_v99_GRCh38.tar, Cosmic_NonCodingVariants_Vcf_v99_GRCh38.tar
+# Install COSMIC Cancer Mutation Census CMC  (you need a license, the files have to be downloaded manually from https://apps.ingenuity.com/ingsso/login)
+# the necessary files are: CancerMutationCensus_AllData_Tsv_v99_GRCh38.tar, Cosmic_GenomeScreensMutant_Vcf_v99_GRCh38.tar, Cosmic_CompleteTargetedScreensMutant_Vcf_v99_GRCh38.tar, Cosmic_NonCodingVariants_Vcf_v99_GRCh38.tar
 # unpack the files to get the necessary vcf.gz files: 
-#cd $dbs
-#mkdir -p COSMIC
-#cd COSMIC
-#gunzip -c CancerMutationCensus_AllData_v99_GRCh38.tsv.gz | php db_converter_cosmic.php -in_cmc - -in_genome_vcf Cosmic_GenomeScreensMutant_v99_GRCh38.vcf.gz -in_non_coding_vcf Cosmic_NonCodingVariants_v99_GRCh38.vcf.gz -in_target_screens_vcf Cosmic_CompleteTargetedScreensMutant_v99_GRCh38.vcf.gz -out cmc_export_v99.vcf.gz
-#install NGSD
+# cd $dbs
+# mkdir -p COSMIC
+# cd COSMIC
+# gunzip -c CancerMutationCensus_AllData_v99_GRCh38.tsv.gz | php db_converter_cosmic.php -in_cmc - -in_genome_vcf Cosmic_GenomeScreensMutant_v99_GRCh38.vcf.gz -in_non_coding_vcf Cosmic_NonCodingVariants_v99_GRCh38.vcf.gz -in_target_screens_vcf Cosmic_CompleteTargetedScreensMutant_v99_GRCh38.vcf.gz -out cmc_export_v99.vcf.gz
+# install NGSD
 
 # NGSD export and annotation
 #The usage of the NGSD annotation is optional. 
diff --git a/data/download_dbs_rna.sh b/data/download_dbs_rna.sh
old mode 100644
new mode 100755
index f117d4a41..4ef51dd7c
--- a/data/download_dbs_rna.sh
+++ b/data/download_dbs_rna.sh
@@ -21,9 +21,8 @@ wget -O - https://www.proteinatlas.org/download/rna_tissue_consensus.tsv.zip | g
 #download Ensembl data in GTF format - KEEP AT ENSEMBL VERSION 109, DB import for RNA works on Transcript base and will break if the transcripts change.
 cd $data_folder
 mkdir -p dbs/gene_annotations
-wget -O - 'https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.gtf.gz' | \
-  gzip -cd | \
-  awk '{ if ($1 !~ /^#/) { print "chr"$0 } else { print $0 } }' > GRCh38.gtf
+cd dbs/gene_annotations
+wget -O - 'https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.gtf.gz' | gzip -cd | awk '{ if ($1 !~ /^#/) { print "chr"$0 } else { print $0 } }' > GRCh38.gtf
 
 #STAR: index genome
 cd $data_folder
@@ -35,8 +34,16 @@ $data_folder/tools/STAR-2.7.11b/bin/Linux_x86_64/STAR \
 --genomeFastaFiles genomes/GRCh38.fa \
 --sjdbGTFfile dbs/gene_annotations/GRCh38.gtf
 
+#create hemoglobin FASTA file: keep cDNA records with gene_symbol HBA1/HBA2/HBB, join each sequence onto one line, tag headers with kraken:taxid 9606 (awk uses plain $1/$i here - the $$ form is Makefile-only escaping)
+cd $data_folder
+cd misc
+wget -O - 'https://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz' | zcat | \
+	awk -v RS=">" -v FS="\n" '$1 ~ / gene_symbol:(HBA1|HBA2|HBB) / { print ">"$1; {for (i=2; i<=NF; i++) printf("%s", $i)}; printf("\n") }' | \
+	sed '/^>/s/ /|kraken:taxid|9606 /' \
+	> human_hemoglobin_tx.fa
+
 #build kraken2 database
 cd $data_folder
-tools/kraken2-2.1.2/bin/kraken2-build -db dbs/kraken2_filter_hb --download-taxonomy --skip-map  --use-ftp
-tools/kraken2-2.1.2/bin/kraken2-build -db dbs/kraken2_filter_hb --add-to-library $data_folder/misc/human_hemoglobin_tx.fa
-tools/kraken2-2.1.2/bin/kraken2-build --build  --threads 5 --db dbs/kraken2_filter_hb
+tools/kraken2-2.1.3/bin/kraken2-build -db dbs/kraken2_filter_hb --download-taxonomy --skip-map  --use-ftp
+tools/kraken2-2.1.3/bin/kraken2-build -db dbs/kraken2_filter_hb --add-to-library $data_folder/misc/human_hemoglobin_tx.fa
+tools/kraken2-2.1.3/bin/kraken2-build --build  --threads 5 --db dbs/kraken2_filter_hb
diff --git a/data/download_dbs_validation.sh b/data/download_dbs_validation.sh
old mode 100644
new mode 100755
diff --git a/data/misc/Makefile b/data/misc/Makefile
index 72cdc039d..65612eb66 100644
--- a/data/misc/Makefile
+++ b/data/misc/Makefile
@@ -42,10 +42,3 @@ telomere_centromere:
 
 circos_karyotype_files:
 	echo "These files where created manually by merging the chromosome regions and the centromer_telomer.bed (extended by 3 columns and replaced names by color)"
-
-hemoglobin_transcripts:
-	wget -O - 'https://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz' | \
-		zcat | \
-		awk -v RS=">" -v FS="\n" '$$1 ~ / gene_symbol:(HBA1|HBA2|HBB) / { print ">"$$1; {for (i=2; i<=NF; i++) printf("%s", $$i)}; printf("\n") }' | \
-		sed '/^>/s/ /|kraken:taxid|9606 /' \
-		> human_hemoglobin_tx.fa
\ No newline at end of file

From 239ac8e1bf2fbe1a218c5c4acff785476e06f73a Mon Sep 17 00:00:00 2001
From: Marc Sturm <marc.sturm@med.uni-tuebingen.de>
Date: Tue, 10 Sep 2024 08:26:36 +0200
Subject: [PATCH 2/2] updated docu for release 2024_09

---
 README.md           | 7 +------
 doc/install_unix.md | 2 +-
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index d50140477..7a4fa2000 100644
--- a/README.md
+++ b/README.md
@@ -47,11 +47,6 @@ Please cite the GitHub repository for now.
 
 Major changes of master since last release:
 
-* updated tools and databases
-* added support for Nova Seq X Plus
-* updated long-read pipeline
-* improved PRS calculation (imputed variants, etc)
-* added optional RefSeq annotation (see [settings documentation](doc/settings.md#section-general)).
-* added optional custom annotations in GSvar/VCF from VCFs (see [settings documentation](doc/settings.md#section-general)).
+* none so far
 
 For older changes see [releases](https://github.com/imgag/megSAP/releases).
diff --git a/doc/install_unix.md b/doc/install_unix.md
index 5b6d1a6e2..aabc64cfa 100644
--- a/doc/install_unix.md
+++ b/doc/install_unix.md
@@ -33,7 +33,7 @@ RHEL 8.3
 
 Clone the latest release of megSAP:
 
-	> git clone -b 2023_11 https://github.com/imgag/megSAP.git
+	> git clone -b 2024_09 https://github.com/imgag/megSAP.git
 
 Or, if you want to test the current development version: