From 02f114f0313670840617f703acce621f1518496d Mon Sep 17 00:00:00 2001 From: Marc Sturm Date: Tue, 10 Sep 2024 08:21:45 +0200 Subject: [PATCH 1/2] updated db download scripts --- data/download_dbs.sh | 64 ++++++++++++++++----------------- data/download_dbs_rna.sh | 19 ++++++---- data/download_dbs_validation.sh | 0 data/misc/Makefile | 7 ---- 4 files changed, 45 insertions(+), 45 deletions(-) mode change 100644 => 100755 data/download_dbs.sh mode change 100644 => 100755 data/download_dbs_rna.sh mode change 100644 => 100755 data/download_dbs_validation.sh diff --git a/data/download_dbs.sh b/data/download_dbs.sh old mode 100644 new mode 100755 index 2268af9dd..4ae9becd8 --- a/data/download_dbs.sh +++ b/data/download_dbs.sh @@ -29,7 +29,7 @@ cd $dbs mkdir -p cancerhotspots cd cancerhotspots wget https://www.cancerhotspots.org/files/hotspots_v2.xls -wget http://download.cbioportal.org/cancerhotspots/cancerhotspots.v2.maf.gz +wget https://cbioportal-download.s3.amazonaws.com/cancerhotspots.v2.maf.gz ssconvert -O 'separator=" " format=raw' -T Gnumeric_stf:stf_assistant -S hotspots_v2.xls hotspots.tsv php $src/Tools/db_converter_cancerhotspots.php -in hotspots.tsv.0 -maf cancerhotspots.v2.maf.gz -out cancerhotspots_snv.tsv rm hotspots_v2.xls @@ -39,7 +39,7 @@ rm cancerhotspots.v2.maf.gz #Install ClinGen dosage sensitivity - http://ftp.ncbi.nlm.nih.gov/pub/dbVar/clingen cd $dbs -mkdir ClinGen +mkdir -p ClinGen cd ClinGen wget http://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh38.tsv cat ClinGen_gene_curation_list_GRCh38.tsv | php $src/Tools/db_converter_clingen_dosage.php > dosage_sensitive_disease_genes_GRCh38.bed @@ -47,21 +47,21 @@ $ngsbits/BedSort -in dosage_sensitive_disease_genes_GRCh38.bed -out dosage_sensi #Install NCG7.1 - information about oncogenes and tumor suppressor genes cd $dbs -mkdir NCG7.1 +mkdir -p NCG7.1 cd NCG7.1 curl 'http://network-cancer-genes.org/download.php' --silent -X POST --data-raw 'downloadcancergenes=Download' | sed -e "1s/^/#/" > 
ncg.tsv php $src/Tools/db_converter_ncg.php -in ncg.tsv -prefix NCG7.1 -outfolder "." #Install REPEATMASKER - http://www.repeatmasker.org/species/hg.html cd $dbs -mkdir RepeatMasker +mkdir -p RepeatMasker cd RepeatMasker wget -O - http://www.repeatmasker.org/genomes/hg38/RepeatMasker-rm405-db20140131/hg38.fa.out.gz | gunzip > hg38.fa.out cat hg38.fa.out | php $src/Tools/db_converter_repeatmasker.php | $ngsbits/BedSort > RepeatMasker_GRCh38.bed #Install ClinVar - https://www.ncbi.nlm.nih.gov/clinvar/ cd $dbs -mkdir ClinVar +mkdir -p ClinVar cd ClinVar wget -O - http://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/archive_2.0/2024/clinvar_20240805.vcf.gz | gunzip | php $src/Tools/db_converter_clinvar.php | $ngsbits/VcfStreamSort | bgzip > clinvar_20240805_converted_GRCh38.vcf.gz tabix -C -m 9 -p vcf clinvar_20240805_converted_GRCh38.vcf.gz @@ -72,14 +72,14 @@ $ngsbits/BedSort -with_name -in clinvar_cnvs_2024-08.bed -out clinvar_cnvs_2024- #Install HGNC - http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/ cd $dbs -mkdir HGNC +mkdir -p HGNC cd HGNC wget -O - http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt > hgnc_complete_set.tsv wget -O - http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/withdrawn.txt > hgnc_withdrawn.tsv #Install gnomAD (genome data) - cd $dbs -mkdir gnomAD +mkdir -p gnomAD cd gnomAD wget -O - https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr1.vcf.bgz | gunzip | $ngsbits/VcfLeftNormalize -stream -ref $genome | $ngsbits/VcfStreamSort | php $src/Tools/db_converter_gnomad.php -header > gnomAD_genome_v4.1_GRCh38.vcf wget -O - https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr2.vcf.bgz | gunzip | $ngsbits/VcfLeftNormalize -stream -ref $genome | $ngsbits/VcfStreamSort | php $src/Tools/db_converter_gnomad.php >> gnomAD_genome_v4.1_GRCh38.vcf @@ -113,13 +113,13 @@ tabix -C -m 9 -p vcf 
gnomAD_genome_v3.1.mito_GRCh38.vcf.gz #Install phyloP cd $dbs -mkdir phyloP +mkdir -p phyloP cd phyloP wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP100way/hg38.phyloP100way.bw #Install CADD cd $dbs -mkdir CADD +mkdir -p CADD cd CADD wget -O - https://krishna.gs.washington.edu/download/CADD/v1.7/GRCh38/gnomad.genomes.r4.0.indel.tsv.gz > CADD_InDels_1.7_GRCh38.tsv.gz wget -O - https://krishna.gs.washington.edu/download/CADD/v1.7/GRCh38/whole_genome_SNVs.tsv.gz > CADD_SNVs_1.7_GRCh38.tsv.gz @@ -132,7 +132,7 @@ $ngsbits/VcfCheck -in CADD_SNVs_1.7_GRCh38.vcf.gz -lines 1000 -ref $genome #download and convert REVEL - https://sites.google.com/site/revelgenomics/downloads cd $dbs -mkdir REVEL +mkdir -p REVEL cd REVEL wget https://zenodo.org/record/7072866/files/revel-v1.3_all_chromosomes.zip unzip -p revel-v1.3_all_chromosomes.zip | php $src/Tools/db_converter_revel.php > tmp.vcf @@ -144,7 +144,7 @@ $ngsbits/VcfCheck -in REVEL_1.3.vcf.gz -lines 1000 -ref $genome #download and convert AlphaMissense - Attention: for non-commercial use only! 
cd $dbs -mkdir AlphaMissense +mkdir -p AlphaMissense cd AlphaMissense wget https://storage.googleapis.com/dm_alphamissense/AlphaMissense_hg38.tsv.gz php $src/Tools/db_converter_alphamissense.php AlphaMissense_hg38.tsv.gz > AlphaMissense_hg38.vcf @@ -154,7 +154,7 @@ tabix -p vcf AlphaMissense_hg38.vcf.gz #download annotation file for SpliceAI cd $dbs -mkdir SpliceAI +mkdir -p SpliceAI cd SpliceAI wget https://download.imgag.de/public/splicing/spliceai_scores_2024_08_26_GRCh38.vcf.gz -O spliceai_scores_2024_08_26_GRCh38.vcf.gz tabix -C -m 9 -p vcf spliceai_scores_2024_08_26_GRCh38.vcf.gz @@ -166,29 +166,29 @@ cd TandemRepeats wget https://github.com/PacificBiosciences/pbsv/raw/master/annotations/human_GRCh38_no_alt_analysis_set.trf.bed -#install OMIM (you might need a license, only possible after ngs-bits is installed - including reference genome and NGSD setup) -#cd $dbs -#mkdir OMIM -#cd OMIM -#manual download of http://ftp.omim.org/OMIM/genemap2.txt -#php $src/Tools/db_converter_omim.php | $ngsbits/BedSort -with_name > omim.bed +# install OMIM (you might need a license, only possible after ngs-bits is installed - including reference genome and NGSD setup) +# cd $dbs +# mkdir -p OMIM +# cd OMIM +# manual download of http://ftp.omim.org/OMIM/genemap2.txt +# php $src/Tools/db_converter_omim.php | $ngsbits/BedSort -with_name > omim.bed -#Install HGMD (you need a license, only possible after ngs-bits is installed - including reference genome and NGSD setup) -#manual download of files HGMD_Pro_2024.2_hg38.vcf.gz and hgmd_pro-2024.2.dump.gz from https://apps.ingenuity.com/ingsso/login -#zcat HGMD_Pro_2024.2_hg38.vcf.gz | php $src/Tools/db_converter_hgmd.php | bgzip > HGMD_PRO_2024_2_fixed.vcf.gz -#tabix -p vcf HGMD_PRO_2024_2_fixed.vcf.gz -##CNVs -#zcat hgmd_pro-2024.2.dump.gz | php $src/Tools/db_converter_hgmd_cnvs.php > HGMD_CNVS_2024_2.bed -#$ngsbits/BedSort -with_name -in HGMD_CNVS_2024_2.bed -out HGMD_CNVS_2024_2.bed +# Install HGMD (you need a license, only 
possible after ngs-bits is installed - including reference genome and NGSD setup) +# manual download of files HGMD_Pro_2024.2_hg38.vcf.gz and hgmd_pro-2024.2.dump.gz from https://apps.ingenuity.com/ingsso/login +# zcat HGMD_Pro_2024.2_hg38.vcf.gz | php $src/Tools/db_converter_hgmd.php | bgzip > HGMD_PRO_2024_2_fixed.vcf.gz +# tabix -p vcf HGMD_PRO_2024_2_fixed.vcf.gz +# #CNVs +# zcat hgmd_pro-2024.2.dump.gz | php $src/Tools/db_converter_hgmd_cnvs.php > HGMD_CNVS_2024_2.bed +# $ngsbits/BedSort -with_name -in HGMD_CNVS_2024_2.bed -out HGMD_CNVS_2024_2.bed -#Install COSMIC Cancer Mutation Census CMC (you need a license, the files have to be downloaded manually from https://apps.ingenuity.com/ingsso/login) -#the necessary files are: CancerMutationCensus_AllData_Tsv_v99_GRCh38.tar, Cosmic_GenomeScreensMutant_Vcf_v99_GRCh38.tar, Cosmic_CompleteTargetedScreensMutant_Vcf_v99_GRCh38.tar, Cosmic_NonCodingVariants_Vcf_v99_GRCh38.tar +# Install COSMIC Cancer Mutation Census CMC (you need a license, the files have to be downloaded manually from https://apps.ingenuity.com/ingsso/login) +# the necessary files are: CancerMutationCensus_AllData_Tsv_v99_GRCh38.tar, Cosmic_GenomeScreensMutant_Vcf_v99_GRCh38.tar, Cosmic_CompleteTargetedScreensMutant_Vcf_v99_GRCh38.tar, Cosmic_NonCodingVariants_Vcf_v99_GRCh38.tar # unpack the files to get the necessary vcf.gz files: -#cd $dbs -#mkdir -p COSMIC -#cd COSMIC -#gunzip -c CancerMutationCensus_AllData_v99_GRCh38.tsv.gz | php db_converter_cosmic.php -in_cmc - -in_genome_vcf Cosmic_GenomeScreensMutant_v99_GRCh38.vcf.gz -in_non_coding_vcf Cosmic_NonCodingVariants_v99_GRCh38.vcf.gz -in_target_screens_vcf Cosmic_CompleteTargetedScreensMutant_v99_GRCh38.vcf.gz -out cmc_export_v99.vcf.gz -#install NGSD +# cd $dbs +# mkdir -p COSMIC +# cd COSMIC +# gunzip -c CancerMutationCensus_AllData_v99_GRCh38.tsv.gz | php db_converter_cosmic.php -in_cmc - -in_genome_vcf Cosmic_GenomeScreensMutant_v99_GRCh38.vcf.gz -in_non_coding_vcf 
Cosmic_NonCodingVariants_v99_GRCh38.vcf.gz -in_target_screens_vcf Cosmic_CompleteTargetedScreensMutant_v99_GRCh38.vcf.gz -out cmc_export_v99.vcf.gz +# install NGSD # NGSD export and annotation #The usage of the NGSD annotation is optional. diff --git a/data/download_dbs_rna.sh b/data/download_dbs_rna.sh old mode 100644 new mode 100755 index f117d4a41..4ef51dd7c --- a/data/download_dbs_rna.sh +++ b/data/download_dbs_rna.sh @@ -21,9 +21,8 @@ wget -O - https://www.proteinatlas.org/download/rna_tissue_consensus.tsv.zip | g #download Ensembl data in GTF format - KEEP AT ENSEMBL VERSION 109, DB import for RNA works on Transcript base and will break if the transcripts change. cd $data_folder mkdir -p dbs/gene_annotations -wget -O - 'https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.gtf.gz' | \ - gzip -cd | \ - awk '{ if ($1 !~ /^#/) { print "chr"$0 } else { print $0 } }' > GRCh38.gtf +cd dbs/gene_annotations +wget -O - 'https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.gtf.gz' | gzip -cd | awk '{ if ($1 !~ /^#/) { print "chr"$0 } else { print $0 } }' > GRCh38.gtf #STAR: index genome cd $data_folder @@ -35,8 +34,16 @@ $data_folder/tools/STAR-2.7.11b/bin/Linux_x86_64/STAR \ --genomeFastaFiles genomes/GRCh38.fa \ --sjdbGTFfile dbs/gene_annotations/GRCh38.gtf +#create hemoglobin FASTA file (HBA1/HBA2/HBB cDNA records, sequence lines joined into one line, header tagged with kraken taxid 9606) +cd $data_folder +cd misc +wget -O - 'https://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz' | zcat | \ + awk -v RS=">" -v FS="\n" '$1 ~ / gene_symbol:(HBA1|HBA2|HBB) / { print ">"$1; {for (i=2; i<=NF; i++) printf("%s", $i)}; printf("\n") }' | \ + sed '/^>/s/ /|kraken:taxid|9606 /' \ + > human_hemoglobin_tx.fa + #build kraken2 database cd $data_folder -tools/kraken2-2.1.2/bin/kraken2-build -db dbs/kraken2_filter_hb --download-taxonomy --skip-map --use-ftp -tools/kraken2-2.1.2/bin/kraken2-build -db dbs/kraken2_filter_hb --add-to-library $data_folder/misc/human_hemoglobin_tx.fa 
-tools/kraken2-2.1.2/bin/kraken2-build --build --threads 5 --db dbs/kraken2_filter_hb +tools/kraken2-2.1.3/bin/kraken2-build -db dbs/kraken2_filter_hb --download-taxonomy --skip-map --use-ftp +tools/kraken2-2.1.3/bin/kraken2-build -db dbs/kraken2_filter_hb --add-to-library $data_folder/misc/human_hemoglobin_tx.fa +tools/kraken2-2.1.3/bin/kraken2-build --build --threads 5 --db dbs/kraken2_filter_hb diff --git a/data/download_dbs_validation.sh b/data/download_dbs_validation.sh old mode 100644 new mode 100755 diff --git a/data/misc/Makefile b/data/misc/Makefile index 72cdc039d..65612eb66 100644 --- a/data/misc/Makefile +++ b/data/misc/Makefile @@ -42,10 +42,3 @@ telomere_centromere: circos_karyotype_files: echo "These files where created manually by merging the chromosome regions and the centromer_telomer.bed (extended by 3 columns and replaced names by color)" - -hemoglobin_transcripts: - wget -O - 'https://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz' | \ - zcat | \ - awk -v RS=">" -v FS="\n" '$$1 ~ / gene_symbol:(HBA1|HBA2|HBB) / { print ">"$$1; {for (i=2; i<=NF; i++) printf("%s", $$i)}; printf("\n") }' | \ - sed '/^>/s/ /|kraken:taxid|9606 /' \ - > human_hemoglobin_tx.fa \ No newline at end of file From 239ac8e1bf2fbe1a218c5c4acff785476e06f73a Mon Sep 17 00:00:00 2001 From: Marc Sturm Date: Tue, 10 Sep 2024 08:26:36 +0200 Subject: [PATCH 2/2] updated docu for release 2024_09 --- README.md | 7 +------ doc/install_unix.md | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d50140477..7a4fa2000 100644 --- a/README.md +++ b/README.md @@ -47,11 +47,6 @@ Please cite the GitHub repository for now. 
Major changes of master since last release: -* updated tools and databases -* added support for Nova Seq X Plus -* updated long-read pipeline -* improved PRS calculation (imputed variants, etc) -* added optional RefSeq annotation (see [settings documentation](doc/settings.md#section-general)). -* added optional custom annotations in GSvar/VCF from VCFs (see [settings documentation](doc/settings.md#section-general)). +* none so far For older changes see [releases](https://github.com/imgag/megSAP/releases). diff --git a/doc/install_unix.md b/doc/install_unix.md index 5b6d1a6e2..aabc64cfa 100644 --- a/doc/install_unix.md +++ b/doc/install_unix.md @@ -33,7 +33,7 @@ RHEL 8.3 Clone the latest release of megSAP: - > git clone -b 2023_11 https://github.com/imgag/megSAP.git + > git clone -b 2024_09 https://github.com/imgag/megSAP.git Or, if you want to test the current development version: