Skip to content

Commit

Permalink
Merge pull request #7 from bento-platform/qa/v2.0.0
Browse files Browse the repository at this point in the history
patches/upgrades from qa/v2.0.0
  • Loading branch information
brouillette authored Aug 25, 2021
2 parents 9292962 + 086de18 commit 0a5af36
Show file tree
Hide file tree
Showing 7 changed files with 251 additions and 168 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,7 @@ bin/*

# tmp directories
*/tmp
*/*/tmp
*/*/tmp

*.vcf
*.vcf.gz
75 changes: 75 additions & 0 deletions preprocess.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash

# credits:
# - https://www.biostars.org/p/78929/

# Path to the gzip-compressed VCF to split (first positional argument).
FILE=$1

# Input validation. Failures are reported on stderr and exit with a non-zero
# status so that a calling script or pipeline can tell the run did not happen.
if [[ -z "$FILE" ]]; then
    echo "Missing gz file name!" >&2
    exit 1
fi

if [[ -f "$FILE" ]]; then
    echo "$FILE exists"
else
    echo "$FILE doesn't exist" >&2
    exit 1
fi

echo
echo Preprocessing :
echo
echo "Splitting $FILE into individual VCFs using PERL"
echo
echo Step 1 : creating common and private txt files -
echo This may take a while...

# Both redirections below write into vcfs/, so make sure it exists first;
# otherwise the shell fails to open the output files before anything runs.
mkdir -p vcfs

# Step 1 streams the decompressed VCF through Perl exactly once:
#   STDERR -> vcfs/_vcf.common.txt : every "##" line verbatim, and the first
#             nine whitespace-separated fields of every other line (the column
#             header line and each record), one output line per input line.
#   STDOUT -> vcfs/_vcf.private.txt : for each column from index 9 onwards whose
#             value has a digit 1-9 as its third character (e.g. "0/1", "1|1"),
#             one record "<column name> <input line index> <value>".
# $lc is incremented once per input line, so the index stored in the private
# file is the position of the matching line inside the common file.
# The private records are sorted by column name, then numerically by line index.
# `gzip -dc` is used instead of `zcat` because some zcat implementations
# (BSD/macOS) look for a ".Z" suffix; "$FILE" is quoted to survive spaces.
time gzip -dc "$FILE" | perl -lane '
if (/^#/) { if (/^##/) { print STDERR } else {
print STDERR join "\t", @F[0..8]; @samples = @F;
} } else {
print STDERR join "\t", @F[0..8];
for ($i = 9; $i <= $#F; $i++) {
if ($F[$i] =~ /^..[1-9]/) {
print STDOUT join "\t", $samples[$i], $lc, $F[$i];
} } } $lc++' 2> vcfs/_vcf.common.txt | sort -k1,1 -k2,2n > vcfs/_vcf.private.txt

echo Step 2 : converting common and private txt files to individual VCF files -
echo This also may take a while...

# Step 2 rebuilds one VCF per sample from the two intermediate files.
#   BEGIN : load vcfs/_vcf.common.txt into @common and build $headers from its
#           leading "##" lines plus the first non-"##" line (no trailing newline,
#           so the sample name can be appended as the final column).
#   body  : vcfs/_vcf.private.txt is sorted by sample name, so a change in the
#           first field means a new sample starts: close the previous output,
#           open vcfs/split/<sample>.vcf and write the header. Every record is
#           then the common line at index $F[1] followed by the sample value.
#   END   : close the last output file.
# The output file is opened with the three-argument form and checked, so a
# failure to create it aborts instead of silently discarding that sample.
mkdir -p vcfs/split
time perl -lane 'BEGIN {
    open IN, "vcfs/_vcf.common.txt" or die $!;
    chomp(@common = <IN>);
    foreach (@common) {
        if (/^##/) { $headers .= "$_\n" } else { $headers .= $_; last }
    }
    close IN
}
if ($F[0] ne $previousSample) {
    close OUT if $previousSample;
    open OUT, ">", "vcfs/split/$F[0].vcf" or die "vcfs/split/$F[0].vcf: $!";
    print OUT "$headers\t$F[0]";
}
$previousSample = $F[0];
print OUT "$common[$F[1]]\t$F[2]";
END { close OUT }' vcfs/_vcf.private.txt

echo Step 3 : compressing individual VCF files -
echo This also may take a while...

# Step 3 gzips every per-sample VCF in place (-f overwrites an existing .gz).
time for file in vcfs/split/*vcf; do
    # When nothing matches, bash leaves the pattern as a literal string;
    # skip it rather than handing a non-existent path to gzip.
    [[ -e "$file" ]] || continue
    gzip -f "$file"
    # tabix -fp vcf $file.gz
done

# for file in split/*vcf.gz; do
# gunzip $file
# done

# rm *vcf.gz
# rm *vcf.gz.tbi

# Clean up: move the compressed per-sample VCFs up into vcfs/, remove the
# now-empty split directory, then delete the two intermediate text files.
split_dir=vcfs/split
mv "$split_dir"/*.vcf.gz vcfs/
rmdir "$split_dir"/

for scratch in vcfs/_vcf.private.txt vcfs/_vcf.common.txt; do
    rm "$scratch"
done
2 changes: 1 addition & 1 deletion src/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ func main() {

// Service Singletons
az := services.NewAuthzService(&cfg)
iz := services.NewIngestionService()
iz := services.NewIngestionService(es)

// Configure Server
e.Use(middleware.Recover())
Expand Down
8 changes: 3 additions & 5 deletions src/api/mvc/variants.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,6 @@ func VariantsCountBySampleId(c echo.Context) error {
}

func VariantsIngest(c echo.Context) error {
es := c.(*contexts.GohanContext).Es7Client

cfg := c.(*contexts.GohanContext).Config
vcfPath := cfg.Api.VcfPath
drsUrl := cfg.Drs.Url
Expand Down Expand Up @@ -115,7 +113,7 @@ func VariantsIngest(c echo.Context) error {
}
}

fmt.Printf("Found .vcf.gz files: %s\n", vcfGzfiles)
//fmt.Printf("Found .vcf.gz files: %s\n", vcfGzfiles)

// Locate fileName from request inside found files
for _, fileName := range fileNames {
Expand Down Expand Up @@ -213,7 +211,7 @@ func VariantsIngest(c echo.Context) error {
}

// --- load back into memory and process
ingestionService.ProcessVcf(vcfFilePath, drsFileId, es)
ingestionService.ProcessVcf(vcfFilePath, drsFileId)

// --- delete the temporary vcf file
os.Remove(vcfFilePath)
Expand All @@ -222,7 +220,7 @@ func VariantsIngest(c echo.Context) error {
// (WARNING : Only do this when running over a single file)
//os.RemoveAll(vcfTmpPath)

fmt.Printf("Ingest duration for file at %s : %s\n", vcfFilePath, time.Now().Sub(startTime))
fmt.Printf("Ingest duration for file at %s : %s\n", vcfFilePath, time.Since(startTime))

reqStat.State = ingest.Done
ingestionService.IngestRequestChan <- reqStat
Expand Down
6 changes: 3 additions & 3 deletions src/api/repositories/elasticsearch/elasticsearch.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ func GetDocumentsContainerVariantOrSampleIdInPositionRange(es *elasticsearch.Cli

// Temp
resultString := res.String()
fmt.Println(resultString)
// fmt.Println(resultString)
// --

// Declared an empty interface
Expand Down Expand Up @@ -316,7 +316,7 @@ func CountDocumentsContainerVariantOrSampleIdInPositionRange(es *elasticsearch.C

// Temp
resultString := res.String()
fmt.Println(resultString)
// fmt.Println(resultString)
// --

// Declared an empty interface
Expand Down Expand Up @@ -373,7 +373,7 @@ func GetBucketsByKeyword(es *elasticsearch.Client, keyword string) map[string]in

// Temp
resultString := res.String()
fmt.Println(resultString)
//fmt.Println(resultString)
// --

// Declared an empty interface
Expand Down
Loading

0 comments on commit 0a5af36

Please sign in to comment.