diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..4d7fb32 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,2 @@ +/.quarto/ +_site diff --git a/docs/_quarto.yml b/docs/_quarto.yml new file mode 100644 index 0000000..c06fca0 --- /dev/null +++ b/docs/_quarto.yml @@ -0,0 +1,23 @@ +project: + type: website + preview: + port: 4100 + +website: + title: "HTRNASeq" + repo-url: https://github.com/viash-hub/htrnaseq + repo-actions: [edit, issue] + + + sidebar: + style: "docked" + search: true + contents: + - index.qmd + - architecture/overview.qmd + - architecture + - data_origin.qmd +format: + html: + mermaid: + theme: neutral diff --git a/docs/architecture/overview.qmd b/docs/architecture/overview.qmd new file mode 100644 index 0000000..e63e0db --- /dev/null +++ b/docs/architecture/overview.qmd @@ -0,0 +1,38 @@ +# Architecture + +```{mermaid} +flowchart TB + + WellBAM --> parse_star_bam + + subgraph f_data["Generate Feature Data"] + annotation((GTF)) --> create_f_data --> f_data_item(("Feature\nData")) + end + subgraph demux_mapping["Demultiplexing & mapping"] + + demux_pools((Fastqs)) & demux_input_genome(("Genome")) & demux_barcodes(("Barcodes")) ----> cutadapt_parallel_map["Cutadapt & STAR"] +--> WellBAM(("Well\nBAM files")) & WellSTARLogs((Well\nSTAR logs)) & CountMatrices((Count\nMatrices)) + end + subgraph parse_star_output["Parse BAM Output Pt. 1"] + parse_star_bam["generate_well_statistics"] --> parsedBAMLogs(("BAM\nStatistics")) + end + subgraph test[" "] + parsedBAMLogs & WellSTARLogs & CountMatrices --> join_well{{"Group Wells\nInto Pools "}} --> bam_list(("BAM statistics \nfiles list")) & star_log_list(("Pool STAR\nlogs list")) + join_well --> count_matrix_list(("Count\nMatrix List")) + end + + subgraph parse_star_output_2["Parse BAM output Pt. 
2"] + bam_list --> combine_bam_statistics["generate_pool_statistics"] --> bam_statistics_pool(("Pool BAM\nStatistics")) + end + subgraph parse_star_output_3["Parse STAR Logs"] + star_log_list --> combine_star_logs --> parsed_sta_logs(("Parsed\nSTAR Logs")) + end + subgraph p_data["Generate Phenotype Data"] + parsed_sta_logs & bam_statistics_pool -->create_p_data--> p_data_item(("Phenotype\nData")) + + end + subgraph create_eset_sub["Create ExpressionSet"] + f_data_item ------------> create_eset + count_matrix_list & p_data_item -->create_eset--> eset_item(("eSet")) + end +``` \ No newline at end of file diff --git a/docs/data_origin.qmd b/docs/data_origin.qmd new file mode 100644 index 0000000..01f3640 --- /dev/null +++ b/docs/data_origin.qmd @@ -0,0 +1,61 @@ +# Data Origin + +## Feature Data + +The following metadata is available for the features; it can be accessed using `featureData(eset)`: + +| Column name | Data origin | Origin field | Component | Comment | +|-----------------------|--------------------------|---------------------------|-----------------| -------------------------------| +| start | Genome annotation (gtf) | Column 4 | create_fdata | | +| end | Genome annotation (gtf) | Column 5 | create_fdata | | +| strand | Genome annotation (gtf) | Column 7 | create_fdata | | +| gene_biotype | Genome annotation (gtf) | Column 9 | create_fdata | | +| gene_id | Genome annotation (gtf) | Column 9 | create_fdata | | +| gene_name | Genome annotation (gtf) | Column 9 | create_fdata | | +| gene_source | Genome annotation (gtf) | Column 9 | create_fdata | | +| gene_version | Genome annotation (gtf) | Column 9 | create_fdata | | +| transcript_id | Genome annotation (gtf) | Column 9 | create_fdata | | +| ENSEMBL_with_version | Calculated | gene_id | create_fdata | gene_id starts with 'ENS' | +| ENSEMBL | Calculated | ENSEMBL_with_version | create_fdata | Remove version number from end | +| SYMBOL | Calculated | '(n/N)ame' or 'gene_name' | create_fdata | Whichever 
origin is available | + +## Sample information (phenoData) + +The following metadata is available for the samples; it can be accessed using `phenoData(eset)`: + +| Column name | Data origin | Origin field | Component | Comment | +|--------------------------------------------------|----------------------------|-----------------------------------------------|--------------------------| ----------------------------------| +| NumberOfInputReads | STAR: Log.final.out | Number of input reads | combine_star_logs | | +| NumberOfMappedReads | STAR: Log.final.out | Uniquely mapped reads number | combine_star_logs | | +| PctMappedReads | STAR: Log.final.out | Uniquely mapped reads % | combine_star_logs | | +| NumberOfReadsMappedToMultipleLoci | STAR: Log.final.out | Number of reads mapped to multiple loci | combine_star_logs | | +| PectOfReadsMappedToMultipleLoci | STAR: Log.final.out | % of reads mapped to multiple loci | combine_star_logs | | +| NumberOfReadsMappedToTooManyLoci | STAR: Log.final.out | Number of reads mapped to too many loci | combine_star_logs | | +| PectOfReadsMappedToTooManyLoci | STAR: Log.final.out | % of reads mapped to too many loci | combine_star_logs | | +| NumberOfReadsUnmappedTooManyMismatches | STAR: Log.final.out | Number of reads unmapped: too many mismatches | combine_star_logs | | +| PectOfReadsUnmappedTooManyMismatches | STAR: Log.final.out | % of reads unmapped: too many mismatches | combine_star_logs | | +| NumberOfReadsUnmappedTooShort | STAR: Log.final.out | Number of reads unmapped: too short | combine_star_logs | | +| PectOfReadsUnmappedTooShort | STAR: Log.final.out | % of reads unmapped: too short | combine_star_logs | | +| NumberOfReadsUnmappedOther | STAR: Log.final.out | Number of reads unmapped: other | combine_star_logs | | +| PectOfReadsUnmappedOther | STAR: Log.final.out | % of reads unmapped: other | combine_star_logs | | +| ReadsWithValidBarcodes | STAR: summary.csv | Reads With Valid Barcodes | combine_star_logs | | +| SequencingSaturation | STAR: summary.csv | Sequencing Saturation | combine_star_logs | | +| 
Q30BasesInCB+UMI | STAR: summary.csv | Q30 Bases in CB+UMI | combine_star_logs | | +| ReadsMappedToTranscriptome:Unique+MultipeGenes | STAR: summary.csv | Reads Mapped to Transcriptome: Unique+Multiple Genes | combine_star_logs | | +| EstimatedNumberOfCells | STAR: summary.csv | Estimated Number of Cells | combine_star_logs | | +| FractionOfReadsInCells | STAR: summary.csv | Fraction of Reads in Cells | combine_star_logs | | +| MeanReadsPerCell | STAR: summary.csv | Mean Reads per Cell | combine_star_logs | | +| NumberOfUMIs | STAR: summary.csv | UMIsInCells | combine_star_logs | | +| NumberOfCountedReads | STAR: ReadsPerGene.out.tab | Second column | combine_star_logs | | +| NumberOfReads | Calculated from BAM file | CB tag (= error-corrected cell barcodes) | generate_well_statistics | Count the number per CB tag | +| NumberOfGenes | Calculated from BAM file | GX tag (= gene ID) | generate_well_statistics | Number of unique GX tags | +| NumberOfMTReads | Calculated from BAM file | GX tag (= gene ID) | generate_pool_statistics | NumberOfReads, filtered on string | +| pctMT | Calculated from BAM file | GX tag (= gene ID) | generate_pool_statistics | Based on NumberOfMTReads | +| NumberOfERCCReads | Calculated from BAM file | GX tag (= gene ID) | generate_pool_statistics | NumberOfReads, filtered on string | +| pctERCC | Calculated from BAM file | GX tag (= gene ID) | generate_pool_statistics | Based on NumberOfERCCReads | +| NumberOfChromReads | Calculated from BAM file | GX tag (= gene ID) | generate_pool_statistics | NumberOfReads, filtered on string | +| pctChrom | Calculated from BAM file | GX tag (= gene ID) | generate_pool_statistics | Based on NumberOfChromReads | + +## Count data + +One row in the count matrix is populated from the contents of the `Solo.out/Gene/raw/matrix.mtx` file (i.e. each file represents the information for one well). 
\ No newline at end of file diff --git a/docs/index.qmd b/docs/index.qmd new file mode 100644 index 0000000..0c3fbf6 --- /dev/null +++ b/docs/index.qmd @@ -0,0 +1,6 @@ +--- +title: HTRNASeq +--- + +# Introduction +