Skip to content

Commit

Permalink
add smk data
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood committed Aug 3, 2024
1 parent b536557 commit 6e59f50
Show file tree
Hide file tree
Showing 3 changed files with 168 additions and 9 deletions.
124 changes: 115 additions & 9 deletions src/bd_rhapsody/bd_rhapsody_sequence_analysis/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
reference_small_gtf = meta["resources_dir"] / "test_data" / "reference_small.gtf"
reference_small_fa = meta["resources_dir"] / "test_data" / "reference_small.fa"
bdabseq_panel_fa = meta["resources_dir"] / "test_data" / "BDAbSeq_ImmuneDiscoveryPanel.fasta"
sampletagsequences_fa = meta["resources_dir"] / "test_data" / "SampleTagSequences_HomoSapiens_ver1.fasta"

config_file = Path("reference_config.yml")
reference_file = Path("Rhap_reference.tar.gz")
Expand Down Expand Up @@ -62,6 +63,8 @@
reference_fasta_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
with open(str(bdabseq_panel_fa), "r") as handle:
bdabseq_panel_fasta_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
with open(str(sampletagsequences_fa), "r") as handle:
sampletagsequences_fasta_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))

# create in memory db
reference_gtf_db = gffutils.create_db(
Expand Down Expand Up @@ -306,6 +309,85 @@ def generate_bd_abc_fastq_files(

return r1_reads, r2_reads

def generate_bd_smk_read(
cell_index: int = 0,
bead_version: str = "EnhV2",
umi_length: int = 14,
transcript_length: int = 72,
num_sample_tags: int = 3,
):
"""
Generate a BD Rhapsody SMK read pair for a given cell index.
Args:
cell_index: The cell index to generate reads for.
bead_version: The bead version to use for generating the cell label.
umi_length: The length of the UMI to generate.
transcript_length: The length of the transcript to generate
num_sample_tags: The number of sample tags to use
Returns:
A tuple of two strings, the first string being the R1 read and the second string being the R2 read.
"""
# generate metadata
per_row = np.floor((32967 - 1000) / 9)
per_col = np.floor((37059 - 1000) / 9)

assert cell_index >= 0 and cell_index < per_row * per_col, f"cell_index must be between 0 and {per_row} * {per_col}"
x = 1000 + (cell_index % per_row) * 9
y = 1000 + (cell_index // per_row) * 9
instrument_id = "A00226"
run_id = "970"
flowcell_id = "H5FGVDMXY"

meta_r1 = generate_bd_read_metadata(instrument_id=instrument_id, run_id=run_id, flowcell_id=flowcell_id, x=x, y=y, illumina_flag="1:N:0")
meta_r2 = generate_bd_read_metadata(instrument_id=instrument_id, run_id=run_id, flowcell_id=flowcell_id, x=x, y=y, illumina_flag="2:N:0")

# generate r1 (cls1 + link + cls2 + link + cls3 + umi)
assert cell_index >= 0 and cell_index < 384 * 384 * 384
cell_label = index_to_sequence(cell_index + 1, bead_version=bead_version)
# sample random umi
umi = "".join(random.choices("ACGT", k=umi_length))
quality_r1 = "I" * (len(cell_label) + len(umi))
r1 = f"{meta_r1}\n{cell_label}{umi}\n+\n{quality_r1}\n"

# generate r2 by selecting the cell_index %% num_sample_tags sample tags
sampletag_index = cell_index % num_sample_tags
sampletag_seq = str(list(sampletagsequences_fasta_dict.values())[sampletag_index].seq)
smk_data = sampletag_seq[:transcript_length]
smk_suffix = "A" * (transcript_length - len(smk_data))
quality_r2 = "I" * len(smk_data) + "#" * len(smk_suffix)
r2 = f"{meta_r2}\n{smk_data}{smk_suffix}\n+\n{quality_r2}\n"

return r1, r2

def generate_bd_smk_fastq_files(
num_cells: int = 100,
num_reads_per_cell: int = 1000,
num_sample_tags: int = 3,
) -> Tuple[str, str]:
"""
Generate BD Rhapsody SMK FASTQ files for a given number of cells and transcripts per cell.
Args:
num_cells: The number of cells to generate
num_reads_per_cell: The number of reads to generate per cell
num_sample_tags: The number of sample tags to use
Returns:
A tuple of two strings, the first string being the R1 reads and the second string being the R2 reads.
"""
r1_reads = ""
r2_reads = ""
for cell_index in range(num_cells):
for _ in range(num_reads_per_cell):
r1, r2 = generate_bd_smk_read(cell_index, num_sample_tags=num_sample_tags)
r1_reads += r1
r2_reads += r2

return r1_reads, r2_reads

#########################################################################################

# Prepare WTA, ABC, and SMK test data
print("> Prepare WTA test data", flush=True)
Expand All @@ -322,6 +404,13 @@ def generate_bd_abc_fastq_files(
with gzip.open("ABCreads_R2.fq.gz", "wt") as f:
f.write(abc_reads_r2_str)

print("> Prepare SMK test data", flush=True)
smk_reads_r1_str, smk_reads_r2_str = generate_bd_smk_fastq_files(num_cells=100, num_reads_per_cell=1000, num_sample_tags=3)
with gzip.open("SMKreads_R1.fq.gz", "wt") as f:
f.write(smk_reads_r1_str)
with gzip.open("SMKreads_R2.fq.gz", "wt") as f:
f.write(smk_reads_r2_str)

#########################################################################################

# Run executable
Expand All @@ -330,14 +419,17 @@ def generate_bd_abc_fastq_files(
subprocess.run([
meta['executable'],
"--reads=WTAreads_R1.fq.gz;WTAreads_R2.fq.gz",
"--reads=ABCreads_R1.fq.gz;ABCreads_R2.fq.gz",
f"--reference_archive={reference_file}",
"--reads=ABCreads_R1.fq.gz;ABCreads_R2.fq.gz",
f"--abseq_reference={bdabseq_panel_fa}",
"--reads=SMKreads_R1.fq.gz;SMKreads_R2.fq.gz",
"--tag_names=1-Sample1;2-Sample2;3-Sample3",
"--sample_tags_version=human",
"--output_dir=output",
"--exact_cell_count=100",
f"---cpus={meta['cpus'] or 1}",
f"---memory={meta['memory_mb'] or 2048}mb",
"--output_seurat=seurat.rds",
# "--output_seurat=seurat.rds",
"--output_mudata=mudata.h5mu",
"--metrics_summary=metrics_summary.csv",
"--pipeline_report=pipeline_report.html",
Expand All @@ -351,11 +443,11 @@ def generate_bd_abc_fastq_files(
assert (output_dir / "sample_Pipeline_Report.html").exists()
assert (output_dir / "sample_RSEC_MolsPerCell_MEX.zip").exists()
assert (output_dir / "sample_RSEC_MolsPerCell_Unfiltered_MEX.zip").exists()
assert (output_dir / "sample_Seurat.rds").exists()
# assert (output_dir / "sample_Seurat.rds").exists()
assert (output_dir / "sample.h5mu").exists()

# check individual outputs
assert Path("seurat.rds").exists()
# assert Path("seurat.rds").exists()
assert Path("mudata.h5mu").exists()
assert Path("metrics_summary.csv").exists()
assert Path("pipeline_report.html").exists()
Expand All @@ -365,17 +457,31 @@ def generate_bd_abc_fastq_files(

assert data.n_obs == 100, "Number of cells is incorrect"
assert "rna" in data.mod, "RNA data is missing"
assert "prot" in data.mod, "Protein data is missing"


# check rna data
data_rna = data.mod["rna"]
assert data_rna.n_vars == 1, "Number of genes is incorrect"

# row sum should be greater than 950
assert data_rna.X.sum(axis=1).min() > 950, "Number of reads per cell is incorrect"
assert data_rna.var.Raw_Reads.sum() == 100000, "Number of reads is incorrect"

# TODO: add VDJ, SMK, ATAC, and targeted RNA to test

# check prot data
data_prot = data.mod["prot"]
assert data_prot.n_vars == bdabseq_panel_fa.n_seqs, "Number of proteins is incorrect"
assert data_prot.X.sum(axis=1).min() > 950, "Number of reads per cell is incorrect"
assert data_prot.var.Raw_Reads.sum() == 100000, "Number of reads is incorrect"

# check smk data
expected_sample_tags = (["SampleTag01_hs", "SampleTag02_hs", "SampleTag03_hs"] * 34)[:100]
expected_sample_names = (["Sample1", "Sample2", "Sample3"] * 34)[:100]
sample_tags = data_rna.obs["Sample_Tag"]
assert sample_tags.nunique() == 3, "Number of sample tags is incorrect"
assert sample_tags.tolist() == expected_sample_tags, "Sample tags are incorrect"
sample_names = data_rna.obs["Sample_Name"]
assert sample_names.nunique() == 3, "Number of sample names is incorrect"
assert sample_names.tolist() == expected_sample_names, "Sample names are incorrect"

# TODO: add VDJ, ATAC, and targeted RNA to test

#########################################################################################

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
>SampleTag01_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGATTCAAGGGCAGCCGCGTCACGATTGGATACGACTGTTGGACCGG
>SampleTag02_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGTGGATGGGATAAGTGCGTGATGGACCGAAGGGACCTCGTGGCCGG
>SampleTag03_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGCGGCTCGTGCTGCGTCGTCTCAAGTCCAGAAACTCCGTGTATCCT
>SampleTag04_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGATTGGGAGGCTTTCGTACCGCTGCCGCCACCAGGTGATACCCGCT
>SampleTag05_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGCTCCCTGGTGTTCAATACCCGATGTGGTGGGCAGAATGTGGCTGG
>SampleTag06_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGTTACCCGCAGGAAGACGTATACCCCTCGTGCCAGGCGACCAATGC
>SampleTag07_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGTGTCTACGTCGGACCGCAAGAAGTGAGTCAGAGGCTGCACGCTGT
>SampleTag08_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGCCCCACCAGGTTGCTTTGTCGGACGAGCCCGCACAGCGCTAGGAT
>SampleTag09_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGGTGATCCGCGCAGGCACACATACCGACTCAGATGGGTTGTCCAGG
>SampleTag10_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGGCAGCCGGCGTCGTACGAGGCACAGCGGAGACTAGATGAGGCCCC
>SampleTag11_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGCGCGTCCAATTTCCGAAGCCCCGCCCTAGGAGTTCCCCTGCGTGC
>SampleTag12_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGGCCCATTCATTGCACCCGCCAGTGATCGACCCTAGTGGAGCTAAG
29 changes: 29 additions & 0 deletions src/bd_rhapsody/test_data/script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,32 @@ TTTGGAGGGTAGCTAGTTGCAGTTCGTGGTCGTTTC
>IgD|IGHD|AHS0058|pAbO Catalog_940026
TGAGGGATGTATAGCGAGAATTGCGACCGTAGACTT
EOF

# this was obtained by running the command:
# docker run bdgenomics/rhapsody:2.2.1 cat /rhapsody/control_files/SampleTagSequences_HomoSapiens_ver1.fasta
cat > $OUT_DIR/SampleTagSequences_HomoSapiens_ver1.fasta <<EOF
>SampleTag01_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGATTCAAGGGCAGCCGCGTCACGATTGGATACGACTGTTGGACCGG
>SampleTag02_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGTGGATGGGATAAGTGCGTGATGGACCGAAGGGACCTCGTGGCCGG
>SampleTag03_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGCGGCTCGTGCTGCGTCGTCTCAAGTCCAGAAACTCCGTGTATCCT
>SampleTag04_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGATTGGGAGGCTTTCGTACCGCTGCCGCCACCAGGTGATACCCGCT
>SampleTag05_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGCTCCCTGGTGTTCAATACCCGATGTGGTGGGCAGAATGTGGCTGG
>SampleTag06_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGTTACCCGCAGGAAGACGTATACCCCTCGTGCCAGGCGACCAATGC
>SampleTag07_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGTGTCTACGTCGGACCGCAAGAAGTGAGTCAGAGGCTGCACGCTGT
>SampleTag08_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGCCCCACCAGGTTGCTTTGTCGGACGAGCCCGCACAGCGCTAGGAT
>SampleTag09_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGGTGATCCGCGCAGGCACACATACCGACTCAGATGGGTTGTCCAGG
>SampleTag10_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGGCAGCCGGCGTCGTACGAGGCACAGCGGAGACTAGATGAGGCCCC
>SampleTag11_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGCGCGTCCAATTTCCGAAGCCCCGCCCTAGGAGTTCCCCTGCGTGC
>SampleTag12_hs|stAbO
GTTGTCAAGATGCTACCGTTCAGAGGCCCATTCATTGCACCCGCCAGTGATCGACCCTAGTGGAGCTAAG
EOF

0 comments on commit 6e59f50

Please sign in to comment.