Skip to content

Commit

Permalink
wip abc testing code
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood committed Aug 1, 2024
1 parent aa60d98 commit 89eccfe
Showing 1 changed file with 98 additions and 16 deletions.
114 changes: 98 additions & 16 deletions src/bd_rhapsody/bd_rhapsody_sequence_analysis/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
meta = {
"name": "bd_rhapsody_sequence_analysis",
"executable": "target/docker/bd_rhapsody/bd_rhapsody_sequence_analysis/bd_rhapsody_sequence_analysis",
"resources_dir": "src/bd_rhapsody/bd_rhapsody_sequence_analysis",
"resources_dir": "src/bd_rhapsody",
"cpus": 8,
"memory_mb": 4096,
}
Expand All @@ -29,8 +29,9 @@
# Generate index
print("> Generate index", flush=True)
cwl_file = meta["resources_dir"] / "bd_rhapsody_make_reference.cwl"
gtf_file = meta["resources_dir"] / "test_data" / "reference_small.gtf"
fasta_file = meta["resources_dir"] / "test_data" / "reference_small.fa"
reference_small_gtf = meta["resources_dir"] / "test_data" / "reference_small.gtf"
reference_small_fa = meta["resources_dir"] / "test_data" / "reference_small.fa"
bdabseq_panel_fa = meta["resources_dir"] / "test_data" / "BDAbSeq_ImmuneDiscoveryPanel.fasta"

config_file = Path("reference_config.yml")
reference_file = Path("Rhap_reference.tar.gz")
Expand All @@ -43,9 +44,9 @@
".",
str(cwl_file),
"--Genome_fasta",
str(fasta_file),
str(reference_small_fa),
"--Gtf",
str(gtf_file),
str(reference_small_gtf),
"--Extra_STAR_params",
"--genomeSAindexNbases 4"
])
Expand All @@ -57,12 +58,14 @@
import gffutils

# Load FASTA sequence
with open(str(fasta_file), "r") as handle:
with open(str(reference_small_fa), "r") as handle:
reference_fasta_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
with open(str(bdabseq_panel_fa), "r") as handle:
bdabseq_panel_fasta_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))

# create in memory db
reference_gtf_db = gffutils.create_db(
str(gtf_file),
str(reference_small_gtf),
dbfn=":memory:",
force=True,
keep_order=True,
Expand Down Expand Up @@ -142,14 +145,14 @@ def generate_bd_wta_transcript(
return sequence


def generate_bd_read(
def generate_bd_wta_read(
cell_index: int = 0,
bead_version: str = "EnhV2",
umi_length: int = 14,
transcript_length: int = 42,
) -> Tuple[str, str]:
"""
Generate a pair of BD Rhapsody FASTQ reads for a given cell index.
Generate a BD Rhapsody WTA read pair for a given cell index.
Args:
cell_index: The cell index to generate reads for.
Expand Down Expand Up @@ -218,7 +221,80 @@ def generate_bd_wta_fastq_files(
r2_reads = ""
for cell_index in range(num_cells):
for _ in range(num_reads_per_cell):
r1, r2 = generate_bd_read(cell_index)
r1, r2 = generate_bd_wta_read(cell_index)
r1_reads += r1
r2_reads += r2

return r1_reads, r2_reads

def generate_bd_abc_read(
cell_index: int = 0,
bead_version: str = "EnhV2",
umi_length: int = 14,
transcript_length: int = 72,
) -> Tuple[str, str]:
"""
Generate a BD Rhapsody ABC read pair for a given cell index.
Args:
cell_index: The cell index to generate reads for.
bead_version: The bead version to use for generating the cell label.
umi_length: The length of the UMI to generate.
transcript_length: The length of the transcript to generate
Returns:
A tuple of two strings, the first string being the R1 read and the second string being the R2 read.
"""
# generate metadata
per_row = np.floor((32967 - 1000) / 9)
per_col = np.floor((37059 - 1000) / 9)

assert cell_index >= 0 and cell_index < per_row * per_col, f"cell_index must be between 0 and {per_row} * {per_col}"
x = 1000 + (cell_index % per_row) * 9
y = 1000 + (cell_index // per_row) * 9
meta_r1 = generate_bd_read_metadata(x=x, y=y, illumina_flag="1:N:0")
meta_r2 = generate_bd_read_metadata(x=x, y=y, illumina_flag="2:N:0")

# generate r1 (cls1 + link + cls2 + link + cls3 + umi)
assert cell_index >= 0 and cell_index < 384 * 384 * 384
cell_label = index_to_sequence(cell_index + 1, bead_version=bead_version)
# sample random umi
umi = "".join(random.choices("ACGT", k=umi_length))
quality_r1 = "I" * (len(cell_label) + len(umi))
r1 = f"{meta_r1}\n{cell_label}{umi}\n+\n{quality_r1}\n"

# generate r2 by sampling sequence from bdabseq_panel_fa
abseq_seq = str(random.choice(list(bdabseq_panel_fasta_dict.values())).seq)
abc_prefix = "N" #+ "".join(random.choices("ACGT", k=12))
abc_data = abseq_seq[:transcript_length - len(abc_prefix)]
abc_suffix = "A" * (transcript_length - len(abc_prefix) - len(abc_data))

abc_transcript = f"{abc_prefix}{abc_data}{abc_suffix}"

quality_r2 = "#" + "I" * (len(abc_transcript) - 1)
r2 = f"{meta_r2}\n{abc_transcript}\n+\n{quality_r2}\n"

return r1, r2

def generate_bd_abc_fastq_files(
num_cells: int = 100,
num_reads_per_cell: int = 1000,
) -> Tuple[str, str]:
"""
Generate BD Rhapsody ABC FASTQ files for a given number of cells and transcripts per cell.
Args:
num_cells: The number of cells to generate
num_reads_per_cell: The number of reads to generate per cell
Returns:
A tuple of two strings, the first string being the R1 reads and the second string being the R2 reads.
"""
r1_reads = ""
r2_reads = ""
for cell_index in range(num_cells):
for _ in range(num_reads_per_cell):
r1, r2 = generate_bd_abc_read(cell_index)
r1_reads += r1
r2_reads += r2

Expand All @@ -227,23 +303,30 @@ def generate_bd_wta_fastq_files(

# Prepare WTA, ABC, and SMK test data
print("> Prepare WTA test data", flush=True)

wta_reads_r1_str, wta_reads_r2_str = generate_bd_wta_fastq_files(num_cells=100, num_reads_per_cell=1000)
with gzip.open("WTAreads_R1.fq.gz", "wt") as f:
f.write(wta_reads_r1_str)
with gzip.open("WTAreads_R2.fq.gz", "wt") as f:
f.write(wta_reads_r2_str)

print("> Prepare ABC test data", flush=True)
abc_reads_r1_str, abc_reads_r2_str = generate_bd_abc_fastq_files(num_cells=100, num_reads_per_cell=1000)
with gzip.open("ABCreads_R1.fq.gz", "wt") as f:
f.write(abc_reads_r1_str)
with gzip.open("ABCreads_R2.fq.gz", "wt") as f:
f.write(abc_reads_r2_str)

#########################################################################################

# Run executable
print(f">> Run {meta['name']}", flush=True)
output_dir = Path("output")
subprocess.run([
meta['executable'],
"--reads=WTAreads_R1.fq.gz",
"--reads=WTAreads_R2.fq.gz",
"--reads=WTAreads_R1.fq.gz;WTAreads_R2.fq.gz",
"--reads=ABCreads_R1.fq.gz;ABCreads_R2.fq.gz",
f"--reference_archive={reference_file}",
f"--abseq_reference={bdabseq_panel_fa}",
"--output_dir=output",
"--exact_cell_count=100",
f"---cpus={meta['cpus'] or 1}",
Expand Down Expand Up @@ -277,16 +360,15 @@ def generate_bd_wta_fastq_files(
assert data.n_obs == 100, "Number of cells is incorrect"
assert "rna" in data.mod, "RNA data is missing"


data_rna = data.mod["rna"]
assert data_rna.n_vars == 1, "Number of genes is incorrect"

# row sum should be greater than 950
assert data_rna.X.sum(axis=1).min() > 950, "Number of reads per cell is incorrect"
assert data_rna.var.Raw_Reads.sum() == 100000, "Number of reads is incorrect"

# TODO: check contents
# TODO: check whether individual outputs also work
# TODO: add ABC, VDJ, SMK, ATAC, and targeted RNA to test
# TODO: add VDJ, SMK, ATAC, and targeted RNA to test


#########################################################################################
Expand Down

0 comments on commit 89eccfe

Please sign in to comment.