Fix count testing and fastq import tests
aghozlane committed Oct 30, 2024
1 parent 5791053 commit b1df6e0
Showing 6 changed files with 59 additions and 20 deletions.
35 changes: 31 additions & 4 deletions meteor/counter.py
@@ -82,7 +82,6 @@ def launch_mapping(self) -> None:
self.mapping_type,
self.trim,
self.alignment_number,
self.identity_threshold,
)
mapping_process.execute()

@@ -93,11 +92,13 @@ def write_table(self, cramfile: Path, outfile: Path) -> None:
:param cramfile: (Path) cram file to count
:param outfile: (Path) count table
:return: The total read count of aligned sequences
"""
# indStats = pd.read_csv(StringIO(pysam.idxstats(cramfile)), sep = '\t',
# header = None, names = ['contig', 'length', 'mapped', 'unmapped'])
# create count table
table = idxstats(str(cramfile.resolve()))
total_reads = 0
# write the count table
with lzma.open(outfile, "wt", preset=0) as out:
out.write("gene_id\tgene_length\tvalue\n")
@@ -110,6 +111,8 @@ def write_table(self, cramfile: Path, outfile: Path) -> None:
# ), "Gene length does not match counted nucleotides"
s = "\t".join(line.split("\t")[0:3])
out.write(f"{s}\n")
total_reads += int(line.split("\t")[2])
return total_reads
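
For context, `pysam.idxstats` (referenced in the commented-out lines above) returns the idxstats report as a single string with one tab-separated record per reference: name, length, mapped reads, unmapped reads. The slice `[0:3]` keeps name/length/mapped, and column 2 feeds the running total. A minimal standalone sketch of the same accumulation, assuming a plain idxstats string; the `*` filter for the unmapped bucket is an assumption of this sketch, not necessarily present in the method:

import lzma

def write_count_table(idxstats_text: str, outfile: str) -> int:
    """Write gene_id/gene_length/value rows; return the total mapped reads."""
    total_reads = 0
    with lzma.open(outfile, "wt", preset=0) as out:
        out.write("gene_id\tgene_length\tvalue\n")
        for line in idxstats_text.rstrip("\n").split("\n"):
            fields = line.split("\t")
            if fields[0] == "*":  # idxstats' unmapped bucket, not a gene
                continue
            out.write("\t".join(fields[0:3]) + "\n")
            total_reads += int(fields[2])  # column 2 = mapped read count
    return total_reads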

def get_aligned_nucleotides(self, element) -> Iterator[int]:
"""Select aligned nucleotides
@@ -118,6 +121,15 @@ def get_aligned_nucleotides(self, element) -> Iterator[int]:
"""
yield from (item[1] for item in element.cigartuples if item[0] < 3)
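
In pysam's numeric CIGAR encoding, 0, 1, and 2 are M (match/mismatch), I (insertion), and D (deletion), while 3 and above are skips, clips, and padding, so `item[0] < 3` keeps the M/I/D operations and drops the rest. A quick hand check with a made-up CIGAR:

# pysam (operation, length) pairs for a 5S 60M 2I 1D 30M alignment;
# codes: 0=M, 1=I, 2=D, 3=N (skip), 4=S (soft clip), 5=H (hard clip)
cigartuples = [(4, 5), (0, 60), (1, 2), (2, 1), (0, 30)]
aligned = sum(length for op, length in cigartuples if op < 3)
assert aligned == 93  # 60 + 2 + 1 + 30; the soft clip is excluded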

def set_counter_config(self, counted_reads: int) -> dict:
"""Build the counting section of the census configuration.

:param counted_reads: (int) Total number of reads counted
:return: (dict) Counting configuration fragment
"""
return {
"counting": {
"counted_reads": counted_reads,
"identity_threshold": round(self.identity_threshold, 2),
}
}
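
The fragment returned here is merged into the census stage-1 JSON in `launch_counting` below. A hedged usage sketch, where `counter`, `census_json`, and `stage1_json` stand in for the objects used there, and the 0.95 threshold is a hypothetical value:

config = counter.set_counter_config(counted_reads=1234)
# -> {"counting": {"counted_reads": 1234, "identity_threshold": 0.95}}
census_json.update(config)  # dict.update() merges in place and returns None...
counter.save_config(census_json, stage1_json)  # ...so merge first, then persist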

def filter_alignments(
self, cramdesc: AlignmentFile
) -> tuple[dict[str, list[AlignedSegment]], dict[str, list[int]]]:
@@ -340,11 +352,15 @@ def write_stat(self, output: Path, abundance_dict: dict, database: dict) -> None
:param output: (Path) output filename
:param abundance_dict: (dict) contains abundance of each reference gene
:param database: (dict) contains length of each reference gene
:return: (float) The total number of reads aligned
"""
total_reads = 0
with lzma.open(output, "wt", preset=0) as out:
out.write("gene_id\tgene_length\tvalue\n")
for genes, abundance in sorted(abundance_dict.items()):
out.write(f"{genes}\t{database[genes]}\t{abundance}\n")
total_reads += abundance
return total_reads
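
A worked miniature with hypothetical values, assuming a standalone version of the method above; smart-shared counting can yield fractional abundances, which is why the return is documented as a float:

from pathlib import Path

abundances = {"geneB": 3.5, "geneA": 12}  # hypothetical gene abundances
lengths = {"geneA": 900, "geneB": 1500}   # reference gene lengths
total = write_stat(Path("counts.tsv.xz"), abundances, lengths)
# rows come out sorted by gene_id (geneA first); total == 15.5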

def save_cram_strain(
self,
@@ -395,6 +411,7 @@ def launch_counting(
cramfile_strain: Path,
count_file: Path,
ref_json: dict,
census_json: dict,
):
"""Function that count reads from a cram file, using the given methods in count:
"total" or "shared" or "unique".
@@ -437,7 +454,7 @@ def launch_counting(
# Calculate reference abundance & write count table
abundance = self.compute_abs_meteor(database, unique_on_gene, multiple)
# abundance = self.compute_abs(unique_on_gene, multiple)
self.write_stat(count_file, abundance, database)
total_read_count = self.write_stat(count_file, abundance, database)

else:
if self.counting_type == "unique":
@@ -475,7 +492,14 @@ def launch_counting(
str(cramfile_unsorted.resolve()),
catch_stdout=False,
)
self.write_table(cramfile_sorted, count_file)
total_read_count = self.write_table(cramfile_sorted, count_file)
config = self.set_counter_config(total_read_count)
stage1_json = (
self.meteor.mapping_dir
/ f"{census_json['sample_info']['sample_name']}_census_stage_1.json"
)
# dict.update() mutates in place and returns None, so merge the
# counting section first, then save the updated census dictionary.
census_json.update(config)
self.save_config(census_json, stage1_json)
if self.keep_filtered_alignments:
cramfile_strain_unsorted = Path(mkstemp(dir=self.meteor.tmp_dir)[1])
self.save_cram_strain(
@@ -532,6 +556,7 @@ def execute(self) -> None:

# mapping of each sample against reference
for library in census_json_files:
logging.debug("Processing library %s", library)
census_json = self.read_json(library)
sample_info = census_json["sample_info"]
stage1_dir = self.meteor.mapping_dir / sample_info["sample_name"]
@@ -571,7 +596,9 @@ def execute(self) -> None:
/ f"{sample_info['sample_name']}.tsv.xz"
)
start = perf_counter()
self.launch_counting(raw_cram_file, cram_file, count_file, ref_json)
self.launch_counting(
raw_cram_file, cram_file, count_file, ref_json, census_json
)
logging.info("Completed counting in %f seconds", perf_counter() - start)
if not self.keep_all_alignments:
logging.info(
Expand Down
16 changes: 7 additions & 9 deletions meteor/fastqimporter.py
@@ -145,21 +145,19 @@ def execute(self) -> None:
sys.exit(1)
else:
tag = "single"
if self.ispaired:
sample_name = self.get_paired_dirname(fastq_file.name, tag)
else:
sample_name = full_sample_name
# split the full sample name (in fact library/run name) in order
# to extract sample_name according to the regex mask
if self.mask_sample_name:
full_sample_name_array = re.search(
self.mask_sample_name, full_sample_name
)
if full_sample_name_array:
sample_name = full_sample_name_array[0]
sample_name_array = re.search(self.mask_sample_name, sample_name)
if sample_name_array:
sample_name = sample_name_array[0]
else:
logging.warning("Regular expression does not match %s", fastq_file)
continue
if self.ispaired:
sample_name = self.get_paired_dirname(fastq_file.name, tag)
else:
sample_name = full_sample_name
logging.info("Importing %s in sample %s", fastq_file, sample_name)
# Create directory for the sample and symlink fastq file into
samples_names.add(sample_name)
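
Net effect of the reordering above: the regex mask is now applied to the already-resolved sample name (paired tag handled first) rather than to the raw library name, and non-matching files are skipped with a warning. A small illustration of the mask step, with a hypothetical pattern:

import re

sample_name = "sample1_lib2"  # already resolved from the fastq file name
mask = r"sample\d+"           # hypothetical mask_sample_name pattern
match = re.search(mask, sample_name)
if match:
    sample_name = match[0]    # -> "sample1"
else:
    pass  # as in the importer: log a warning and skip this fastq file
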
5 changes: 2 additions & 3 deletions meteor/mapper.py
@@ -45,7 +45,6 @@ class Mapper(Session):
mapping_type: str
trim: int
alignment_number: int
identity_threshold: float

def __post_init__(self) -> None:
if self.mapping_type not in Mapper.MAPPING_TYPES:
@@ -77,9 +76,8 @@ def set_mapping_config(
"trim": str(self.trim),
"alignment_number": self.alignment_number,
"mapping_type": self.mapping_type,
"identity_threshold": round(self.identity_threshold, 2),
"total_read_count": mapping_data[0],
"mapped_read_count": mapping_data[2] + mapping_data[3],
"mapped_reads": mapping_data[2] + mapping_data[3],
"overall_alignment_rate": round(
(mapping_data[2] + mapping_data[3]) / mapping_data[0] * 100, 2
),
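
With the four counts bowtie2 reports (total, aligned 0 times, aligned exactly once, aligned more than once; see `execute` below), the rate arithmetic works out as follows for hypothetical numbers:

# Hypothetical bowtie2 counts, in the order the regex captures them:
# [total, aligned 0 times, aligned exactly once, aligned >1 times]
mapping_data = [10000, 500, 8000, 1500]
mapped_reads = mapping_data[2] + mapping_data[3]       # 9500
rate = round(mapped_reads / mapping_data[0] * 100, 2)  # 95.0
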
@@ -178,6 +176,7 @@ def execute(self) -> None:
mapping_log = findall(r"([0-9]+)\s+\(", mapping_result)
assert len(mapping_log) == 4
mapping_data = [int(i) for i in mapping_log]
logging.debug("Bowtie2 mapping counts: %s", mapping_data)
except AssertionError:
logging.error("Could not access the mapping result from bowtie2")
sys.exit(1)
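
The assertion guards against bowtie2 changing its summary wording, since the regex simply collects every number that precedes an opening parenthesis. A reproduction against a canned single-end summary (hypothetical counts, wording modelled on bowtie2's usual output):

from re import findall

mapping_result = (
    "10000 reads; of these:\n"
    "  10000 (100.00%) were unpaired; of these:\n"
    "    500 (5.00%) aligned 0 times\n"
    "    8000 (80.00%) aligned exactly 1 time\n"
    "    1500 (15.00%) aligned >1 times\n"
    "95.00% overall alignment rate\n"
)
mapping_log = findall(r"([0-9]+)\s+\(", mapping_result)
assert [int(i) for i in mapping_log] == [10000, 500, 8000, 1500]
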
20 changes: 17 additions & 3 deletions meteor/tests/test_counter.py
@@ -304,10 +304,14 @@ def test_launch_counting_unique(counter_unique: Counter, datadir: Path, tmp_path
raw_cramfile = datadir / "total_raw.cram"
cramfile = datadir / "total.cram"
countfile = tmp_path / "count.tsv.xz"
census_json_file = counter_unique.meteor.fastq_dir / "part1_census_stage_0.json"
census_json = counter_unique.read_json(census_json_file)
ref_json = counter_unique.read_json(
counter_unique.meteor.ref_dir / "mock_reference.json"
)
counter_unique.launch_counting(raw_cramfile, cramfile, countfile, ref_json)
counter_unique.launch_counting(
raw_cramfile, cramfile, countfile, ref_json, census_json
)
with countfile.open("rb") as out:
assert md5(out.read()).hexdigest() == "f5bc528dcbf594b5089ad7f6228ebab5"
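
The md5 pins the exact bytes of the xz-compressed table; when a hash mismatch needs debugging, inspecting the decompressed text is more informative (a throwaway sketch using `countfile` from the test above, not part of the test itself):

import lzma

with lzma.open(countfile, "rt") as handle:
    print(handle.read())  # gene_id / gene_length / value rows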

@@ -316,10 +320,14 @@ def test_launch_counting_total(counter_total: Counter, datadir: Path, tmp_path:
raw_cramfile = datadir / "total_raw.cram"
cramfile = datadir / "total.cram"
countfile = tmp_path / "count.tsv.xz"
census_json_file = counter_total.meteor.fastq_dir / "part1_census_stage_0.json"
census_json = counter_total.read_json(census_json_file)
ref_json = counter_total.read_json(
counter_total.meteor.ref_dir / "mock_reference.json"
)
counter_total.launch_counting(raw_cramfile, cramfile, countfile, ref_json)
counter_total.launch_counting(
raw_cramfile, cramfile, countfile, ref_json, census_json
)
with countfile.open("rb") as out:
assert md5(out.read()).hexdigest() == "f010e4136323ac408d4c127e243756c2"

@@ -330,10 +338,16 @@ def test_launch_counting_smart_shared(
raw_cramfile = datadir / "total_raw.cram"
cramfile = datadir / "total.cram"
countfile = tmp_path / "count.tsv.xz"
census_json_file = (
counter_smart_shared.meteor.fastq_dir / "part1_census_stage_0.json"
)
census_json = counter_smart_shared.read_json(census_json_file)
ref_json = counter_smart_shared.read_json(
counter_smart_shared.meteor.ref_dir / "mock_reference.json"
)
counter_smart_shared.launch_counting(raw_cramfile, cramfile, countfile, ref_json)
counter_smart_shared.launch_counting(
raw_cramfile, cramfile, countfile, ref_json, census_json
)
# with countfile.open("rb") as out:
# assert md5(out.read()).hexdigest() == "4bdd7327cbad8e71d210feb0c6375077"
expected_output = pd.read_csv(
2 changes: 2 additions & 0 deletions meteor/tests/test_fastq_importer.py
@@ -161,6 +161,8 @@ def test_execute_paired_mask(
builder_paired_mask: FastqImporter, expected_dir_mask: tuple
) -> None:
builder_paired_mask.execute()
for dir in expected_dir_mask:
print(builder_paired_mask.meteor.fastq_dir / dir)
assert all(
Path(builder_paired_mask.meteor.fastq_dir / dir).exists()
for dir in expected_dir_mask
1 change: 0 additions & 1 deletion meteor/tests/test_mapper.py
@@ -52,7 +52,6 @@ def mapping_builder(datadir: Path, tmp_path: Path) -> Mapper:
"end-to-end",
80,
10000,
0.95,
)


