Fix count testing and fastq import tests
aghozlane committed Oct 30, 2024
1 parent 5791053 commit b1df6e0
Showing 6 changed files with 59 additions and 20 deletions.
35 changes: 31 additions & 4 deletions meteor/counter.py
@@ -82,7 +82,6 @@ def launch_mapping(self) -> None:
self.mapping_type,
self.trim,
self.alignment_number,
self.identity_threshold,
)
mapping_process.execute()

@@ -93,11 +92,13 @@ def write_table(self, cramfile: Path, outfile: Path) -> None:
:param cramfile: (Path) cram file to count
:param outfile: (Path) count table
:return: The total read count of aligned sequences
"""
# indStats = pd.read_csv(StringIO(pysam.idxstats(cramfile)), sep = '\t',
# header = None, names = ['contig', 'length', 'mapped', 'unmapped'])
# create count table
table = idxstats(str(cramfile.resolve()))
total_reads = 0
# write the count table
with lzma.open(outfile, "wt", preset=0) as out:
out.write("gene_id\tgene_length\tvalue\n")
@@ -110,6 +111,8 @@ def write_table(self, cramfile: Path, outfile: Path) -> None:
# ), "Gene length does not match counted nucleotides"
s = "\t".join(line.split("\t")[0:3])
out.write(f"{s}\n")
total_reads += int(line.split("\t")[2])
return total_reads
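
For context, `pysam.idxstats` (referenced in the commented-out lines above) returns the idxstats report as a single string with one tab-separated record per reference: name, length, mapped reads, unmapped reads. The slice `[0:3]` keeps name/length/mapped, and column 2 feeds the running total. A minimal standalone sketch of the same accumulation, assuming a plain idxstats string; the `*` filter for the unmapped bucket is an assumption of this sketch, not necessarily present in the method:

import lzma

def write_count_table(idxstats_text: str, outfile: str) -> int:
    """Write gene_id/gene_length/value rows; return the total mapped reads."""
    total_reads = 0
    with lzma.open(outfile, "wt", preset=0) as out:
        out.write("gene_id\tgene_length\tvalue\n")
        for line in idxstats_text.rstrip("\n").split("\n"):
            fields = line.split("\t")
            if fields[0] == "*":  # idxstats' unmapped bucket, not a gene
                continue
            out.write("\t".join(fields[0:3]) + "\n")
            total_reads += int(fields[2])  # column 2 = mapped read count
    return total_reads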

def get_aligned_nucleotides(self, element) -> Iterator[int]:
"""Select aligned nucleotides
@@ -118,6 +121,15 @@ def get_aligned_nucleotides(self, element) -> Iterator[int]:
"""
yield from (item[1] for item in element.cigartuples if item[0] < 3)
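
In pysam's numeric CIGAR encoding, 0, 1, and 2 are M (match/mismatch), I (insertion), and D (deletion), while 3 and above are skips, clips, and padding, so `item[0] < 3` keeps the M/I/D operations and drops the rest. A quick hand check with a made-up CIGAR:

# pysam (operation, length) pairs for a 5S 60M 2I 1D 30M alignment;
# codes: 0=M, 1=I, 2=D, 3=N (skip), 4=S (soft clip), 5=H (hard clip)
cigartuples = [(4, 5), (0, 60), (1, 2), (2, 1), (0, 30)]
aligned = sum(length for op, length in cigartuples if op < 3)
assert aligned == 93  # 60 + 2 + 1 + 30; the soft clip is excluded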

def set_counter_config(self, counted_reads: int) -> dict:
"""Build the counting section of the census configuration.

:param counted_reads: (int) Total number of reads counted
:return: (dict) Counting configuration fragment
"""
return {
"counting": {
"counted_reads": counted_reads,
"identity_threshold": round(self.identity_threshold, 2),
}
}
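
The fragment returned here is merged into the census stage-1 JSON in `launch_counting` below. A hedged usage sketch, where `counter`, `census_json`, and `stage1_json` stand in for the objects used there, and the 0.95 threshold is a hypothetical value:

config = counter.set_counter_config(counted_reads=1234)
# -> {"counting": {"counted_reads": 1234, "identity_threshold": 0.95}}
census_json.update(config)  # dict.update() merges in place and returns None...
counter.save_config(census_json, stage1_json)  # ...so merge first, then persist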

def filter_alignments(
self, cramdesc: AlignmentFile
) -> tuple[dict[str, list[AlignedSegment]], dict[str, list[int]]]:
@@ -340,11 +352,15 @@ def write_stat(self, output: Path, abundance_dict: dict, database: dict) -> None
:param output: (Path) output filename
:param abundance_dict: (dict) contains abundance of each reference gene
:param database: (dict) contains length of each reference gene
:return: (float) The total number of reads aligned
"""
total_reads = 0
with lzma.open(output, "wt", preset=0) as out:
out.write("gene_id\tgene_length\tvalue\n")
for genes, abundance in sorted(abundance_dict.items()):
out.write(f"{genes}\t{database[genes]}\t{abundance}\n")
total_reads += abundance
return total_reads
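
A worked miniature with hypothetical values, assuming a standalone version of the method above; smart-shared counting can yield fractional abundances, which is why the return is documented as a float:

from pathlib import Path

abundances = {"geneB": 3.5, "geneA": 12}  # hypothetical gene abundances
lengths = {"geneA": 900, "geneB": 1500}   # reference gene lengths
total = write_stat(Path("counts.tsv.xz"), abundances, lengths)
# rows come out sorted by gene_id (geneA first); total == 15.5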

def save_cram_strain(
self,
@@ -395,6 +411,7 @@ def launch_counting(
cramfile_strain: Path,
count_file: Path,
ref_json: dict,
census_json: dict,
):
"""Function that count reads from a cram file, using the given methods in count:
"total" or "shared" or "unique".
@@ -437,7 +454,7 @@ def launch_counting(
# Calculate reference abundance & write count table
abundance = self.compute_abs_meteor(database, unique_on_gene, multiple)
# abundance = self.compute_abs(unique_on_gene, multiple)
self.write_stat(count_file, abundance, database)
total_read_count = self.write_stat(count_file, abundance, database)

else:
if self.counting_type == "unique":
@@ -475,7 +492,14 @@ def launch_counting(
str(cramfile_unsorted.resolve()),
catch_stdout=False,
)
self.write_table(cramfile_sorted, count_file)
total_read_count = self.write_table(cramfile_sorted, count_file)
config = self.set_counter_config(total_read_count)
stage1_json = (
self.meteor.mapping_dir
/ f"{census_json['sample_info']['sample_name']}_census_stage_1.json"
)
# dict.update() mutates in place and returns None, so merge the
# counting section first, then save the updated census dictionary.
census_json.update(config)
self.save_config(census_json, stage1_json)
if self.keep_filtered_alignments:
cramfile_strain_unsorted = Path(mkstemp(dir=self.meteor.tmp_dir)[1])
self.save_cram_strain(
@@ -532,6 +556,7 @@ def execute(self) -> None:

# mapping of each sample against reference
for library in census_json_files:
logging.debug("Processing library %s", library)
census_json = self.read_json(library)
sample_info = census_json["sample_info"]
stage1_dir = self.meteor.mapping_dir / sample_info["sample_name"]
@@ -571,7 +596,9 @@ def execute(self) -> None:
/ f"{sample_info['sample_name']}.tsv.xz"
)
start = perf_counter()
self.launch_counting(raw_cram_file, cram_file, count_file, ref_json)
self.launch_counting(
raw_cram_file, cram_file, count_file, ref_json, census_json
)
logging.info("Completed counting in %f seconds", perf_counter() - start)
if not self.keep_all_alignments:
logging.info(
Expand Down
16 changes: 7 additions & 9 deletions meteor/fastqimporter.py
@@ -145,21 +145,19 @@ def execute(self) -> None:
sys.exit(1)
else:
tag = "single"
if self.ispaired:
sample_name = self.get_paired_dirname(fastq_file.name, tag)
else:
sample_name = full_sample_name
# split the full sample name (in fact library/run name) in order
# to extract sample_name according to the regex mask
if self.mask_sample_name:
full_sample_name_array = re.search(
self.mask_sample_name, full_sample_name
)
if full_sample_name_array:
sample_name = full_sample_name_array[0]
sample_name_array = re.search(self.mask_sample_name, sample_name)
if sample_name_array:
sample_name = sample_name_array[0]
else:
logging.warning("Regular expression does not match %s", fastq_file)
continue
if self.ispaired:
sample_name = self.get_paired_dirname(fastq_file.name, tag)
else:
sample_name = full_sample_name
logging.info("Importing %s in sample %s", fastq_file, sample_name)
# Create directory for the sample and symlink fastq file into
samples_names.add(sample_name)
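
Net effect of the reordering above: the regex mask is now applied to the already-resolved sample name (paired tag handled first) rather than to the raw library name, and non-matching files are skipped with a warning. A small illustration of the mask step, with a hypothetical pattern:

import re

sample_name = "sample1_lib2"  # already resolved from the fastq file name
mask = r"sample\d+"           # hypothetical mask_sample_name pattern
match = re.search(mask, sample_name)
if match:
    sample_name = match[0]    # -> "sample1"
else:
    pass  # as in the importer: log a warning and skip this fastq file
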
5 changes: 2 additions & 3 deletions meteor/mapper.py
@@ -45,7 +45,6 @@ class Mapper(Session):
mapping_type: str
trim: int
alignment_number: int
identity_threshold: float

def __post_init__(self) -> None:
if self.mapping_type not in Mapper.MAPPING_TYPES:
@@ -77,9 +76,8 @@ def set_mapping_config(
"trim": str(self.trim),
"alignment_number": self.alignment_number,
"mapping_type": self.mapping_type,
"identity_threshold": round(self.identity_threshold, 2),
"total_read_count": mapping_data[0],
"mapped_read_count": mapping_data[2] + mapping_data[3],
"mapped_reads": mapping_data[2] + mapping_data[3],
"overall_alignment_rate": round(
(mapping_data[2] + mapping_data[3]) / mapping_data[0] * 100, 2
),
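
With the four counts bowtie2 reports (total, aligned 0 times, aligned exactly once, aligned more than once; see `execute` below), the rate arithmetic works out as follows for hypothetical numbers:

# Hypothetical bowtie2 counts, in the order the regex captures them:
# [total, aligned 0 times, aligned exactly once, aligned >1 times]
mapping_data = [10000, 500, 8000, 1500]
mapped_reads = mapping_data[2] + mapping_data[3]       # 9500
rate = round(mapped_reads / mapping_data[0] * 100, 2)  # 95.0
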
@@ -178,6 +176,7 @@ def execute(self) -> None:
mapping_log = findall(r"([0-9]+)\s+\(", mapping_result)
assert len(mapping_log) == 4
mapping_data = [int(i) for i in mapping_log]
logging.debug("Bowtie2 mapping counts: %s", mapping_data)
except AssertionError:
logging.error("Could not access the mapping result from bowtie2")
sys.exit(1)
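
The assertion guards against bowtie2 changing its summary wording, since the regex simply collects every number that precedes an opening parenthesis. A reproduction against a canned single-end summary (hypothetical counts, wording modelled on bowtie2's usual output):

from re import findall

mapping_result = (
    "10000 reads; of these:\n"
    "  10000 (100.00%) were unpaired; of these:\n"
    "    500 (5.00%) aligned 0 times\n"
    "    8000 (80.00%) aligned exactly 1 time\n"
    "    1500 (15.00%) aligned >1 times\n"
    "95.00% overall alignment rate\n"
)
mapping_log = findall(r"([0-9]+)\s+\(", mapping_result)
assert [int(i) for i in mapping_log] == [10000, 500, 8000, 1500]
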
20 changes: 17 additions & 3 deletions meteor/tests/test_counter.py
@@ -304,10 +304,14 @@ def test_launch_counting_unique(counter_unique: Counter, datadir: Path, tmp_path
raw_cramfile = datadir / "total_raw.cram"
cramfile = datadir / "total.cram"
countfile = tmp_path / "count.tsv.xz"
census_json_file = counter_unique.meteor.fastq_dir / "part1_census_stage_0.json"
census_json = counter_unique.read_json(census_json_file)
ref_json = counter_unique.read_json(
counter_unique.meteor.ref_dir / "mock_reference.json"
)
counter_unique.launch_counting(raw_cramfile, cramfile, countfile, ref_json)
counter_unique.launch_counting(
raw_cramfile, cramfile, countfile, ref_json, census_json
)
with countfile.open("rb") as out:
assert md5(out.read()).hexdigest() == "f5bc528dcbf594b5089ad7f6228ebab5"
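
The md5 pins the exact bytes of the xz-compressed table; when a hash mismatch needs debugging, inspecting the decompressed text is more informative (a throwaway sketch using `countfile` from the test above, not part of the test itself):

import lzma

with lzma.open(countfile, "rt") as handle:
    print(handle.read())  # gene_id / gene_length / value rows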

@@ -316,10 +320,14 @@ def test_launch_counting_total(counter_total: Counter, datadir: Path, tmp_path:
raw_cramfile = datadir / "total_raw.cram"
cramfile = datadir / "total.cram"
countfile = tmp_path / "count.tsv.xz"
census_json_file = counter_total.meteor.fastq_dir / "part1_census_stage_0.json"
census_json = counter_total.read_json(census_json_file)
ref_json = counter_total.read_json(
counter_total.meteor.ref_dir / "mock_reference.json"
)
counter_total.launch_counting(raw_cramfile, cramfile, countfile, ref_json)
counter_total.launch_counting(
raw_cramfile, cramfile, countfile, ref_json, census_json
)
with countfile.open("rb") as out:
assert md5(out.read()).hexdigest() == "f010e4136323ac408d4c127e243756c2"

@@ -330,10 +338,16 @@ def test_launch_counting_smart_shared(
raw_cramfile = datadir / "total_raw.cram"
cramfile = datadir / "total.cram"
countfile = tmp_path / "count.tsv.xz"
census_json_file = (
counter_smart_shared.meteor.fastq_dir / "part1_census_stage_0.json"
)
census_json = counter_smart_shared.read_json(census_json_file)
ref_json = counter_smart_shared.read_json(
counter_smart_shared.meteor.ref_dir / "mock_reference.json"
)
counter_smart_shared.launch_counting(raw_cramfile, cramfile, countfile, ref_json)
counter_smart_shared.launch_counting(
raw_cramfile, cramfile, countfile, ref_json, census_json
)
# with countfile.open("rb") as out:
# assert md5(out.read()).hexdigest() == "4bdd7327cbad8e71d210feb0c6375077"
expected_output = pd.read_csv(
2 changes: 2 additions & 0 deletions meteor/tests/test_fastq_importer.py
@@ -161,6 +161,8 @@ def test_execute_paired_mask(
builder_paired_mask: FastqImporter, expected_dir_mask: tuple
) -> None:
builder_paired_mask.execute()
for dir in expected_dir_mask:
print(builder_paired_mask.meteor.fastq_dir / dir)
assert all(
Path(builder_paired_mask.meteor.fastq_dir / dir).exists()
for dir in expected_dir_mask
1 change: 0 additions & 1 deletion meteor/tests/test_mapper.py
@@ -52,7 +52,6 @@ def mapping_builder(datadir: Path, tmp_path: Path) -> Mapper:
"end-to-end",
80,
10000,
0.95,
)


