diff --git a/meteor/counter.py b/meteor/counter.py index 65781e5..0d29b56 100644 --- a/meteor/counter.py +++ b/meteor/counter.py @@ -82,7 +82,6 @@ def launch_mapping(self) -> None: self.mapping_type, self.trim, self.alignment_number, - self.identity_threshold, ) mapping_process.execute() @@ -93,11 +92,13 @@ def write_table(self, cramfile: Path, outfile: Path) -> None: :param cramfile: (Path) cram file to count :param outfile: (Path) count table + :return: The total read count of aligned sequence """ # indStats = pd.read_csv(StringIO(pysam.idxstats(cramfile)), sep = '\t', # header = None, names = ['contig', 'length', 'mapped', 'unmapped']) # create count table table = idxstats(str(cramfile.resolve())) + total_reads = 0 # write the count table with lzma.open(outfile, "wt", preset=0) as out: out.write("gene_id\tgene_length\tvalue\n") @@ -110,6 +111,8 @@ def write_table(self, cramfile: Path, outfile: Path) -> None: # ), "Gene length does not match counted nucleotides" s = "\t".join(line.split("\t")[0:3]) out.write(f"{s}\n") + total_reads += int(line.split("\t")[2]) + return total_reads def get_aligned_nucleotides(self, element) -> Iterator[int]: """Select aligned nucleotides @@ -118,6 +121,15 @@ def get_aligned_nucleotides(self, element) -> Iterator[int]: """ yield from (item[1] for item in element.cigartuples if item[0] < 3) + def set_counter_config(self, counted_reads): + """Define the count of reads""" + return { + "counting": { + "counted_reads": counted_reads, + "identity_threshold": round(self.identity_threshold, 2), + } + } + def filter_alignments( self, cramdesc: AlignmentFile ) -> tuple[dict[str, list[AlignedSegment]], dict[str, list[int]]]: @@ -340,11 +352,15 @@ def write_stat(self, output: Path, abundance_dict: dict, database: dict) -> None :param output: [STRING] = output filename :param abundance_dict: [DICT] = contains abundance of each reference genes :param database: [DICT] = contrains length of each reference genes + :return: (float) The total number 
of reads aligned """ + total_reads = 0 with lzma.open(output, "wt", preset=0) as out: out.write("gene_id\tgene_length\tvalue\n") for genes, abundance in sorted(abundance_dict.items()): out.write(f"{genes}\t{database[genes]}\t{abundance}\n") + total_reads += abundance + return total_reads def save_cram_strain( self, @@ -395,6 +411,7 @@ def launch_counting( cramfile_strain: Path, count_file: Path, ref_json: dict, + census_json: dict, ): """Function that count reads from a cram file, using the given methods in count: "total" or "shared" or "unique". @@ -437,7 +454,7 @@ # Calculate reference abundance & write count table abundance = self.compute_abs_meteor(database, unique_on_gene, multiple) # abundance = self.compute_abs(unique_on_gene, multiple) - self.write_stat(count_file, abundance, database) + total_read_count = self.write_stat(count_file, abundance, database) else: if self.counting_type == "unique": @@ -475,7 +492,14 @@ str(cramfile_unsorted.resolve()), catch_stdout=False, ) - self.write_table(cramfile_sorted, count_file) + total_read_count = self.write_table(cramfile_sorted, count_file) + config = self.set_counter_config(total_read_count) + stage1_json = ( + self.meteor.mapping_dir + / f"{census_json['sample_info']['sample_name']}_census_stage_1.json" + ) + census_json.update(config) + self.save_config(census_json, stage1_json) if self.keep_filtered_alignments: cramfile_strain_unsorted = Path(mkstemp(dir=self.meteor.tmp_dir)[1]) self.save_cram_strain( @@ -571,7 +595,9 @@ / f"{sample_info['sample_name']}.tsv.xz" ) start = perf_counter() - self.launch_counting(raw_cram_file, cram_file, count_file, ref_json) + 
self.launch_counting( + raw_cram_file, cram_file, count_file, ref_json, census_json + ) logging.info("Completed counting in %f seconds", perf_counter() - start) if not self.keep_all_alignments: logging.info( diff --git a/meteor/fastqimporter.py b/meteor/fastqimporter.py index fd53cc7..0f79e03 100644 --- a/meteor/fastqimporter.py +++ b/meteor/fastqimporter.py @@ -145,21 +145,19 @@ def execute(self) -> None: sys.exit(1) else: tag = "single" + if self.ispaired: + sample_name = self.get_paired_dirname(fastq_file.name, tag) + else: + sample_name = full_sample_name # split full sample name (in fact library/run name) in order if self.mask_sample_name: # to extract sample_name according to regex mask - full_sample_name_array = re.search( - self.mask_sample_name, full_sample_name - ) - if full_sample_name_array: - sample_name = full_sample_name_array[0] + sample_name_array = re.search(self.mask_sample_name, sample_name) + if sample_name_array: + sample_name = sample_name_array[0] else: logging.warning("Regular expression does not match %s", fastq_file) continue - if self.ispaired: - sample_name = self.get_paired_dirname(fastq_file.name, tag) - else: - sample_name = full_sample_name logging.info("Importing %s in sample %s", fastq_file, sample_name) # Create directory for the sample and symlink fastq file into samples_names.add(sample_name) diff --git a/meteor/mapper.py b/meteor/mapper.py index 524be16..234629b 100644 --- a/meteor/mapper.py +++ b/meteor/mapper.py @@ -45,7 +45,6 @@ class Mapper(Session): mapping_type: str trim: int alignment_number: int - identity_threshold: float def __post_init__(self) -> None: if self.mapping_type not in Mapper.MAPPING_TYPES: @@ -77,9 +76,8 @@ def set_mapping_config( "trim": str(self.trim), "alignment_number": self.alignment_number, "mapping_type": self.mapping_type, - "identity_threshold": round(self.identity_threshold, 2), "total_read_count": mapping_data[0], - "mapped_read_count": mapping_data[2] + mapping_data[3], + "mapped_reads": 
mapping_data[2] + mapping_data[3], "overall_alignment_rate": round( (mapping_data[2] + mapping_data[3]) / mapping_data[0] * 100, 2 ), diff --git a/meteor/tests/test_counter.py b/meteor/tests/test_counter.py index a0f758b..664a612 100644 --- a/meteor/tests/test_counter.py +++ b/meteor/tests/test_counter.py @@ -304,10 +304,14 @@ def test_launch_counting_unique(counter_unique: Counter, datadir: Path, tmp_path raw_cramfile = datadir / "total_raw.cram" cramfile = datadir / "total.cram" countfile = tmp_path / "count.tsv.xz" + census_json_file = counter_unique.meteor.fastq_dir / "part1_census_stage_0.json" + census_json = counter_unique.read_json(census_json_file) ref_json = counter_unique.read_json( counter_unique.meteor.ref_dir / "mock_reference.json" ) - counter_unique.launch_counting(raw_cramfile, cramfile, countfile, ref_json) + counter_unique.launch_counting( + raw_cramfile, cramfile, countfile, ref_json, census_json + ) with countfile.open("rb") as out: assert md5(out.read()).hexdigest() == "f5bc528dcbf594b5089ad7f6228ebab5" @@ -316,10 +320,14 @@ def test_launch_counting_total(counter_total: Counter, datadir: Path, tmp_path: raw_cramfile = datadir / "total_raw.cram" cramfile = datadir / "total.cram" countfile = tmp_path / "count.tsv.xz" + census_json_file = counter_total.meteor.fastq_dir / "part1_census_stage_0.json" + census_json = counter_total.read_json(census_json_file) ref_json = counter_total.read_json( counter_total.meteor.ref_dir / "mock_reference.json" ) - counter_total.launch_counting(raw_cramfile, cramfile, countfile, ref_json) + counter_total.launch_counting( + raw_cramfile, cramfile, countfile, ref_json, census_json + ) with countfile.open("rb") as out: 
assert md5(out.read()).hexdigest() == "f010e4136323ac408d4c127e243756c2" @@ -330,10 +338,16 @@ def test_launch_counting_smart_shared( raw_cramfile = datadir / "total_raw.cram" cramfile = datadir / "total.cram" countfile = tmp_path / "count.tsv.xz" + census_json_file = ( + counter_smart_shared.meteor.fastq_dir / "part1_census_stage_0.json" + ) + census_json = counter_smart_shared.read_json(census_json_file) ref_json = counter_smart_shared.read_json( counter_smart_shared.meteor.ref_dir / "mock_reference.json" ) - counter_smart_shared.launch_counting(raw_cramfile, cramfile, countfile, ref_json) + counter_smart_shared.launch_counting( + raw_cramfile, cramfile, countfile, ref_json, census_json + ) # with countfile.open("rb") as out: # assert md5(out.read()).hexdigest() == "4bdd7327cbad8e71d210feb0c6375077" expected_output = pd.read_csv( diff --git a/meteor/tests/test_mapper.py b/meteor/tests/test_mapper.py index 5627ff6..547609a 100644 --- a/meteor/tests/test_mapper.py +++ b/meteor/tests/test_mapper.py @@ -52,7 +52,6 @@ def mapping_builder(datadir: Path, tmp_path: Path) -> Mapper: "end-to-end", 80, 10000, - 0.95, )