resolves #33, #30, #27

- addressed oversight in object identity within Community - added TOML-format profile definition - fixed normalisation error introduced with addition support for fragmented genomes - better error reporting reading profiles - minor fix of version stamp
cerebis · Jan 30, 2024 · 74e1b6a · 74e1b6a
1 parent 26fb55b
commit 74e1b6a
Show file tree

Hide file tree

Showing 5 changed files with 294 additions and 114 deletions.
diff --git a/sim3C/_version.py b/sim3C/_version.py
@@ -1,4 +1,4 @@
-__version__ = '0.6'
+__version__ = '0.6.1'
 
 __copyright__ = """Copyright (C) 2019 Matthew Z DeMaere
 This is free software.  You may redistribute copies of it under the terms of
@@ -14,9 +14,9 @@ def version_stamp(full):
     :return: a version stamp string
     """
     if full:
-        return 'sim3C {}\n{}'.format(__version__, __copyright__)
+        return 'Sim3C version {}\n{}'.format(__version__, __copyright__)
     else:
-        return 'sim3C {}'.format(__version__)
+        return '{}'.format(__version__)
 
 
 def date_stamp():
@@ -32,4 +32,4 @@ def runtime_info():
     """
     :return: Return runtime info (version and date stamps) as a dict
     """
-    return {'qc3C_version': version_stamp(False), 'run_timestamp': date_stamp()}
+    return {'sim3C_version': f'{__version__}', 'run_timestamp': date_stamp()}
diff --git a/sim3C/abundance.py b/sim3C/abundance.py
@@ -16,12 +16,13 @@
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 
 import logging
 import numpy as np
 import re
 
+from .exceptions import Sim3CException
 from .random import np_uniform, np_lognormal
 
 
@@ -206,9 +207,47 @@ def normalize(self):
         """
         Normalize a profile so that all abundance entries sum to 1.
         """
-        val_sum = sum([ai.abundance for ai in self.values()])
-        for ai in self.values():
-            ai.abundance /= val_sum
+        # The structure of the Profile means that abundance is stored for every
+        # sequence record, rather than the cell to which they belong.
+        # Therefore, we first collect a non-redundant list of abundances,
+        # warning the user if there are multiple entries for the same cell that are not the same.
+
+        abundances = {}
+        cell2chrom = defaultdict(list)
+        for chrom in self:
+            cell2chrom[chrom.cell].append(chrom)
+            if chrom.cell in abundances and chrom.abundance != abundances[chrom.cell]:
+                logger.warning(f"Multiple abundances identified for the cell {chrom.cell} a={chrom.abundance}")
+            abundances[chrom.cell] = chrom.abundance
+
+        total_abundance = sum(abundances.values())
+        for cell, chrom_list in cell2chrom.items():
+            for chrom in chrom_list:
+                chrom.abundance /= total_abundance
+
+
+def read_toml(hndl, normalise=False):
+    """
+    Read a profile from a TOML file. The returned object
+    is a dictionary rather than the profile class.
+    :param hndl: the input file name or file object
+    :param normalise: when true, normalise after reading
+    :return: dict-style community definition
+    """
+    import toml
+
+    try:
+        comm_dict = toml.load(hndl)
+    except toml.decoder.TomlDecodeError as e:
+        raise Sim3CException(f'IO error reading profile. Invalid TOML file: {e}')
+
+    if normalise:
+        abun = np.asarray([cell['abundance'] for cell in comm_dict['community']['cells']])
+        abun /= abun.sum()
+        for i in range(len(comm_dict['community']['cells'])):
+            comm_dict['community']['cells'][i]['abundance'] = abun[i]
+
+    return comm_dict
 
 
 def read_profile(hndl, normalise=False):
@@ -237,7 +276,7 @@ def read_profile(hndl, normalise=False):
                 chrom, cell, molecule, abn, cn = re.split(r'[\s,]+', line)
                 profile.add(chrom, abn, cn, cell, molecule)
             except Exception:
-                raise IOError('Error: invalid table at line {} [{}]'.format(n, line))
+                raise Sim3CException(f'IO error reading profile. Invalid table: line {n} [{line}]')
             n += 1
 
         if normalise:

diff --git a/sim3C/command_line.py b/sim3C/command_line.py
@@ -112,8 +112,6 @@ def main():
     parser.add_argument('--insert-max', metavar='INT', type=int, default=None,
                         help='Maximum allowed insert size [None]')
 
-    parser.add_argument('--create-cids', default=False, action='store_true',
-                        help='Simulate chromosome interacting domains')
     parser.add_argument('--linear', default=False, action='store_true',
                         help='Treat all replicons as linear molecules')
     parser.add_argument('--efficiency', metavar='FLOAT', type=float,
@@ -129,6 +127,8 @@ def main():
                         help='Community abundance profile')
     parser.add_argument('--profile-name', metavar='FILE', default='profile.tsv',
                         help='Output file name for a procedural community profile')
+    parser.add_argument('--profile-format', metavar='STRING', default='table',
+                        choices=['table', 'toml'], help='Profile defintion format [table]')
 
     parser.add_argument('--dist', metavar='DISTNAME', choices=['equal', 'uniform', 'lognormal'],
                         help='Abundance profile distribution choices: equal, uniform, lognormal')
@@ -228,7 +228,7 @@ def main():
                     'anti_rate', 'spurious_rate', 'trans_rate',
                     'efficiency',
                     'ins_rate', 'del_rate',
-                    'create_cids', 'simple_reads', 'linear', 'convert_symbols']
+                    'simple_reads', 'linear', 'convert_symbols', 'profile_format']
 
         # extract these parameters from the parsed arguments
         kw_args = {k: v for k, v in vars(args).items() if k in kw_names}