Skip to content

Commit

Permalink
resolves #33, #30, #27
Browse files Browse the repository at this point in the history
- addressed oversight in object identity within Community
- added TOML-format profile definition
- fixed normalisation error introduced with addition support for fragmented genomes
- better error reporting reading profiles
- minor fix of version stamp
  • Loading branch information
cerebis committed Jan 30, 2024
1 parent 26fb55b commit 74e1b6a
Show file tree
Hide file tree
Showing 5 changed files with 294 additions and 114 deletions.
8 changes: 4 additions & 4 deletions sim3C/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '0.6'
__version__ = '0.6.1'

__copyright__ = """Copyright (C) 2019 Matthew Z DeMaere
This is free software. You may redistribute copies of it under the terms of
Expand All @@ -14,9 +14,9 @@ def version_stamp(full):
:return: a version stamp string
"""
if full:
return 'sim3C {}\n{}'.format(__version__, __copyright__)
return 'Sim3C version {}\n{}'.format(__version__, __copyright__)
else:
return 'sim3C {}'.format(__version__)
return '{}'.format(__version__)


def date_stamp():
Expand All @@ -32,4 +32,4 @@ def runtime_info():
"""
:return: Return runtime info (version and date stamps) as a dict
"""
return {'qc3C_version': version_stamp(False), 'run_timestamp': date_stamp()}
return {'sim3C_version': f'{__version__}', 'run_timestamp': date_stamp()}
49 changes: 44 additions & 5 deletions sim3C/abundance.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
from collections import OrderedDict
from collections import OrderedDict, defaultdict

import logging
import numpy as np
import re

from .exceptions import Sim3CException
from .random import np_uniform, np_lognormal


Expand Down Expand Up @@ -206,9 +207,47 @@ def normalize(self):
"""
Normalize a profile so that all abundance entries sum to 1.
"""
val_sum = sum([ai.abundance for ai in self.values()])
for ai in self.values():
ai.abundance /= val_sum
# The structure of the Profile means that abundance is stored for every
# sequence record, rather than the cell to which they belong.
# Therefore, we first collect a non-redundant list of abundances,
# warning the user if there are multiple entries for the same cell that are not the same.

abundances = {}
cell2chrom = defaultdict(list)
for chrom in self:
cell2chrom[chrom.cell].append(chrom)
if chrom.cell in abundances and chrom.abundance != abundances[chrom.cell]:
logger.warning(f"Multiple abundances identified for the cell {chrom.cell} a={chrom.abundance}")
abundances[chrom.cell] = chrom.abundance

total_abundance = sum(abundances.values())
for cell, chrom_list in cell2chrom.items():
for chrom in chrom_list:
chrom.abundance /= total_abundance


def read_toml(hndl, normalise=False):
"""
Read a profile from a TOML file. The returned object
is a dictionary rather than the profile class.
:param hndl: the input file name or file object
:param normalise: when true, normalise after reading
:return: dict-style community definition
"""
import toml

try:
comm_dict = toml.load(hndl)
except toml.decoder.TomlDecodeError as e:
raise Sim3CException(f'IO error reading profile. Invalid TOML file: {e}')

if normalise:
abun = np.asarray([cell['abundance'] for cell in comm_dict['community']['cells']])
abun /= abun.sum()
for i in range(len(comm_dict['community']['cells'])):
comm_dict['community']['cells'][i]['abundance'] = abun[i]

return comm_dict


def read_profile(hndl, normalise=False):
Expand Down Expand Up @@ -237,7 +276,7 @@ def read_profile(hndl, normalise=False):
chrom, cell, molecule, abn, cn = re.split(r'[\s,]+', line)
profile.add(chrom, abn, cn, cell, molecule)
except Exception:
raise IOError('Error: invalid table at line {} [{}]'.format(n, line))
raise Sim3CException(f'IO error reading profile. Invalid table: line {n} [{line}]')
n += 1

if normalise:
Expand Down
6 changes: 3 additions & 3 deletions sim3C/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,6 @@ def main():
parser.add_argument('--insert-max', metavar='INT', type=int, default=None,
help='Maximum allowed insert size [None]')

parser.add_argument('--create-cids', default=False, action='store_true',
help='Simulate chromosome interacting domains')
parser.add_argument('--linear', default=False, action='store_true',
help='Treat all replicons as linear molecules')
parser.add_argument('--efficiency', metavar='FLOAT', type=float,
Expand All @@ -129,6 +127,8 @@ def main():
help='Community abundance profile')
parser.add_argument('--profile-name', metavar='FILE', default='profile.tsv',
help='Output file name for a procedural community profile')
parser.add_argument('--profile-format', metavar='STRING', default='table',
choices=['table', 'toml'], help='Profile defintion format [table]')

parser.add_argument('--dist', metavar='DISTNAME', choices=['equal', 'uniform', 'lognormal'],
help='Abundance profile distribution choices: equal, uniform, lognormal')
Expand Down Expand Up @@ -228,7 +228,7 @@ def main():
'anti_rate', 'spurious_rate', 'trans_rate',
'efficiency',
'ins_rate', 'del_rate',
'create_cids', 'simple_reads', 'linear', 'convert_symbols']
'simple_reads', 'linear', 'convert_symbols', 'profile_format']

# extract these parameters from the parsed arguments
kw_args = {k: v for k, v in vars(args).items() if k in kw_names}
Expand Down
Loading

0 comments on commit 74e1b6a

Please sign in to comment.