diff --git a/doc/choosing-hash-sizes.txt b/doc/choosing-table-sizes.txt similarity index 93% rename from doc/choosing-hash-sizes.txt rename to doc/choosing-table-sizes.txt index 57db7339f2..3d5eafb621 100644 --- a/doc/choosing-hash-sizes.txt +++ b/doc/choosing-table-sizes.txt @@ -1,11 +1,11 @@ -============================= -Choosing hash sizes for khmer -============================= +============================== +Choosing table sizes for khmer +============================== If you look at the documentation for the scripts (:doc:`scripts`) you'll -see two mysterious parameters -- ``-N`` and ``-x``, or, more verbosely, -``-n_hashes`` and ``--hashsize``. What are these, and how do you -specify them? +see two mysterious parameters -- :option:`-N` and :option:`-x`, or, more +verbosely, :option:`--n_tables` and :option:`--tablesize`. What are these, and +how do you specify them? The really short version ======================== @@ -27,7 +27,7 @@ structure in khmer, which is basically N big hash tables of size x. The **product** of the number of hash tables and the size of the hash tables specifies the total amount of memory used. -This hash table is used to track k-mers. If it is too small, khmer +This table is used to track k-mers. If it is too small, khmer will fail in various ways (and should complain), but there is no harm in making it too large. So, **the absolute safest thing to do is to specify as much memory as is available**. Most scripts will inform @@ -48,8 +48,8 @@ which multiplies out to 128 Gbits of RAM, or 16 Gbytes. Life is a bit more complicated than this, however, because some scripts -- load-into-counting and load-graph -- keep ancillary information that will -consume memory beyond this hash data structure. So if you run out of -memory, decrease the hash table size. +consume memory beyond this table data structure. So if you run out of +memory, decrease the table size. Also see the rules of thumb, below. 
diff --git a/doc/galaxy.txt b/doc/galaxy.txt index fa747e1eaf..d8de7a21e6 100644 --- a/doc/galaxy.txt +++ b/doc/galaxy.txt @@ -48,7 +48,7 @@ If not then you may need to `set their datatype manually .stoptags after each pmap file. + in :program:`make-initial-stoptags.py` from each of the waypoints in that + partition; this should identify all of the HCKs in that partition. These + HCKs are output to .stoptags after each pmap file. Parameter choice is reasonably important. See the pipeline in :doc:`partitioning-big-data` for an example run. @@ -70,7 +70,8 @@ def get_parser(): parser.add_argument('--min-tablesize', '-x', type=float, default=DEFAULT_COUNTING_HT_SIZE, help='lower bound on' ' the size of the k-mer counting table(s)') - parser.add_argument('graphbase') + parser.add_argument('graphbase', help='Basename for the input and output ' + 'files.') parser.add_argument('--version', action='version', version='%(prog)s ' + khmer.__version__) return parser diff --git a/scripts/load-graph.py b/scripts/load-graph.py index 335d3b7409..62ab334c54 100755 --- a/scripts/load-graph.py +++ b/scripts/load-graph.py @@ -32,8 +32,11 @@ def get_parser(): parser.add_argument('--no-build-tagset', '-n', default=False, action='store_true', dest='no_build_tagset', help='Do NOT construct tagset while loading sequences') - parser.add_argument('output_filename') - parser.add_argument('input_filenames', nargs='+') + parser.add_argument('output_filename', + metavar='output_presence_table_filename', help='output' + ' k-mer presence table filename.') + parser.add_argument('input_filenames', metavar='input_sequence_filename', + nargs='+', help='input FAST[AQ] sequence filename') return parser diff --git a/scripts/make-initial-stoptags.py b/scripts/make-initial-stoptags.py index acc0d8b262..7095f4fba5 100755 --- a/scripts/make-initial-stoptags.py +++ b/scripts/make-initial-stoptags.py @@ -59,7 +59,8 @@ def get_parser(): help='Set subset size (default 1e4 is prob ok)') 
parser.add_argument('--stoptags', '-S', metavar='filename', default='', help="Use stoptags in this file during partitioning") - parser.add_argument('graphbase') + parser.add_argument('graphbase', help='basename for input and output ' + 'filenames') return parser diff --git a/scripts/merge-partitions.py b/scripts/merge-partitions.py index c7434d5e84..7805be4db0 100755 --- a/scripts/merge-partitions.py +++ b/scripts/merge-partitions.py @@ -31,14 +31,16 @@ def get_parser(): Take the ${graphbase}.subset.#.pmap files and merge them all into a single ${graphbase}.pmap.merged file for :program:`annotate-partitions.py` to use. """ - parser = argparse.ArgumentParser(description="Merge pmap files.", - epilog=textwrap.dedent(epilog)) + parser = argparse.ArgumentParser( + description="Merge partition map '.pmap' files.", + epilog=textwrap.dedent(epilog)) parser.add_argument('--ksize', '-k', type=int, default=DEFAULT_K, help="k-mer size (default: %d)" % DEFAULT_K) parser.add_argument('--keep-subsets', dest='remove_subsets', default=True, action='store_false', help='Keep individual subsets (default: False)') - parser.add_argument('graphbase') + parser.add_argument('graphbase', help='basename for input and output ' + 'files') parser.add_argument('--version', action='version', version='%(prog)s ' + khmer.__version__) return parser diff --git a/scripts/normalize-by-median.py b/scripts/normalize-by-median.py index 10a0a0992f..585759eaeb 100755 --- a/scripts/normalize-by-median.py +++ b/scripts/normalize-by-median.py @@ -191,7 +191,8 @@ def get_parser(): dest='single_output_filename', default='', help='only output a single' ' file with the specified filename') - parser.add_argument('input_filenames', nargs='+') + parser.add_argument('input_filenames', metavar='input_sequence_filename', + help='Input FAST[AQ] sequence filename.', nargs='+') add_loadhash_args(parser) return parser