timebunch

#!/usr/bin/env python
"""
Group the files or directories in the argument list into directories by time.

Usage:
  timebunch [-c TH] [-d] [-t TI] [-p] [--verbose] FILES...
  timebunch (-h | -l | --version)

Arguments:
  FILES    Files to bunch.  They can also be directories, links, etc.

Options:
  -c TH --color=TH    Color theme to use for output.
                      dark: suitable for a dark background.
                      bright: suitable for a bright background. (Not yet implemented)
                      [default: none]
  -d --dry-run        Do not make any directories, but show what would be done.
                      Implies --verbose
  -h --help           Show this help and exit.
  -l --license        Print the license and exit.
  -p --prefix         Attempt to name the directories by the longest common
                      prefix of their members.
  -t TI --time=TI     Minimum gap between groups, in the given unit.
                      TIMEUNITHELP
                      The default unit is hours.
                      Examples: 1d = 1 day, 3u = 3e-6s, 4 = 4h.
                      [default: 24h]
  --verbose           Show time values for each group.
  --version           Print the version and exit.
  
If any groups with at least two members are found, they are moved to
directories named by the average modification time of their members (see -p).

Note that the modification time of jpegs can be set to their EXIF time (if any)
with jhead -ft *.jpg.
"""
from __future__ import absolute_import, print_function
gpl = """
Copyright (C) 2008-2015 Rob Reid, barlennan@gmail.com

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program; if not, please contact Rob Reid, rreid@nrao.edu, or if he
cannot be reached, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA
"""

#from cluster import HierarchicalClustering
import fastcluster as fc
from collections import OrderedDict
from docopt import docopt
import numpy as np
import os, sys, time
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist

# Build the table of time units and splice it into the module docstring in
# place of the TIMEUNITHELP placeholder, so that the help text shown by docopt
# always matches the units that are actually accepted.

# Leading whitespace of the placeholder line; reused so that every row of the
# table lines up with the rest of the option's description.
tuhindent = [line for line in __doc__.split("\n") if 'TIMEUNITHELP' in line][0].split('T')[0]
timeunithelp  = "Time units:\n"
timeunithelp += tuhindent + "letter  # of seconds  definition    comment\n"
# unit letter -> (length in seconds, name, comment), ordered shortest first.
timeunits = OrderedDict([
    ('a', (1e-18, 'attosecond', 'For computers with')),
    ('f', (1e-15, 'femtosecond', 'excellent time')),
    ('p', (1e-12, 'picosecond', 'resolution and even')),
    ('n', (1e-9, 'nanosecond', 'faster storage')),
    ('u', (1e-6, 'microsecond', '')),
    ('s', (1, 'second', '')),
    ('m', (60, 'minute', '')),
    ('h', (3600, 'hour', '')),
    ('d', (24 * 3600, '(solar) day', '')),
    ('S', (23 * 3600 + 56 * 60 + 4.1, 'Sidereal day', 'For astronomical items')),
    ('M', (365.25 * 24 * 3600 / 12.0, '(mean) month', '')),
    ('y', (365.25 * 24 * 3600, '(mean) year', ''))])
# The rows are separated by exactly one newline.  Ending each row with its own
# "\n" *and* joining with "\n" put an empty line after every row; besides
# double-spacing the table, docopt 0.6 ends the "Options:" section at the
# first empty line, which cut the "[default: 24h]" that follows the table out
# of the option description.
timeunithelp += "\n".join([tuhindent + "%s       % 12g  %- 12s  %s" % ((tu,) + tut)
                           for tu, tut in timeunits.items()])
__doc__ = __doc__.replace('TIMEUNITHELP', timeunithelp)


class Tepr:
    """
    Text properties (terminal escape sequences) used to decorate the report.

    Attributes:
      textpropdict  Name -> escape sequence table.
      h             Prefix for a header line.
      subh          Prefix for a subheader line.
      n             Prefix that switches back to normal text.

    All three prefixes are empty strings unless the 'dark' theme is selected.
    """
    def __init__(self, theme=None):
        """
        Choose the prefixes for theme.

        theme is matched case-insensitively against 'dark'; anything else,
        including objects without a lower() method, leaves the output plain.
        """
        escapes = {
            'PURPLE': '\033[95m',
            'CYAN': '\033[96m',
            'DARKCYAN': '\033[36m',
            'BLUE': '\033[94m',
            'GREEN': '\033[92m',
            'YELLOW': '\033[93m',
            'RED': '\033[91m',
            'BOLD': '\033[1m',
            'UNDERLINE': '\033[4m',
            'END': '\033[0m'
            }
        self.textpropdict = escapes

        # Plain output unless a known theme is recognized below.
        self.h = self.subh = self.n = ''

        themename = theme.lower() if hasattr(theme, 'lower') else None
        if themename == 'dark':
            reset = escapes['END']
            bold = escapes['BOLD']
            self.h = escapes['YELLOW'] + bold + escapes['UNDERLINE']
            self.subh = reset + bold + escapes['GREEN']
            self.n = reset

def report_bunch(bunch, bunchnum, bunches, meantimes, mean, dirname, tepr):
    """
    Print a report on a group with more than one member.

    The report has a header line (group number and directory name), a line
    of statistics (group mean, standard deviation, largest gap between
    consecutive members, and the distance to the means of the neighbouring
    groups), and then one line per member in time order showing its offset
    from the group mean and, after the first member, the gap to the member
    before it.

    Modification times are read from the module-level dict mtimes.

    Arguments:
      bunch      List of the item names in this group.
      bunchnum   Index of bunch in bunches.
      bunches    List of all the groups, used to find the neighbours.
      meantimes  Dict from tuple(group) to the mean time of that group,
                 measured relative to mean.
      mean       The reference time that meantimes is relative to.
      dirname    Name of the directory the group is (or would be) moved to.
      tepr       Object with h, subh, and n string attributes that are
                 prefixed to the header, statistics, and member lines.
    """
    bunchmean = meantimes[tuple(bunch)]

    # (mtime, name) pairs in time order, so that "the previous member" is well
    # defined no matter how the caller ordered bunch.
    timed = sorted([(mtimes[f], f) for f in bunch])
    gaps = [later[0] - earlier[0] for earlier, later in zip(timed, timed[1:])]

    bunchsd = sum([(t - mean - bunchmean)**2 for t, _ in timed])
    if len(timed) > 1:
        bunchsd = (bunchsd / (len(timed) - 1.0))**0.5
    else:
        bunchsd = 0.0       # A single member has no spread.

    maxdiff = max(gaps) if gaps else 0.0

    print(tepr.h + "Group %2d: %s" % (bunchnum, dirname))
    print(tepr.subh + "  Mean: %.2f\tsd: %.2f\tmaxdiff: %.2f" % (bunchmean, bunchsd, maxdiff), end=' ')

    if bunchnum > 0:
        print("\tmean-to-mean: %.2f" % (bunchmean -
                                        meantimes[tuple(bunches[bunchnum - 1])]), end=' ')
    else:
        print("\t\t", end=' ')
    if bunchnum < len(bunches) - 1:
        print("\t%.2f" % (meantimes[tuple(bunches[bunchnum + 1])] -
                          bunchmean), end=' ')
    print()

    print(tepr.n + "\t%s: % 3.2f" % (timed[0][1], timed[0][0] - mean - bunchmean))
    # Each later member is paired with its own predecessor.  (Indexing with a
    # loop variable left over from an earlier loop measured every member
    # against the same file.)
    for (prevtime, _), (t, f) in zip(timed, timed[1:]):
        print("\t%s: % 3.2f\t%.2f" % (f, t - mean - bunchmean, t - prevtime))


def report_singlet(bunchnum, bunch, bunches, meantimes):
    """
    Print a one line report on a group with a single member.

    The line gives the group number, the name of the member, the group's
    entry in meantimes, and the distances to the means of the previous and
    next groups, where those exist.

    Arguments:
      bunchnum   Index of bunch in bunches.
      bunch      The group; only bunch[0] is named in the report.
      bunches    List of all the groups, used to find the neighbours.
      meantimes  Dict from tuple(group) to the mean time of that group.
    """
    thismean = meantimes[tuple(bunch)]
    print("Group %2d (%s):\t%.2f" % (bunchnum, bunch[0], thismean), end=' ')
    # Every group but the first has a predecessor (group 1 was being skipped).
    if bunchnum > 0:
        print("\t%.2f" % (thismean -
                          meantimes[tuple(bunches[bunchnum - 1])]), end=' ')
    if bunchnum < len(bunches) - 1:
        print("\t%.2f" % (meantimes[tuple(bunches[bunchnum + 1])] -
                          thismean), end=' ')
    # End the line without leaving a stray space at the start of the next one.
    print()


class DirectoryNamer:
    """
    A callable object that returns a suitable directory name for a
    collection of items.

    If useprefix is True, it will attempt to use the longest common prefix of
    the items.  If that fails (no prefix as long as minlength, or the prefix
    is already a directory), or useprefix is False, it will use the average
    modification time, with a format appropriate for the spread of
    modification times in files and for the gap between groups.

    The gap between groups is read from the module-level variable timelength,
    which is in hours.

    Arguments:
      useprefix  Whether to try naming by common prefix first.
      files      All of the items being grouped (in any order).
      mtimes     Dict from item name to modification time in seconds.
      minlength  Shortest acceptable common prefix.
    """
    def __init__(self, useprefix, files, mtimes, minlength=1):
        self.useprefix = useprefix
        self.mtimes = mtimes        # Must come before setuptimes().
        self.setuptimes(files)
        self.minlength = minlength

    def __call__(self, bunch):
        """Return a directory name for the list of items bunch."""
        if self.useprefix:
            success, dirname = self.name_from_common_prefix(bunch,
                                                            self.minlength)
            if success:           # Otherwise, fall through.
                return dirname
        return self.name_from_avgtime(bunch)

    def setuptimes(self, files):
        """
        Set self.dirnamefmt, the strftime format used by name_from_avgtime().

        A field is included when the spread of modification times in files is
        long enough for the field to vary, and the gap between groups is short
        enough for the field to be needed to tell neighbouring groups apart.
        If no field qualifies, the full "%Y-%m-%d-%H:%M:%S" is used.
        """
        _minute = 60.0
        _hour = 60.0 * _minute
        _day = 24.0 * _hour
        _month = 30.5 * _day
        _year = 12.0 * _month

        # timelength is in hours, everything else here is in seconds.
        gap = 3600.0 * timelength

        # files is not necessarily sorted by time, so find the extremes
        # instead of trusting the first and last entries.
        stamps = [self.mtimes[f] for f in files]
        spread = max(stamps) - min(stamps) if stamps else 0.0

        dirnamefmtitems = []
        if spread > _year:
            dirnamefmtitems.append("%Y")
        if spread > _month and gap < _year:        # Month
            dirnamefmtitems.append("%m")
        if spread > _day and gap < _month:
            dirnamefmtitems.append("%d")
        if spread > _hour and gap < _day:
            dirnamefmtitems.append("%H")                 # 24H
        if spread > _minute and gap < _hour:
            dirnamefmtitems.append("%M")                 # Minute [00,59]
            if gap < _minute:
                dirnamefmtitems.append("%S")
        if dirnamefmtitems:
            self.dirnamefmt = '-'.join(dirnamefmtitems)
        else:
            self.dirnamefmt = "%Y-%m-%d-%H:%M:%S"

    def name_from_common_prefix(self, items, minlength=1):
        """
        Returns (success, prefix) where prefix is the longest common prefix
        of the list of strings items.  success is True if prefix is at least
        as long as minlength and is not already the name of a directory.
        """
        if minlength < 1:
            print("Warning: what were you thinking giving minlength =", minlength)
            print("  to name_from_common_prefix()?  Using 1...")
            minlength = 1

        # Despite living in os.path, commonprefix() compares character by
        # character, which is exactly what is wanted here.
        prefix = os.path.commonprefix(items)
        success = len(prefix) >= minlength and not os.path.isdir(prefix)
        return success, prefix

    def name_from_avgtime(self, bunch):
        """
        Return the average modification time of the items in bunch, formatted
        as local time with self.dirnamefmt.
        """
        avgtime = sum([self.mtimes[f] for f in bunch]) / float(len(bunch))
        return time.strftime(self.dirnamefmt, time.localtime(avgtime))


# Script entry point: parse the command line, cluster the items by
# modification time, and move each group with at least two members into its
# own directory (or just report what would be done, with --dry-run).
if __name__ == '__main__':
    from docopt import docopt

    args = docopt(__doc__, version="1.1.0")

    if args['--license']:
        print(gpl)
        sys.exit(0)

    files = args['FILES']
    if len(files) < 3:
        print("There must be at least 3 items to sort.")
        sys.exit(1)

    # Fall back on the documented default in case the option parser did not
    # supply one.
    timelength = args['--time'] or '24h'
    if timelength[-1] not in timeunits:
        timelength += 'h'
    tut = timeunits[timelength[-1]]
    try:
        # For historical compatibility, convert timelength to h.
        timelength = float(timelength[:-1]) * tut[0] / 3600.0
    except ValueError:
        print("Could not understand the time gap %r." % args['--time'])
        sys.exit(1)

    if args['--dry-run']:
        args['--verbose'] = True

    # Get mtimes
    try:
        mtimes = {f: os.stat(f).st_mtime for f in files}
    except OSError as e:
        print("Error getting a modification time:", e)
        sys.exit(1)

    # Single linkage clustering of the mtimes, cut where the gap between
    # neighbouring clusters exceeds timelength.  pdist wants one row per
    # observation, hence the column vector.
    times = np.array([[mtimes[f]] for f in files])
    d = pdist(times, 'cityblock')
    dendro = fc.single(d)
    del d
    flatclusts = sch.fcluster(dendro, 3600.0 * timelength, 'distance')
    del dendro

    # flatclusts[i] is the cluster id of files[i]; gather the names by id.
    bunches = {}
    for i, ci in enumerate(flatclusts):
        bunches.setdefault(ci, []).append(files[i])
    bunches = list(bunches.values())

    # Finally: If there is more than one group, move them into directories
    if len(bunches) > 1:
        if args['--verbose']:
            mean = sum([mtimes[f] for f in files]) / len(files)
            meantimes = {}
            for grp in bunches:
                # key= sorts work on both Python 2 and 3, unlike cmp functions.
                grp.sort(key=lambda f: mtimes[f])
                meantimes[tuple(grp)] = sum([mtimes[f] - mean for f in grp]) / len(grp)
            bunches.sort(key=lambda grp: meantimes[tuple(grp)])

        dn = DirectoryNamer(args['--prefix'], files, mtimes)
        tepr = Tepr(args['--color'])

        for bunchnum, bunch in enumerate(bunches):
            # nitems is deliberately a module-level name; code outside this
            # block may rely on it.
            nitems = len(bunch)
            if nitems > 1:
                # Defined before the try so that the error report below never
                # fails itself, or shows the previous group's directory.
                dirname = None
                try:
                    dirnamecand = dn(bunch)
                    dirname = dirnamecand
                    dirsubnum = 1
                    while os.path.exists(dirname):
                        if dirsubnum > 999999:
                            raise ValueError("There are too many directory name collisions")
                        dirname = dirnamecand + "%06d" % dirsubnum
                        dirsubnum += 1
                    if args['--verbose']:
                        report_bunch(bunch, bunchnum, bunches,
                                     meantimes, mean, dirname, tepr)
                    if not args['--dry-run']:
                        os.makedirs(dirname)
                        for item in bunch:
                            os.rename(item, dirname + '/' + item)
                except Exception as e:
                    # Best effort: report the failure and go on to the next
                    # group.
                    print("Error", e)
                    print("\tin attempt to move\n\t\t", end=' ')
                    print("\n\t\t".join(bunch))
                    print("\t to", dirname)
            elif args['--verbose']:
                report_singlet(bunchnum, bunch, bunches, meantimes)