diff --git a/CHANGELOG.md b/CHANGELOG.md
index 359a6c0..c7f264d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,7 +10,7 @@
 - adding `requirements.txt` for easy creation of environment in "spacesavers2" docker (#68, @kopardev)
 - `grubbers` has new `--outfile` argument.
-- `blamematrix` has 3 new arguments `--humanreable`, `--includezeros` and `--outfile`.
+- `blamematrix` has now been moved into `mimeo`.
 - `mimeo` files.gz always includes the original file as the first one in the filelist.
 - `mimeo` now has kronatools compatible output. ktImportText is also run if in PATH to generate HTML report for duplicates only. (#46, @kopardev)
 - documentation updated.
@@ -23,6 +23,8 @@
 - `blamematrix` fixed to account for changes due to #71
 - `usurp` fixed to account for changes due to #71. Now using the new "original file" column while creating hard-links.
 - `e2e` overhauled, improved and well commented.
+- total size now closely resembles `df` results (fix #75, @kopardev)
+- files with future timestamps are handled correctly (fix #76, @kopardev)
 
 ## spacesavers2 0.10.2
 
diff --git a/README.md b/README.md
index 90e8518..c31b6a2 100644
--- a/README.md
+++ b/README.md
@@ -20,19 +20,13 @@ Welcome! `spacesavers2`:
 
 > New improved parallel implementation of [`spacesavers`](https://github.com/CCBR/spacesavers). `spacesavers` is soon to be decommissioned!
 
-> Note: `spacesavers2` requires [python version 3.11](https://www.python.org/downloads/release/python-3110/) or later and the [xxhash](https://pypi.org/project/xxhash/) library. These dependencies are already installed on biowulf (as a conda env). The environment for running `spacesavers2` can get set up using:
->
-> ```bash
-> . "/data/CCBR_Pipeliner/db/PipeDB/Conda/etc/profile.d/conda.sh" && \
-> conda activate py311
-> ```
+> Note: `spacesavers2` requires [python version 3.11](https://www.python.org/downloads/release/python-3110/) or later and the [xxhash](https://pypi.org/project/xxhash/) library. These dependencies are already installed on biowulf (as a conda env).
 
 ## `spacesavers2` has the following Basic commands:
 
 - spacesavers2_catalog
 - spacesavers2_mimeo
 - spacesavers2_grubbers
-- spacesavers2_blamematrix
 - spacesavers2_e2e
 - spacesavers2_usurp
 
diff --git a/docs/assets/images/spacesavers2.png b/docs/assets/images/spacesavers2.png
index a1e9a16..7b1d669 100644
Binary files a/docs/assets/images/spacesavers2.png and b/docs/assets/images/spacesavers2.png differ
diff --git a/docs/blamematrix.md b/docs/blamematrix.md
deleted file mode 100644
index 7868c96..0000000
--- a/docs/blamematrix.md
+++ /dev/null
@@ -1,49 +0,0 @@
-## spacesavers2_blamematrix
-
-This takes in the `allusers.mimeo.files.gz` generated by `spacesavers2_mimeo` and processes it to create a matrix with:
-
-- folder paths as row-names
-- usernames as column-names
-- duplicate bytes as values in the matrix
-
-Deleting these high-value duplicates first will have the biggest impact on the users overall digital footprint.
-
-### Inputs
-
-- `--filesgz` output file from `spacesavers2_mimeo`.
-- `--level` depth at with to cutoff the output.
-- `--humanreable` make the output human readable, that is, output in MiB, GiB, TiB, etc. instead of bytes.
-- `--includezeros` include empty folders.
-- `--outfile` path to the output file.
- -```bash -% spacesavers2_blamematrix --help -usage: spacesavers2_blamematrix [-h] -f FILESGZ [-l LEVEL] [-r | --humanreable | --no-humanreable] [-z | --includezeros | --no-includezeros] [-o OUTFILE] [-v] - -spacesavers2_blamematrix: get per user duplicate sizes at a given folder level (default 3) - -options: - -h, --help show this help message and exit - -f FILESGZ, --filesgz FILESGZ - spacesavers2_mimeo prefix.allusers.mimeo.files.gz file - -l LEVEL, --level LEVEL - folder level to use for creating matrix - -r, --humanreable, --no-humanreable - sizes are printed in human readable format ... (default: Bytes) - -z, --includezeros, --no-includezeros - include folders where totalbytes is zero. - -o OUTFILE, --outfile OUTFILE - output tab-delimited file (default STDOUT) - -v, --version show program's version number and exit - -Version: - v0.10.2-dev -Example: - > spacesavers2_blamematrix -f /output/from/spacesavers2_mimeo/prefix.allusers.mimeo.files.gz -d 3 -o prefix.blamematrix.tsv -``` - -### Outputs - -Counts matrix with duplicate bytes per user per folder. - -> Note this can be used to generate a heatmap for quickly find folders with high duplicates and the user they belong to. diff --git a/mkdocs.yml b/mkdocs.yml index 3f8197e..3929983 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -102,6 +102,5 @@ nav: - catalog: catalog.md - mimeo: mimeo.md - grubbers: grubbers.md - - blamematrix: blamematrix.md - usurp: usurp.md - e2e: e2e.md diff --git a/spacesavers2_blamematrix b/spacesavers2_blamematrix deleted file mode 100755 index 9885c7c..0000000 --- a/spacesavers2_blamematrix +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -import gzip -import textwrap -import time - -from src.VersionCheck import version_check -from src.VersionCheck import __version__ - -version_check() - -# from src.FileDetails import FileDetails -from src.dfUnit import fgzblamer - -# from src.Summary import Summary -from src.utils import * -from datetime import date - -import argparse - - -def main(): - start = time.time() - scriptname = os.path.basename(__file__) - elog = textwrap.dedent( - """\ - Version: - {} - Example: - > spacesavers2_blamematrix -f /output/from/spacesavers2_mimeo/prefix.allusers.mimeo.files.gz -d 3 -o prefix.blamematrix.tsv - """.format( - __version__ - ) - ) - parser = argparse.ArgumentParser( - description="spacesavers2_blamematrix: get per user duplicate sizes at a given folder level (default 3)", - epilog=elog, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - - parser.add_argument( - "-f", - "--filesgz", - dest="filesgz", - required=True, - type=str, - default=sys.stdin, - help="spacesavers2_mimeo prefix.allusers.mimeo.files.gz file", - ) - parser.add_argument( - "-l", - "--level", - dest="level", - required=False, - type=int, - default=3, - help="folder level to use for creating matrix", - ) - parser.add_argument( - "-r", - "--humanreable", - dest="humanreable", - required=False, - action=argparse.BooleanOptionalAction, - help="sizes are printed in human readable format ... 
(default: Bytes)", - ) - parser.add_argument( - "-z", - "--includezeros", - dest="includezeros", - required=False, - action=argparse.BooleanOptionalAction, - help="include folders where totalbytes is zero.", - ) - parser.add_argument( - "-o", - "--outfile", - dest="outfile", - required=False, - type=str, - help="output tab-delimited file (default STDOUT)", - ) - parser.add_argument("-v", "--version", action="version", version=__version__) - - print_with_timestamp( - start=start, scriptname=scriptname, string="version: {}".format(__version__) - ) - - global args - args = parser.parse_args() - - blamematrix = dict() - blamematrix["allusers"] = dict() - with gzip.open(os.path.join(args.filesgz), "rt") as filesgz: - for l in filesgz: - dfu = fgzblamer() - properly_set = dfu.set(l, args.level) - if not properly_set: - continue - for user in dfu.users: - if not user in blamematrix: - blamematrix[user] = dict() - for folder in dfu.bm[user]: - if not folder in blamematrix[user]: - blamematrix[user][folder] = 0 - if not folder in blamematrix["allusers"]: - blamematrix["allusers"][folder] = 0 - blamematrix[user][folder] += dfu.bm[user][folder] - blamematrix["allusers"][folder] += dfu.bm[user][folder] - - if args.outfile: - of = open(args.outfile, "w") - else: - of = sys.stdout - - users = list(blamematrix.keys()) - folders = list(blamematrix["allusers"].keys()) - users2 = ["folder"] - users2.extend(users) - outstr = "\t".join(users2) - of.write("%s\n"%(outstr)) - for folder in folders: - outlist = [] - outlist.append(str(folder)) - for user in users: - try: - hrsize = blamematrix[user][folder] - if args.humanreable: - hrsize = get_human_readable_size(hrsize) - except KeyError: - hrsize = "0" - outlist.append(str(hrsize)) - if blamematrix["allusers"][folder] == 0 : - if args.includezeros: - of.write("%s\n"%("\t".join(outlist))) - else: - of.write("%s\n"%("\t".join(outlist))) - if args.outfile: of.close() - print_with_timestamp(start=start, scriptname=scriptname, string="Done!") - - -if __name__ == "__main__": - main() diff --git a/spacesavers2_catalog b/spacesavers2_catalog index 0ba81ca..30b0d35 100755 --- a/spacesavers2_catalog +++ b/spacesavers2_catalog @@ -17,19 +17,17 @@ from pathlib import Path def task(f): - if not os.path.isfile(f): - return "" - else: - fd = FileDetails() - fd.initialize( - f, - buffersize=args.buffersize, - thresholdsize=args.ignoreheadersize, - tb=args.buffersize, - sed=sed, - bottomhash=args.bottomhash, - ) - return "%s" % (fd) + fd = FileDetails() + fd.initialize( + f, + buffersize=args.buffersize, + thresholdsize=args.ignoreheadersize, + tb=args.buffersize, + sed=sed, + bottomhash=args.bottomhash, + st_block_byte_size=args.st_block_byte_size, + ) + return "%s" % (fd) def main(): @@ -84,7 +82,7 @@ def main(): help="this sized header of the file is ignored before extracting buffer of buffersize for xhash creation (only for special extensions files) default = 1024 * 1024 * 1024 bytes", ) parser.add_argument( - "-s", + "-x", "--se", dest="se", required=False, @@ -92,6 +90,15 @@ def main(): default="bam,bai,bigwig,bw,csi", help="comma separated list of special extensions (default=bam,bai,bigwig,bw,csi)", ) + parser.add_argument( + "-s", + "--st_block_byte_size", + dest="st_block_byte_size", + required=False, + default=512, + type=int, + help="st_block_byte_size on current filesystem (default 512)", + ) parser.add_argument( "-o", "--outfile", @@ -120,7 +127,9 @@ def main(): folder = args.folder p = Path(folder) - files = p.glob("**/*") + files = [p] + files2 = p.glob("**/*") 
+ files.extend(files2) if args.outfile: outfh = open(args.outfile, "w") diff --git a/spacesavers2_e2e b/spacesavers2_e2e index 26a3744..57d2bd2 100755 --- a/spacesavers2_e2e +++ b/spacesavers2_e2e @@ -13,8 +13,8 @@ source ${SCRIPT_DIR}/resources/argparse.bash || exit 1 argparse "$@" < ${outfile_catalog_log} 2> ${outfile_catalog_err} fi +sleep 60 + # spacesavers2_mimeo if [ "$?" == "0" ];then echo "Running spacesavers2_mimeo" -command -V ktImportText 2>/dev/null || module load kronatools || (>&2 echo "module kronatools could not be loaded"; exit 1) +command -V ktImportText 2>/dev/null || module load kronatools || (>&2 echo "module kronatools could not be loaded") spacesavers2_mimeo \ --catalog ${outfile_catalog} \ --outdir ${OUTFOLDER} \ --quota $QUOTA \ --duplicatesonly \ - --maxdepth 3 \ + --maxdepth $MAXDEPTH \ --p $prefix \ --kronaplot \ > ${outfile_mimeo_log} 2> ${outfile_mimeo_err} fi +sleep 60 + # spacesavers2_grubbers if [ "$?" == "0" ];then echo "Running spacesavers2_grubbers" && \ @@ -84,13 +88,4 @@ for filegz in `ls ${OUTFOLDER}/${prefix}*files.gz`;do done fi -# spacesavers2_blamematrix -if [ "$?" == "0" ];then -echo "Running spacesavers2_blamematrix" && \ -spacesavers2_blamematrix \ - --filesgz ${OUTFOLDER}/${prefix}.allusers.mimeo.files.gz \ - --level $LEVEL \ - --outfile ${outfile_blamematrix} \ - > ${outfile_blamematrix_log} 2> ${outfile_blamematrix_err} -fi -echo "Done!" +echo "Done!" \ No newline at end of file diff --git a/spacesavers2_grubbers b/spacesavers2_grubbers index 5bbd2a5..adc2c59 100755 --- a/spacesavers2_grubbers +++ b/spacesavers2_grubbers @@ -92,7 +92,7 @@ def main(): of = sys.stdout for fgitem in dups: - if fgitem.totalsize < top_limit: + if fgitem.totalsize <= top_limit: break saved += fgitem.totalsize of.write("%s\n"%(fgitem)) @@ -100,11 +100,11 @@ def main(): if args.outfile: of.close() - saved = get_human_readable_size(saved) + hrsaved = get_human_readable_size(saved) print_with_timestamp( start=start, scriptname=scriptname, - string="Deleting top grubbers will save {}!".format(saved), + string="Deleting top grubbers will save {} [ {} Bytes ] !".format(hrsaved,saved), ) print_with_timestamp(start=start, scriptname=scriptname, string="Done!") diff --git a/spacesavers2_mimeo b/spacesavers2_mimeo index 04a61e3..822c4ee 100755 --- a/spacesavers2_mimeo +++ b/spacesavers2_mimeo @@ -24,19 +24,6 @@ from datetime import date import argparse -def check_terminal_list(p,tlist): - outcome = -1 # append path to tlist - for i,p2 in enumerate(tlist): - if p.len < p2.len: - if p.path in p2.path: - outcome = -2 # path already in tlist - return outcome - else: - if p2.path in p.path: - outcome = i - return outcome - return outcome - def process_hh( uid, hashhash, @@ -45,7 +32,8 @@ def process_hh( maxdepth, uid2uname, gid2gname, - peruser_perfolder_summaries, + perfolder_summaries, + perfolder_dups, user_output, ): for h in hashhash.keys(): @@ -66,63 +54,42 @@ def process_hh( foldest = hashhash[h].flist[oldest_index] user_owns_original = False if foldest.uid == uid or 0 == uid : user_owns_original = True - uid_dup_file_index = [] - if hashhash[h].ndup_inode > 1: # there are duplicate inodes and hence there are duplicate files - inodes_already_summerized = list() + uid_file_index = list(filter(lambda x:x!=oldest_index,uid_file_index)) # remove oldest if present in list + inodes_already_summerized = list() + if hashhash[h].ndup_files > 0: # we have duplicates for i in uid_file_index: f = hashhash[h].flist[i] + fpath = f.apath + parent = fpath.parent fpaths = 
f.get_paths(mindepth, maxdepth) - if ( - i == oldest_index - ): # its the original file ... not a duplicate + if f.inode in inodes_already_summerized: # it is a hardlink for p in fpaths: - peruser_perfolder_summaries[p].nnondup_files += 1 - peruser_perfolder_summaries[p].non_dup_Bytes.append(f.size) - peruser_perfolder_summaries[p].non_dup_ages.append(f.mtime) - inodes_already_summerized.append(f.inode) # scenario where original already has a hard-link + perfolder_summaries[p].ndup_files += 1 else: - uid_dup_file_index.append(i) - # has the inode already been summarized - if f.inode in inodes_already_summerized: - for p in fpaths: - peruser_perfolder_summaries[p].ndup_files += 1 - else: - inodes_already_summerized.append(f.inode) - for p in fpaths: - peruser_perfolder_summaries[p].ndup_files+=1 - peruser_perfolder_summaries[p].dup_Bytes.append(f.size) - peruser_perfolder_summaries[p].dup_ages.append(f.mtime) - else: - # ndup_inode == 1 .. meaning there are no duplicate inodes .. can still have multiple hard linked files - # only count 1 file/hardlink for summary - i = uid_file_index[0] - f = hashhash[h].flist[i] - fpaths = f.get_paths(mindepth, maxdepth) - for p in fpaths: - peruser_perfolder_summaries[p].nnondup_files += 1 - peruser_perfolder_summaries[p].non_dup_Bytes.append(f.size) - peruser_perfolder_summaries[p].non_dup_ages.append(f.mtime) - if args.duplicatesonly: - if len(uid_dup_file_index) > 0: # this user has some duplicates - out_index = [oldest_index] - out_index.extend(uid_dup_file_index) - user_output.write( - "{}\n".format( - hashhash[h].str_with_name( - uid2uname, gid2gname, out_index - ) - ) - ) - else: - out_index = [] - if user_owns_original == False: - out_index.append(oldest_index) - out_index.extend(uid_file_index) - user_output.write( - "{}\n".format( - hashhash[h].str_with_name(uid2uname, gid2gname, out_index) - ) + inodes_already_summerized.append(f.inode) + if not parent in perfolder_dups: + perfolder_dups[fpath.parent] = 0 + perfolder_dups[fpath.parent] += f.calculated_size + for p in fpaths: + perfolder_summaries[p].ndup_files+=1 + perfolder_summaries[p].dup_Bytes.append(f.calculated_size) + perfolder_summaries[p].dup_ages.append(f.mtime) + else: # we only have 1 original file + if user_owns_original: + fpaths = foldest.get_paths(mindepth, maxdepth) + for p in fpaths: + perfolder_summaries[p].nnondup_files += 1 + perfolder_summaries[p].non_dup_Bytes.append(foldest.calculated_size) + perfolder_summaries[p].non_dup_ages.append(foldest.mtime) + out_index = [] + out_index.append(oldest_index) + out_index.extend(uid_file_index) + if args.duplicatesonly and len(out_index)==1: continue + user_output.write( + "{}\n".format( + hashhash[h].str_with_name(uid2uname, gid2gname, out_index) ) + ) def main(): @@ -241,19 +208,24 @@ def main(): start=start, scriptname=scriptname, string="Reading in catalog file..." 
) set_complete = True + folder_info = dict() with open(args.catalog) as catalog: for l in catalog: fd = FileDetails() set_complete = fd.set(l) if not set_complete: continue - if fd.issyml: + if fd.fld != "d" and fd.fld !="f": # not a file or folder continue # ignore all symlinks users.add(fd.uid) groups.add(fd.gid) - path_lens.add(get_file_depth(fd.apath)) + path_lens.add(fd.get_depth()) for p in fd.get_paths_at_all_depths(): paths.add(p) + if fd.fld == "d": + if not fd.apath in folder_info: + folder_info[fd.apath] = fd + continue hash = fd.xhash_top + "#" + fd.xhash_bottom if hash == "#": # happens when file cannot be read sys.stderr.write( @@ -304,8 +276,13 @@ def main(): scriptname=scriptname, string="Total Number of users: %d" % len(users), ) - + blamematrixtsv = os.path.join( + os.path.abspath(args.outdir), "blamematrix.tsv" + ) + blamematrix = dict() + all_blamematrix_paths = set() for uid in users: + blamematrix[uid] = dict() print_with_timestamp( start=start, scriptname=scriptname, @@ -337,9 +314,22 @@ def main(): with gzip.open(useroutputpath, "wt") as user_output, open( summaryfilepath, "a" ) as user_summary: - peruser_perfolder_summaries = dict() + perfolder_summaries = dict() + perfolder_dups = dict() for p in paths: - peruser_perfolder_summaries[p] = Summary(p) + perfolder_summaries[p] = Summary(p) + if not p in folder_info: + folder_info[p] = FileDetails() + folder_info[p].initialize(p) + fd = folder_info[p] + for p2 in fd.get_paths(mindepth,maxdepth): + if not p2 in folder_info: + folder_info[p2] = FileDetails() + folder_info[p2].initialize(p2) + fd2 = folder_info[p2] + if fd2.uid == uid or uid == 0: + perfolder_summaries[p2].folder_Bytes += fd.calculated_size + hashhashsplits = dict() # dict to collect instances where the files are NOT duplicates has same hashes but different sizes (and different inodes) ... new suffix is added to bottomhash .."_iterator" process_hh( uid, @@ -349,7 +339,8 @@ def main(): maxdepth, uid2uname, gid2gname, - peruser_perfolder_summaries, + perfolder_summaries, + perfolder_dups, user_output, ) if len(hashhashsplits) != 0: @@ -362,42 +353,35 @@ def main(): maxdepth, uid2uname, gid2gname, - peruser_perfolder_summaries, + perfolder_summaries, + perfolder_dups, user_output, ) del hashhashsplitsdummy del hashhashsplits for p in paths: - peruser_perfolder_summaries[p].update_scores(quota) - user_summary.write(f"{peruser_perfolder_summaries[p]}\n") + perfolder_summaries[p].update_scores(quota) + user_summary.write(f"{perfolder_summaries[p]}\n") + for p in perfolder_summaries: + dummy = FileDetails() + dummy.initialize(p) + if dummy.get_depth() == mindepth + 1: + all_blamematrix_paths.add(p) + blamematrix[uid][p] = sum(perfolder_summaries[p].dup_Bytes) if args.kronaplot: - terminal_paths = [] - with open(summaryfilepath,'r') as infile: - count = 0 - for l in infile: - l = l.strip().split("\t") - count += 1 - if count==1: - continue #header - if count==2: - terminal_paths.append(pathlen(l[0],int(l[2]))) - continue - p = pathlen(l[0],int(l[2])) - outcome = check_terminal_list(p,terminal_paths) - if outcome == -1: # new ... append - terminal_paths.append(p) - elif outcome == -2: # already in list .. move on - continue - elif outcome > -1: # better than current one in list ... 
swap
-                        terminal_paths[outcome] = p
+            print_with_timestamp(
+                start=start,
+                scriptname=scriptname,
+                string="Creating Kronachart for user: %s" % (uid2uname[uid]),
+            )
             with open(kronatsv,'w') as ktsv:
-                for p in terminal_paths:
-                    path = p.path
+                for p in perfolder_dups:
+                    path = str(p)
                     path = path.replace('/','\t')
                     path = path.replace('\t\t','\t')
-                    if p.dupbytes != 0 :
-                        ktsv.write("%d\t%s\n"%(p.dupbytes,path))
+                    if perfolder_dups[p] != 0:
+                        ktsv.write("%d\t%s\n"%(perfolder_dups[p],path))
             if ktImportText_in_path:
                 cmd = "ktImportText %s -o %s"%(kronatsv,kronahtml)
                 srun = subprocess.run(cmd,shell=True, capture_output=True, text=True)
@@ -405,8 +389,31 @@
                 sys.stderr.write("%s\n"%(srun.stderr))
         del hashhash
-    print_with_timestamp(start=start, scriptname=scriptname, string="Finished!")
+    print_with_timestamp(
+        start=start,
+        scriptname=scriptname,
+        string="Creating Blamematrix",
+    )
+    with open(blamematrixtsv,'w') as btsv:
+        outlist = ["path"]
+        uids = list(blamematrix.keys())
+        uids.sort()
+        for uid in uids:
+            outlist.append(uid2uname[uid])
+        btsv.write("\t".join(outlist)+"\n")
+        for p in all_blamematrix_paths:
+            outlist = [str(p)]
+            s = 0
+            for uid in uids:
+                if p in blamematrix[uid]:
+                    s += blamematrix[uid][p]
+                    outlist.append(str(blamematrix[uid][p]))
+                else:
+                    outlist.append(str(0))
+            if s != 0 : btsv.write("\t".join(outlist)+"\n")
+
+    print_with_timestamp(start=start, scriptname=scriptname, string="Finished!")
 
 if __name__ == "__main__":
     main()
diff --git a/spacesavers2_usurp b/spacesavers2_usurp
index 1ed5cac..2c79c9a 100755
--- a/spacesavers2_usurp
+++ b/spacesavers2_usurp
@@ -11,10 +11,6 @@ from src.VersionCheck import __version__
 
 version_check()
 
-# from src.FileDetails import FileDetails
-from src.dfUnit import fgzblamer
-
-# from src.Summary import Summary
 from src.utils import *
 from datetime import date
 
@@ -81,7 +77,7 @@ def main():
             lhash = l[0]
             if args.hash in lhash:
                 original_copy = Path(l[4].strip('"'))
-                dupfiles = l[5].split(";")
+                dupfiles = l[5].split("##")
                 print_with_timestamp(
                     start=start,
                     scriptname=scriptname,
diff --git a/src/FileDetails.py b/src/FileDetails.py
index 1204257..ce7a4db 100644
--- a/src/FileDetails.py
+++ b/src/FileDetails.py
@@ -9,8 +9,8 @@
 except ImportError:
     exit(f"{sys.argv[0]} requires xxhash module")
 
-THRESHOLDSIZE = 1024 * 1024 * 1024
-BUFFERSIZE = 128 * 1024
+THRESHOLDSIZE = 1024 * 1024 * 1024 # 1 GiB
+BUFFERSIZE = 128 * 1024 # 128 KiB
 TB = THRESHOLDSIZE+BUFFERSIZE
 SEED = 20230502
 MINDEPTH = 3
@@ -21,14 +21,33 @@
     SED[se]=1
 
 def convert_time_to_age(t):
-    currenttime=int(time.time())
-    return int((currenttime - t)/86400)+1
+    currenttime = int(time.time())
+    age = int((currenttime - t)/86400)+1
+    if age < 0: age = 0
+    return age
+
+def get_type(p):
+    x = "u" # unknown
+    if not p.exists():
+        x = "a" # absent
+        return x
+    if p.is_symlink():
+        x = "l" # link or symlink
+        return x
+    if p.is_dir():
+        x = "d" # directory
+        return x
+    if p.is_file():
+        x = "f" # file
+        return x
+    return x
 
 class FileDetails:
     def __init__(self):
         self.apath = "" # absolute path of file
-        self.issyml = False
+        self.fld = "u" # is it file or directory or link or unknown or absent ... 
values are f d l u a self.size = -1 + self.calculated_size = -1 self.dev = -1 self.inode = -1 self.nlink = -1 @@ -40,12 +59,13 @@ def __init__(self): self.xhash_top = "" self.xhash_bottom = "" - def initialize(self,f,thresholdsize=THRESHOLDSIZE, buffersize=BUFFERSIZE, tb=TB, sed=SED, bottomhash=False): + def initialize(self,f,thresholdsize=THRESHOLDSIZE, buffersize=BUFFERSIZE, tb=TB, sed=SED, bottomhash=False,st_block_byte_size=512): self.apath = Path(f).absolute() # path is of type PosixPath ext = self.apath.suffix - self.issyml = self.apath.is_symlink() # is a symbolic link - st = os.stat(self.apath) # gather all stats + self.fld = get_type(self.apath) # get if it is a file or directory or link or unknown or absent + st = self.apath.stat(follow_symlinks=False) # gather stat results self.size = st.st_size # size in bytes + self.calculated_size = st.st_blocks * st_block_byte_size # st_blocks gives number of 512 bytes blocks used self.dev = st.st_dev # Device id self.inode = st.st_ino # Inode self.nlink = st.st_nlink # number of hardlinks @@ -54,39 +74,40 @@ def initialize(self,f,thresholdsize=THRESHOLDSIZE, buffersize=BUFFERSIZE, tb=TB, self.ctime = convert_time_to_age(st.st_ctime) # creation time self.uid = st.st_uid # user id self.gid = st.st_gid # group id - try: - with open(self.apath,'rb') as fh: - if ext in sed: - if self.size > tb: - data = fh.read(thresholdsize) - data = fh.read(buffersize) - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - if bottomhash: - fh.seek(-1 * buffersize,2) + if self.fld == "f": + try: + with open(self.apath,'rb') as fh: + if ext in sed: + if self.size > tb: + data = fh.read(thresholdsize) data = fh.read(buffersize) - self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() + if bottomhash: + fh.seek(-1 * buffersize,2) + data = fh.read(buffersize) + self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + else: + self.xhash_bottom = self.xhash_top else: + data = fh.read() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() self.xhash_bottom = self.xhash_top else: - data = fh.read() - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - self.xhash_bottom = self.xhash_top - else: - if self.size > buffersize: - data = fh.read(buffersize) - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - if bottomhash: - fh.seek(-1 * buffersize,2) + if self.size > buffersize: data = fh.read(buffersize) - self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() + if bottomhash: + fh.seek(-1 * buffersize,2) + data = fh.read(buffersize) + self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + else: + self.xhash_bottom = self.xhash_top else: + data = fh.read() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() self.xhash_bottom = self.xhash_top - else: - data = fh.read() - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - self.xhash_bottom = self.xhash_top - except: - sys.stderr.write("spacesavers2:{}:File cannot be read:{}\n".format(self.__class__.__name__,str(self.apath))) + except: + sys.stderr.write("spacesavers2:{}:File cannot be read:{}\n".format(self.__class__.__name__,str(self.apath))) def set(self,ls_line): original_ls_line=ls_line @@ -105,9 +126,9 @@ def set(self,ls_line): self.nlink = int(ls_line.pop(-1)) self.inode = int(ls_line.pop(-1)) self.dev = int(ls_line.pop(-1)) + self.calculated_size = int(ls_line.pop(-1)) self.size = int(ls_line.pop(-1)) - issyml = 
ls_line.pop(-1)
-        self.issyml = issyml == 'True'
+        self.fld = ls_line.pop(-1)
         self.apath = Path(";".join(ls_line)) # sometimes filename have ";" in them ... hence this!
         return True
     except:
@@ -119,8 +140,9 @@ def str_with_name(self,uid2uname,gid2gname):# method for printing output in mime
         # return_str = "\"%s\";"%(self.apath) # path may have newline char which should not be interpretted as new line char
         return_str = "\"%s\";"%(str(self.apath).encode('unicode_escape').decode('utf-8'))
-        # return_str += "%s;"%(self.issyml)
+        return_str += "%s;"%(self.fld)
         return_str += "%d;"%(self.size)
+        return_str += "%d;"%(self.calculated_size)
         return_str += "%d;"%(self.dev)
         return_str += "%d;"%(self.inode)
         return_str += "%d;"%(self.nlink)
@@ -137,8 +159,9 @@ def __str__(self):
         # return_str = "\"%s\";"%(self.apath) # path may have newline char which should not be interpretted as new line char
         return_str = "\"%s\";"%(str(self.apath).encode('unicode_escape').decode('utf-8'))
-        return_str += "%s;"%(self.issyml)
+        return_str += "%s;"%(self.fld)
         return_str += "%d;"%(self.size)
+        return_str += "%d;"%(self.calculated_size)
         return_str += "%d;"%(self.dev) # device id
         return_str += "%d;"%(self.inode)
         return_str += "%d;"%(self.nlink)
@@ -151,12 +174,27 @@
         return_str += "%s;"%(self.xhash_bottom)
         return return_str
 
-    def get_paths_at_all_depths(self): # for files
-        return self.apath.parents[:-1] # remove the last one ... which will be '/'
+    def get_paths_at_all_depths(self): # for files and folders
+        p = self.apath
+        paths = []
+        if self.fld == "d":
+            paths.append(p)
+        paths.extend(p.parents[:-1]) # remove the last one ... which will be '/'
+        return paths
 
     def get_paths(self,mindepth,maxdepth):
-        parents = list(self.apath.parents[0:-1])
-        parents = list(filter(lambda x:get_folder_depth(x) <= maxdepth,parents))
-        parents = list(filter(lambda x:get_folder_depth(x) >= mindepth,parents))
-        return parents
+        paths = self.get_paths_at_all_depths()
+        paths = list(filter(lambda x:get_folder_depth(x) <= maxdepth,paths))
+        paths = list(filter(lambda x:get_folder_depth(x) >= mindepth,paths))
+        return paths
+
+    def get_depth(self):
+        p = self.apath
+        try:
+            if p.is_dir(): # folder
+                return len(list(p.parents))
+            else: # file
+                return len(list(p.parents)) - 1
+        except:
+            print('get_depth error for path:"{}", type:{}'.format(p, type(p)))
+            exit()
\ No newline at end of file
diff --git a/src/Summary.py b/src/Summary.py
index fe5ab54..28435fa 100644
--- a/src/Summary.py
+++ b/src/Summary.py
@@ -33,6 +33,7 @@ def __init__(self,path):
         self.ndup_files = 0
         self.non_dup_Bytes = []
         self.dup_Bytes = []
+        self.folder_Bytes = 0
         self.non_dup_ages = []
         self.dup_ages = []
         self.non_dup_age_scores = []
@@ -71,7 +72,7 @@ def print_header(self):
     def __str__(self):
         dup_Bytes = sum(self.dup_Bytes)
-        tot_Bytes = sum(self.non_dup_Bytes) + dup_Bytes
+        tot_Bytes = sum(self.non_dup_Bytes) + dup_Bytes + self.folder_Bytes
         try:
             dup_mean_age = sum(self.dup_ages)/len(self.dup_ages)
         except ZeroDivisionError:
diff --git a/src/dfUnit.py b/src/dfUnit.py
index 6be1a68..73c9e1b 100644
--- a/src/dfUnit.py
+++ b/src/dfUnit.py
@@ -3,7 +3,7 @@ def get_filename_from_fgzlistitem(string):
     string = string.strip().split(";")[:-1]
-    for i in range(9):
+    for i in range(11):
         dummy = string.pop(-1)
     filename = ";".join(string)
     return filename
@@ -12,12 +12,13 @@ class dfUnit:
     def __init__(self,hash):
         self.hash = hash # typically hash_top + "#" + hash_bottom
-        self.flist = [] # list of _ls files with the same hash 
- self.fsize = -1 # size of each file + self.flist = [] # list of catalog files with the same hash + self.fsize = -1 # calculated size of each file self.ndup = -1 # files in flist with same size, but different inode (they already have the same hash) self.ndup_files = -1 # number of duplicate files ... used for counting duplicate files self.ndup_inode = -1 # number of duplicate inodes ... used for counting duplicate bytes self.size_set = set() # set of unique sizes ... if len(size_set) then split is required + self.calculated_size_list = [] self.uid_list = [] # list of uids of files added self.inode_list = [] # list of inodes of files added self.oldest_inode = -1 # oldest_ ... is for the file which is NOT the duplicate or is the original @@ -33,6 +34,7 @@ def add_fd(self,fd): self.flist.append(fd) # add size if not already present self.size_set.add(fd.size) + self.calculated_size_list.append(fd.calculated_size) # add uid self.uid_list.append(fd.uid) # add inode @@ -71,7 +73,7 @@ def compute(self,hashhashsplits): self.ndup = len(self.inode_list) - 1 #ndup is zero if same len(size_set)==1 and len(inode_list)==1 self.ndup_inode = len(set(self.inode_list)) - 1 self.ndup_files = len(self.inode_list) - 1 - self.fsize = self.flist[0].size + self.fsize = self.flist[0].calculated_size return split_required def get_user_file_index(self,uid): @@ -87,10 +89,10 @@ def get_user_file_index(self,uid): def __str__(self): - return "{0} : {1} {2} {3}".format(self.hash, self.ndup, self.fsize,"##".join(map(lambda x:str(x),self.flist))) + return "{0} : {1} {2} {3}".format(self.hash, self.ndup_inode, self.fsize,"##".join(map(lambda x:str(x),self.flist))) def str_with_name(self,uid2uname, gid2gname,findex): - return "{0} : {1} {2} {3}".format(self.hash, self.ndup, self.fsize,"##".join(map(lambda x:x.str_with_name(uid2uname,gid2gname),[self.flist[i] for i in findex]))) + return "{0} : {1} {2} {3}".format(self.hash, self.ndup_inode, self.fsize,"##".join(map(lambda x:x.str_with_name(uid2uname,gid2gname),[self.flist[i] for i in findex]))) class fgz: # used by grubber @@ -109,10 +111,10 @@ def __str__(self): outstring=[] outstring.append(str(self.hash)) outstring.append(str(self.ndup)) - outstring.append(get_human_readable_size(self.totalsize)) - outstring.append(get_human_readable_size(self.filesize)) + outstring.append(str(self.totalsize)) + outstring.append(str(self.filesize)) outstring.append(get_filename_from_fgzlistitem(self.of)) - outstring.append(";".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds))) + outstring.append("##".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds))) return "\t".join(outstring) # return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,get_human_readable_size(self.totalsize), get_human_readable_size(self.filesize), ";".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds))) # return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,self.totalsize, self.filesize, ";".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds))) @@ -135,90 +137,7 @@ def set(self,inputline): self.ndup = total_ndup # these are user number of duplicates/files self.of = fds.pop(0) # one file is the original self.fds = fds # others are dupicates - inodes_set = set() - for f in fds: - l = f.split(";") - inodes_set.add(l[-7]) - self.totalsize = self.ndup * self.filesize - return True - except: - sys.stderr.write("spacesavers2:{0}:files.gz Do not understand line:{1} with {2} elements.\n".format(self.__class__.__name__,original_line,len(inputline))) - # exit() - return False - - -class 
FileDetails2: - def __init__(self): - self.apath = "" - self.size = -1 - self.dev = -1 - self.inode = -1 - self.nlink = -1 - self.mtime = -1 - self.uid = -1 - self.gid = -1 - self.uname = "" - self.gname = "" - - def set(self,fgzline): - original_fgzline=fgzline - # print(ls_line) - try: - fgzline = fgzline.strip().replace("\"","").split(";")[:-1] - if len(fgzline) < 10: - raise Exception("Less than 10 items in the line.") - self.gname = fgzline.pop(-1) - self.uname = fgzline.pop(-1) - self.gid = int(fgzline.pop(-1)) - self.uid = int(fgzline.pop(-1)) - self.mtime = int(fgzline.pop(-1)) - self.nlink = int(fgzline.pop(-1)) - self.inode = int(fgzline.pop(-1)) - self.dev = int(fgzline.pop(-1)) - self.size = int(fgzline.pop(-1)) - apath = ";".join(fgzline) - apath = apath.strip("\"") - self.apath = Path(apath) # sometimes filename have ";" in them ... hence this! - return True - except: - sys.stderr.write("spacesavers2:{0}:catalog Do not understand line:\"{1}\" with {2} elements.\n".format(self.__class__.__name__,original_fgzline,len(fgzline))) - # exit() - return False - -class fgzblamer: # used by blamematrix - def __init__(self): - self.hash = "" - self.ndup = -1 - self.users = set() - self.folders = set() - self.bm = dict() - self.fds = [] - - def set(self,inputline,depth): - original_line = inputline - try: - inputline = inputline.strip().split(" ") - if len(inputline) < 5: - raise Exception("Less than 5 items in the line.") - self.hash = inputline.pop(0) - dummy = inputline.pop(0) - self.ndup = int(inputline.pop(0)) - if self.ndup == 0 or self.ndup == 1: return False - self.filesize = int(inputline.pop(0)) - full_fds = " ".join(inputline) - fds = full_fds.split("##") - for f in fds: - fd = FileDetails2() - fd.set(f) - self.users.add(fd.uname) - fad=get_folder_at_depth(fd.apath,depth) - self.folders.add(fad) - if not fd.uname in self.bm: - self.bm[fd.uname] = dict() - if not fad in self.bm[fd.uname]: - self.bm[fd.uname][fad] = 0 - self.bm[fd.uname][fad] += self.filesize - self.fds = [] + self.totalsize = total_ndup * self.filesize return True except: sys.stderr.write("spacesavers2:{0}:files.gz Do not understand line:{1} with {2} elements.\n".format(self.__class__.__name__,original_line,len(inputline))) diff --git a/src/utils.py b/src/utils.py index 279046d..68c4b51 100644 --- a/src/utils.py +++ b/src/utils.py @@ -72,18 +72,6 @@ def get_folder_depth(path): return len(list(path.parents)) -def get_file_depth(path): -# example -# >>> len(list(Path("/f1/f2/f3/f4/a.xyz").absolute().parents))-1 -# 4 -# a.k.a. file a.xyz is 4 folders deep - try: - return len(list(path.parents)) - 1 - except: - print('get_file_depth error for file:"{}", type:{}'.format(path, type(path))) - exit() - - def get_timestamp(start): e = time.time() return "%08.2fs" % (e - start)
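
Notes on the two headline fixes (#75, #76), with small illustrative sketches (Python 3.11 on Linux assumed; `du_like_size` below is a hypothetical helper written for this note, not part of the spacesavers2 codebase).

For #75: the new `calculated_size` field is computed from `st_blocks`, which counts allocated 512-byte blocks, so summing it tracks what `df`/`du` report (tail blocks and sparse files included), whereas `st_size` is only the apparent byte length:

```python
import os

def du_like_size(path, st_block_byte_size=512):
    # Allocated size, roughly as df/du would account it. POSIX leaves the
    # st_blocks unit implementation-defined, hence catalog's
    # --st_block_byte_size override (default 512, as on Linux).
    st = os.stat(path, follow_symlinks=False)
    return st.st_blocks * st_block_byte_size

# A 1-byte file still occupies a whole filesystem block, while a sparse file
# can report a large st_size with few allocated blocks, e.g. (typical ext4):
#   os.stat("tiny.txt").st_size  -> 1
#   du_like_size("tiny.txt")     -> 4096
```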
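For #76: `convert_time_to_age` previously returned negative ages for files with future mtimes (clock skew, badly restored archives), which skewed the age scores; the patched version clamps the age at zero. Restated from the diff above, with a usage note:

```python
import time

def convert_time_to_age(t):
    # Age in whole days, never negative: future timestamps clamp to 0.
    currenttime = int(time.time())
    age = int((currenttime - t) / 86400) + 1
    if age < 0:
        age = 0
    return age

# A timestamp 10 days in the future now yields 0 instead of -9:
#   convert_time_to_age(time.time() + 10 * 86400)  -> 0
```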