diff --git a/CHANGELOG.md b/CHANGELOG.md index 86f9e4b..b65cf73 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,11 +2,29 @@ ### New features -- adding `requirements.txt` for easy creation of environment in "spacesavers2" docker (#68, @kopardev) +### Bug fixes -## Bug fixes +## spacesavers2 0.11.0 -- +### New features + +- Add `requirements.txt` for easy creation of environment in "spacesavers2" docker (#68, @kopardev) +- `grubbers` has new `--outfile` argument. +- `blamematrix` has now been moved into `mimeo`. +- `mimeo` files.gz always includes the original file as the first one in the filelist. +- `mimeo` now has kronatools compatible output. ktImportText is also run if in PATH to generate HTML report for duplicates only. (#46, @kopardev) +- Update documentation. + +### Bug fixes + +- `e2e` overhauled, improved and well commented. +- `grubbers` `--limit` can be < 1 GiB (float) (#70, @kopardev) +- `grubbers` output file format changed. New original file column added. Original file is required by `usurp`. +- `mimeo` `--duplicateonly` now correctly handles duplicates owned by different UIDs. (#71, @kopardev) + - Update `blamematrix` and to account for corrected duplicate handling in `mimeo`. +- `usurp` now uses the new "original file" column from `grubbers` while creating hard-links. +- Total size now closely resembles `df` results (fix #75 @kopardev) +- Files with future timestamps are handled correctly (fix #76, @kopardev) ## spacesavers2 0.10.2 diff --git a/README.md b/README.md index 90e8518..c31b6a2 100644 --- a/README.md +++ b/README.md @@ -20,19 +20,13 @@ Welcome! `spacesavers2`: > New improved parallel implementation of [`spacesavers`](https://github.com/CCBR/spacesavers). `spacesavers` is soon to be decommissioned! -> Note: `spacesavers2` requires [python version 3.11](https://www.python.org/downloads/release/python-3110/) or later and the [xxhash](https://pypi.org/project/xxhash/) library. These dependencies are already installed on biowulf (as a conda env). The environment for running `spacesavers2` can get set up using: -> -> ```bash -> . "/data/CCBR_Pipeliner/db/PipeDB/Conda/etc/profile.d/conda.sh" && \ -> conda activate py311 -> ``` +> Note: `spacesavers2` requires [python version 3.11](https://www.python.org/downloads/release/python-3110/) or later and the [xxhash](https://pypi.org/project/xxhash/) library. These dependencies are already installed on biowulf (as a conda env). ## `spacesavers2` has the following Basic commands: - spacesavers2_catalog - spacesavers2_mimeo - spacesavers2_grubbers -- spacesavers2_blamematrix - spacesavers2_e2e - spacesavers2_usurp diff --git a/docs/assets/images/spacesavers2.png b/docs/assets/images/spacesavers2.png index a1e9a16..7b1d669 100644 Binary files a/docs/assets/images/spacesavers2.png and b/docs/assets/images/spacesavers2.png differ diff --git a/docs/blamematrix.md b/docs/blamematrix.md deleted file mode 100644 index e4f60f6..0000000 --- a/docs/blamematrix.md +++ /dev/null @@ -1,38 +0,0 @@ -## spacesavers2_grubbers - -This takes in the `allusers.files.gz` generated by `spacesavers2_mimeo` and processes it to create a matrix with: - -- folder paths as row-names -- usernames as column-names -- duplicate bytes as values in the matrix - -Deleting these high-value duplicates first will have the biggest impact on the users overall digital footprint. - -### Inputs - -- `--filesgz` output file from `spacesavers2_mimeo`. -- `--limit` lower cut-off for output display (default 5 GiB). 
This means that duplicates with overall size of less than 5 GiB will not be displayed. - -```bash -% spacesavers2_blamematrix --help -spacesavers2_blamematrix:00000.19s:version: v0.5 -usage: spacesavers2_blamematrix [-h] -f FILESGZ [-l LEVEL] - -spacesavers2_blamematrix: get per user duplicate sizes at a given folder level (default 3) - -options: - -h, --help show this help message and exit - -f FILESGZ, --filesgz FILESGZ - spacesavers2_mimeo prefix.allusers.files.gz file - -l LEVEL, --level LEVEL - folder level to use for creating matrix - -Version: - v0.5 -Example: - > spacesavers2_blamematrix -f /output/from/spacesavers2_mimeo/prefix.allusers.files.gz -d 3 -``` - -### Outputs - -Counts matrix with duplicate bytes per user per folder. diff --git a/docs/catalog.md b/docs/catalog.md index 6405d9b..5b51044 100644 --- a/docs/catalog.md +++ b/docs/catalog.md @@ -58,7 +58,7 @@ Example: `spacesavers2_catalog` creates one semi-colon seperated output line per input file. Here is an example line: ```bash -% head -n1 test.ls_out +% head -n1 test.catalog "/data/CBLCCBR/kopardevn_tmp/spacesavers2_testing/_data_CCBR_Pipeliner_db_PipeDB_Indices.ls.old";False;1653453;47;372851499;1;1;5;5;37513;57886;4707e661a1f3beca1861b9e0e0177461;52e5038016c3dce5b6cdab635765cc79; ``` The 13 items in the line are as follows: diff --git a/docs/e2e.md b/docs/e2e.md index a47b59e..d81e7aa 100644 --- a/docs/e2e.md +++ b/docs/e2e.md @@ -18,12 +18,16 @@ End-to-end run of spacesavers2 options: -h, --help show this help message and exit - -i INFOLDER, --infolder INFOLDER - Folder to run spacesavers_ls on. + -f FOLDER, --folder FOLDER + Folder to run spacesavers_catalog on. -p THREADS, --threads THREADS number of threads to use + -d MAXDEPTH, --maxdepth MAXDEPTH + maxdepth for mimeo + -l LIMIT, --limit LIMIT + limit for running spacesavers_grubbers -q QUOTA, --quota QUOTA total size of the volume (default = 200 for /data/CCBR) -o OUTFOLDER, --outfolder OUTFOLDER - Folder where all spacesavers_finddup output files will be saved + Folder where all spacesavers_e2e output files will be saved ``` \ No newline at end of file diff --git a/docs/grubbers.md b/docs/grubbers.md index 5c38682..a71e9e2 100644 --- a/docs/grubbers.md +++ b/docs/grubbers.md @@ -1,35 +1,38 @@ ## spacesavers2_grubbers -This takes in the `.files.gz` generated by `spacesavers2_mimeo` and processes it to: +This takes in the `mimeo.files.gz` generated by `spacesavers2_mimeo` and processes it to: - sort duplicates by total size - reports the "high-value" duplicates. -Deleting these high-value duplicates first will have the biggest impact on the users overall digital footprint +Deleting these high-value duplicates first will have the biggest impact on the users overall digital footprint. ### Inputs - `--filesgz` output file from `spacesavers2_mimeo`. -- `--limit` lower cut-off for output display (default 5 GiB). This means that duplicates with overall size of less than 5 GiB will not be displayed. +- `--limit` lower cut-off for output display (default 5 GiB). This means that duplicates with overall size of less than 5 GiB will not be displayed. Set 0 to report all. 
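+
+As a quick sketch of a typical invocation (the input and output paths below are only placeholders), the sorted table can be written to a file instead of STDOUT, and fractional limits such as 0.5 GiB are accepted because `--limit` is parsed as a float:
+
+```bash
+# keep duplicates whose combined size exceeds 0.5 GiB and save the table as TSV
+spacesavers2_grubbers \
+    --filesgz /output/from/spacesavers2_mimeo/prefix.allusers.mimeo.files.gz \
+    --limit 0.5 \
+    --outfile prefix.allusers.grubbers.tsv
+```
+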
```bash -% spacesavers2_grubbers --help -spacesavers2_grubbers:00000.01s:version: v0.5 -usage: spacesavers2_grubbers [-h] -f FILESGZ [-l LIMIT] +╰─○ spacesavers2_grubbers --help +spacesavers2_grubbers:00000.00s:version: v0.10.2-dev +usage: spacesavers2_grubbers [-h] -f FILESGZ [-l LIMIT] [-o OUTFILE] [-v] spacesavers2_grubbers: get list of large duplicates sorted by total size options: -h, --help show this help message and exit -f FILESGZ, --filesgz FILESGZ - spacesavers2_mimeo prefix..files.gz file + spacesavers2_mimeo prefix..mimeo.files.gz file -l LIMIT, --limit LIMIT - stop showing duplicates with total size smaller then (5 default) GiB + stop showing duplicates with total size smaller than (5 default) GiB. Set 0 for unlimited. + -o OUTFILE, --outfile OUTFILE + output tab-delimited file (default STDOUT) + -v, --version show program's version number and exit Version: - v0.5 + v0.10.2-dev Example: - > spacesavers2_grubbers -f /output/from/spacesavers2_mimeo/prefix.files.gz + > spacesavers2_grubbers -f /output/from/spacesavers2_finddup/prefix.files.gz ``` ### Outputs @@ -40,18 +43,16 @@ The output is displayed on STDOUT and is tab-delimited with these columns: | ------ | ------------------------------------- | | 1 | combined hash | | 2 | number of duplicates found | -| 3 | total size of all duplicates | -| 4 | size of each duplicate | -| 5 | ";"-separated list of duplicates | -| 6 | duplicate files | +| 3 | total size of all duplicates (human readable) | +| 4 | size of each duplicate (human readable) | +| 5 | original file | +| 6 | ";"-separated list of duplicates files | Here is an example output line: ```bash -ca269c980de3f0d8e6668b88d9065c8f#5003f92f52d71437741e4e79c4339a66 3 21.99 GiB 7.33 GiB "/data/CCBR/ccbr754_Yoshimi/ccbr754/workdir_170403_postinitialrnas -eq2/0h_1_S25.p2.Aligned.toTranscriptome.sorted.bam";"/data/CCBR/ccbr754_Yoshimi/ccbr754targz/data/CCBR/projects/ccbr754/workdir_170403_postinitialrnaseq2/0h_1_S25.p2.Aligned.toTr -anscriptome.sorted.bam";"/data/CCBR/ccbr754_Yoshimi/ccbr754targz/data/CCBR/projects/ccbr754/workdir_170403_postinitialrnaseq2/0h_1_S25.p2.Aligned.toTranscriptome.sorted.sorted.ba -m" +183e9dc341073d9b75c817f5ed07b9ac#183e9dc341073d9b75c817f5ed07b9ac 5 0.07 KiB 0.01 KiB "/data/CCBR/abdelmaksoudaa/test/a" "/data/CCBR/abdelmaksoudaa/test/b";"/data/CCBR/abde +lmaksoudaa/test/c";"/data/CCBR/abdelmaksoudaa/test/d";"/data/CCBR/abdelmaksoudaa/test/e";"/data/CCBR/abdelmaksoudaa/test/f" ``` > `spacesavers2_grubbers` is typical used to find the "low-hanging" fruits ... aka ... the "high-value" duplicates which need to be deleted first to quickly have the biggest impact on the users overall digital footprint. \ No newline at end of file diff --git a/docs/mimeo.md b/docs/mimeo.md index 215ae48..2336721 100644 --- a/docs/mimeo.md +++ b/docs/mimeo.md @@ -1,13 +1,13 @@ ## spacesavers2_mimeo -This takes in the `ls_out` generated by `spacesavers2_catalog` and processes it to: +This takes in the `catalog` file generated by `spacesavers2_catalog` and processes it to: - find duplicates - create per-user summary reports for each user (and all users). ### Inputs -- `--lsout` is the output file from `spacesavers2_catalog`. Thus, `spacesavers2_catalog` needs to be run before running `spacesavers2_mimeo`. +- `--catalog` is the output file from `spacesavers2_catalog`. Thus, `spacesavers2_catalog` needs to be run before running `spacesavers2_mimeo`. 
- `--maxdepth` maximum folder depth upto which reports are aggregated - `--outdir` path to the output folder - `--prefix` prefix to be added to the output file names eg. date etc. @@ -16,17 +16,16 @@ This takes in the `ls_out` generated by `spacesavers2_catalog` and processes it ```bash % spacesavers2_mimeo --help -spacesavers2_mimeo:00000.02s:version: v0.5 -usage: spacesavers2_mimeo [-h] -f LSOUT [-d MAXDEPTH] [-o OUTDIR] [-p PREFIX] [-q QUOTA] [-z | --duplicatesonly | --no-duplicatesonly] +usage: spacesavers2_mimeo [-h] -f CATALOG [-d MAXDEPTH] [-o OUTDIR] [-p PREFIX] [-q QUOTA] [-z | --duplicatesonly | --no-duplicatesonly] [-k | --kronaplot | --no-kronaplot] [-v] spacesavers2_mimeo: find duplicates options: -h, --help show this help message and exit - -f LSOUT, --catalog LSOUT + -f CATALOG, --catalog CATALOG spacesavers2_catalog output from STDIN or from catalog file -d MAXDEPTH, --maxdepth MAXDEPTH - folder max. depth upto which reports are aggregated + folder max. depth upto which reports are aggregated ... absolute path is used to calculate depth (Default: 10) -o OUTDIR, --outdir OUTDIR output folder -p PREFIX, --prefix PREFIX @@ -35,16 +34,21 @@ options: total quota of the mount eg. 200 TB for /data/CCBR -z, --duplicatesonly, --no-duplicatesonly Print only duplicates to per user output file. + -k, --kronaplot, --no-kronaplot + Make kronaplots for duplicates.(ktImportText must be in PATH!) + -v, --version show program's version number and exit Version: - v0.5 + v0.10.2-dev Example: - > spacesavers2_mimeo -f /output/from/spacesavers2_catalog -o /path/to/output/folder -d 7 -q 10 + > spacesavers2_mimeo -f /output/from/spacesavers2_catalog -o /path/to/output/folder -d 7 -q 10 -k ``` ### Outputs -After completion of run, `spacesavers2_mimeo` creates `.files.gz` (list of duplicate files) and `.summary.txt` (overall stats at various depths) files in the provided output folder. Here are the details: +After completion of run, `spacesavers2_mimeo` creates `*.mimeo.files.gz` (list of files per user + one "allusers" file) and `.summary.txt` (overall stats at various depths) files in the provided output folder. if `-k` is provided (and ktImportText from [kronatools](https://github.com/marbl/Krona/wiki/KronaTools) is in PATH) then krona specific TSV and HTML pages are also generated. It also generates a `blamematrix.tsv` file with folders on rows and users on columns with duplicate bytes per-folder-per-user. This file can be used to create a "heatmap" to pinpoint folder with highest duplicates overall as well as on a per-user basis. + +Here are the details: #### Duplicates @@ -54,7 +58,7 @@ After completion of run, `spacesavers2_mimeo` creates `.files.gz` (list of dupli - Check if each bin has unique sized files. If a bin has more than 1 size, then it needs to be binned further. Sometimes, xxHash of top and bottom chunks also gives the same combination of hash for differing files. These files will have different sizes. Hence, re-bin them accordingly. - If same size, then check inodes. If all files in the same bin have the same inode, then these are just hard-links. But, if there are multiple inodes, then we have **duplicates**! - If we have duplicates, then `spacesavers2_mimeo` keeps track of number of duplicates per bin. Number of duplicates is equal to number of inodes in each bin minus one. -- If we have duplicates, then the oldest find is identified and considered to be the original file. All other files are marked _duplicate_, irrespective of user id. 
+- If we have duplicates, then the oldest file is identified and considered to be the original file. All other files are marked _duplicate_, irrespective of user id. - duplicate files are reported in gzip format with the following columns for all users and per-user basis Here is what the `.files.gz` file columns (space-separated) represent: @@ -63,17 +67,19 @@ Here is what the `.files.gz` file columns (space-separated) represent: | ------ | ------------------------------------------------ | | 1 | top chunk and bottom chunk hashes separated by "#" | | 2 | separator ":" | -| 3 | Number of duplicates | +| 3 | Number of duplicates files (not duplicate inodes) | | 4 | Size of each file | | 5 | List of users duplicates serapated by "##" | -Each file in the last column above is ":" separated with the same 13 items as described in the `ls_out` file. The only difference is that the user id and group id are now replaced by user name and group name. +> NOTE: Number of dupicate files can be greater than number of duplicate inodes as each file can have multiple hard links already. Hence, while calculating total duplicate bytes we use (total_number_of_unique_inodes_per_group_of_duplicate_files - 1) X size_of_each_file. The "minus 1" is to not count the size of the original file. + +Each file in the last column above is ";" separated with the same 13 items as described in the `catalog` file. The only difference is that the username and groupame are now appended to each file entry. -Along with creating one `.files.gz` and `.summary.txt` file per user encountered, `spacesavers2_mimeo` also generates a `allusers.files.gz` file for all users combined. This file is later used by `spacesavers2_blamematrix` as input. +Along with creating one `.mimeo.files.gz` and `.mimeo.summary.txt` file per user encountered, `spacesavers2_mimeo` also generates a `allusers.mimeo.files.gz` file for all users combined. This file is later used by `spacesavers2_blamematrix` as input. #### Summaries -Summaries, files ending with `.summary.txt` are collected and reported for all users (`allusers.summary.txt`) and per-user (`USERNAME.summary.txt`) basis for user-defined depth (and beyond). The columns (tab-delimited) in the summary file: +Summaries, files ending with `.mimeo.summary.txt` are collected and reported for all users (`allusers.mimeo.summary.txt`) and per-user (`USERNAME.mimeo.summary.txt`) basis for user-defined depth (and beyond). The columns (tab-delimited) in the summary file: | Column | Description | | ------ | ------------------------------------- | @@ -93,3 +99,13 @@ Summaries, files ending with `.summary.txt` are collected and reported for all u For columns 10 through 13, the same logic is used as [spacesavers](https://ccbr.github.io/spacesavers/usage/df/). +#### KronaTSV and KronaHTML + +- KronaTSV is tab-delimited with first column showing the number of duplicate bytes and every subsequent column giving the folder depths. +- ktImportText is then used to convert the KronaTSV to KronaHTML which can be shared easily and only needs a HTML5 supporting browser for viewing. 
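+
+If `ktImportText` was not available in `PATH` during the run, only the TSV is written; the HTML can be regenerated later by hand with the same command that `spacesavers2_mimeo` runs internally (file names below are illustrative):
+
+```bash
+# on biowulf, kronatools can be loaded as a module first: module load kronatools
+ktImportText prefix.allusers.mimeo.krona.tsv -o prefix.allusers.mimeo.krona.html
+```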
+ +#### Blamematrix + +- rows are folders as 1 level deeper than the "mindepth" +- columns are all individual usernames, plus an "allusers" column +- only duplicate-bytes are reported \ No newline at end of file diff --git a/docs/usurp.md b/docs/usurp.md index 3580055..708c6ad 100644 --- a/docs/usurp.md +++ b/docs/usurp.md @@ -19,10 +19,10 @@ The GRUBBER file has the following columns: | ------ | ------------------------------------- | | 1 | combined hash | | 2 | number of duplicates found | -| 3 | total size of all duplicates | -| 4 | size of each duplicate | -| 5 | ";"-separated list of duplicates | -| 6 | duplicate files | +| 3 | total size of all duplicates (human readable) | +| 4 | size of each duplicate (human readable) | +| 5 | original file | +| 6 | ";"-separated list of duplicates files | ```bash usage: spacesavers2_usurp [-h] -g GRUBBER -x HASH [-f | --force | --no-force] diff --git a/mkdocs.yml b/mkdocs.yml index 3f8197e..3929983 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -102,6 +102,5 @@ nav: - catalog: catalog.md - mimeo: mimeo.md - grubbers: grubbers.md - - blamematrix: blamematrix.md - usurp: usurp.md - e2e: e2e.md diff --git a/spacesavers2_blamematrix b/spacesavers2_blamematrix deleted file mode 100755 index 20cf569..0000000 --- a/spacesavers2_blamematrix +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -import gzip -import textwrap -import time - -from src.VersionCheck import version_check -from src.VersionCheck import __version__ - -version_check() - -# from src.FileDetails import FileDetails -from src.dfUnit import fgzblamer - -# from src.Summary import Summary -from src.utils import * -from datetime import date - -import argparse - - -def main(): - start = time.time() - scriptname = os.path.basename(__file__) - elog = textwrap.dedent( - """\ - Version: - {} - Example: - > spacesavers2_blamematrix -f /output/from/spacesavers2_finddup/prefix.allusers.files.gz -d 3 - """.format( - __version__ - ) - ) - parser = argparse.ArgumentParser( - description="spacesavers2_blamematrix: get per user duplicate sizes at a given folder level (default 3)", - epilog=elog, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - - parser.add_argument( - "-f", - "--filesgz", - dest="filesgz", - required=True, - type=str, - default=sys.stdin, - help="spacesavers2_mimeo prefix.allusers.files.gz file", - ) - parser.add_argument( - "-l", - "--level", - dest="level", - required=False, - type=int, - default=3, - help="folder level to use for creating matrix", - ) - parser.add_argument("-v", "--version", action="version", version=__version__) - - print_with_timestamp( - start=start, scriptname=scriptname, string="version: {}".format(__version__) - ) - - global args - args = parser.parse_args() - - blamematrix = dict() - blamematrix["allusers"] = dict() - with gzip.open(os.path.join(args.filesgz), "rt") as filesgz: - for l in filesgz: - dfu = fgzblamer() - properly_set = dfu.set(l, args.level) - if not properly_set: - continue - for user in dfu.users: - if not user in blamematrix: - blamematrix[user] = dict() - for folder in dfu.bm[user]: - if not folder in blamematrix[user]: - blamematrix[user][folder] = 0 - if not folder in blamematrix["allusers"]: - blamematrix["allusers"][folder] = 0 - blamematrix[user][folder] += dfu.bm[user][folder] - blamematrix["allusers"][folder] += dfu.bm[user][folder] - - users = list(blamematrix.keys()) - folders = list(blamematrix["allusers"].keys()) - users2 = ["folder"] - users2.extend(users) - print("\t".join(users2)) - for folder 
in folders: - print(folder, end="") - for user in users: - try: - hrsize = get_human_readable_size(blamematrix[user][folder]) - except KeyError: - hrsize = "0" - print("\t{}".format(hrsize), end="") - print("") - print_with_timestamp(start=start, scriptname=scriptname, string="Done!") - - -if __name__ == "__main__": - main() diff --git a/spacesavers2_catalog b/spacesavers2_catalog index 5c98749..30b0d35 100755 --- a/spacesavers2_catalog +++ b/spacesavers2_catalog @@ -17,19 +17,17 @@ from pathlib import Path def task(f): - if not os.path.isfile(f): - return "" - else: - fd = FileDetails() - fd.initialize( - f, - buffersize=args.buffersize, - thresholdsize=args.ignoreheadersize, - tb=args.buffersize, - sed=sed, - bottomhash=args.bottomhash, - ) - return "%s" % (fd) + fd = FileDetails() + fd.initialize( + f, + buffersize=args.buffersize, + thresholdsize=args.ignoreheadersize, + tb=args.buffersize, + sed=sed, + bottomhash=args.bottomhash, + st_block_byte_size=args.st_block_byte_size, + ) + return "%s" % (fd) def main(): @@ -63,7 +61,7 @@ def main(): required=False, type=int, default=4, - help="number of threads to be used", + help="number of threads to be used (default 4)", ) parser.add_argument( "-b", @@ -72,7 +70,7 @@ def main(): required=False, type=int, default=128 * 1024, - help="buffersize for xhash creation", + help="buffersize for xhash creation (default=128 * 1028 bytes)", ) parser.add_argument( "-i", @@ -81,16 +79,25 @@ def main(): required=False, type=int, default=1024 * 1024 * 1024, - help="this sized header of the file is ignored before extracting buffer of buffersize for xhash creation (only for special extensions files)", + help="this sized header of the file is ignored before extracting buffer of buffersize for xhash creation (only for special extensions files) default = 1024 * 1024 * 1024 bytes", ) parser.add_argument( - "-s", + "-x", "--se", dest="se", required=False, type=str, default="bam,bai,bigwig,bw,csi", - help="comma separated list of special extensions", + help="comma separated list of special extensions (default=bam,bai,bigwig,bw,csi)", + ) + parser.add_argument( + "-s", + "--st_block_byte_size", + dest="st_block_byte_size", + required=False, + default=512, + type=int, + help="st_block_byte_size on current filesystem (default 512)", ) parser.add_argument( "-o", @@ -120,7 +127,9 @@ def main(): folder = args.folder p = Path(folder) - files = p.glob("**/*") + files = [p] + files2 = p.glob("**/*") + files.extend(files2) if args.outfile: outfh = open(args.outfile, "w") diff --git a/spacesavers2_e2e b/spacesavers2_e2e index 5fe2a3e..57d2bd2 100755 --- a/spacesavers2_e2e +++ b/spacesavers2_e2e @@ -11,10 +11,12 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) ARGPARSE_DESCRIPTION="End-to-end run of spacesavers2" source ${SCRIPT_DIR}/resources/argparse.bash || exit 1 argparse "$@" < ${OUTFOLDER}/${outfile_catalog} 2> ${OUTFOLDER}/${outfile_catalog_err} +spacesavers2_catalog \ + --folder $FOLDER \ + --threads $THREADS \ + --outfile ${outfile_catalog} \ + --bottomhash \ + > ${outfile_catalog_log} 2> ${outfile_catalog_err} fi + +sleep 60 + +# spacesavers2_mimeo if [ "$?" 
== "0" ];then -echo "Running spacesavers2_mimeo" && \ -spacesavers2_mimeo -f ${OUTFOLDER}/${outfile_catalog} -o ${OUTFOLDER} -q $QUOTA -z -d 3 -p $prefix > ${OUTFOLDER}/${outfile_mimeo_log} 2> ${OUTFOLDER}/${outfile_mimeo_err} +echo "Running spacesavers2_mimeo" +command -V ktImportText 2>/dev/null || module load kronatools || (>&2 echo "module kronatools could not be loaded") +spacesavers2_mimeo \ + --catalog ${outfile_catalog} \ + --outdir ${OUTFOLDER} \ + --quota $QUOTA \ + --duplicatesonly \ + --maxdepth $MAXDEPTH \ + --p $prefix \ + --kronaplot \ + > ${outfile_mimeo_log} 2> ${outfile_mimeo_err} fi + +sleep 60 + +# spacesavers2_grubbers if [ "$?" == "0" ];then echo "Running spacesavers2_grubbers" && \ -for f in `ls ${OUTFOLDER}/${prefix}*files.gz`;do - outfile=`echo $f|sed "s/files.gz/grubbers.tsv/g"` - errfile=`echo $f|sed "s/files.gz/grubbers.err/g"` - spacesavers2_grubbers -f $f > $outfile 2> $errfile +for filegz in `ls ${OUTFOLDER}/${prefix}*files.gz`;do + outfile=`echo $filegz|sed "s/mimeo.files.gz/grubbers.tsv/g"` + logfile=`echo $filegz|sed "s/mimeo.files.gz/grubbers.log/g"` + errfile=`echo $filegz|sed "s/mimeo.files.gz/grubbers.err/g"` + spacesavers2_grubbers \ + --filesgz $filegz \ + --limit $LIMIT \ + --outfile $outfile \ + > $logfile 2> $errfile done fi -if [ "$?" == "0" ];then -echo "Running spacesavers2_blamematrix" && \ -spacesavers2_blamematrix -f ${OUTFOLDER}/${prefix}.allusers.files.gz > ${OUTFOLDER}/${prefix}.blamematrix.tsv 2> ${OUTFOLDER}/${prefix}.blamematrix.err -fi -echo "Done!" + +echo "Done!" \ No newline at end of file diff --git a/spacesavers2_grubbers b/spacesavers2_grubbers index 77a4709..adc2c59 100755 --- a/spacesavers2_grubbers +++ b/spacesavers2_grubbers @@ -46,16 +46,24 @@ def main(): required=True, type=str, default=sys.stdin, - help="spacesavers2_mimeo prefix..files.gz file", + help="spacesavers2_mimeo prefix..mimeo.files.gz file", ) parser.add_argument( "-l", "--limit", dest="limit", required=False, - type=int, + type=float, default=5, - help="stop showing duplicates with total size smaller then (5 default) GiB", + help="stop showing duplicates with total size smaller than (5 default) GiB. Set 0 for unlimited.", + ) + parser.add_argument( + "-o", + "--outfile", + dest="outfile", + required=False, + type=str, + help="output tab-delimited file (default STDOUT)", ) parser.add_argument("-v", "--version", action="version", version=__version__) print_with_timestamp( @@ -70,24 +78,33 @@ def main(): for l in filesgz: dfu = fgz() properly_set = dfu.set(l) - if not properly_set: + if not properly_set: # could not read line properly or there are no duplicates continue + if dfu.ndup == 0: continue # in case mimeo was run without -z dups.append(dfu) - dups.sort() + dups.sort() # look at __lt__ ... 
its sorting from highest to lowest totalsize saved = 0 top_limit = args.limit * 1024 * 1024 * 1024 # 5 GiB + if args.outfile: + of = open(args.outfile, "w") + else: + of = sys.stdout + for fgitem in dups: - saved += fgitem.totalsize - if fgitem.totalsize < top_limit: + if fgitem.totalsize <= top_limit: break - print(fgitem) + saved += fgitem.totalsize + of.write("%s\n"%(fgitem)) + + if args.outfile: + of.close() - saved = get_human_readable_size(saved) + hrsaved = get_human_readable_size(saved) print_with_timestamp( start=start, scriptname=scriptname, - string="Deleting top grubbers will save {}!".format(saved), + string="Deleting top grubbers will save {} [ {} Bytes ] !".format(hrsaved,saved), ) print_with_timestamp(start=start, scriptname=scriptname, string="Done!") diff --git a/spacesavers2_mimeo b/spacesavers2_mimeo index 9212e41..597f0d5 100755 --- a/spacesavers2_mimeo +++ b/spacesavers2_mimeo @@ -4,6 +4,8 @@ import os import gzip import textwrap import time +import shutil +import subprocess MINDEPTH = 3 QUOTA_TB = 20 @@ -16,12 +18,12 @@ version_check() from src.FileDetails import FileDetails from src.dfUnit import dfUnit from src.Summary import Summary +from src.Summary import pathlen from src.utils import * from datetime import date import argparse - def process_hh( uid, hashhash, @@ -30,57 +32,64 @@ def process_hh( maxdepth, uid2uname, gid2gname, - peruser_perfolder_summaries, + perfolder_summaries, + perfolder_dups, user_output, ): for h in hashhash.keys(): - split_required = False - hashhash[h].compute( - hashhashsplits, split_required + # if files have the same forward and reverse hashes but different sizes then + # hashes are split into multiple hashes with suffix + # being added to the bottom hash for each size + split_required = hashhash[h].compute( + hashhashsplits ) # compute if split is needed or if we have duplicates if split_required: - continue # split is required so move on to the next hash + continue # split is required so move on to the next hash as new hashes with have been created by compute and added to hashhashsplits ... deal with them there! + # get indexes to files in the flist that belong to user with uid + # if uid is zero, then get all file indexes uid_file_index = hashhash[h].get_user_file_index(uid) - if len(uid_file_index) == 0: + if len(uid_file_index) == 0: # user with uid has no files in this set continue - uid_dup_file_index = [] - if hashhash[h].ndup > 1: + oldest_index = hashhash[h].oldest_index + foldest = hashhash[h].flist[oldest_index] + user_owns_original = False + if foldest.uid == uid or 0 == uid : user_owns_original = True + uid_file_index = list(filter(lambda x:x!=oldest_index,uid_file_index)) # remove oldest if present in list + inodes_already_summerized = [foldest.inode] + if hashhash[h].ndup_files > 0: # we have duplicates for i in uid_file_index: f = hashhash[h].flist[i] + fpath = f.apath + parent = fpath.parent fpaths = f.get_paths(mindepth, maxdepth) - if ( - i == hashhash[h].oldest_index - ): # its the original file ... 
not a duplicate + if f.inode in inodes_already_summerized: # it is a hardlink for p in fpaths: - peruser_perfolder_summaries[p].non_dup_Bytes.append(f.size) - peruser_perfolder_summaries[p].non_dup_ages.append(f.mtime) + perfolder_summaries[p].ndup_files += 1 else: - uid_dup_file_index.append(i) + inodes_already_summerized.append(f.inode) + if not parent in perfolder_dups: + perfolder_dups[fpath.parent] = 0 + perfolder_dups[fpath.parent] += f.calculated_size for p in fpaths: - peruser_perfolder_summaries[p].dup_Bytes.append(f.size) - peruser_perfolder_summaries[p].dup_ages.append(f.mtime) - else: # ndup == 1 .. meaning there are no duplicates .. just one file - for i in uid_file_index: - f = hashhash[h].flist[i] - fpaths = f.get_paths(mindepth, maxdepth) + perfolder_summaries[p].ndup_files+=1 + perfolder_summaries[p].dup_Bytes.append(f.calculated_size) + perfolder_summaries[p].dup_ages.append(f.mtime) + else: # we only have 1 original file + if user_owns_original: + fpaths = foldest.get_paths(mindepth, maxdepth) for p in fpaths: - peruser_perfolder_summaries[p].non_dup_Bytes.append(f.size) - peruser_perfolder_summaries[p].non_dup_ages.append(f.mtime) - if args.duplicatesonly: - if len(uid_dup_file_index) > 0: - user_output.write( - "{}\n".format( - hashhash[h].str_with_name( - uid2uname, gid2gname, uid_dup_file_index - ) - ) - ) - else: - user_output.write( - "{}\n".format( - hashhash[h].str_with_name(uid2uname, gid2gname, uid_file_index) - ) + perfolder_summaries[p].nnondup_files += 1 + perfolder_summaries[p].non_dup_Bytes.append(foldest.calculated_size) + perfolder_summaries[p].non_dup_ages.append(foldest.mtime) + out_index = [] + out_index.append(oldest_index) + out_index.extend(uid_file_index) + if args.duplicatesonly and len(out_index)==1: continue + user_output.write( + "{}\n".format( + hashhash[h].str_with_name(uid2uname, gid2gname, out_index) ) + ) def main(): @@ -105,7 +114,7 @@ def main(): parser.add_argument( "-f", "--catalog", - dest="lsout", + dest="catalog", required=True, type=str, default=sys.stdin, @@ -119,7 +128,7 @@ def main(): required=False, type=int, default=10, - help="folder max. depth upto which reports are aggregated", + help="folder max. depth upto which reports are aggregated ... absolute path is used to calculate depth (Default: 10)", ) parser.add_argument( @@ -160,6 +169,16 @@ def main(): action=argparse.BooleanOptionalAction, help="Print only duplicates to per user output file.", ) + + parser.add_argument( + "-k", + "--kronaplot", + dest="kronaplot", + required=False, + action=argparse.BooleanOptionalAction, + help="Make kronaplots for duplicates.(ktImportText must be in PATH!)", + ) + parser.add_argument("-v", "--version", action="version", version=__version__) print_with_timestamp( @@ -170,31 +189,43 @@ def main(): args = parser.parse_args() quota = args.quota * 1024 * 1024 * 1024 * 1024 + if args.kronaplot: + ktImportText_in_path = False + if shutil.which("ktImportText") == None: + sys.stderr.write("ktImportText(from kronaTools) not found in PATH. 
kronaplots will not be generated.\n") + else: + ktImportText_in_path = True + uid2uname = dict() gid2gname = dict() hashhash = dict() - users = set() # list of all users found + users = set() # list of all uids found users.add(0) # 0 == all users - groups = set() # list of groups - paths = set() - path_lens = [] + groups = set() # list of gids + paths = set() # set of all paths possible + path_lens = set() # set of all path depths print_with_timestamp( start=start, scriptname=scriptname, string="Reading in catalog file..." ) set_complete = True - with open(args.lsout) as lsout: - for l in lsout: + folder_info = dict() + with open(args.catalog) as catalog: + for l in catalog: fd = FileDetails() set_complete = fd.set(l) if not set_complete: continue - if fd.issyml: + if fd.fld != "d" and fd.fld !="f": # not a file or folder continue # ignore all symlinks users.add(fd.uid) groups.add(fd.gid) - path_lens.append(get_file_depth(fd.apath)) + path_lens.add(fd.get_depth()) for p in fd.get_paths_at_all_depths(): paths.add(p) + if fd.fld == "d": + if not fd.apath in folder_info: + folder_info[fd.apath] = fd + continue hash = fd.xhash_top + "#" + fd.xhash_bottom if hash == "#": # happens when file cannot be read sys.stderr.write( @@ -245,8 +276,13 @@ def main(): scriptname=scriptname, string="Total Number of users: %d" % len(users), ) - + blamematrixtsv = os.path.join( + os.path.abspath(args.outdir), args.prefix + "." + "blamematrix.tsv" + ) + blamematrix = dict() + all_blamematrix_paths = set() for uid in users: + blamematrix[uid] = dict() print_with_timestamp( start=start, scriptname=scriptname, @@ -258,11 +294,19 @@ def main(): outfilenameprefix = get_username_groupname(uid) summaryfilepath = os.path.join( - os.path.abspath(args.outdir), outfilenameprefix + ".summary.txt" + os.path.abspath(args.outdir), outfilenameprefix + ".mimeo.summary.txt" ) useroutputpath = os.path.join( - os.path.abspath(args.outdir), outfilenameprefix + ".files.gz" + os.path.abspath(args.outdir), outfilenameprefix + ".mimeo.files.gz" ) + if args.kronaplot: + kronatsv = os.path.join( + os.path.abspath(args.outdir), outfilenameprefix + ".mimeo.krona.tsv" + ) + if ktImportText_in_path: + kronahtml = os.path.join( + os.path.abspath(args.outdir), outfilenameprefix + ".mimeo.krona.html" + ) with open(summaryfilepath, "w") as user_summary: user_summary.write("%s\n" % (Summary.HEADER)) @@ -270,12 +314,23 @@ def main(): with gzip.open(useroutputpath, "wt") as user_output, open( summaryfilepath, "a" ) as user_summary: - peruser_perfolder_summaries = dict() + perfolder_summaries = dict() + perfolder_dups = dict() for p in paths: - peruser_perfolder_summaries[p] = Summary(p) - hashhashsplits = ( - dict() - ) # dict to collect instances where the files are NOT duplicates has same hashes but different sizes (and different inodes) ... new suffix is added to bottomhash .."_iterator" + perfolder_summaries[p] = Summary(p) + if not p in folder_info: + folder_info[p] = FileDetails() + folder_info[p].initialize(p) + fd = folder_info[p] + for p2 in fd.get_paths(mindepth,maxdepth): + if not p2 in folder_info: + folder_info[p2] = FileDetails() + folder_info[p2].initialize(p2) + fd2 = folder_info[p2] + if fd2.uid == uid or uid == 0: + perfolder_summaries[p2].folder_Bytes += fd.calculated_size + + hashhashsplits = dict() # dict to collect instances where the files are NOT duplicates has same hashes but different sizes (and different inodes) ... 
new suffix is added to bottomhash .."_iterator" process_hh( uid, hashhash, @@ -284,30 +339,81 @@ def main(): maxdepth, uid2uname, gid2gname, - peruser_perfolder_summaries, - user_output, - ) - hashhashsplitsdummy = dict() - process_hh( - uid, - hashhashsplits, - hashhashsplitsdummy, - mindepth, - maxdepth, - uid2uname, - gid2gname, - peruser_perfolder_summaries, + perfolder_summaries, + perfolder_dups, user_output, ) + if len(hashhashsplits) != 0: + hashhashsplitsdummy = dict() + process_hh( + uid, + hashhashsplits, + hashhashsplitsdummy, + mindepth, + maxdepth, + uid2uname, + gid2gname, + perfolder_summaries, + perfolder_dups, + user_output, + ) + del hashhashsplitsdummy + del hashhashsplits for p in paths: - peruser_perfolder_summaries[p].update_scores(quota) - user_summary.write(f"{peruser_perfolder_summaries[p]}\n") + perfolder_summaries[p].update_scores(quota) + user_summary.write(f"{perfolder_summaries[p]}\n") + for p in perfolder_summaries: + dummy = FileDetails() + dummy.initialize(p) + if dummy.get_depth() == mindepth + 1: + all_blamematrix_paths.add(p) + blamematrix[uid][p] = sum(perfolder_summaries[p].dup_Bytes) + + if args.kronaplot: + print_with_timestamp( + start=start, + scriptname=scriptname, + string="Creating Kronachart for user: %s" % (uid2uname[uid]), + ) + with open(kronatsv,'w') as ktsv: + for p in perfolder_dups: + path = str(p) + path = path.replace('/','\t') + path = path.replace('\t\t','\t') + if perfolder_dups[p] != 0: + ktsv.write("%d\t%s\n"%(perfolder_dups[p],path)) + if ktImportText_in_path: + cmd = "ktImportText %s -o %s"%(kronatsv,kronahtml) + srun = subprocess.run(cmd,shell=True, capture_output=True, text=True) + if srun.returncode !=0: + sys.stderr.write("%s\n"%(srun.stderr)) del hashhash - del hashhashsplits - del hashhashsplitsdummy - print_with_timestamp(start=start, scriptname=scriptname, string="Finished!") + print_with_timestamp( + start=start, + scriptname=scriptname, + string="Creating Blamematrix", + ) + with open(blamematrixtsv,'w') as btsv: + outlist = ["path"] + uids = list(blamematrix.keys()) + uids.sort() + for uid in uids: + outlist.append(uid2uname[uid]) + btsv.write("\t".join(outlist)+"\n") + for p in all_blamematrix_paths: + outlist = [str(p)] + s = 0 + for uid in uids: + if p in blamematrix[uid]: + s += blamematrix[uid][p] + outlist.append(str(blamematrix[uid][p])) + else: + outlist.append(str(0)) + if s != 0 : btsv.write("\t".join(outlist)+"\n") + + print_with_timestamp(start=start, scriptname=scriptname, string="Finished!") if __name__ == "__main__": main() diff --git a/spacesavers2_usurp b/spacesavers2_usurp index baeded3..2c79c9a 100755 --- a/spacesavers2_usurp +++ b/spacesavers2_usurp @@ -11,10 +11,6 @@ from src.VersionCheck import __version__ version_check() -# from src.FileDetails import FileDetails -from src.dfUnit import fgzblamer - -# from src.Summary import Summary from src.utils import * from datetime import date @@ -70,9 +66,6 @@ def main(): global args args = parser.parse_args() - # if args.version: - # version_print() - print_with_timestamp( start=start, scriptname=scriptname, string="version: {}".format(__version__) ) @@ -83,18 +76,30 @@ def main(): l = l.strip().split("\t") lhash = l[0] if args.hash in lhash: - dupfiles = l[4].split(";") - original_copy = Path(dupfiles.pop(0).strip('"')) + original_copy = Path(l[4].strip('"')) + dupfiles = l[5].split("##") print_with_timestamp( start=start, scriptname=scriptname, string="Original copy: {}".format(original_copy), ) + if not os.access(original_copy, os.R_OK): + 
print_with_timestamp( + start=start, + scriptname=scriptname, + string="Original copy is not readable. Hardlinks cannot be created!", + ) + exit(1) + inode_set = set() for dup in dupfiles: dup = Path(dup.strip('"')) duptmp = Path(str(dup) + "." + str(uuid.uuid4())) st = os.stat(dup) - total_saved += st.st_size + fsize = st.st_size + finode = st.st_ino + if not finode in inode_set: + inode_set.add(finode) + total_saved += fsize print_with_timestamp( start=start, scriptname=scriptname, @@ -110,20 +115,21 @@ def main(): os.remove(dup) os.rename(duptmp, dup) except OSError: - print_with_timestamp( - start=start, - scriptname=scriptname, - string="OSError occurred while creating hard-link. Probably trying to create a cross-device hard-link", - ) if args.force: print_with_timestamp( start=start, scriptname=scriptname, - string="Creating sym-link instead!", + string="Creating symlink file: {}".format(dup), ) os.remove(dup) os.symlink(original_copy, dup) - break + else: + print_with_timestamp( + start=start, + scriptname=scriptname, + string="OSError occurred while creating hard-link. Probably trying to create a cross-device hard-link. Try using --force to create symlink instead.", + ) + # break total_saved_human_readable = get_human_readable_size(total_saved) print_with_timestamp( start=start, diff --git a/src/FileDetails.py b/src/FileDetails.py index 90d786d..ce7a4db 100644 --- a/src/FileDetails.py +++ b/src/FileDetails.py @@ -9,8 +9,8 @@ except ImportError: exit(f"{sys.argv[0]} requires xxhash module") -THRESHOLDSIZE = 1024 * 1024 * 1024 -BUFFERSIZE = 128 * 1024 +THRESHOLDSIZE = 1024 * 1024 * 1024 # 1 MiB +BUFFERSIZE = 128 * 1024 # 128 KiB TB = THRESHOLDSIZE+BUFFERSIZE SEED = 20230502 MINDEPTH = 3 @@ -21,14 +21,33 @@ SED[se]=1 def convert_time_to_age(t): - currenttime=int(time.time()) - return int((currenttime - t)/86400)+1 + currenttime = int(time.time()) + age = int((currenttime - t)/86400)+1 + if age < 0: age = 0 + return age + +def get_type(p): + x = "u" # unknown + if not p.exists(): + x = "a" # absent + return x + if p.is_symlink(): + x = "l" # link or symlink + return x + if p.is_dir(): + x = "d" # directory + return x + if p.is_file(): + x = "f" # file + return x + return x class FileDetails: def __init__(self): self.apath = "" # absolute path of file - self.issyml = False + self.fdl = "u" # is it file or directory or link or unknown or absent ... 
values are f d l u a self.size = -1 + self.calculated_size = -1 self.dev = -1 self.inode = -1 self.nlink = -1 @@ -40,12 +59,13 @@ def __init__(self): self.xhash_top = "" self.xhash_bottom = "" - def initialize(self,f,thresholdsize=THRESHOLDSIZE, buffersize=BUFFERSIZE, tb=TB, sed=SED, bottomhash=False): + def initialize(self,f,thresholdsize=THRESHOLDSIZE, buffersize=BUFFERSIZE, tb=TB, sed=SED, bottomhash=False,st_block_byte_size=512): self.apath = Path(f).absolute() # path is of type PosixPath ext = self.apath.suffix - self.issyml = self.apath.is_symlink() # is a symbolic link - st = os.stat(self.apath) # gather all stats + self.fld = get_type(self.apath) # get if it is a file or directory or link or unknown or absent + st = self.apath.stat(follow_symlinks=False) # gather stat results self.size = st.st_size # size in bytes + self.calculated_size = st.st_blocks * st_block_byte_size # st_blocks gives number of 512 bytes blocks used self.dev = st.st_dev # Device id self.inode = st.st_ino # Inode self.nlink = st.st_nlink # number of hardlinks @@ -54,39 +74,40 @@ def initialize(self,f,thresholdsize=THRESHOLDSIZE, buffersize=BUFFERSIZE, tb=TB, self.ctime = convert_time_to_age(st.st_ctime) # creation time self.uid = st.st_uid # user id self.gid = st.st_gid # group id - try: - with open(self.apath,'rb') as fh: - if ext in sed: - if self.size > tb: - data = fh.read(thresholdsize) - data = fh.read(buffersize) - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - if bottomhash: - fh.seek(-1 * buffersize,2) + if self.fld == "f": + try: + with open(self.apath,'rb') as fh: + if ext in sed: + if self.size > tb: + data = fh.read(thresholdsize) data = fh.read(buffersize) - self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() + if bottomhash: + fh.seek(-1 * buffersize,2) + data = fh.read(buffersize) + self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + else: + self.xhash_bottom = self.xhash_top else: + data = fh.read() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() self.xhash_bottom = self.xhash_top else: - data = fh.read() - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - self.xhash_bottom = self.xhash_top - else: - if self.size > buffersize: - data = fh.read(buffersize) - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - if bottomhash: - fh.seek(-1 * buffersize,2) + if self.size > buffersize: data = fh.read(buffersize) - self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() + if bottomhash: + fh.seek(-1 * buffersize,2) + data = fh.read(buffersize) + self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + else: + self.xhash_bottom = self.xhash_top else: + data = fh.read() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() self.xhash_bottom = self.xhash_top - else: - data = fh.read() - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - self.xhash_bottom = self.xhash_top - except: - sys.stderr.write("spacesavers2:{}:File cannot be read:{}\n".format(self.__class__.__name__,str(self.apath))) + except: + sys.stderr.write("spacesavers2:{}:File cannot be read:{}\n".format(self.__class__.__name__,str(self.apath))) def set(self,ls_line): original_ls_line=ls_line @@ -105,9 +126,9 @@ def set(self,ls_line): self.nlink = int(ls_line.pop(-1)) self.inode = int(ls_line.pop(-1)) self.dev = int(ls_line.pop(-1)) + self.calculated_size = int(ls_line.pop(-1)) self.size = int(ls_line.pop(-1)) - issyml = 
ls_line.pop(-1) - self.issyml = issyml == 'True' + self.fld = ls_line.pop(-1) self.apath = Path(";".join(ls_line)) # sometimes filename have ";" in them ... hence this! return True except: @@ -115,12 +136,13 @@ def set(self,ls_line): # exit() return False - def str_with_name(self,uid2uname,gid2gname):# method for printing output in finddup ... replace "xhash_top;xhash_bottom" with "username;groupname" at the end of the string + def str_with_name(self,uid2uname,gid2gname):# method for printing output in mimeo ... replace "xhash_top;xhash_bottom" with "username;groupname" at the end of the string # return_str = "\"%s\";"%(self.apath) # path may have newline char which should not be interpretted as new line char return_str = "\"%s\";"%(str(self.apath).encode('unicode_escape').decode('utf-8')) - # return_str += "%s;"%(self.issyml) + return_str += "%s;"%(self.fld) return_str += "%d;"%(self.size) + return_str += "%d;"%(self.calculated_size) return_str += "%d;"%(self.dev) return_str += "%d;"%(self.inode) return_str += "%d;"%(self.nlink) @@ -137,9 +159,10 @@ def __str__(self): # return_str = "\"%s\";"%(self.apath) # path may have newline char which should not be interpretted as new line char return_str = "\"%s\";"%(str(self.apath).encode('unicode_escape').decode('utf-8')) - return_str += "%s;"%(self.issyml) + return_str += "%s;"%(self.fld) return_str += "%d;"%(self.size) - return_str += "%d;"%(self.dev) + return_str += "%d;"%(self.calculated_size) + return_str += "%d;"%(self.dev) # device id return_str += "%d;"%(self.inode) return_str += "%d;"%(self.nlink) return_str += "%d;"%(self.atime) @@ -151,11 +174,27 @@ def __str__(self): return_str += "%s;"%(self.xhash_bottom) return return_str - def get_paths_at_all_depths(self): # for files - return self.apath.parents[:-1] + def get_paths_at_all_depths(self): # for files and folders + p = self.apath + paths = [] + if self.fld == "d": + paths.append(p) + paths.extend(p.parents[:-1]) # remove the last one ... 
which will be '/' + return paths def get_paths(self,mindepth,maxdepth): - parents = list(self.apath.parents[0:-1]) - parents = list(filter(lambda x:get_folder_depth(x) <= maxdepth,parents)) - parents = list(filter(lambda x:get_folder_depth(x) >= mindepth,parents)) - return parents + paths = self.get_paths_at_all_depths() + paths = list(filter(lambda x:get_folder_depth(x) <= maxdepth,paths)) + paths = list(filter(lambda x:get_folder_depth(x) >= mindepth,paths)) + return paths + + def get_depth(self): + p = self.apath + try: + if p.is_dir(): # folder + return len(list(p.parents)) + else: # file + return len(list(p.parents)) - 1 + except: + print('get_file_depth error for file:"{}", type:{}'.format(path, type(path))) + exit() \ No newline at end of file diff --git a/src/Summary.py b/src/Summary.py index e055f0a..28435fa 100644 --- a/src/Summary.py +++ b/src/Summary.py @@ -29,8 +29,11 @@ class Summary: def __init__(self,path): self.path = path + self.nnondup_files = 0 + self.ndup_files = 0 self.non_dup_Bytes = [] self.dup_Bytes = [] + self.folder_Bytes = 0 self.non_dup_ages = [] self.dup_ages = [] self.non_dup_age_scores = [] @@ -69,7 +72,7 @@ def print_header(self): def __str__(self): dup_Bytes = sum(self.dup_Bytes) - tot_Bytes = sum(self.non_dup_Bytes) + dup_Bytes + tot_Bytes = sum(self.non_dup_Bytes) + dup_Bytes + self.folder_Bytes try: dup_mean_age = sum(self.dup_ages)/len(self.dup_ages) except ZeroDivisionError: @@ -78,8 +81,10 @@ def __str__(self): tot_mean_age = (sum(self.dup_ages) + sum(self.non_dup_ages))/(len(self.dup_ages)+len(self.non_dup_ages)) except ZeroDivisionError: tot_mean_age = 0 - dup_files = len(self.dup_Bytes) - tot_files = dup_files + len(self.non_dup_Bytes) + # dup_files = len(self.dup_Bytes) + # tot_files = dup_files + len(self.non_dup_Bytes) + dup_files = self.ndup_files + tot_files = self.nnondup_files + dup_files return_str = str(self.path)+"\t" return_str += "%d\t"%(tot_Bytes) return_str += "%d\t"%(dup_Bytes) @@ -101,3 +106,12 @@ def __str__(self): return_str += "%d"%(self.OverallScore) return return_str +class pathlen: + def __init__(self,p,dupbytes): + self.path=p + self.len=len(p.split("/")) + self.dupbytes=dupbytes + + def __str__(self): + returnstr="%s"%(self.path) + return returnstr diff --git a/src/VERSION b/src/VERSION index 5eef0f1..d9df1bb 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -0.10.2 +0.11.0 diff --git a/src/dfUnit.py b/src/dfUnit.py index 6100e1f..73c9e1b 100644 --- a/src/dfUnit.py +++ b/src/dfUnit.py @@ -3,7 +3,7 @@ def get_filename_from_fgzlistitem(string): string = string.strip().split(";")[:-1] - for i in range(9): + for i in range(11): dummy = string.pop(-1) filename = ";".join(string) return filename @@ -12,23 +12,29 @@ def get_filename_from_fgzlistitem(string): class dfUnit: def __init__(self,hash): self.hash = hash # typically hash_top + "#" + hash_bottom - self.flist = [] # list of _ls files with the same hash - self.fsize = -1 # size of each file + self.flist = [] # list of catalog files with the same hash + self.fsize = -1 # calculated size of each file self.ndup = -1 # files in flist with same size, but different inode (they already have the same hash) + self.ndup_files = -1 # number of duplicate files ... used for counting duplicate files + self.ndup_inode = -1 # number of duplicate inodes ... used for counting duplicate bytes self.size_set = set() # set of unique sizes ... 
if len(size_set) then split is required + self.calculated_size_list = [] self.uid_list = [] # list of uids of files added self.inode_list = [] # list of inodes of files added - self.oldest_inode = -1 # oldest_ ... is for the file which is NOT the duplicate + self.oldest_inode = -1 # oldest_ ... is for the file which is NOT the duplicate or is the original self.oldest_index = -1 self.oldest_age = -1 self.oldest_uid = -1 - + def nfiles_with_hash(self): # return number of files in this hash (total ... all users included) + return len(self.flist) + def add_fd(self,fd): # add the file to flist self.flist.append(fd) # add size if not already present self.size_set.add(fd.size) + self.calculated_size_list.append(fd.calculated_size) # add uid self.uid_list.append(fd.uid) # add inode @@ -44,7 +50,14 @@ def filter_flist_by_uid(self,uid): for i,f in enumerate(self.flist): if f.uid == uid : self.keep.append(i) - def compute(self,hashhashsplits,split_required): # 1. move oldest to the first position 2. find ndup 3. find size 4. filter by uid 5. get depth folder + def compute(self,hashhashsplits): + # find if files have the same hashes, but different sizes then... + # 1. split them into different hashes by size and + # 2. append them to hashhashsplits + # else ... aka .. .no spliting is required + # 1. count number of duplicate inodes and + # 2. size of each file + split_required = False # check if spliting is required if len(self.size_set) > 1: # more than 1 size in this hash split_required = True @@ -57,13 +70,16 @@ def compute(self,hashhashsplits,split_required): # 1. move oldest to the first p if fd.size == size: hashhashsplits[newhash].add_fd(fd) else: # there only 1 size ... no splits required - self.ndup = len(self.inode_list) - 1 #ndup is zero if same size and only 1 inode - self.fsize = self.flist[0].size + self.ndup = len(self.inode_list) - 1 #ndup is zero if same len(size_set)==1 and len(inode_list)==1 + self.ndup_inode = len(set(self.inode_list)) - 1 + self.ndup_files = len(self.inode_list) - 1 + self.fsize = self.flist[0].calculated_size + return split_required def get_user_file_index(self,uid): uid_file_index = [] if not uid in self.uid_list: - if uid == 0: uid_file_index = list(range(0,len(self.flist))) + if uid == 0: uid_file_index = list(range(0,len(self.flist))) # uid == 0 is all users return uid_file_index else: for i,j in enumerate(self.flist): @@ -73,19 +89,20 @@ def get_user_file_index(self,uid): def __str__(self): - return "{0} : {1} {2} {3}".format(self.hash, self.ndup, self.fsize,"##".join(map(lambda x:str(x),self.flist))) + return "{0} : {1} {2} {3}".format(self.hash, self.ndup_inode, self.fsize,"##".join(map(lambda x:str(x),self.flist))) def str_with_name(self,uid2uname, gid2gname,findex): - return "{0} : {1} {2} {3}".format(self.hash, self.ndup, self.fsize,"##".join(map(lambda x:x.str_with_name(uid2uname,gid2gname),[self.flist[i] for i in findex]))) + return "{0} : {1} {2} {3}".format(self.hash, self.ndup_inode, self.fsize,"##".join(map(lambda x:x.str_with_name(uid2uname,gid2gname),[self.flist[i] for i in findex]))) class fgz: # used by grubber def __init__(self): self.hash = "" - self.ndup = -1 + self.ndup = -1 # number of duplicate files and not duplicate inodes self.filesize = -1 self.totalsize = -1 - self.fds = [] + self.fds = [] # list of duplicate files + self.of = "" # original file def __lt__(self,other): return self.totalsize > other.totalsize @@ -94,9 +111,10 @@ def __str__(self): outstring=[] outstring.append(str(self.hash)) outstring.append(str(self.ndup)) - 
outstring.append(get_human_readable_size(self.totalsize)) - outstring.append(get_human_readable_size(self.filesize)) - outstring.append(";".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds))) + outstring.append(str(self.totalsize)) + outstring.append(str(self.filesize)) + outstring.append(get_filename_from_fgzlistitem(self.of)) + outstring.append("##".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds))) return "\t".join(outstring) # return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,get_human_readable_size(self.totalsize), get_human_readable_size(self.filesize), ";".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds))) # return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,self.totalsize, self.filesize, ";".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds))) @@ -107,100 +125,19 @@ def set(self,inputline): try: inputline = inputline.strip().split(" ") if len(inputline) < 5: - raise Exception("Less than 5 items in the line.") + raise Exception("Less than 5 items in mimeo.files.gz line.") self.hash = inputline.pop(0) - dummy = inputline.pop(0) + dummy = inputline.pop(0) # the colon total_ndup = int(inputline.pop(0)) - if total_ndup == 0: # may be finddup was run to output all files .. not just dups + if total_ndup == 0: # may be mimeo was run to output all files .. not just dups .. aka without the -z option return False self.filesize = int(inputline.pop(0)) - full_fds = " ".join(inputline) - fds = full_fds.split("##") - self.ndup = len(fds) # these are user number of duplicates/files - if self.ndup == (total_ndup + 1): # one file is the original ... other are all duplicates - dummy = fds.pop(0) - self.ndup -= 1 - self.fds = fds - self.totalsize = self.ndup * self.filesize - return True - except: - sys.stderr.write("spacesavers2:{0}:files.gz Do not understand line:{1} with {2} elements.\n".format(self.__class__.__name__,original_line,len(inputline))) - # exit() - return False - - -class FileDetails2: - def __init__(self): - self.apath = "" - self.size = -1 - self.dev = -1 - self.inode = -1 - self.nlink = -1 - self.mtime = -1 - self.uid = -1 - self.gid = -1 - self.uname = "" - self.gname = "" - - def set(self,fgzline): - original_fgzline=fgzline - # print(ls_line) - try: - fgzline = fgzline.strip().replace("\"","").split(";")[:-1] - if len(fgzline) < 10: - raise Exception("Less than 10 items in the line.") - self.gname = fgzline.pop(-1) - self.uname = fgzline.pop(-1) - self.gid = int(fgzline.pop(-1)) - self.uid = int(fgzline.pop(-1)) - self.mtime = int(fgzline.pop(-1)) - self.nlink = int(fgzline.pop(-1)) - self.inode = int(fgzline.pop(-1)) - self.dev = int(fgzline.pop(-1)) - self.size = int(fgzline.pop(-1)) - apath = ";".join(fgzline) - apath = apath.strip("\"") - self.apath = Path(apath) # sometimes filename have ";" in them ... hence this! 
- return True - except: - sys.stderr.write("spacesavers2:{0}:catalog Do not understand line:\"{1}\" with {2} elements.\n".format(self.__class__.__name__,original_fgzline,len(fgzline))) - # exit() - return False - -class fgzblamer: # used by blamematrix - def __init__(self): - self.hash = "" - self.ndup = -1 - self.users = set() - self.folders = set() - self.bm = dict() - self.fds = [] - - def set(self,inputline,depth): - original_line = inputline - try: - inputline = inputline.strip().split(" ") - if len(inputline) < 5: - raise Exception("Less than 5 items in the line.") - self.hash = inputline.pop(0) - dummy = inputline.pop(0) - self.ndup = int(inputline.pop(0)) - if self.ndup == 0 or self.ndup == 1: return False - self.filesize = int(inputline.pop(0)) - full_fds = " ".join(inputline) + full_fds = " ".join(inputline) # bcos file names can contain spaces fds = full_fds.split("##") - for f in fds: - fd = FileDetails2() - fd.set(f) - self.users.add(fd.uname) - fad=get_folder_at_depth(fd.apath,depth) - self.folders.add(fad) - if not fd.uname in self.bm: - self.bm[fd.uname] = dict() - if not fad in self.bm[fd.uname]: - self.bm[fd.uname][fad] = 0 - self.bm[fd.uname][fad] += self.filesize - self.fds = [] + self.ndup = total_ndup # these are user number of duplicates/files + self.of = fds.pop(0) # one file is the original + self.fds = fds # others are dupicates + self.totalsize = total_ndup * self.filesize return True except: sys.stderr.write("spacesavers2:{0}:files.gz Do not understand line:{1} with {2} elements.\n".format(self.__class__.__name__,original_line,len(inputline))) diff --git a/src/utils.py b/src/utils.py index 0c67e74..68c4b51 100644 --- a/src/utils.py +++ b/src/utils.py @@ -72,14 +72,6 @@ def get_folder_depth(path): return len(list(path.parents)) -def get_file_depth(path): - try: - return len(list(path.parents)) - 1 - except: - print('get_file_depth error for file:"{}", type:{}'.format(path, type(path))) - exit() - - def get_timestamp(start): e = time.time() return "%08.2fs" % (e - start)