From 58fdf250e831ced8479b60505a8525c4cdbdb4db Mon Sep 17 00:00:00 2001 From: Pollus Brodeur Date: Sun, 14 Apr 2024 10:47:21 -0400 Subject: [PATCH] Fix issue #11 --- PSOneTools/2.4/Find-PSOneDuplicateFile.ps1 | 72 +++++++-------- .../2.4/Find-PSOneDuplicateFileFast.ps1 | 92 +++++++++---------- 2 files changed, 82 insertions(+), 82 deletions(-) diff --git a/PSOneTools/2.4/Find-PSOneDuplicateFile.ps1 b/PSOneTools/2.4/Find-PSOneDuplicateFile.ps1 index 6eaa3ff..1104cc6 100644 --- a/PSOneTools/2.4/Find-PSOneDuplicateFile.ps1 +++ b/PSOneTools/2.4/Find-PSOneDuplicateFile.ps1 @@ -9,11 +9,11 @@ .EXAMPLE $Path = [Environment]::GetFolderPath('MyDocuments') - Find-PSOneDuplicateFile -Path $Path + Find-PSOneDuplicateFile -Path $Path Find duplicate files in the user documents folder .EXAMPLE - Find-PSOneDuplicateFile -Path c:\windows -Filter *.log + Find-PSOneDuplicateFile -Path c:\windows -Filter *.log find log files in the Windows folder with duplicate content .LINK @@ -27,27 +27,27 @@ [String] [Parameter(Mandatory)] $Path, - - # Filter to apply. Default is '*' (all Files) + + # Filter to apply. Default is '*' (all Files) [String] $Filter = '*' ) # get a hashtable of all files of size greater 0 # grouped by their length - - + + # ENUMERATE ALL FILES RECURSIVELY # call scriptblocks directly and pipe them together # this is by far the fastest way and much faster than # using Foreach-Object: - & { + & { try { # try and use the fast API way of enumerating files recursively # this FAILS whenever there is any "Access Denied" errors Write-Progress -Activity 'Acquiring Files' -Status 'Fast Method' - [IO.DirectoryInfo]::new($Path).GetFiles('*', 'AllDirectories') + [IO.DirectoryInfo]::new($Path).GetFiles($Filter, 'AllDirectories') } catch { @@ -55,7 +55,7 @@ Write-Progress -Activity 'Acquiring Files' -Status 'Falling Back to Slow Method' Get-ChildItem -Path $Path -File -Recurse -ErrorAction Ignore } - } | + } | # EXCLUDE EMPTY FILES: # use direct process blocks with IF (which is much faster than Where-Object): & { @@ -68,37 +68,37 @@ $_ } } - } | + } | # GROUP FILES BY LENGTH, AND RETURN ONLY FILES WHERE THERE IS AT LEAST ONE # OTHER FILE WITH SAME SIZE # use direct scriptblocks with own hashtable (which is much faster than Group-Object) - & { - begin + & { + begin # start with an empty hashtable - { $hash = @{} } + { $hash = @{} } - process - { + process + { # group files by their length # (use "length" as hashtable key) $file = $_ $key = $file.Length.toString() - + # if we see this key for the first time, create a generic # list to hold group items, and store FileInfo objects in this list # (specialized generic lists are faster than ArrayList): - if ($hash.ContainsKey($key) -eq $false) + if ($hash.ContainsKey($key) -eq $false) { $hash[$key] = [Collections.Generic.List[System.IO.FileInfo]]::new() } # add file to appropriate hashtable key: $hash[$key].Add($file) - } - - end - { + } + + end + { # return only the files from groups with at least two files - # (if there is only one file with a given length, then it + # (if there is only one file with a given length, then it # cannot have any duplicates for sure): foreach($pile in $hash.Values) { @@ -109,8 +109,8 @@ $pile } } - } - } | + } + } | # CALCULATE THE NUMBER OF FILES TO HASH # collect all files and hand over en-bloc & { @@ -119,58 +119,58 @@ # GROUP FILES BY HASH, AND RETURN ONLY HASHES THAT HAVE AT LEAST TWO FILES: # use a direct scriptblock call with a hashtable (much faster than Group-Object): & { - begin + begin { # start with an empty hashtable $hash = 
@{} - + # since this is a length procedure, a progress bar is in order # keep a counter of processed files: $c = 0 } - + process { $totalNumber = $_.Count foreach($file in $_) { - + # update progress bar $c++ - + # update progress bar every 20 files: if ($c % 20 -eq 0) { $percentComplete = $c * 100 / $totalNumber Write-Progress -Activity 'Hashing File Content' -Status $file.Name -PercentComplete $percentComplete } - + # use the file hash of this file PLUS file length as a key to the hashtable # use the fastest algorithm SHA1 $result = Get-FileHash -Path $file.FullName -Algorithm SHA1 $key = '{0}:{1}' -f $result.Hash, $file.Length - + # if we see this key the first time, add a generic list to this key: if ($hash.ContainsKey($key) -eq $false) { $hash.Add($key, [Collections.Generic.List[System.IO.FileInfo]]::new()) } - + # add the file to the approriate group: $hash[$key].Add($file) } } - + end { # remove all hashtable keys with only one file in them - + # first, CLONE the list of hashtable keys # (we cannot remove hashtable keys while enumerating the live # keys list): # remove keys $keys = @($hash.Keys).Clone() - + # enumerate all keys... foreach($key in $keys) { @@ -180,7 +180,7 @@ $hash.Remove($key) } } - + # return the hashtable with only duplicate files left: $hash } diff --git a/PSOneTools/2.4/Find-PSOneDuplicateFileFast.ps1 b/PSOneTools/2.4/Find-PSOneDuplicateFileFast.ps1 index 18cd0cb..86a9d49 100644 --- a/PSOneTools/2.4/Find-PSOneDuplicateFileFast.ps1 +++ b/PSOneTools/2.4/Find-PSOneDuplicateFileFast.ps1 @@ -13,11 +13,11 @@ .EXAMPLE $Path = [Environment]::GetFolderPath('MyDocuments') - Find-PSOneDuplicateFileFast -Path $Path + Find-PSOneDuplicateFileFast -Path $Path Find duplicate files in the user documents folder .EXAMPLE - Find-PSOneDuplicateFileFast -Path c:\windows -Filter *.log + Find-PSOneDuplicateFileFast -Path c:\windows -Filter *.log find log files in the Windows folder with duplicate content .LINK @@ -31,18 +31,18 @@ [String] [Parameter(Mandatory)] $Path, - - # Filter to apply. Default is '*' (all Files) + + # Filter to apply. Default is '*' (all Files) [String] $Filter = '*', - + # when there are multiple files with same partial hash # they may still be different. 
When setting this switch, # full hashes are calculated which may take a very long time # for large files and/or slow networks [switch] $TestPartialHash, - + # use partial hashes for files larger than this: [int64] $MaxFileSize = 100KB @@ -50,19 +50,19 @@ # get a hashtable of all files of size greater 0 # grouped by their length - - + + # ENUMERATE ALL FILES RECURSIVELY # call scriptblocks directly and pipe them together # this is by far the fastest way and much faster than # using Foreach-Object: - & { + & { try { # try and use the fast API way of enumerating files recursively # this FAILS whenever there is any "Access Denied" errors Write-Progress -Activity 'Acquiring Files' -Status 'Fast Method' - [IO.DirectoryInfo]::new($Path).GetFiles('*', 'AllDirectories') + [IO.DirectoryInfo]::new($Path).GetFiles($Filter, 'AllDirectories') } catch { @@ -70,7 +70,7 @@ Write-Progress -Activity 'Acquiring Files' -Status 'Falling Back to Slow Method' Get-ChildItem -Path $Path -File -Recurse -ErrorAction Ignore } - } | + } | # EXCLUDE EMPTY FILES: # use direct process blocks with IF (which is much faster than Where-Object): & { @@ -83,37 +83,37 @@ $_ } } - } | + } | # GROUP FILES BY LENGTH, AND RETURN ONLY FILES WHERE THERE IS AT LEAST ONE # OTHER FILE WITH SAME SIZE # use direct scriptblocks with own hashtable (which is much faster than Group-Object) - & { - begin + & { + begin # start with an empty hashtable - { $hash = @{} } + { $hash = @{} } - process - { + process + { # group files by their length # (use "length" as hashtable key) $file = $_ $key = $file.Length.toString() - + # if we see this key for the first time, create a generic # list to hold group items, and store FileInfo objects in this list # (specialized generic lists are faster than ArrayList): - if ($hash.ContainsKey($key) -eq $false) + if ($hash.ContainsKey($key) -eq $false) { $hash[$key] = [Collections.Generic.List[System.IO.FileInfo]]::new() } # add file to appropriate hashtable key: $hash[$key].Add($file) - } - - end - { + } + + end + { # return only the files from groups with at least two files - # (if there is only one file with a given length, then it + # (if there is only one file with a given length, then it # cannot have any duplicates for sure): foreach($pile in $hash.Values) { @@ -124,8 +124,8 @@ $pile } } - } - } | + } + } | # CALCULATE THE NUMBER OF FILES TO HASH # collect all files and hand over en-bloc & { @@ -134,37 +134,37 @@ # GROUP FILES BY HASH, AND RETURN ONLY HASHES THAT HAVE AT LEAST TWO FILES: # use a direct scriptblock call with a hashtable (much faster than Group-Object): & { - begin + begin { # start with an empty hashtable $hash = @{} - + # since this is a length procedure, a progress bar is in order # keep a counter of processed files: $c = 0 } - + process { $totalNumber = $_.Count foreach($file in $_) { - + # update progress bar $c++ - + # update progress bar every 20 files: if ($c % 20 -eq 0 -or $file.Length -gt 100MB) { $percentComplete = $c * 100 / $totalNumber Write-Progress -Activity 'Hashing File Content' -Status $file.Name -PercentComplete $percentComplete } - + # use the file hash of this file PLUS file length as a key to the hashtable # use the fastest algorithm SHA1, and use partial hashes for files larger than 100KB: $bufferSize = [Math]::Min(100KB, $MaxFileSize) $result = Get-PsOneFileHash -StartPosition 1KB -Length $MaxFileSize -BufferSize $bufferSize -AlgorithmName SHA1 -Path $file.FullName - + # add a "P" to partial hashes: if ($result.IsPartialHash) { $partialHash = 'P' @@ -173,27 +173,27 @@ 
{ $partialHash = '' } - - + + $key = '{0}:{1}{2}' -f $result.Hash, $file.Length, $partialHash - + # if we see this key the first time, add a generic list to this key: if ($hash.ContainsKey($key) -eq $false) { $hash.Add($key, [Collections.Generic.List[System.IO.FileInfo]]::new()) } - + # add the file to the approriate group: $hash[$key].Add($file) } } - + end { # remove all hashtable keys with only one file in them - - - + + + # do a detail check on partial hashes if ($TestPartialHash) { @@ -223,10 +223,10 @@ } } } - + # enumerate all keys... $keys = @($hash.Keys).Clone() - + foreach($key in $keys) { # ...if key has only one file, remove it: @@ -235,9 +235,9 @@ $hash.Remove($key) } } - - - + + + # return the hashtable with only duplicate files left: $hash }
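Note (illustration only, not part of the patch): both functions enumerate
candidate files with the fast [IO.DirectoryInfo]::GetFiles() call, which
previously hard-coded '*' as the search pattern, so -Filter was silently
ignored whenever that fast path succeeded. The sketch below reproduces just
that enumeration step with example values (the user's documents folder and a
hypothetical '*.log' filter, neither taken from the module) to show the
effect of passing $Filter through:

    $Path   = [Environment]::GetFolderPath('MyDocuments')
    $Filter = '*.log'

    # before the patch: the search pattern was hard-coded to '*',
    # so a -Filter argument never reached the .NET call
    $before = [IO.DirectoryInfo]::new($Path).GetFiles('*', 'AllDirectories')

    # after the patch: the user-supplied filter is passed through
    $after = [IO.DirectoryInfo]::new($Path).GetFiles($Filter, 'AllDirectories')

    '{0} files without the filter, {1} matching {2}' -f $before.Count, $after.Count, $Filter

GetFiles() still throws on the first "Access Denied" error, which is why both
functions keep their Get-ChildItem fallback unchanged.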