Find duplicate files: Difference between revisions

Content added Content deleted

Inline

Revision as of 23:59, 26 March 2016

In a large directory structure it is easy to inadvertently leave unnecessary copies of files around, which can use considerable disk space and create confusion. Create a program which, given a minimum size and a folder/directory, will find all files of at least size bytes with duplicate contents under the directory and output or show the sets of duplicate files in order of decreasing size.

The program may be command-line or graphical, and duplicate content may be determined by direct comparison or by calculating a hash of the data. Specify which filesystems or operating systems your program works with if it has any filesystem- or OS-specific requirements. Identify hard links (filenames referencing the same content) in the output if applicable for the filesystem. For extra points detect when whole directory sub-trees are identical, or optionally remove or link identical files.

Elixir

Translation of: Ruby

<lang elixir>defmodule Files do

 def find_duplicate_files(dir) do
   IO.puts "\nDirectory : #{dir}"
   File.cd!(dir, fn ->
     Enum.filter(File.ls!, fn fname -> File.regular?(fname) end)
     |> Enum.group_by(fn file -> File.stat!(file).size end)
     |> Enum.filter(fn {_, files} -> length(files)>1 end)
     |> Enum.each(fn {size, files} ->
          Enum.group_by(files, fn file -> :erlang.md5(File.read!(file)) end)
          |> Enum.filter(fn {_, files} -> length(files)>1 end)
          |> Enum.each(fn {_md5, fs} ->
               IO.puts "  --------------------------------------------"
               Enum.each(fs, fn file ->
                 IO.puts "  #{inspect File.stat!(file).mtime}\t#{size}  #{file}"
               end)
             end)
        end)
   end)
 end

end

hd(System.argv) |> Files.find_duplicate_files</lang>

Output:

C:\Elixir>elixir find_dup_file.exs \Windows\System32

Directory : \Windows\System32
  --------------------------------------------
  {{2009, 7, 14}, {1, 0, 32}}   31548  perfd009.dat
  {{2010, 11, 21}, {7, 14, 4}}  31548  perfd011.dat
  --------------------------------------------
  {{2015, 4, 29}, {18, 21, 50}} 5120  msdxm.ocx
  {{2015, 4, 29}, {18, 21, 50}} 5120  dxmasf.dll
  --------------------------------------------
  {{2010, 11, 21}, {3, 23, 55}} 91648  mapi32.dll
  {{2010, 11, 21}, {3, 23, 55}} 91648  mapistub.dll
  --------------------------------------------
  {{2014, 4, 11}, {13, 39, 56}} 18088  msvcp110_clr0400.dll
  {{2014, 4, 11}, {13, 39, 56}} 18088  msvcr100_clr0400.dll
  {{2014, 4, 11}, {13, 39, 56}} 18088  msvcr110_clr0400.dll

Haskell

- checks for wrong command line input (not existing directory / negative size)
- works on Windows as well as Unix Systems (tested with Mint 17 / Windows 7)

<lang Haskell> import Crypto.Hash.MD5 (hash) import Data.ByteString as BS (readFile, ByteString()) import System.Environment (getArgs, getProgName) import System.Directory (doesDirectoryExist, getDirectoryContents) import System.FilePath.Posix ((</>)) import Control.Monad (forM) import Text.Printf (printf) import System.IO (withFile, IOMode(ReadMode), hFileSize)

type File = (BS.ByteString, -- md5hash

            FilePath)      -- filepath

type FileSize = Integer

getRecursiveContents :: FilePath -> FileSize -> IO [File] getRecursiveContents curDir maxsize = do

 names <- getDirectoryContents curDir
 let dirs = filter (`notElem` [".", ".."]) names
 files <- forM dirs $ \path -> do
            let path' = curDir </> path
            exists <- doesDirectoryExist path'
            if exists
               then getRecursiveContents path' maxsize
               else genFileHash path' maxsize
 return $ concat files

genFileHash :: FilePath -> FileSize -> IO [File] genFileHash path maxsize = do

 size <- withFile path ReadMode hFileSize
 if size <= maxsize
   then BS.readFile path >>= \bs -> return [(hash bs, path)]
   else return []

findDuplicates :: FilePath -> FileSize -> IO () findDuplicates dir bytes = do

 exists <- doesDirectoryExist dir
 if exists
   then getRecursiveContents dir bytes >>= findSameHashes
   else printf "Sorry, the directory \"%s\" does not exist...\n" dir

findSameHashes :: [File] -> IO () findSameHashes [] = return () findSameHashes ((hash, fp):xs) = do

 case lookup hash xs of
   (Just dupFile) -> printf "===========================\n\
                           \Found duplicate:\n\
                           \=> %s \n\
                           \=> %s \n\n" fp dupFile
                     >> findSameHashes xs
   (_)            -> findSameHashes xs

main :: IO () main = do

 args <- getArgs
 case args of
   [dir, mbytes] | [(bytes ,"")] <- reads mbytes
                  , bytes >= 1 -> findDuplicates dir bytes
   (_) -> do
     name <- getProgName
     printf "Something went wrong - please use ./%s <dir> <bytes>\n" name

</lang>

Example output:

$./finddups ~/Documents/MyGit/Haskell/ 20000
===========================
Found duplicate:
=> /home/rewrite/Documents/MyGit/Haskell/.git/logs/HEAD 
=> /home/rewrite/Documents/MyGit/Haskell/.git/logs/refs/heads/master 

===========================
Found duplicate:
=> /home/rewrite/Documents/MyGit/Haskell/.git/refs/remotes/origin/master 
=> /home/rewrite/Documents/MyGit/Haskell/.git/refs/heads/master 

===========================
Found duplicate:
=> /home/rewrite/Documents/MyGit/Haskell/RosettaCode/Find-duplicate-files/sampletext.txt 
=> /home/rewrite/Documents/MyGit/Haskell/RosettaCode/otherdup.txt 

===========================
Found duplicate:
=> /home/rewrite/Documents/MyGit/Haskell/RWH/FileManipulation/toupper-imp.hs 
=> /home/rewrite/Documents/MyGit/Haskell/RWH/FileManipulation/toupper-imp.hs~ 


$./finddups /home/rewrite/NotExistingDir 200000
Sorry, the directory "/home/rewrite/NotExistingDir" does not exist...


$./finddups /home/rewrite/ -100
Something went wrong - please use ./finddups <dir> <bytes>

Racket

lang racket

(struct F (name id size [links #:mutable]))

(require openssl/sha1) (define (find-duplicate-files path size)

 (define Fs
   (sort
    (fold-files
     (λ(path type acc)
       (define s (and (eq? 'file type) (file-size path)))
       (define i (and s (<= size s) (file-or-directory-identity path)))
       (define ln (and i (findf (λ(x) (equal? i (F-id x))) acc)))
       (when ln (set-F-links! ln (cons (path->string path) (F-links ln))))
       (if (and i (not ln)) (cons (F path i s '()) acc) acc))
     '() path #f)
    > #:key F-size))
 (define (find-duplicates Fs)
   (define t (make-hash))
   (for ([F Fs])
     (define cksum (call-with-input-file (F-name F) sha1))
     (hash-set! t cksum (cons F (hash-ref t cksum '()))))
   (for/list ([(n Fs) (in-hash t)] #:unless (null? (cdr Fs))) Fs))
 (let loop ([Fs Fs])
   (if (null? Fs) '()
       (let-values ([(Fs Rs)
                     (splitf-at Fs (λ(F) (= (F-size F) (F-size (car Fs)))))])
         (append (find-duplicates Fs)
                 (loop Rs))))))

(define (show-duplicates path size)

 (for ([Fs (find-duplicate-files path size)])
   (define (links F)
     (if (null? (F-links F)) ""
         (format " also linked at ~a" (string-join (F-links F) ", "))))
   (printf "~a (~a)~a\n" (F-name (car Fs)) (F-size (car Fs)) (links (car Fs)))
   (for ([F (cdr Fs)]) (printf "  ~a~a\n" (F-name F) (links F)))))

(show-duplicates (find-system-path 'home-dir) 1024) </lang>

REXX

bare bones version

This REXX version works with DOS (with or without Microsoft Windows).
Note that the tFID (temp) file is hard coded to the C: drive.
Only minimal error checking is performed. <lang rexx>/*REXX program to reads a (DOS) directory and finds and displays files that identical.*/ sep=center(' files are identical in size and content: ',79,'═') /*define the header. */ tFID= 'c:\TEMP\FINDDUP.TMP' /*use this as a temporary FileID. */ arg maxSize aDir /*obtain optional arguments from the CL*/ if maxSize= | maxSize="," then maxSize=1000000 /*filesize limit (in bytes) [1 million]*/ aDir=strip(aDir) /*remove any leading or trailing blanks*/ if right(aDir,1)\=='\' then aDir=aDir"\" /*possibly add a trailing backslash [\]*/ "DIR" aDir '/a-d-s-h /oS /s | FIND "/" >' tFID /*the (DOS) DIR output ───► temp file. */ pFN= /*the previous filename and filesize. */ pSZ=; do j=0 while lines(tFID)\==0 /*process each of the files in the list*/

      aLine=linein(tFID)                        /*obtain (DOS) DIR's output about a FID*/
      parse var aLine . . sz fn                 /*obtain the filesize and its fileID.  */
      sz=space(translate(sz,,','),0)            /*elide any commas from the size number*/
      if sz>maxSize  then leave                 /*Is the file > maximum?  Ignore file. */
                                                /* [↓]  files identical?  (1st million)*/
      if sz==pSZ  then  if charin(aDir||pFN,1,sz)==charin(aDir||FN,1,sz)  then do
                                                                               say sep
                                                                               say pLine
                                                                               say aLine
                                                                               say
                                                                               end
      pSZ=sz;      pFN=FN;      pLine=aLine     /*remember the previous stuff for later*/
      end   /*j*/

if lines(tFID)\==0 then 'ERASE' tFID /*do housecleaning (delete temp file).*/

                                                /*stick a fork in it,  we're all done. */</lang>

output when using (checking) with the default root directory:

══════════════════ files are identical in size and content: ═══════════════════
04/13/2013  19:13                76 another.BK
04/13/2013  19:13                76 another.A

══════════════════ files are identical in size and content: ═══════════════════
04/13/2013  17:15               244 gettfid.1
04/13/2013  17:15               244 junk.1

══════════════════ files are identical in size and content: ═══════════════════
03/03/1995  01:46            10,897 $ERR.BK
03/03/1995  01:46            10,897 $ERR.ORI

with error checking

This version of the REXX program:

checks to see if running under the DOS environment
uses the TEMP folder for storing a temporary file
verifies that the maxSize is a positive integer
adjusts the name for a generic file specification
uses variables for some command names and command options
shows the number of files examined and also the directory name

<lang rexx>/*REXX program to reads a (DOS) directory and finds and displays files that identical.*/ sep=center(' files are identical in size and content: ',79,"═") /*define the header. */ parse arg !; if !all(arg()) then exit /*boilerplate HELP(?)*/ signal on halt; signal on novalue; signal on syntax /*handle exceptions, */

if \!dos then call err 'this program requires the DOS [environment].' call getTFID /*defines a temporary File ID for DOS.*/ arg maxSize aDir /*obtain optional arguments from the CL*/ if maxSize= | maxSize="," then maxSize=1000000 /*filesize limit (in bytes) [1 million]*/ if \isInt(maxSize) then call err "maxSize isn't an integer:" maxSize if maxSize<0 then call err "maxSize can't be negative:" maxSize if maxSize=0 then call err "maxSize can't be zero:" maxSize aDir=strip(aDir) /*remove any leading or trailing blanks*/ if right(aDir,3)=='*.*' then aDir=substr(aDir,1,length(aDir)-3) /*adjust the dir name.*/ if right(aDir,1)\=='\' then aDir=aDir"\" /*possibly add a trailing backslash [\]*/ @dir = 'DIR' /*literal for the (DOS) DIR command. */ @dirNots= '/a-d-s-h' /*ignore DIRs, SYSTEM, and HIDDEN files*/ @dirOpts= '/oS /s' /*sort DIR's (+ subdirs) files by size.*/ @filter = '| FIND "/"' /*the "lines" must have a slash [/]. */ @erase = 'ERASE' /*literal for the (DOS) ERASE command.*/ @dir aDir @dirNots @dirOpts @filter '>' tFID /*(DOS) DIR output ──► temporary file.*/ pFN= /*the previous filename and filesize. */ pSZ=; do j=0 while lines(tFID)\==0 /*process each of the files in the list*/

      aLine=linein(tFID)                        /*obtain (DOS) DIR's output about a FID*/
      parse var aLine . . sz fn                 /*obtain the filesize and its fileID.  */
      sz=space(translate(sz,,','),0)            /*elide any commas from the size number*/
      if sz>maxSize  then leave                 /*Is the file > maximum?  Ignore file. */
                                                /* [↓]  files identical?  (1st million)*/
      if sz==pSZ  then  if charin(aDir||pFN,1,sz)==charin(aDir||FN,1,sz)  then do
                                                                               say sep
                                                                               say pLine
                                                                               say aLine
                                                                               say
                                                                               end
      pSZ=sz;      pFN=FN;      pLine=aLine     /*remember the previous stuff for later*/
      end   /*j*/

say j 'file's(j) "examined in" aDir /*show information to the screen.*/ if lines(tFID)\==0 then 'ERASE' tFID /*do housecleaning (delete temp file).*/ exit /*stick a fork in it, we're all done. */ /*═════════════════════════════general 1─line subs══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════*/ !all: !!=!;!=space(!);upper !;call !fid;!nt=right(!var('OS'),2)=="NT";!cls=word('CLS VMFCLEAR CLRSCREEN',1+!cms+!tso*2);if arg(1)\==1 then return 0;if wordpos(!,"? ?SAMPLES ?AUTHOR ?FLOW")==0 then return 0;!call=']$H';call "$H" !fn !;!call=;return 1 !cal: if symbol('!CALL')\=="VAR" then !call=; return !call !env: !env='ENVIRONMENT'; if !sys=="MSDOS" | !brexx | !r4 | !roo then !env='SYSTEM'; if !os2 then !env="OS2"!env; !ebcdic=1=='f1'x; return !fid: parse upper source !sys !fun !fid . 1 . . !fn !ft !fm .; call !sys; if !dos then do; _=lastpos('\',!fn); !fm=left(!fn,_); !fn=substr(!fn,_+1); parse var !fn !fn "." !ft; end; return word(0 !fn !ft !fm, 1+('0'arg(1))) !rex: parse upper version !ver !vernum !verdate .; !brexx='BY'==!vernum; !kexx="KEXX"==!ver; !pcrexx='REXX/PERSONAL'==!ver | "REXX/PC"==!ver; !r4='REXX-R4'==!ver; !regina="REXX-REGINA"==left(!ver,11); !roo='REXX-ROO'==!ver; call !env; return !sys: !cms=!sys=='CMS'; !os2=!sys=="OS2"; !tso=!sys=='TSO' | !sys=="MVS"; !vse=!sys=='VSE'; !dos=pos("DOS",!sys)\==0|pos('WIN',!sys)\==0|!sys=="CMD"; call !rex; return !var: call !fid; if !kexx then return space(dosenv(arg(1))); return space(value(arg(1),,!env)) err: say; say; say center(' error! ', 60, "*"); say; do j=1 for arg(); say arg(j); say; end; say; exit 13 getdTFID: tfid=p(!var("TMP") !var('TEMP') homedrive()"\"); if substr(tfid,2,1)==':'&substr(tfid,3,1)\=="\" then tfid=insert('\',t,2); return strip(tfid,"T",'\')"\"arg(1)'.'arg(2) getTFID: if symbol('TFID')=="LIT" then tfid=; if tfid\== then return tfid; gfn=word(arg(1) !fn,1);gft=word(arg(2) "TMP",1); tfid='TEMP';if !tso then tfid=gfn"."gft;if !cms then tfid=gfn','gft",A4";if !dos then tfid=getdTFID(gfn,gft);return tfid halt: call err 'program has been halted.' homedrive: if symbol('HOMEDRIVE')\=="VAR" then homedrive=p(!var('HOMEDRIVE') "C:"); return homedrive isint: return datatype(arg(1),'W') novalue: syntax: call err 'REXX program' condition("C") 'error',condition("D"),'REXX source statement (line' sigl"):",sourceline(sigl) p: return word(arg(1),1) s: if arg(1)==1 then return arg(3); return word(arg(2) 's',1)</lang> output when using the DIR (folder): H:\#\REX

══════════════════ files are identical in size and content: ═══════════════════
05/11/2015  18:49               838 UPDATECF.BU
05/11/2015  18:49               838 UPDATECF.TXT

══════════════════ files are identical in size and content: ═══════════════════
03/23/2014  21:55             2,736 EMIRP.RX_
03/26/2014  10:44             2,736 EMIRP2.RX_

══════════════════ files are identical in size and content: ═══════════════════
05/30/2015  17:30             4,542 JUSTIFY.RX_
11/25/2013  06:33             4,542 JUSTIFY.KX_

══════════════════ files are identical in size and content: ═══════════════════
06/15/2014  23:36            13,935 $BLOCK.KX_
05/30/2015  17:28            13,935 $BLOCK.RX_

1568 files examined in H:\#\REX\

Ruby

It confirms once by the file size. When the same, it confirms a digest (md5). <lang ruby>require 'digest/md5'

def find_duplicate_files(dir)

 puts "\nDirectory : #{dir}"
 Dir.chdir(dir) do
   file_size = Dir.foreach('.').select{|f| FileTest.file?(f)}.group_by{|f| File.size(f)}
   file_size.each do |size, files|
     next if files.size==1
     files.group_by{|f| Digest::MD5.file(f).to_s}.each do |md5,fs|
       next if fs.size==1
       puts "  --------------------------------------------"
       fs.each{|file| puts "  #{File.mtime(file)}  #{size}  #{file}"}
     end
   end
 end

end

find_duplicate_files("\\work\\backup") find_duplicate_files("\\Windows\\System32")</lang>

Sample Output:

Directory : \work\backup
  --------------------------------------------
  2015-01-20 15:11:34 +0900  956  abcd.bak
  2015-01-20 15:03:15 +0900  956  abcd.txt
  2014-07-08 17:45:49 +0900  956  DCurve_c.BAS

Directory : \Windows\System32
  --------------------------------------------
  2014-07-07 10:40:22 +0900  4096  dxmasf.dll
  2014-07-07 10:40:22 +0900  4096  msdxm.ocx
  --------------------------------------------
  2009-07-14 10:15:34 +0900  200192  ir50_qc.dll
  2009-07-14 10:15:34 +0900  200192  ir50_qcx.dll
  --------------------------------------------
  2010-11-21 12:24:08 +0900  76800  mapi32.dll
  2010-11-21 12:24:08 +0900  76800  mapistub.dll
  --------------------------------------------
  2009-07-14 10:15:34 +0900  120320  ir41_qc.dll
  2009-07-14 10:15:34 +0900  120320  ir41_qcx.dll

It checked the operation with MS Windows 7.

Sidef

It uses the portable File::Find module which means that it should work, virtually, on any platform. <lang ruby># usage: sidef fdf.sf [size] [dir1] [...]

func find_duplicate_files(Block code, size_min=0, *dirs) {

   var files = Hash()
   var f = frequire('File::Find')
   f.find(
       Hash(
           no_chdir => true,
           wanted   => func(arg) {
               var file = File(arg)
               file.is_file || return;
               file.is_link && return;
               var size = file.size
               size >= size_min || return;
               files{size} := [] << file
           },
       ) => dirs...
   )

   files.values.each { |set|
       set.len > 1 || next
       var dups = Hash()
       for i in (0 ..^ set.end-1) {
           for (var j = i+1; j <= set.end; j++) {
               if (set[i].compare(set[j]) == 0) {
                   dups{set[i]} := [] << set.pop_at(j--)
               }
           }
       }
       dups.each{ |k,v| code(k.to_file, v...) }
   }

   return;

}

var duplicates = Hash() func collect(*files) {

   duplicates{files[0].size} := [] << files

}

find_duplicate_files(collect, Num(ARGV.shift), ARGV...)

duplicates.keys.sort_by{ .to_i }.reverse.each { |key|

   say "=> Size: #{key}\n#{'~'*80}"
   duplicates{key}.each { |files|
       say "#{files.map{.to_s}.sort.join(%Q[\n])}\n#{'-'*80}"
   }

}</lang> Section of sample output:

% sidef fdf.sf 0 /tmp /usr/bin
=> Size: 5656
~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/bin/precat
/usr/bin/preunzip
/usr/bin/prezip
--------------------------
=> Size: 2305
~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/bin/gunzip
/usr/bin/uncompress
--------------------------
=> Size: 2
~~~~~~~~~~~~~~~~~~~~~~~~~~
/tmp/a.txt
/tmp/b.txt
--------------------------
/tmp/m.txt
/tmp/n.txt
--------------------------

Tcl

Only known to work on Unix. Uses both inode number checking and content hashing to do duplicate detection.

Library: Tcllib (Package: fileutil)

Library: Tcllib (Package: md5)

<lang tcl>package require fileutil package require md5

proc finddupfiles {dir {minsize 1}} {

   foreach fn [fileutil::find $dir] {

file lstat $fn stat if {$stat(size) < $minsize} continue dict lappend byino $stat(dev),$stat(ino) $fn if {$stat(type) ne "file"} continue set f [open $fn "rb"] set content [read $f] close $f set md5 [md5::md5 -hex $content] dict lappend byhash $md5 $fn

   }
   set groups {}
   foreach group [dict values $byino] {

if {[llength $group] <= 1} continue set gs [lsort $group] dict set groups [lindex $gs 0] $gs

   }
   foreach group [dict values $byhash] {

if {[llength $group] <= 1} continue foreach f $group { if {[dict exists $groups $f]} { dict set groups $f [lsort -unique \ [concat [dict get $groups $f] $group]] unset group break } } if {[info exist group]} { set gs [lsort $group] dict set groups [lindex $gs 0] $gs }

   }
   set masters {}
   dict for {n g} $groups {

lappend masters [list $n [llength $g],$n]

   }
   set result {}
   foreach p [lsort -decreasing -index 1 -dictionary $masters] {

set n [lindex $p 0] lappend result $n [dict get $groups $n]

   }
   return $result

}

foreach {leader dupes} [finddupfiles {*}$argv] {

   puts "$leader has duplicates"
   set n 0
   foreach d $dupes {

if {$d ne $leader} { puts " dupe #[incr n]: $d" }

}</lang> Section of sample output:

./compat/zlib/zconf.h has duplicates
   dupe #1: ./compat/zlib/zconf.h.in
./compat/zlib/contrib/vstudio/vc10/zlib.rc has duplicates
   dupe #1: ./compat/zlib/contrib/vstudio/vc9/zlib.rc
./compat/zlib/contrib/delphi/zlibd32.mak has duplicates
   dupe #1: ./compat/zlib/contrib/pascal/zlibd32.mak

@@ Line 245: / Line 245: @@
 ===with error checking===
 This version of the REXX program:
-::* &nbsp; checks to see if running under the '''DOS''' environment
+::* &nbsp; checks to see if running under the &nbsp; '''DOS''' &nbsp; environment
 ::* &nbsp; uses the &nbsp; '''TEMP''' &nbsp; folder for storing a temporary file
 ::* &nbsp; verifies that the &nbsp; '''maxSize''' &nbsp; is a positive integer
@@ Line 252: / Line 252: @@
 ::* &nbsp; shows the number of files examined and also the directory name
 <lang rexx>/*REXX program to reads a (DOS) directory  and  finds and displays files that identical.*/
-sep=center(' files are identical in size and content: ',79,'═')    /*define the header. */
+sep=center(' files are identical in size and content: ',79,"═")    /*define the header. */
-parse arg !;    if !all(arg())  then exit                          /*boilerplate code.  */
+parse arg !;     if !all(arg())  then exit                         /*boilerplate HELP(?)*/
-signal on halt; signal on novalue; signal on syntax                /*handle exceptions, */
+signal on halt;  signal on novalue;  signal on syntax              /*handle exceptions, */
 if \!dos  then call err 'this program requires the DOS [environment].'