File size distribution: Difference between revisions

Content added Content deleted

Inline

Revision as of 17:50, 31 May 2017

File Size Distribution

Beginning from the current directory, or optionally from a directory specified as a command-line argument, determine how many files there are of various sizes in a directory hierarchy. My suggestion is to sort by the logarithm of the file size, since a few bytes here or there, or even a factor of two or three, may not be that significant. Don't forget that empty files may exist, to serve as a marker. Is your file system predominantly devoted to a large number of smaller files, or a smaller number of huge files?

Perl 6

Works with: Rakudo version 2017.05

By default, process the current directory and all of its readable sub-directories, or pass in a directory path at the command line.

<lang perl6>sub MAIN($dir = '.') {

   # Walk $dir and every readable sub-directory, count the regular files per
   # decimal order of magnitude of their size, and print one histogram row
   # per bucket followed by the total number of files.
   #
   # Bucket index = floor(log10(size)); empty files share bucket 0 with
   # sizes 1..9. The index is derived from the number of decimal digits
   # instead of floating-point log: 1000.log(10) is 2.9999999999999996, which
   # .Int would truncate into the wrong bucket.
   sub log10 (Int $s) { $s ?? $s.chars - 1 !! 0 }
   my %fsize;
   my @dirs = $dir.IO;
   while @dirs {
       for @dirs.pop.dir -> $path {
           %fsize{$path.s.&log10}++ if $path.f;
           @dirs.push: $path if $path.d and $path.r
       }
   }
   my $max = %fsize.values.max;
   my $bar-size = 80;
   # Hash keys are strings; convert before taking the maximum so that bucket
   # "10" is not considered smaller than bucket "9". The leading 0 keeps the
   # first row present even when no file was found.
   my $top-bucket = (0, |%fsize.keys.map(*.Int)).max;
   say "File size distribution in bytes for directory: $dir\n";
   for 0 .. $top-bucket -> $bucket {
       # The label is the lower bound of the bucket: 0, 10e0 (=10), 10e1 (=100), ...
       my $label = $bucket ?? "10e{$bucket - 1}" !! 0;
       my $count = %fsize{$bucket} // 0;
       say sprintf( "# Files @ %5sb %8s: ", $label, $count ),
           histogram( $max, $count, $bar-size )
   }
   say %fsize.values.sum, ' total files.';

}

sub histogram ($max, $value, $width = 60) {

   # Render $value as a horizontal bar in which $max fills exactly $width
   # character cells. Returns the bar followed by a newline.
   #
   # The bar is measured in eighths of a cell so the partial-block glyphs
   # (index 1..7 of @blocks) can express the fractional remainder; index 8 is
   # the full block. Scaling to eighths *before* polymod keeps the bar length
   # proportional to $value (scaling to whole cells first would draw 7 units
   # as one glyph but 8 units as eight glyphs).
   my @blocks = <| ▏ ▎ ▍ ▌ ▋ ▊ ▉ █>;
   # Nothing to scale against (no files counted): draw an empty bar rather
   # than dividing by zero.
   return "\n" unless $max > 0;
   my $eighths = ($value * $width * 8 / $max).Int;
   my ($end, $bar) = $eighths.polymod(8);
   (@blocks[8] x $bar) ~ ($end ?? @blocks[$end] !! '') ~ "\n"

}</lang>

Output:

File size distribution in bytes for directory: /home

# Files @     0b      989: ▏

# Files @  10e0b     6655: ████████

# Files @  10e1b    31776: ████████████████████████████████████████

# Files @  10e2b    63165: ████████████████████████████████████████████████████████████████████████████████

# Files @  10e3b    19874: ████████████████████████▏

# Files @  10e4b     7730: ████████▏

# Files @  10e5b     3418: ▌

# Files @  10e6b     1378: ▏

# Files @  10e7b      199:

# Files @  10e8b       45:

135229 total files.

Python

The distribution is stored in a collections.Counter object (like a dictionary with automatic 0 value when a key is not found, useful when incrementing). Anything could be done with this object, here the number of files is printed for increasing sizes. No check is made during the directory walk: usually, safeguards would be needed or the program will fail on any unreadable file or directory (depending on rights, or too deep paths, for instance). Here links are skipped, so it should avoid cycles.

<lang python>import os
import sys
from collections import Counter

def dodir(path):
    """Tally the size of every regular file below ``path`` into the global ``h``.

    ``h`` must already be bound to a ``collections.Counter`` (``main`` does
    this); each regular file increments ``h[size_in_bytes]``.

    Symbolic links are skipped entirely, so linked directories are never
    followed and link cycles cannot occur. Directories or files that cannot be
    read (permissions, entries that vanish mid-walk, over-long paths, ...) are
    reported on stderr and skipped instead of aborting the whole walk.

    Args:
        path: directory at which the walk starts.
    """
    global h

    # Explicit stack instead of recursion: very deep trees cannot exhaust the
    # interpreter's recursion limit.
    pending = [path]
    while pending:
        current = pending.pop()
        try:
            names = os.listdir(current)
        except OSError as err:
            sys.stderr.write("skipping %s: %s\n" % (current, err))
            continue

        for name in names:
            p = os.path.join(current, name)
            if os.path.islink(p):
                continue
            try:
                if os.path.isfile(p):
                    h[os.stat(p).st_size] += 1
                elif os.path.isdir(p):
                    pending.append(p)
                # anything else (sockets, devices, ...) is ignored
            except OSError as err:
                sys.stderr.write("skipping %s: %s\n" % (p, err))

def main(arg):
    """Print how many files of each exact size exist below the given directories.

    Resets the global Counter ``h``, fills it by calling ``dodir`` on every
    directory, then prints one line per distinct size in increasing order and
    a final line with the total byte and file counts.

    Args:
        arg: sequence of directory paths to scan. When it is empty the
            current directory is scanned, as the task description requires.
    """
    global h
    h = Counter()
    for root in (arg or [os.curdir]):
        dodir(root)

    total_bytes = total_files = 0
    for size, count in sorted(h.items()):
        print("Size %d -> %d file(s)" % (size, count))
        total_files += count
        total_bytes += size * count
    print("Total %d bytes for %d files" % (total_bytes, total_files))

# Script entry point: every command-line argument is passed to main() as a directory to scan.
main(sys.argv[1:])</lang>

Racket

<lang racket>#lang racket

(define (file-size-distribution [d (current-directory)] #:size-group-function [sgf values])
  ;; Walks every path produced by (in-directory d) and, for each one that is
  ;; an existing file, applies `sgf` to its size to pick a group.
  ;; Returns three values:
  ;;   1. an immutable hash mapping group -> number of files in that group
  ;;   2. the sum of all file sizes in bytes
  ;;   3. the number of files seen
  ;; With the default `sgf` (values) files are grouped by exact size.
  (for/fold ([distribution (hash)]
             [total-bytes 0]
             [file-count 0])
            ([path (in-directory d)]
             #:when (file-exists? path))
    (let* ([bytes (file-size path)]
           [group (sgf bytes)])
      (values (hash-update distribution group add1 0)
              (+ total-bytes bytes)
              (add1 file-count)))))

;; Nearest decimal order of magnitude of x, or #f for zero (which has no logarithm).
(define (log10-or-so x)
  (and (not (zero? x))
       (round (/ (log x) (log 10)))))

;; Strict ordering over values that are either a real number or #f:
;; #f sorts before every number, and #f is not less than #f.
(define (number-maybe-< a b)
  (cond
    [(not b) #f]        ; nothing is smaller than #f (covers #f vs #f too)
    [(not a) #t]        ; #f precedes any number
    [else (< a b)]))

;; Pluralise `one` for the count `n`: unchanged when n is exactly 1,
;; otherwise with an "s" appended.
(define (...s? one n)
  (if (equal? n 1)
      one
      (string-append one "s")))

(define ((report-fsd f) fsd Σ n)
  ;; Curried reporter: (report-fsd f) yields a procedure that accepts the
  ;; three values produced by file-size-distribution.
  ;;   f   - the grouping function that was used; only its name is printed
  ;;   fsd - hash mapping group -> file count
  ;;   Σ   - total size in bytes
  ;;   n   - total number of files
  ;; Prints one line per group in ascending order (#f first), then a totals
  ;; line.
  (define group-fn-name (object-name f))
  ;; Plain `for`: the loop is run for its output only, so there is no reason
  ;; to collect a list of results.
  (for ([k (in-list (sort (hash-keys fsd) number-maybe-<))])
    (define file-count (hash-ref fsd k))
    (printf "~a(size): ~a -> ~a ~a~%"
            group-fn-name
            k
            file-count (...s? "file" file-count)))
  (printf "Total: ~a ~a in ~a ~a~%" Σ (...s? "byte" Σ) n (...s? "file" n)))

(module+ test

 ;; Demonstration run placed in the `test` submodule: computes the
 ;; distribution for the current directory, grouped by log10-or-so, and hands
 ;; the three resulting values straight to the reporter.
 (call-with-values (λ () (file-size-distribution #:size-group-function log10-or-so))
                   (report-fsd log10-or-so)))</lang>

Output:

log10-or-so(size): #f -> 3 files
log10-or-so(size): 0 -> 4 files
log10-or-so(size): 1.0 -> 39 files
log10-or-so(size): 2.0 -> 57 files
log10-or-so(size): 3.0 -> 406 files
log10-or-so(size): 4.0 -> 198 files
log10-or-so(size): 5.0 -> 20 files
log10-or-so(size): 6.0 -> 6 files
Total: 10210127 bytes in 733 files

Sidef

<lang ruby>func traverse(Block callback, Dir dir) {

   # Walks `dir` recursively: recurses into every entry that is a Dir and
   # calls `callback` with every other entry.
   # If the directory cannot be opened, this subtree is abandoned and nil is returned.
   dir.open(\var dir_h) || return nil

   for entry in (dir_h.entries) {
       if (entry.kind_of(Dir)) {
           traverse(callback, entry)
       } else {
           callback(entry)
       }
   }

} var dir = (ARGV ? Dir(ARGV[0]) : Dir.cwd)   # start directory: first command-line argument if present, otherwise the current working directory

# group: number of files per size group; files_num / total_size: running totals
var group = Hash() var files_num = 0 var total_size = 0

traverse({ |file|

   # Group key is the rounded log10 of (size + 1); the +1 presumably keeps
   # empty files from reaching log10(0) -- TODO confirm.
   # NOTE(review): `:= 0 += 1` looks like "default the slot to 0, then increment" -- confirm.
   group{file.size+1 -> log10.round} := 0 += 1
   total_size += file.size
   files_num += 1

}, dir)

# Report the groups ordered by the numeric value of their key.
for k,v in (group.sort_by { |k,_| Num(k) }) {

   say "log10(size) ~~ #{k} -> #{v} files"

}

say "Total: #{total_size} bytes in #{files_num} files"</lang>

Output:

$ sidef script.sf /usr/bin
log10(size) ~~ 1 -> 4 files
log10(size) ~~ 2 -> 70 files
log10(size) ~~ 3 -> 246 files
log10(size) ~~ 4 -> 1337 files
log10(size) ~~ 5 -> 815 files
log10(size) ~~ 6 -> 167 files
log10(size) ~~ 7 -> 9 files
log10(size) ~~ 8 -> 2 files
Total: 370026462 bytes in 2650 files

zkl

<lang zkl>pipe:=Thread.Pipe();

   // hoover all files in tree, don't return directories
   // The names are written into `pipe` by a separately launched function and
   // consumed by the foreach loop below; the start directory is the first
   // script argument (vm.arglist[0]).

fcn(pipe,dir){ File.globular(dir,"*",True,8,pipe); } .launch(pipe,vm.arglist[0]); // thread

   // dist[d] = number of files whose size has d decimal digits (presumably a
   // 50-slot list pre-filled with 0 -- TODO confirm List.createLong semantics)
   // N = file count, SZ = total bytes, maxd = largest digit count seen
dist,N,SZ,maxd:=List.createLong(50,0),0,0,0; foreach fnm in (pipe){

  sz,szd:=File.len(fnm), sz.numDigits;
  dist[szd]+=1;
  N+=1; SZ+=sz; maxd=maxd.max(szd);

   // Summary line, then one histogram row per digit count. `scale` maps the
   // fullest bucket to 50 stars; the row label grows by one character of the
   // ",nnn" pattern per row (idx moves further from the end of szchrs, with
   // an extra step from comma() presumably every third row to take in the
   // comma -- TODO confirm Walker.cycle behaviour).
   // NOTE(review): if no file is found, SZ/N and 50.0/max(dist) look like
   // divisions by zero -- confirm.
   // NOTE(review): dist[0,maxd] looks like "maxd items starting at index 0";
   // verify that the bucket at index maxd itself is still reported.
} println("Found %d files, %,d bytes, %,d mean.".fmt(N,SZ,SZ/N)); scale:=50.0/(0.0).max(dist); szchrs,idx,comma:=",nnn"*20, -1, Walker.cycle(0,0,1).next; println("%15s %s (* = %.2f)".fmt("File size","Number of files",1.0/scale)); foreach sz,cnt in ([0..].zip(dist[0,maxd])){

  println("%15s : %s".fmt(szchrs[idx,*], "*"*(scale*cnt).round().toInt()));
  idx-=1 + comma();

}</lang>

Output:

$ zkl flSzDist.zkl ..
Found 1832 files, 108,667,806 bytes, 59,316 mean.
      File size   Number of files (* = 13.44)
              n : *
             nn : ***
            nnn : ********
          n,nnn : **********************************
         nn,nnn : **************************************************
        nnn,nnn : ********************************
      n,nnn,nnn : *******

$ zkl flSzDist.zkl /media/Tunes/
Found 4320 files, 67,627,849,052 bytes, 15,654,594 mean.
      File size   Number of files (* = 69.84)
              n : 
             nn : 
            nnn : 
          n,nnn : *
         nn,nnn : 
        nnn,nnn : 
      n,nnn,nnn : *
     nn,nnn,nnn : **************************************************
    nnn,nnn,nnn : ********
  n,nnn,nnn,nnn : *