File size distribution: Difference between revisions

From Rosetta Code
Content added Content deleted
 
(38 intermediate revisions by 16 users not shown)
Line 2: Line 2:


;Task:
;Task:
Beginning from the current directory, or optionally from a directory specified as a command-line argument, determine how many files there are of various sizes in a directory hierarchy.
Beginning from the current directory, or optionally from a directory specified as a command-line argument, determine how many files there are of various sizes in a directory hierarchy.




My suggestion is to sort by logarithm of file size, since a few bytes here or there, or even a factor of two or three, may not be that significant.
My suggestion is to sort by logarithm of file size, since a few bytes here or there, or even a factor of two or three, may not be that significant.

Don't forget that empty files may exist, to serve as a marker.
Don't forget that empty files may exist, to serve as a marker.




Is your file system predominantly devoted to a large number of smaller files, or a smaller number of huge files?
Is your file system predominantly devoted to a large number of smaller files, or a smaller number of huge files?
<br><br>
<br><br>

=={{header|Action!}}==
DOS 2.5 returns file size in number of sectors.
{{libheader|Action! Tool Kit}}
<syntaxhighlight lang="action!">INCLUDE "D2:PRINTF.ACT" ;from the Action! Tool Kit

; Counts the directory entries matching a filter into size buckets.
; filter - directory specification passed to Open
; limits - exclusive upper bound of every bucket
; counts - receives the number of entries per bucket (reset on entry)
; count  - number of elements in limits and counts
; An entry whose size is not below any limit is not counted.
; NOTE(review): every non-empty line of the listing is parsed, so a
; trailing summary line (if the device emits one) is counted as well.
PROC SizeDistribution(CHAR ARRAY filter INT ARRAY limits,counts BYTE count)
  ;tmp receives 4 characters from SCopyS below, so it needs 5 bytes:
  ;the length byte plus the characters (it was declared with 4)
  CHAR ARRAY line(255),tmp(5)
  INT size
  BYTE i,dev=[1]

  FOR i=0 TO count-1
  DO
    counts(i)=0
  OD

  Close(dev)
  Open(dev,filter,6)
  DO
    InputSD(dev,line)
    IF line(0)=0 THEN ;stop at the first empty line
      EXIT
    FI
    ;the size is read from the last 4 characters of the line
    SCopyS(tmp,line,line(0)-3,line(0))
    size=ValI(tmp)
    ;count the entry in the first bucket whose limit exceeds the size
    FOR i=0 TO count-1
    DO
      IF size<limits(i) THEN
        counts(i)==+1
        EXIT
      FI
    OD
  OD
  Close(dev)
RETURN

; Fills limits(0..count-1) with bucket bounds that start at 1 and
; double on every step; a bound that would exceed 1000 becomes 1000.
PROC GenerateLimits(INT ARRAY limits BYTE count)
  BYTE idx
  INT bound

  bound=1
  FOR idx=0 TO count-1
  DO
    limits(idx)=bound
    bound=bound LSH 1
    IF bound>1000 THEN
      bound=1000
    FI
  OD
RETURN

; Draws a horizontal bar whose length is len/max of the full width.
; len  - value to plot
; max  - value that corresponds to a full-width bar
; size - full width of the bar in character cells
; Nothing is drawn when max is 0.
PROC PrintBar(INT len,max,size)
  INT i,count

  ;max is 0 when no entry was counted at all; there is nothing to
  ;scale against and the division below must not be executed
  IF max=0 THEN
    RETURN
  FI

  ;bar length expressed in quarters of a character cell
  count=4*len*size/max
  IF count=0 AND len>0 THEN
    count=1 ;keep every non-empty bucket visible
  FI
  ;whole cells
  FOR i=0 TO count/4-1
  DO
    Put(160)
  OD
  ;one extra character for the remaining 1, 2 or 3 quarters
  i=count MOD 4
  IF i=1 THEN Put(22)
  ELSEIF i=2 THEN Put(25)
  ELSEIF i=3 THEN Put(130) FI
RETURN

; Prints the distribution as a table: bucket range, number of
; entries, percentage of all entries and a proportional bar.
; filter - directory specification shown in the heading
; limits - exclusive upper bound of every bucket
; counts - number of entries per bucket
; count  - number of elements in limits and counts
PROC PrintResult(CHAR ARRAY filter
  INT ARRAY limits,counts BYTE count)

  BYTE i
  ;StrI of an INT can produce up to 6 characters, plus the length byte
  CHAR ARRAY tmp(7)
  INT min,max,total,perc

  total=0 max=0
  FOR i=0 TO count-1
  DO
    total==+counts(i)
    IF counts(i)>max THEN
      max=counts(i)
    FI
  OD
  PrintF("File size distribution of ""%S"" in sectors:%E",filter) PutE()
  PrintE("From To Count Perc")
  min=0
  FOR i=0 TO count-1
  DO
    ;with no entries at all every percentage is 0; this also avoids
    ;dividing by a zero total
    IF total>0 THEN
      perc=counts(i)*100/total
    ELSE
      perc=0
    FI
    StrI(min,tmp) PrintF("%4S ",tmp)
    StrI(limits(i)-1,tmp) PrintF("%3S ",tmp)
    StrI(counts(i),tmp) PrintF("%3S ",tmp)
    StrI(perc,tmp) PrintF("%3S%% ",tmp)
    ;a bar can only be scaled against a non-zero maximum
    IF max>0 THEN
      PrintBar(counts(i),max,17)
    FI
    PutE()
    min=limits(i) ;the next bucket starts at this bucket's limit
  OD
RETURN

; Entry point: clears the screen, generates the bucket limits, counts
; the entries matching filter and prints the resulting table.
PROC Main()
DEFINE LIMITCOUNT="11" ;number of buckets; size of both arrays below
CHAR ARRAY filter="H1:*.*" ;directory specification that is scanned
INT ARRAY limits(LIMITCOUNT),counts(LIMITCOUNT)

Put(125) PutE() ;clear the screen
GenerateLimits(limits,LIMITCOUNT)
SizeDistribution(filter,limits,counts,LIMITCOUNT)
PrintResult(filter,limits,counts,LIMITCOUNT)
RETURN</syntaxhighlight>
{{out}}
[https://gitlab.com/amarok8bit/action-rosetta-code/-/raw/master/images/File_size_distribution.png Screenshot from Atari 8-bit computer]
<pre>
File size distribution of "H1:*.*" in sectors:

From To Count Perc
0 0 2 0% ▌
1 1 20 3% █▌
2 3 44 8% ███▌
4 7 195 37% █████████████████
8 15 183 35% ███████████████▌
16 31 67 12% █████▌
32 63 6 1% ▌
64 127 0 0%
128 255 0 0%
256 511 0 0%
512 999 1 0% ▌
</pre>

=={{header|Ada}}==
{{libheader|Dir_Iterators}}
<syntaxhighlight lang="ada">with Ada.Numerics.Elementary_Functions;
with Ada.Directories; use Ada.Directories;
with Ada.Strings.Fixed; use Ada.Strings;
with Ada.Command_Line; use Ada.Command_Line;
with Ada.Text_IO; use Ada.Text_IO;

with Dir_Iterators.Recursive;

--  Counts the ordinary files below a directory (first command-line
--  argument, or "." when none is given) by order of magnitude of their
--  size and prints one "Less than 10**N" line per bucket, up to the
--  highest bucket that is not empty.
procedure File_Size_Distribution is

   --  Bucket N holds the files whose size has exactly N decimal digits,
   --  i.e. 10**(N-1) <= size < 10**N; bucket 0 holds the empty files.
   --  19 digits cover every size below 10**19.
   type Exponent_Type is range 0 .. 19;
   type File_Count is range 0 .. Long_Integer'Last;

   Counts         : array (Exponent_Type) of File_Count := (others => 0);
   Non_Zero_Index : Exponent_Type := 0;
   Directory_Name : constant String := (if Argument_Count = 0
                                        then "."
                                        else Argument (1));

   --  Returns the number of decimal digits of Size, 0 for an empty file.
   --  Integer arithmetic is used on purpose: a rounded floating-point
   --  logarithm puts exact powers of ten (1, 10, 100, ...) into the
   --  bucket below the one whose "less than" label is true for them.
   function Bucket_Of (Size : File_Size) return Exponent_Type is
      Remaining   : File_Size     := Size;
      Digit_Count : Exponent_Type := 0;
   begin
      while Remaining > 0 loop
         Remaining   := Remaining / 10;
         Digit_Count := Digit_Count + 1;
      end loop;
      return Digit_Count;
   end Bucket_Of;

begin
   if not Exists (Directory_Name) or else Kind (Directory_Name) /= Directory then
      Put_Line ("Directory does not exist");
      return;
   end if;

   --  The walker is declared only here, after the path has been checked,
   --  so it is never constructed for a missing or non-directory path.
   declare
      Directory_Walker : Dir_Iterators.Recursive.Recursive_Dir_Walk
        := Dir_Iterators.Recursive.Walk (Directory_Name);
   begin
      for Directory_Entry of Directory_Walker loop
         if Kind (Directory_Entry) = Ordinary_File then
            declare
               Bucket : constant Exponent_Type :=
                 Bucket_Of (Size (Directory_Entry));
            begin
               Counts (Bucket) := Counts (Bucket) + 1;
            end;
         end if;
      end loop;
   end;

   --  Find the highest bucket that contains at least one file.
   for I in reverse Counts'Range loop
      if Counts (I) /= 0 then
         Non_Zero_Index := I;
         exit;
      end if;
   end loop;

   for I in Counts'First .. Non_Zero_Index loop
      Put ("Less than 10**");
      Put (Fixed.Trim (Exponent_Type'Image (I), Side => Left));
      Put (": ");
      Put (File_Count'Image (Counts (I)));
      New_Line;
   end loop;
end File_Size_Distribution;</syntaxhighlight>
{{out}}
<pre>Less than 10**0: 8
Less than 10**1: 0
Less than 10**2: 18
Less than 10**3: 88
Less than 10**4: 39
Less than 10**5: 8
Less than 10**6: 2
Less than 10**7: 1</pre>


=={{header|C}}==
=={{header|C}}==
The platform independent way to get the file size in C involves opening every file and reading the size. The implementation below works for Windows and utilizes command scripts to get size information quickly even for a large number of files, recursively traversing a large number of directories. Both textual and graphical ( ASCII ) outputs are shown. The same can be done for Linux by a combination of the find, ls and stat commands and my plan was to make it work on both OS types, but I don't have access to a Linux system right now. This would also mean either abandoning scaling the graphical output in order to fit the console buffer or porting that as well, thus including windows.h selectively.
The platform independent way to get the file size in C involves opening every file and reading the size. The implementation below works for Windows and utilizes command scripts to get size information quickly even for a large number of files, recursively traversing a large number of directories. Both textual and graphical ( ASCII ) outputs are shown. The same can be done for Linux by a combination of the find, ls and stat commands and my plan was to make it work on both OS types, but I don't have access to a Linux system right now. This would also mean either abandoning scaling the graphical output in order to fit the console buffer or porting that as well, thus including windows.h selectively.
===Windows===
===Windows===
<syntaxhighlight lang="c">
<lang C>
#include<windows.h>
#include<windows.h>
#include<string.h>
#include<string.h>
Line 30: Line 224:
double scale;
double scale;
FILE* fp;
FILE* fp;

if(argC==1)
if(argC==1)
printf("Usage : %s <followed by directory to start search from(. for current dir), followed by \n optional parameters (T or G) to show text or graph output>",argV[0]);
printf("Usage : %s <followed by directory to start search from(. for current dir), followed by \n optional parameters (T or G) to show text or graph output>",argV[0]);
Line 43: Line 237:
sprintf(commandString,"forfiles /p %s /s /c \"cmd /c echo @fsize\" 2>&1",startPath);
sprintf(commandString,"forfiles /p %s /s /c \"cmd /c echo @fsize\" 2>&1",startPath);
}
}

else if(strlen(argV[1])==1 && argV[1][0]=='.')
else if(strlen(argV[1])==1 && argV[1][0]=='.')
strcpy(commandString,"forfiles /s /c \"cmd /c echo @fsize\" 2>&1");
strcpy(commandString,"forfiles /s /c \"cmd /c echo @fsize\" 2>&1");

else
else
sprintf(commandString,"forfiles /p %s /s /c \"cmd /c echo @fsize\" 2>&1",argV[1]);
sprintf(commandString,"forfiles /p %s /s /c \"cmd /c echo @fsize\" 2>&1",argV[1]);
Line 58: Line 252:
fileSizeLog[strlen(str)]++;
fileSizeLog[strlen(str)]++;
}
}

if(argC==2 || (argC==3 && (argV[2][0]=='t'||argV[2][0]=='T'))){
if(argC==2 || (argC==3 && (argV[2][0]=='t'||argV[2][0]=='T'))){
for(i=0;i<MAXORDER;i++){
for(i=0;i<MAXORDER;i++){
Line 64: Line 258:
}
}
}
}

else if(argC==3 && (argV[2][0]=='g'||argV[2][0]=='G')){
else if(argC==3 && (argV[2][0]=='g'||argV[2][0]=='G')){
CONSOLE_SCREEN_BUFFER_INFO csbi;
CONSOLE_SCREEN_BUFFER_INFO csbi;
Line 72: Line 266:


max = fileSizeLog[0];
max = fileSizeLog[0];

for(i=1;i<MAXORDER;i++)
for(i=1;i<MAXORDER;i++)
(fileSizeLog[i]>max)?max=fileSizeLog[i]:max;
(fileSizeLog[i]>max)?max=fileSizeLog[i]:max;

(max < csbi.dwSize.X)?(scale=1):(scale=(1.0*(csbi.dwSize.X-50))/max);
(max < csbi.dwSize.X)?(scale=1):(scale=(1.0*(csbi.dwSize.X-50))/max);

for(i=0;i<MAXORDER;i++){
for(i=0;i<MAXORDER;i++){
printf("\nSize Order < 10^%2d bytes |",i);
printf("\nSize Order < 10^%2d bytes |",i);
Line 85: Line 279:
}
}
}
}

}
}
return 0;
return 0;
}
}
}
}
</syntaxhighlight>
</lang>
Invocation and textual output :
Invocation and textual output :
<pre>
<pre>
Line 156: Line 350:
{{libheader|POSIX}}
{{libheader|POSIX}}
This works on macOS 10.15. It should be OK for Linux as well.
This works on macOS 10.15. It should be OK for Linux as well.
<lang c>#include <ftw.h>
<syntaxhighlight lang="c">#include <ftw.h>
#include <locale.h>
#include <locale.h>
#include <stdint.h>
#include <stdint.h>
Line 177: Line 371:
total_size += file_size;
total_size += file_size;
size_t index = 0;
size_t index = 0;
for (; index <= nsizes && sizes[index] < file_size; ++index);
for (; index < nsizes && sizes[index] < file_size; ++index);
++count[index];
++count[index];
} else if (flag == FTW_DNR) {
} else if (flag == FTW_DNR) {
Line 203: Line 397:
printf("Total file size: %'lu\n", total_size);
printf("Total file size: %'lu\n", total_size);
return EXIT_SUCCESS;
return EXIT_SUCCESS;
}</lang>
}</syntaxhighlight>


{{out}}
{{out}}
Line 223: Line 417:


=={{header|C++}}==
=={{header|C++}}==
<lang cpp>#include <algorithm>
<syntaxhighlight lang="cpp">#include <algorithm>
#include <array>
#include <array>
#include <filesystem>
#include <filesystem>
Line 274: Line 468:
}
}
return EXIT_SUCCESS;
return EXIT_SUCCESS;
}</lang>
}</syntaxhighlight>


{{out}}
{{out}}
Line 291: Line 485:
Number of files: 7,874
Number of files: 7,874
Total file size: 11,963,566,673 bytes
Total file size: 11,963,566,673 bytes
</pre>
=={{header|Delphi}}==
{{libheader| System.SysUtils}}
{{libheader| System.Math}}
{{libheader| Winapi.Windows}}
{{Trans|Go}}
<syntaxhighlight lang="delphi">
program File_size_distribution;

{$APPTYPE CONSOLE}

uses
System.SysUtils,
System.Math,
Winapi.Windows;

// Returns n in decimal with a comma between every group of three
// digits, counted from the right; a minus sign is kept in front.
function Commatize(n: Int64): string;
var
  digits: string;
  idx: Integer;
begin
  digits := n.ToString;
  // work on the bare digits so the sign never takes part in grouping
  if n < 0 then
    digits := Copy(digits, 2, MaxInt);

  idx := Length(digits) - 3;
  while idx >= 1 do
  begin
    Insert(',', digits, idx + 1);
    Dec(idx, 3);
  end;

  if n < 0 then
    Result := '-' + digits
  else
    Result := digits;
end;

// Calls walkFunc for every file and directory below Root, descending
// into sub-directories depth-first.
//   Root     - directory to start from (a trailing delimiter is optional)
//   walkFunc - receives the full path of the entry and its find data;
//              nothing happens when it is not assigned
// The '.' and '..' entries are skipped. Directories that are reparse
// points (junctions, symbolic links) are reported but not entered, so a
// link that points back up the tree cannot cause endless recursion.
procedure Walk(Root: string; walkFunc: TProc<string, TWin32FindData>); overload;
var
  rec: TWin32FindData;
  h: THandle;
  entryName: string;
begin
  if not Assigned(walkFunc) then
    exit;

  Root := IncludeTrailingPathDelimiter(Root);

  h := FindFirstFile(PChar(Root + '*.*'), rec);
  // nothing to enumerate (or no access): there is no handle to close
  if h = INVALID_HANDLE_VALUE then
    exit;

  try
    repeat
      entryName := rec.cFileName;
      // only the two navigation entries are skipped; ordinary names that
      // merely start with a dot are real files and must be reported
      if (entryName = '.') or (entryName = '..') then
        Continue;

      walkFunc(Root + entryName, rec);

      if ((rec.dwFileAttributes and FILE_ATTRIBUTE_DIRECTORY) <> 0) and
        ((rec.dwFileAttributes and FILE_ATTRIBUTE_REPARSE_POINT) = 0) then
        Walk(Root + entryName, walkFunc);
    until not FindNextFile(h, rec);
  finally
    // qualified: System.SysUtils declares a different FindClose
    Winapi.Windows.FindClose(h);
  end;
end;

// Walks the tree below root and prints how many entries fall into each
// order of magnitude of size, followed by the totals.
// sizes[k] counts the entries whose size is less than 10^k bytes and
// not less than 10^(k-1); sizes[0] holds the entries of size 0, which
// includes directories. Entries too large for the last bucket are
// counted in the last bucket instead of indexing past the array.
procedure FileSizeDistribution(root: string);
var
  sizes: TArray<Integer>;
  files, directories, totalSize: UInt64;
  bucket: Integer;
  c: string;
begin
  SetLength(sizes, 12);
  files := 0;
  directories := 0;
  totalSize := 0;

  Walk(root,
    procedure(path: string; info: TWin32FindData)
    var
      size, rest: UInt64;
      index: Integer;
    begin
      inc(files);
      if (info.dwFileAttributes and FILE_ATTRIBUTE_DIRECTORY) =
        FILE_ATTRIBUTE_DIRECTORY then
        inc(directories);

      // widen the high half before shifting: a 32-bit value shifted by
      // 32 does not move into the upper half of the result
      size := (UInt64(info.nFileSizeHigh) shl 32) or info.nFileSizeLow;
      if size = 0 then
      begin
        sizes[0] := sizes[0] + 1;
        exit;
      end;

      inc(totalSize, size);

      // number of decimal digits = smallest k with size < 10^k; done
      // with integers so exact powers of ten cannot be misplaced
      index := 0;
      rest := size;
      while rest > 0 do
      begin
        rest := rest div 10;
        inc(index);
      end;
      if index > High(sizes) then
        index := High(sizes);
      sizes[index] := sizes[index] + 1;
    end);

  writeln('File size distribution for "', root, '" :-'#10);
  for bucket := 0 to High(sizes) do
  begin
    if bucket = 0 then
      write(' ')
    else
      write('+ ');
    writeln(format('Files less than 10 ^ %-2d bytes : %5d', [bucket, sizes[bucket]]));
  end;
  writeln(' -----');
  writeln('= Total number of files : ', files: 5);
  writeln(' including directories : ', directories: 5);
  c := commatize(totalSize);
  writeln(#10' Total size of files : ', c, ' bytes');
end;

begin
  // Start from the directory given as the first command-line argument,
  // or from the current directory when no argument is present.
  if ParamCount > 0 then
    fileSizeDistribution(ParamStr(1))
  else
    fileSizeDistribution('.');
  // keep the console window open until Enter is pressed
  readln;
end.</syntaxhighlight>

=={{header|Factor}}==
{{works with|Factor|0.99 2020-03-02}}
<syntaxhighlight lang="factor">USING: accessors assocs formatting io io.directories.search
io.files.types io.pathnames kernel math math.functions
math.statistics namespaces sequences ;

! Maps a size to its bucket number: 0 for a size of zero, otherwise
! the integer part of log10 of the size, plus one.
: classify ( m -- n )
    dup zero? [ drop 0 ] [ log10 >integer 1 + ] if ;

! Collects every entry below path, keeps those whose type is not
! +directory+, and counts how many fall into each classify bucket.
: file-size-histogram ( path -- assoc )
    recursive-directory-entries
    [ type>> +directory+ = not ] filter
    [ size>> classify ] map
    histogram ;

! Build the histogram for the current directory, print one line per
! bucket (bucket number, then count), then a blank line and the sum of
! all counts as the total number of files.
current-directory get file-size-histogram dup
[ "Count of files < 10^%d bytes: %4d\n" printf ] assoc-each
nl values sum "Total files: %d\n" printf</syntaxhighlight>
{{out}}
<pre>
Count of files < 10^0 bytes: 20
Count of files < 10^1 bytes: 742
Count of files < 10^2 bytes: 3881
Count of files < 10^3 bytes: 2388
Count of files < 10^4 bytes: 3061
Count of files < 10^5 bytes: 486
Count of files < 10^6 bytes: 78
Count of files < 10^7 bytes: 27
Count of files < 10^8 bytes: 3
Count of files < 10^9 bytes: 1

Total files: 10687
</pre>
</pre>


=={{header|Go}}==
=={{header|Go}}==
{{trans|Kotlin}}
{{trans|Kotlin}}
<lang go>package main
<syntaxhighlight lang="go">package main


import (
import (
Line 366: Line 705:
func main() {
func main() {
fileSizeDistribution("./")
fileSizeDistribution("./")
}</lang>
}</syntaxhighlight>


{{out}}
{{out}}
Line 391: Line 730:
</pre>
</pre>
=={{header|Haskell}}==
=={{header|Haskell}}==
<p>
Uses a grouped frequency distribution. Program arguments are optional. Arguments include starting directory and initial frequency distribution group size. Distribution groups of 0 are removed. After the first frequency distribution is computed it further breaks it down for any group that exceeds 25% of the total file count, when possible.
Uses a grouped frequency distribution. Program arguments are optional. Arguments include starting directory and initial frequency distribution group size. After the first frequency distribution is computed it further breaks it down for any group that exceeds 25% of the total file count, when possible.
<lang haskell>{-# LANGUAGE TupleSections #-}
</p>
<syntaxhighlight lang="haskell">{-# LANGUAGE LambdaCase #-}


import Control.Concurrent (forkIO, setNumCapabilities)
import Control.Concurrent (forkIO, setNumCapabilities)
import Control.Concurrent.Chan (Chan, newChan, readChan, writeChan, writeList2Chan)
import Control.Concurrent.Chan (Chan, newChan, readChan,
writeChan, writeList2Chan)
import Control.Exception (catch, IOException)
import Control.Monad (filterM, join, replicateM, replicateM_, forever, (>=>))
import Control.Exception (IOException, catch)
import Data.Char (isDigit)
import Control.Monad (filterM, forever, join,
replicateM, replicateM_, (>=>))
import Data.List (sort)
import GHC.Conc (getNumProcessors)
import Control.Parallel.Strategies (parTraversable, rseq, using,
withStrategy)
import System.Directory (getDirectoryContents, doesFileExist
, doesDirectoryExist, pathIsSymbolicLink)
import Data.Char (isDigit)
import System.Environment (getArgs)
import Data.List (find, sort)
import qualified Data.Map.Strict as Map
import System.FilePath.Posix ((</>))
import System.IO (hFileSize, withFile, IOMode(ReadMode), FilePath
import GHC.Conc (getNumProcessors)
, hPutStrLn, stderr)
import System.Directory (doesDirectoryExist, doesFileExist,
listDirectory,
import Text.Printf (printf, hPrintf)
pathIsSymbolicLink)
import System.Environment (getArgs)
import System.FilePath.Posix ((</>))
import System.IO (FilePath, IOMode (ReadMode),
hFileSize, hPutStrLn, stderr,
withFile)
import Text.Printf (hPrintf, printf)


data Item = File FilePath Integer | Folder FilePath deriving (Show)
data Item = File FilePath Integer | Folder FilePath deriving (Show)

type FrequencyGroup = ((Integer, Integer), Integer)
type FGKey = (Integer, Integer)
type FrequencyGroup = (FGKey, Integer)
type FrequencyGroups = Map.Map FGKey Integer

newFrequencyGroups :: FrequencyGroups
newFrequencyGroups = Map.empty


fileSizes :: [Item] -> [Integer]
fileSizes :: [Item] -> [Integer]
fileSizes = foldr f []
fileSizes = foldr f [] where f (File _ n) acc = n:acc
f _ acc = acc
where
f (File _ n) acc = n : acc
f _ acc = acc


folders :: [Item] -> [FilePath]
folders :: [Item] -> [FilePath]
folders = foldr f []
folders = foldr f [] where f (Folder p) acc = p:acc
f _ acc = acc
where
f (Folder p) acc = p:acc
f _ acc = acc


totalBytes :: [Item] -> Integer
totalBytes :: [Item] -> Integer
Line 431: Line 781:
Folder _ -> (a, succ b)) (0, 0)
Folder _ -> (a, succ b)) (0, 0)


-- |Creates 'FrequencyGroups' from the provided size and data set.
frequencyGroups :: Int -> [Integer] -> [FrequencyGroup]
frequencyGroups _ [] = []
frequencyGroups :: Int -- ^ Desired number of frequency groups.
-> [Integer] -- ^ List of collected file sizes. Must be sorted.
frequencyGroups totalGroups xs
-> FrequencyGroups -- ^ Returns a 'FrequencyGroups' for the file sizes.
| length xs == 1 = [((head xs, head xs), 1)]
frequencyGroups _ [] = newFrequencyGroups
| otherwise = placeGroups xs groupMinMax
frequencyGroups totalGroups xs
| length xs == 1 = Map.singleton (head xs, head xs) 1
| otherwise = foldr placeGroups newFrequencyGroups xs `using` parTraversable rseq
where
where
range = maximum xs - minimum xs
range = maximum xs - minimum xs
groupSize = succ $ ceiling $ realToFrac range / realToFrac totalGroups
groupSize = succ $ ceiling $ realToFrac range / realToFrac totalGroups
groups = takeWhile (<=groupSize + maximum xs) $ iterate (+groupSize) 0
groups = takeWhile (<=groupSize + maximum xs) $ iterate (+groupSize) 0
groupMinMax = (,0) <$> zip groups (pred <$> tail groups)
groupMinMax = zip groups (pred <$> tail groups)
findGroup n = find (\(low, high) -> n >= low && n <= high)


incrementCount (Just n) = Just (succ n) -- Update count for range.
placeGroups [] = id
incrementCount Nothing = Just 1 -- Insert new range with initial count.
placeGroups (d:ds) = placeGroups ds .
fmap (\g@((min,max), count) ->
if d >= min && d <= max
then ((min, max), succ count)
else g
)


placeGroups n fgMap = case findGroup n groupMinMax of
expandGroups :: Int -> [Integer] -> Integer -> [FrequencyGroup] -> [FrequencyGroup]
Just k -> Map.alter incrementCount k fgMap
Nothing -> fgMap -- Should never happen.

expandGroups :: Int -- ^ Desired number of frequency groups.
-> [Integer] -- ^ List of collected file sizes.
-> Integer -- ^ Computed frequency group limit.
-> FrequencyGroups -- ^ Expanded 'FrequencyGroups'
expandGroups gsize fileSizes groupThreshold
expandGroups gsize fileSizes groupThreshold
| groupThreshold > 0 = loop 15
| groupThreshold > 0 = loop 15 $ frequencyGroups gsize sortedFileSizes
| otherwise = id
| otherwise = frequencyGroups gsize sortedFileSizes
where
where
sortedFileSizes = sort fileSizes
loop 0 gs = gs -- break out in case we can't go below threshold
loop 0 gs = gs -- break out in case we can't go below threshold
loop n gs
loop n gs | all (<= groupThreshold) $ Map.elems gs = gs
| all ((<= groupThreshold) . snd) gs = gs
| otherwise = loop (pred n) (expand gs)
| otherwise = loop (pred n) $ expand gs


expand = ((\g@((min, max), count) ->
expand :: FrequencyGroups -> FrequencyGroups
expand = foldr f . withStrategy (parTraversable rseq) <*>
if count > groupThreshold then
groupsFromGroup g
Map.mapWithKey groupsFromGroup . Map.filter (> groupThreshold)
else
where
f :: Maybe (FGKey, FrequencyGroups) -- ^ expanded frequency group
[g]
-> FrequencyGroups -- ^ accumulator
) =<<)
-> FrequencyGroups -- ^ merged accumulator
f (Just (k, fg)) acc = Map.union (Map.delete k acc) fg
f Nothing acc = acc


groupsFromGroup
groupsFromGroup ((min, max), count)
:: FGKey -- ^ Group Key
| length range > 1 = frequencyGroups gsize range
-> Integer -- ^ Count
| otherwise = [((min, max), count)]
-> Maybe (FGKey, FrequencyGroups) -- ^ Returns expanded 'FrequencyGroups' with base key it replaces.
where
collectBetween min max = filter (\n -> n >= min && n <= max)
groupsFromGroup (min, max) count
range = collectBetween min max fileSizes
| length range > 1 = Just ((min, max), frequencyGroups gsize range)
| otherwise = Nothing
where
range = filter (\n -> n >= min && n <= max) sortedFileSizes


displaySize :: Integer -> String
displaySize :: Integer -> String
displaySize n
displaySize n
| n <= 2^10 = show n <> "B"
| n <= 2^10 = printf "%8dB " n
| n >= 2^10 && n <= 2^20 = display "KB" $ 2^10
| n >= 2^10 && n <= 2^20 = display (2^10) "KB"
| n >= 2^20 && n <= 2^30 = display "MB" $ 2^20
| n >= 2^20 && n <= 2^30 = display (2^20) "MB"
| n >= 2^30 && n <= 2^40 = display "GB" $ 2^30
| n >= 2^30 && n <= 2^40 = display (2^30) "GB"
| n >= 2^40 && n <= 2^50 = display "TB" $ 2^40
| n >= 2^40 && n <= 2^50 = display (2^40) "TB"
| otherwise = "Too large!"
| otherwise = "Too large!"
where
where
display suffix = (<> suffix) . show . round . (realToFrac n /)
display :: Double -> String -> String
display b = printf "%7.2f%s " (realToFrac n / b)

displayFrequency :: Integer -> FrequencyGroup -> IO ()
displayFrequency filesCount ((min, max), count) = do
printf "%s <-> %s" (displaySize min) (displaySize max)
printf "= %-10d %6.3f%%: %-5s\n" count percentage bars
where
percentage :: Double
percentage = (realToFrac count / realToFrac filesCount) * 100
size = round percentage
bars | size == 0 = "▍"
| otherwise = replicate size '█'


folderWorker :: Chan FilePath -> Chan [Item] -> IO ()
folderWorker :: Chan FilePath -> Chan [Item] -> IO ()
Line 490: Line 864:


collectItems :: FilePath -> IO [Item]
collectItems :: FilePath -> IO [Item]
collectItems folderPath = catch tryCollect
collectItems folderPath = catch tryCollect $ \e -> do
(\e -> do
hPrintf stderr "Skipping: %s\n" $ show (e :: IOException)
hPrintf stderr "Skipping: %s\n" $ show (e :: IOException)
pure [])
pure []
where
where
tryCollect = do
tryCollect = (fmap (folderPath </>) <$> listDirectory folderPath) >>=
mapM (\p -> doesDirectoryExist p >>=
contents <- fmap (folderPath </>) <$> getDirectoryContents folderPath
\case True -> pure $ Folder p
files <- filterM doesFileExist contents
False -> File p <$> withFile p ReadMode hFileSize)
folders <- drop 2 <$> filterM doesDirectoryExist contents
items <- mapM (\f -> File f <$> withFile f ReadMode hFileSize) files
pure $ items <> fmap Folder folders

displayFrequency :: Integer -> FrequencyGroup -> IO ()
displayFrequency filesCount ((min, max), count) =
printf "%5s <-> %5s = %-10d %6.3f%%: %-5s\n" (displaySize min)
(displaySize max) count percentage bars
where
percentage :: Double
percentage = (realToFrac count / realToFrac filesCount) * 100
bars = replicate (round percentage) '█'

parseArgs :: [String] -> Either String (FilePath, Int)
parseArgs (x:y:xs)
| all isDigit y = Right (x, read y)
| otherwise = Left "Invalid frequency group size"
parseArgs (x:xs) = Right (x, 4)
parseArgs _ = Right (".", 4)


parallelItemCollector :: FilePath -> IO [Item]
parallelItemCollector :: FilePath -> IO [Item]
Line 530: Line 885:
loop :: Chan FilePath -> Chan [Item] -> [Item] -> IO [Item]
loop :: Chan FilePath -> Chan [Item] -> [Item] -> IO [Item]
loop folderChan resultItemsChan xs = do
loop folderChan resultItemsChan xs = do
let fs = folders xs
regularFolders <- filterM (pathIsSymbolicLink >=> (pure . not)) $ folders xs
regularFolders <- filterM (pathIsSymbolicLink >=> (pure . not)) fs
if null regularFolders then pure []
if null regularFolders then pure []
else do
else do
Line 538: Line 892:
result <- mapM (loop folderChan resultItemsChan) childItems
result <- mapM (loop folderChan resultItemsChan) childItems
pure (join childItems <> join result)
pure (join childItems <> join result)

parseArgs :: [String] -> Either String (FilePath, Int)
parseArgs (x:y:xs)
| all isDigit y = Right (x, read y)
| otherwise = Left "Invalid frequency group size"
parseArgs (x:xs) = Right (x, 4)
parseArgs _ = Right (".", 4)


main :: IO ()
main :: IO ()
main = parseArgs <$> getArgs >>= \case
main = do
args <- getArgs
case parseArgs args of
Left errorMessage -> hPutStrLn stderr errorMessage
Left errorMessage -> hPutStrLn stderr errorMessage
Right (path, groupSize) -> do
Right (path, groupSize) -> do
items <- parallelItemCollector path
items <- parallelItemCollector path
let (fileCount, folderCount) = counts items
let (fileCount, folderCount) = counts items
printf "Total files: %d\n" fileCount
printf "Total files: %d\nTotal folders: %d\n" fileCount folderCount
printf "Total folders: %d\n" folderCount
printf "Total size: %s\n" $ displaySize $ totalBytes items
printf "Total size: %s\n" $ displaySize $ totalBytes items
putStrLn "\nDistribution:\n"
printf "\nDistribution:\n\n%9s <-> %9s %7s\n" "From" "To" "Count"
putStrLn $ replicate 46 '-'
printf "%5s <-> %4s %8s\n" "From" "To" "Count"
let results = expandGroups groupSize (fileSizes items) (groupThreshold fileCount)
putStrLn $ replicate 37 '-'
mapM_ (displayFrequency fileCount) $ Map.assocs results
let results = expandedGroups groupSize (sizes items) (groupThreshold fileCount) items
mapM_ (displayFrequency fileCount) results
where
where
groupThreshold = round . (*0.25) . realToFrac</syntaxhighlight>
sizes = sort . fileSizes
initialGroups n = filter ((>0) . snd) . frequencyGroups n . sizes
groupThreshold = round . (*0.25) . realToFrac
expandedGroups gsize sizes n = filter ((>0) . snd)
. expandGroups gsize sizes n
. initialGroups gsize</lang>
{{out}}
{{out}}
<pre style="height: 50rem;">$ filedist ~/Music
<pre style="height: 50rem;">$ filedist ~/Music
Using 4 worker threads
Using 4 worker threads
Total files: 688
Total files: 688
Total folders: 663
Total folders: 663
Total size: 986MB
Total size: 985.85MB


Distribution:
Distribution:


From <-> To Count
From <-> To Count
-------------------------------------
----------------------------------------------
0B <-> 80B = 7 1.017%: █
0B <-> 80B = 7 1.017%: █
81B <-> 161B = 74 10.756%: ███████████
81B <-> 161B = 74 10.756%: ███████████
162B <-> 242B = 112 16.279%: ████████████████
162B <-> 242B = 112 16.279%: ████████████████
243B <-> 323B = 99 14.390%: ██████████████
243B <-> 323B = 99 14.390%: ██████████████
322B <-> 643B = 23 3.343%: ███
323B <-> 645B = 23 3.343%: ███
644B <-> 965B = 2 0.291%:
646B <-> 968B = 2 0.291%:
966B <-> 1KB = 1 0.145%:
969B <-> 1.26KB = 1 0.145%:
3KB <-> 6KB = 12 1.744%: ██
3.19KB <-> 6.38KB = 12 1.744%: ██
6KB <-> 10KB = 22 3.198%: ███
6.38KB <-> 9.58KB = 22 3.198%: ███
10KB <-> 13KB = 12 1.744%: ██
9.58KB <-> 12.77KB = 12 1.744%: ██
14KB <-> 27KB = 15 2.180%: ██
13.52KB <-> 27.04KB = 15 2.180%: ██
27KB <-> 41KB = 6 0.872%: █
27.04KB <-> 40.57KB = 6 0.872%: █
41KB <-> 54KB = 22 3.198%: ███
40.57KB <-> 54.09KB = 22 3.198%: ███
54KB <-> 108KB = 99 14.390%: ██████████████
54.20KB <-> 108.41KB = 99 14.390%: ██████████████
108KB <-> 163KB = 23 3.343%: ███
108.41KB <-> 162.61KB = 23 3.343%: ███
163KB <-> 217KB = 8 1.163%: █
162.61KB <-> 216.81KB = 8 1.163%: █
236KB <-> 473KB = 3 0.436%:
236.46KB <-> 472.93KB = 3 0.436%:
709KB <-> 946KB = 44 6.395%: ██████
709.39KB <-> 945.85KB = 44 6.395%: ██████
3MB <-> 5MB = 4 0.581%: █
3.30MB <-> 4.96MB = 4 0.581%: █
5MB <-> 7MB = 21 3.052%: ███
4.96MB <-> 6.61MB = 21 3.052%: ███
7MB <-> 13MB = 72 10.465%: ██████████
6.67MB <-> 13.33MB = 72 10.465%: ██████████
13MB <-> 20MB = 6 0.872%: █
13.33MB <-> 20.00MB = 6 0.872%: █
20MB <-> 27MB = 1 0.145%:
20.00MB <-> 26.66MB = 1 0.145%:


$ filedist ~/Music 10
$ filedist ~/Music 10
Line 601: Line 953:
Total files: 688
Total files: 688
Total folders: 663
Total folders: 663
Total size: 986MB
Total size: 985.85MB


Distribution:
Distribution:


From <-> To Count
From <-> To Count
-------------------------------------
----------------------------------------------
0B <-> 88B = 7 1.017%: █
0B <-> 88B = 7 1.017%: █
89B <-> 177B = 75 10.901%: ███████████
89B <-> 177B = 75 10.901%: ███████████
178B <-> 266B = 156 22.674%: ███████████████████████
178B <-> 266B = 156 22.674%: ███████████████████████
267B <-> 355B = 57 8.285%: ████████
267B <-> 355B = 57 8.285%: ████████
356B <-> 444B = 20 2.907%: ███
356B <-> 444B = 20 2.907%: ███
801B <-> 889B = 2 0.291%:
801B <-> 889B = 2 0.291%:
959B <-> 2KB = 1 0.145%:
959B <-> 1.87KB = 1 0.145%:
4KB <-> 5KB = 1 0.145%:
3.75KB <-> 4.68KB = 1 0.145%:
5KB <-> 6KB = 1 0.145%:
4.68KB <-> 5.62KB = 1 0.145%:
6KB <-> 7KB = 11 1.599%: ██
5.62KB <-> 6.55KB = 11 1.599%: ██
7KB <-> 7KB = 10 1.453%: █
6.56KB <-> 7.49KB = 10 1.453%: █
7KB <-> 8KB = 4 0.581%: █
7.49KB <-> 8.43KB = 4 0.581%: █
8KB <-> 9KB = 7 1.017%: █
8.43KB <-> 9.36KB = 7 1.017%: █
9KB <-> 19KB = 21 3.052%: ███
9.43KB <-> 18.85KB = 21 3.052%: ███
19KB <-> 28KB = 6 0.872%: █
18.85KB <-> 28.28KB = 6 0.872%: █
28KB <-> 38KB = 4 0.581%: █
28.28KB <-> 37.71KB = 4 0.581%: █
38KB <-> 47KB = 12 1.744%: ██
37.71KB <-> 47.13KB = 12 1.744%: ██
47KB <-> 57KB = 16 2.326%: ██
47.13KB <-> 56.56KB = 16 2.326%: ██
57KB <-> 66KB = 23 3.343%: ███
56.56KB <-> 65.99KB = 23 3.343%: ███
66KB <-> 75KB = 26 3.779%: ████
65.99KB <-> 75.41KB = 26 3.779%: ████
75KB <-> 85KB = 15 2.180%: ██
75.41KB <-> 84.84KB = 15 2.180%: ██
85KB <-> 94KB = 17 2.471%: ██
84.84KB <-> 94.27KB = 17 2.471%: ██
95KB <-> 189KB = 42 6.105%: ██████
94.59KB <-> 189.17KB = 42 6.105%: ██████
189KB <-> 284KB = 4 0.581%: █
189.17KB <-> 283.76KB = 4 0.581%: █
284KB <-> 378KB = 2 0.291%:
283.76KB <-> 378.35KB = 2 0.291%:
851KB <-> 946KB = 44 6.395%: ██████
851.28KB <-> 945.87KB = 44 6.395%: ██████
3MB <-> 5MB = 5 0.727%: █
2.67MB <-> 5.33MB = 5 0.727%: █
5MB <-> 8MB = 41 5.959%: ██████
5.33MB <-> 8.00MB = 41 5.959%: ██████
8MB <-> 11MB = 35 5.087%: █████
8.00MB <-> 10.67MB = 35 5.087%: █████
11MB <-> 13MB = 16 2.326%: ██
10.67MB <-> 13.33MB = 16 2.326%: ██
13MB <-> 16MB = 3 0.436%:
13.33MB <-> 16.00MB = 3 0.436%:
16MB <-> 19MB = 3 0.436%:
16.00MB <-> 18.67MB = 3 0.436%:
24MB <-> 27MB = 1 0.145%:
24.00MB <-> 26.66MB = 1 0.145%:
</pre>

=={{header|J}}==

We can get file sizes of all files under a specific path by inspecting the last column from dirtree. For example, the sizes of the files under the user's home directory would be <tt>;{:|:dirtree '~'</tt>

From there, we can bucket them by factors of ten, then display the limiting size of each bucket along with the number of files contained (we'll sort them, for legibility):

<syntaxhighlight lang="j"> ((10x^~.),.#/.~) <.10 ^.1>. /:~;{:|:dirtree '~' NB. bucket = floor of log10 of each sorted size (sizes below 1 count as 1); columns: 10^bucket, number of files
1 2
10 8
100 37
1000 49
10000 20
100000 9
1000000 4
10000000 4</syntaxhighlight>

=={{header|Java}}==
<syntaxhighlight lang="java">

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.stream.Stream;

public final class FileSizeDistribution {

public static void main(String[] aArgs) throws IOException {
List<Path> fileNames = Files.list(Path.of("."))
.filter( file -> ! Files.isDirectory(file) )
.map(Path::getFileName)
.toList();
Map<Integer, Integer> fileSizes = new HashMap<Integer, Integer>();
for ( Path path : fileNames ) {
fileSizes.merge(String.valueOf(Files.size(path)).length(), 1, Integer::sum);
}
final int fileCount = fileSizes.values().stream().mapToInt(Integer::valueOf).sum();
System.out.println("File size distribution for directory \".\":" + System.lineSeparator());
System.out.println("File size in bytes | Number of files | Percentage");
System.out.println("-------------------------------------------------");
for ( int key : fileSizes.keySet() ) {
final int value = fileSizes.get(key);
System.out.println(String.format("%s%d%s%d%15d%15.1f%%",
" 10^", ( key - 1 ), " to 10^", key, value, ( 100.0 * value ) / fileCount));
}
}

}
</syntaxhighlight>
{{ out }}
<pre>
File size distribution for directory ".":

File size in bytes | Number of files | Percentage
-------------------------------------------------
10^0 to 10^1 1 0.2%
10^1 to 10^2 1 0.2%
10^2 to 10^3 5 1.1%
10^3 to 10^4 3 0.6%
10^4 to 10^5 161 34.0%
10^5 to 10^6 196 41.4%
10^6 to 10^7 98 20.7%
10^7 to 10^8 9 1.9%
</pre>

=={{header|jq}}==
'''Works with jq, the C implementation of jq'''

'''Works with gojq, the Go implementation of jq'''

'''Works with jaq, the Rust implementation of jq'''

This entry illustrates how jq plays nicely with other command-line
tools; in this case jc (https://kellyjonbrazil.github.io/jc) is used to JSONify the output of `ls -Rl`.

(jq could also be used to parse the raw output of `ls`, but it would no doubt
be tricky to achieve portability.)

The invocation of jc and jq would be along the following lines:
<pre>
jc --ls -lR | jq -c -f file-size-distribution.jq
</pre>

In the present case, the output from the call to `histogram` is a stream of [category, count] pairs
beginning with [0, _] showing the number of files of size 0; thereafter, the boundaries
of the categories are defined logarithmically, i.e. a file of size $n is assigned to
the category `1 + ($n | log10 | trunc)`.

The output shown below for an actual directory tree suggests a
unimodal distribution of file sizes.

<syntaxhighlight lang="jq">
# Bag of words: an object mapping the string form of every item emitted
# by `stream` to the number of times that item occurred.
def bow(stream):
  reduce stream as $item ({};
    ($item | tostring) as $key
    | .[$key] += 1);

# `stream` is expected to be a stream of non-negative numbers or numeric strings.
# Emits a stream of [bucket, count] pairs in ascending order of `bucket`.
# Only the bucket boundaries are sorted; the input items themselves are not.
def histogram(stream):
  [ bow(stream)
    | to_entries[]
    | [(.key | tonumber), .value] ]
  | sort_by(.[0])
  | .[];

# Map the .size of every element of the input array to a category:
# 0 for a size of 0, otherwise 1 + the truncated base-10 logarithm.
histogram(.[] | .size | if . == 0 then 0 else 1 + (log10 | trunc) end)
</syntaxhighlight>
{{output}}
<pre>
[0,9]
[1,67]
[2,616]
[3,6239]
[4,3679]
[5,213]
[6,56]
[7,40]
[8,20]
[9,4]
[10,1]
</pre>
</pre>


Line 645: Line 1,124:
{{works with|Julia|0.6}}
{{works with|Julia|0.6}}


<lang julia>using Humanize
<syntaxhighlight lang="julia">using Humanize


function sizelist(path::AbstractString)
function sizelist(path::AbstractString)
Line 671: Line 1,150:
end
end


main(".")</lang>
main(".")</syntaxhighlight>


{{out}}
{{out}}
<pre>filesizes:
<pre>filesizes:
- between 0.0 B and 1.0 B bytes: 0
- between 0.0 B and 1.0 B bytes: 0
- between 1.0 B and 10.0 B bytes: 1
- between 1.0 B and 10.0 B bytes: 1
Line 689: Line 1,168:


=={{header|Kotlin}}==
=={{header|Kotlin}}==
<lang scala>// version 1.2.10
<syntaxhighlight lang="scala">// version 1.2.10


import java.io.File
import java.io.File
Line 736: Line 1,215:
fun main(args: Array<String>) {
fun main(args: Array<String>) {
fileSizeDistribution("./") // current directory
fileSizeDistribution("./") // current directory
}</lang>
}</syntaxhighlight>


{{out}}
{{out}}
Line 761: Line 1,240:
Number of inaccessible files : 0
Number of inaccessible files : 0
</pre>
</pre>

=={{header|Lang}}==
{{libheader|lang-io-module}}
<syntaxhighlight lang="lang">
# Load the IO module
# Replace "<pathToIO.lm>" with the location where the io.lm Lang module was installed to without "<" and ">"
ln.loadModule(<pathToIO.lm>)


# Recursively tallies file sizes below $file.
# &sizes:       array of counters, indexed by the integer part of log10(size)
# $[totalSize]: pointer to the running byte total, updated through $*totalSize
# $file:        handle obtained from [[io]]::fp.openFile
fp.fileSizeDistribution = (&sizes, $[totalSize], $file) -> {
if([[io]]::fp.isDirectory($file)) {
&fileNames = [[io]]::fp.listFilesAndDirectories($file)
$path = [[io]]::fp.getCanonicalPath($file)
# NOTE(review): presumably avoids a doubled slash in $path/$fileName when the
# canonical path is the root directory - confirm that \e is the empty text
if($path == /) {
$path = \e
}
$fileName
foreach($[fileName], &fileNames) {
# Each entry is opened, processed with its own sub-total, then closed
$innerFile = [[io]]::fp.openFile($path/$fileName)
$innerTotalSize = 0L
fp.fileSizeDistribution(&sizes, $innerTotalSize, $innerFile)
$*totalSize += $innerTotalSize
[[io]]::fp.closeFile($innerFile)
}
}else {
$len = [[io]]::fp.getSize($file)
# Entries whose size is reported as null are left out of the statistics
if($len == null) {
return
}
$*totalSize += $len
# Empty files are counted in bucket 0 because log10(0) is not usable as an index
if($len == 0) {
&sizes[0] += 1
}else {
# NOTE(review): assumes fn.int truncates, which would put sizes 1..9 in
# bucket 0 together with empty files - confirm
$index = fn.int(fn.log10($len))
&sizes[$index] += 1
}
}
}

# Start path: the command-line argument when exactly one was given, else ./
# NOTE(review): assumes @&LANG_ARGS yields the number of arguments - confirm
$path $= @&LANG_ARGS == 1?&LANG_ARGS[0]:{{{./}}}

# Twelve counters, all starting at 0
&sizes = fn.arrayMake(12)
fn.arraySetAll(&sizes, 0)

$file = [[io]]::fp.openFile($path)

$totalSize = 0L

fp.fileSizeDistribution(&sizes, $totalSize, $file)

[[io]]::fp.closeFile($file)

# Report one line per counter, then the overall totals
fn.println(File size distribution for "$path":)
$i
repeat($[i], @&sizes) {
fn.printf(10 ^% 3d bytes: %d%n, $i, parser.op(&sizes[$i]))
}
fn.println(Number of files: fn.arrayReduce(&sizes, 0, fn.add))
fn.println(Total file size: $totalSize)
</syntaxhighlight>

=={{header|Mathematica}} / {{header|Wolfram Language}}==
<syntaxhighlight lang="mathematica">SetDirectory[NotebookDirectory[]];
Histogram[FileByteCount /@ Select[FileNames[__], DirectoryQ /* Not], {"Log", 15}, {"Log", "Count"}]</syntaxhighlight>

=={{header|Nim}}==
<syntaxhighlight lang="nim">import math, os, strformat

const
  MaxPower = 10
  Powers = [1, 10, 100]

func powerWithUnit(idx: int): string =
  ## Return the value 10^idx as text with a unit suffix (B, kB, MB or GB).
  ## A negative index stands for a size of zero and yields "0B".
  if idx < 0:
    return "0B"
  let unit =
    if idx < 3: "B"
    elif idx < 6: "kB"
    elif idx < 9: "MB"
    else: "GB"
  # Every unit spans three powers of ten; GB is the largest unit available.
  let offset = 3 * min(idx div 3, 3)
  result = $Powers[idx - offset] & unit


# Retrieve the directory path: first command-line argument, else the
# current directory.
var dirpath: string
if paramCount() == 0:
  dirpath = getCurrentDir()
else:
  dirpath = paramStr(1)
  if not dirExists(dirpath):
    raise newException(ValueError, "wrong directory path: " & dirpath)

# Distribute sizes. Index -1 counts empty files; index k counts files
# whose size lies in [10^k, 10^(k+1)).
var counts: array[-1..MaxPower, Natural]
for path in dirpath.walkDirRec():
  if not path.fileExists():
    continue    # Not a regular file.
  let size = getFileSize(path)
  # The number of decimal digits minus one is floor(log10(size)), computed
  # exactly. (`toInt` on the float logarithm rounds to nearest, which would
  # push e.g. a 500-byte file into the 1kB..10kB bucket.) Anything above the
  # last bucket is clamped into it rather than indexing out of range.
  let index = if size == 0: -1 else: min(len($size) - 1, MaxPower)
  inc counts[index]

# Display distribution.
let total = sum(counts)
echo "File size distribution for directory: ", dirpath
echo ""
for idx, count in counts:
  let rangeString = fmt"[{powerWithUnit(idx)}..{powerWithUnit(idx + 1)}[:"
  # Avoid printing "nan" when the directory contains no files.
  let percent = if total == 0: 0.0 else: 100 * count / total
  echo fmt"Size in {rangeString: 14} {count:>7} {percent:5.2f}%"
echo ""
echo "Total number of files: ", sum(counts)</syntaxhighlight>

{{out}}
<pre>File size distribution for directory: /home/xxx

Size in [0B..1B[: 2782 1.28%
Size in [1B..10B[: 145 0.07%
Size in [10B..100B[: 2828 1.30%
Size in [100B..1kB[: 20781 9.55%
Size in [1kB..10kB[: 85469 39.29%
Size in [10kB..100kB[: 86594 39.81%
Size in [100kB..1MB[: 16629 7.64%
Size in [1MB..10MB[: 2053 0.94%
Size in [10MB..100MB[: 221 0.10%
Size in [100MB..1GB[: 38 0.02%
Size in [1GB..10GB[: 0 0.00%
Size in [10GB..100GB[: 0 0.00%

Total number of files: 217540</pre>


=={{header|Perl}}==
=={{header|Perl}}==
{{trans|Raku}}
{{trans|Raku}}
<lang perl>use File::Find;
<syntaxhighlight lang="perl">use File::Find;
use List::Util qw(max);
use List::Util qw(max);


Line 791: Line 1,406:


sub fsize { $fsize{ log10( (lstat($_))[7] ) }++ }
sub fsize { $fsize{ log10( (lstat($_))[7] ) }++ }
sub log10 { my($s) = @_; $s ? int log($s)/log(10) : 0 }</lang>
sub log10 { my($s) = @_; $s ? int log($s)/log(10) : 0 }</syntaxhighlight>
{{out}}
{{out}}
<pre>File size distribution in bytes for directory: .
<pre>File size distribution in bytes for directory: .
Line 805: Line 1,420:
=={{header|Phix}}==
=={{header|Phix}}==
Works on Windows and Linux. Uses "proper" sizes, ie 1MB==1024KB. Can be quite slow at first, but is pretty fast on the second and subsequent runs, that is once the OS has cached its (low-level) directory reads.
Works on Windows and Linux. Uses "proper" sizes, ie 1MB==1024KB. Can be quite slow at first, but is pretty fast on the second and subsequent runs, that is once the OS has cached its (low-level) directory reads.
<!--<syntaxhighlight lang="phix">(notonline)-->
<lang Phix>sequence sizes = {1},
<span style="color: #008080;">without</span> <span style="color: #008080;">js</span> <span style="color: #000080;font-style:italic;">-- file i/o</span>
res = {0}
<span style="color: #004080;">sequence</span> <span style="color: #000000;">sizes</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">1</span><span style="color: #0000FF;">},</span>
atom t1 = time()+1
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">0</span><span style="color: #0000FF;">}</span>

<span style="color: #004080;">atom</span> <span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span>
function store_res(string filepath, sequence dir_entry)
if not find('d', dir_entry[D_ATTRIBUTES]) then
<span style="color: #008080;">function</span> <span style="color: #000000;">store_res</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">filepath</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">)</span>
atom size = dir_entry[D_SIZE]
<span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #008000;">'d'</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">[</span><span style="color: #004600;">D_ATTRIBUTES</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span>
integer sdx = 1
<span style="color: #004080;">atom</span> <span style="color: #000000;">size</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">[</span><span style="color: #004600;">D_SIZE</span><span style="color: #0000FF;">]</span>
while size>sizes[sdx] do
<span style="color: #004080;">integer</span> <span style="color: #000000;">sdx</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span>
if sdx=length(sizes) then
<span style="color: #008080;">while</span> <span style="color: #000000;">size</span><span style="color: #0000FF;">></span><span style="color: #000000;">sizes</span><span style="color: #0000FF;">[</span><span style="color: #000000;">sdx</span><span style="color: #0000FF;">]</span> <span style="color: #008080;">do</span>
sizes &= sizes[$]*iff(mod(length(sizes),3)?10:10.24)
<span style="color: #008080;">if</span> <span style="color: #000000;">sdx</span><span style="color: #0000FF;">=</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sizes</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
res &= 0
<span style="color: #000000;">sizes</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">sizes</span><span style="color: #0000FF;">[$]*</span><span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">mod</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sizes</span><span style="color: #0000FF;">),</span><span style="color: #000000;">3</span><span style="color: #0000FF;">)?</span><span style="color: #000000;">10</span><span style="color: #0000FF;">:</span><span style="color: #000000;">10.24</span><span style="color: #0000FF;">)</span>
end if
<span style="color: #000000;">res</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">0</span>
sdx += 1
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end while
<span style="color: #000000;">sdx</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
res[sdx] += 1
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
if time()>t1 then
<span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">sdx</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
printf(1,"%,d files found\r",sum(res))
<span style="color: #008080;">if</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()></span><span style="color: #000000;">t1</span> <span style="color: #008080;">then</span>
t1 = time()+1
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%,d files found\r"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">sum</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">))</span>
end if
<span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span>
end if
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
return 0 -- keep going
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end function
<span style="color: #008080;">return</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- keep going</span>
integer exit_code = walk_dir(".", routine_id("store_res"), true)
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>

<span style="color: #004080;">integer</span> <span style="color: #000000;">exit_code</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">walk_dir</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"."</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">store_res</span><span style="color: #0000FF;">,</span> <span style="color: #004600;">true</span><span style="color: #0000FF;">)</span>
printf(1,"%,d files found\n",sum(res))
integer w = max(res)
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%,d files found\n"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">sum</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">))</span>
include builtins/pfile.e
<span style="color: #004080;">integer</span> <span style="color: #000000;">w</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">max</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span>
for i=1 to length(res) do
<span style="color: #000080;font-style:italic;">--include builtins/pfile.e</span>
integer ri = res[i]
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
string s = file_size_k(sizes[i], 5),
<span style="color: #004080;">integer</span> <span style="color: #000000;">ri</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span>
p = repeat('*',floor(60*ri/w))
<span style="color: #004080;">string</span> <span style="color: #000000;">s</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">file_size_k</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sizes</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">5</span><span style="color: #0000FF;">),</span>
printf(1,"files < %s: %s%,d\n",{s,p,ri})
<span style="color: #000000;">p</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(</span><span style="color: #008000;">'*'</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">floor</span><span style="color: #0000FF;">(</span><span style="color: #000000;">60</span><span style="color: #0000FF;">*</span><span style="color: #000000;">ri</span><span style="color: #0000FF;">/</span><span style="color: #000000;">w</span><span style="color: #0000FF;">))</span>
end for</lang>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"files &lt; %s: %s%,d\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">s</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p</span><span style="color: #0000FF;">,</span><span style="color: #000000;">ri</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<!--</syntaxhighlight>-->
{{out}}
{{out}}
<pre>
<pre>
Line 858: Line 1,476:
The distribution is stored in a '''collections.Counter''' object (like a dictionary with automatic 0 value when a key is not found, useful when incrementing). Anything could be done with this object, here the number of files is printed for increasing sizes. No check is made during the directory walk: usually, safeguards would be needed or the program will fail on any unreadable file or directory (depending on rights, or too deep paths, for instance). Here links are skipped, so it should avoid cycles.
The distribution is stored in a '''collections.Counter''' object (like a dictionary with automatic 0 value when a key is not found, useful when incrementing). Anything could be done with this object, here the number of files is printed for increasing sizes. No check is made during the directory walk: usually, safeguards would be needed or the program will fail on any unreadable file or directory (depending on rights, or too deep paths, for instance). Here links are skipped, so it should avoid cycles.


<lang python>import sys, os
<syntaxhighlight lang="python">import sys, os
from collections import Counter
from collections import Counter


Line 881: Line 1,499:
for dir in arg:
for dir in arg:
dodir(dir)
dodir(dir)

s = n = 0
s = n = 0
for k, v in sorted(h.items()):
for k, v in sorted(h.items()):
Line 889: Line 1,507:
print("Total %d bytes for %d files" % (s, n))
print("Total %d bytes for %d files" % (s, n))


main(sys.argv[1:])</lang>
main(sys.argv[1:])</syntaxhighlight>


=={{header|Racket}}==
=={{header|Racket}}==


<lang racket>#lang racket
<syntaxhighlight lang="racket">#lang racket


(define (file-size-distribution (d (current-directory)) #:size-group-function (sgf values))
(define (file-size-distribution (d (current-directory)) #:size-group-function (sgf values))
Line 920: Line 1,538:
(module+ test
(module+ test
(call-with-values (λ () (file-size-distribution #:size-group-function log10-or-so))
(call-with-values (λ () (file-size-distribution #:size-group-function log10-or-so))
(report-fsd log10-or-so)))</lang>
(report-fsd log10-or-so)))</syntaxhighlight>


{{out}}
{{out}}
Line 938: Line 1,556:
By default, process the current and all readable sub-directories, or, pass in a directory path at the command line.
By default, process the current and all readable sub-directories, or, pass in a directory path at the command line.


<lang perl6>sub MAIN($dir = '.') {
<syntaxhighlight lang="raku" line>sub MAIN($dir = '.') {
sub log10 (Int $s) { $s ?? $s.log(10).Int !! 0 }
sub log10 (Int $s) { $s ?? $s.log(10).Int !! 0 }
my %fsize;
my %fsize;
Line 963: Line 1,581:
my ($end, $bar) = $scaled.polymod(8);
my ($end, $bar) = $scaled.polymod(8);
(@blocks[8] x $bar * 8) ~ (@blocks[$end] if $end) ~ "\n"
(@blocks[8] x $bar * 8) ~ (@blocks[$end] if $end) ~ "\n"
}</lang>
}</syntaxhighlight>


{{out}}
{{out}}
Line 991: Line 1,609:


=={{header|REXX}}==
=={{header|REXX}}==
This REXX version works for Microsoft Windows using the &nbsp; '''dir''' &nbsp; subcommand; &nbsp; extra code was added for
This REXX version works for Microsoft Windows using the &nbsp; '''dir''' &nbsp; subcommand; &nbsp; extra code was added for
<br>older versions of Windows that used suffixes to express big numbers &nbsp; (the size of a file), &nbsp; and also versions
<br>older versions of Windows that used suffixes to express big numbers &nbsp; (the size of a file), &nbsp; and also versions
<br>that used a mixed case for showing the output text.
<br>that used a mixed case for showing the output text.


Also, some Windows versions of the &nbsp; '''dir''' &nbsp; command insert commas into numbers, so code was added to elide them.
Also, some Windows versions of the &nbsp; '''dir''' &nbsp; command insert commas into numbers, so code was added to elide them.
<lang rexx>/*REXX program displays a histogram of filesize distribution of a directory structure(s)*/
<syntaxhighlight lang="rexx">/*REXX program displays a histogram of filesize distribution of a directory structure(s)*/
numeric digits 30 /*ensure enough decimal digits for a #.*/
numeric digits 30 /*ensure enough decimal digits for a #.*/
parse arg ds . /*obtain optional argument from the CL.*/
parse arg ds . /*obtain optional argument from the CL.*/
Line 1,041: Line 1,659:
exit /*stick a fork in it, we're all done. */
exit /*stick a fork in it, we're all done. */
/*──────────────────────────────────────────────────────────────────────────────────────*/
/*──────────────────────────────────────────────────────────────────────────────────────*/
commas: parse arg _; do j#=length(_)-3 to 1 by -3; _=insert(',', _, j#); end; return _</lang>
commas: parse arg _; do j#=length(_)-3 to 1 by -3; _=insert(',', _, j#); end; return _</syntaxhighlight>
This REXX program makes use of &nbsp; '''LINESIZE''' &nbsp; REXX program (or BIF) which is used to determine the screen width (or linesize) of the terminal (console) so as to maximize the width of the histogram.
This REXX program makes use of &nbsp; '''LINESIZE''' &nbsp; REXX program (or BIF) which is used to determine the screen width (or linesize) of the terminal (console) so as to maximize the width of the histogram.


The &nbsp; '''LINESIZE.REX''' &nbsp; REXX program is included here &nbsp; ──► &nbsp; [[LINESIZE.REX]].<br>
The &nbsp; '''LINESIZE.REX''' &nbsp; REXX program is included here &nbsp; ──► &nbsp; [[LINESIZE.REX]].<br>
Line 1,106: Line 1,724:
{{libheader|walkdir}}
{{libheader|walkdir}}
{{works with|Rust|2018}}
{{works with|Rust|2018}}
<lang rust>
<syntaxhighlight lang="rust">
use std::error::Error;
use std::error::Error;
use std::marker::PhantomData;
use std::marker::PhantomData;
Line 1,283: Line 1,901:
}
}
}
}
</syntaxhighlight>
</lang>
{{out}}
{{out}}
<pre>
<pre>
Line 1,303: Line 1,921:


=={{header|Sidef}}==
=={{header|Sidef}}==
<lang ruby>func traverse(Block callback, Dir dir) {
<syntaxhighlight lang="ruby">func traverse(Block callback, Dir dir) {
dir.open(\var dir_h) || return nil
dir.open(\var dir_h) || return nil
 
 
Line 1,331: Line 1,949:
}
}


say "Total: #{total_size} bytes in #{files_num} files"</lang>
say "Total: #{total_size} bytes in #{files_num} files"</syntaxhighlight>
{{out}}
{{out}}
<pre>
<pre>
Line 1,344: Line 1,962:
log10(size) ~~ 8 -> 2 files
log10(size) ~~ 8 -> 2 files
Total: 370026462 bytes in 2650 files
Total: 370026462 bytes in 2650 files
</pre>

=={{header|Tcl}}==
This uses the '''fileutil::traverse''' package from Tcllib to do the tree walking; a '''glob'''-based alternative that ignores links but not hidden files is possible, but would add about a dozen lines.
<syntaxhighlight lang="tcl">package require fileutil::traverse
namespace path {::tcl::mathfunc ::tcl::mathop}

# Ternary helper: runs [if] on test in the caller's context and returns
# the substituted form of a when test is true, of b otherwise.
proc ? {test a b} {tailcall if $test [list subst $a] [list subst $b]}

# Start from the first command-line argument when one was given, else from "."
set dir [? {$argc} {[lindex $argv 0]} .]
# -prefilter rejects paths whose file type is "link";
# -filter accepts only paths whose file type is "file".
fileutil::traverse Tobj $dir \
-prefilter {apply {path {ne [file type $path] link}}} \
-filter {apply {path {eq [file type $path] file}}}
Tobj foreach path {
set size [file size $path]
# Histogram key: integer part of log10(size), or -1 for an empty file
dict incr hist [? {$size} {[int [log10 $size]]} -1]
}
Tobj destroy

foreach key [lsort -int [dict keys $hist]] {
puts "[? {$key == -1} 0 {1e$key}]\t[dict get $hist $key]"
}</syntaxhighlight>
{{out}}
<pre>0 1
1e1 339
1e2 3142
1e3 2015
1e4 150
1e5 29
1e6 13
1e7 3</pre>

=={{header|UNIX Shell}}==
{{works with|Bourne Shell}}
Use POSIX conformant code unless the environment variable GNU is set to anything not empty.
<syntaxhighlight lang="sh">#!/bin/sh
set -eu

tabs -8
if [ ${GNU:-} ]
then
find -- "${1:-.}" -type f -exec du -b -- {} +
else
# Use a subshell to remove the last "total" line per each ARG_MAX
find -- "${1:-.}" -type f -exec sh -c 'wc -c -- "$@" | sed \$d' argv0 {} +
fi | awk -vOFS='\t' '
BEGIN {split("KB MB GB TB PB", u); u[0] = "B"}
{
++hist[$1 ? length($1) - 1 : -1]
total += $1
}
END {
max = -2
for (i in hist)
max = (i > max ? i : max)

print "From", "To", "Count\n"
for (i = -1; i <= max; ++i)
{
if (i in hist)
{
if (i == -1)
print "0B", "0B", hist[i]
else
print 10 ** (i % 3) u[int(i / 3)],
10 ** ((i + 1) % 3) u[int((i + 1) / 3)],
hist[i]
}
}
l = length(total) - 1
printf "\nTotal: %.1f %s in %d files\n",
total / (10 ** l), u[int(l / 3)], NR
}'</syntaxhighlight>
{{out}}
<pre>$ time ~/fsd.sh
From To Count

0B 0B 13
1B 10B 74
10B 100B 269
100B 1KB 5894
1KB 10KB 12727
10KB 100KB 12755
100KB 1MB 110922
1MB 10MB 50019
10MB 100MB 17706
100MB 1GB 5056
1GB 10GB 1139
10GB 100GB 141
100GB 1TB 1

Total: 8.9 TB in 216716 files
~/fsd.sh 1.28s user 2.55s system 134% cpu 2.842 total
$ time GNU=1 ~/fsd.sh
From To Count

0B 0B 13
1B 10B 74
10B 100B 269
100B 1KB 5894
1KB 10KB 12727
10KB 100KB 12755
100KB 1MB 110922
1MB 10MB 50019
10MB 100MB 17706
100MB 1GB 5056
1GB 10GB 1139
10GB 100GB 141
100GB 1TB 1

Total: 8.9 TB in 216716 files
GNU=1 ~/fsd.sh 0.81s user 1.33s system 135% cpu 1.586 total</pre>

=={{header|Wren}}==
{{libheader|Wren-math}}
{{libheader|Wren-fmt}}
<syntaxhighlight lang="wren">import "io" for Directory, File, Stat
import "os" for Process
import "./math" for Math
import "./fmt" for Fmt

var sizes = List.filled(12, 0)
var totalSize = 0
var numFiles = 0
var numDirs = 0

// Recursively tallies every regular file below 'path'.
// Updates the module-level counters: sizes[d] counts files whose size has
// d decimal digits (so size < 10^d, with empty files in sizes[0]),
// totalSize accumulates bytes, numFiles and numDirs count entries.
var fileSizeDist // recursive function
fileSizeDist = Fn.new { |path|
    var files = Directory.list(path)
    for (file in files) {
        var path2 = "%(path)/%(file)"
        var stat = Stat.path(path2)
        if (stat.isFile) {
            numFiles = numFiles + 1
            var size = stat.size
            totalSize = totalSize + size
            // Count decimal digits by repeated division instead of taking a
            // floating-point logarithm, which can be inexact at powers of ten.
            var index = 0
            var rest = size
            while (rest >= 1) {
                index = index + 1
                rest = (rest / 10).floor
            }
            // Files too large for the table go into the last bucket rather
            // than indexing past the end of the list.
            if (index >= sizes.count) index = sizes.count - 1
            sizes[index] = sizes[index] + 1
        } else if (stat.isDirectory) {
            numDirs = numDirs + 1
            fileSizeDist.call(path2)
        }
    }
}

var args = Process.arguments
var path = (args.count == 0) ? "./" : args[0]
if (!Directory.exists(path)) Fiber.abort("Path does not exist or is not a directory.")
fileSizeDist.call(path)

System.print("File size distribution for '%(path)' :-\n")
for (i in 0...sizes.count) {
System.write((i == 0) ? " " : "+ ")
Fmt.print("Files less than 10 ^ $-2d bytes : $,5d", i, sizes[i])
}
System.print(" -----")
Fmt.print("= Number of files : $,5d", numFiles)
Fmt.print(" Total size in bytes : $,d", totalSize)
Fmt.print(" Number of sub-directories : $,5d", numDirs)</syntaxhighlight>

{{out}}
<pre>
File size distribution for './' :-

Files less than 10 ^ 0 bytes : 4
+ Files less than 10 ^ 1 bytes : 2
+ Files less than 10 ^ 2 bytes : 135
+ Files less than 10 ^ 3 bytes : 946
+ Files less than 10 ^ 4 bytes : 746
+ Files less than 10 ^ 5 bytes : 79
+ Files less than 10 ^ 6 bytes : 11
+ Files less than 10 ^ 7 bytes : 3
+ Files less than 10 ^ 8 bytes : 0
+ Files less than 10 ^ 9 bytes : 0
+ Files less than 10 ^ 10 bytes : 0
+ Files less than 10 ^ 11 bytes : 0
-----
= Number of files : 1,926
Total size in bytes : 12,683,455
Number of sub-directories : 3
</pre>
</pre>


=={{header|zkl}}==
=={{header|zkl}}==
<lang zkl>pipe:=Thread.Pipe();
<syntaxhighlight lang="zkl">pipe:=Thread.Pipe();
// hoover all files in tree, don't return directories
// hoover all files in tree, don't return directories
fcn(pipe,dir){ File.globular(dir,"*",True,8,pipe); }
fcn(pipe,dir){ File.globular(dir,"*",True,8,pipe); }
Line 1,365: Line 2,169:
println("%15s : %s".fmt(szchrs[idx,*], "*"*(scale*cnt).round().toInt()));
println("%15s : %s".fmt(szchrs[idx,*], "*"*(scale*cnt).round().toInt()));
idx-=1 + comma();
idx-=1 + comma();
}</lang>
}</syntaxhighlight>
{{out}}
{{out}}
<pre>
<pre>
Line 1,382: Line 2,186:
Found 4320 files, 67,627,849,052 bytes, 15,654,594 mean.
Found 4320 files, 67,627,849,052 bytes, 15,654,594 mean.
File size Number of files (* = 69.84)
File size Number of files (* = 69.84)
n :
n :
nn :
nn :
nnn :
nnn :
n,nnn : *
n,nnn : *
nn,nnn :
nn,nnn :
nnn,nnn :
nnn,nnn :
n,nnn,nnn : *
n,nnn,nnn : *
nn,nnn,nnn : **************************************************
nn,nnn,nnn : **************************************************

Latest revision as of 04:58, 23 April 2024

Task
File size distribution
You are encouraged to solve this task according to the task description, using any language you may know.
Task

Beginning from the current directory, or optionally from a directory specified as a command-line argument, determine how many files there are of various sizes in a directory hierarchy.


My suggestion is to sort by logarithm of file size, since a few bytes here or there, or even a factor of two or three, may not be that significant.

Don't forget that empty files may exist, to serve as a marker.


Is your file system predominantly devoted to a large number of smaller files, or a smaller number of huge files?

Action!

DOS 2.5 returns file size in number of sectors.

INCLUDE "D2:PRINTF.ACT" ;from the Action! Tool Kit

; Reads the directory listing selected by filter and counts its entries
; into size buckets: counts(i) is incremented for the first i whose
; limits(i) is greater than the entry size. An entry whose size is not
; below any limit is not counted. count is the number of buckets.
PROC SizeDistribution(CHAR ARRAY filter INT ARRAY limits,counts BYTE count)
  CHAR ARRAY line(255),tmp(4)
  INT size
  BYTE i,dev=[1]

  ; start with all buckets empty
  FOR i=0 TO count-1
  DO
    counts(i)=0
  OD

  Close(dev)
  Open(dev,filter,6)
  DO
    InputSD(dev,line)
    ; a line of length 0 ends the listing
    IF line(0)=0 THEN
      EXIT
    FI
    ; NOTE(review): the size is taken from the trailing characters of the
    ; line, presumably the sector count of the entry - confirm
    SCopyS(tmp,line,line(0)-3,line(0))
    size=ValI(tmp)
    FOR i=0 TO count-1
    DO
      IF size<limits(i) THEN
        counts(i)==+1
        EXIT
      FI
    OD
  OD
  Close(dev)
RETURN

; Fills limits(0..count-1) with bucket upper bounds that double at each
; step (1, 2, 4, ...); any value above 1000 is replaced by 1000.
PROC GenerateLimits(INT ARRAY limits BYTE count)
  BYTE i
  INT l

  l=1
  FOR i=0 TO count-1
  DO
    limits(i)=l
    ; double the bound for the next bucket, capped at 1000
    l==LSH 1
    IF l>1000 THEN l=1000 FI
  OD
RETURN

; Prints a horizontal bar for the value len, scaled so that a value equal
; to max is size character cells wide. The bar length is computed in
; quarter cells.
; NOTE(review): max is used as a divisor; callers presumably never pass 0
PROC PrintBar(INT len,max,size)
  INT i,count

  count=4*len*size/max
  ; a non-zero value always gets at least one quarter cell
  IF count=0 AND len>0 THEN
    count=1
  FI
  ; one character 160 per whole cell
  FOR i=0 TO count/4-1
  DO
    Put(160)
  OD
  ; remaining quarter cells select one of three characters
  ; (presumably partial-block glyphs - confirm)
  i=count MOD 4
  IF i=1 THEN Put(22)
  ELSEIF i=2 THEN Put(25)
  ELSEIF i=3 THEN Put(130) FI
RETURN

; Prints the distribution as a table: for each bucket its lower bound,
; its upper bound (limits(i)-1), the number of files, the percentage of
; the total and a bar scaled against the fullest bucket.
; NOTE(review): total is used as a divisor, so an empty listing is
; presumably not expected here - confirm
PROC PrintResult(CHAR ARRAY filter
  INT ARRAY limits,counts BYTE count)

  BYTE i
  CHAR ARRAY tmp(5)
  INT min,max,total

  ; total number of files and the largest single bucket
  total=0 max=0
  FOR i=0 TO count-1
  DO
    total==+counts(i)
    IF counts(i)>max THEN
      max=counts(i)
    FI
  OD
  PrintF("File size distribution of ""%S"" in sectors:%E",filter) PutE()
  PrintE("From  To Count Perc")
  min=0
  FOR i=0 TO count-1
  DO
    ; numbers are converted to strings first so that %S can pad them
    StrI(min,tmp) PrintF("%4S ",tmp)
    StrI(limits(i)-1,tmp) PrintF("%3S   ",tmp)
    StrI(counts(i),tmp) PrintF("%3S ",tmp)
    StrI(counts(i)*100/total,tmp) PrintF("%3S%% ",tmp)
    PrintBar(counts(i),max,17) PutE()
    ; the next bucket starts where this one ends
    min=limits(i)
  OD
RETURN

; Entry point: builds LIMITCOUNT bucket limits, counts the files matching
; the filter "H1:*.*" into them and prints the resulting table.
PROC Main()
  DEFINE LIMITCOUNT="11"
  CHAR ARRAY filter="H1:*.*"
  INT ARRAY limits(LIMITCOUNT),counts(LIMITCOUNT)

  Put(125) PutE() ;clear the screen
  GenerateLimits(limits,LIMITCOUNT)
  SizeDistribution(filter,limits,counts,LIMITCOUNT)
  PrintResult(filter,limits,counts,LIMITCOUNT)
RETURN
Output:

Screenshot from Atari 8-bit computer

File size distribution of "H1:*.*" in sectors:

From  To Count Perc
  0   0     2   0% ▌
  1   1    20   3% █▌
  2   3    44   8% ███▌
  4   7   195  37% █████████████████
  8  15   183  35% ███████████████▌
 16  31    67  12% █████▌
 32  63     6   1% ▌
 64 127     0   0%
128 255     0   0%
256 511     0   0%
512 999     1   0% ▌

Ada

Library: Dir_Iterators
with Ada.Numerics.Elementary_Functions;
with Ada.Directories;    use Ada.Directories;
with Ada.Strings.Fixed;  use Ada.Strings;
with Ada.Command_Line;   use Ada.Command_Line;
with Ada.Text_IO;        use Ada.Text_IO;

with Dir_Iterators.Recursive;

--  Counts the ordinary files below a directory (first command-line
--  argument, "." by default) by number of decimal digits in their size and
--  prints one "Less than 10**N" line per bucket up to the last non-empty one.
procedure File_Size_Distribution is

   --  Bucket N holds the files whose size has N decimal digits, i.e. the
   --  files smaller than 10**N that did not fit a lower bucket; empty files
   --  have zero digits and land in bucket 0. 19 digits cover 2**63 - 1.
   type Exponent_Type is range 0 .. 19;
   type File_Count    is range 0 .. Long_Integer'Last;
   Counts         : array (Exponent_Type) of File_Count := (others => 0);
   Non_Zero_Index : Exponent_Type   := 0;
   Directory_Name : constant String := (if Argument_Count = 0
                                        then "."
                                        else Argument (1));
begin
   if not Exists (Directory_Name) or else Kind (Directory_Name) /= Directory then
      Put_Line ("Directory does not exist");
      return;
   end if;

   --  The walker is only created once the directory is known to exist.
   declare
      Directory_Walker : Dir_Iterators.Recursive.Recursive_Dir_Walk
        := Dir_Iterators.Recursive.Walk (Directory_Name);
   begin
      for Directory_Entry of Directory_Walker loop
         if Kind (Directory_Entry) = Ordinary_File then
            declare
               Remaining : File_Size     := Size (Directory_Entry);
               Exponent  : Exponent_Type := 0;
            begin
               --  Count digits with integer arithmetic; a single-precision
               --  logarithm is inexact at powers of ten, and its ceiling
               --  files sizes 1, 10, 100, ... one bucket too low.
               while Remaining > 0 and then Exponent < Exponent_Type'Last loop
                  Exponent  := Exponent + 1;
                  Remaining := Remaining / 10;
               end loop;
               Counts (Exponent) := Counts (Exponent) + 1;
            end;
         end if;
      end loop;
   end;

   --  Find the highest bucket that is in use so trailing zero rows are omitted.
   for I in reverse Counts'Range loop
      if Counts (I) /= 0 then
         Non_Zero_Index := I;
         exit;
      end if;
   end loop;

   for I in Counts'First .. Non_Zero_Index loop
      Put ("Less than 10**");
      Put (Fixed.Trim (Exponent_Type'Image (I), Side => Left));
      Put (": ");
      Put (File_Count'Image (Counts (I)));
      New_Line;
   end loop;
end File_Size_Distribution;
Output:
Less than 10**0:  8
Less than 10**1:  0
Less than 10**2:  18
Less than 10**3:  88
Less than 10**4:  39
Less than 10**5:  8
Less than 10**6:  2
Less than 10**7:  1

C

The platform independent way to get the file size in C involves opening every file and reading the size. The implementation below works for Windows and utilizes command scripts to get size information quickly even for a large number of files, recursively traversing a large number of directories. Both textual and graphical ( ASCII ) outputs are shown. The same can be done for Linux by a combination of the find, ls and stat commands and my plan was to make it work on both OS types, but I don't have access to a Linux system right now. This would also mean either abandoning scaling the graphical output in order to fit the console buffer or porting that as well, thus including windows.h selectively.

Windows

#include<windows.h>
#include<string.h>
#include<stdio.h>

#define MAXORDER 25

/*
 * Counts files by order of magnitude of their size, below a starting
 * directory, and prints the counts as text or as a bar graph.
 *
 * argV[1] : directory to start from ("." for the current directory).
 * argV[2] : optional, 't'/'T' for text output (default) or 'g'/'G' for a
 *           bar graph scaled to the console buffer width.
 *
 * Sizes are obtained by running the Windows `forfiles` command and reading
 * one size per output line. Bucket i holds the files whose size has i
 * decimal digits (i.e. size < 10^i); bucket 0 holds the zero-sized entries.
 * Returns 0, or 1 if the command could not be built or started.
 */
int main(int argC, char* argV[])
{
	char str[100],commandString[1000];
	long int fileSizeLog[MAXORDER] = {0},max;
	int i,j,written;
	size_t len;
	double scale;
	FILE* fp;

	if(argC==1){
		printf("Usage : %s <followed by directory to start search from(. for current dir), followed by \n optional parameters (T or G) to show text or graph output>",argV[0]);
		return 0;
	}

	if(strlen(argV[1])==1 && argV[1][0]=='.'){
		strcpy(commandString,"forfiles /s /c \"cmd /c echo @fsize\" 2>&1");
		written = (int)strlen(commandString);
	}
	/* A path containing spaces must be quoted for the command interpreter. */
	else if(strchr(argV[1],' ')!=NULL)
		written = snprintf(commandString,sizeof commandString,"forfiles /p \"%s\" /s /c \"cmd /c echo @fsize\" 2>&1",argV[1]);
	else
		written = snprintf(commandString,sizeof commandString,"forfiles /p %s /s /c \"cmd /c echo @fsize\" 2>&1",argV[1]);

	if(written<0 || (size_t)written>=sizeof commandString){
		fprintf(stderr,"Path is too long.\n");
		return 1;
	}

	fp = popen(commandString,"r");
	if(fp==NULL){
		perror("popen");
		return 1;
	}

	while(fgets(str,sizeof str,fp)!=NULL){
		/* Length of the line without its terminator. */
		len = strcspn(str,"\r\n");

		/* Skip blank lines and anything that is not a plain number
		   (stderr is merged into the stream, so messages can appear). */
		if(len==0 || strspn(str,"0123456789")!=len)
			continue;

		if(str[0]=='0')
			fileSizeLog[0]++;
		else
			fileSizeLog[len<MAXORDER ? len : MAXORDER-1]++;
	}

	pclose(fp);

	if(argC==2 || (argC==3 && (argV[2][0]=='t'||argV[2][0]=='T'))){
		for(i=0;i<MAXORDER;i++){
			printf("\nSize Order < 10^%2d bytes : %ld",i,fileSizeLog[i]);
		}
	}

	else if(argC==3 && (argV[2][0]=='g'||argV[2][0]=='G')){
		CONSOLE_SCREEN_BUFFER_INFO csbi;

		if(GetConsoleScreenBufferInfo(GetStdHandle( STD_OUTPUT_HANDLE ),&csbi)){
			max = fileSizeLog[0];

			for(i=1;i<MAXORDER;i++)
				if(fileSizeLog[i]>max)
					max = fileSizeLog[i];

			/* Only shrink the bars when the largest one would not fit. */
			if(max < csbi.dwSize.X)
				scale = 1;
			else
				scale = (1.0*(csbi.dwSize.X-50))/max;

			for(i=0;i<MAXORDER;i++){
				printf("\nSize Order < 10^%2d bytes |",i);
				for(j=0;j<(int)(scale*fileSizeLog[i]);j++)
					printf("%c",219);
				printf("%ld",fileSizeLog[i]);
			}
		}
	}

	return 0;
}

Invocation and textual output :

C:\My Projects\threeJS>fileSize.exe "C:\My Projects" t

Size Order < 10^ 0 bytes : 1770
Size Order < 10^ 1 bytes : 1
Size Order < 10^ 2 bytes : 20
Size Order < 10^ 3 bytes : 219
Size Order < 10^ 4 bytes : 1793
Size Order < 10^ 5 bytes : 1832
Size Order < 10^ 6 bytes : 631
Size Order < 10^ 7 bytes : 124
Size Order < 10^ 8 bytes : 26
Size Order < 10^ 9 bytes : 0
Size Order < 10^10 bytes : 0
Size Order < 10^11 bytes : 0
Size Order < 10^12 bytes : 0
Size Order < 10^13 bytes : 0
Size Order < 10^14 bytes : 0
Size Order < 10^15 bytes : 0
Size Order < 10^16 bytes : 0
Size Order < 10^17 bytes : 0
Size Order < 10^18 bytes : 0
Size Order < 10^19 bytes : 0
Size Order < 10^20 bytes : 0
Size Order < 10^21 bytes : 0
Size Order < 10^22 bytes : 0
Size Order < 10^23 bytes : 0
Size Order < 10^24 bytes : 0

Invocation and graphical output :

C:\My Projects\threeJS>fileSize.exe "C:\My Projects" g

Size Order < 10^ 0 bytes |█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████1770
Size Order < 10^ 1 bytes |1
Size Order < 10^ 2 bytes |██20
Size Order < 10^ 3 bytes |█████████████████████████████219
Size Order < 10^ 4 bytes |████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████1793
Size Order < 10^ 5 bytes |██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████1832
Size Order < 10^ 6 bytes |██████████████████████████████████████████████████████████████████████████████████████631
Size Order < 10^ 7 bytes |████████████████124
Size Order < 10^ 8 bytes |███26
Size Order < 10^ 9 bytes |0
Size Order < 10^10 bytes |0
Size Order < 10^11 bytes |0
Size Order < 10^12 bytes |0
Size Order < 10^13 bytes |0
Size Order < 10^14 bytes |0
Size Order < 10^15 bytes |0
Size Order < 10^16 bytes |0
Size Order < 10^17 bytes |0
Size Order < 10^18 bytes |0
Size Order < 10^19 bytes |0
Size Order < 10^20 bytes |0
Size Order < 10^21 bytes |0
Size Order < 10^22 bytes |0
Size Order < 10^23 bytes |0
Size Order < 10^24 bytes |0

Note that it is possible to track files up to 10^24 (Yottabyte) in size with this implementation, but if you have a file that large, you shouldn't be needing such programs. :)

POSIX

Library: POSIX

This works on macOS 10.15. It should be OK for Linux as well.

#include <ftw.h>
#include <locale.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Upper bounds (inclusive, in bytes) of the size buckets, in ascending order. */
static const uintmax_t sizes[] = {
    0, 1000, 10000, 100000, 1000000, 10000000,
    100000000, 1000000000, 10000000000
};
/* Number of bounds. An enumeration constant (unlike a const object) is an
   integer constant expression, so it may size a file-scope array in ISO C. */
enum { nsizes = sizeof(sizes)/sizeof(sizes[0]) };
/* count[i] is the number of files in bucket i; the extra last slot holds
   files larger than the biggest bound. */
static uintmax_t count[nsizes + 1] = { 0 };
/* Number of regular files seen so far. */
static uintmax_t files = 0;
/* Sum of the sizes of all regular files seen so far, in bytes. */
static uintmax_t total_size = 0;

/*
 * ftw() visitor. For a regular file, adds it to the running totals and to
 * the first bucket whose bound is not smaller than the file size (or to the
 * overflow bucket past the last bound). For an unreadable directory, reports
 * it on stderr. Always returns 0 so the walk continues.
 */
static int callback(const char* file, const struct stat* sp, int flag) {
    switch (flag) {
    case FTW_F: {
        const uintmax_t bytes = sp->st_size;
        size_t bucket = 0;

        files += 1;
        total_size += bytes;
        while (bucket < nsizes && sizes[bucket] < bytes)
            bucket += 1;
        count[bucket] += 1;
        break;
    }
    case FTW_DNR:
        fprintf(stderr, "Cannot read directory %s.\n", file);
        break;
    default:
        break;
    }
    return 0;
}

/*
 * Walks the directory given as the first argument (default: the current
 * directory) and prints the number of files per size bucket, the number of
 * files and their total size. Returns EXIT_FAILURE if the walk fails.
 */
int main(int argc, char** argv) {
    /* Pick up the user's locale so the ' flag can group digits. */
    setlocale(LC_ALL, "");
    const char* directory = argc > 1 ? argv[1] : ".";
    if (ftw(directory, callback, 512) != 0) {
        perror(directory);
        return EXIT_FAILURE;
    }
    printf("File size distribution for '%s':\n", directory);
    /* All values are uintmax_t, whose length modifier is 'j' (not 'l'). */
    for (size_t i = 0; i <= nsizes; ++i) {
        if (i == nsizes)
            printf("> %'ju", sizes[i - 1]);
        else
            printf("%'16ju", sizes[i]);
        printf(" bytes: %'ju\n", count[i]);
    }
    printf("Number of files: %'ju\n", files);
    printf("Total file size: %'ju\n", total_size);
    return EXIT_SUCCESS;
}
Output:
File size distribution for '.':
               0 bytes: 0
           1,000 bytes: 3
          10,000 bytes: 111
         100,000 bytes: 2,457
       1,000,000 bytes: 2,645
      10,000,000 bytes: 2,483
     100,000,000 bytes: 172
   1,000,000,000 bytes: 3
  10,000,000,000 bytes: 0
> 10,000,000,000 bytes: 0
Number of files: 7,874
Total file size: 11,963,566,673

C++

#include <algorithm>
#include <array>
#include <filesystem>
#include <iomanip>
#include <iostream>

// Recursively scans `directory` and prints, to std::cout, how many regular
// (non-symlink) files fall into each size bucket, followed by the file count
// and the total size in bytes.
//
// A file goes into the first bucket whose bound is >= its size; files larger
// than the last bound go into the extra "> bound" bucket. Subdirectories that
// cannot be opened for lack of permission are skipped instead of aborting the
// whole scan. Other filesystem errors still propagate as exceptions.
void file_size_distribution(const std::filesystem::path& directory) {
    constexpr size_t n = 9;
    constexpr std::array<std::uintmax_t, n> sizes = { 0, 1000, 10000,
        100000, 1000000, 10000000, 100000000, 1000000000, 10000000000 };
    std::array<size_t, n + 1> count = { 0 };
    size_t files = 0;
    std::uintmax_t total_size = 0;
    std::filesystem::recursive_directory_iterator iter(directory,
        std::filesystem::directory_options::skip_permission_denied);
    for (const auto& dir_entry : iter) {
        if (!dir_entry.is_regular_file() || dir_entry.is_symlink())
            continue;
        const std::uintmax_t file_size = dir_entry.file_size();
        total_size += file_size;
        // lower_bound yields sizes.end() (index n) for oversized files.
        const auto bound = std::lower_bound(sizes.begin(), sizes.end(), file_size);
        ++count[std::distance(sizes.begin(), bound)];
        ++files;
    }
    std::cout << "File size distribution for " << directory << ":\n";
    for (size_t i = 0; i <= n; ++i) {
        if (i == n)
            std::cout << "> " << sizes[i - 1];
        else
            std::cout << std::setw(16) << sizes[i];
        std::cout << " bytes: " << count[i] << '\n';
    }
    std::cout << "Number of files: " << files << '\n';
    std::cout << "Total file size: " << total_size << " bytes\n";
}

// Entry point. Reports the file size distribution of the directory named by
// the first command-line argument, or of the current directory if none is
// given. Returns EXIT_FAILURE when the path is not a directory or the scan
// throws.
int main(int argc, char** argv) {
    // Digit grouping is cosmetic: if the environment names a locale that is
    // not installed, std::locale("") throws, so fall back to the default
    // locale instead of terminating on an uncaught exception.
    try {
        std::cout.imbue(std::locale(""));
    } catch (const std::exception&) {
    }
    try {
        const char* directory(argc > 1 ? argv[1] : ".");
        std::filesystem::path path(directory);
        if (!is_directory(path)) {
            std::cerr << directory << " is not a directory.\n";
            return EXIT_FAILURE;
        }
        file_size_distribution(path);
    } catch (const std::exception& ex) {
        std::cerr << ex.what() << '\n';
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
Output:
File size distribution for ".":
               0 bytes: 0
           1,000 bytes: 3
          10,000 bytes: 111
         100,000 bytes: 2,457
       1,000,000 bytes: 2,645
      10,000,000 bytes: 2,483
     100,000,000 bytes: 172
   1,000,000,000 bytes: 3
  10,000,000,000 bytes: 0
> 10,000,000,000 bytes: 0
Number of files: 7,874
Total file size: 11,963,566,673 bytes

Delphi

Library: System.Math
Translation of: Go
program File_size_distribution;

{$APPTYPE CONSOLE}

uses
  System.SysUtils,
  System.Math,
  Winapi.Windows;

/// Formats n in decimal with a comma between each group of three digits,
/// keeping a leading '-' for negative values.
function Commatize(n: Int64): string;
var
  digits: string;
  cut: Integer;
begin
  digits := n.ToString;
  // Work on the bare digits; the sign is put back at the end.
  if n < 0 then
    delete(digits, 1, 1);

  cut := digits.Length - 3;
  while cut >= 1 do
  begin
    Insert(',', digits, cut + 1);
    dec(cut, 3);
  end;

  if n < 0 then
    Result := '-' + digits
  else
    Result := digits;
end;

/// Recursively visits every entry below Root, calling walkFunc for each one.
/// walkFunc receives the directory containing the entry (with a trailing
/// path delimiter) and the entry's find data. The '.' and '..'
/// pseudo-entries are skipped. Does nothing when walkFunc is not assigned or
/// Root cannot be enumerated.
procedure Walk(Root: string; walkFunc: TProc<string, TWin32FindData>); overload;
var
  rec: TWin32FindData;
  h: THandle;
  entryName: string;
begin
  if not Assigned(walkFunc) then
    exit;

  Root := IncludeTrailingPathDelimiter(Root);

  h := FindFirstFile(Pchar(Root + '*.*'), rec);
  // Nothing to enumerate, and an invalid handle must not be closed.
  if h = INVALID_HANDLE_VALUE then
    exit;

  try
    repeat
      entryName := rec.cFileName;
      // Only the self/parent links are skipped; ordinary names that merely
      // start with a dot are real files or directories.
      if (entryName = '.') or (entryName = '..') then
        Continue;
      walkFunc(Root, rec);
      if (rec.dwFileAttributes and FILE_ATTRIBUTE_DIRECTORY) =
        FILE_ATTRIBUTE_DIRECTORY then
        Walk(Root + entryName, walkFunc);
    until not FindNextFile(h, rec);
  finally
    FindClose(h);
  end;
end;

/// Walks the tree below root and prints how many entries fall into each
/// decimal order of magnitude of size, followed by the number of entries
/// (directories included, counted separately as well) and the total size.
/// Bucket 0 holds zero-sized entries; bucket k (k >= 1) holds sizes below
/// 10^k with k decimal digits. Sizes too large for the table are counted in
/// the last bucket.
procedure FileSizeDistribution(root: string);
var
  sizes: TArray<Integer>;
  files, directories, totalSize, size: UInt64;
  i: Integer;
  c: string;
begin
  SetLength(sizes, 12);
  files := 0;
  directories := 0;
  totalSize := 0;
  size := 0;

  Walk(root,
    procedure(path: string; info: TWin32FindData)
    var
      bucket: Integer;
      limit: UInt64;
    begin
      inc(files);
      if (info.dwFileAttributes and FILE_ATTRIBUTE_DIRECTORY) =
        FILE_ATTRIBUTE_DIRECTORY then
        inc(directories);
      // Widen before shifting: shifting a 32-bit value by 32 loses the
      // high half of the size.
      size := (UInt64(info.nFileSizeHigh) shl 32) or info.nFileSizeLow;
      if size = 0 then
      begin
        sizes[0] := sizes[0] + 1;
        exit;
      end;

      inc(totalSize, size);
      // Smallest k with size < 10^k, found with integer arithmetic and
      // capped at the last bucket so the array is never overrun.
      bucket := 1;
      limit := 10;
      while (bucket < High(sizes)) and (size >= limit) do
      begin
        inc(bucket);
        limit := limit * 10;
      end;
      sizes[bucket] := sizes[bucket] + 1;
    end);

  writeln('File size distribution for "', root, '" :-'#10);
  for i := 0 to High(sizes) do
  begin
    if i = 0 then
      write('  ')
    else
      write('+ ');
    writeln(format('Files less than 10 ^ %-2d bytes : %5d', [i, sizes[i]]));
  end;
  writeln('                                  -----');
  writeln('= Total number of files         : ', files: 5);
  writeln('  including directories         : ', directories: 5);
  c := commatize(totalSize);
  writeln(#10'  Total size of files           : ', c, ' bytes');
end;

begin
  // Start from the directory given on the command line, or from the current
  // directory when no argument is supplied.
  if ParamCount >= 1 then
    fileSizeDistribution(ParamStr(1))
  else
    fileSizeDistribution('.');
  readln;
end.

Factor

Works with: Factor version 0.99 2020-03-02
USING: accessors assocs formatting io io.directories.search
io.files.types io.pathnames kernel math math.functions
math.statistics namespaces sequences ;

! Maps a file size to its histogram bucket: 0 for a size of zero, otherwise
! 1 + log10 of the size converted to an integer. For a positive size this is
! its number of decimal digits (assuming log10 is exact at powers of ten), so
! bucket k is reported as "< 10^k".
: classify ( m -- n ) [ 0 ] [ log10 >integer 1 + ] if-zero ;

! Builds an assoc from bucket number (see classify) to the number of entries
! in that bucket, over everything found recursively below path. Entries whose
! type is +directory+ are left out before counting.
: file-size-histogram ( path -- assoc )
    recursive-directory-entries
    [ type>> +directory+ = ] reject
    [ size>> classify ] map histogram ;

! Script body: histogram the current directory, print one line per bucket,
! then the total number of files (the sum of all bucket counts).
! NOTE(review): rows are printed in the assoc's own iteration order; nothing
! here sorts them by bucket - confirm the order is acceptable.
current-directory get file-size-histogram dup
[ "Count of files < 10^%d bytes: %4d\n" printf ] assoc-each
nl values sum "Total files: %d\n" printf
Output:
Count of files < 10^0 bytes:   20
Count of files < 10^1 bytes:  742
Count of files < 10^2 bytes: 3881
Count of files < 10^3 bytes: 2388
Count of files < 10^4 bytes: 3061
Count of files < 10^5 bytes:  486
Count of files < 10^6 bytes:   78
Count of files < 10^7 bytes:   27
Count of files < 10^8 bytes:    3
Count of files < 10^9 bytes:    1

Total files: 10687

Go

Translation of: Kotlin
package main

import (
    "fmt"
    "log"
    "math"
    "os"
    "path/filepath"
)

// commatize renders n in decimal with a comma between each group of three
// digits, preserving a leading minus sign for negative values.
func commatize(n int64) string {
    negative := n < 0
    digits := fmt.Sprintf("%d", n)
    if negative {
        // Group the bare digits; the sign is restored at the end.
        digits = digits[1:]
    }
    for cut := len(digits) - 3; cut > 0; cut -= 3 {
        digits = digits[:cut] + "," + digits[cut:]
    }
    if negative {
        return "-" + digits
    }
    return digits
}

// fileSizeDistribution walks the tree rooted at root and prints how many
// entries fall into each decimal order of magnitude of size, followed by the
// number of entries visited (directories are included and also counted
// separately) and the total size in bytes.
//
// Bucket 0 holds zero-sized entries; bucket k (k >= 1) holds sizes with
// 10^(k-1) <= size < 10^k. Sizes too large for the table are counted in the
// last bucket. Any error reported by the walk terminates the program.
func fileSizeDistribution(root string) {
    var sizes [12]int
    files := 0
    directories := 0
    totalSize := int64(0)
    walkFunc := func(path string, info os.FileInfo, err error) error {
        if err != nil {
            return err
        }
        files++
        if info.IsDir() {
            directories++
        }
        size := info.Size()
        if size == 0 {
            sizes[0]++
            return nil
        }
        totalSize += size
        // Compare against exact powers of ten instead of flooring a
        // floating-point logarithm, which can land just below an integer
        // for sizes such as 1000 and put the file one bucket too low.
        // The loop bound also keeps the index inside the array.
        bucket := 1
        for bucket < len(sizes)-1 && float64(size) >= math.Pow10(bucket) {
            bucket++
        }
        sizes[bucket]++
        return nil
    }
    err := filepath.Walk(root, walkFunc)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("File size distribution for '%s' :-\n\n", root)
    for i := 0; i < len(sizes); i++ {
        if i == 0 {
            fmt.Print("  ")
        } else {
            fmt.Print("+ ")
        }
        fmt.Printf("Files less than 10 ^ %-2d bytes : %5d\n", i, sizes[i])
    }
    fmt.Println("                                  -----")
    fmt.Printf("= Total number of files         : %5d\n", files)
    fmt.Printf("  including directories         : %5d\n", directories)
    c := commatize(totalSize)
    fmt.Println("\n  Total size of files           :", c, "bytes")
}

// main reports the distribution for the directory named by the first
// command-line argument, or for the current directory when none is given.
func main() {
    root := "./"
    if len(os.Args) > 1 {
        root = os.Args[1]
    }
    fileSizeDistribution(root)
}
Output:
File size distribution for './' :-

  Files less than 10 ^ 0  bytes :     0
+ Files less than 10 ^ 1  bytes :     0
+ Files less than 10 ^ 2  bytes :     8
+ Files less than 10 ^ 3  bytes :    98
+ Files less than 10 ^ 4  bytes :   163
+ Files less than 10 ^ 5  bytes :    18
+ Files less than 10 ^ 6  bytes :     8
+ Files less than 10 ^ 7  bytes :    18
+ Files less than 10 ^ 8  bytes :     1
+ Files less than 10 ^ 9  bytes :     0
+ Files less than 10 ^ 10 bytes :     0
+ Files less than 10 ^ 11 bytes :     0
                                  -----
= Total number of files         :   314
  including directories         :     7

  Total size of files           : 74,205,408 bytes

Haskell

Uses a grouped frequency distribution. Program arguments are optional. Arguments include starting directory and initial frequency distribution group size. After the first frequency distribution is computed it further breaks it down for any group that exceeds 25% of the total file count, when possible.

{-# LANGUAGE LambdaCase #-}

import           Control.Concurrent          (forkIO, setNumCapabilities)
import           Control.Concurrent.Chan     (Chan, newChan, readChan,
                                              writeChan, writeList2Chan)
import           Control.Exception           (IOException, catch)
import           Control.Monad               (filterM, forever, join,
                                              replicateM, replicateM_, (>=>))
import           Control.Parallel.Strategies (parTraversable, rseq, using,
                                              withStrategy)
import           Data.Char                   (isDigit)
import           Data.List                   (find, sort)
import qualified Data.Map.Strict             as Map
import           GHC.Conc                    (getNumProcessors)
import           System.Directory            (doesDirectoryExist, doesFileExist,
                                              listDirectory,
                                              pathIsSymbolicLink)
import           System.Environment          (getArgs)
import           System.FilePath.Posix       ((</>))
import           System.IO                   (FilePath, IOMode (ReadMode),
                                              hFileSize, hPutStrLn, stderr,
                                              withFile)
import           Text.Printf                 (hPrintf, printf)

-- | An entry found while traversing: a file (its path and size) or a folder
-- (its path).
data Item = File FilePath Integer | Folder FilePath deriving (Show)

-- | Key of a frequency group: the (low, high) bounds of a size range, both
-- inclusive.
type FGKey = (Integer, Integer)
-- | One frequency group: its key together with the number of files in it.
type FrequencyGroup = (FGKey, Integer)
-- | All frequency groups, as a map from group key to file count.
type FrequencyGroups = Map.Map FGKey Integer

-- | The empty set of frequency groups.
newFrequencyGroups :: FrequencyGroups
newFrequencyGroups = Map.empty

-- | Sizes of all 'File' items, in their original order; folders are ignored.
fileSizes :: [Item] -> [Integer]
fileSizes items = [size | File _ size <- items]

-- | Paths of all 'Folder' items, in their original order; files are ignored.
folders :: [Item] -> [FilePath]
folders items = [path | Folder path <- items]

-- | Sum of the sizes of all files among the given items.
totalBytes :: [Item] -> Integer
totalBytes items = sum (fileSizes items)

-- | Number of files and number of folders among the given items, as
-- @(files, folders)@.
counts :: [Item] -> (Integer, Integer)
counts = foldr tally (0, 0)
  where
    tally (File _ _) (nFiles, nFolders) = (succ nFiles, nFolders)
    tally (Folder _) (nFiles, nFolders) = (nFiles, succ nFolders)

-- |Creates 'FrequencyGroups' from the provided size and data set.
--
-- An empty list gives no groups and a single-element list gives one group
-- spanning just that value. Otherwise every value is placed into one of a
-- series of equally wide, inclusive ranges and the values per range are
-- counted; ranges that receive no value do not appear in the result.
frequencyGroups :: Int             -- ^ Desired number of frequency groups.
                -> [Integer]       -- ^ List of collected file sizes. Must be sorted.
                -> FrequencyGroups -- ^ Returns a 'FrequencyGroups' for the file sizes.
frequencyGroups _ [] = newFrequencyGroups
frequencyGroups totalGroups xs
  | length xs == 1 = Map.singleton (head xs, head xs) 1
  | otherwise = foldr placeGroups newFrequencyGroups xs `using` parTraversable rseq
  where
    -- Spread of the data, used only to derive the width of a group.
    range = maximum xs - minimum xs
    -- Width of each group: the spread divided by the requested group count,
    -- rounded up, plus one.
    groupSize = succ $ ceiling $ realToFrac range / realToFrac totalGroups
    -- Lower bounds of the groups. They start at 0 (not at the smallest
    -- value) and continue until they pass the largest value by a group width.
    groups = takeWhile (<=groupSize + maximum xs) $ iterate (+groupSize) 0
    -- (low, high) pairs; each high is one below the next group's low.
    groupMinMax = zip groups (pred <$> tail groups)
    -- First range containing n, if any (takes the list of ranges as argument).
    findGroup n = find (\(low, high) -> n >= low && n <= high)

    incrementCount (Just n) = Just (succ n) -- Update count for range.
    incrementCount Nothing  = Just 1        -- Insert new range with initial count.

    -- Fold step: count n in the range that contains it.
    placeGroups n fgMap = case findGroup n groupMinMax of
      Just k  -> Map.alter incrementCount k fgMap
      Nothing -> fgMap -- Should never happen.

-- | Builds the frequency groups for the file sizes and, when the threshold is
-- positive, repeatedly splits every group whose count exceeds it into
-- sub-groups. Splitting stops when all counts are within the threshold or
-- after 15 rounds, whichever comes first.
expandGroups :: Int             -- ^ Desired number of frequency groups.
             -> [Integer]       -- ^ List of collected file sizes.
             -> Integer         -- ^ Computed frequency group limit.
             -> FrequencyGroups -- ^ Expanded 'FrequencyGroups'
expandGroups gsize fileSizes groupThreshold
  | groupThreshold > 0 = loop 15 $ frequencyGroups gsize sortedFileSizes
  | otherwise = frequencyGroups gsize sortedFileSizes
  where
    -- 'frequencyGroups' requires sorted input.
    sortedFileSizes = sort fileSizes
    loop 0 gs = gs -- break out in case we can't go below threshold
    loop n gs | all (<= groupThreshold) $ Map.elems gs = gs
              | otherwise = loop (pred n) (expand gs)

    -- One round of splitting. Written with the function applicative: for an
    -- input map gs this is
    --   foldr f gs' (Map.mapWithKey groupsFromGroup (Map.filter (> groupThreshold) gs))
    -- where gs' is gs evaluated with the parallel strategy. That is, the
    -- oversized groups are expanded and each expansion is merged back into
    -- the original map.
    expand :: FrequencyGroups -> FrequencyGroups
    expand = foldr f . withStrategy (parTraversable rseq) <*>
      Map.mapWithKey groupsFromGroup . Map.filter (> groupThreshold)
      where
        -- Replaces the group with key k by its sub-groups; leaves the
        -- accumulator untouched when the group could not be split.
        f :: Maybe (FGKey, FrequencyGroups) -- ^ expanded frequency group
          -> FrequencyGroups                -- ^ accumulator
          -> FrequencyGroups                -- ^ merged accumulator
        f (Just (k, fg)) acc = Map.union (Map.delete k acc) fg
        f Nothing        acc = acc

        -- Re-groups the file sizes lying inside one group's bounds. Yields
        -- Nothing when fewer than two sizes lie inside them.
        groupsFromGroup
          :: FGKey                          -- ^ Group Key
          -> Integer                        -- ^ Count
          -> Maybe (FGKey, FrequencyGroups) -- ^ Returns expanded 'FrequencyGroups' with base key it replaces.
        groupsFromGroup (min, max) count
          | length range > 1 = Just ((min, max), frequencyGroups gsize range)
          | otherwise        = Nothing
          where
            -- Sizes within the group's inclusive bounds (still sorted).
            range = filter (\n -> n >= min && n <= max) sortedFileSizes

-- | Renders a byte count for display: plain bytes up to and including 2^10,
-- then KB, MB, GB and TB (binary multiples, two decimals) up to and including
-- 2^50, and a fixed notice beyond that. Every rendering except the notice
-- ends with a space.
displaySize :: Integer -> String
displaySize n
  -- Guards are tried in order, so each one only sees values above the
  -- previous bound.
  | n <= 2^10 = printf "%8dB " n
  | n <= 2^20 = scaled (2^10) "KB"
  | n <= 2^30 = scaled (2^20) "MB"
  | n <= 2^40 = scaled (2^30) "GB"
  | n <= 2^50 = scaled (2^40) "TB"
  | otherwise = "Too large!"
  where
    scaled :: Double -> String -> String
    scaled unit suffix = printf "%7.2f%s " (realToFrac n / unit) suffix

-- | Prints one line of the distribution: the group's size range, its count,
-- its share of all files as a percentage, and a bar with one block per
-- (rounded) percent. A share that rounds to zero gets a thin marker instead.
displayFrequency :: Integer -> FrequencyGroup -> IO ()
displayFrequency filesCount ((lo, hi), groupCount) = do
  printf "%s <-> %s" (displaySize lo) (displaySize hi)
  printf "= %-10d %6.3f%%: %-5s\n" groupCount share bar
  where
    share :: Double
    share = 100 * (realToFrac groupCount / realToFrac filesCount)

    width :: Int
    width = round share

    bar :: String
    bar = if width == 0 then "▍" else replicate width '█'

-- | Worker loop: forever takes a folder path from the first channel, collects
-- that folder's items and writes them to the second channel.
folderWorker :: Chan FilePath -> Chan [Item] -> IO ()
folderWorker folderChan resultItemsChan = forever $ do
  folderPath <- readChan folderChan
  items <- collectItems folderPath
  writeChan resultItemsChan items

-- | Lists the direct children of a folder as 'Item's. A child that is a
-- directory becomes a 'Folder'; anything else is opened for reading to obtain
-- its size and becomes a 'File'. If any step raises an 'IOException', a
-- "Skipping" message is written to stderr and the folder yields no items.
collectItems :: FilePath -> IO [Item]
collectItems folderPath = tryCollect `catch` skipFolder
  where
    skipFolder :: IOException -> IO [Item]
    skipFolder e = do
      hPrintf stderr "Skipping: %s\n" (show e)
      pure []

    tryCollect :: IO [Item]
    tryCollect = do
      names <- listDirectory folderPath
      mapM toItem (map (folderPath </>) names)

    toItem :: FilePath -> IO Item
    toItem p = do
      isFolder <- doesDirectoryExist p
      if isFolder
        then pure (Folder p)
        else File p <$> withFile p ReadMode hFileSize

-- | Collects every item below the given folder using a pool of worker
-- threads, one per processor. Folders that are symbolic links are not
-- descended into. The starting folder itself is not part of the result.
parallelItemCollector :: FilePath -> IO [Item]
parallelItemCollector folder = do
  wCount <- getNumProcessors
  setNumCapabilities wCount
  printf "Using %d worker threads\n" wCount
  folderChan <- newChan
  resultItemsChan <- newChan
  -- Each worker runs 'folderWorker' forever on the two shared channels.
  replicateM_ wCount (forkIO $ folderWorker folderChan resultItemsChan)
  loop folderChan resultItemsChan [Folder folder]
  where
    -- Processes one batch of items: sends its non-symlink folders to the
    -- workers, waits for exactly one result list per folder sent, then
    -- recurses into each result list in turn.
    loop :: Chan FilePath -> Chan [Item] -> [Item] -> IO [Item]
    loop folderChan resultItemsChan xs = do
      regularFolders <- filterM (pathIsSymbolicLink >=> (pure . not)) $ folders xs
      if null regularFolders then pure []
      else do
        writeList2Chan folderChan regularFolders
        -- One read per folder written; results may arrive in any order.
        childItems <- replicateM (length regularFolders) (readChan resultItemsChan)
        result <- mapM (loop folderChan resultItemsChan) childItems
        pure (join childItems <> join result)

-- | Interprets the command-line arguments as @(start folder, group size)@.
--
-- With no arguments the folder is @"."@ and the group size is 4; with one
-- argument it names the folder. A second argument must be a non-empty run of
-- decimal digits and gives the group size; anything else (including an empty
-- string, which would otherwise fail to parse as a number) is reported as
-- 'Left' with an error message. Further arguments are ignored.
parseArgs :: [String] -> Either String (FilePath, Int)
parseArgs (x:y:_)
  | not (null y) && all isDigit y = Right (x, read y)
  | otherwise                     = Left "Invalid frequency group size"
parseArgs (x:_) = Right (x, 4)
parseArgs _ = Right (".", 4)

-- | Entry point: parses the arguments, collects all items below the chosen
-- folder, prints the totals and then the size distribution. An argument
-- error is written to stderr instead.
main :: IO ()
main = parseArgs <$> getArgs >>= \case
    Left errorMessage -> hPutStrLn stderr errorMessage
    Right (path, groupSize) -> do
      items <- parallelItemCollector path
      let (fileCount, folderCount) = counts items
      printf "Total files: %d\nTotal folders: %d\n" fileCount folderCount
      printf "Total size: %s\n" $ displaySize $ totalBytes items
      printf "\nDistribution:\n\n%9s  <-> %9s %7s\n" "From" "To" "Count"
      putStrLn $ replicate 46 '-'
      -- Groups holding more than the threshold number of files get split.
      let results = expandGroups groupSize (fileSizes items) (groupThreshold fileCount)
      mapM_ (displayFrequency fileCount) $ Map.assocs results
  where
    -- A quarter of the file count, rounded.
    groupThreshold = round . (*0.25) . realToFrac
Output:
$ filedist ~/Music
Using 4 worker threads
Total files: 688
Total folders: 663
Total size:  985.85MB

Distribution:

     From  <->        To   Count
----------------------------------------------
       0B  <->       80B = 7           1.017%: █
      81B  <->      161B = 74         10.756%: ███████████
     162B  <->      242B = 112        16.279%: ████████████████
     243B  <->      323B = 99         14.390%: ██████████████
     323B  <->      645B = 23          3.343%: ███
     646B  <->      968B = 2           0.291%: ▍
     969B  <->    1.26KB = 1           0.145%: ▍
   3.19KB  <->    6.38KB = 12          1.744%: ██
   6.38KB  <->    9.58KB = 22          3.198%: ███
   9.58KB  <->   12.77KB = 12          1.744%: ██
  13.52KB  <->   27.04KB = 15          2.180%: ██
  27.04KB  <->   40.57KB = 6           0.872%: █
  40.57KB  <->   54.09KB = 22          3.198%: ███
  54.20KB  <->  108.41KB = 99         14.390%: ██████████████
 108.41KB  <->  162.61KB = 23          3.343%: ███
 162.61KB  <->  216.81KB = 8           1.163%: █
 236.46KB  <->  472.93KB = 3           0.436%: ▍
 709.39KB  <->  945.85KB = 44          6.395%: ██████
   3.30MB  <->    4.96MB = 4           0.581%: █
   4.96MB  <->    6.61MB = 21          3.052%: ███
   6.67MB  <->   13.33MB = 72         10.465%: ██████████
  13.33MB  <->   20.00MB = 6           0.872%: █
  20.00MB  <->   26.66MB = 1           0.145%: ▍

$ filedist ~/Music 10
Using 4 worker threads
Total files: 688
Total folders: 663
Total size:  985.85MB

Distribution:

     From  <->        To   Count
----------------------------------------------
       0B  <->       88B = 7           1.017%: █
      89B  <->      177B = 75         10.901%: ███████████
     178B  <->      266B = 156        22.674%: ███████████████████████
     267B  <->      355B = 57          8.285%: ████████
     356B  <->      444B = 20          2.907%: ███
     801B  <->      889B = 2           0.291%: ▍
     959B  <->    1.87KB = 1           0.145%: ▍
   3.75KB  <->    4.68KB = 1           0.145%: ▍
   4.68KB  <->    5.62KB = 1           0.145%: ▍
   5.62KB  <->    6.55KB = 11          1.599%: ██
   6.56KB  <->    7.49KB = 10          1.453%: █
   7.49KB  <->    8.43KB = 4           0.581%: █
   8.43KB  <->    9.36KB = 7           1.017%: █
   9.43KB  <->   18.85KB = 21          3.052%: ███
  18.85KB  <->   28.28KB = 6           0.872%: █
  28.28KB  <->   37.71KB = 4           0.581%: █
  37.71KB  <->   47.13KB = 12          1.744%: ██
  47.13KB  <->   56.56KB = 16          2.326%: ██
  56.56KB  <->   65.99KB = 23          3.343%: ███
  65.99KB  <->   75.41KB = 26          3.779%: ████
  75.41KB  <->   84.84KB = 15          2.180%: ██
  84.84KB  <->   94.27KB = 17          2.471%: ██
  94.59KB  <->  189.17KB = 42          6.105%: ██████
 189.17KB  <->  283.76KB = 4           0.581%: █
 283.76KB  <->  378.35KB = 2           0.291%: ▍
 851.28KB  <->  945.87KB = 44          6.395%: ██████
   2.67MB  <->    5.33MB = 5           0.727%: █
   5.33MB  <->    8.00MB = 41          5.959%: ██████
   8.00MB  <->   10.67MB = 35          5.087%: █████
  10.67MB  <->   13.33MB = 16          2.326%: ██
  13.33MB  <->   16.00MB = 3           0.436%: ▍
  16.00MB  <->   18.67MB = 3           0.436%: ▍
  24.00MB  <->   26.66MB = 1           0.145%: ▍

J

We can get file sizes of all files under a specific path by inspecting the last column from dirtree. For example, the sizes of the files under the user's home directory would be ;{:|:dirtree '~'

From there, we can bucket them by factors of ten, then display the limiting size of each bucket along with the number of files contained (we'll sort them, for legibility):

    NB. Read right to left: take the file sizes (last column of dirtree),
    NB. sort them ascending (/:~), raise anything below 1 to 1 (1>.) so the
    NB. logarithm is defined, take the base-10 logarithm (10 ^.) and floor it
    NB. (<.). The left-hand train then pairs 10 to the power of each distinct
    NB. exponent (10x^~. , extended precision) with how often it occurs (#/.~).
    ((10x^~.),.#/.~) <.10 ^.1>. /:~;{:|:dirtree '~'
       1  2
      10  8
     100 37
    1000 49
   10000 20
  100000  9
 1000000  4
10000000  4

Java

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public final class FileSizeDistribution {

	public static void main(String[] aArgs) throws IOException {		
		List<Path> fileNames = Files.list(Path.of("."))
			.filter( file -> ! Files.isDirectory(file) )
			.map(Path::getFileName)
			.toList();
		
		Map<Integer, Integer> fileSizes = new HashMap<Integer, Integer>();
		for ( Path path : fileNames ) {
			fileSizes.merge(String.valueOf(Files.size(path)).length(), 1, Integer::sum);
		}
		
		final int fileCount = fileSizes.values().stream().mapToInt(Integer::valueOf).sum();
		
		System.out.println("File size distribution for directory \".\":" + System.lineSeparator());
		System.out.println("File size in bytes | Number of files | Percentage");
		System.out.println("-------------------------------------------------");
		for ( int key : fileSizes.keySet() ) {
			final int value = fileSizes.get(key);
			System.out.println(String.format("%s%d%s%d%15d%15.1f%%",
				"   10^", ( key - 1 ), " to 10^", key, value, ( 100.0 * value ) / fileCount));
		}
	}

}
Output:
File size distribution for directory ".":

File size in bytes | Number of files | Percentage
-------------------------------------------------
   10^0 to 10^1              1            0.2%
   10^1 to 10^2              1            0.2%
   10^2 to 10^3              5            1.1%
   10^3 to 10^4              3            0.6%
   10^4 to 10^5            161           34.0%
   10^5 to 10^6            196           41.4%
   10^6 to 10^7             98           20.7%
   10^7 to 10^8              9            1.9%

jq

Works with jq, the C implementation of jq

Works with gojq, the Go implementation of jq

Works with jaq, the Rust implementation of jq

This entry illustrates how jq plays nicely with other command-line tools; in this case jc (https://kellyjonbrazil.github.io/jc) is used to JSONify the output of `ls -Rl`.

(jq could also be used to parse the raw output of `ls`, but it would no doubt be tricky to achieve portability.)

The invocation of jc and jq would be along the following lines:

jc --ls -lR | jq -c -f file-size-distribution.jq

In the present case, the output from the call to `histogram` is a stream of [category, count] pairs beginning with [0, _], which gives the number of files of size 0; thereafter, the boundaries of the categories are defined logarithmically, i.e. a file of size $n is assigned to the category `1 + ($n | log10 | trunc)`.

The output shown below for an actual directory tree suggests a unimodal distribution of file sizes.

# Bag of words: emits an object whose keys are the string forms of the items
# of `stream` and whose values are the number of times each item occurred.
def bow(stream):
  reduce (stream | tostring) as $key ({}; .[$key] |= (. + 1));

# `stream` must produce non-negative numbers or numeric strings.
# Emits one [bucket, count] pair per distinct value, ordered by the numeric
# value of `bucket`. Only these pairs are sorted, never the input itself.
def histogram(stream):
  [ bow(stream) | to_entries[] | [(.key | tonumber), .value] ]
  | sort_by(.[0])[];

# Category 0 is for empty files; otherwise the category is one more than the
# truncated base-10 logarithm of the size.
histogram(.[].size | if . == 0 then 0 else (log10 | trunc) + 1 end)
Output:
[0,9]
[1,67]
[2,616]
[3,6239]
[4,3679]
[5,213]
[6,56]
[7,40]
[8,20]
[9,4]
[10,1]

Julia

Works with: Julia version 0.6
using Humanize

# Returns the sizes in bytes of all files found below `path` (recursively),
# leaving out symbolic links. The result is a `Vector{Int}` and is empty when
# there are no files.
function sizelist(path::AbstractString)
    # `Int[]` builds an empty vector on every Julia version, unlike the
    # 0.6-only `Vector{Int}(0)` constructor.
    rst = Int[]
    for (root, dirs, files) in walkdir(path)
        for name in files
            f = joinpath(root, name)
            islink(f) || push!(rst, filesize(f))
        end
    end
    return rst
end

# Counts, for each class `(lo, hi)` in `classes`, how many values of `y`
# satisfy `lo ≤ value < hi`. Returns a dictionary from class to count.
function byclass(y, classes)
    tally = Dict{eltype(classes),Int}()
    for c in classes
        lo, hi = c[1], c[2]
        tally[c] = count(v -> lo ≤ v < hi, y)
    end
    return tally
end

# Prints the number of files below `path` in each size class (empty files,
# then one class per power of ten up to 10^9), followed by the total size and
# the number of files.
function main(path::AbstractString)
    s = sizelist(path)

    # First class is [0, 1); the others are [10^(i-1), 10^i) for i = 1..9.
    cls = [(0, 1)]
    for i in 1:9
        push!(cls, (10 ^ (i-1), 10 ^ i))
    end
    f = byclass(s, cls)

    println("filesizes: ")
    for c in cls
        lo, hi = c[1], c[2]
        @printf(" - between %8s and %8s bytes: %3i\n", datasize(lo), datasize(hi), f[c])
    end
    total = sum(s)
    nfiles = length(s)
    println("\n-> total: $(datasize(total)) bytes and $(nfiles) files")
end

main(".")
Output:
filesizes:
 - between    0.0 B and    1.0 B bytes:   0
 - between    1.0 B and   10.0 B bytes:   1
 - between   10.0 B and  100.0 B bytes:  44
 - between  100.0 B and   1.0 kB bytes: 1068
 - between   1.0 kB and  10.0 kB bytes: 250
 - between  10.0 kB and 100.0 kB bytes:   7
 - between 100.0 kB and   1.0 MB bytes:   4
 - between   1.0 MB and  10.0 MB bytes:   2
 - between  10.0 MB and 100.0 MB bytes:   0
 - between 100.0 MB and   1.0 GB bytes:   0

-> total: 7.3 MB bytes and 1376 files

Kotlin

// version 1.2.10

import java.io.File
import kotlin.math.log10
import kotlin.math.floor

/**
 * Walks the tree rooted at [path] and prints how many regular files fall into
 * each decimal size bucket: bucket 0 holds empty files, bucket i (i >= 1) holds
 * files whose size has floor(log10(size)) == i - 1.
 *
 * Also prints the number of files inspected, their total size in bytes, and how
 * many entries raised a [SecurityException] while being inspected.
 */
fun fileSizeDistribution(path: String) {
    val buckets = IntArray(12)
    var readable = 0
    var unreadable = 0
    var byteTotal = 0L
    for (entry in File(path).walk()) {
        try {
            if (!entry.isFile) continue
            val length = entry.length()
            readable++
            if (length > 0L) {
                byteTotal += length
                val exponent = floor(log10(length.toDouble())).toInt()
                buckets[exponent + 1]++
            } else {
                // Empty files get their own bucket; log10(0) is undefined.
                buckets[0]++
            }
        } catch (se: SecurityException) {
            unreadable++
        }
    }

    println("File size distribution for '$path' :-\n")
    buckets.forEachIndexed { i, count ->
        print(if (i == 0) "  " else "+ ")
        print("Files less than 10 ^ ${"%-2d".format(i)} bytes : ")
        println("%5d".format(count))
    }
    println("                                  -----")
    println("= Number of accessible files    : ${"%5d".format(readable)}")
    println("\n  Total size in bytes           : $byteTotal")
    println("\n  Number of inaccessible files  : ${"%5d".format(unreadable)}")
}

/**
 * Entry point. Reports on the directory given as the first command-line
 * argument, or on the current directory when no argument is supplied.
 */
fun main(args: Array<String>) {
    fileSizeDistribution(args.firstOrNull() ?: "./")
}
Output:
File size distribution for './' :-

  Files less than 10 ^ 0  bytes :     2
+ Files less than 10 ^ 1  bytes :     0
+ Files less than 10 ^ 2  bytes :    46
+ Files less than 10 ^ 3  bytes :   380
+ Files less than 10 ^ 4  bytes :   558
+ Files less than 10 ^ 5  bytes :    19
+ Files less than 10 ^ 6  bytes :     6
+ Files less than 10 ^ 7  bytes :     5
+ Files less than 10 ^ 8  bytes :     0
+ Files less than 10 ^ 9  bytes :     0
+ Files less than 10 ^ 10 bytes :     0
+ Files less than 10 ^ 11 bytes :     0
                                  -----
= Number of accessible files    :  1016

  Total size in bytes           : 14459732

  Number of inaccessible files  :     0

Lang

# Load the IO module
# Replace "<pathToIO.lm>" with the location where the io.lm Lang module was installed to without "<" and ">"
ln.loadModule(<pathToIO.lm>)


# fp.fileSizeDistribution(&sizes, $[totalSize], $file)
# Recursive walk starting at the opened file handle $file.
#  - Directory: every name returned by listFilesAndDirectories is opened, processed
#    recursively with a fresh inner total, that inner total is added to $*totalSize,
#    and the inner handle is closed again.
#  - Anything else: its size is read with getSize (a null size is skipped), added to
#    $*totalSize, and the matching counter in &sizes is incremented: index 0 for an
#    empty file, otherwise index fn.int(fn.log10($len)).
# NOTE(review): $[totalSize] / $*totalSize look like a by-reference parameter and its
# dereference - confirm against the Lang documentation.
# NOTE(review): sizes 1..9 also produce index 0, so they share a counter with empty
# files - confirm this is intended.
fp.fileSizeDistribution = (&sizes, $[totalSize], $file) -> {
	if([[io]]::fp.isDirectory($file)) {
		&fileNames = [[io]]::fp.listFilesAndDirectories($file)
		$path = [[io]]::fp.getCanonicalPath($file)
		if($path == /) {
			$path = \e
		}
		
		$fileName
		foreach($[fileName], &fileNames) {
			$innerFile = [[io]]::fp.openFile($path/$fileName)
			
			$innerTotalSize = 0L
			fp.fileSizeDistribution(&sizes, $innerTotalSize, $innerFile)
			$*totalSize += $innerTotalSize
			
			[[io]]::fp.closeFile($innerFile)
		}
	}else {
		$len = [[io]]::fp.getSize($file)
		if($len == null) {
			return
		}
		
		$*totalSize += $len
		
		if($len == 0) {
			&sizes[0] += 1
		}else {
			$index = fn.int(fn.log10($len))
			&sizes[$index] += 1
		}
	}
}

# Start directory: the single command-line argument if exactly one was given, else "./".
$path $= @&LANG_ARGS == 1?&LANG_ARGS[0]:{{{./}}}

# Twelve counters, all starting at zero.
&sizes = fn.arrayMake(12)
fn.arraySetAll(&sizes, 0)

$file = [[io]]::fp.openFile($path)

$totalSize = 0L

# Walk the tree; fills &sizes and accumulates into $totalSize.
fp.fileSizeDistribution(&sizes, $totalSize, $file)

[[io]]::fp.closeFile($file)

# Report: one line per counter, then the file count (sum of all counters) and total size.
fn.println(File size distribution for "$path":)
$i
repeat($[i], @&sizes) {
	fn.printf(10 ^% 3d bytes: %d%n, $i, parser.op(&sizes[$i]))
}
fn.println(Number of files: fn.arrayReduce(&sizes, 0, fn.add))
fn.println(Total file size: $totalSize)

Mathematica / Wolfram Language

(* Log-log histogram of the byte counts of all non-directory files at or below the
   notebook's directory. The third argument Infinity makes FileNames descend into
   every sub-directory, so the whole hierarchy is covered rather than just the
   top level. *)
SetDirectory[NotebookDirectory[]];
Histogram[FileByteCount /@ Select[FileNames["*", Directory[], Infinity], DirectoryQ /* Not], {"Log", 15}, {"Log", "Count"}]

Nim

import math, os, strformat

const
  MaxPower = 10
  Powers = [1, 10, 100]

func powerWithUnit(idx: int): string =
  ## Return a string representing value 10^idx with a unit (B, kB, MB or GB).
  ## A negative `idx` yields "0B".
  if idx < 0:
    return "0B"
  const Units = ["B", "kB", "MB", "GB"]
  # Every three powers of ten move to the next unit; GB is the largest one.
  let group = min(idx div 3, Units.high)
  fmt"{Powers[idx - 3 * group]}{Units[group]}"


# Retrieve the directory path: first command-line parameter if present,
# otherwise the current directory. An explicit path must name a directory.
let hasArgument = paramCount() != 0
var dirpath = if hasArgument: paramStr(1) else: getCurrentDir()
if hasArgument and not dirExists(dirpath):
  raise newException(ValueError, "wrong directory path: " & dirpath)

# Distribute sizes.
# counts[-1] holds empty files; counts[k] holds files with 10^k <= size < 10^(k+1).
var counts: array[-1..MaxPower, Natural]
for path in dirpath.walkDirRec():
  if not path.fileExists():
    continue  # Not a regular file.
  let size = getFileSize(path)
  # `toInt` rounds to nearest, which would put e.g. a 500-byte file (log10 = 2.7)
  # into the 1kB..10kB bucket, so take the floor explicitly. Sizes beyond the last
  # bucket are clamped into it instead of raising an index error.
  let index =
    if size == 0: -1
    else: min(MaxPower, int(floor(log10(size.float))))
  inc counts[index]

# Display distribution: one line per bucket with its range, count and share of
# all files, followed by the overall number of files.
let total = sum(counts)
echo "File size distribution for directory: ", dirpath
echo ""
for idx, count in counts:
  let rangeString = fmt"[{powerWithUnit(idx)}..{powerWithUnit(idx + 1)}[:"
  let percent = 100 * count / total
  echo fmt"Size in {rangeString: 14} {count:>7}   {percent:5.2f}%"
echo ""
echo "Total number of files: ", total
Output:
File size distribution for directory: /home/xxx

Size in [0B..1B[:         2782    1.28%
Size in [1B..10B[:         145    0.07%
Size in [10B..100B[:      2828    1.30%
Size in [100B..1kB[:     20781    9.55%
Size in [1kB..10kB[:     85469   39.29%
Size in [10kB..100kB[:   86594   39.81%
Size in [100kB..1MB[:    16629    7.64%
Size in [1MB..10MB[:      2053    0.94%
Size in [10MB..100MB[:     221    0.10%
Size in [100MB..1GB[:       38    0.02%
Size in [1GB..10GB[:         0    0.00%
Size in [10GB..100GB[:       0    0.00%

Total number of files: 217540

Perl

Translation of: Raku
use File::Find;
use List::Util qw(max);

# %fsize maps a size bucket (order of magnitude) to the number of entries in it.
my %fsize;
$dir = shift || '.';
find(\&fsize, $dir);

# Largest bucket count (scales the bars) and overall number of entries.
$max = max($max, $_) for values %fsize;
$total += $_ for values %fsize;

print "File size distribution in bytes for directory: $dir\n";
for my $bucket (0 .. max(keys %fsize)) {
    my $count = $fsize{$bucket} // 0;
    my $label = $bucket ? '10e'.($bucket-1) : 0;
    printf "# files @ %4sb %8s: %s\n", $label, $count,
       histogram( $max, $count, 80);
}
print "$total total files.\n";

# histogram($max, $value, $width)
# Returns a bar for $value scaled so that $max corresponds to $width cells.
# Whole groups of eight cells are drawn as eight full blocks each; a remainder
# of 1..7 cells is drawn as a single partial-block glyph.
sub histogram {
    my ($peak, $count, $columns) = @_;
    my @glyphs = qw<| ▏ ▎ ▍ ▌ ▋ ▊ ▉ █>;
    my $cells    = int($count * $columns / $peak);
    my $leftover = $cells % 8;
    my $groups   = int($cells / 8);
    my $tail     = $leftover ? $glyphs[$leftover] : '';
    return $glyphs[8] x ($groups * 8) . $tail;
}

# File::Find callback: count the current entry ($_) in its size bucket.
# Only regular files are counted; directories, symlinks and other special
# entries are skipped so the totals really are file counts.
sub fsize {
    my $size = (lstat($_))[7];
    return unless -f _;          # `_` reuses the lstat buffer filled just above
    $fsize{ log10($size) }++;
}
# log10($size): decimal order of magnitude of a non-negative integer size,
# i.e. floor(log10($size)); 0 is returned for a size of 0 (or undef).
# The digit count is used instead of log($s)/log(10) because the floating-point
# quotient is 2.9999999999999996 for 1000, which int() would truncate to 2.
sub log10 {
    my ($s) = @_;
    return 0 unless $s;
    return length(int $s) - 1;
}
Output:
File size distribution in bytes for directory: .
# files @    0b        5:
# files @ 10e0b    46455: ████████████████████████████████████████████████████████████████████████████████
# files @ 10e1b    26146: ████████████████████████████████████████▋
# files @ 10e2b     3993: ▊
# files @ 10e3b     1222: ▎
# files @ 10e4b       19:
# files @ 10e5b        3:
77843 total files.

Phix

Works on Windows and Linux. Uses "proper" sizes, i.e. 1MB==1024KB. Can be quite slow on the first run, but is pretty fast on the second and subsequent runs, once the OS has cached its (low-level) directory reads.

without js -- file i/o
-- sizes[i] is the upper limit of bucket i, res[i] the number of files counted in it;
-- both sequences are extended in step by store_res() as larger files are met.
sequence sizes = {1},
         res = {0}
-- time at which the next progress line is due
atom t1 = time()+1
 
function store_res(string filepath, sequence dir_entry)
    -- walk_dir() callback: count one directory entry in its size bucket.
    -- Entries whose attributes contain 'd' (directories) are ignored.
    -- Bucket i counts files with size < sizes[i], matching the "files < ..."
    -- labels printed by the report; the comparison is therefore >=, so that
    -- a file of exactly sizes[i] bytes moves on to the next bucket.
    -- Always returns 0, which tells walk_dir() to keep going.
    if not find('d', dir_entry[D_ATTRIBUTES]) then
        atom size = dir_entry[D_SIZE]
        integer sdx = 1
        while size>=sizes[sdx] do
            if sdx=length(sizes) then
                -- grow the limits: x10 twice, then x10.24 to land on 1024-based units
                sizes &= sizes[$]*iff(mod(length(sizes),3)?10:10.24)
                res &= 0
            end if
            sdx += 1
        end while
        res[sdx] += 1
        if time()>t1 then
            -- progress report, at most once per second
            printf(1,"%,d files found\r",sum(res))
            t1 = time()+1
        end if
    end if
    return 0 -- keep going
end function
-- scan from the current directory, calling store_res() for every entry
-- NOTE(review): the third argument presumably requests sub-directory scanning - confirm
integer exit_code = walk_dir(".", store_res, true)
 
printf(1,"%,d files found\n",sum(res))
-- largest bucket count; the longest bar (60 stars) corresponds to it
integer w = max(res)
--include builtins/pfile.e
for i=1 to length(res) do
    integer ri = res[i]
    string s = file_size_k(sizes[i], 5),
           p = repeat('*',floor(60*ri/w))
    printf(1,"files < %s: %s%,d\n",{s,p,ri})
end for
Output:
112,160 files found
files <     1: 333
files <    10: *911
files <   100: ******4,731
files <   1KB: ********************************24,332
files <  10KB: ************************************************************45,379
files < 100KB: *********************************25,299
files <   1MB: *************10,141
files <  10MB: *933
files < 100MB: 91
files <   1GB: 8
files <  10GB: 2

Python

The distribution is stored in a collections.Counter object (like a dictionary with automatic 0 value when a key is not found, useful when incrementing). Anything could be done with this object, here the number of files is printed for increasing sizes. No check is made during the directory walk: usually, safeguards would be needed or the program will fail on any unreadable file or directory (depending on rights, or too deep paths, for instance). Here links are skipped, so it should avoid cycles.

import sys, os
from collections import Counter

def dodir(path):
    """Recursively tally file sizes below ``path`` into the global Counter ``h``.

    Each regular file adds one to ``h[size_in_bytes]``. Symbolic links are
    skipped (which avoids cycles); anything that is neither a regular file nor
    a directory is ignored. Errors from the OS calls are not caught.
    """
    global h

    for entry in os.listdir(path):
        full = os.path.join(path, entry)

        if os.path.islink(full):
            continue
        if os.path.isfile(full):
            h[os.stat(full).st_size] += 1
        elif os.path.isdir(full):
            dodir(full)

def main(arg):
    """Print the file size distribution for the directory trees listed in ``arg``.

    ``arg`` is a sequence of directory paths; when it is empty the current
    directory is scanned. One line is printed per distinct file size, in
    increasing order, followed by the total byte and file counts.
    """
    global h
    h = Counter()
    # With no directory given, start from the current directory.
    for root in arg or ["."]:
        dodir(root)

    total_bytes = file_count = 0
    for size, count in sorted(h.items()):
        print("Size %d -> %d file(s)" % (size, count))
        file_count += count
        total_bytes += size * count
    print("Total %d bytes for %d files" % (total_bytes, file_count))

# Script entry point: pass the command-line arguments (directory paths) to main().
main(sys.argv[1:])

Racket

#lang racket

;; Walks directory `d` (default: the current directory) recursively and returns
;; three values: a hash from (sgf size) to the number of files in that group,
;; the sum of all file sizes, and the number of files. `sgf` maps a file size to
;; its group key and defaults to `values` (one group per distinct size).
(define (file-size-distribution (d (current-directory)) #:size-group-function (sgf values))
  (for/fold ((groups (hash)) (total-bytes 0) (file-count 0))
            ((path (in-directory d)))
    (cond
      [(file-exists? path)
       (define bytes (file-size path))
       (values (hash-update groups (sgf bytes) add1 0)
               (+ total-bytes bytes)
               (add1 file-count))]
      [else (values groups total-bytes file-count)])))

;; Rounded base-10 logarithm of x, or #f when x is zero (no logarithm exists).
(define (log10-or-so x)
  (and (not (zero? x))
       (round (/ (log x) (log 10)))))

;; Strict ordering on "number or #f" in which #f sorts before every number
;; and is not less than itself.
(define (number-maybe-< a b)
  (cond
    [(not a) (if b #t #f)]
    [(not b) #f]
    [else (< a b)]))

;; Pluralise: returns `one` unchanged when n is exactly 1, otherwise `one` + "s".
(define (...s? one n)
  (if (equal? n 1)
      one
      (string-append one "s")))

;; Curried reporter: (report-fsd f) returns a procedure taking the three values
;; produced by file-size-distribution (group hash, byte total, file count).
;; It prints one line per group, in number-maybe-< order and labelled with the
;; name of the grouping function `f`, then a totals line.
(define ((report-fsd f) fsd Σ n)
  (define label (object-name f))
  (for ((k (in-list (sort (hash-keys fsd) number-maybe-<))))
    (define how-many (hash-ref fsd k))
    (printf "~a(size): ~a -> ~a ~a~%" label k how-many (...s? "file" how-many)))
  (printf "Total: ~a ~a in ~a ~a~%" Σ (...s? "byte" Σ) n (...s? "file" n)))

;; Demonstration: report on the current directory, grouped by log10-or-so.
(module+ test
  (let-values ([(fsd Σ n) (file-size-distribution #:size-group-function log10-or-so)])
    ((report-fsd log10-or-so) fsd Σ n)))
Output:
log10-or-so(size): #f -> 3 files
log10-or-so(size): 0 -> 4 files
log10-or-so(size): 1.0 -> 39 files
log10-or-so(size): 2.0 -> 57 files
log10-or-so(size): 3.0 -> 406 files
log10-or-so(size): 4.0 -> 198 files
log10-or-so(size): 5.0 -> 20 files
log10-or-so(size): 6.0 -> 6 files
Total: 10210127 bytes in 733 files

Raku

(formerly Perl 6)

Works with: Rakudo version 2017.05

By default, the program processes the current directory and all readable sub-directories; alternatively, pass a directory path on the command line.

# Print a histogram of file sizes, bucketed by decimal order of magnitude, for
# all files in $dir (default: the current directory) and its readable sub-directories.
sub MAIN($dir = '.') {
    # Order of magnitude of a size: number of decimal digits minus one (0 for an
    # empty file). Counting digits is exact, whereas $s.log(10).Int truncates the
    # floating-point result 2.9999999999999996 for 1000 down to 2.
    sub log10 (Int $s) { $s ?? $s.chars - 1 !! 0 }
    my %fsize;
    my @dirs = $dir.IO;
    while @dirs {
        for @dirs.pop.dir -> $path {
            %fsize{$path.s.&log10}++ if $path.f;
            @dirs.push: $path if $path.d and $path.r
        }
    }
    my $max = %fsize.values.max;
    my $bar-size = 80;
    say "File size distribution in bytes for directory: $dir\n";
    # Hash keys are strings; compare them numerically so that bucket 10 beats bucket 9.
    for 0 .. %fsize.keys.map(*.Int).max {
          say sprintf( "# Files @ %5sb %8s: ", $_ ?? "10e{$_-1}" !! 0, %fsize{$_} // 0 ),
              histogram( $max, %fsize{$_} // 0, $bar-size )
    }
    say %fsize.values.sum, ' total files.';
}

# Returns a bar for $value scaled so that $max corresponds to $width cells,
# terminated by a newline. Whole groups of eight cells are drawn as eight full
# blocks each; a remainder of 1..7 cells becomes one partial-block glyph.
sub histogram ($max, $value, $width = 60) {
    my @glyphs   = <| ▏ ▎ ▍ ▌ ▋ ▊ ▉ █>;
    my $cells    = ($value * $width / $max).Int;
    my $leftover = $cells mod 8;
    my $groups   = $cells div 8;
    my $tail     = $leftover ?? @glyphs[$leftover] !! '';
    (@glyphs[8] x ($groups * 8)) ~ $tail ~ "\n"
}
Output:
File size distribution in bytes for directory: /home

# Files @     0b      989: ▏

# Files @  10e0b     6655: ████████

# Files @  10e1b    31776: ████████████████████████████████████████

# Files @  10e2b    63165: ████████████████████████████████████████████████████████████████████████████████

# Files @  10e3b    19874: ████████████████████████▏

# Files @  10e4b     7730: ████████▏

# Files @  10e5b     3418: ▌

# Files @  10e6b     1378: ▏

# Files @  10e7b      199:

# Files @  10e8b       45:

135229 total files.

REXX

This REXX version works for Microsoft Windows using the   dir   subcommand;   extra code was added for
older versions of Windows that used suffixes to express big numbers   (the size of a file),   and also versions
that used a mixed case for showing the output text.

Also, some Windows versions of the   dir   command insert commas into numbers, so code was added to elide them.

/*REXX program displays a histogram of filesize distribution of a directory structure(s)*/
/*  Flow:  run  DIR  into a work file,  read it back record by record,  classify each   */
/*         file by the number of digits in its size,  print the table,  erase the file. */
numeric digits 30                                /*ensure enough decimal digits for a #.*/
parse arg ds .                                   /*obtain optional argument from the CL.*/
parse source . . path .                          /*   "   the path of this REXX program.*/
fID= substr(path, 1 + lastpos('\', path) )       /*   "   the filename and the filetype.*/
parse var  fID   fn  '.'                         /*   "   just the pure filename of pgm.*/
sw=max(79, linesize() - 1)                       /*   "   terminal width (linesize) - 1.*/
                                work= fn".OUT"   /*filename for workfile output of  DIR.*/
'DIR'   ds   '/s /-c /a-d  >'   work             /*do (DOS) DIR cmd for a data structure*/
call linein 0, 1                                 /*open output file, point to 1st record*/
maxL= 0;    @.= 00;      g= 0                    /*max len size; log array; # good recs.*/
$=0                                              /*$:  total bytes used by files found. */
     do while lines(work)\==0;  _= linein(work)  /*process the data in the DIR work file*/
     if left(_, 1)==' '    then iterate          /*Is the record not legitimate?  Skip. */
     parse upper  var   _    .  .  sz  .         /*uppercase the suffix  (if any).      */
     sz= space( translate(sz, , ','),  0)        /*remove any commas if present in the #*/

     if \datatype(sz,'W')  then do; #= left(sz, length(sz) - 1)       /*SZ has a suffix?*/
                                    if \datatype(#,'N')  then iterate /*Meat ¬ numeric? */
                                    sz= # * 1024 ** pos( right(sz, 1), 'KMGTPEZYXWVU') / 1
                                end                                   /* [↑]  use suffix*/
     $= $ + sz                                   /*keep a running total for the filesize*/
     if sz==0  then L= 0                         /*handle special case for an empty file*/
               else L= length(sz)                /*obtain the length of filesize number.*/
     g= g + 1                                    /*bump the counter of # of good records*/
     maxL= max(L, maxL)                          /*get max length filesize for alignment*/
     @.L= @.L + 1                                /*bump counter of record size category.*/
     end   /*while*/                             /* [↑]   categories:  split by log ten.*/

if g==0  then do;  say 'file not found: '  ds;  exit 13;    end        /*no good records*/
say  ' record size range    count   '
hdr= '══════════════════ ══════════ ';     say hdr;         Lhdr=length(hdr)
mC=0                                             /*mC:  the maximum count for any range.*/
     do   t=1  to 2                              /*T==1   is used to find the max count.*/
       do k=0  to maxL;  mC= max(mC, @.k);  if t==1  then iterate           /*1st pass? */
                             if k==0  then y= center('zero',  length( word(hdr, 1)  ) )
                                      else y= '10^'left(k-1,2)  "──► 10^"left(k,2)  '-1'
       say y || right( commas(@.k), 11)   copies('─', max(1, (@.k / mC * sw % 1) - LHdr) )
       end   /*k*/
     end     /*t*/
say
trace off;   'ERASE'  work                       /*perform clean─up (erase a work file).*/
say commas(g)      ' files detected, '       commas($)        " total bytes."
exit                                             /*stick a fork in it,  we're all done. */
/*──────────────────────────────────────────────────────────────────────────────────────*/
/*COMMAS:  returns its argument with a comma inserted before every group of 3 digits.   */
commas: parse arg _;  do j#=length(_)-3  to 1  by -3; _=insert(',', _, j#); end;  return _

This REXX program makes use of   LINESIZE   REXX program (or BIF) which is used to determine the screen width (or linesize) of the terminal (console) so as to maximize the width of the histogram.

The   LINESIZE.REX   REXX program is included here   ──►   LINESIZE.REX.

output   when using the default input:   (which in this case was the   C:   homedrive, a Windows/XP system.)
 record size range    count
══════════════════ ══════════
       zero             7,611 ─────────
10^0  ──► 10^1  -1        201 ─
10^1  ──► 10^2  -1        884 ─
10^2  ──► 10^3  -1      4,893 ─
10^3  ──► 10^4  -1     18,344 ─────────────────────────────────────────────────────────────────
10^4  ──► 10^5  -1     13,853 ─────────────────────────────────────────
10^5  ──► 10^6  -1      5,235 ─
10^6  ──► 10^7  -1        968 ─
10^7  ──► 10^8  -1        151 ─
10^8  ──► 10^9  -1          3 ─
10^9  ──► 10^10 -1          1 ─

52,144  files detected,  12,519,430,837  total bytes.
output   when using the default input:   (which in this case was the   C:   homedrive, a Windows 7 system.)
 record size range    count
══════════════════ ══════════
       zero               160 ─
10^0  ──► 10^1  ─1        123 ─
10^1  ──► 10^2  ─1      2,254 ─
10^2  ──► 10^3  ─1     22,752 ─────────
10^3  ──► 10^4  ─1     54,519 ─────────────────────────────────────────────────────────────────
10^4  ──► 10^5  ─1     36,810 ──────────────────────────────────
10^5  ──► 10^6  ─1     17,491 ─
10^6  ──► 10^7  ─1      9,659 ─
10^7  ──► 10^8  ─1        548 ─
10^8  ──► 10^9  ─1        144 ─
10^9  ──► 10^10 ─1          8 ─
10^10 ──► 10^11 ─1          1 ─

144,469  files detected,  118,733,891,020  total bytes.
output   when using the (my)   K:   drive:
 record size range    count
══════════════════ ══════════
       zero                28 ─
10^0  ──► 10^1  -1        132 ─
10^1  ──► 10^2  -1        812 ─
10^2  ──► 10^3  -1      3,810 ───────────────────────
10^3  ──► 10^4  -1      5,901 ────────────────────────────────────────────────────
10^4  ──► 10^5  -1      6,828 ─────────────────────────────────────────────────────────────────
10^5  ──► 10^6  -1      2,409 ───
10^6  ──► 10^7  -1        231 ─
10^7  ──► 10^8  -1          5 ─

20,156  files detected,  1,569,799,557  total bytes.

Rust

If no target directory is specified, the program searches and reports on the directory that contains the executable.

Library: walkdir
Works with: Rust version 2018
use std::error::Error;
use std::marker::PhantomData;
use std::path::{Path, PathBuf};
use std::{env, fmt, io, time};
use walkdir::{DirEntry, WalkDir};

fn main() -> Result<(), Box<dyn Error>> {
    let start = time::Instant::now();
    let args: Vec<String> = env::args().collect();

    let root = parse_path(&args).expect("not a valid path");
    let dir = WalkDir::new(&root);

    let (files, dirs): (Vec<PathBuf>, Vec<PathBuf>) = {
        let pool = pool(dir).expect("unable to retrieve entries from WalkDir");
        partition_from(pool).expect("unable to partition files from directories")
    };

    let (fs_count, dr_count) = (files.len(), dirs.len());
    let (file_counter, total_size) = file_count(files);

    {
        println!("++ File size distribution for : {} ++\n", &root.display());
        println!("Files @ 0B            : {:4}", file_counter[0]);
        println!("Files > 1B  - 1,023B  : {:4}", file_counter[1]);
        println!("Files > 1KB - 1,023KB : {:4}", file_counter[2]);
        println!("Files > 1MB - 1,023MB : {:4}", file_counter[3]);
        println!("Files > 1GB - 1,023GB : {:4}", file_counter[4]);
        println!("Files > 1TB+          : {:4}\n", file_counter[5]);

        println!("Files encountered: {}", fs_count);
        println!("Directories traversed: {}", dr_count);
        println!(
            "Total size of all files: {}\n",
            Filesize::<Kilobytes>::from(total_size)
        );
    }

    let end = time::Instant::now();
    println!("Run time: {:?}\n", end.duration_since(start));
    Ok(())
}

/// Chooses the directory to scan from the process arguments.
///
/// * `args[1]`, when present, is the target given by the user.
/// * Otherwise the directory containing the executable (`args[0]`) is used,
///   rather than the executable file itself.
/// * If that directory cannot be determined (no `args[0]`, or a bare program
///   name without a directory part) the current directory `.` is used.
///
/// Never returns `Err`; the `Result` is kept for interface compatibility.
fn parse_path(args: &[String]) -> Result<&Path, io::Error> {
    if let Some(target) = args.get(1) {
        return Ok(Path::new(target));
    }
    let exe_dir = args
        .first()
        .and_then(|exe| Path::new(exe).parent())
        // `parent()` of a bare file name is the empty path, which cannot be walked.
        .filter(|dir| !dir.as_os_str().is_empty());
    Ok(exe_dir.unwrap_or_else(|| Path::new(".")))
}

/// Collects every entry the walker yields successfully; entries that produced
/// an error during traversal are silently dropped. Always returns `Ok`.
fn pool(dir: WalkDir) -> Result<Vec<DirEntry>, Box<dyn Error>> {
    let mut entries = Vec::new();
    for item in dir {
        if let Ok(entry) = item {
            entries.push(entry);
        }
    }
    Ok(entries)
}

/// Splits the walked entries into `(files, everything else)` by testing each
/// entry's path with `Path::is_file`. Always returns `Ok`.
fn partition_from(pool: Vec<DirEntry>) -> Result<(Vec<PathBuf>, Vec<PathBuf>), Box<dyn Error>> {
    let mut files = Vec::new();
    let mut others = Vec::new();
    for entry in pool {
        let path = entry.into_path();
        if path.is_file() {
            files.push(path);
        } else {
            others.push(path);
        }
    }
    Ok((files, others))
}

/// Classifies every file by size and sums the sizes.
///
/// Returns `(counter, total_bytes)` where `counter[0]` counts empty files and
/// `counter[1]..=counter[5]` count files in the byte, kilobyte, megabyte,
/// gigabyte and terabyte-or-larger classes (1,024-based boundaries).
///
/// Each file's metadata is read exactly once, so the counters and the total are
/// always derived from the same size. Panics if a file's metadata cannot be read.
fn file_count(files: Vec<PathBuf>) -> ([u64; 6], u64) {
    let mut counter: [u64; 6] = [0; 6];
    let mut total_file_size: u64 = 0;
    for file in &files {
        let size = Filesize::<Bytes>::from(file).bytes;
        let class = match size {
            0 => 0,                                 // Empty file
            1..=1_023 => 1,                         // 1 byte to 0.99KB
            1_024..=1_048_575 => 2,                 // 1 kilo to 0.99MB
            1_048_576..=1_073_741_823 => 3,         // 1 mega to 0.99GB
            1_073_741_824..=1_099_511_627_775 => 4, // 1 giga to 0.99TB
            _ => 5,                                 // 1 terabyte or larger
        };
        counter[class] += 1;
        total_file_size += size;
    }
    (counter, total_file_size)
}

/// A unit in which a file size can be expressed.
trait SizeUnit: Copy {
    /// Unit abbreviation in the singular, e.g. "B" or "KB".
    fn singular_name() -> String;
    /// How many bytes make up one of this unit.
    fn num_byte_in_unit() -> u64;
}

/// Marker type for sizes expressed in bytes.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Bytes;

impl SizeUnit for Bytes {
    /// Abbreviation: "B".
    fn singular_name() -> String {
        String::from("B")
    }

    /// One byte per unit.
    fn num_byte_in_unit() -> u64 {
        1
    }
}

/// Marker type for sizes expressed in kilobytes of 1,024 bytes.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Kilobytes;

impl SizeUnit for Kilobytes {
    /// Abbreviation: "KB".
    fn singular_name() -> String {
        String::from("KB")
    }

    /// 1,024 bytes per unit.
    fn num_byte_in_unit() -> u64 {
        1_024
    }
}

/// A file size stored as a byte count and tagged with the unit `T` in which it
/// is displayed or converted.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Filesize<T: SizeUnit> {
    // Size in bytes, regardless of `T`.
    bytes: u64,
    // Zero-sized marker carrying the unit type.
    unit: PhantomData<T>,
}

/// Builds a `Filesize` from a quantity expressed in units of `T`:
/// the stored byte count is `n` times the unit's size in bytes.
impl<T> From<u64> for Filesize<T>
where
    T: SizeUnit,
{
    fn from(n: u64) -> Self {
        let bytes = n * T::num_byte_in_unit();
        Filesize {
            bytes,
            unit: PhantomData,
        }
    }
}

/// Converts a `Filesize` back to a whole number of units of `T`; the division
/// is done in floating point and the fractional part is discarded.
impl<T> From<Filesize<T>> for u64
where
    T: SizeUnit,
{
    fn from(fsz: Filesize<T>) -> u64 {
        let bytes = fsz.bytes as f64;
        let per_unit = T::num_byte_in_unit() as f64;
        (bytes / per_unit) as u64
    }
}

/// Formats the size as "<value> <unit>[s]", where the value is the byte count
/// divided by the unit size (as a float) and the plural "s" is omitted only
/// when the value truncated to an integer is exactly 1.
impl<T> fmt::Display for Filesize<T>
where
    T: SizeUnit,
{
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Size expressed in the associated unit.
        let in_units = (self.bytes as f64) / (T::num_byte_in_unit() as f64);

        // Singular only for a whole-number part of exactly one.
        let suffix = if (in_units as u64) == 1 { "" } else { "s" };

        write!(f, "{} {}{}", in_units, T::singular_name(), suffix)
    }
}

// Can be expanded for From<File>, or any type that has an alias for Metadata
/// Reads the size of the file at the given path from its metadata.
/// Panics if the metadata cannot be read.
impl<T> From<&PathBuf> for Filesize<T>
where
    T: SizeUnit,
{
    fn from(f: &PathBuf) -> Self {
        let meta = f
            .metadata()
            .expect("error with metadata from pathbuf into filesize");
        Filesize {
            bytes: meta.len(),
            unit: PhantomData,
        }
    }
}
Output:
++ File size distribution for : .\Documents ++

Files @ 0B            :  956
Files > 1B  - 1,023B  : 3724
Files > 1KB - 1,023KB : 4511
Files > 1MB - 1,023MB :  930
Files > 1GB - 1,023GB :    0
Files > 1TB+          :    0

Files encountered: 10121
Directories traversed: 2057
Total size of all files: 5264133277 KBs

Run time: 1.5671626s

Sidef

# Recursively visit `dir`: call `callback` with every entry that is not a
# directory and descend into every entry that is one.
# Returns nil without visiting anything when the directory cannot be opened.
func traverse(Block callback, Dir dir) {
    dir.open(\var dir_h) || return nil
 
    for entry in (dir_h.entries) {
        if (entry.kind_of(Dir)) {
            traverse(callback, entry)
        } else {
            callback(entry)
        }
    }
}
 
# Start directory: first command-line argument if given, else the current directory.
var dir = (ARGV ? Dir(ARGV[0]) : Dir.cwd)

var group = Hash()      # rounded log10(size+1) -> number of files
var files_num = 0       # files visited
var total_size = 0      # sum of all file sizes

traverse({ |file|
    # size+1 keeps the logarithm defined for empty files
    group{file.size+1 -> log10.round} := 0 += 1
    total_size += file.size
    files_num += 1
}, dir)

# Report the groups in ascending numeric order of their key.
for k,v in (group.sort_by { |k,_| Num(k) }) {
    say "log10(size) ~~ #{k} -> #{v} files"
}

say "Total: #{total_size} bytes in #{files_num} files"
Output:
$ sidef script.sf /usr/bin
log10(size) ~~ 1 -> 4 files
log10(size) ~~ 2 -> 70 files
log10(size) ~~ 3 -> 246 files
log10(size) ~~ 4 -> 1337 files
log10(size) ~~ 5 -> 815 files
log10(size) ~~ 6 -> 167 files
log10(size) ~~ 7 -> 9 files
log10(size) ~~ 8 -> 2 files
Total: 370026462 bytes in 2650 files

Tcl

This uses the fileutil::traverse package from Tcllib to do the tree walking. A glob-based alternative that ignores links but not hidden files is possible, but would add about a dozen lines.

package require fileutil::traverse
namespace path {::tcl::mathfunc ::tcl::mathop}

# Ternary helper: [? test a b] evaluates the expression `test` and returns the
# substitution ([subst]) of `a` when it is true, otherwise of `b`.
# [tailcall] replaces this procedure's frame with the [if], so variable and
# command substitutions in test/a/b are resolved in the caller's scope.
proc ? {test a b} {tailcall if $test [list subst $a] [list subst $b]}

# Directory to scan: first command-line argument if any, else the current directory.
set dir [? {$argc} {[lindex $argv 0]} .]
# Do not descend into links; report regular files only.
fileutil::traverse Tobj $dir \
	-prefilter {apply {path {ne [file type $path] link}}} \
	-filter    {apply {path {eq [file type $path] file}}}
# Start from an empty histogram so that a tree without any files produces an
# empty report instead of failing with "can't read hist: no such variable".
set hist [dict create]
Tobj foreach path {
	set size [file size $path]
	# Bucket key: int(log10(size)), or -1 for an empty file.
	dict incr hist [? {$size} {[int [log10 $size]]} -1]
}
Tobj destroy

# One line per bucket in ascending order: lower bound, tab, number of files.
foreach key [lsort -int [dict keys $hist]] {
	puts "[? {$key == -1} 0 {1e$key}]\t[dict get $hist $key]"
}
Output:
0   1
1e1 339
1e2 3142
1e3 2015
1e4 150
1e5 29
1e6 13
1e7 3

UNIX Shell

Works with: Bourne Shell

Use POSIX conformant code unless the environment variable GNU is set to anything not empty.

#!/bin/sh
# Print a histogram of file sizes, by decimal order of magnitude, for the tree
# rooted at $1 (default: the current directory).
# Set the environment variable GNU to any non-empty value to use GNU "du -b"
# instead of the POSIX "wc -c" code path.
set -eu

tabs -8
if [ -n "${GNU:-}" ]
then
    find -- "${1:-.}" -type f -exec du -b -- {} +
else
    # wc prints a trailing "total" line only when it is given more than one
    # operand, so that line is stripped per batch only in that case; a batch
    # holding a single file would otherwise lose its only line.
    find -- "${1:-.}" -type f -exec sh -c '
        if [ "$#" -gt 1 ]
        then wc -c -- "$@" | sed \$d
        else wc -c -- "$@"
        fi' argv0 {} +
fi | awk -v OFS='\t' '
    BEGIN {split("KB MB GB TB PB", u); u[0] = "B"}
    {
        # Bucket = number of digits minus one; -1 is reserved for empty files.
        ++hist[$1 ? length($1) - 1 : -1]
        total += $1
    }
    END {
        max = -2
        # Array indices are strings; adding 0 forces a numeric comparison so
        # that bucket 10 is recognised as larger than bucket 9.
        for (i in hist)
            max = (i + 0 > max ? i + 0 : max)

        print "From", "To", "Count\n"
        for (i = -1; i <= max; ++i)
        {
            if (i in hist)
            {
                if (i == -1)
                    print "0B", "0B", hist[i]
                else
                    print 10 ^ (i       % 3) u[int(i       / 3)],
                          10 ^ ((i + 1) % 3) u[int((i + 1) / 3)],
                          hist[i]
            }
        }
        # Scale the total by the power of ten that belongs to the printed unit
        # (a multiple of three), so that e.g. 14459732 bytes reads 14.5 MB.
        l = length(total) - 1
        l -= l % 3
        printf "\nTotal: %.1f %s in %d files\n",
            total / (10 ^ l), u[int(l / 3)], NR
    }'
Output:
$ time ~/fsd.sh
From    To      Count

0B      0B      13
1B      10B     74
10B     100B    269
100B    1KB     5894
1KB     10KB    12727
10KB    100KB   12755
100KB   1MB     110922
1MB     10MB    50019
10MB    100MB   17706
100MB   1GB     5056
1GB     10GB    1139
10GB    100GB   141
100GB   1TB     1

Total: 8.9 TB in 216716 files
~/fsd.sh  1.28s user 2.55s system 134% cpu 2.842 total
$ time GNU=1 ~/fsd.sh
From    To      Count

0B      0B      13
1B      10B     74
10B     100B    269
100B    1KB     5894
1KB     10KB    12727
10KB    100KB   12755
100KB   1MB     110922
1MB     10MB    50019
10MB    100MB   17706
100MB   1GB     5056
1GB     10GB    1139
10GB    100GB   141
100GB   1TB     1

Total: 8.9 TB in 216716 files
GNU=1 ~/fsd.sh  0.81s user 1.33s system 135% cpu 1.586 total

Wren

Library: Wren-math
Library: Wren-fmt
import "io" for Directory, File, Stat
import "os" for Process
import "./math" for Math
import "./fmt" for Fmt

// sizes[0] counts empty files; sizes[i] (i >= 1) counts files with floor(log10(size)) == i - 1.
var sizes = List.filled(12, 0)
var totalSize = 0   // sum of the sizes of all non-empty files
var numFiles = 0    // files seen
var numDirs = 0     // sub-directories descended into

// Walks the directory `path`, updating the counters above: every entry that is a
// file is counted in its size bucket, every entry that is a directory is
// counted and then walked in turn.
var fileSizeDist // recursive function
fileSizeDist = Fn.new { |path|
    var files = Directory.list(path)
    for (file in files) {
        var path2 = "%(path)/%(file)"
        var stat = Stat.path(path2)
        if (stat.isFile) {
            numFiles = numFiles + 1
            var size = stat.size
            if (size == 0) {
                sizes[0] = sizes[0] + 1
            } else {
                totalSize = totalSize + size
                var logSize = Math.log10(size)
                var index = logSize.floor + 1
                sizes[index] = sizes[index] + 1
            }
        } else if (stat.isDirectory) {
            numDirs = numDirs + 1
            fileSizeDist.call(path2)
        }
    }
}

// Directory to scan: first command-line argument if given, else "./".
var args = Process.arguments
var path = (args.count == 0) ? "./" : args[0]
if (!Directory.exists(path)) Fiber.abort("Path does not exist or is not a directory.")
fileSizeDist.call(path)

// Report: one line per bucket, then the totals gathered during the walk.
System.print("File size distribution for '%(path)' :-\n")
for (i in 0...sizes.count) {
    System.write((i == 0) ? "  " : "+ ")
    Fmt.print("Files less than 10 ^ $-2d bytes : $,5d", i, sizes[i])
}
System.print("                                  -----")
Fmt.print("= Number of files               : $,5d", numFiles)
Fmt.print("  Total size in bytes           : $,d", totalSize)
Fmt.print("  Number of sub-directories     : $,5d", numDirs)
Output:
File size distribution for './' :-

  Files less than 10 ^ 0  bytes :     4
+ Files less than 10 ^ 1  bytes :     2
+ Files less than 10 ^ 2  bytes :   135
+ Files less than 10 ^ 3  bytes :   946
+ Files less than 10 ^ 4  bytes :   746
+ Files less than 10 ^ 5  bytes :    79
+ Files less than 10 ^ 6  bytes :    11
+ Files less than 10 ^ 7  bytes :     3
+ Files less than 10 ^ 8  bytes :     0
+ Files less than 10 ^ 9  bytes :     0
+ Files less than 10 ^ 10 bytes :     0
+ Files less than 10 ^ 11 bytes :     0
                                  -----
= Number of files               : 1,926
  Total size in bytes           : 12,683,455
  Number of sub-directories     :     3

zkl

// File names are produced by a background thread and consumed through this pipe.
pipe:=Thread.Pipe();
    // hoover all files in tree, don't return directories
fcn(pipe,dir){ File.globular(dir,"*",True,8,pipe); }
.launch(pipe,vm.arglist[0]);  // thread

// dist[d] = number of files whose size has d digits; N = files, SZ = total bytes,
// maxd = largest digit count seen.
dist,N,SZ,maxd:=List.createLong(50,0),0,0,0;
foreach fnm in (pipe){
   sz,szd:=File.len(fnm), sz.numDigits;
   dist[szd]+=1;
   N+=1; SZ+=sz; maxd=maxd.max(szd);
}
// NOTE(review): SZ/N divides by N even when no files were found - confirm behavior for an empty tree.
println("Found %d files, %,d bytes, %,d mean.".fmt(N,SZ,SZ/N));
// Scale so that the fullest bucket is drawn with 50 stars.
scale:=50.0/(0.0).max(dist);
// szchrs is a template of "nnn" groups; idx/comma select a suffix one digit
// longer per row, stepping over a comma after every third digit.
szchrs,idx,comma:=",nnn"*20, -1, Walker.cycle(0,0,1).next;
println("%15s   %s (* = %.2f)".fmt("File size","Number of files",1.0/scale));
foreach sz,cnt in ([0..].zip(dist[0,maxd])){
   println("%15s : %s".fmt(szchrs[idx,*], "*"*(scale*cnt).round().toInt()));
   idx-=1 + comma();
}
Output:
$ zkl flSzDist.zkl ..
Found 1832 files, 108,667,806 bytes, 59,316 mean.
      File size   Number of files (* = 13.44)
              n : *
             nn : ***
            nnn : ********
          n,nnn : **********************************
         nn,nnn : **************************************************
        nnn,nnn : ********************************
      n,nnn,nnn : *******

$ zkl flSzDist.zkl /media/Tunes/
Found 4320 files, 67,627,849,052 bytes, 15,654,594 mean.
      File size   Number of files (* = 69.84)
              n :
             nn :
            nnn :
          n,nnn : *
         nn,nnn :
        nnn,nnn :
      n,nnn,nnn : *
     nn,nnn,nnn : **************************************************
    nnn,nnn,nnn : ********
  n,nnn,nnn,nnn : *