File size distribution: Difference between revisions

m (→‎{{header|Haskell}}: fix non terminating condition)
 
(43 intermediate revisions by 16 users not shown)
Line 2:
 
;Task:
Beginning from the current directory, or optionally from a directory specified as a command-line argument, determine how many files there are of various sizes in a directory hierarchy.
 
 
My suggestion is to sort by logarithm of file size, since a few bytes here or there, or even a factor of two or three, may not be that significant.
 
Don't forget that empty files may exist, to serve as a marker.
 
 
Is your file system predominantly devoted to a large number of smaller files, or a smaller number of huge files?
<br><br>
 
=={{header|Action!}}==
DOS 2.5 returns file size in number of sectors.
{{libheader|Action! Tool Kit}}
<syntaxhighlight lang="action!">INCLUDE "D2:PRINTF.ACT" ;from the Action! Tool Kit
 
; Counts the entries of the directory listing opened with "filter" per
; size class. counts(i) receives the number of entries whose size is
; below limits(i) but not below any earlier limit; "count" is the number
; of elements in both arrays. Entries whose size is not below the last
; limit are not counted at all.
PROC SizeDistribution(CHAR ARRAY filter INT ARRAY limits,counts BYTE count)
CHAR ARRAY line(255),tmp(4)
INT size
BYTE i,dev=[1]

; start with all counters at zero
FOR i=0 TO count-1
DO
counts(i)=0
OD

Close(dev)
; NOTE(review): mode 6 is presumably the directory-listing mode - confirm
Open(dev,filter,6)
DO
InputSD(dev,line)
; line(0) is the length of the line just read; an empty line ends the scan
IF line(0)=0 THEN
EXIT
FI
; the size is taken from the last characters of the listing line
SCopyS(tmp,line,line(0)-3,line(0))
size=ValI(tmp)
; the first limit that exceeds the size selects the class
FOR i=0 TO count-1
DO
IF size<limits(i) THEN
counts(i)==+1
EXIT
FI
OD
OD
Close(dev)
RETURN
 
; Fills limits(0..count-1) with the exclusive upper bounds of the size
; classes: 1, 2, 4, ... doubling each step, with every value above 1000
; replaced by 1000.
PROC GenerateLimits(INT ARRAY limits BYTE count)
BYTE i
INT l

l=1
FOR i=0 TO count-1
DO
limits(i)=l
; double the bound for the next class, capped at 1000
l==LSH 1
IF l>1000 THEN l=1000 FI
OD
RETURN
 
; Prints a horizontal bar for the value "len" scaled so that "max" maps to
; "size" character cells. The bar is computed in quarter-cell units:
; whole cells are printed as character 160, and a remainder of 1, 2 or 3
; quarters as character 22, 25 or 130 respectively.
; NOTE(review): divides by "max" without a guard - confirm callers never
; pass max=0 (PrintResult does when every counter is zero).
PROC PrintBar(INT len,max,size)
INT i,count

; bar length in quarter cells
count=4*len*size/max
; a non-zero value always gets at least one quarter cell
IF count=0 AND len>0 THEN
count=1
FI
FOR i=0 TO count/4-1
DO
Put(160)
OD
; print the partial cell for the remaining quarters
i=count MOD 4
IF i=1 THEN Put(22)
ELSEIF i=2 THEN Put(25)
ELSEIF i=3 THEN Put(130) FI
RETURN
 
; Prints the distribution table: one row per size class with its lower
; bound, upper bound (limits(i)-1), counter, percentage of the total and
; a bar scaled against the largest counter.
; NOTE(review): the percentage divides by "total" without a guard - the
; result for a listing with no counted entries should be confirmed.
PROC PrintResult(CHAR ARRAY filter
INT ARRAY limits,counts BYTE count)

BYTE i
CHAR ARRAY tmp(5)
INT min,max,total

; first pass: sum of all counters and the largest single counter
total=0 max=0
FOR i=0 TO count-1
DO
total==+counts(i)
IF counts(i)>max THEN
max=counts(i)
FI
OD
PrintF("File size distribution of ""%S"" in sectors:%E",filter) PutE()
PrintE("From To Count Perc")
; "min" is the lower bound of the class being printed
min=0
FOR i=0 TO count-1
DO
StrI(min,tmp) PrintF("%4S ",tmp)
StrI(limits(i)-1,tmp) PrintF("%3S ",tmp)
StrI(counts(i),tmp) PrintF("%3S ",tmp)
StrI(counts(i)*100/total,tmp) PrintF("%3S%% ",tmp)
; bar is at most 17 cells wide (for the class holding "max" entries)
PrintBar(counts(i),max,17) PutE()
; the next class starts where this one ends
min=limits(i)
OD
RETURN
 
; Entry point: builds LIMITCOUNT size classes, counts the entries that
; match the filter "H1:*.*" and prints the resulting table.
PROC Main()
DEFINE LIMITCOUNT="11"
CHAR ARRAY filter="H1:*.*"
INT ARRAY limits(LIMITCOUNT),counts(LIMITCOUNT)

Put(125) PutE() ;clear the screen
GenerateLimits(limits,LIMITCOUNT)
SizeDistribution(filter,limits,counts,LIMITCOUNT)
PrintResult(filter,limits,counts,LIMITCOUNT)
RETURN</syntaxhighlight>
{{out}}
[https://gitlab.com/amarok8bit/action-rosetta-code/-/raw/master/images/File_size_distribution.png Screenshot from Atari 8-bit computer]
<pre>
File size distribution of "H1:*.*" in sectors:
 
From To Count Perc
0 0 2 0% ▌
1 1 20 3% █▌
2 3 44 8% ███▌
4 7 195 37% █████████████████
8 15 183 35% ███████████████▌
16 31 67 12% █████▌
32 63 6 1% ▌
64 127 0 0%
128 255 0 0%
256 511 0 0%
512 999 1 0% ▌
</pre>
 
=={{header|Ada}}==
{{libheader|Dir_Iterators}}
<syntaxhighlight lang="ada">with Ada.Numerics.Elementary_Functions;
with Ada.Directories; use Ada.Directories;
with Ada.Strings.Fixed; use Ada.Strings;
with Ada.Command_Line; use Ada.Command_Line;
with Ada.Text_IO; use Ada.Text_IO;
 
with Dir_Iterators.Recursive;
 
procedure File_Size_Distribution is
   --  Walks a directory tree (first command-line argument, or "." when none
   --  is given) and prints how many ordinary files fall into each decimal
   --  size class.
   --
   --  Bucket N counts the files whose size has N decimal digits, i.e. the
   --  sizes that are strictly less than 10**N but not less than 10**(N-1).
   --  Bucket 0 therefore holds exactly the empty files.

   --  19 digits are enough for a 63-bit file size.
   type Exponent_Type is range 0 .. 19;
   type File_Count is range 0 .. Long_Integer'Last;

   Counts         : array (Exponent_Type) of File_Count := (others => 0);
   Non_Zero_Index : Exponent_Type := 0;
   Directory_Name : constant String := (if Argument_Count = 0
                                        then "."
                                        else Argument (1));

   --  Number of decimal digits of Bytes (0 for an empty file), computed with
   --  integer arithmetic so that exact powers of ten are classified
   --  correctly and no floating-point rounding is involved.  The result is
   --  clamped to Exponent_Type'Last.
   function Bucket_Of (Bytes : File_Size) return Exponent_Type is
      Remaining   : File_Size     := Bytes;
      Digit_Count : Exponent_Type := 0;
   begin
      while Remaining > 0 and then Digit_Count < Exponent_Type'Last loop
         Remaining   := Remaining / 10;
         Digit_Count := Digit_Count + 1;
      end loop;
      return Digit_Count;
   end Bucket_Of;

begin
   if not Exists (Directory_Name) or else Kind (Directory_Name) /= Directory then
      Put_Line ("Directory does not exist");
      return;
   end if;

   --  The walker is created only after the directory has been validated.
   declare
      Directory_Walker : Dir_Iterators.Recursive.Recursive_Dir_Walk
        := Dir_Iterators.Recursive.Walk (Directory_Name);
   begin
      for Directory_Entry of Directory_Walker loop
         if Kind (Directory_Entry) = Ordinary_File then
            declare
               Bucket : constant Exponent_Type :=
                 Bucket_Of (Size (Directory_Entry));
            begin
               Counts (Bucket) := Counts (Bucket) + 1;
            end;
         end if;
      end loop;
   end;

   --  Find the largest bucket that is in use so trailing empty rows are
   --  not printed.
   for I in reverse Counts'Range loop
      if Counts (I) /= 0 then
         Non_Zero_Index := I;
         exit;
      end if;
   end loop;

   for I in Counts'First .. Non_Zero_Index loop
      Put ("Less than 10**");
      Put (Fixed.Trim (Exponent_Type'Image (I), Side => Left));
      Put (": ");
      Put (File_Count'Image (Counts (I)));
      New_Line;
   end loop;
end File_Size_Distribution;</syntaxhighlight>
{{out}}
<pre>Less than 10**0: 8
Less than 10**1: 0
Less than 10**2: 18
Less than 10**3: 88
Less than 10**4: 39
Less than 10**5: 8
Less than 10**6: 2
Less than 10**7: 1</pre>
 
=={{header|C}}==
The platform independent way to get the file size in C involves opening every file and reading the size. The implementation below works for Windows and utilizes command scripts to get size information quickly even for a large number of files, recursively traversing a large number of directories. Both textual and graphical ( ASCII ) outputs are shown. The same can be done for Linux by a combination of the find, ls and stat commands and my plan was to make it work on both OS types, but I don't have access to a Linux system right now. This would also mean either abandoning scaling the graphical output in order to fit the console buffer or porting that as well, thus including windows.h selectively.
===Windows===
<syntaxhighlight lang="c">
<lang C>
#include<windows.h>
#include<string.h>
Line 30 ⟶ 224:
double scale;
FILE* fp;
 
if(argC==1)
printf("Usage : %s <followed by directory to start search from(. for current dir), followed by \n optional parameters (T or G) to show text or graph output>",argV[0]);
Line 43 ⟶ 237:
sprintf(commandString,"forfiles /p %s /s /c \"cmd /c echo @fsize\" 2>&1",startPath);
}
 
else if(strlen(argV[1])==1 && argV[1][0]=='.')
strcpy(commandString,"forfiles /s /c \"cmd /c echo @fsize\" 2>&1");
 
else
sprintf(commandString,"forfiles /p %s /s /c \"cmd /c echo @fsize\" 2>&1",argV[1]);
Line 58 ⟶ 252:
fileSizeLog[strlen(str)]++;
}
 
if(argC==2 || (argC==3 && (argV[2][0]=='t'||argV[2][0]=='T'))){
for(i=0;i<MAXORDER;i++){
Line 64 ⟶ 258:
}
}
 
else if(argC==3 && (argV[2][0]=='g'||argV[2][0]=='G')){
CONSOLE_SCREEN_BUFFER_INFO csbi;
Line 72 ⟶ 266:
 
max = fileSizeLog[0];
 
for(i=1;i<MAXORDER;i++)
(fileSizeLog[i]>max)?max=fileSizeLog[i]:max;
 
(max < csbi.dwSize.X)?(scale=1):(scale=(1.0*(csbi.dwSize.X-50))/max);
 
for(i=0;i<MAXORDER;i++){
printf("\nSize Order < 10^%2d bytes |",i);
Line 85 ⟶ 279:
}
}
 
}
return 0;
}
}
</syntaxhighlight>
</lang>
Invocation and textual output :
<pre>
Line 156 ⟶ 350:
{{libheader|POSIX}}
This works on macOS 10.15. It should be OK for Linux as well.
<langsyntaxhighlight lang="c">#include <ftw.h>
#include <locale.h>
#include <stdint.h>
Line 177 ⟶ 371:
total_size += file_size;
size_t index = 0;
for (; index <= nsizes && sizes[index] < file_size; ++index);
++count[index];
} else if (flag == FTW_DNR) {
Line 203 ⟶ 397:
printf("Total file size: %'lu\n", total_size);
return EXIT_SUCCESS;
}</langsyntaxhighlight>
 
{{out}}
Line 223 ⟶ 417:
 
=={{header|C++}}==
<langsyntaxhighlight lang="cpp">#include <algorithm>
#include <array>
#include <filesystem>
Line 274 ⟶ 468:
}
return EXIT_SUCCESS;
}</langsyntaxhighlight>
 
{{out}}
Line 291 ⟶ 485:
Number of files: 7,874
Total file size: 11,963,566,673 bytes
</pre>
=={{header|Delphi}}==
{{libheader| System.SysUtils}}
{{libheader| System.Math}}
{{libheader| Winapi.Windows}}
{{Trans|Go}}
<syntaxhighlight lang="delphi">
program File_size_distribution;
 
{$APPTYPE CONSOLE}
 
uses
System.SysUtils,
System.Math,
Winapi.Windows;
 
// Formats n in decimal with a ',' between every group of three digits,
// counted from the right; a negative value keeps its leading '-'.
function Commatize(n: Int64): string;
var
  digits: string;
  cut: Integer;
begin
  // Work on the bare digit string; the sign is re-attached at the end.
  digits := n.ToString;
  if n < 0 then
    digits := Copy(digits, 2, MaxInt);

  // "cut" is the number of digits left of the separator being inserted.
  cut := Length(digits) - 3;
  while cut >= 1 do
  begin
    Insert(',', digits, cut + 1);
    Dec(cut, 3);
  end;

  if n < 0 then
    Result := '-' + digits
  else
    Result := digits;
end;
 
// Recursively enumerates everything below Root and calls walkFunc once per
// entry (files and directories alike) with the entry's full path and its
// Win32 find record. The pseudo-entries '.' and '..' are skipped.
// Does nothing when walkFunc is not assigned or Root cannot be listed.
procedure Walk(Root: string; walkFunc: TProc<string, TWin32FindData>); overload;
var
  rec: TWin32FindData;
  h: THandle;
  entryName: string;
begin
  if not Assigned(walkFunc) then
    exit;

  Root := IncludeTrailingPathDelimiter(Root);

  h := FindFirstFile(Pchar(Root + '*.*'), rec);
  // Nothing to enumerate (and no handle to close) when the search failed.
  if h = INVALID_HANDLE_VALUE then
    exit;

  try
    repeat
      entryName := rec.cFileName;
      // Only the self/parent links are skipped; ordinary names that merely
      // start with a dot (e.g. ".gitignore") are reported like any other.
      if (entryName = '.') or (entryName = '..') then
        Continue;

      // Report the real path of the entry to the callback.
      walkFunc(Root + entryName, rec);

      if (rec.dwFileAttributes and FILE_ATTRIBUTE_DIRECTORY) =
        FILE_ATTRIBUTE_DIRECTORY then
        Walk(Root + entryName, walkFunc);
    until not FindNextFile(h, rec);
  finally
    // Release the search handle even if the callback raised.
    FindClose(h);
  end;
end;
 
// Walks the tree below root and prints how many entries fall into each
// decimal size class, followed by the entry totals and the summed size.
//
// Bucket i counts the entries whose size has i decimal digits, i.e. sizes
// strictly less than 10^i (bucket 0 holds the zero-length entries, which
// includes directories). Sizes too large for the table are counted in the
// last bucket.
procedure FileSizeDistribution(root: string);
const
  BucketCount = 12;
var
  sizes: TArray<Integer>;
  files, directories, totalSize: UInt64;
  i: Integer;
  c: string;
begin
  SetLength(sizes, BucketCount); // new elements are zero-initialised
  files := 0;
  directories := 0;
  totalSize := 0;

  Walk(root,
    procedure(path: string; info: TWin32FindData)
    var
      size, rest: UInt64;
      index: Integer;
    begin
      inc(files);
      if (info.dwFileAttributes and FILE_ATTRIBUTE_DIRECTORY) =
        FILE_ATTRIBUTE_DIRECTORY then
        inc(directories);

      // Widen the high half BEFORE shifting: a 32-bit "shl 32" would not
      // move the value into the upper 32 bits.
      size := (UInt64(info.nFileSizeHigh) shl 32) or info.nFileSizeLow;
      inc(totalSize, size);

      // Count decimal digits with integer arithmetic (0 digits for size 0),
      // clamped to the last bucket so the index is always valid.
      index := 0;
      rest := size;
      while (rest > 0) and (index < High(sizes)) do
      begin
        rest := rest div 10;
        inc(index);
      end;
      sizes[index] := sizes[index] + 1;
    end);

  writeln('File size distribution for "', root, '" :-'#10);
  for i := 0 to High(sizes) do
  begin
    if i = 0 then
      write(' ')
    else
      write('+ ');
    writeln(format('Files less than 10 ^ %-2d bytes : %5d', [i, sizes[i]]));
  end;
  writeln(' -----');
  writeln('= Total number of files : ', files: 5);
  writeln(' including directories : ', directories: 5);
  c := commatize(totalSize);
  writeln(#10' Total size of files : ', c, ' bytes');
end;
 
begin
  // Start from the directory given on the command line, or from the
  // current directory when no argument is supplied.
  if ParamCount > 0 then
    fileSizeDistribution(ParamStr(1))
  else
    fileSizeDistribution('.');
  readln; // keep the console window open until Enter is pressed
end.</syntaxhighlight>
 
=={{header|Factor}}==
{{works with|Factor|0.99 2020-03-02}}
<syntaxhighlight lang="factor">USING: accessors assocs formatting io io.directories.search
io.files.types io.pathnames kernel math math.functions
math.statistics namespaces sequences ;
 
! Size class of a file size: 0 for a size of zero, otherwise the integer
! part of log10 of the size, plus one.
: classify ( m -- n ) [ 0 ] [ log10 >integer 1 + ] if-zero ;

! Maps every size class found below path to the number of entries in it.
! Entries whose type is +directory+ are left out.
: file-size-histogram ( path -- assoc )
recursive-directory-entries
[ type>> +directory+ = ] reject
[ size>> classify ] map histogram ;

! Print one line per size class for the current directory, then the sum
! of all counts.
current-directory get file-size-histogram dup
[ "Count of files < 10^%d bytes: %4d\n" printf ] assoc-each
nl values sum "Total files: %d\n" printf</syntaxhighlight>
{{out}}
<pre>
Count of files < 10^0 bytes: 20
Count of files < 10^1 bytes: 742
Count of files < 10^2 bytes: 3881
Count of files < 10^3 bytes: 2388
Count of files < 10^4 bytes: 3061
Count of files < 10^5 bytes: 486
Count of files < 10^6 bytes: 78
Count of files < 10^7 bytes: 27
Count of files < 10^8 bytes: 3
Count of files < 10^9 bytes: 1
 
Total files: 10687
</pre>
 
=={{header|Go}}==
{{trans|Kotlin}}
<langsyntaxhighlight lang="go">package main
 
import (
Line 366 ⟶ 705:
func main() {
fileSizeDistribution("./")
}</langsyntaxhighlight>
 
{{out}}
Line 391 ⟶ 730:
</pre>
=={{header|Haskell}}==
<p>
Uses a grouped frequency distribution. Program arguments are optional. Arguments include starting directory and initial frequency distribution group size. Distribution groups of 0 are removed. After the first frequency distribution is computed it further breaks it down for any group that exceeds 25% of the total file count, when possible.
Uses a grouped frequency distribution. Program arguments are optional. Arguments include starting directory and initial frequency distribution group size. After the first frequency distribution is computed it further breaks it down for any group that exceeds 25% of the total file count, when possible.
<lang haskell>{-# LANGUAGE TupleSections, LambdaCase #-}
</p>
<syntaxhighlight lang="haskell">{-# LANGUAGE LambdaCase #-}
 
import Control.Concurrent (forkIO, setNumCapabilities)
import Control.Concurrent.Chan (Chan, newChan, readChan, writeChan, writeList2Chan)
writeChan, writeList2Chan)
import Control.Monad (filterM, join, replicateM, replicateM_, forever, (>=>))
import Data.Char Control.Exception (isDigitIOException, catch)
import Data Control.ListMonad (sortfilterM, genericLengthforever, genericTakejoin, sortBy)
replicateM, replicateM_, (>=>))
import GHC.Conc (getNumProcessors)
import Control.Parallel.Strategies (parTraversable, rseq, using,
import System.Directory (getDirectoryContents, doesFileExist
, doesDirectoryExist, pathIsSymbolicLink withStrategy)
import System Data.EnvironmentChar (getArgsisDigit)
import System.FilePath Data.PosixList (pathSeparatorfind, (</>)sort)
import Systemqualified Data.IO Map.Strict (hFileSize, withFile, IOMode(ReadMode),as FilePath)Map
import Text GHC.PrintfConc (printfgetNumProcessors)
import System.Directory (doesDirectoryExist, doesFileExist,
listDirectory,
pathIsSymbolicLink)
import System.Environment (getArgs)
import System.FilePath.Posix ((</>))
import System.IO (FilePath, IOMode (ReadMode),
hFileSize, hPutStrLn, stderr,
withFile)
import Text.Printf (hPrintf, printf)
 
data Item = File FilePath Integer | Folder FilePath deriving (Show)
| Folder FilePath
deriving (Show)
 
type FrequencyGroupFGKey = ((Integer, Integer), Integer)
type FrequencyGroup = (FGKey, Integer)
 
type FrequencyGroups = Map.Map FGKey Integer
frequencyGroups :: Int -> [Integer] -> [FrequencyGroup]
frequencyGroups totalGroups xs = placeGroups xs groupMinMax
where
range = maximum xs - minimum xs
groupSize = succ $ ceiling $ realToFrac range / realToFrac totalGroups
groups = takeWhile (<=groupSize + maximum xs) $ iterate (+groupSize) 0
groupMinMax = (,0) <$> zip groups (pred <$> tail groups)
 
placeGroups [] gs = gs
placeGroups (d:ds) gs = placeGroups ds $
fmap (\g@((min,max), count) ->
if d >= min && d <= max
then ((min, max), succ count)
else g
) gs
 
newFrequencyGroups :: FrequencyGroups
newFrequencyGroups = Map.empty
 
fileSizes :: [Item] -> [Integer]
fileSizes = foldr f [] where f (File _ n) acc = n:acc
f _ acc = acc
where
f (File _ n) acc = n : acc
f _ acc = acc
 
folders :: [Item] -> [FilePath]
folders = foldr f [] where f (Folder p) acc = p:acc
f _ acc = acc
where
f (Folder p) acc = p:acc
f _ acc = acc
 
paths :: [Item] -> [FilePath]
paths = fmap (\case File p _ -> p
Folder p -> p)
 
totalBytes :: [Item] -> Integer
Line 450 ⟶ 778:
 
counts :: [Item] -> (Integer, Integer)
counts = foldr (\x (a, b) -> case x of File _ _ -> (succ a, b)
counts =
foldr (\x (a, b) -> case x of File _ Folder _ -> (succ a, succ b)) (0, 0)
Folder _ -> (a, succ b)
) (0, 0)
 
-- |Creates 'FrequencyGroups' from the provided size and data set.
groupsFromGroup :: Int -> [Integer] -> FrequencyGroup -> [FrequencyGroup]
frequencyGroups :: Int -- ^ Desired number of frequency groups.
groupsFromGroup gsize fileSizes ((min, max), count) = frequencyGroups gsize range
-> [Integer] -- ^ List of collected file sizes. Must be sorted.
-> FrequencyGroups -- ^ Returns a 'FrequencyGroups' for the file sizes.
frequencyGroups _ [] = newFrequencyGroups
frequencyGroups totalGroups xs
| length xs == 1 = Map.singleton (head xs, head xs) 1
| otherwise = foldr placeGroups newFrequencyGroups xs `using` parTraversable rseq
where
range = maximum xs - minimum xs
collectBetween min max = filter (\n -> n >= min && n <= max)
groupSize = succ $ ceiling $ realToFrac range / realToFrac totalGroups
range = collectBetween min max fileSizes
groups = takeWhile (<=groupSize + maximum xs) $ iterate (+groupSize) 0
groupMinMax = zip groups (pred <$> tail groups)
findGroup n = find (\(low, high) -> n >= low && n <= high)
 
incrementCount (Just n) = Just (succ n) -- Update count for range.
expandGroups :: Int -> [Integer] -> Integer -> [FrequencyGroup] -> [FrequencyGroup]
incrementCount Nothing = Just 1 -- Insert new range with initial count.
expandGroups gsize fileSizes groupThreshold = loop 15
 
placeGroups n fgMap = case findGroup n groupMinMax of
Just k -> Map.alter incrementCount k fgMap
Nothing -> fgMap -- Should never happen.
 
expandGroups :: Int -- ^ Desired number of frequency groups.
-> [Integer] -- ^ List of collected file sizes.
-> Integer -- ^ Computed frequency group limit.
-> FrequencyGroups -- ^ Expanded 'FrequencyGroups'
expandGroups gsize fileSizes groupThreshold
| groupThreshold > 0 = loop 15 $ frequencyGroups gsize sortedFileSizes
| otherwise = frequencyGroups gsize sortedFileSizes
where
sortedFileSizes = sort fileSizes
loop 0 gs = gs -- break out in case we can't go below threshold
loop n gs | all (<= groupThreshold) $ Map.elems gs = gs
| allotherwise ((<= groupThreshold)loop .(pred sndn) gs =(expand gs)
| otherwise = loop (pred n) $ expand gs
 
expand =:: ((\g@((min, max), count)FrequencyGroups -> FrequencyGroups
expand = foldr f . withStrategy (parTraversable rseq) <*>
if count > groupThreshold then
Map.mapWithKey groupsFromGroup . Map.filter (> groupThreshold)
groupsFromGroup gsize fileSizes g
elsewhere
f :: Maybe (FGKey, FrequencyGroups) -- ^ expanded frequency group
[g]
-> FrequencyGroups -- ^ accumulator
) =<<)
-> FrequencyGroups -- ^ merged accumulator
f (Just (k, fg)) acc = Map.union (Map.delete k acc) fg
f Nothing acc = acc
 
groupsFromGroup
:: FGKey -- ^ Group Key
-> Integer -- ^ Count
-> Maybe (FGKey, FrequencyGroups) -- ^ Returns expanded 'FrequencyGroups' with base key it replaces.
groupsFromGroup (min, max) count
| length range > 1 = Just ((min, max), frequencyGroups gsize range)
| otherwise = Nothing
where
range = filter (\n -> n >= min && n <= max) sortedFileSizes
 
displaySize :: Integer -> String
displaySize n
| n <= 2^10 = showprintf n <>"%8dB "B" n
| n >= 2^10 && n <= 2^20 = display (2^10) "KB" $ 2^10
| n >= 2^20 && n <= 2^30 = display (2^20) "MB" $ 2^20
| n >= 2^30 && n <= 2^40 = display (2^30) "GB" $ 2^30
| n >= 2^40 && n <= 2^50 = display (2^40) "TB" $ 2^40
| otherwise = "Too large!"
where
display suffix:: =Double (<-> suffix)String . show . round . (realToFrac n-> /)String
display b = printf "%7.2f%s " (realToFrac n / b)
 
displayFrequency :: Integer -> FrequencyGroup -> IO ()
displayFrequency filesCount ((min, max), count) = do
printf "%s <-> %s" (displaySize min) (displaySize max)
printf "= %-10d %6.3f%%: %-5s\n" count percentage bars
where
percentage :: Double
percentage = (realToFrac count / realToFrac filesCount) * 100
size = round percentage
bars | size == 0 = "▍"
| otherwise = replicate size '█'
 
folderWorker :: Chan FilePath -> Chan [Item] -> IO ()
Line 492 ⟶ 864:
 
collectItems :: FilePath -> IO [Item]
collectItems folderPath = catch tryCollect $ \e -> do
hPrintf stderr "Skipping: %s\n" $ show (e :: IOException)
contents <- fmap (folderPath </>) <$> getDirectoryContents folderPath
pure []
files <- filterM doesFileExist contents
folders <- drop 2 <$> filterM doesDirectoryExist contents
items <- mapM (\f -> File f <$> withFile f ReadMode hFileSize) files
pure $ items <> fmap Folder folders
 
displayFrequency :: Integer -> FrequencyGroup -> IO ()
displayFrequency filesCount ((min, max), count) =
printf "%5s <-> %5s = %5d %6.3f%%: %-5s\n"
(displaySize min)
(displaySize max)
count
percentage
bars
where
tryCollect = (fmap (folderPath </>) <$> listDirectory folderPath) >>=
percentage :: Double
mapM (\p -> doesDirectoryExist p >>=
percentage = (realToFrac count / realToFrac filesCount) * 100
\case True -> pure $ Folder p
bars = replicate (round percentage) '█'
False -> File p <$> withFile p ReadMode hFileSize)
 
parseArgs :: [String] -> Either String (FilePath, Int)
parseArgs (x:y:xs)
| all isDigit y = Right (x, read y)
| otherwise = Left "Invalid frequency group size"
parseArgs (x:xs) = Right (x, 4)
parseArgs _ = Right (".", 4)
 
parallelItemCollector :: FilePath -> IO [Item]
Line 531 ⟶ 885:
loop :: Chan FilePath -> Chan [Item] -> [Item] -> IO [Item]
loop folderChan resultItemsChan xs = do
letregularFolders fs<- filterM (pathIsSymbolicLink >=> (pure . not)) $ folders xs
regularFolders <- filterM (pathIsSymbolicLink >=> (pure . not)) fs
if null regularFolders then pure []
else do
Line 539 ⟶ 892:
result <- mapM (loop folderChan resultItemsChan) childItems
pure (join childItems <> join result)
 
parseArgs :: [String] -> Either String (FilePath, Int)
parseArgs (x:y:xs)
| all isDigit y = Right (x, read y)
| otherwise = Left "Invalid frequency group size"
parseArgs (x:xs) = Right (x, 4)
parseArgs _ = Right (".", 4)
 
main :: IO ()
main = parseArgs <$> getArgs >>= \case
main = do
Left errorMessage -> hPutStrLn stderr errorMessage
args <- getArgs
case parseArgs args of
Left errorMessage -> putStrLn errorMessage
Right (path, groupSize) -> do
items <- parallelItemCollector path
-- mapM_ putStrLn $ paths items
let (fileCount, folderCount) = counts items
printf "Total files: %d\nTotal folders: %d\n" fileCount folderCount
printf "Total folders: %d\n" folderCount
printf "Total size: %s\n" $ displaySize $ totalBytes items
putStrLnprintf "\nDistribution:\n\n%9s <-> %9s %7s\n" "From" "To" "Count"
putStrLn $ replicate 46 '-'
let results = expandedGroups groupSize (sizes items) (groupThreshold fileCount) items
let results = expandGroups groupSize (fileSizes items) (groupThreshold fileCount)
mapM_ (displayFrequency fileCount) results
mapM_ (displayFrequency fileCount) $ Map.assocs results
where
groupThreshold = round . (*0.25) . realToFrac</syntaxhighlight>
sizes = sort . fileSizes
initialGroups n = filter ((>0) . snd) . frequencyGroups n . sizes
groupThreshold = round . (*0.25) . realToFrac
expandedGroups gsize sizes n = filter ((>0) . snd)
. expandGroups gsize sizes n
. initialGroups gsize</lang>
{{out}}
<pre style="height: 50rem;">$ filedist ~/Music
Line 567 ⟶ 919:
Total files: 688
Total folders: 663
Total size: 986MB 985.85MB
 
Distribution:
 
0B <-> 80B = 7 1.017%: █
81B From <-> 161B = 74 10.756%:To ███████████Count
----------------------------------------------
162B <-> 242B = 112 16.279%: ████████████████
0B <-> 80B = 7 1.017%: █
243B <-> 323B = 99 14.390%: ██████████████
322B 81B <-> 643B 161B = 74 23 3 10.343756%: ███ ███████████
162B <-> 242B = 112 16.279%: ████████████████
644B <-> 965B = 2 0.291%:
966B 243B <-> 1KB = 323B = 99 1 0.145%: 14.390%: ██████████████
3KB 323B <-> 6KB 645B = 23 12 1.744%: ██ 3.343%: ███
6KB 646B <-> 10KB 968B = 2 22 3 0.198291%: ███
10KB 969B <-> 13KB 1.26KB = 1 12 1.744%: ██ 0.145%: ▍
14KB 3.19KB <-> 27KB 6.38KB = 12 15 2 1.180744%: ██
27KB 6.38KB <-> 41KB 9.58KB = 22 6 0.872%: 3.198%: ███
41KB 9.58KB <-> 54KB 12.77KB = 12 22 3 1.198744%: ███ ██
13.52KB <-> 27.04KB = 15 2.180%: ██
54KB <-> 108KB = 99 14.390%: ██████████████
108KB 27.04KB <-> 163KB 40.57KB = 6 23 3 0.343872%: ███
163KB 40.57KB <-> 217KB 54.09KB = 22 8 1.163%: 3.198%: ███
54.20KB <-> 108.41KB = 99 14.390%: ██████████████
236KB <-> 473KB = 3 0.436%:
709KB 108.41KB <-> 946KB 162.61KB = 23 44 63.395343%: █████████
162.61KB 3MB <-> 5MB216.81KB = 8 4 0.581%: 1.163%: █
236.46KB 5MB <-> 7MB472.93KB = 3 21 3 0.052436%: ███
709.39KB 7MB <-> 13MB945.85KB = 44 72 106.465395%: ████████████████
13MB 3.30MB <-> 20MB 4.96MB = 4 6 0.872581%: █
20MB 4.96MB <-> 27MB 6.61MB = 21 1 03.145052%: ███
6.67MB <-> 13.33MB = 72 10.465%: ██████████
13.33MB <-> 20.00MB = 6 0.872%: █
20.00MB <-> 26.66MB = 1 0.145%: ▍
 
$ filedist ~/Music 10
Line 598 ⟶ 953:
Total files: 688
Total folders: 663
Total size: 986MB 985.85MB
 
Distribution:
 
0B <-> 88B = 7 1.017%: █
89B From <-> 177B = 75 10.901%:To ███████████Count
----------------------------------------------
178B <-> 266B = 156 22.674%: ███████████████████████
267B 0B <-> 355B 88B = 7 57 81.285017%: ████████
356B 89B <-> 444B 177B = 75 20 2 10.907901%: ███ ███████████
178B <-> 266B = 156 22.674%: ███████████████████████
801B <-> 889B = 2 0.291%:
959B 267B <-> 2KB = 355B = 57 1 0.145%: 8.285%: ████████
4KB 356B <-> 5KB = 444B = 20 1 0.145%: 2.907%: ███
5KB 801B <-> 6KB 889B = 2 1 0.145%: 0.291%:
6KB 959B <-> 7KB 1.87KB = 1 11 1.599%: ██ 0.145%: ▍
7KB 3.75KB <-> 7KB 4.68KB = 1 10 1.453%: 0.145%: ▍
7KB 4.68KB <-> 8KB 5.62KB = 1 4 0.581%: 0.145%: ▍
8KB 5.62KB <-> 9KB 6.55KB = 11 7 1.017%: 1.599%: ██
9KB 6.56KB <-> 19KB 7.49KB = 10 21 3 1.052453%: ███
19KB 7.49KB <-> 28KB 8.43KB = 4 6 0.872581%: █
28KB 8.43KB <-> 38KB 9.36KB = 7 4 0.581%: 1.017%: █
38KB 9.43KB <-> 47KB 18.85KB = 21 12 1.744%: ██ 3.052%: ███
47KB 18.85KB <-> 57KB 28.28KB = 6 16 2.326%: ██ 0.872%: █
57KB 28.28KB <-> 66KB 37.71KB = 4 23 3 0.343581%: ███
66KB 37.71KB <-> 75KB 47.13KB = 12 26 3 1.779744%: ████ ██
75KB 47.13KB <-> 85KB 56.56KB = 16 15 2.180326%: ██
85KB 56.56KB <-> 94KB 65.99KB = 23 17 2.471%: ██ 3.343%: ███
95KB 65.99KB <-> 189KB 75.41KB = 26 42 63.105779%: ██████████
189KB 75.41KB <-> 284KB 84.84KB = 15 4 0.581%: 2.180%: ██
284KB 84.84KB <-> 378KB = 94.27KB = 17 2 0.291%: 2.471%: ██
851KB 94.59KB <-> 946KB 189.17KB = 42 44 6.395105%: ██████
189.17KB 3MB <-> 5MB283.76KB = 4 5 0.727581%: █
283.76KB 5MB <-> 8MB378.35KB = 2 41 50.959291%: ██████
851.28KB 8MB <-> 11MB945.87KB = 44 35 56.087395%: ███████████
11MB 2.67MB <-> 13MB 5.33MB = 5 16 2.326%: ██ 0.727%: █
13MB 5.33MB <-> 16MB = 8.00MB = 41 3 0.436%: 5.959%: ██████
16MB 8.00MB <-> 19MB 10.67MB = 35 3 0.436%: 5.087%: █████
24MB 10.67MB <-> 27MB 13.33MB = 16 1 0 2.145326%: ██
13.33MB <-> 16.00MB = 3 0.436%: ▍
16.00MB <-> 18.67MB = 3 0.436%: ▍
24.00MB <-> 26.66MB = 1 0.145%: ▍
</pre>
 
=={{header|J}}==
 
We can get file sizes of all files under a specific path by inspecting the last column from dirtree. For example, the sizes of the files under the user's home directory would be <tt>;{:|:dirtree '~'</tt>
 
From there, we can bucket them by factors of ten, then display the limiting size of each bucket along with the number of files contained (we'll sort them, for legibility):
 
<syntaxhighlight lang="j"> ((10x^~.),.#/.~) <.10 ^.1>. /:~;{:|:dirtree '~' NB. sorted sizes -> floor of log10 of (size max 1); rows: 10^bucket , count
1 2
10 8
100 37
1000 49
10000 20
100000 9
1000000 4
10000000 4</syntaxhighlight>
 
=={{header|Java}}==
<syntaxhighlight lang="java">
 
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.stream.Stream;
 
public final class FileSizeDistribution {
 
public static void main(String[] aArgs) throws IOException {
List<Path> fileNames = Files.list(Path.of("."))
.filter( file -> ! Files.isDirectory(file) )
.map(Path::getFileName)
.toList();
Map<Integer, Integer> fileSizes = new HashMap<Integer, Integer>();
for ( Path path : fileNames ) {
fileSizes.merge(String.valueOf(Files.size(path)).length(), 1, Integer::sum);
}
final int fileCount = fileSizes.values().stream().mapToInt(Integer::valueOf).sum();
System.out.println("File size distribution for directory \".\":" + System.lineSeparator());
System.out.println("File size in bytes | Number of files | Percentage");
System.out.println("-------------------------------------------------");
for ( int key : fileSizes.keySet() ) {
final int value = fileSizes.get(key);
System.out.println(String.format("%s%d%s%d%15d%15.1f%%",
" 10^", ( key - 1 ), " to 10^", key, value, ( 100.0 * value ) / fileCount));
}
}
 
}
</syntaxhighlight>
{{ out }}
<pre>
File size distribution for directory ".":
 
File size in bytes | Number of files | Percentage
-------------------------------------------------
10^0 to 10^1 1 0.2%
10^1 to 10^2 1 0.2%
10^2 to 10^3 5 1.1%
10^3 to 10^4 3 0.6%
10^4 to 10^5 161 34.0%
10^5 to 10^6 196 41.4%
10^6 to 10^7 98 20.7%
10^7 to 10^8 9 1.9%
</pre>
 
=={{header|jq}}==
'''Works with jq, the C implementation of jq'''
 
'''Works with gojq, the Go implementation of jq'''
 
'''Works with jaq, the Rust implementation of jq'''
 
This entry illustrates how jq plays nicely with other command-line
tools; in this case jc (https://kellyjonbrazil.github.io/jc) is used to JSONify the output of `ls -Rl`.
 
(jq could also be used to parse the raw output of `ls`, but it would no doubt
be tricky to achieve portability.)
 
The invocation of jc and jq would be along the following lines:
<pre>
jc --ls -lR | jq -c -f file-size-distribution.jq
</pre>
 
In the present case, the output from the call to `histogram` is a stream of [category, count] pairs
beginning with [0, _] showing the number of files of size 0; thereafter, the boundaries
of the categories are defined logarithmically, i.e. a file of size of $n is assigned to
the category `1 + ($n | log10 | trunc)`.
 
The output shown below for an actual directory tree suggests a
unimodal distribution of file sizes.
 
<syntaxhighlight lang="jq">
# bag of words
# bag of words: an object whose keys are the items of `stream` (converted
# to strings) and whose values are the number of times each item occurred.
def bow(stream):
  reduce stream as $item ({};
    ($item | tostring) as $key
    | .[$key] = (.[$key] + 1));

# `stream` is expected to be a stream of non-negative numbers or numeric strings.
# Emits one [bucket, count] pair per distinct value, ordered by the numeric
# value of `bucket`. Only these pairs are sorted, never the input stream.
def histogram(stream):
  [ bow(stream)
    | to_entries[]
    | [(.key | tonumber), .value] ]
  | sort_by(.[0])[];
 
# For every element of the input array, map its .size to a category
# (0 for size 0, otherwise 1 + the truncated log10 of the size) and emit
# the histogram of those categories.
# NOTE(review): every input element is counted, whatever kind of directory
# entry it describes - confirm whether non-file entries should be filtered out.
histogram(.[] | .size | if . == 0 then 0 else 1 + (log10 | trunc) end)
</syntaxhighlight>
{{output}}
<pre>
[0,9]
[1,67]
[2,616]
[3,6239]
[4,3679]
[5,213]
[6,56]
[7,40]
[8,20]
[9,4]
[10,1]
</pre>
 
Line 639 ⟶ 1,124:
{{works with|Julia|0.6}}
 
<langsyntaxhighlight lang="julia">using Humanize
 
function sizelist(path::AbstractString)
Line 665 ⟶ 1,150:
end
 
main(".")</langsyntaxhighlight>
 
{{out}}
<pre>filesizes:
- between 0.0 B and 1.0 B bytes: 0
- between 1.0 B and 10.0 B bytes: 1
Line 683 ⟶ 1,168:
 
=={{header|Kotlin}}==
<langsyntaxhighlight lang="scala">// version 1.2.10
 
import java.io.File
Line 730 ⟶ 1,215:
fun main(args: Array<String>) {
fileSizeDistribution("./") // current directory
}</langsyntaxhighlight>
 
{{out}}
Line 755 ⟶ 1,240:
Number of inaccessible files : 0
</pre>
 
=={{header|Lang}}==
{{libheader|lang-io-module}}
<syntaxhighlight lang="lang">
# Load the IO module
# Replace "<pathToIO.lm>" with the location where the io.lm Lang module was installed to without "<" and ">"
ln.loadModule(<pathToIO.lm>)
 
 
# Recursively accumulates the size distribution of $file.
# For a directory, every listed entry is opened, processed recursively and closed again,
# and the entry's total is added to the caller's total.
# For anything else, the size is added to the total and the matching element of &sizes is incremented:
# element 0 for a size of 0, otherwise the element at the integer part of log10 of the size.
# NOTE(review): $[totalSize] / $*totalSize presumably pass and update the total by reference - confirm
fp.fileSizeDistribution = (&sizes, $[totalSize], $file) -> {
if([[io]]::fp.isDirectory($file)) {
&fileNames = [[io]]::fp.listFilesAndDirectories($file)
$path = [[io]]::fp.getCanonicalPath($file)
# Use an empty prefix for the path "/" so that the "$path/$fileName" below does not start with two slashes
if($path == /) {
$path = \e
}
$fileName
foreach($[fileName], &fileNames) {
$innerFile = [[io]]::fp.openFile($path/$fileName)
$innerTotalSize = 0L
fp.fileSizeDistribution(&sizes, $innerTotalSize, $innerFile)
$*totalSize += $innerTotalSize
[[io]]::fp.closeFile($innerFile)
}
}else {
$len = [[io]]::fp.getSize($file)
# Entries without a size are ignored
if($len == null) {
return
}
$*totalSize += $len
if($len == 0) {
&sizes[0] += 1
}else {
# NOTE(review): the index is not checked against the length of &sizes - confirm behavior for very large files
$index = fn.int(fn.log10($len))
&sizes[$index] += 1
}
}
}
 
# Use the single command-line argument as the start path if exactly one was given, "./" otherwise
$path $= @&LANG_ARGS == 1?&LANG_ARGS[0]:{{{./}}}

# 12 counters, all starting at 0
&sizes = fn.arrayMake(12)
fn.arraySetAll(&sizes, 0)

$file = [[io]]::fp.openFile($path)

$totalSize = 0L

fp.fileSizeDistribution(&sizes, $totalSize, $file)

[[io]]::fp.closeFile($file)

# Print one line per counter, followed by the sum of all counters and the accumulated size
fn.println(File size distribution for "$path":)
$i
repeat($[i], @&sizes) {
fn.printf(10 ^% 3d bytes: %d%n, $i, parser.op(&sizes[$i]))
}
fn.println(Number of files: fn.arrayReduce(&sizes, 0, fn.add))
fn.println(Total file size: $totalSize)
</syntaxhighlight>
 
=={{header|Mathematica}} / {{header|Wolfram Language}}==
<syntaxhighlight lang="mathematica">SetDirectory[NotebookDirectory[]];
(* Collect every name below the current directory at any depth (third argument Infinity),
   drop the directories, and plot the byte counts on logarithmic bins and a logarithmic count axis. *)
Histogram[FileByteCount /@ Select[FileNames[__, Directory[], Infinity], DirectoryQ /* Not], {"Log", 15}, {"Log", "Count"}]</syntaxhighlight>
 
=={{header|Nim}}==
<syntaxhighlight lang="nim">import math, os, strformat
 
const
  MaxPower = 10
  Powers = [1, 10, 100]

func powerWithUnit(idx: int): string =
  ## Return a string representing value 10^idx with a unit
  ## ("B", "kB", "MB" or "GB"); a negative `idx` yields "0B".
  if idx < 0:
    return "0B"
  # Each unit covers three consecutive powers of ten.
  const Units = ["B", "kB", "MB"]
  let group = idx div 3
  if group < Units.len:
    $Powers[idx mod 3] & Units[group]
  else:
    # Everything from 10^9 upwards is expressed in GB.
    $Powers[idx - 9] & "GB"
 
 
# Retrieve the directory path.
var dirpath: string
if paramCount() == 0:
  dirpath = getCurrentDir()
else:
  dirpath = paramStr(1)
  if not dirExists(dirpath):
    raise newException(ValueError, "wrong directory path: " & dirpath)

func sizeIndex(size: BiggestInt): int =
  ## Return the bucket of a file size: -1 for an empty file, otherwise
  ## floor(log10(size)), clamped to `MaxPower`.
  ## Integer arithmetic is used on purpose: `toInt` on the float logarithm
  ## rounds to nearest and would put e.g. a 5-byte file into the
  ## 10B..100B bucket.
  if size <= 0:
    return -1
  var rest = size
  while rest >= 10 and result < MaxPower:
    rest = rest div 10
    inc result

# Distribute sizes.
var counts: array[-1..MaxPower, Natural]
for path in dirpath.walkDirRec():
  if not path.fileExists():
    continue    # Not a regular file.
  # Files too large for the table are counted in the last bucket.
  inc counts[sizeIndex(getFileSize(path))]

# Display distribution.
let total = sum(counts)
echo "File size distribution for directory: ", dirpath
echo ""
for idx, count in counts:
  let rangeString = fmt"[{powerWithUnit(idx)}..{powerWithUnit(idx + 1)}[:"
  # Avoid printing "nan" when no file was found.
  let percent = if total == 0: 0.0 else: 100 * count / total
  echo fmt"Size in {rangeString: 14} {count:>7} {percent:5.2f}%"
echo ""
echo "Total number of files: ", total</syntaxhighlight>
 
{{out}}
<pre>File size distribution for directory: /home/xxx
 
Size in [0B..1B[: 2782 1.28%
Size in [1B..10B[: 145 0.07%
Size in [10B..100B[: 2828 1.30%
Size in [100B..1kB[: 20781 9.55%
Size in [1kB..10kB[: 85469 39.29%
Size in [10kB..100kB[: 86594 39.81%
Size in [100kB..1MB[: 16629 7.64%
Size in [1MB..10MB[: 2053 0.94%
Size in [10MB..100MB[: 221 0.10%
Size in [100MB..1GB[: 38 0.02%
Size in [1GB..10GB[: 0 0.00%
Size in [10GB..100GB[: 0 0.00%
 
Total number of files: 217540</pre>
 
=={{header|Perl}}==
{{trans|Raku}}
<syntaxhighlight lang="perl">use File::Find;
use List::Util qw(max);
 
Line 785 ⟶ 1,406:
 
# Tally the entry named by $_ into %fsize, keyed by log10() of its size
# (element 7 of the lstat list). lstat does not follow symbolic links.
sub fsize { $fsize{ log10( (lstat($_))[7] ) }++ }
# Integer base-10 logarithm of a non-negative size; 0 (or undef) maps to 0.
# The decimal digits are counted instead of computing int(log($s)/log(10)),
# because the floating-point quotient for 1000 is 2.9999999999999996 and would
# truncate to 2, putting exact powers of ten into the bucket below.
sub log10 { my($s) = @_; $s ? length(int($s)) - 1 : 0 }</syntaxhighlight>
{{out}}
<pre>File size distribution in bytes for directory: .
Line 799 ⟶ 1,420:
=={{header|Phix}}==
Works on Windows and Linux. Uses "proper" sizes, ie 1MB==1024KB. Can be quite slow at first, but is pretty fast on the second and subsequent runs, that is once the OS has cached its (low-level) directory reads.
<!--<syntaxhighlight lang="phix">(notonline)-->
<lang Phix>sequence sizes = {1},
<span style="color: #008080;">without</span> <span style="color: #008080;">js</span> <span style="color: #000080;font-style:italic;">-- file i/o</span>
res = {0}
<span style="color: #004080;">sequence</span> <span style="color: #000000;">sizes</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">1</span><span style="color: #0000FF;">},</span>
atom t1 = time()+1
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">0</span><span style="color: #0000FF;">}</span>
 
<span style="color: #004080;">atom</span> <span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span>
function store_res(string filepath, sequence dir_entry)
if not find('d', dir_entry[D_ATTRIBUTES]) then
<span style="color: #008080;">function</span> <span style="color: #000000;">store_res</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">filepath</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">)</span>
atom size = dir_entry[D_SIZE]
<span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #008000;">'d'</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">[</span><span style="color: #004600;">D_ATTRIBUTES</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span>
integer sdx = 1
<span style="color: #004080;">atom</span> <span style="color: #000000;">size</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">[</span><span style="color: #004600;">D_SIZE</span><span style="color: #0000FF;">]</span>
while size>sizes[sdx] do
<span style="color: #004080;">integer</span> <span style="color: #000000;">sdx</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span>
if sdx=length(sizes) then
<span style="color: #008080;">while</span> <span style="color: #000000;">size</span><span style="color: #0000FF;">></span><span style="color: #000000;">sizes</span><span style="color: #0000FF;">[</span><span style="color: #000000;">sdx</span><span style="color: #0000FF;">]</span> <span style="color: #008080;">do</span>
sizes &= sizes[$]*iff(mod(length(sizes),3)?10:10.24)
<span style="color: #008080;">if</span> <span style="color: #000000;">sdx</span><span style="color: #0000FF;">=</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sizes</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
res &= 0
<span style="color: #000000;">sizes</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">sizes</span><span style="color: #0000FF;">[$]*</span><span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">mod</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sizes</span><span style="color: #0000FF;">),</span><span style="color: #000000;">3</span><span style="color: #0000FF;">)?</span><span style="color: #000000;">10</span><span style="color: #0000FF;">:</span><span style="color: #000000;">10.24</span><span style="color: #0000FF;">)</span>
end if
<span style="color: #000000;">res</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">0</span>
sdx += 1
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end while
<span style="color: #000000;">sdx</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
res[sdx] += 1
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
if time()>t1 then
<span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">sdx</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
printf(1,"%,d files found\r",sum(res))
<span style="color: #008080;">if</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()></span><span style="color: #000000;">t1</span> <span style="color: #008080;">then</span>
t1 = time()+1
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%,d files found\r"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">sum</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">))</span>
end if
<span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span>
end if
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
return 0 -- keep going
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end function
<span style="color: #008080;">return</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- keep going</span>
integer exit_code = walk_dir(".", routine_id("store_res"), true)
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
 
<span style="color: #004080;">integer</span> <span style="color: #000000;">exit_code</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">walk_dir</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"."</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">store_res</span><span style="color: #0000FF;">,</span> <span style="color: #004600;">true</span><span style="color: #0000FF;">)</span>
printf(1,"%,d files found\n",sum(res))
integer w = max(res)
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%,d files found\n"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">sum</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">))</span>
include builtins/pfile.e
<span style="color: #004080;">integer</span> <span style="color: #000000;">w</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">max</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span>
for i=1 to length(res) do
<span style="color: #000080;font-style:italic;">--include builtins/pfile.e</span>
integer ri = res[i]
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
string s = file_size_k(sizes[i], 5),
<span style="color: #004080;">integer</span> <span style="color: #000000;">ri</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span>
p = repeat('*',floor(60*ri/w))
<span style="color: #004080;">string</span> <span style="color: #000000;">s</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">file_size_k</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sizes</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">5</span><span style="color: #0000FF;">),</span>
printf(1,"files < %s: %s%,d\n",{s,p,ri})
<span style="color: #000000;">p</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(</span><span style="color: #008000;">'*'</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">floor</span><span style="color: #0000FF;">(</span><span style="color: #000000;">60</span><span style="color: #0000FF;">*</span><span style="color: #000000;">ri</span><span style="color: #0000FF;">/</span><span style="color: #000000;">w</span><span style="color: #0000FF;">))</span>
end for</lang>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"files &lt; %s: %s%,d\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">s</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p</span><span style="color: #0000FF;">,</span><span style="color: #000000;">ri</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<!--</syntaxhighlight>-->
{{out}}
<pre>
Line 852 ⟶ 1,476:
The distribution is stored in a '''collections.Counter''' object (like a dictionary with automatic 0 value when a key is not found, useful when incrementing). Anything could be done with this object, here the number of files is printed for increasing sizes. No check is made during the directory walk: usually, safeguards would be needed or the program will fail on any unreadable file or directory (depending on rights, or too deep paths, for instance). Here links are skipped, so it should avoid cycles.
 
<syntaxhighlight lang="python">import sys, os
from collections import Counter
 
Line 875 ⟶ 1,499:
for dir in arg:
dodir(dir)
 
s = n = 0
for k, v in sorted(h.items()):
Line 883 ⟶ 1,507:
print("Total %d bytes for %d files" % (s, n))
 
main(sys.argv[1:])</syntaxhighlight>
 
=={{header|Racket}}==
 
<syntaxhighlight lang="racket">#lang racket
 
(define (file-size-distribution (d (current-directory)) #:size-group-function (sgf values))
Line 914 ⟶ 1,538:
(module+ test
(call-with-values (λ () (file-size-distribution #:size-group-function log10-or-so))
(report-fsd log10-or-so)))</syntaxhighlight>
 
{{out}}
Line 932 ⟶ 1,556:
By default, process the current and all readable sub-directories, or, pass in a directory path at the command line.
 
<syntaxhighlight lang="raku" line>sub MAIN($dir = '.') {
sub log10 (Int $s) { $s ?? $s.log(10).Int !! 0 }
my %fsize;
Line 957 ⟶ 1,581:
my ($end, $bar) = $scaled.polymod(8);
(@blocks[8] x $bar * 8) ~ (@blocks[$end] if $end) ~ "\n"
}</syntaxhighlight>
 
{{out}}
Line 985 ⟶ 1,609:
 
=={{header|REXX}}==
This REXX version works for Microsoft Windows using the &nbsp; '''dir''' &nbsp; subcommand; &nbsp; extra code was added for
<br>older versions of Windows that used suffixes to express big numbers &nbsp; (the size of a file), &nbsp; and also versions
<br>that used a mixed case for showing the output text.
 
Also, some Windows versions of the &nbsp; '''dir''' &nbsp; command insert commas into numbers, so code was added to elide them.
<syntaxhighlight lang="rexx">/*REXX program displays a histogram of filesize distribution of a directory structure(s)*/
numeric digits 30 /*ensure enough decimal digits for a #.*/
parse arg ds . /*obtain optional argument from the CL.*/
Line 1,035 ⟶ 1,659:
exit /*stick a fork in it, we're all done. */
/*──────────────────────────────────────────────────────────────────────────────────────*/
commas: parse arg _; do j#=length(_)-3 to 1 by -3; _=insert(',', _, j#); end; return _</syntaxhighlight>
This REXX program makes use of &nbsp; '''LINESIZE''' &nbsp; REXX program (or BIF) which is used to determine the screen width (or linesize) of the terminal (console) so as to maximize the width of the histogram.
 
The &nbsp; '''LINESIZE.REX''' &nbsp; REXX program is included here &nbsp; ──► &nbsp; [[LINESIZE.REX]].<br>
Line 1,100 ⟶ 1,724:
{{libheader|walkdir}}
{{works with|Rust|2018}}
<syntaxhighlight lang="rust">
use std::error::Error;
use std::marker::PhantomData;
Line 1,277 ⟶ 1,901:
}
}
</syntaxhighlight>
{{out}}
<pre>
Line 1,297 ⟶ 1,921:
 
=={{header|Sidef}}==
<syntaxhighlight lang="ruby">func traverse(Block callback, Dir dir) {
dir.open(\var dir_h) || return nil
 
Line 1,325 ⟶ 1,949:
}
 
say "Total: #{total_size} bytes in #{files_num} files"</syntaxhighlight>
{{out}}
<pre>
Line 1,338 ⟶ 1,962:
log10(size) ~~ 8 -> 2 files
Total: 370026462 bytes in 2650 files
</pre>
 
=={{header|Tcl}}==
This uses the '''fileutil::traverse''' package from Tcllib to do the tree walking; a '''glob'''-based alternative that ignores links but not hidden files is possible, but would add a dozen lines.
<syntaxhighlight lang="tcl">package require fileutil::traverse
namespace path {::tcl::mathfunc ::tcl::mathop}
 
# Ternary helper: evaluate the expression `test` in the caller's context and
# return the result of running [subst] on `a` when it is true, on `b` otherwise.
proc ? {test a b} {
    tailcall if $test [list subst $a] else [list subst $b]
}
 
# Root of the walk: first command-line argument when present, else ".".
set dir [? {$argc} {[lindex $argv 0]} .]

# Start from an empty histogram so that a tree without any regular file still
# reaches the report loop instead of failing on an unset variable.
set hist [dict create]

# Do not descend through links; report regular files only.
fileutil::traverse Tobj $dir \
    -prefilter {apply {path {ne [file type $path] link}}} \
    -filter {apply {path {eq [file type $path] file}}}
Tobj foreach path {
    set size [file size $path]
    # Key is the integer part of log10(size); empty files get the key -1.
    dict incr hist [? {$size} {[int [log10 $size]]} -1]
}
Tobj destroy

# One line per occupied bucket, in increasing order of magnitude.
foreach key [lsort -int [dict keys $hist]] {
    puts "[? {$key == -1} 0 {1e$key}]\t[dict get $hist $key]"
}</syntaxhighlight>
{{out}}
<pre>0 1
1e1 339
1e2 3142
1e3 2015
1e4 150
1e5 29
1e6 13
1e7 3</pre>
 
=={{header|UNIX Shell}}==
{{works with|Bourne Shell}}
Use POSIX conformant code unless the environment variable GNU is set to anything not empty.
<syntaxhighlight lang="sh">#!/bin/sh
# Print a histogram of file sizes, grouped by decimal order of magnitude, for
# the tree rooted at $1 (default: current directory).
# Set the environment variable GNU to any non-empty value to use "du -b".
set -eu

# Tab stops are cosmetic only; do not let "set -e" abort when they cannot be set.
tabs -8 || :
if [ -n "${GNU:-}" ]
then
    find -- "${1:-.}" -type f -exec du -b -- {} +
else
    # wc appends a "total" line only when it is given several files, so the
    # last line is removed only in that case; a batch of one file is kept.
    find -- "${1:-.}" -type f -exec sh -c '
        if [ "$#" -gt 1 ]
        then
            wc -c -- "$@" | sed \$d
        else
            wc -c -- "$@"
        fi' argv0 {} +
fi | awk -v OFS='\t' '
BEGIN {split("KB MB GB TB PB", u); u[0] = "B"}
{
    # Bucket = number of digits - 1, i.e. floor(log10(size)); empty files use -1.
    ++hist[$1 ? length($1) - 1 : -1]
    total += $1
}
END {
    max = -2
    # Array subscripts are strings: add 0 to force a numeric comparison,
    # otherwise "10" sorts below "9" and the largest buckets are lost.
    for (i in hist)
        if (i + 0 > max)
            max = i + 0

    print "From", "To", "Count\n"
    for (i = -1; i <= max; ++i)
    {
        if (i in hist)
        {
            if (i == -1)
                print "0B", "0B", hist[i]
            else
                print 10 ^ (i % 3) u[int(i / 3)],
                    10 ^ ((i + 1) % 3) u[int((i + 1) / 3)],
                    hist[i]
        }
    }
    # Scale the total by the power of 1000 that matches the printed unit.
    l = length(total) - 1
    g = int(l / 3)
    if (g > 5)
        g = 5
    printf "\nTotal: %.1f %s in %d files\n",
        total / (10 ^ (3 * g)), u[g], NR
}'</syntaxhighlight>
{{out}}
<pre>$ time ~/fsd.sh
From To Count
 
0B 0B 13
1B 10B 74
10B 100B 269
100B 1KB 5894
1KB 10KB 12727
10KB 100KB 12755
100KB 1MB 110922
1MB 10MB 50019
10MB 100MB 17706
100MB 1GB 5056
1GB 10GB 1139
10GB 100GB 141
100GB 1TB 1
 
Total: 8.9 TB in 216716 files
~/fsd.sh 1.28s user 2.55s system 134% cpu 2.842 total
$ time GNU=1 ~/fsd.sh
From To Count
 
0B 0B 13
1B 10B 74
10B 100B 269
100B 1KB 5894
1KB 10KB 12727
10KB 100KB 12755
100KB 1MB 110922
1MB 10MB 50019
10MB 100MB 17706
100MB 1GB 5056
1GB 10GB 1139
10GB 100GB 141
100GB 1TB 1
 
Total: 8.9 TB in 216716 files
GNU=1 ~/fsd.sh 0.81s user 1.33s system 135% cpu 1.586 total</pre>
 
=={{header|Wren}}==
{{libheader|Wren-math}}
{{libheader|Wren-fmt}}
<syntaxhighlight lang="wren">import "io" for Directory, File, Stat
import "os" for Process
import "./math" for Math
import "./fmt" for Fmt
 
// Tallies filled in by fileSizeDist.
// sizes[0] counts empty files; sizes[k] (k >= 1) counts files smaller than
// 10^k bytes that did not fit a lower slot.
var sizes = List.filled(12, 0)
var totalSize = 0
var numFiles = 0
var numDirs = 0

// Walks the directory 'path' recursively and updates the tallies above for
// every regular file and sub-directory found.
var fileSizeDist // recursive function
fileSizeDist = Fn.new { |path|
    var files = Directory.list(path)
    for (file in files) {
        var path2 = "%(path)/%(file)"
        var stat = Stat.path(path2)
        if (stat.isFile) {
            numFiles = numFiles + 1
            var size = stat.size
            if (size == 0) {
                sizes[0] = sizes[0] + 1
            } else {
                totalSize = totalSize + size
                // Slot = number of decimal digits, found by integer division so
                // that exact powers of ten cannot be misplaced by float rounding.
                // The slot is capped at the last one so that a file of 10^11
                // bytes or more cannot index past the end of the list.
                var index = 1
                var rest = size
                while (rest >= 10 && index < sizes.count - 1) {
                    rest = (rest / 10).floor
                    index = index + 1
                }
                sizes[index] = sizes[index] + 1
            }
        } else if (stat.isDirectory) {
            numDirs = numDirs + 1
            fileSizeDist.call(path2)
        }
    }
}
 
// Root directory: first command-line argument, or the current directory.
var args = Process.arguments
var path = "./"
if (args.count != 0) path = args[0]
if (!Directory.exists(path)) Fiber.abort("Path does not exist or is not a directory.")
fileSizeDist.call(path)

// Report: one line per slot, then the overall figures.
System.print("File size distribution for '%(path)' :-\n")
var power = 0
for (count in sizes) {
    // Every line but the first is prefixed like a term of a sum.
    System.write((power == 0) ? " " : "+ ")
    Fmt.print("Files less than 10 ^ $-2d bytes : $,5d", power, count)
    power = power + 1
}
System.print(" -----")
Fmt.print("= Number of files : $,5d", numFiles)
Fmt.print(" Total size in bytes : $,d", totalSize)
Fmt.print(" Number of sub-directories : $,5d", numDirs)</syntaxhighlight>
 
{{out}}
<pre>
File size distribution for './' :-
 
Files less than 10 ^ 0 bytes : 4
+ Files less than 10 ^ 1 bytes : 2
+ Files less than 10 ^ 2 bytes : 135
+ Files less than 10 ^ 3 bytes : 946
+ Files less than 10 ^ 4 bytes : 746
+ Files less than 10 ^ 5 bytes : 79
+ Files less than 10 ^ 6 bytes : 11
+ Files less than 10 ^ 7 bytes : 3
+ Files less than 10 ^ 8 bytes : 0
+ Files less than 10 ^ 9 bytes : 0
+ Files less than 10 ^ 10 bytes : 0
+ Files less than 10 ^ 11 bytes : 0
-----
= Number of files : 1,926
Total size in bytes : 12,683,455
Number of sub-directories : 3
</pre>
 
=={{header|zkl}}==
<syntaxhighlight lang="zkl">pipe:=Thread.Pipe();
// hoover all files in tree, don't return directories
fcn(pipe,dir){ File.globular(dir,"*",True,8,pipe); }
Line 1,359 ⟶ 2,169:
println("%15s : %s".fmt(szchrs[idx,*], "*"*(scale*cnt).round().toInt()));
idx-=1 + comma();
}</syntaxhighlight>
{{out}}
<pre>
Line 1,376 ⟶ 2,186:
Found 4320 files, 67,627,849,052 bytes, 15,654,594 mean.
File size Number of files (* = 69.84)
n :
nn :
nnn :
n,nnn : *
nn,nnn :
nnn,nnn :
n,nnn,nnn : *
nn,nnn,nnn : **************************************************
2,442

edits