#!/usr/local/bin/ocamlscript unix.cmxa
(* filesystem-analysis.ml: some code to compute statistics on a directory.

   version 7 - 2006-03-25
   copyright 2006 David MENTRE
   This file is under GNU GPL (http://www.gnu.org/licenses/gpl.html).

   This program is using ocamlscript:
     http://www.linux-france.org/~dmentre/code/ocamlscript-1.1.tar.gz

   To use it, just do:
     ./filesystem-analysis.ml /path/to/top/directory

   To use it without ocamlscript, use the following command line:
     ocaml unix.cma filesystem-analysis.ml /path/to/top/directory

   To compile it as a standalone executable, use the following command:
     ocamlopt -pp "sed 1d" -o filesystem-analysis \
       unix.cmxa filesystem-analysis.ml

   Changes:
   - integration of Thomas de Grenier de Latour's median_file_size.patch
     (with slight modifications):
      - provides the range where half of files are in;
      - fix two small bugs (on empty initial directory and on printing
        glitch).
   - work on files bigger than 2 GBytes (bug found by Gilles Lamiral).
   - print real total size for each bin (Thomas's patch)
   - check command line argument and give usage message in case of issue
*)

open Unix
open Format
open LargeFile

(* Counters accumulated while walking the directory tree. *)
type stats = {
  mutable min_file_size : int64;
  mutable max_file_size : int64;
  mutable min_link_size : int64;
  mutable max_link_size : int64;
  (* bin -> number_of_files * total_size *)
  size_distribution : (int, int * int64) Hashtbl.t;
  mutable total_file_size : int64;
  mutable total_link_size : int64;
  mutable num_files : int;
  mutable num_links : int;
  mutable num_zero_files : int;
  mutable num_zero_links : int;
  mutable num_directories : int;
}

(* 64 bits operations *)
let ( +% ) = Int64.add
let ( -% ) = Int64.sub
let ( *% ) = Int64.mul
let ( /% ) = Int64.div

(* from http://www.linux-nantes.fr.eu.org/~fmonnier/OCaml/tree.ml.html *)

(* Unit suffixes, indexed by the number of divisions by 1024 applied. *)
let units = [| " B "; " KB"; " MB"; " GB" |]

(* [get_unit size level] formats [size] with two decimals and the suffix of
   [level], dividing by 1024 until the value is below 1024 or the largest
   known unit (GB) is reached. *)
let rec get_unit size level =
  if size < 1024.0 || level = 3 then
    sprintf "%.2f%s" size units.(level)
  else
    get_unit (size /. 1024.0) (level + 1)

(* [human_size ~size] renders a byte count as a human readable string. *)
let human_size ~size = get_unit (Int64.to_float size) 0
(* end of copy/paste *)

(* [get_median_size_range total count last_bin sorted_distrib] walks the
   distribution, sorted by increasing bin, and returns the first bin at which
   the cumulated entry count reaches half of [total].  [count] is the number
   of entries already consumed; [last_bin] is returned when the list is
   exhausted first. *)
let rec get_median_size_range total count last_bin sorted_distrib =
  match sorted_distrib with
  | [] -> last_bin
  | (b, (c, _)) :: l ->
      if 2 * (count + c) >= total then b
      else get_median_size_range total (count + c) b l

(* [size_range bin] returns the inclusive byte range covered by [bin]:
   (0, 1) for bin 0 and below, (2^bin, 2^(bin+1) - 1) otherwise.
   The lower bound is computed with an integer shift, so no float conversion
   is involved. *)
let size_range bin =
  if bin <= 0 then (Int64.zero, Int64.one)
  else begin
    let v = Int64.shift_left Int64.one bin in
    (v, max Int64.zero ((Int64.of_int 2) *% v -% Int64.one))
  end

(* [print_analysis stats] prints the whole report on standard output:
   file totals, link totals, directory count, median size range and the
   per-bin size distribution. *)
let print_analysis stats =
  (* Average that does not divide by zero when nothing was counted. *)
  let average total n = Int64.div total (Int64.of_int (max n 1)) in
  printf "Total file size: % 10s@\n" (human_size stats.total_file_size);
  printf "Minimum non empty file size: % 10s@\n"
    (human_size stats.min_file_size);
  printf "Maximum file size: % 10s@\n" (human_size stats.max_file_size);
  printf "Average file size: % 10s@\n"
    (human_size (average stats.total_file_size stats.num_files));
  printf "Total number of files: %d@\n" stats.num_files;
  printf " of which are empty files: %d@\n" stats.num_zero_files;
  printf "@\nTotal link size: % 10s@\n" (human_size stats.total_link_size);
  printf "Minimum non empty link size: % 10s@\n"
    (human_size stats.min_link_size);
  printf "Maximum link size: % 10s@\n" (human_size stats.max_link_size);
  printf "Average link size: % 10s@\n"
    (human_size (average stats.total_link_size stats.num_links));
  printf "Total number of links: %d@\n" stats.num_links;
  printf " of which are empty links: %d@\n" stats.num_zero_links;
  printf "@\nTotal number of directories: %d@\n" stats.num_directories;
  let distrib =
    Hashtbl.fold (fun bin (c, s) l -> (bin, (c, s)) :: l)
      stats.size_distribution []
  in
  let sorted_distrib = List.sort compare distrib in
  (* NOTE(review): the distribution is fed by both regular files and links
     (see analyse_dir_content) while the total used here is num_files only;
     confirm whether links are meant to take part in the median. *)
  let bin = get_median_size_range stats.num_files 0 0 sorted_distrib in
  let v1, v2 = size_range bin in
  printf "Median file size range: [%s - %s]@\n"
    (human_size v1) (human_size v2);
  let print_line (bin, (count, size)) =
    let v1, v2 = size_range bin in
    printf "[% 10s - % 10s] % 7d files - total % 10s@\n"
      (human_size v1) (human_size v2) count (human_size size)
  in
  printf "File size distribution:@\n";
  List.iter print_line sorted_distrib

(* [increase_distribution stats size] records one entry of [size] bytes in
   the histogram: the bin is floor(log2 size), and sizes of 0 (or below) go
   to bin 0, which size_range reports as the (0, 1) range.

   The bin is computed with integer shifts only.  Going through
   int_of_float (log size /. log 2.0) is unspecified for size 0 (the
   logarithm is negative infinity) and can land in the wrong bin when the
   float quotient rounds across a power-of-two boundary. *)
let increase_distribution stats size =
  let rec floor_log2 acc v =
    if Int64.compare v Int64.one <= 0 then acc
    else floor_log2 (acc + 1) (Int64.shift_right_logical v 1)
  in
  let bin =
    if Int64.compare size Int64.zero <= 0 then 0 else floor_log2 0 size
  in
  try
    let (count, total_size) = Hashtbl.find stats.size_distribution bin in
    Hashtbl.replace stats.size_distribution bin
      (count + 1, total_size +% size)
  with Not_found ->
    Hashtbl.add stats.size_distribution bin (1, size)

(* [parse_dir_path stats path] opens the directory [path] and accumulates
   statistics for everything below it.  A permission error is reported as a
   warning on standard error and the directory is skipped; any other
   Unix_error propagates to the caller.  The directory handle is closed on
   every path, including when the traversal raises. *)
let rec parse_dir_path stats path =
  try
    let dh = opendir path in
    (try analyse_dir_content stats path dh
     with e -> closedir dh; raise e);
    closedir dh
  with Unix_error (EACCES, _, _) ->
    eprintf "warning: permission denied on directory \"%s\"@." path

(* [analyse_dir_content stats dir_path dh] reads every entry of the open
   directory handle [dh] (whose path is [dir_path]), updates [stats] and
   recurses into sub-directories.  Entries are examined with lstat, so
   symbolic links are counted as links and never followed.  Devices, FIFOs
   and sockets are ignored. *)
and analyse_dir_content stats dir_path dh =
  let rec analyse_next_entry dh =
    let entry_path = readdir dh in
    if entry_path = "." || entry_path = ".." then
      analyse_next_entry dh
    else begin
      let entry_full_path = dir_path ^ "/" ^ entry_path in
      let s = lstat entry_full_path in
      match s.st_kind with
      | S_DIR ->
          stats.num_directories <- stats.num_directories + 1;
          parse_dir_path stats entry_full_path;
          analyse_next_entry dh
      | S_REG ->
          stats.num_files <- stats.num_files + 1;
          if s.st_size = Int64.zero then
            stats.num_zero_files <- stats.num_zero_files + 1
          else begin
            stats.total_file_size <- stats.total_file_size +% s.st_size;
            if s.st_size > stats.max_file_size then
              stats.max_file_size <- s.st_size;
            if s.st_size < stats.min_file_size then
              stats.min_file_size <- s.st_size
          end;
          increase_distribution stats s.st_size;
          analyse_next_entry dh
      | S_LNK ->
          stats.num_links <- stats.num_links + 1;
          if s.st_size = Int64.zero then
            stats.num_zero_links <- stats.num_zero_links + 1
          else begin
            stats.total_link_size <- stats.total_link_size +% s.st_size;
            if s.st_size > stats.max_link_size then
              stats.max_link_size <- s.st_size;
            if s.st_size < stats.min_link_size then
              stats.min_link_size <- s.st_size
          end;
          increase_distribution stats s.st_size;
          analyse_next_entry dh
      | S_CHR | S_BLK | S_FIFO | S_SOCK ->
          analyse_next_entry dh
    end
  in
  (* readdir signals the end of the directory by raising End_of_file. *)
  try analyse_next_entry dh with End_of_file -> ()
(* Program entry point: expects exactly one argument, the directory to
   analyse.  Prints a usage message and exits with status 1 otherwise. *)
let _ =
  if Array.length Sys.argv <> 2 then (
    printf "usage: %s /path/to/start/directory@." Sys.argv.(0);
    exit 1
  ) else (
    (* Minimum sizes start at Int64.max_int so that the first non-empty
       entry seen always becomes the minimum. *)
    let stats = {
      min_file_size = Int64.max_int;
      max_file_size = Int64.zero;
      min_link_size = Int64.max_int;
      max_link_size = Int64.zero;
      size_distribution = Hashtbl.create 3;
      total_file_size = Int64.zero;
      total_link_size = Int64.zero;
      num_files = 0;
      num_links = 0;
      num_zero_files = 0;
      num_zero_links = 0;
      num_directories = 0;
    } in
    handle_unix_error (parse_dir_path stats) (Sys.argv.(1));
    (* When no non-empty file (resp. link) was seen, the minimum still holds
       the Int64.max_int sentinel; report 0 rather than that value.  This
       covers both an empty tree and a tree that only holds empty entries. *)
    if stats.num_files = stats.num_zero_files then
      stats.min_file_size <- Int64.zero;
    if stats.num_links = stats.num_zero_links then
      stats.min_link_size <- Int64.zero;
    print_analysis stats
  )