%% tbray14 -- another Erlang solution to Tim Bray's Wide Finder project
%% Author: Steve Vinoski (http://steve.vinoski.net/), 14 October 2007.
%% See (the reference link is missing from this copy of the file).

-module(tbray14).
-export([start/2, start/3, main/1]).
%% No -import attribute: every wfbm call in this module is fully qualified,
%% and the single-argument -import(Module) form is rejected by current
%% compilers as a bad import declaration.
-compile([native]).

%% Default number of bytes requested from the input file per read.
-define(READSIZE, (8192*1024)).

%% make_tbl(L) -> Dict
%% Builds a dict mapping each distinct element of the list L to the number
%% of times it occurs in L.
make_tbl(L) ->
    make_tbl(L, dict:new()).

%% make_tbl(L, Tbl) -> Dict
%% Same as make_tbl/1, but adds the counts to the existing dict Tbl.
make_tbl(L, Tbl) ->
    lists:foldl(fun(Key, Acc) -> dict:update_counter(Key, 1, Acc) end, Tbl, L).

%% top_ten(D) -> [{Key, Count}]
%% Returns at most ten {Key, Count} pairs of the dict D, ordered by
%% descending Count.
top_ten(D) ->
    Sorted = lists:sort(fun({_, V1}, {_, V2}) -> V1 > V2 end, dict:to_list(D)),
    %% lists:sublist/2 returns the whole list when it has fewer than ten
    %% elements, so no explicit length check is needed.
    lists:sublist(Sorted, 10).

%% process_binary(Pid, Bin, Tbl) -> WorkerPid
%% Starts a worker process that runs wfbm:find(Bin, Tbl), counts the
%% results with make_tbl/1 and sends the resulting dict to Pid.
%% The worker is linked to the calling process so that a crash inside it
%% propagates instead of leaving the collector waiting forever for a
%% message that will never arrive.
process_binary(Pid, Bin, Tbl) ->
    spawn_link(fun() -> Pid ! make_tbl(wfbm:find(Bin, Tbl)) end).

%% break_chunk_on_newline(Bin, Pos, All) -> {Chunks, Leftover}
%% Scans Bin forward from byte offset Pos. Each time a newline is found,
%% everything up to and including it is split off and pushed onto All, and
%% the scan restarts at offset Pos of the remainder. Scanning stops once
%% Pos is at or past the end of the remaining binary; that remainder is
%% returned as Leftover. Chunks are in reverse order of appearance.
break_chunk_on_newline(Bin, Pos, All) when Pos >= byte_size(Bin) ->
    {All, Bin};
break_chunk_on_newline(Bin, Pos, All) ->
    %% Look at the single byte located at offset Pos.
    {_, <<C:8, _/binary>>} = split_binary(Bin, Pos),
    case C of
        $\n ->
            {Chunk, Rest} = split_binary(Bin, Pos + 1),
            break_chunk_on_newline(Rest, Pos, [Chunk | All]);
        _ ->
            break_chunk_on_newline(Bin, Pos + 1, All)
    end.

%% break_chunk_on_newline(Bin, N) -> {Chunks, Leftover}
%% Splits Bin into newline-terminated chunks, looking for the first split
%% point at offset byte_size(Bin) div N. N must be a non-zero integer.
break_chunk_on_newline(Bin, N) ->
    break_chunk_on_newline(Bin, byte_size(Bin) div N, []).

%% receive_tbls(L) -> Dict
%% Receives one message for every element of L (the elements themselves
%% are ignored; only the length of L matters) and merges the received
%% dicts, summing the values of keys that occur in more than one of them.
%% Blocks until all messages have arrived.
receive_tbls(L) ->
    Merge = fun(_, Acc) ->
                    receive
                        Tbl ->
                            dict:merge(fun(_, V1, V2) -> V1 + V2 end, Acc, Tbl)
                    end
            end,
    lists:foldl(Merge, dict:new(), L).

%% spawn_collector(Bins, Me, Tbl) -> CollectorPid
%% Starts a collector process that waits for one result per binary in Bins,
%% merges them and sends the merged dict to Me. Then starts one worker per
%% binary, each reporting to that collector. Linked for the same reason as
%% the workers: a crash must not leave the caller blocked in receive.
spawn_collector(Bins, Me, Tbl) ->
    Collector = spawn_link(fun() -> Me ! receive_tbls(Bins) end),
    [process_binary(Collector, B, Tbl) || B <- Bins],
    Collector.

%% scan_finish(Leftover, Tbl, Me, Pids) -> Pids1
%% Called at end of file. When unprocessed data remains, starts one more
%% collector for it and prepends its pid to Pids; otherwise returns Pids.
scan_finish(<<>>, _, _, Pids) ->
    Pids;
scan_finish(More, Tbl, Me, Pids) ->
    [spawn_collector([More], Me, Tbl) | Pids].

%% scan_file(F, N, Readsize, Tbl, Me, Leftover, Pids) -> Pids1
%% Reads the file F through bfile:fread/2 in pieces of Readsize bytes.
%% Each piece is prefixed with the leftover of the previous one and split
%% with break_chunk_on_newline/2; a collector is started for the resulting
%% chunks. Returns the pids of all collectors started.
scan_file(F, N, Readsize, Tbl, Me, Leftover, Pids) ->
    case bfile:fread(F, Readsize) of
        {ok, Bin} ->
            {Bins, More} =
                break_chunk_on_newline(list_to_binary([Leftover, Bin]), N),
            Collector = spawn_collector(Bins, Me, Tbl),
            scan_file(F, N, Readsize, Tbl, Me, More, [Collector | Pids]);
        eof ->
            scan_finish(Leftover, Tbl, Me, Pids)
    end.

%% scan_file(F, N, Readsize, Tbl) -> Pids
%% Scans the whole file, with the calling process as the receiver of the
%% collectors' results.
scan_file(F, N, Readsize, Tbl) ->
    scan_file(F, N, Readsize, Tbl, self(), <<>>, []).
%% start(Num, File, Readsize) -> [ok]
%% Scans File and prints up to ten "Count: Key" lines, highest count first.
%%   Num      - divisor passed to the chunk splitter; each block read is
%%              first split near offset (block size div Num)
%%   File     - name of the file to open through bfile
%%   Readsize - number of bytes requested per read
%% Crashes with badmatch when the file cannot be opened.
start(Num, File, Readsize) ->
    Tbl = wfbm:init(),
    bfile:load_driver(),
    {ok, F} = bfile:fopen(File, "r"),
    %% Close the file even when scanning raises, so the handle is not leaked.
    Pids = try
               scan_file(F, Num, Readsize, Tbl)
           after
               bfile:fclose(F)
           end,
    L = top_ten(receive_tbls(Pids)),
    lists:map(fun({K, V}) -> io:format("~p: ~s~n", [V, K]) end, L).

%% start(Num, File) -> [ok]
%% Same as start/3 with the default read size.
start(Num, File) ->
    start(Num, File, ?READSIZE).

%% start(File) -> [ok]
%% Same as start/3 with Num = 2 and the default read size.
start(File) ->
    start(2, File, ?READSIZE).

%% main(Args) -> no return
%% Command-line entry point. Args is a list of strings: [Num, File, Readsize],
%% [Num, File] or [File]. Numeric arguments are converted with
%% list_to_integer/1. Halts the Erlang runtime when the run is complete.
main([N, F, Rd]) ->
    start(list_to_integer(N), F, list_to_integer(Rd)),
    halt();
main([N, F]) ->
    start(list_to_integer(N), F),
    halt();
main([F]) ->
    start(F),
    halt().