% tbray14 -- another Erlang solution to Tim Bray's Wide Finder project
% Author: Steve Vinoski (http://steve.vinoski.net/), 14 October 2007.
% See the author's site (URL above); the original reference link is missing here.
-module(tbray14).
-export([start/2, start/3, main/1]).
-import(wfbm).
-compile([native]).
-define(READSIZE, 8192*1024).
%% Build a frequency table from a list of keys.
%% Returns a dict mapping each distinct element of L to the number of
%% times it occurs in L.
make_tbl(L) ->
    make_tbl(L, dict:new()).

%% Same as make_tbl/1, but the counts are added on top of an existing
%% dict (Tbl) instead of starting from an empty one.
make_tbl(Keys, Tbl) ->
    Bump = fun(Key, Acc) -> dict:update_counter(Key, 1, Acc) end,
    lists:foldl(Bump, Tbl, Keys).
%% Return up to ten {Key, Count} pairs from dict D, ordered by Count,
%% highest first. When D holds ten entries or fewer, all of them are
%% returned.
top_ten(D) ->
    ByCountDesc = lists:sort(fun({_, V1}, {_, V2}) -> V1 > V2 end,
                             dict:to_list(D)),
    %% lists:sublist/2 accepts lists shorter than the requested length,
    %% so no separate length check is needed.
    lists:sublist(ByCountDesc, 10).
%% Spawn a worker process that runs wfbm:find/2 over Bin (with the
%% table Tbl), counts the returned items with make_tbl/1 and sends the
%% resulting dict to Pid. Returns the pid of the spawned worker.
process_binary(Pid, Bin, Tbl) ->
    Worker = fun() ->
                     Matches = wfbm:find(Bin, Tbl),
                     Pid ! make_tbl(Matches)
             end,
    spawn(Worker).
%% Split Bin into pieces that each end on a newline, starting the search
%% for the first newline at byte offset byte_size(Bin) div N.
%% Returns {Chunks, Leftover}. Chunks holds the newline-terminated pieces
%% in reverse order of appearance (they are prepended as they are found);
%% Leftover is the part of Bin that remained once the scan offset reached
%% the end of the data, and may be empty.
break_chunk_on_newline(Bin, N) ->
    break_chunk_on_newline(Bin, byte_size(Bin) div N, []).

%% Worker for break_chunk_on_newline/2.
%% Pos is the byte offset currently being inspected; All accumulates the
%% chunks found so far.
break_chunk_on_newline(Bin, Pos, All) when Pos >= byte_size(Bin) ->
    {All, Bin};
break_chunk_on_newline(Bin, Pos, All) ->
    %% Look at the single byte located at offset Pos.
    {_, <<C:8, _/binary>>} = split_binary(Bin, Pos),
    case C of
        $\n ->
            %% Cut just after the newline so the chunk keeps it, then go on
            %% with the remainder, starting again at the same offset.
            {Ba, Bb} = split_binary(Bin, Pos + 1),
            break_chunk_on_newline(Bb, Pos, [Ba | All]);
        _ ->
            break_chunk_on_newline(Bin, Pos + 1, All)
    end.
%% Receive one message for every element of L and merge them.
%% Each message is taken to be a dict; values stored under the same key
%% are added together. The elements of L themselves are ignored, only
%% their number matters. Blocks until that many messages have arrived.
receive_tbls(L) ->
    receive_tbls(L, dict:new()).

%% Accumulator loop for receive_tbls/1.
receive_tbls([], Merged) ->
    Merged;
receive_tbls([_ | Rest], Merged) ->
    receive
        Incoming ->
            Sum = fun(_, V1, V2) -> V1 + V2 end,
            receive_tbls(Rest, dict:merge(Sum, Merged, Incoming))
    end.
%% Start a collector process plus one worker per binary in Bins.
%% The collector waits (via receive_tbls/1) for one result per element of
%% Bins, merges them and sends the merged dict to Me. Every worker is
%% started with process_binary/3 and reports to the collector.
%% Returns the collector's pid.
spawn_collector(Bins, Me, Tbl) ->
    Collect = fun() -> Me ! receive_tbls(Bins) end,
    Collector = spawn(Collect),
    lists:foreach(fun(B) -> process_binary(Collector, B, Tbl) end, Bins),
    Collector.
%% Handle the data still pending when the input is exhausted.
%% An empty leftover needs no work, so the pid list is returned as is;
%% otherwise a collector is spawned for the leftover binary and its pid
%% is put in front of Pids.
scan_finish(<<>>, _Tbl, _Me, Pids) ->
    Pids;
scan_finish(More, Tbl, Me, Pids) ->
    LastCollector = spawn_collector([More], Me, Tbl),
    [LastCollector | Pids].
%% Read the file handle F in blocks of Readsize bytes and hand each block
%% to a collector. Results are addressed to the calling process.
%% Returns the list of collector pids that were spawned.
scan_file(F, N, Readsize, Tbl) ->
    Me = self(),
    scan_file(F, N, Readsize, Tbl, Me, <<>>, []).

%% Loop for scan_file/4.
%% Leftover is the unprocessed tail of the previous block and is put in
%% front of the next block before splitting; Pids accumulates the
%% collector pids. On eof the remaining Leftover goes to scan_finish/4.
scan_file(F, N, Readsize, Tbl, Me, Leftover, Pids) ->
    case bfile:fread(F, Readsize) of
        eof ->
            scan_finish(Leftover, Tbl, Me, Pids);
        {ok, Bin} ->
            Data = list_to_binary([Leftover, Bin]),
            {Bins, More} = break_chunk_on_newline(Data, N),
            Collector = spawn_collector(Bins, Me, Tbl),
            scan_file(F, N, Readsize, Tbl, Me, More, [Collector | Pids])
    end.
%% Run the whole analysis on File and print the ten highest counts.
%%   Num      - divisor passed to the chunk splitter for each block read
%%   File     - name of the file to open through bfile
%%   Readsize - number of bytes requested per bfile:fread/2 call
%% Prints one "Count: Key" line per entry and returns the list of
%% io:format/2 results.
start(Num, File, Readsize) ->
    Tbl = wfbm:init(),
    bfile:load_driver(),
    {ok, Handle} = bfile:fopen(File, "r"),
    Collectors = scan_file(Handle, Num, Readsize, Tbl),
    bfile:fclose(Handle),
    %% One merged dict is expected from every collector.
    Totals = receive_tbls(Collectors),
    PrintEntry = fun({K, V}) -> io:format("~p: ~s~n", [V, K]) end,
    lists:map(PrintEntry, top_ten(Totals)).
%% Same as start/3, using the READSIZE macro as the read size.
start(Num, File) ->
    DefaultReadsize = ?READSIZE,
    start(Num, File, DefaultReadsize).
%% Same as start/3 with Num fixed at 2 and the READSIZE macro as the
%% read size.
start(File) ->
    start(2, File).
%% Entry point taking a list of string arguments, in one of these forms:
%%   [Num, File, Readsize], [Num, File] or [File].
%% Num and Readsize are converted with list_to_integer/1. The matching
%% start function is run and the emulator is then stopped with halt/0.
main([NumArg, File, ReadsizeArg]) ->
    Num = list_to_integer(NumArg),
    Readsize = list_to_integer(ReadsizeArg),
    start(Num, File, Readsize),
    halt();
main([NumArg, File]) ->
    Num = list_to_integer(NumArg),
    start(Num, File),
    halt();
main([File]) ->
    start(File),
    halt().