% tbray5 -- another Erlang solution to Tim Bray's Wide Finder project
% Author: Steve Vinoski (http://steve.vinoski.net/), September 2007.
% See http://steve.vinoski.net/blog/2007/09/29/more-file-processing-with-erlang/

-module(tbray5).
-export([start/2, main/1]).
-compile([native]).

% Score one string: 1 when it starts with "/ongoing/When/" and the rest
% contains no '.' character, 0 in every other case.
find_match("/ongoing/When/" ++ Last) ->
    case lists:member($., Last) of
        false -> 1;
        true -> 0
    end;
find_match(_) -> 0.

% Score one line: the seventh space-separated token is handed to
% find_match/1. A line with fewer than seven tokens scores 0; crashing here
% would kill the unlinked worker and leave its collector waiting forever
% for a result that never arrives.
count_line(Line) ->
    case string:tokens(Line, " ") of
        [_, _, _, _, _, _, Field | _] -> find_match(Field);
        _ -> 0
    end.

% Spawn a worker that splits Bin into lines on "\n", sums the score of
% every line and sends that sum (a bare integer) to Pid.
% Returns the pid of the spawned worker.
process_binary(Pid, Bin) ->
    spawn(
      fun() ->
              Lines = string:tokens(binary_to_list(Bin), "\n"),
              Count = lists:foldr(
                        fun(Line, Total) -> Total + count_line(Line) end,
                        0, Lines),
              Pid ! Count
      end).

% Walk Bin starting at byte offset Pos looking for a newline. Each time one
% is found, everything up to and including it is cut off as a chunk and the
% search restarts at offset Pos in the remainder. Stops when Pos is at or
% past the end of what is left.
% All accumulates the chunks in reverse order (prepending is O(1)); it is
% reversed once on exit so the chunks come back in file order.
% Returns {Chunks, Remainder}.
break_chunk_on_newline(Bin, Pos, All) when Pos >= byte_size(Bin) ->
    {lists:reverse(All), Bin};
break_chunk_on_newline(Bin, Pos, All) ->
    % Bind C to the byte at offset Pos.
    {_, <<C:8, _/binary>>} = split_binary(Bin, Pos),
    case C of
        $\n ->
            {Ba, Bb} = split_binary(Bin, Pos + 1),
            break_chunk_on_newline(Bb, Pos, [Ba | All]);
        _ ->
            break_chunk_on_newline(Bin, Pos + 1, All)
    end.

% Split Bin into newline-terminated chunks, starting the newline search
% byte_size(Bin) div N bytes into the data.
% Returns {Chunks, Remainder}; Remainder is whatever is left after the last
% chunk was cut off (possibly <<>>).
break_chunk_on_newline(Bin, N) ->
    break_chunk_on_newline(Bin, byte_size(Bin) div N, []).

% Spawn a collector process, then one worker per binary in Bins reporting to
% it. The collector receives one message per element of Bins, adds them up
% and sends the total to Me. Returns the collector's pid.
spawn_collector(Bins, Me) ->
    Collector =
        spawn(
          fun() ->
                  Sum = lists:foldr(
                          fun(_, T) -> receive Count -> T + Count end end,
                          0, Bins),
                  Me ! Sum
          end),
    lists:foreach(fun(B) -> process_binary(Collector, B) end, Bins),
    Collector.

% End-of-file handling: if unprocessed data is left over, start one more
% collector for it; otherwise return the collector list unchanged.
scan_finish(<<>>, _, Pids) -> Pids;
scan_finish(More, Me, Pids) -> [spawn_collector([More], Me) | Pids].

% Read F in blocks of Readsize via bfile:fread/2 (bfile is not part of this
% file). Each block is prefixed with the leftover from the previous block,
% split into newline-terminated chunks, and handed to a new collector; the
% new leftover is carried into the next read. Returns the list of collector
% pids once the read returns eof.
scan_file(F, N, Readsize, Me, Leftover, Pids) ->
    Rd = bfile:fread(F, Readsize),
    case Rd of
        {ok, Bin} ->
            {Bins, More} = break_chunk_on_newline(list_to_binary([Leftover, Bin]), N),
            scan_file(F, N, Readsize, Me, More, [spawn_collector(Bins, Me) | Pids]);
        eof ->
            scan_finish(Leftover, Me, Pids)
    end.

% Scan the whole file with the calling process as the result receiver.
scan_file(F, N, Readsize) ->
    scan_file(F, N, Readsize, self(), <<>>, []).

% Open File through bfile, scan it with read size Readsize (Num is passed
% to break_chunk_on_newline/2 as N), close it, then receive one message per
% collector and return the sum of those messages.
start(Num, File, Readsize) ->
    bfile:load_driver(),
    {ok, F} = bfile:fopen(File, "r"),
    Pids = scan_file(F, Num, Readsize),
    bfile:fclose(F),
    lists:foldr(fun(_, T) -> receive V -> T + V end end, 0, Pids).

% Same as start/3 with a read size of 512*1024.
start(Num, File) ->
    start(Num, File, 512*1024).
% Difference between two {A1, A2, A3} / {B1, B2, B3} triples of the kind
% returned by now(): the first two fields are combined into a whole number
% and the third field's difference is added after division by 1000000.0,
% so the result is always a float.
time_diff({A1, A2, A3}, {B1, B2, B3}) ->
    Whole = (B1 - A1) * 1000000 + (B2 - A2),
    Fraction = (B3 - A3) / 1000000.0,
    Whole + Fraction.

% Run start/3 once with S as the first argument and S*1024 as the read
% size, and return {S, Elapsed}, where Elapsed is the time_diff/2 of the
% now() values taken before and after the run.
timed_run(S, F) ->
    Start = now(),
    start(S, F, S * 1024),
    {S, time_diff(Start, now())}.

% Entry point taking a list of strings.
%   [N, F] -- run start/2 with integer N on file F and print the match count.
%   [F]    -- run F once for each size in a fixed list and print the list of
%             {Size, Elapsed} results.
% Both forms call halt() when finished.
main([N, F]) ->
    Matches = start(list_to_integer(N), F),
    io:format("~p matches found~n", [Matches]),
    halt();
main([F]) ->
    Sz = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
    Results = [timed_run(S, F) || S <- Sz],
    io:format("~p~n", [Results]),
    halt().