% wfbm -- search functions for Tim Bray's Wide Finder project % Author: Steve Vinoski (http://steve.vinoski.net/), 14 October 2007. % See % % UPDATED: 20 October, due to bug in REVSTR which when fixed uncovered % a bug in get_shift, which has made this solution much slower. % See tbray15.erl and wfbm3.erl from http://steve.vinoski.net/code instead. -module(wfbm). -export([find/2, init/0]). -compile([native]). -define(STR, "GET /ongoing/When"). -define(REVSTR, "nehW/gniogno/ TEG"). -define(STRLEN, length(?STR)). -define(MATCHHEADLEN, length("/200x/")). -define(SKIP, length("/200x/2007/10/15/")). set_shifts(_, Count, Tbl) when Count =:= ?STRLEN - 1 -> Tbl; set_shifts([H|T], Count, Tbl) -> New = ?STRLEN - Count - 1, NTbl = dict:store(H, New, Tbl), set_shifts(T, Count+1, NTbl). set_defaults([], Tbl) -> Tbl; set_defaults([V|T], Tbl) -> set_defaults(T, dict:store(V, ?STRLEN, Tbl)). init() -> set_shifts(?STR, 0, set_defaults(lists:seq(1, 255), dict:new())). find(Bin, Tbl) -> find_matches(Bin, Tbl, []). get_tail(<<>>) -> no_match; get_tail(Bin) -> get_tail(Bin, none, 0). get_tail(<<"/20",_:8,"x/",_:32,$/,_:16,$/,_:16,$/, Rest/binary>>, _, _) when size(Rest) =:= 0 -> no_match; get_tail(<<"/20",_:8,"x/",_:32,$/,_:16,$/,_:16,$/,32:8, _/binary>>, _, _) -> no_match; get_tail(<<"/19",_:8,"x/",_:32,$/,_:16,$/,_:16,$/, Rest/binary>>, _, _) when size(Rest) =:= 0 -> no_match; get_tail(<<"/19",_:8,"x/",_:32,$/,_:16,$/,_:16,$/,32:8, _/binary>>, _, _) -> no_match; get_tail(<<"/20",_:8,"x/",_:32,$/,M1:8,M0:8,$/,D1:8,D0:8,$/, Rest/binary>>, none, Len) when ((M1-$0)*10 + (M0-$0)) =< 12, ((D1-$0)*10 + (D0-$0)) =< 31 -> get_tail(Rest, almost, Len+?SKIP); get_tail(<<"/19",_:8,"x/",_:32,$/,M1:8,M0:8,$/,D1:8,D0:8,$/, Rest/binary>>, none, Len) when ((M1-$0)*10 + (M0-$0)) =< 12, ((D1-$0)*10 + (D0-$0)) =< 31 -> get_tail(Rest, almost, Len+?SKIP); get_tail(<<32:8, _/binary>>, found, Len) -> {ok, Len}; get_tail(<<32:8, _/binary>>, _, _) -> no_match; get_tail(<<$., _/binary>>, _, _) -> no_match; get_tail(<<_:8, Rest/binary>>, almost, Len) -> get_tail(Rest, found, Len+1); get_tail(<<_:8, Rest/binary>>, State, Len) -> get_tail(Rest, State, Len+1). get_shift([C1|T1], [C2|T2], Tbl) when C1 =:= C2 -> get_shift(T1, T2, Tbl); get_shift([C1|_], [C2|_], Tbl) -> Shift = dict:fetch(C1, Tbl), if Shift =:= ?STRLEN -> Shift; true -> lists:min([Shift, dict:fetch(C2, Tbl)]) end. find_matches(<>, Tbl, Acc) -> case get_tail(Tail) of {ok, More} -> {H, Rest} = split_binary(Tail, More), {_, Match} = split_binary(H, ?MATCHHEADLEN), Result = binary_to_list(Match), find_matches(Rest, Tbl, [Result | Acc]); no_match -> find_matches(Tail, Tbl, Acc) end; find_matches(Bin, _, Acc) when size(Bin) < ?STRLEN -> Acc; find_matches(Bin, Tbl, Acc) -> {Front, _} = split_binary(Bin, ?STRLEN), Shift = get_shift(lists:reverse(binary_to_list(Front)), ?REVSTR, Tbl), {_, Next} = split_binary(Bin, Shift), find_matches(Next, Tbl, Acc).