Untitled

%% character(+C)
%% Test whether or not character is a Char.
character(Char) :- integer(Char); atom(Char), atom_length(Char, 1).


%% A list
isa_list([]).
isa_list([_|_]).


%% string_to_chars(+Str, -Chars)
string_to_chars(Str, Chars) :- atom(Str), atom_codes(Str, Chars).
string_to_chars(Str, Chars) :-
    "" = [] -> fail; string(Str), string_chars(Str, Chars).
string_to_chars(Str, Chars) :- isa_list(Str), Chars = Str.

%% chars_to_string(-Chars, +Str)
chars_to_string(Chars, Str) :-
    "" = [] -> atom_codes(Str, Chars); string_chars(Str, Chars).

%% GNU Prolog défini `new_atom`, mais SWI Prolog l'appelle `gensym`.
genatom(X, A) :-
    (predicate_property(new_atom/2, built_in);    % Older GNU Prolog
     predicate_property(new_atom(_,_), built_in)) % Newer GNU Prolog
    -> new_atom(X, A);
    gensym(X, A).


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%         REGEX (RE)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% re_wf(+RE)
%% Make sure that RE is a valid Regex (Well-Formed).
re_wf(any).
re_wf(Char) :- character(Char).
re_wf(in([])).
re_wf(in([Char|Chars])) :- character(Char), re_wf(in(Chars)).
re_wf(in(String)) :-
    "" = [] -> fail;
    string(String), string_chars(String, Chars), re_wf(in(Chars)).
re_wf(notin(Chars)) :- re_wf(in(Chars)).
re_wf([]).
re_wf([RE|REs]) :- re_wf(RE), re_wf(REs).
re_wf(RE1 \/ RE2) :- re_wf(RE1), re_wf(RE2).
re_wf(?(RE)) :- re_wf(RE).
re_wf(*(RE)) :- re_wf(RE).
re_wf(+(RE)) :- re_wf(RE).
re_wf(name(Name, RE)) :- (atom(Name); integer(Name)), re_wf(RE).
re_wf(String) :-
    "" = [] -> fail;
    string(String), string_chars(String, Chars), re_wf(Chars).


%% match(+RE, +Str, -Groups, -Tail)
%% Pattern-match with a Regex (RE) on a String (STR)
%% Tail is what remains of Str
%% Groups is a list of `group` that have been validated with their chars.
match(any, [_|Tail], [], Tail).
match(Char, [Char|Tail], [], Tail) :- character(Char).
match(in(Chars), [Char|Tail], [], Tail) :- member(Char, Chars).
match(notin(Chars), [Char|Tail], [], Tail) :-
    isa_list(Chars), \+ member(Char, Chars).
match([], Tail, [], Tail).
match([RE|REs], Str, Gs, Tail) :-
    match(RE, Str, Gs1, T), match(REs, T, Gs2, Tail),
    append(Gs1, Gs2, Gs).
match(RE1 \/ RE2, Str, Gs, Tail) :-
    match(RE1, Str, Gs, Tail); match(RE2, Str, Gs, Tail).
match(?(RE), Str, [], Tail) :- match(RE, Str, [], Tail).
match(?(_), Str, [], Str).
match(*(RE), Str, Gs, Tail) :-
    match(RE, Str, Gs1, T),
    Str \= T,
    match(*(RE), T, Gs2, Tail),
    append(Gs1, Gs2, Gs).
match(*(_), Str, [], Str).
match(+(RE), Str, Gs, Tail) :-
    match(RE, Str, Gs1, S), match(*(RE), S, Gs2, Tail),
    append(Gs1, Gs2, Gs).
match(name(Name, RE), Str, [(Name = GStr)|Gs], Tail) :-
    match(RE, Str, Gs, Tail),
    append(GChars, Tail, Str), chars_to_string(GChars, GStr).
match(String, Str, Gs, Tail) :-
    "" = [] -> fail;
    string(String), string_chars(String, Chars), match(Chars, Str, Gs, Tail).
match(in(String), Str, Gs, Tail) :-
    "" = [] -> fail;
    string(String), string_chars(String, Chars),
    match(in(Chars), Str, Gs, Tail).
match(notin(String), Str, Gs, Tail) :-
    "" = [] -> fail;
    string(String), string_chars(String, Chars),
    match(notin(Chars), Str, Gs, Tail).

%% match_string(+RE, +Str, -Groups, -Tail)
%% Similar to the `match` function. Takes a String in input :
%% - atome
%% - a list of chars (ISO Prolog).
%% - an object "string" (e.g. SWI-Prolog).
%% Return a "string" as result(e.g. en SWI-Prolog)
%% Otherwise, returns an atom
match_string(RE, Str, Gs, Tail) :-
    string_to_chars(Str, Chars),
    match(RE, Chars, Gs, TailChars),
    chars_to_string(TailChars, Tail).

%% search_in_chars(+RE, +Str, -Res)
%% To find the substrings accepted by the Regex (RE)
%% Res is the result of what have been accepted by RE, along the substrings
search_in_chars(RE, Str, Res) :-
    match(RE, Str, Gs, Tail), append(FoundChars, Tail, Str),
    chars_to_string(FoundChars, Found),
    Res = ['' = Found | Gs].
search_in_chars(RE, [_|Str], Res) :- search_in_chars(RE, Str, Res).

%% search(+RE, +Str, -Res)
%% Similar to `search_in_chars` but accept "string" instead.
search(RE, Str, Res) :-
    string_to_chars(Str, Chars),
    search_in_chars(RE, Chars, Res).

%% Exemples:
%% search("hello" \/ "help" \/ [*("a"),"b"], "hello about help", Res)
%% search(["(", name(k,*(any)), "=", name(v,*(any)), ")"],
%%         "(age=23) and (position=straight)", Res)

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%         Non-deterministic Finite State Automaton  (NFA)
%%
%% State, Step
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% state_wf(+State, +NFA)
%% Check whether or not the state is valid inside the NFA.
state_wf(State, NFA) :- atom(State), member((State = _), NFA).

%% step_wf(+Step, +NFA)
%% Check if step is valid in the NFA.
%% A step can take the following forms
%% - "success"
%% - "step(Steps)" where Steps is a list of elements (Char -> State)
%% - "epsilon(Marks,States)" where States is a list of States where we can go without taking in a character
%%   Marks is a list : `beg(Name)` or `end(Name)`, that indiquates if the sub-groups called Name start or end there.

step_wf(success, _).
step_wf(step([]), _).
step_wf(step(S), NFA) :- state_wf(S, NFA).
step_wf(step([(Char -> State) | Steps]), NFA) :-
    character(Char), state_wf(State, NFA), step_wf(step(Steps), NFA).
step_wf(epsilon([], []), _).
step_wf(epsilon([], [State | States]), NFA) :-
    state_wf(State, NFA), step_wf(epsilon([], States), NFA).
step_wf(epsilon([Mark|Marks], Ss), NFA) :-
    step_wf(epsilon(Marks, Ss), NFA),
    (Mark = beg(Name); Mark = end(Name)),
    (atom(Name); integer(Name)).

%% nfa_wf(+NFA)
%% Check whether or not the NFA is valid.
nfa_wf(NFA) :- nfa_wf(NFA, NFA).

%% nfa_wf(+Ss, +NFA)
nfa_wf([], _).
nfa_wf([State = Step | Ss], NFA) :-
    member(State = _, Ss) -> fail;
    step_wf(Step, NFA), nfa_wf(Ss, NFA).

%% nfa_match(+NFA, +Step, +Marks, +Str, -Groups, -Tail)
%% This relation is verified if the NFA, beginning at Step, and with a Str,
%% is capable of reaching a final state. If it does, Tail contains what remains of the Str (the "accepted part has been consumed").
%% Marks contain the "marks" that have been verified and Groups returns the subgroups

nfa_match(_, success, [], Str, [], Str).
%% FILL THIS PART

%% nfa_search_in_chars(+NFA, +Str, -Res)
%% Try to find the sub-string accepted by the NFA
%% Returns : Res. Res is the string and sub-groups accepted by NFA.
nfa_search_in_chars(NFA, Str, Res) :-

    NFA = [(_State = Step) | _],
    nfa_match(NFA, Step, [], Str, Gs, Tail), append(FoundChars, Tail, Str),
    chars_to_string(FoundChars, Found),
    Res = ['' = Found | Gs].
nfa_search_in_chars(NFA, [_|Str], Res) :-
    nfa_search_in_chars(NFA, Str, Res).

%% nfa_search(+NFA, +Str, -Res)
%% Similar to `nfa_search_in_chars` but with "string".
nfa_search(NFA, Str, Res) :-
    string_to_chars(Str, Chars),
    nfa_search_in_chars(NFA, Chars, Res).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% new_state(-State)
%% Create a new State in the NFA.
new_state(State) :- genatom(s, State).

%% re_comp(+RE, +EndState, -BeginState, -NFA)
%% Compile the Regex (RE) into a NFA. The initial State is BeginState
%% The final step is EndState (RE has been accepted).
re_comp([], S, S, []).
re_comp([RE|REs], E, B, NFA) :-
    re_comp(REs, E, I, NFA2),
    re_comp(RE, I, B, NFA1),
    append(NFA1, NFA2, NFA).
re_comp(any, E, B, [B = step(E)]) :- new_state(B).
re_comp(*(RE), E, B, [B = epsilon([], [B1,E]) | NFA]) :-
    new_state(B), re_comp(RE, B, B1, NFA).
re_comp(String, E, B, NFA) :-
    "" = [] -> fail;
    string(String), string_chars(String, Chars), re_comp(Chars, E, B, NFA).

%% FILL THIS PART HERE


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Epsilons
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% COMPLETE THIS PART
%% This code takes care of Epsilon cycles, but do not take into account the "marks"

%% nfa_epsilons(+NFAi, -NFAo)
%% NFAo is an optimized version of NFAi.
%% The optimization will replace the epsilon(Ss) that have States that are epsilons with their destinations.
%% This is to avoid a list of epsilons.
nfa_epsilons(NFAi, NFAo) :- nfa_epsilons(NFAi, NFAi, NFAo).

%% nfa_epsilons(+NFA, +NFAi, -NFAo)
nfa_epsilons(_, [], []).
nfa_epsilons(NFA, [S = epsilon(_, Ss) | NFA1], [S = epsilon([], Ns) | NFA2]) :-
    !, nfa_epsilons_1(NFA, [S], Ss, Ns),
    nfa_epsilons(NFA, NFA1, NFA2).
nfa_epsilons(NFA, [S | NFA1], [S | NFA2]) :- nfa_epsilons(NFA, NFA1, NFA2).

%% nfa_epsilons_1(+NFA, +Epsilons, +States, -NonEpsilons)
%% Take as argument a list of `States` and returns a list of states `NonEpsilons`
%% that is equivalent, but without any list of epsilons-states.
%% `Epsilons` is the list of epsilons that have been checked.
nfa_epsilons_1(_, _, [], []).
nfa_epsilons_1(NFA, Es, [S|Ss], Ns) :-
    member(S, Es)
    %% If S is an epsilon-state that we already checked, nothing happens.
    -> nfa_epsilons_1(NFA, Es, Ss, Ns)
    ;  (member(S = epsilon(_, Ss1), NFA)
        %% A new epsilon-state
       -> append(Ss1, Ss, Ss2),
          nfa_epsilons_1(NFA, [S|Es], Ss2, Ns)
       ;  nfa_epsilons_1(NFA, Es, Ss, Ns1),
          (member(S, Ns1)
          -> Ns = Ns1
          ;  Ns = [S|Ns1])).

%% re_compile(+RE, -NFA)
%% Compile a Regex into a NFA
re_compile(RE, NFA) :-
        new_state(E),                          .
        re_comp(RE, E, B, NFA1),
        %% Élimine les cycle d'epsilon qui peuvent apparaître par exemple
        %% avec `*(RE)` si RE accepte la chaîne vide.
        nfa_epsilons(NFA1, NFA2),
        (B = E; NFA2 = [B = _ | _])
        %% Attention à garder l'état initial B comme premier élément de NFA.
        -> append(NFA2, [E = success], NFA)
        ; write(user_error, re_comp(not(etat_initial = premier_etat))),
          nl(user_error), !, fail.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%% re_search(+RE, +Str, -Res)
%% Try to find the substrings accepted by the Regex(RE) and convert them into the NFA.
%% The results must be similar to "search(RE, Str, Res)".
re_search(RE, Str, Res) :-
        re_compile(RE, NFA),
        (nfa_wf(NFA)
            -> nfa_search(NFA, Str, Res);
          Res = re_compile_error).