% Untitled

-module(index).
-export([index/1]).

% Used to read a file into a list of lines.
% Example files available in:
%   gettysburg-address.txt (short)
%   dickens-christmas.txt  (long)


% Read the text file Name and return its lines, in file order, as a list
% of strings. Opening the file is asserted with a match on {ok, _}, so a
% failed open raises badmatch. Reading the lines, stripping the line
% terminator and closing the handle are delegated to get_all_lines/2.
get_file_contents(Name) ->
    {ok, Handle} = file:open(Name, [read]),
    % get_all_lines/2 hands the lines back last-first.
    lists:reverse(get_all_lines(Handle, [])).

% Auxiliary function for get_file_contents.
% Not exported.
%
% Reads File line by line until end of file, pushing each line onto
% Partial, so the result holds the lines in reverse order (last line of
% the file first). The file is closed once eof is reached.
%
% Only a trailing newline is removed from a line. The final line of a
% file often has none, and it must keep its last character.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        Line ->
            get_all_lines(File, [drop_trailing_newline(Line) | Partial])
    end.

% Remove a single "\n" from the end of Line, if there is one.
drop_trailing_newline(Line) ->
    case lists:suffix("\n", Line) of
        true -> lists:droplast(Line);
        false -> Line
    end.


% Print every string of the given list on its own line.
% Handy for eyeballing what get_file_contents returned. Returns ok.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).


% Load FileName through get_file_contents/1 and normalise every line:
% the line is lower-cased, then each run of characters other than ASCII
% letters and spaces is deleted (deleted, not replaced by a space).
% Returns the normalised lines as a list of strings, in file order.
get_cleaned_contents(FileName) ->
    [
        re:replace(string:lowercase(RawLine), "[^A-Za-z ]+", "", [global, {return, list}])
     || RawLine <- get_file_contents(FileName)
    ].


% Get the unique words of the text lines.
%
% Lines is a list of strings. Each line is split on spaces and the
% distinct words over all lines are returned as a sorted list of
% strings, so the result is deterministic.
%
% string:tokens/2 never produces empty tokens, so no separate filtering
% of empty words is needed.
unique_words(Lines) ->
    Words = lists:flatmap(fun(Line) -> string:tokens(Line, " ") end, Lines),
    % usort sorts and removes duplicates in one pass.
    lists:usort(Words).


% Get the index of a word,
% i.e. a tuple composed of the word and the list of line-number ranges
% in which this word appears, grouped by adjacency (see anex_numbers/1).
%
% Word is a string; Lines is the list of text lines to search. A line
% contains the word when one of its space-separated tokens equals Word.
% Line numbers are 1-based: the first element of Lines is line 1.
index_word(Word, Lines) ->
    Positions = indexes_where(
        fun(Line) -> lists:member(Word, string:tokens(Line, " ")) end,
        Lines
    ),
    % indexes_where/2 counts positions from 0, line numbers start at 1.
    LineNumbers = [Position + 1 || Position <- Positions],
    {Word, anex_numbers(LineNumbers)}.


% Return the 0-based positions, in ascending order, of the items of List
% for which Pred returns true. Any other result of Pred counts as "no".
indexes_where(Pred, List) ->
    indexes_where(Pred, List, 0, []).

% Worker: Index is the position of the head of the remaining list and
% Found holds the matching positions seen so far, newest first. Matches
% are prepended and reversed once at the end, which keeps the walk
% linear instead of appending to the end of the list on every match.
indexes_where(_Pred, [], _Index, Found) ->
    lists:reverse(Found);
indexes_where(Pred, [Item | Rest], Index, Found) ->
    case Pred(Item) of
        true -> indexes_where(Pred, Rest, Index + 1, [Index | Found]);
        _ -> indexes_where(Pred, Rest, Index + 1, Found)
    end.


% Return the line numbers grouped by adjacency.
%
% Takes a list of integers and returns a list of {First, Last} tuples,
% one per run of consecutive numbers, in input order. For example
% [1,2,3,7,9,10] gives [{1,3},{7,7},{9,10}]. An empty list gives [].
anex_numbers([]) ->
    [];
anex_numbers([First | Rest]) ->
    anex_numbers(Rest, [], First, First).

% Worker: Start and End delimit the run being built; Ranges holds the
% finished runs, newest first, and is reversed once at the end.
anex_numbers([], Ranges, Start, End) ->
    lists:reverse([{Start, End} | Ranges]);
anex_numbers([Number | Rest], Ranges, Start, End) when Number =:= End + 1 ->
    % Number extends the current run.
    anex_numbers(Rest, Ranges, Start, Number);
anex_numbers([Number | Rest], Ranges, Start, End) ->
    % Gap: close the current run and start a new one at Number.
    anex_numbers(Rest, [{Start, End} | Ranges], Number, Number).


% Collapse a non-empty list of line numbers into one two-item tuple
% holding its first and its last element. For a list in ascending order
% these are the minimum and maximum line number. A one-element list
% gives that element twice.
convert_line_number_to_index([First | _] = LineNumbers) ->
    {First, lists:last(LineNumbers)}.


% Index a file: read and clean the lines of FileName, collect its unique
% words, and return one index_word/2 entry per unique word.
index(FileName) ->
    Lines = get_cleaned_contents(FileName),
    [index_word(Word, Lines) || Word <- unique_words(Lines)].