Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- -module(index).
- -export([index/1]).
% Helpers for reading a text file into a list of lines.
% Example input files:
%   gettysburg-address.txt (short)
%   dickens-christmas.txt (long)
% Read the text file Name and return its lines, in file order, as a list
% of strings. One trailing newline is removed from each line; a final line
% that does not end in a newline is returned unchanged.
% Crashes with badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, File} = file:open(Name, [read]),
    %% try ... after closes the handle even if reading fails part-way.
    try
        lists:reverse(get_all_lines(File, []))
    after
        file:close(File)
    end.

% Auxiliary function for get_file_contents. Not exported.
% Reads File until eof, prepending each stripped line to Partial, so the
% result is in reverse file order. The caller is responsible for closing
% File.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            Partial;
        {error, Reason} ->
            error({read_failed, Reason});
        Line ->
            get_all_lines(File, [strip_newline(Line) | Partial])
    end.

% Remove a single trailing "\n" from Line, if there is one.
strip_newline(Line) ->
    case lists:suffix("\n", Line) of
        true -> lists:droplast(Line);
        false -> Line
    end.
% Print each string in Lines on its own line of standard output.
% Useful for checking the result of get_file_contents. Returns ok.
show_file_contents(Lines) ->
    lists:foreach(fun(Text) -> io:format("~s~n", [Text]) end, Lines).
% Read FileName and normalise every line: lower-case it, then delete each
% run of characters that is neither an ASCII letter nor a space.
% Returns the cleaned lines, in file order, as a list of strings.
get_cleaned_contents(FileName) ->
    Clean = fun(Raw) ->
        Lowered = string:lowercase(Raw),
        re:replace(Lowered, "[^A-Za-z ]+", "", [global, {return, list}])
    end,
    [Clean(Text) || Text <- get_file_contents(FileName)].
% Return the distinct words occurring in Lines, sorted in ascending order.
% Words are the space-separated tokens of each line, so the result never
% contains an empty string.
unique_words(Lines) ->
    %% lists:flatmap/2 joins the per-line token lists into one list of
    %% words; lists:usort/1 sorts it and drops duplicates, which also makes
    %% the result order deterministic.
    Words = lists:flatmap(fun(Line) -> string:tokens(Line, " ") end, Lines),
    lists:usort(Words).
% Build the index entry for Word.
% Returns {Word, Ranges}, where Ranges holds the 1-based numbers of the
% lines in Lines on which Word appears, grouped by adjacency.
index_word(Word, Lines) ->
    %% One boolean per line: does Word occur among that line's tokens?
    Mask = [lists:member(Word, string:tokens(Line, " ")) || Line <- Lines],
    %% indexes_where/2 counts positions from 0; add 1 so that the first
    %% line of the text is reported as line 1.
    Positions = indexes_where(fun(Found) -> Found end, Mask),
    LineNumbers = [Pos + 1 || Pos <- Positions],
    {Word, anex_numbers(LineNumbers)}.
% Return the 0-based positions of the elements of List for which Pred
% returns true, in ascending order.
indexes_where(Pred, List) ->
    indexes_where(Pred, List, 0, []).

% Worker: Index is the position of the head of the remaining list; Found
% collects matching positions newest-first and is reversed at the end, so
% the whole scan is linear instead of appending to the tail on every hit.
indexes_where(_Pred, [], _Index, Found) ->
    lists:reverse(Found);
indexes_where(Pred, [X | Xs], Index, Found) ->
    case Pred(X) of
        true -> indexes_where(Pred, Xs, Index + 1, [Index | Found]);
        _ -> indexes_where(Pred, Xs, Index + 1, Found)
    end.
% Group an ascending list of line numbers into runs of consecutive values.
% Each run is returned as a {First, Last} tuple; an isolated number N
% becomes {N, N}. For example [3,4,5,7,11,12,13] gives
% [{3,5},{7,7},{11,13}]. An empty list gives [].
anex_numbers([]) ->
    [];
anex_numbers([X | Xs]) ->
    anex_numbers(Xs, [], X, X).

% Worker: Groups holds the finished tuples newest-first; First and Last
% are the bounds of the run currently being extended.
anex_numbers([], Groups, First, Last) ->
    lists:reverse([convert_line_number_to_index([First, Last]) | Groups]);
anex_numbers([X | Xs], Groups, First, Last) when X == Last + 1 ->
    anex_numbers(Xs, Groups, First, X);
anex_numbers([X | Xs], Groups, First, Last) ->
    anex_numbers(Xs, [convert_line_number_to_index([First, Last]) | Groups], X, X).

% Collapse a non-empty list of line numbers into a two-item tuple holding
% its first and last elements. A one-element list [N] gives {N, N}.
convert_line_number_to_index([First | _] = LineNumbers) ->
    {First, lists:last(LineNumbers)}.
% Index a file.
% Cleans the contents of FileName, then returns one index_word/2 entry for
% each word reported by unique_words/1.
index(FileName) ->
    CleanLines = get_cleaned_contents(FileName),
    [index_word(Word, CleanLines) || Word <- unique_words(CleanLines)].
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement