Advertisement
Guest User

Untitled

a guest
Aug 13th, 2019
120
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Erlang 3.94 KB | None | 0 0
  1. -module(index).
  2. -export([index/1]).
  3.  
  4. % Used to read a file into a list of lines.
  5. % Example files available in:
  6. %   gettysburg-address.txt (short)
  7. %   dickens-christmas.txt  (long)
  8.  
  9.  
  % Read the text file called Name and return its lines in file order.
  % The file is opened in read mode; a failed open crashes with badmatch.
  % Line reading, terminator stripping and closing of the device are all
  % delegated to get_all_lines/2, which returns the lines newest-first,
  % hence the final reverse.
  get_file_contents(Name) ->
      {ok, Device} = file:open(Name, [read]),
      lists:reverse(get_all_lines(Device, [])).
  16.  
  % Auxiliary function for get_file_contents. Not exported.
  %
  % Reads the remaining lines from File and pushes each one onto Partial,
  % so the result is in reverse reading order (the caller reverses it).
  % A single trailing newline is removed from a line only when it is
  % actually present, so a final line without a terminating newline keeps
  % its last character.
  % File is closed on end of file and also when a read error occurs; a read
  % error is raised as error({read_error, Reason}).
  -spec get_all_lines(file:io_device(), [string()]) -> [string()].
  get_all_lines(File, Partial) ->
      case io:get_line(File, "") of
          eof ->
              file:close(File),
              Partial;
          {error, Reason} ->
              % Do not leak the handle when the read fails.
              file:close(File),
              error({read_error, Reason});
          Line ->
              get_all_lines(File, [strip_newline(Line) | Partial])
      end.

  % Remove one trailing "\n" from Line, if there is one.
  -spec strip_newline(string()) -> string().
  strip_newline(Line) ->
      case lists:suffix("\n", Line) of
          true -> lists:droplast(Line);
          false -> Line
      end.
  26.  
  27.  
  % Print every string of the given list on its own line of standard output.
  % Handy for eyeballing the result of get_file_contents. Returns ok.
  show_file_contents(Lines) ->
      lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
  35.  
  36.  
  % Load FileName as a list of text lines, lower-case every line, and then
  % delete every run of characters that is not an ASCII letter or a space.
  % Deleted characters are not replaced by anything, so text on either side
  % of removed punctuation is joined together.
  get_cleaned_contents(FileName) ->
      Lowered = [string:lowercase(Raw) || Raw <- get_file_contents(FileName)],
      [re:replace(Text, "[^A-Za-z ]+", "", [global, {return, list}]) || Text <- Lowered].
  47.  
  48.  
  % Return the distinct words occurring in Lines, in ascending order.
  %
  % Each line is split on spaces with string:tokens/2, which never produces
  % empty tokens, so no separate filtering of empty words is required.
  % lists:usort/1 both sorts and removes duplicates, so the result order is
  % deterministic rather than depending on the internal layout of a set.
  -spec unique_words([string()]) -> [string()].
  unique_words(Lines) ->
      Words = lists:flatmap(fun(Line) -> string:tokens(Line, " ") end, Lines),
      lists:usort(Words).
  62.  
  63.  
  % Build the index entry for Word over Lines.
  % The result is {Word, Ranges}, where Ranges is what anex_numbers/1 makes
  % of the positions of the lines that contain Word as a space-separated
  % token.
  % NOTE(review): positions come from indexes_where/2, which starts counting
  % at 0 in this file, so the "line numbers" look zero-based - confirm that
  % this is intended.
  index_word(Word, Lines) ->
      ContainsWord = fun(Line) ->
          lists:any(fun(Token) -> Token == Word end, string:tokens(Line, " "))
      end,
      Hits = [ContainsWord(Line) || Line <- Lines],
      Positions = indexes_where(fun(Hit) -> Hit == true end, Hits),
      {Word, anex_numbers(Positions)}.
  78.  
  79.  
  % Return the zero-based positions, in ascending order, of the elements of
  % List for which Pred returns true. Any other return value of Pred is
  % treated as "does not match".
  -spec indexes_where(fun((term()) -> term()), [term()]) -> [non_neg_integer()].
  indexes_where(Pred, List) ->
      indexes_where(Pred, List, 0, []).

  % Worker: CurrentIndex is the position of the head of the remaining list;
  % Found holds the matching positions newest-first, so each hit is a
  % constant-time prepend and the list is reversed once at the end.
  indexes_where(_Pred, [], _CurrentIndex, Found) ->
      lists:reverse(Found);
  indexes_where(Pred, [X | Xs], CurrentIndex, Found) ->
      case Pred(X) of
          true ->
              indexes_where(Pred, Xs, CurrentIndex + 1, [CurrentIndex | Found]);
          _ ->
              indexes_where(Pred, Xs, CurrentIndex + 1, Found)
      end.
  94.  
  95.  
  % Group a list of numbers into runs of consecutive values.
  % Each run is reported as {First, Last}; a run of one number N is {N, N}.
  % Example: [0,1,2,4,7,8] -> [{0,2},{4,4},{7,8}].
  % An empty input yields an empty list.
  -spec anex_numbers([integer()]) -> [{integer(), integer()}].
  anex_numbers([]) ->
      [];
  anex_numbers([First | Rest]) ->
      group_adjacent(Rest, First, First, []).

  % Worker: Start and Last delimit the run currently being extended; Runs
  % holds the finished runs newest-first and is reversed once at the end.
  group_adjacent([], Start, Last, Runs) ->
      lists:reverse([{Start, Last} | Runs]);
  group_adjacent([X | Xs], Start, Last, Runs) when X =:= Last + 1 ->
      group_adjacent(Xs, Start, X, Runs);
  group_adjacent([X | Xs], Start, Last, Runs) ->
      % X does not continue the current run: close it and start a new one.
      group_adjacent(Xs, X, X, [{Start, Last} | Runs]).
  105.  
  106.  
  % Collapse a non-empty list of line numbers into a two-element tuple made
  % of its first and its last element. A one-element list [N] gives {N, N}.
  convert_line_number_to_index([First | _] = LineNumbers) ->
      {First, lists:last(LineNumbers)}.
  112.  
  113.  
  % Index the file FileName: read and clean its lines, collect the distinct
  % words, and return one index_word/2 entry per distinct word.
  index(FileName) ->
      Cleaned = get_cleaned_contents(FileName),
      [index_word(Word, Cleaned) || Word <- unique_words(Cleaned)].
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement