Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- -module(index).
- -export([get_file_contents/1,show_file_contents/1,index/1,aggregate/1]).
- % Used to read a file into a list of lines.
- % Example files available in:
- % gettysburg-address.txt (short)
- % dickens-christmas.txt (long)
% Get the contents of a text file into a list of lines, in file order.
% Each line has its trailing newline removed (see get_all_lines/2).
%
% Crashes with a badmatch if the file cannot be opened. The file is closed
% before this function returns or crashes, even if reading fails part-way.
-spec get_file_contents(file:name_all()) -> [string()].
get_file_contents(Name) ->
    {ok, File} = file:open(Name, [read]),
    try
        % get_all_lines/2 accumulates the most recently read line first,
        % so reverse to restore file order.
        lists:reverse(get_all_lines(File, []))
    after
        % get_all_lines/2 may already have closed the file; the result of
        % this close is deliberately ignored.
        file:close(File)
    end.
% Auxiliary function for get_file_contents.
% Not exported.
%
% Reads File line by line until end of file, pushing each line (without its
% trailing newline) onto Partial. Returns the accumulated list, most recently
% read line first. The file is closed on end of file and on a read error.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        {error, Reason} ->
            % Release the handle and fail with the real cause instead of
            % treating the error tuple as a line of text.
            file:close(File),
            error({read_error, Reason});
        Line ->
            get_all_lines(File, [strip_newline(Line) | Partial])
    end.

% Remove one trailing newline character if there is one. The last line of a
% file may have no newline, in which case the line is returned unchanged.
strip_newline(Line) ->
    case lists:reverse(Line) of
        [$\n | Rest] -> lists:reverse(Rest);
        _ -> Line
    end.
% Print every string of the given list, one per output line.
% Handy for inspecting what get_file_contents returned. Returns ok.
show_file_contents(Lines) ->
    lists:foreach(
        fun(Line) -> io:format("~s~n", [Line]) end,
        Lines).
- %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Produce a word index for the named file.
% The result is a list of {Word, Ranges} entries, where Ranges is the list
% of {First,Last} line ranges on which Word occurs. Example entry:
% { "foo" , [{3,5},{7,7},{11,13}] }
index(Filename) ->
    % Read the lines, collect raw line numbers per word, then tidy up.
    clean_index(build_index(get_file_contents(Filename))).
% Build the raw index from a list of lines. Lines are numbered from 1 in
% list order, and each one is folded into the index by index_line/3.
build_index(Lines) ->
    {_NextLineNum, Index} = lists:foldl(
        fun(Line, {LineNum, Acc}) ->
            {LineNum + 1, index_line(Line, LineNum, Acc)}
        end,
        {1, []},
        Lines),
    Index.
% Add every word of a single line to the index under the given line number.
index_line(Line, LineNum, Index) ->
    % Characters that separate words: space, comma, hyphen, full stop,
    % semicolon, backslash, exclamation mark, single and double quote.
    Separators = " ,-.;\\!'\"",
    % Words are indexed in lower case.
    Words = [string:to_lower(Token) || Token <- string:tokens(Line, Separators)],
    index_words(Words, LineNum, Index).
% Record an occurrence on LineNum for each of the given words, threading the
% index through from the first word to the last.
index_words(Words, LineNum, Index) ->
    lists:foldl(
        fun(Word, Acc) ->
            Updated = add_occurrence(get_entry(Word, Acc), LineNum),
            update_index(Acc, Updated)
        end,
        Index,
        Words).
% Look up the entry for Word in the index. If the word is not present yet,
% return a fresh entry with no line numbers.
get_entry(Word, Index) ->
    case lists:keyfind(Word, 1, Index) of
        false -> {Word, []};
        Existing -> Existing
    end.
% Note that a word occurred on LineNum. Line numbers are kept as a plain
% list here (newest first); they are turned into ranges later on.
add_occurrence({Word, LineNumbers} = Entry, LineNum) ->
    AlreadySeen = lists:member(LineNum, LineNumbers),
    if
        % The word was already recorded for this line: nothing to do.
        AlreadySeen -> Entry;
        % First time the word is seen on this line.
        true -> {Word, [LineNum | LineNumbers]}
    end.
% Store Entry in the index: replace the entry that has the same word if
% there is one, otherwise append it at the end.
update_index(Index, {Key, _Occurrences} = Entry) ->
    lists:keystore(Key, 1, Index, Entry).
% Convert the raw index into its final form: every entry's plain line
% numbers become {start,end} ranges, and the entries are sorted by word.
clean_index(Index) ->
    % Prepending while folding leaves the cleaned entries in reverse input
    % order; the stable sort on the word then puts them in final order.
    Cleaned = lists:foldl(
        fun(Entry, Acc) -> [clean_entry(Entry) | Acc] end,
        [],
        Index),
    lists:keysort(1, Cleaned).
% Replace an entry's plain line numbers with a list of {start,end} line
% number ranges. The numbers are sorted first, as aggregate/1 expects.
clean_entry({Word, LineNumbers}) ->
    Ascending = lists:sort(LineNumbers),
    {Word, aggregate(Ascending)}.
% Take a sorted (ascending) list of numbers and aggregate runs of
% consecutive numbers into {Lo,Hi} tuples. Numbers that stand alone become
% {N,N} tuples. Repeated numbers are counted once.
%
% Eg aggregate([1,3,4,5,7,9,10]) becomes
% [{1,1}, {3,5}, {7,7}, {9,10}]
% and aggregate([1,1,2]) becomes [{1,2}].
%
% A list that is not in ascending order fails with function_clause.
-spec aggregate([pos_integer()]) -> [{pos_integer(), pos_integer()}].
aggregate([]) ->
    [];
aggregate([N | Ns]) ->
    % The first number opens the first group.
    aggregate(Ns, N, N, []).

% aggregate(
%   Numbers :: [pos_int],              remaining input
%   Lo      :: pos_int,                start of the group in progress
%   Hi      :: pos_int,                end of the group in progress
%   Agg     :: [{pos_int, pos_int}])   finished groups, newest first
aggregate([], Lo, Hi, Agg) ->                           % Input exhausted.
    lists:reverse([{Lo, Hi} | Agg]);
aggregate([N | Ns], Lo, Hi, Agg) when N == Hi ->        % Duplicate: skip it.
    aggregate(Ns, Lo, Hi, Agg);
aggregate([N | Ns], Lo, Hi, Agg) when N == Hi + 1 ->    % Continue the group.
    aggregate(Ns, Lo, N, Agg);
aggregate([N | Ns], Lo, Hi, Agg) when N > Hi + 1 ->     % End it, start anew.
    aggregate(Ns, N, N, [{Lo, Hi} | Agg]).
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement