-module(crawler).
-compile(export_all).
-author({jha, abhinav}).

-define(SOCKOPTS, [binary, {packet, 0}, {active, true}]).
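%% binary: deliver data as binaries; {packet, 0}: raw byte stream;
%% {active, true}: the socket pushes incoming data to the owning process
%% as messages instead of waiting for explicit gen_tcp:recv/2 calls.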
%% Call accumulate for each of the members of the Unvisited list.
startshell(Seed) ->
    Unvisited = [Seed],
    lists:foreach(fun(Url) -> accumulate(Url, [], []) end, Unvisited).
%% Entry point shaped for `erl -run crawler start <url>`, where the argument
%% arrives as a list of strings; a plain string seed passes through untouched.
start(Seed) ->
    Seed0 = case Seed of
        [H | _] when is_list(H) -> lists:concat(Seed);
        _ -> Seed
    end,
    Unvisited = [Seed0],
    lists:foreach(fun(Url) -> accumulate(Url, [], []) end, Unvisited).
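%% Usage sketch from an Erlang shell (the seed URL is a placeholder):
%%   1> c(crawler).
%%   2> crawler:startshell("http://example.com/").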
%% Membership test used as a filter predicate: returns true only if X is NOT
%% in the list, and always rejects the dummy anchor "#".
contains(_, "#") -> false;
contains([X | _T], X) -> false;
contains([_H | T], X) -> contains(T, X);
contains([], _X) -> true.
%% Main chewing function. Chews up a web page and spits out the URLs
%% contained within, by scanning for href="..." attributes.
analyze("href=\"" ++ T, Reaped) ->
    {Remain, R} = analyze(T, Reaped, []),
    analyze(Remain, R);
analyze([_H | T], Reaped) ->
    analyze(T, Reaped);
analyze([], Reaped) ->
    Reaped.

%% Collect characters up to the closing quote of the current href value.
analyze([H | T], Reaped, Cur) ->
    case H of
        $" -> {T, [lists:reverse(Cur) | Reaped]};
        _ -> analyze(T, Reaped, [H | Cur])
    end.
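%% Worked example (hand-traced, not from the original paste):
%%   analyze("<a href=\"/about\">About</a> <a href=\"http://example.com/\">x</a>", [])
%% returns the quoted targets in reverse discovery order:
%%   ["http://example.com/", "/about"]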
%% Does the dirty work by calling the page fetching function and accumulating URLs.
accumulate(Url, Unvisited, Visited) ->
    io:format("Now downloading: ~p ~n", [Url]),
    io:format("***********************************************~n"),
    {Host, Port, Body} = split_url(Url, "", "80"),
    Newurls = analyze(binary_to_list(Body), []),
    %% Normalize every harvested link to an absolute http:// URL.
    Mappedurls = lists:map(fun(X) ->
        case X of
            "http://" ++ _ -> X;
            "#" -> "#";
            [$/ | _] ->
                case Port of
                    "80" -> lists:concat(["http://", Host, X]);
                    _ -> lists:concat(["http://", Host, ":", Port, X])
                end;
            _ ->
                case Port of
                    "80" -> lists:concat(["http://", Host, "/", X]);
                    _ -> lists:concat(["http://", Host, ":", Port, "/", X])
                end
        end
    end, Newurls),
    %% Keep only links we have not seen yet (contains/2 is a not-member test).
    Finalurls = lists:filter(fun(X) -> contains(Visited, X) end, Mappedurls),
    lists:foreach(fun(XX) -> io:format("Got URL: ~p ~n", [XX]) end, Finalurls),
    lists:foreach(fun(U) -> accumulate(U, Unvisited, [Url | Visited]) end, Finalurls).
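%% Note: the traversal is depth-first and unbounded. Visited only records the
%% chain of ancestors ([Url | Visited]), not pages reached on sibling branches,
%% so the same page can be fetched repeatedly and the crawl does not terminate
%% on cyclic sites. A sturdier version would thread one shared visited set
%% (e.g. an ets table) through the recursion.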
%% Extracts host and port from the URL and fetches the page.
split_url("http://" ++ T, Host, Port) ->
    split_url("http", T, Host, Port);
split_url(Rest, Host, Port) ->
    fetch(Host, Port, [Rest]).

split_url(_Tag, Meat, Host, Port) ->
    [H | Tokens] = string:tokens(Meat, ":"),
    case length(Tokens) of
        0 ->
            %% No explicit port: split off the path, defaulting to "/".
            [HH | UU] = string:tokens(H, "/"),
            case length(UU) of
                0 -> fetch(HH, "80", ["/"]);
                _ -> fetch(HH, "80", ["/" ++ string:join(UU, "/")])
            end;
        1 ->
            %% Explicit port, possibly followed by a path ("8080/a/b").
            [PortPath | _] = Tokens,
            [P | Path] = string:tokens(PortPath, "/"),
            case Path of
                [] -> fetch(H, P, ["/"]);
                _ -> fetch(H, P, ["/" ++ string:join(Path, "/")])
            end;
        _ ->
            io:format("~p WTF? ~n", [Meat]),
            fetch(Host, Port, [Meat])
    end.
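%% Hand-traced examples (not from the original paste):
%%   split_url("http://example.com/a/b", "", "80")    -> fetch("example.com", "80", ["/a/b"])
%%   split_url("http://example.com:8080/a", "", "80") -> fetch("example.com", "8080", ["/a"])
%%   split_url("http://example.com", "", "80")        -> fetch("example.com", "80", ["/"])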
fetch(Host, P, Url) ->
    Actualurl = case Url of
        [] -> "/";
        _ -> lists:concat(Url)
    end,
    {Port, _} = string:to_integer(P),
    io:format("~p:~p~p ~n", [Host, Port, Actualurl]),
    try gen_tcp:connect(Host, Port, ?SOCKOPTS) of
        {ok, S} ->
            io:format("Success."),
            %% Actualurl already starts with "/"; sending an explicit Host
            %% header keeps name-based virtual hosts working under HTTP/1.0.
            Request = "GET " ++ Actualurl ++ " HTTP/1.0\r\nHost: " ++ Host ++ "\r\n\r\n",
            gen_tcp:send(S, Request),
            io:format("Sent: ~p~n", [Request]),
            {Host, P, recv(S, [])};
        _ ->
            {Host, P, <<>>}
    catch
        _:_ -> {Host, P, <<>>}
    end.
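%% Because the socket is {active, true}, the reply streams in as
%% {tcp, S, Bin} messages; recv/2 gathers them until {tcp_closed, S}.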
%% Receive the page's contents: collect chunks until the server closes the
%% connection, then dump the assembled page to /tmp/current.file.
recv(S, X) ->
    receive
        {tcp, S, Bin} ->
            %% io:format("~p~n", [Bin]),
            recv(S, [Bin | X]);
        {tcp_closed, S} ->
            Page = list_to_binary(lists:reverse(X)),
            io:format("Writing file: ~p ~n", ["/tmp/current.file"]),
            file:write_file("/tmp/current.file", Page),
            Page
    end.
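%% Note: /tmp/current.file is overwritten on every fetch, so it only ever
%% holds the most recently downloaded page body.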