% blackbeard

-module(crawler).
-compile(export_all).
-author({jha, abhinav}).

% Socket options handed to gen_tcp:connect/3 in fetch/3:
%   binary         - received data is delivered as binaries;
%   {packet, 0}    - no packet framing is applied to the byte stream;
%   {active, true} - data arrives as {tcp, Socket, Data} messages, which is
%                    what the receive loop in recv/2 matches on.
-define(SOCKOPTS, [binary,{packet, 0}, {active, true}]).
% Starts a crawl from a single seed URL.
%
% startshell(Seed) -> ok
%   Seed - the URL to start from, as a plain string such as
%          "http://example.com/". It is passed to accumulate/3 unchanged,
%          with empty Unvisited and Visited lists.
startshell(Seed) ->
    _ = accumulate(Seed, [], []),
    ok.

% Starts a crawl from a seed given as a list of pieces.
%
% start(Seed) -> ok
%   Seed - when it is a list, its elements are glued together with
%          lists:concat/1 to form the seed URL, e.g. ['http://example.com/']
%          or ["http://example.com/"]. Any other term is used as is.
%
% NOTE: a flat string must not be passed here. lists:concat/1 would treat
% every character code as an integer and render it as decimal digits; use
% startshell/1 for plain strings.
% NOTE(review): the list form presumably exists for `erl -s crawler start URL`
% style invocation - confirm.
start(Seed) ->
    Root = if
               is_list(Seed) -> lists:concat(Seed);
               true -> Seed
           end,
    _ = accumulate(Root, [], []),
    ok.

% Filter predicate used by accumulate/3 to decide whether a link should be
% crawled. Despite the name, the result is inverted:
%
% contains(List, X) -> boolean()
%   false - X is the bare fragment link "#", or X is an element of List;
%   true  - X does not occur in List.
contains(_List, "#") ->
    false;
contains(List, X) when is_list(List) ->
    not lists:member(X, List).

% Scans page text for href="..." attributes and collects their values.
%
% analyze(Page, Reaped) -> Urls
%   Page   - the text still to be scanned, as a string (list of characters).
%   Reaped - the attribute values collected so far.
%   Urls   - Reaped with every value found in Page prepended, so the value
%            found last comes first.
%
% Only the exact sequence href=" is recognised (lower case, double quote,
% no whitespace around "=").
analyze([$h,$r,$e,$f,$=,$" | T], Reaped) ->
    {Remain, R} = analyze(T, Reaped, []),
    analyze(Remain, R);
analyze([_H | T], Reaped) ->
    analyze(T, Reaped);
analyze([], Reaped) ->
    Reaped.

% Reads one attribute value, up to its closing double quote.
%
% analyze(Text, Reaped, Cur) -> {Rest, Reaped1}
%   Text    - the text directly after the opening quote.
%   Cur     - the characters of the value read so far, in reverse order.
%   Rest    - the text following the closing quote.
%   Reaped1 - Reaped with the completed value prepended.
%
% If the text ends before a closing quote is seen (for example a truncated
% download), the partial value is dropped and Reaped is returned unchanged
% instead of crashing with function_clause.
analyze([$" | T], Reaped, Cur) ->
    {T, [lists:reverse(Cur) | Reaped]};
analyze([H | T], Reaped, Cur) ->
    analyze(T, Reaped, [H | Cur]);
analyze([], Reaped, _Cur) ->
    {[], Reaped}.

% Downloads one page, extracts its links and crawls every new link in turn
% (depth first).
%
% accumulate(Url, Unvisited, Visited) -> ok
%   Url       - URL of the page to download (string).
%   Unvisited - not inspected here; only handed on to the recursive calls.
%   Visited   - URLs of the pages on the path from the seed down to Url.
%
% Links are rewritten against the host and port reported by split_url/3:
%   "http://..." is kept as it is;
%   "/path"      is rooted at the host;
%   "#"          is kept and later rejected by contains/2;
%   anything else is treated as a path below the host root. That includes
%   other schemes such as "https://", which are not recognised as absolute.
accumulate(Url, Unvisited, Visited) ->
    io:format("Now downloading: ~p ~n", [Url]),
    io:format("***********************************************~n"),

    {Host, Port, Body} = split_url(Url, "", "80"),
    % Prefix for links that are not absolute; the default port 80 is left out.
    Base = case Port of
               "80" -> lists:concat(["http://", Host]);
               _ -> lists:concat(["http://", Host, ":", Port])
           end,
    Absolutize = fun([$h,$t,$t,$p,$:,$/,$/ | _] = Link) -> Link;
                    ([$/ | _] = Link) -> lists:concat([Base, Link]);
                    ("#") -> "#";
                    (Link) -> lists:concat([Base, "/", Link])
                 end,
    Mappedurls = lists:map(Absolutize, analyze(binary_to_list(Body), [])),
    % The page being processed counts as visited as well. Filtering against
    % Visited alone let a page that links to itself be downloaded twice.
    Seen = [Url | Visited],
    % contains/2 returns true for links that are NOT in Seen.
    Finalurls = lists:filter(fun(X) -> contains(Seen, X) end, Mappedurls),
    lists:foreach(fun(XX) -> io:format("Got URL: ~p ~n", [XX]) end, Finalurls),
    lists:foreach(fun(U) -> accumulate(U, Unvisited, Seen) end, Finalurls).

% Fetches the page behind a URL.
%
% split_url(Url, Host, Port) -> {Host1, Port1, Body}
%   Url  - the URL to fetch (string).
%   Host - host to fall back on when Url does not start with "http://".
%   Port - port (as a string) to fall back on in the same case.
%   The result is whatever fetch/3 returns: the host and port string that
%   were used, and the downloaded bytes as a binary.
%
% A URL starting with "http://" is handed to split_url/4, which extracts host,
% port and path from the remainder. Any other URL is requested verbatim from
% the fallback host and port.
split_url([$h,$t,$t,$p,$:,$/,$/ | T], Host, Port) ->
    split_url("http", T, Host, Port);
split_url(Url, Host, Port) ->
    % fetch/3 expects a list of path pieces and joins them with lists:concat/1;
    % a bare string would be rendered as the decimal digits of its characters.
    fetch(Host, Port, [Url]).

% Splits the part of a URL that follows "http://" into host, port and path,
% then fetches the page.
%
% split_url(Tag, Meat, Host, Port) -> {Host1, Port1, Body}
%   Tag  - scheme name; not used.
%   Meat - the URL without its "http://" prefix, e.g. "example.com:8080/a/b".
%   Host - fallback host, used only when Meat cannot be parsed.
%   Port - fallback port string, used only when Meat cannot be parsed.
%   The result is whatever fetch/3 returns.
%
% Everything before the first "/" is the authority ("host" or "host:port");
% the rest is the path. Empty path segments are dropped, so repeated and
% trailing slashes disappear, and a missing path becomes "/". The port
% defaults to "80".
%
% The previous version matched the port token against the already bound Port
% argument, so an explicit port other than a bare ":80" raised badmatch, and
% a path after the port was never separated from it.
split_url(_Tag, Meat, Host, Port) ->
    {Authority, RawPath} = lists:splitwith(fun(C) -> C =/= $/ end, Meat),
    Path = "/" ++ string:join(string:tokens(RawPath, "/"), "/"),
    case string:tokens(Authority, ":") of
        [H] ->
            fetch(H, "80", [Path]);
        [H, P] ->
            fetch(H, P, [Path]);
        _ ->
            % Empty authority or more than one ":": give up on parsing and
            % request the text as it is from the fallback host.
            io:format("~p WTF? ~n", [Meat]),
            fetch(Host, Port, [Meat])
    end.

% Downloads one resource with a plain HTTP/1.0 GET request.
%
% fetch(Host, P, Url) -> {Host, P, Body}
%   Host - host to connect to.
%   P    - port as a string; its leading integer is used as the TCP port.
%   Url  - list of path pieces that are joined with lists:concat/1; an empty
%          list means "/".
%   Body - everything the server sent before closing the connection, as a
%          binary (status line and headers included). It is the empty binary
%          when the connection could not be established.
%
% Fetching is best effort: a failed connect, or an exception raised by
% gen_tcp:connect/3 (for example because P holds no integer), yields an empty
% body instead of an error.
fetch(Host, P, Url) ->
    Actualurl = case Url of
                    [] -> "/";
                    _ -> lists:concat(Url)
                end,
    {Port, _} = string:to_integer(P),
    io:format("~p:~p/~p ~n", [Host, Port, Actualurl]),
    try gen_tcp:connect(Host, Port, ?SOCKOPTS) of
        {ok, S} ->
            io:format("Success."),
            % The request target is the path exactly as given. A "/" used to
            % be appended here, producing requests such as "GET /a.html/".
            Request = "GET " ++ Actualurl ++ " HTTP/1.0\r\n\r\n",
            gen_tcp:send(S, Request),
            io:format("Sent: ~p~n", [Request]),
            {Host, P, recv(S, [])};
        _ ->
            {Host, P, <<>>}
    catch
        _:_ -> {Host, P, <<>>}
    end.

% Collects everything an active-mode socket delivers until the peer closes
% the connection.
%
% recv(S, X) -> Body
%   S    - the socket whose {tcp, S, Data} and {tcp_closed, S} messages are
%          consumed from the calling process's mailbox.
%   X    - the chunks received so far, newest first; start with [].
%   Body - all chunks joined in arrival order, as one binary.
%
% As a side effect, the body is also written to /tmp/current.file; the result
% of that write is not checked.
%
% NOTE(review): there is no timeout, so a peer that neither sends nor closes
% keeps this function waiting indefinitely.
recv(S, X) ->
    receive
        {tcp, S, Bin} ->
            %io:format("~p~n",[Bin]),
            recv(S, [Bin | X]);
        {tcp_closed, S} ->
            % Restore arrival order before doing anything with the data. The
            % dump used to be written from the reversed accumulator, which
            % put the chunks into the file back to front.
            Body = list_to_binary(lists:reverse(X)),
            io:format("Writing file: ~p ~n.", ["/tmp/current.file"]),
            file:write_file("/tmp/current.file", Body),
            Body
    end.