Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- -module(spyder).
- -compile(export_all).
- -author({jha, abhinav}).
- -define(SOCKOPTS, [binary,{packet, 0}, {active, true}]).
%% @doc Ensure a URL string ends with a trailing slash.
%%
%% Returns `X' unchanged when its last character is `$/'; otherwise returns
%% `X' with one "/" appended. `X' must be a non-empty string (list of
%% characters); an empty list makes lists:last/1 fail.
clean(X) ->
    %% lists:last/1 yields a single character (an integer), so it has to be
    %% compared with the character literal $/ rather than the string "/".
    case lists:last(X) of
        $/ -> X;
        _ -> X ++ "/"
    end.
%% @doc Entry point of the crawler.
%%
%% `X' is a list of terms accepted by lists:concat/1; they are joined into
%% the seed URL string (presumably the arguments handed over by `erl -s' or
%% `erl -run' -- confirm against how the module is launched). The joined
%% string is printed, normalised with clean/1 and used as the only entry of
%% the initial work queue; the visited list starts out empty.
start(X) ->
    Joined = lists:concat(X),
    io:format("~p~n", [Joined]),
    crawl([clean(Joined)], []).
%% @doc Process the queue of unvisited URLs until it is empty.
%%
%% First argument: queue of URLs still to fetch. Second argument: list of
%% URLs already fetched (stored in their clean/1-normalised form).
%%
%% Each queued URL is normalised with clean/1. If that normalised URL is
%% already in the visited list it is dropped from the queue without being
%% fetched again; this keeps pages that link to each other from being
%% crawled in an endless cycle. Otherwise gather_links/3 fetches the page
%% and returns the updated visited list plus the links found, which are
%% appended to the end of the queue.
%%
%% Prints progress after every fetched page and a final message once the
%% queue is exhausted.
crawl([], _V) ->
    io:format("Successfully traversed the entire list.");
crawl([H | T], V) ->
    Url = clean(H),
    case lists:member(Url, V) of
        true ->
            %% Already fetched: skip it, visited list unchanged.
            crawl(T, V);
        false ->
            {Visited, Acc} = gather_links(Url, V, []),
            %% Build the new queue once and reuse it for both the report
            %% and the recursive call.
            Queue = T ++ Acc,
            io:format("Visited:~p Unvisited:~p ~n.", [length(Visited), length(Queue)]),
            lists:foreach(fun(X) -> io:format("~p~n", [X]) end, Visited),
            crawl(Queue, Visited)
    end.
%% @doc Fetch the page at `U' over plain HTTP and collect the links on it.
%%
%% `U'       - URL string; must contain "http://" and a "/" after the host
%%             part (clean/1 guarantees a trailing one). An optional
%%             ":Port" after the host is honoured; the default port is 80.
%% `Visited' - URLs fetched so far.
%% `Acc'     - links already queued by the caller.
%%
%% Returns `{NewVisited, NewAcc}'. On success `U' is prepended to `Visited'
%% and every extracted link that is in neither `Visited' nor `Acc' is put
%% in front of `Acc'. When `U' does not contain "http://" a message is
%% printed and `{Visited, Acc}' is returned unchanged, so the caller's
%% `{Visited, Acc} = gather_links(...)' match always succeeds.
%%
%% Connection or send failures crash the caller through the assertive
%% matches on gen_tcp:connect/3 and gen_tcp:send/2.
gather_links(U, Visited, Acc) ->
    case string:str(U, "http://") of
        0 ->
            io:format("~p. Could not recognize ~p~n", [?LINE, U]),
            {Visited, Acc};
        X ->
            %% Drop everything up to and including the "http://" prefix.
            Url = string:substr(U, X + 7),
            Slash = string:str(Url, "/"),
            [Host | Rest] = string:tokens(string:substr(Url, 1, Slash - 1), ":"),
            RestUrl = string:substr(Url, Slash),
            Port =
                case Rest of
                    [] ->
                        80;
                    [PortStr] ->
                        {IPort, _} = string:to_integer(PortStr),
                        IPort
                end,
            {ok, S} = gen_tcp:connect(Host, Port, ?SOCKOPTS),
            Page =
                try
                    ok = gen_tcp:send(S, "GET " ++ RestUrl ++ " HTTP/1.0\r\n\r\n"),
                    fetch_data(S, [])
                after
                    %% Always release the socket, even if sending or
                    %% receiving fails.
                    gen_tcp:close(S)
                end,
            Links = extract_links_from_page(Page, Host, Port),
            %% Keep only links that are neither visited nor already queued,
            %% preserving the order in which they were extracted.
            Fresh = [L || L <- Links,
                          not lists:member(L, Visited),
                          not lists:member(L, Acc)],
            {[U | Visited], Fresh ++ Acc}
    end.
%% @doc Collect everything an active-mode TCP socket delivers to this
%% process and return it as one binary.
%%
%% `S'   - the socket whose `{tcp, S, Bin}' messages are consumed.
%% `Acc' - chunks received so far, newest first (callers pass `[]').
%%
%% Receiving stops when the peer closes the connection (`tcp_closed'), when
%% the socket reports an error (`tcp_error'), or when no message for `S'
%% arrives within 30 seconds. In the latter two cases the data gathered so
%% far is returned, so a stalled or failing peer cannot block the crawler
%% indefinitely.
fetch_data(S, Acc) ->
    receive
        {tcp, S, Bin} ->
            fetch_data(S, [Bin | Acc]);
        {tcp_closed, S} ->
            list_to_binary(lists:reverse(Acc));
        {tcp_error, S, _Reason} ->
            list_to_binary(lists:reverse(Acc))
    after 30000 ->
        list_to_binary(lists:reverse(Acc))
    end.
%% @doc Extract the links of a fetched page and turn them into URLs.
%%
%% `Page' - the raw response as a binary.
%% `Host' - host name (string) the page was fetched from.
%% `Port' - port (integer) the page was fetched from.
%%
%% Side effect: the raw page is written to a file under /tmp named after
%% the output of `uuidgen -t'; the result of the write is ignored.
%%
%% Every href found by extract_links_from_page/2 is classified exactly once
%% (see classify_link/2). The result lists the URLs grouped by kind, in
%% this order: absolute, rooted, bare host, relative.
extract_links_from_page(Page, Host, Port) ->
    %% os:cmd/1 returns the command output including its trailing newline,
    %% which must not become part of the file name.
    DumpName = string:strip(os:cmd("uuidgen -t"), right, $\n),
    file:write_file("/tmp/" ++ DumpName, Page),
    Links = extract_links_from_page(binary_to_list(Page), []),
    Base = "http://" ++ Host ++ ":" ++ integer_to_list(Port),
    Tagged = [classify_link(Link, Base) || Link <- Links],
    lists:append([[Url || {Kind, Url} <- Tagged, Kind =:= Wanted]
                  || Wanted <- [absolute, rooted, bare_host, relative]]).

%% Classify one href and build the URL to crawl for it. `Base' is
%% "http://Host:Port" of the page the href was found on.
%%   rooted    - starts with "/": resolved against Base.
%%   absolute  - contains "http://": used as is.
%%   bare_host - looks like a host name without scheme: "http://" is
%%               prepended.
%%   relative  - anything else: resolved against Base with a "/" between.
classify_link([$/ | _] = Link, Base) ->
    {rooted, Base ++ Link};
classify_link(Link, Base) ->
    case string:str(Link, "http://") =/= 0 of
        true ->
            {absolute, Link};
        false ->
            case looks_like_bare_host(Link) of
                true -> {bare_host, "http://" ++ Link};
                false -> {relative, Base ++ "/" ++ Link}
            end
    end.

%% True when the link contains a "." but none of the listed page/media
%% extensions. The extensions are matched as substrings anywhere in the
%% link, so e.g. ".do" also matches inside ".doc" or ".domain".
looks_like_bare_host(Link) ->
    Extensions = [".php", ".do", ".aspx", ".html", ".mpg", ".jpg", ".mp3", ".png"],
    string:str(Link, ".") =/= 0 andalso
        not lists:any(fun(Ext) -> string:str(Link, Ext) =/= 0 end, Extensions).
%% @doc Scan a page (as a string) for `href="' attributes.
%%
%% Whenever the literal text `href="' is found, get_remaining_link/2 reads
%% the attribute value up to the closing double quote and scanning resumes
%% right after it. Each value is pushed onto the accumulator, so the
%% returned list holds the links in reverse order of appearance. Only the
%% exact lowercase, double-quoted form is recognised.
extract_links_from_page("href=\"" ++ AfterQuote, Found) ->
    {Link, Remainder} = get_remaining_link(AfterQuote, []),
    extract_links_from_page(Remainder, [Link | Found]);
extract_links_from_page([_Skipped | Tail], Found) ->
    extract_links_from_page(Tail, Found);
extract_links_from_page([], Found) ->
    Found.
%% @doc Read characters up to the next double quote.
%%
%% Returns `{Value, Rest}' where `Value' is the text before the first `"'
%% in the input (prefixed by whatever `Acc' already holds, which is kept in
%% reverse order) and `Rest' is the text after that quote. When the input
%% ends before a closing quote is seen -- e.g. a truncated page -- the end
%% of input is treated as the terminator and `Rest' is `[]'.
get_remaining_link([$" | T], Acc) ->
    {lists:reverse(Acc), T};
get_remaining_link([H | T], Acc) ->
    get_remaining_link(T, [H | Acc]);
get_remaining_link([], Acc) ->
    {lists:reverse(Acc), []}.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement