SHARE
TWEET

blackbeard

a guest Dec 4th, 2010 569 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. -module(spyder).  % Simple HTTP link crawler; a crawl is kicked off by start/1.
  2. -compile(export_all).  % Exports every function defined in this module.
  3. -author({jha, abhinav}).
  4.  
  5. -define(SOCKOPTS, [binary,{packet, 0}, {active, true}]).  % Options given to gen_tcp:connect/3 in gather_links/3: binary payloads, no packet framing, and active mode so data arrives as messages (consumed by fetch_data/2).
  6.  
  %% @doc Normalises a URL string so that it ends with a trailing "/".
  %%
  %% X is a string (list of characters). If its last character is already
  %% `/' it is returned unchanged; otherwise a "/" is appended. The empty
  %% string yields "/".
  %%
  %% The last element of a string is a character, so it has to be compared
  %% with the character `$/'. Comparing it with the string "/" never matches
  %% and would append a second slash to URLs that already end in one.
  clean([]) ->
      "/";
  clean(X) ->
      case lists:last(X) of
          $/ -> X;
          _ -> X ++ "/"
      end.
  14.  
  %% @doc Entry point of a crawl.
  %%
  %% X is a list of terms that lists:concat/1 joins into the seed URL string.
  %% The joined string is printed, normalised with clean/1 and handed to
  %% crawl/2 as the only unvisited URL, with an empty visited list.
  start(X) ->
      Joined = lists:concat(X),
      io:format("~p~n", [Joined]),
      crawl([clean(Joined)], []).
  22.  
  %% @doc Works through the queue of unvisited URLs until it is empty.
  %%
  %% The first argument is the queue of URLs still to fetch, the second the
  %% list of URLs already fetched. Each queued URL is normalised with clean/1.
  %% If the normalised URL is already in the visited list it is skipped, so a
  %% URL that was queued several times (or pages that link to each other) is
  %% not fetched again. Otherwise gather_links/3 fetches it and the links it
  %% reports are appended to the rest of the queue.
  %%
  %% After every fetch the sizes of both lists and all visited URLs are
  %% printed. Returns the result of the final io:format/1 call.
  crawl([], _V) ->
      io:format("Successfully traversed the entire list.");
  crawl([H|T], V) ->
      Url = clean(H),
      case lists:member(Url, V) of
          true ->
              %% Already fetched: drop the duplicate and carry on.
              crawl(T, V);
          false ->
              {Visited, Acc} = gather_links(Url, V, []),
              %% Build the new queue once; it is used for the report and the
              %% recursive call.
              Unvisited = T ++ Acc,
              io:format("Visited:~p Unvisited:~p ~n.", [length(Visited), length(Unvisited)]),
              lists:foreach(fun(X) -> io:format("~p~n", [X]) end, Visited),
              crawl(Unvisited, Visited)
      end.
  31.  
  %% @doc Fetches the page at U and collects the links found on it.
  %%
  %% U must contain "http://". The text after it is split into host, optional
  %% ":port" (default 80) and path (default "/"). Visited is the list of URLs
  %% already fetched, Acc the list of links already queued.
  %%
  %% Always returns {NewVisited, NewAcc}:
  %%   * page fetched    - U is prepended to Visited; every link on the page
  %%                       that is neither visited nor queued is put, once, in
  %%                       front of Acc;
  %%   * connect or send failed - U is still marked as visited so it is not
  %%                       retried; Acc is unchanged;
  %%   * U not usable    - a message is printed and both lists are returned
  %%                       unchanged.
  gather_links(U, Visited, Acc) ->
      case split_http_url(U) of
          {ok, Host, Port, Path} ->
              Seen = [U | Visited],
              case fetch_page(Host, Port, Path) of
                  {ok, Page} ->
                      Links = extract_links_from_page(Page, Host, Port),
                      {Seen, unseen_links(Links, Seen, Acc) ++ Acc};
                  {error, Reason} ->
                      io:format("Could not fetch ~p: ~p~n", [U, Reason]),
                      {Seen, Acc}
              end;
          error ->
              io:format("~p. Could not recognize ~p~n", [?LINE, U]),
              {Visited, Acc}
      end.

  %% Splits a URL into {ok, Host, Port, Path}; returns `error' when there is no
  %% "http://", no host, more than one ":" or an unusable port.
  split_http_url(U) ->
      case string:str(U, "http://") of
          0 -> error;
          X -> split_authority(string:substr(U, X + 7))
      end.

  %% Separates "host[:port]" from the path of a URL whose scheme was removed.
  split_authority(Url) ->
      {HostPort, Path} =
          case string:str(Url, "/") of
              0 -> {Url, "/"};
              Slash -> {string:substr(Url, 1, Slash - 1), string:substr(Url, Slash)}
          end,
      case string:tokens(HostPort, ":") of
          [Host] -> {ok, Host, 80, Path};
          [Host, PortStr] -> with_port(Host, PortStr, Path);
          _ -> error
      end.

  %% Converts the textual port; anything that is not a valid TCP port is an error.
  with_port(Host, PortStr, Path) ->
      case string:to_integer(PortStr) of
          {Port, _} when is_integer(Port), Port > 0, Port =< 65535 ->
              {ok, Host, Port, Path};
          _ ->
              error
      end.

  %% Performs one HTTP/1.0 GET and returns {ok, Page} or {error, Reason}.
  %% The socket is closed on every path once it has been opened.
  fetch_page(Host, Port, Path) ->
      case gen_tcp:connect(Host, Port, ?SOCKOPTS) of
          {ok, S} ->
              Result =
                  case gen_tcp:send(S, "GET " ++ Path ++ " HTTP/1.0\r\n\r\n") of
                      ok -> {ok, fetch_data(S, [])};
                      {error, _} = SendError -> SendError
                  end,
              gen_tcp:close(S),
              Result;
          {error, _} = ConnectError ->
              ConnectError
      end.

  %% Keeps, in page order and without duplicates, the links that are neither
  %% visited nor queued. Visited URLs are stored in clean/1 form by crawl/2, so
  %% a link is compared both as it is and in its clean/1 form.
  unseen_links(Links, Seen, Queued) ->
      Kept = lists:foldl(
          fun(Link, Sofar) ->
              Known = lists:member(Link, Seen)
                  orelse lists:member(clean(Link), Seen)
                  orelse lists:member(Link, Queued)
                  orelse lists:member(Link, Sofar),
              case Known of
                  true -> Sofar;
                  false -> [Link | Sofar]
              end
          end, [], Links),
      lists:reverse(Kept).
  51.  
  %% @doc Collects everything an active-mode socket delivers until it ends.
  %%
  %% S is the socket whose `{tcp, S, Bin}' messages are gathered; Acc holds the
  %% chunks received so far, newest first. Returns all chunks, in arrival
  %% order, as one binary.
  %%
  %% Collection stops when
  %%   * the peer closes the connection (`tcp_closed');
  %%   * the socket reports an error (`tcp_error') - the data received so far
  %%     is returned instead of waiting forever;
  %%   * nothing arrives for 30 seconds - the socket is closed and the data
  %%     received so far is returned.
  fetch_data(S, Acc) ->
      receive
          {tcp, S, Bin} ->
              fetch_data(S, [Bin | Acc]);
          {tcp_closed, S} ->
              list_to_binary(lists:reverse(Acc));
          {tcp_error, S, _Reason} ->
              %% A tcp_closed message usually follows an error; remove it if it
              %% is already queued so it does not linger in the mailbox.
              receive
                  {tcp_closed, S} -> ok
              after 0 ->
                  ok
              end,
              list_to_binary(lists:reverse(Acc))
      after 30000 ->
          %% Idle peer: stop further deliveries and give up on this page.
          gen_tcp:close(S),
          list_to_binary(lists:reverse(Acc))
      end.
  57.  
  %% @doc Returns the links of a fetched page as absolute "http://" URLs.
  %%
  %% Page is the raw response as a binary; Host and Port identify the server
  %% it came from and are used to complete links that are not absolute.
  %% A copy of Page is written to a uniquely named file under /tmp as a
  %% best-effort debugging aid.
  %%
  %% Every href found by extract_links_from_page/2 is turned into exactly one
  %% URL, in the order that function reports them:
  %%   * contains "http://"          - kept as it is;
  %%   * starts with "/"             - prefixed with "http://Host:Port";
  %%   * contains "." but none of the known document/media extensions
  %%                                 - treated as a bare host name and
  %%                                   prefixed with "http://";
  %%   * anything else               - prefixed with "http://Host:Port/".
  %%     NOTE(review): such links are resolved against the server root, not
  %%     against the directory of the current page.
  %%
  %% The categories are decided per link. Removing the already rewritten URLs
  %% from the href list with `--' cannot exclude anything, because those URLs
  %% are not elements of that list (and `--' is right-associative), which
  %% would make one href come out under several categories.
  extract_links_from_page(Page, Host, Port) ->
      %% os:cmd/1 output ends with a newline, which must not become part of
      %% the file name.
      DumpName = [C || C <- os:cmd("uuidgen -t"), C =/= $\n],
      %% The dump is only a debugging aid; a failed write must not stop the
      %% crawl, so the result is deliberately ignored.
      _ = file:write_file("/tmp/" ++ DumpName, Page),
      Base = "http://" ++ Host ++ ":" ++ integer_to_list(Port),
      Links = extract_links_from_page(binary_to_list(Page), []),
      [absolute_link(Link, Base) || Link <- Links].

  %% Rewrites one href into an absolute URL; Base is "http://Host:Port".
  absolute_link(Link, Base) ->
      case string:str(Link, "http://") of
          0 -> local_link(Link, Base);
          _ -> Link
      end.

  %% Handles hrefs without "http://": server-rooted, bare host, or relative.
  local_link([$/ | _] = Link, Base) ->
      Base ++ Link;
  local_link(Link, Base) ->
      case looks_like_host(Link) of
          true -> "http://" ++ Link;
          false -> Base ++ "/" ++ Link
      end.

  %% True when the href contains a dot but none of the extensions that mark
  %% it as a document or media file on the current server.
  looks_like_host(Link) ->
      Extensions = [".php", ".do", ".aspx", ".html", ".mpg", ".jpg", ".mp3", ".png"],
      lists:member($., Link) andalso
          not lists:any(fun(Ext) -> string:str(Link, Ext) =/= 0 end, Extensions).
  78.  
  %% @doc Collects the values of all `href="..."' attributes in a page.
  %%
  %% The first argument is the page text as a string, Acc the links found so
  %% far. The text is searched for the literal `href="'; the characters up to
  %% the closing quote are read by get_remaining_link/2 and the search resumes
  %% after that quote. Each link is pushed onto the front of Acc, so the result
  %% lists the links in reverse order of appearance.
  extract_links_from_page(Text, Acc) ->
      case string:str(Text, "href=\"") of
          0 ->
              Acc;
          Pos ->
              %% Skip the six characters of `href="' itself.
              AfterQuote = string:substr(Text, Pos + 6),
              {Link, Remainder} = get_remaining_link(AfterQuote, []),
              extract_links_from_page(Remainder, [Link | Acc])
      end.
  84.  
  %% @doc Reads the characters of an attribute value up to its closing quote.
  %%
  %% The first argument is the text that follows an opening `"'; Acc holds the
  %% characters read so far in reverse order. Returns {Link, Rest}, where Link
  %% is the text before the closing quote and Rest the text after it.
  %%
  %% The page comes from the network and may be truncated or malformed. When
  %% the text ends before a closing quote is seen, the characters read so far
  %% are returned as the link with an empty Rest instead of crashing.
  get_remaining_link([$" | T], Acc) ->
      {lists:reverse(Acc), T};
  get_remaining_link([H | T], Acc) ->
      get_remaining_link(T, [H | Acc]);
  get_remaining_link([], Acc) ->
      {lists:reverse(Acc), []}.
RAW Paste Data
Top