Advertisement
Guest User

blackbeard

a guest
Dec 1st, 2010
108
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Erlang 3.24 KB | None | 0 0
  1. -module(crawler).
  2. -compile(export_all).
  3. -author({jha, abhinav}).
  4.  
  5. -define(SOCKOPTS, [binary,{packet, 0}, {active, true}]).
  % Shell entry point: starts a crawl at Seed, which is handed to
  % accumulate/3 as-is with empty unvisited/visited lists.
  % (Presumably Seed is a URL string such as "http://example.com/" - confirm
  % against callers.) Returns ok.
  startshell(Seed) ->
      accumulate(Seed, [], []),
      ok.
  9.  
  % Entry point that accepts the seed as a list of parts. The parts are glued
  % into one URL string with lists:concat/1 before the crawl starts, so
  % start(['http://example.com/']) and start(["http://", "example.com/"]) both
  % work. (Presumably this form exists for `erl -s crawler start URL`, which
  % passes its arguments as a list - confirm.) A seed that is not a list is
  % handed to accumulate/3 untouched. Returns ok.
  start(Seed) when is_list(Seed) ->
      accumulate(lists:concat(Seed), [], []),
      ok;
  start(Seed) ->
      accumulate(Seed, [], []),
      ok.
  18.  
  % Filter predicate for "should this URL still be crawled?".
  % NOTE: despite the name, the sense is inverted. It returns true when X is
  % NOT an element of List, and false when it is. The placeholder link "#" is
  % always rejected (false), whatever List holds.
  %   List - URLs already seen.
  %   X    - candidate URL.
  contains(_List, "#") -> false;
  contains(List, X) -> not lists:member(X, List).
  24.  
  % Scans page text (a character list) for every occurrence of the literal
  % prefix href=" and collects the attribute values that follow it.
  %   Reaped - URLs gathered so far; new ones are pushed on the front, so the
  %            result lists links in reverse order of appearance.
  % The match is literal: other spellings (upper case, single quotes, spaces
  % around '=') are skipped like any other character.
  analyze([], Reaped) ->
      Reaped;
  analyze("href=\"" ++ Rest, Reaped) ->
      % analyze/3 consumes the value up to its closing quote and gives back
      % the text that follows it, where scanning resumes.
      {Remaining, Reaped1} = analyze(Rest, Reaped, []),
      analyze(Remaining, Reaped1);
  analyze([_Char | Rest], Reaped) ->
      analyze(Rest, Reaped).
  34.  
  % Reads one attribute value: consumes characters up to the closing double
  % quote.
  %   Reaped - URLs gathered so far.
  %   Cur    - characters of the current value, newest first.
  % Returns {Rest, Reaped1}: Rest is the input after the closing quote and
  % Reaped1 is Reaped with the finished value pushed on the front.
  % If the input ends before a closing quote (truncated page), the partial
  % value is dropped and {[], Reaped} is returned instead of crashing.
  analyze([$" | Rest], Reaped, Cur) ->
      {Rest, [lists:reverse(Cur) | Reaped]};
  analyze([Char | Rest], Reaped, Cur) ->
      analyze(Rest, Reaped, [Char | Cur]);
  analyze([], Reaped, _Cur) ->
      {[], Reaped}.
  40.  
  % Crawls Url and, depth-first, every page reachable from it.
  %   Url        - page to download.
  %   _Unvisited - kept for interface compatibility; not used.
  %   Visited    - URLs that must not be downloaded again.
  % Returns ok. The set of seen URLs is threaded through the whole traversal
  % (see crawl_page/2), so a page linked from several places, or linking to
  % itself, is downloaded only once per crawl.
  accumulate(Url, _Unvisited, Visited) ->
      crawl_page(Url, Visited),
      ok.

  % Downloads Url, prints the links not seen before and follows each of them.
  % Returns the updated list of seen URLs.
  crawl_page(Url, Visited) ->
      io:format("Now downloading: ~p ~n", [Url]),
      io:format("***********************************************~n"),
      {Host, Port, Body} = split_url(Url, "", "80"),
      % Mark the current page as seen before looking at its links so that a
      % self-link does not trigger a second download.
      Seen = [Url | Visited],
      Links = [absolutize(Link, Host, Port) || Link <- analyze(binary_to_list(Body), [])],
      % contains/2 has an inverted sense: true means "not seen yet".
      Fresh = [Link || Link <- Links, contains(Seen, Link)],
      lists:foreach(fun(Link) -> io:format("Got URL: ~p ~n", [Link]) end, Fresh),
      lists:foldl(fun follow/2, Seen, Fresh).

  % Follows Link unless it was reached in the meantime (a duplicate on the
  % same page, or a page visited while following an earlier link).
  follow(Link, Seen) ->
      case contains(Seen, Link) of
          true -> crawl_page(Link, Seen);
          false -> Seen
      end.

  % Turns a link found on a page served by Host:Port into an absolute URL.
  % Absolute http:// links and the placeholder "#" are returned unchanged;
  % everything else is treated as a path on the same host.
  absolutize("http://" ++ _ = Link, _Host, _Port) -> Link;
  absolutize("#", _Host, _Port) -> "#";
  absolutize([$/ | _] = Link, Host, Port) -> lists:concat([base_url(Host, Port), Link]);
  absolutize(Link, Host, Port) -> lists:concat([base_url(Host, Port), "/", Link]).

  % "http://Host", with ":Port" appended unless Port is the default "80".
  base_url(Host, "80") -> lists:concat(["http://", Host]);
  base_url(Host, Port) -> lists:concat(["http://", Host, ":", Port]).
  65.  
  % First stage of URL handling.
  % A URL starting with "http://" has that prefix stripped and is passed to
  % split_url/4, which extracts host and port and fetches the page.
  % Any other value is handed to fetch/3 unchanged, together with the Host
  % and Port supplied by the caller.
  % Returns whatever those calls return.
  split_url("http://" ++ Rest, Host, Port) ->
      split_url("http", Rest, Host, Port);
  split_url(Other, Host, Port) ->
      fetch(Host, Port, Other).
  72.  
  % Splits the part of a URL after the scheme into host, port and path, then
  % fetches the page.
  %   _Tag  - scheme name passed by split_url/3; unused.
  %   Meat  - URL without "http://", e.g. "example.com:8080/a/b".
  %   Host, Port - fallbacks, used only when the authority part cannot be
  %                parsed.
  % Returns the result of fetch/3.
  split_url(_Tag, Meat, Host, Port) ->
      % Separate "host[:port]" from the path first, so that a ':' inside the
      % path or query string is not mistaken for a port separator.
      {Authority, RawPath} = lists:splitwith(fun(C) -> C =/= $/ end, Meat),
      % Same path normalisation as before: empty segments are dropped, so a
      % trailing slash disappears and a missing path becomes "/".
      Path = "/" ++ string:join(string:tokens(RawPath, "/"), "/"),
      case string:tokens(Authority, ":") of
          [Name] ->
              fetch(Name, "80", [Path]);
          [Name, PortStr] ->
              % PortStr is a fresh variable: matching against the bound Port
              % argument would fail for every port other than the fallback.
              fetch(Name, PortStr, [Path]);
          _ ->
              io:format("~p WTF? ~n", [Meat]),
              fetch(Host, Port, [Meat])
      end.
  87.  
  % Downloads one page with a plain HTTP/1.0 GET request.
  %   Host - host name to connect to.
  %   P    - port number as a string, e.g. "80".
  %   Url  - list of path fragments, joined with lists:concat/1; [] means "/".
  % Returns {Host, P, Body}. Body is everything recv/2 collected from the
  % socket as a binary, or <<>> when connecting or sending fails.
  fetch(Host, P, Url) ->
      Actualurl = case Url of
          [] -> "/";
          _ -> lists:concat(Url)
      end,
      {Port, _} = string:to_integer(P),
      io:format("~p:~p/~p ~n", [Host, Port, Actualurl]),
      % Deliberately best-effort: an unusable host or port (for example Port
      % bound to 'error' by a failed string:to_integer/1) yields an empty body
      % rather than aborting the whole crawl.
      try gen_tcp:connect(Host, Port, ?SOCKOPTS) of
          {ok, S} ->
              io:format("Success."),
              HostHeader = case Port of
                  80 -> Host;
                  _ -> lists:concat([Host, ":", Port])
              end,
              % The request target is sent exactly as given (no extra
              % trailing slash); the Host header lets name-based virtual
              % hosts pick the right site.
              Request = lists:concat(["GET ", Actualurl, " HTTP/1.0\r\n",
                                      "Host: ", HostHeader, "\r\n\r\n"]),
              case gen_tcp:send(S, Request) of
                  ok ->
                      io:format("Sent: ~p~n", [Request]),
                      Body = recv(S, []),
                      % Presumably already closed once the peer hung up;
                      % closing again is harmless and releases the socket if
                      % recv/2 returned for another reason.
                      gen_tcp:close(S),
                      {Host, P, Body};
                  {error, _Reason} ->
                      gen_tcp:close(S),
                      {Host, P, <<>>}
              end;
          {error, _Reason} ->
              {Host, P, <<>>}
      catch
          _:_ -> {Host, P, <<>>}
      end.
  106.  
  % Longest silence, in milliseconds, tolerated between two socket messages.
  -define(RECV_IDLE_TIMEOUT, 30000).

  % Collects the response arriving on active-mode socket S.
  %   S - socket whose {tcp, S, Bin} and {tcp_closed, S} messages are
  %       delivered to the calling process.
  %   X - chunks received so far, newest first; callers start with [].
  % When the peer closes the connection, the data is dumped to
  % /tmp/current.file (chunks in arrival order) and returned as one binary.
  % If nothing arrives for ?RECV_IDLE_TIMEOUT ms, the socket is closed and
  % the data received so far is returned without writing the dump file.
  recv(S, X) ->
      receive
          {tcp, S, Bin} ->
              recv(S, [Bin | X]);
          {tcp_closed, S} ->
              % X is newest-first; restore arrival order before using it.
              Chunks = lists:reverse(X),
              io:format("Writing file: ~p ~n.", ["/tmp/current.file"]),
              file:write_file("/tmp/current.file", Chunks),
              list_to_binary(Chunks)
      after ?RECV_IDLE_TIMEOUT ->
          gen_tcp:close(S),
          list_to_binary(lists:reverse(X))
      end.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement