%%% crawler: a minimal recursive web crawler.
%%% (Recovered from a pastebin diff view; the interleaved paste line
%%% numbers and diff chrome have been removed so the module compiles.)
-module(crawler).

%% NOTE(review): export_all kept so existing shell usage keeps working;
%% replace with an explicit -export list for production code.
-compile(export_all).

-author({jha, abhinav}).

%% gen_tcp options: binary payloads, raw (unpacketized) data, active
%% delivery as {tcp, Socket, Data} messages.
-define(SOCKOPTS, [binary, {packet, 0}, {active, true}]).
%% Shell entry point: crawl starting from a single seed URL, beginning
%% with empty unvisited and visited lists.
startshell(Seed) ->
    _ = [accumulate(Target, [], []) || Target <- [Seed]],
    ok.
%% Entry point suited to `erl -run crawler start URL`, where the argument
%% arrives as a list of strings: lists:concat/1 joins it into one flat
%% URL. A non-list seed is used as-is.
%% NOTE(review): a plain string seed is itself a list, so it too goes
%% through lists:concat/1 (turning characters into digit strings) —
%% confirm callers pass either an argument list or an atom.
start(Seed) ->
    NormalizedSeed =
        if
            is_list(Seed) -> lists:concat(Seed);
            true -> Seed
        end,
    _ = [accumulate(Target, [], []) || Target <- [NormalizedSeed]],
    ok.
%% Membership test with inverted polarity: returns false when X occurs
%% in the list (or when X is the in-page anchor "#"), true when it does
%% not. Used as a filter predicate to keep only not-yet-visited URLs.
contains(_Seen, "#") -> false;
contains(Seen, X) -> not lists:member(X, Seen).
%% Main chewing function. Chews up a web page and spits out the URLs
%% contained within: every value of the form href="..." is collected
%% into Reaped (in reverse order of appearance) and returned once the
%% input is exhausted.
analyze("href=\"" ++ T, Reaped) ->
    {Remain, R} = analyze(T, Reaped, []),
    analyze(Remain, R);
analyze([_H | T], Reaped) ->
    analyze(T, Reaped);
analyze([], Reaped) ->
    Reaped.

%% Reads characters up to the closing quote and prepends the collected
%% URL to Reaped. Returns {RemainingInput, NewReaped}.
analyze([$" | T], Reaped, Cur) ->
    {T, [lists:reverse(Cur) | Reaped]};
analyze([H | T], Reaped, Cur) ->
    analyze(T, Reaped, [H | Cur]);
%% Bug fix: an unterminated href="... at end of input used to crash with
%% function_clause; the incomplete URL is now discarded instead.
analyze([], Reaped, _Cur) ->
    {[], Reaped}.
%% Does the dirty work: fetches Url, extracts the hrefs from its body,
%% normalizes them to absolute http URLs on the same host, filters out
%% already-visited ones and recurses into each survivor.
%%
%% NOTE(review): recursion depth is unbounded and the Unvisited argument
%% is carried but never consulted; Visited only grows along one branch,
%% so pages shared between branches can still be fetched repeatedly.
accumulate(Url, Unvisited, Visited) ->
    io:format("Now downloading: ~p ~n", [Url]),
    io:format("***********************************************~n"),

    {Host, Port, Body} = split_url(Url, "", "80"),
    Newurls = analyze(binary_to_list(Body), []),
    %% Turn every extracted href into an absolute URL (":Port" only when
    %% the port is not the default 80).
    Mappedurls = lists:map(fun(X) ->
        case X of
            "http://" ++ _ ->
                X;                                   %% already absolute
            [$/ | _] ->                              %% host-relative path
                case Port of
                    "80" -> lists:concat(["http://", Host, X]);
                    _ -> lists:concat(["http://", Host, ":", Port, X])
                end;
            "#" ->
                "#";                                 %% in-page anchor; dropped by the filter below
            _ ->                                     %% document-relative path
                case Port of
                    "80" -> lists:concat(["http://", Host, "/", X]);
                    _ -> lists:concat(["http://", Host, ":", Port, "/", X])
                end
        end
    end, Newurls),
    %% contains/2 returns true for *unseen* URLs. Bug fix: include Url
    %% itself in the seen set so a page that links to itself is not
    %% immediately re-fetched in an endless loop.
    Finalurls = lists:filter(fun(X) -> contains([Url | Visited], X) end, Mappedurls),
    lists:foreach(fun(XX) -> io:format("Got URL: ~p ~n", [XX]) end, Finalurls),
    lists:foreach(fun(U) -> accumulate(U, Unvisited, [Url | Visited]) end, Finalurls).
%% Extracts host and port from the URL and fetches the page. A leading
%% "http://" is stripped and the remainder parsed by split_url/4;
%% anything else is fetched as-is with the caller-supplied host/port.
split_url("http://" ++ Rest, Host, Port) ->
    split_url("http", Rest, Host, Port);
split_url(Other, Host, Port) ->
    fetch(Host, Port, Other).
%% Splits "host[:port][/path]" (the part after "http://") and fetches.
%% Bug fix: the original matched `[Port|Other] = Tokens` against the
%% already-bound parameter Port ("80"), so any URL with an explicit
%% port crashed with badmatch; it also left the path glued to the port
%% token. Fresh bindings and a port/path split fix both.
split_url(_Tag, Meat, Host, Port) ->
    case string:tokens(Meat, ":") of
        %% No explicit port: split host from path, default to port 80.
        [HostAndPath] ->
            [HH | UU] = string:tokens(HostAndPath, "/"),
            case UU of
                [] -> fetch(HH, "80", ["/"]);
                _ -> fetch(HH, "80", ["/" ++ string:join(UU, "/")])
            end;
        %% Explicit port, possibly followed by a path: host:port[/path].
        [H, PortAndPath] ->
            [PortStr | PathParts] = string:tokens(PortAndPath, "/"),
            case PathParts of
                [] -> fetch(H, PortStr, ["/"]);
                _ -> fetch(H, PortStr, ["/" ++ string:join(PathParts, "/")])
            end;
        %% More than one ":" — malformed; fall back to caller's host/port.
        _ ->
            io:format("~p WTF? ~n", [Meat]),
            fetch(Host, Port, Meat)
    end.
%% Resolves the final request path, opens a TCP connection and issues a
%% plain HTTP/1.0 GET. Returns {Host, PortString, BodyBinary}; on any
%% connect/DNS failure an empty binary is returned so the caller can
%% carry on crawling (deliberate best-effort).
fetch(Host, P, Url) ->
    Actualurl = case Url of
        [] -> "/";
        _ -> lists:concat(Url)
    end,
    {Port, _} = string:to_integer(P),
    io:format("~p:~p/~p ~n", [Host, Port, Actualurl]),
    try gen_tcp:connect(Host, Port, ?SOCKOPTS) of
        {ok, S} ->
            io:format("Success."),
            %% Bug fix: the original appended an extra "/" after the
            %% path, producing requests like "GET //" or "GET /foo/".
            Request = "GET " ++ Actualurl ++ " HTTP/1.0\r\n\r\n",
            gen_tcp:send(S, Request),
            io:format("Sent: ~p~n", [Request]),
            {Host, P, recv(S, [])};
        %% Connect refused/failed. Bug fix: `<>` was a syntax error; an
        %% empty binary is written `<<>>`.
        _ -> {Host, P, <<>>}
    catch
        %% Best-effort: bad host names etc. yield an empty body rather
        %% than killing the crawl.
        _:_ -> {Host, P, <<>>}
    end.
%% Receive the page's contents from socket S ({active, true} delivers
%% them as {tcp, S, Bin} messages). Chunks accumulate in reverse order
%% in X and are reassembled once the peer closes the connection.
recv(S, X) ->
    receive
        {tcp, S, Bin} ->
            recv(S, [Bin | X]);
        {tcp_closed, S} ->
            io:format("Writing file: ~p ~n.", ["/tmp/current.file"]),
            %% Bug fix: reverse before writing, otherwise the dump file
            %% holds the chunks back to front.
            file:write_file("/tmp/current.file", lists:reverse(X)),
            list_to_binary(lists:reverse(X));
        %% Robustness fix: a socket error used to sit in the mailbox
        %% unmatched while recv/2 hung forever; treat it like a close.
        {tcp_error, S, _Reason} ->
            list_to_binary(lists:reverse(X))
    after 30000 ->
        %% Robustness fix: don't hang forever on a peer that never
        %% closes; return whatever has arrived so far.
        list_to_binary(lists:reverse(X))
    end.