% Source: Pastebin diff view of paste ACq8Yzu1 (newest revision of the paste).
% Minimal web crawler: fetches a page over a raw TCP socket, extracts every
% href="..." target from it and follows the links it finds.
-module(crawler).
-compile(export_all).
-author({jha, abhinav}).

% Options passed to gen_tcp:connect/3: data is delivered as binaries, without
% packet framing, and in active mode (as messages to the owning process).
-define(SOCKOPTS, [binary,{packet, 0}, {active, true}]).
% Starts a crawl from the Erlang shell. Seed is handed to accumulate/3
% unchanged as the first URL, with empty Unvisited and Visited lists.
% Returns ok.
startshell(Seed) ->
    accumulate(Seed, [], []),
    ok.

% Starts a crawl from a seed that may be given as a list of pieces.
%
% A list Seed is joined into one string with lists:concat/1 (which accepts
% atoms, numbers and strings as elements); any other term is used as the URL
% unchanged. Note that a flat string is itself a list of character codes and
% would be joined digit by digit, so pass ["http://..."] here or use
% startshell/1 for a plain string.
% Returns ok.
start(Seed) when is_list(Seed) ->
    accumulate(lists:concat(Seed), [], []),
    ok;
start(Seed) ->
    accumulate(Seed, [], []),
    ok.

% Filter predicate deciding whether a link should be kept.
%
% Despite its name, the result is true when X is NOT an element of the list
% given as first argument, and false when it is. The bare fragment link "#"
% is always rejected (false), whatever the list holds.
contains(_Visited, "#") ->
    false;
contains(Visited, X) ->
    not lists:member(X, Visited).

% Scans page text for href="..." attributes and collects their targets.
%
% analyze(Text, Reaped) walks Text (a character list) and returns Reaped with
% every target found pushed onto its front, so the last link of the page comes
% first in the result.
analyze("href=\"" ++ Rest, Reaped) ->
    {Remain, Found} = analyze(Rest, Reaped, []),
    analyze(Remain, Found);
analyze([_Char | Rest], Reaped) ->
    analyze(Rest, Reaped);
analyze([], Reaped) ->
    Reaped.

% analyze(Text, Reaped, Cur) reads one attribute value up to its closing
% double quote. Cur holds the characters read so far in reverse order.
% Returns {TextAfterTheQuote, [Target | Reaped]}.
analyze([$" | Rest], Reaped, Cur) ->
    {Rest, [lists:reverse(Cur) | Reaped]};
analyze([Char | Rest], Reaped, Cur) ->
    analyze(Rest, Reaped, [Char | Cur]);
analyze([], Reaped, _Cur) ->
    % The text ended inside an unterminated attribute (e.g. a truncated
    % download): drop the partial target instead of crashing.
    {[], Reaped}.

% Downloads Url, extracts its links and crawls every link that has not been
% visited yet, depth first.
%
% Url       - URL of the page to fetch.
% Unvisited - only passed along to the recursive calls; otherwise unused.
% Visited   - URLs that must not be fetched again.
%
% Always returns ok.
accumulate(Url, Unvisited, Visited) ->
    crawl_page(Url, Unvisited, Visited),
    ok.

% Crawls one page and everything reachable from it. Returns the visited list
% extended with every URL that was downloaded, so that pages reached through
% one link are not downloaded again through a sibling link.
crawl_page(Url, Unvisited, Visited) ->
    io:format("Now downloading: ~p ~n", [Url]),
    io:format("***********************************************~n"),

    {Host, Port, Body} = split_url(Url, "", "80"),
    Links = [resolve_link(Link, Host, Port)
             || Link <- analyze(binary_to_list(Body), [])],
    % The current page counts as visited before its links are filtered, so a
    % page that links to itself is not fetched a second time.
    Seen = [Url | Visited],
    % contains/2 returns true for links that are NOT in Seen (and never "#").
    Finalurls = [Link || Link <- Links, contains(Seen, Link)],
    lists:foreach(fun(Link) -> io:format("Got URL: ~p ~n", [Link]) end, Finalurls),
    lists:foldl(fun(Link, Acc) -> crawl_unseen(Link, Unvisited, Acc) end,
                Seen, Finalurls).

% Crawls Link unless an earlier sibling (or its subtree) already fetched it.
crawl_unseen(Link, Unvisited, Seen) ->
    case lists:member(Link, Seen) of
        true -> Seen;
        false -> crawl_page(Link, Unvisited, Seen)
    end.

% Turns a link found on a page served by Host:Port into an absolute URL.
% Links starting with "http://" and the bare "#" are returned unchanged.
resolve_link("http://" ++ _ = Link, _Host, _Port) ->
    Link;
resolve_link("#", _Host, _Port) ->
    "#";
resolve_link([$/ | _] = Link, Host, Port) ->
    lists:concat([site_root(Host, Port), Link]);
resolve_link(Link, Host, Port) ->
    lists:concat([site_root(Host, Port), "/", Link]).

% "http://Host", with ":Port" appended unless the port is the default "80".
site_root(Host, "80") ->
    lists:concat(["http://", Host]);
site_root(Host, Port) ->
    lists:concat(["http://", Host, ":", Port]).

% Entry point for fetching a URL.
%
% A URL starting with "http://" has the text after that prefix handed to
% split_url/4 together with the given Host and Port. Anything else is passed
% unchanged to fetch/3 as its third argument, with Host and Port as given.
% Returns whatever the function it delegates to returns.
split_url("http://" ++ Meat, Host, Port) ->
    split_url("http", Meat, Host, Port);

split_url(Other, Host, Port) ->
    fetch(Host, Port, Other).

% Splits the part of a URL that follows the scheme into host, optional port
% and path, then downloads the page through fetch/3.
%
% _Tag       - scheme name; not used.
% Meat       - URL text after "http://", e.g. "example.com:8080/a/b".
% Host, Port - used only as a fallback when Meat cannot be parsed.
%
% Returns the result of fetch/3.
split_url(_Tag, Meat, Host, Port) ->
    % Everything up to the first "/" is the authority ("host" or "host:port");
    % the remainder, slash included, is the path and is passed on verbatim.
    {Authority, Rest} = lists:splitwith(fun(Char) -> Char =/= $/ end, Meat),
    Path =
        case Rest of
            [] -> "/";
            _ -> Rest
        end,
    case string:tokens(Authority, ":") of
        [HostOnly] ->
            fetch(HostOnly, "80", [Path]);
        [ParsedHost, ParsedPort] ->
            fetch(ParsedHost, ParsedPort, [Path]);
        _ ->
            % Empty authority or more than one ":" in it.
            io:format("~p WTF? ~n", [Meat]),
            fetch(Host, Port, [Meat])
    end.

% Opens a TCP connection to Host, sends an HTTP/1.0 GET request and returns
% whatever the server sends back.
%
% Host - host name to connect to.
% P    - port number as a decimal string, e.g. "80".
% Url  - list of path pieces joined with lists:concat/1; [] means "/".
%
% Returns {Host, P, Body}. Body is the raw data collected by recv/2 (nothing
% here strips the response headers), or <<>> when no connection could be made.
fetch(Host, P, Url) ->
    Actualurl =
        case Url of
            [] -> "/";
            _ -> lists:concat(Url)
        end,
    {Port, _} = string:to_integer(P),
    io:format("~p:~p/~p ~n", [Host, Port, Actualurl]),
    try gen_tcp:connect(Host, Port, ?SOCKOPTS) of
        {ok, S} ->
            io:format("Success."),
            % Request exactly the path that was asked for; no extra "/" is
            % appended after it.
            Request = "GET " ++ Actualurl ++ " HTTP/1.0\r\n\r\n",
            case gen_tcp:send(S, Request) of
                ok ->
                    io:format("Sent: ~p~n", [Request]);
                {error, SendError} ->
                    io:format("Send failed: ~p~n", [SendError])
            end,
            % recv/2 is still called after a failed send so that the close
            % notification for S is consumed from the mailbox.
            {Host, P, recv(S, [])};
        {error, ConnectError} ->
            io:format("Connect failed: ~p~n", [ConnectError]),
            {Host, P, <<>>}
    catch
        % Best effort: an unusable host or port (e.g. a non-numeric P) must not
        % abort the whole crawl, but it is reported instead of being swallowed.
        Class:Raised ->
            io:format("Connect raised ~p:~p~n", [Class, Raised]),
            {Host, P, <<>>}
    end.

% Collects the data delivered for socket S until the peer closes it.
%
% S - socket whose {tcp, S, Data} and {tcp_closed, S} messages are consumed.
% X - chunks received so far, newest first; callers start with [].
%
% Returns everything received as one binary, in arrival order. As a side
% effect the same data is written to /tmp/current.file. If no message for S
% arrives within 30 seconds the socket is closed and the data received up to
% that point is returned.
recv(S, X) ->
    receive
        {tcp, S, Bin} ->
            recv(S, [Bin | X]);
        {tcp_closed, S} ->
            finish_recv(X)
    after 30000 ->
        io:format("Timed out waiting for data.~n"),
        gen_tcp:close(S),
        finish_recv(X)
    end.

% Joins the chunks in arrival order, dumps them to disk and returns them.
finish_recv(X) ->
    Page = list_to_binary(lists:reverse(X)),
    io:format("Writing file: ~p ~n.", ["/tmp/current.file"]),
    % Write the joined page, not X itself: X is newest-first, so writing it
    % directly would store the chunks in reverse order.
    case file:write_file("/tmp/current.file", Page) of
        ok ->
            ok;
        {error, Reason} ->
            io:format("Could not write ~p: ~p~n", ["/tmp/current.file", Reason])
    end,
    Page.