Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- public Message crawl_an_url(String url) throws IOException {
- /*Funcao para fazer crawl a um URL pedido*/
- Message response_message;
- boolean partial_success = false;
- LinkedList<Pair<String, Integer>> ll = new LinkedList<>();
- ll.add(new Pair<>(url, 0));
- String token;
- HashSet<String> set;
- String current_url;
- Pair<String, Integer> p;
- int depth;
- while (ll.size()>0) {
- p = ll.removeFirst();
- current_url = p.getKey();
- depth = p.getValue();
- try {
- Document doc = Jsoup.connect(current_url).timeout(10000).execute().parse();
- StringTokenizer tokens = new StringTokenizer(doc.text());
- int countTokens = 0;
- while (tokens.hasMoreElements() && countTokens++ < 200) {
- token = tokens.nextToken().toLowerCase();
- if (this.word_index.containsKey(token)) {
- set = this.word_index.get(token);
- } else {
- set = new HashSet<>();
- this.word_index.put(token, set);
- }
- set.add(current_url);
- }
- Elements links = doc.select("a[href]");
- for (Element link : links) {
- String new_url = link.attr("abs:href");
- if (this.url_index.containsKey(new_url)) {
- set = this.url_index.get(new_url);
- } else {
- set = new HashSet<>();
- this.url_index.put(new_url, set);
- }
- set.add(current_url);
- if (depth < 1) ll.add(new Pair<>(new_url, depth + 1));
- }
- } catch (IllegalArgumentException e){
- if(current_url.equals(url)){
- response_message = new Message("type|index_status;successful|no;msg|Given url isn't valid");
- return response_message;
- }else{
- partial_success = true;
- }
- } catch (IOException e) {
- partial_success = true;
- }
- }
- write_to_file(this.word_index, f_word_index);
- write_to_file(this.url_index, f_url_index);
- if(partial_success){
- response_message = new Message("type|index_status;successful|partial;msg|Given url is valid but one or more url found in iteration is not");
- }else{
- response_message = new Message("type|index_status;successful|yes;msg|Given url is valid and was indexed");
- }
- return response_message;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement