Advertisement
Guest User

Untitled

a guest
Dec 6th, 2019
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.73 KB | None | 0 0
  1. public Message crawl_an_url(String url) throws IOException {
  2. /*Funcao para fazer crawl a um URL pedido*/
  3. Message response_message;
  4. boolean partial_success = false;
  5. LinkedList<Pair<String, Integer>> ll = new LinkedList<>();
  6. ll.add(new Pair<>(url, 0));
  7. String token;
  8. HashSet<String> set;
  9. String current_url;
  10. Pair<String, Integer> p;
  11. int depth;
  12. while (ll.size()>0) {
  13. p = ll.removeFirst();
  14. current_url = p.getKey();
  15. depth = p.getValue();
  16. try {
  17. Document doc = Jsoup.connect(current_url).timeout(10000).execute().parse();
  18. StringTokenizer tokens = new StringTokenizer(doc.text());
  19. int countTokens = 0;
  20. while (tokens.hasMoreElements() && countTokens++ < 200) {
  21. token = tokens.nextToken().toLowerCase();
  22. if (this.word_index.containsKey(token)) {
  23. set = this.word_index.get(token);
  24. } else {
  25. set = new HashSet<>();
  26. this.word_index.put(token, set);
  27. }
  28. set.add(current_url);
  29. }
  30. Elements links = doc.select("a[href]");
  31. for (Element link : links) {
  32. String new_url = link.attr("abs:href");
  33. if (this.url_index.containsKey(new_url)) {
  34. set = this.url_index.get(new_url);
  35. } else {
  36. set = new HashSet<>();
  37. this.url_index.put(new_url, set);
  38. }
  39. set.add(current_url);
  40. if (depth < 1) ll.add(new Pair<>(new_url, depth + 1));
  41. }
  42. } catch (IllegalArgumentException e){
  43. if(current_url.equals(url)){
  44. response_message = new Message("type|index_status;successful|no;msg|Given url isn't valid");
  45. return response_message;
  46. }else{
  47. partial_success = true;
  48. }
  49. } catch (IOException e) {
  50. partial_success = true;
  51. }
  52. }
  53. write_to_file(this.word_index, f_word_index);
  54. write_to_file(this.url_index, f_url_index);
  55. if(partial_success){
  56. response_message = new Message("type|index_status;successful|partial;msg|Given url is valid but one or more url found in iteration is not");
  57. }else{
  58. response_message = new Message("type|index_status;successful|yes;msg|Given url is valid and was indexed");
  59. }
  60. return response_message;
  61. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement