Advertisement
Guest User

Untitled

a guest
Feb 28th, 2017
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.70 KB | None | 0 0
  1. package abc;
  2.  
  3. import java.io.FileNotFoundException;
  4. import java.io.PrintWriter;
  5. import java.io.UnsupportedEncodingException;
  6. import java.util.Hashtable;
  7. import java.util.Set;
  8.  
  9. import org.jsoup.nodes.Document;
  10. import org.jsoup.nodes.Element;
  11. import org.jsoup.select.Elements;
  12.  
  13. public class Parsare {
  14.  
  15. public static void Description(Document doc) {
  16. try {
  17. PrintWriter descriptionf = new PrintWriter("description.txt", "UTF-8");
  18. String title = doc.title();
  19. String keywords = doc.head().select("meta[name=keywords]").attr("content");
  20. String description = doc.head().select("meta[name=description]").attr("content");
  21. String text = doc.body().text();
  22. descriptionf.println("TITLE: " + title);
  23. descriptionf.println("keywords: " + keywords);
  24. descriptionf.println("description: " + description);
  25. descriptionf.println("text: " + text);
  26. descriptionf.close();
  27. } catch (FileNotFoundException e) {
  28. e.printStackTrace();
  29. } catch (UnsupportedEncodingException e) {
  30. e.printStackTrace();
  31. }
  32. }
  33.  
  34. public static void Robots(Document doc) {
  35. try {
  36. PrintWriter robotsf = new PrintWriter("robots.txt", "UTF-8");
  37. String robots = doc.head().select("meta[name=robots]").attr("content");
  38. robotsf.println(robots);
  39. robotsf.close();
  40. } catch (FileNotFoundException e) {
  41. e.printStackTrace();
  42. } catch (UnsupportedEncodingException e) {
  43. e.printStackTrace();
  44. }
  45. }
  46.  
  47. public static void Links(Document doc) {
  48. try {
  49. PrintWriter linksf = new PrintWriter("links.txt", "UTF-8");
  50. Elements links = doc.getElementsByTag("a");
  51. for (Element link : links) {
  52. if(link.attr("abs:href") != "")
  53. linksf.println(link.attr("abs:href"));
  54. }
  55.  
  56. linksf.close();
  57. } catch (FileNotFoundException e) {
  58. e.printStackTrace();
  59. } catch (UnsupportedEncodingException e) {
  60. e.printStackTrace();
  61. }
  62. }
  63.  
  64. public static void Words(Document doc) {
  65. try {
  66. Hashtable<String, Integer> words = new Hashtable<String, Integer>();
  67. String text = doc.body().text();
  68. PrintWriter wordsFile = new PrintWriter("words.txt", "UTF-8");
  69.  
  70. // String temp = "";
  71.  
  72. StringBuilder temp = new StringBuilder();
  73. for (int i = 0; i < text.length(); i++){
  74.  
  75. char c = text.charAt(i);
  76. if(Character.isLetter(c) || Character.isDigit(c))
  77. {
  78. //temp += c;
  79. temp.append(c);
  80. } else {
  81. if(!temp.equals(""))
  82. {
  83. if(words.get(temp) != null) {
  84. words.put(temp.toString(), words.get(temp) + 1);
  85. } else {
  86. words.put(temp.toString(), 1);
  87. }
  88. }
  89. temp.delete(0, temp.length());
  90. }
  91. if(i==text.length()) {
  92. if(!temp.equals(""))
  93. {
  94. if(words.get(temp) != null) {
  95. words.put(temp.toString(), words.get(temp) + 1);
  96. } else {
  97. words.put(temp.toString(), 1);
  98. }
  99. }
  100. }
  101. }
  102.  
  103. Set<String> keys = words.keySet();
  104. for(String key: keys){
  105. wordsFile.println(key+ " : "+ words.get(key));
  106. }
  107. wordsFile.close();
  108. } catch (FileNotFoundException e) {
  109. e.printStackTrace();
  110. } catch (UnsupportedEncodingException e) {
  111. e.printStackTrace();
  112. }
  113. }
  114. }
  115.  
  116.  
  117.  
  118.  
  119.  
  120.  
  121. main
  122. package abc;
  123.  
  124.  
  125. import java.io.File;
  126. import java.io.IOException;
  127.  
  128. import org.jsoup.Jsoup;
  129. import org.jsoup.nodes.Document;
  130. import abc.Parsare;
  131.  
  132. public class Main {
  133.  
  134. public static void main(String [] args)
  135. {
  136. Document doc;
  137.  
  138. try {
  139. File input = new File("input.html");
  140. doc = Jsoup.parse(input, "UTF-8", "http://study.tuiasi.ro/");
  141. Parsare.Description(doc);
  142. Parsare.Robots(doc);
  143. Parsare.Links(doc);
  144. Parsare.Words(doc);
  145. } catch (IOException e) {
  146. e.printStackTrace();
  147. }
  148.  
  149. }
  150. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement