Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package abc;
- import java.io.FileNotFoundException;
- import java.io.PrintWriter;
- import java.io.UnsupportedEncodingException;
- import java.util.Hashtable;
- import java.util.Set;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- public class Parsare {
- public static void Description(Document doc) {
- try {
- PrintWriter descriptionf = new PrintWriter("description.txt", "UTF-8");
- String title = doc.title();
- String keywords = doc.head().select("meta[name=keywords]").attr("content");
- String description = doc.head().select("meta[name=description]").attr("content");
- String text = doc.body().text();
- descriptionf.println("TITLE: " + title);
- descriptionf.println("keywords: " + keywords);
- descriptionf.println("description: " + description);
- descriptionf.println("text: " + text);
- descriptionf.close();
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (UnsupportedEncodingException e) {
- e.printStackTrace();
- }
- }
- public static void Robots(Document doc) {
- try {
- PrintWriter robotsf = new PrintWriter("robots.txt", "UTF-8");
- String robots = doc.head().select("meta[name=robots]").attr("content");
- robotsf.println(robots);
- robotsf.close();
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (UnsupportedEncodingException e) {
- e.printStackTrace();
- }
- }
- public static void Links(Document doc) {
- try {
- PrintWriter linksf = new PrintWriter("links.txt", "UTF-8");
- Elements links = doc.getElementsByTag("a");
- for (Element link : links) {
- if(link.attr("abs:href") != "")
- linksf.println(link.attr("abs:href"));
- }
- linksf.close();
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (UnsupportedEncodingException e) {
- e.printStackTrace();
- }
- }
- public static void Words(Document doc) {
- try {
- Hashtable<String, Integer> words = new Hashtable<String, Integer>();
- String text = doc.body().text();
- PrintWriter wordsFile = new PrintWriter("words.txt", "UTF-8");
- // String temp = "";
- StringBuilder temp = new StringBuilder();
- for (int i = 0; i < text.length(); i++){
- char c = text.charAt(i);
- if(Character.isLetter(c) || Character.isDigit(c))
- {
- //temp += c;
- temp.append(c);
- } else {
- if(!temp.equals(""))
- {
- if(words.get(temp) != null) {
- words.put(temp.toString(), words.get(temp) + 1);
- } else {
- words.put(temp.toString(), 1);
- }
- }
- temp.delete(0, temp.length());
- }
- if(i==text.length()) {
- if(!temp.equals(""))
- {
- if(words.get(temp) != null) {
- words.put(temp.toString(), words.get(temp) + 1);
- } else {
- words.put(temp.toString(), 1);
- }
- }
- }
- }
- Set<String> keys = words.keySet();
- for(String key: keys){
- wordsFile.println(key+ " : "+ words.get(key));
- }
- wordsFile.close();
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (UnsupportedEncodingException e) {
- e.printStackTrace();
- }
- }
- }
- main
- package abc;
- import java.io.File;
- import java.io.IOException;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import abc.Parsare;
- public class Main {
- public static void main(String [] args)
- {
- Document doc;
- try {
- File input = new File("input.html");
- doc = Jsoup.parse(input, "UTF-8", "http://study.tuiasi.ro/");
- Parsare.Description(doc);
- Parsare.Robots(doc);
- Parsare.Links(doc);
- Parsare.Words(doc);
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement