Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import sun.org.mozilla.javascript.internal.Function;
- import java.io.*;
- import java.nio.charset.StandardCharsets;
- import java.nio.file.Files;
- import java.nio.file.Paths;
- import java.util.*;
/**
 * Reports the most frequent ingredient words of a previously collected recipe set.
 *
 * <p>The program loads a serialized {@code Map<String, Set<String>>} (recipe name to the set of
 * ingredient words of that recipe) from {@link #DATA_FILE}, counts in how many recipes each word
 * occurs, and prints every word that is not a stop word and occurs in more than
 * {@link #MIN_OCCURRENCES} recipes, most frequent first, followed by the number of printed words.
 *
 * <p>Created by lukibeni on 2015.12.08.
 */
public class Main {

    /** Serialized recipe data that is read at start-up. */
    private static final String DATA_FILE = "/home/lukibeni/magyaros";

    /** A word is reported only when it occurs in strictly more recipes than this. */
    private static final int MIN_OCCURRENCES = 50;

    /** Units, quantities, adjectives and other tokens that must not be reported as ingredients. */
    private static final String[] STOP_WORDS = {
        "kis", "nagy", "kevés", "sok", "ek", "csipet", "dl", "ml", "l", "g", "dkg", "kg",
        "fej", "db", "púpozott", "evőkanál", "kávéskanál", "hegyes", "erős", "teáskanál",
        "kanál", "friss", "őrölt", "szelet", "nagyobb", "szárított", "gerezd", "csomag",
        "darált", "A", "vagy", "és", "csokor", "ízlés", "füstölt", "egész", "reszelt",
        "közepes", "tésztához", "pohár", "szál", "a", "szerint", "tésztához:", "kb.", "fél",
        "pici", "csip", "poh", "csípős", "főtt", "héja", "töltelékhez:", "kisebb",
        "tetejére:", "savanyú", "mokkáskanál", "ísz", "csapott", "szem", "száraz", "házi",
        "(vagy", "húsos", "tészta", "+", "Valamint:", "is", "liter", "sütéshez", "vágott",
        "fehér", "Arany", "Piros", "vágva", "pár", "arany", "csipetnyi", "-", "késhegynyi",
        "krémhez", "apróra", "doboz", "üveg", "is)", "(lehet"
    };

    /** Recipe name mapped to the ingredient words of that recipe. */
    static Map<String, Set<String>> uberData = new TreeMap<>();

    /** Stop words; filled by {@link #initNotIngredients()}. */
    public static Set<String> notIngredients;

    /**
     * Ingredient word mapped to the number of recipes containing it. The map is static and is
     * never cleared here, so repeated calls to {@link #main(String[])} keep adding to it.
     */
    public static Map<String, Integer> ingredientCount = new HashMap<>();

    /**
     * Loads the recipe data, counts the ingredient words and prints the frequency report.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        initNotIngredients();
        loadRecipeData();

        /*
         * One-off scraping step that produced DATA_FILE; kept for reference only. It reads every
         * "*recept.html" file of the mirrored site, takes the words between "Hozzávalók: " and
         * " Bevásárlólista" as ingredients, takes the words between "Elküldöm " and " Ajánlott"
         * as tags, and stores the recipes tagged MAGYAR or MAGYAROS.
         *
         * File[] filesInFolder = new File("/home/lukibeni/www.mindmegette.hu/").listFiles();
         * List<String> filesNames = new ArrayList<>();
         * if (filesInFolder != null) {
         *     for (File file : filesInFolder) {
         *         if (!file.isDirectory()) {
         *             filesNames.add(file.getName());
         *         }
         *     }
         * }
         * int i = 0;
         * for (String filename : filesNames) {
         *     if (filename.matches(".+recept\\.html")) {
         *         try {
         *             byte[] encoded = Files.readAllBytes(Paths.get("/home/lukibeni/www.mindmegette.hu/" + filename));
         *             String tmp = new String(encoded, StandardCharsets.UTF_8);
         *             Document doc = Jsoup.parse(tmp);
         *             String set = doc.body().text();
         *             String[] split1 = set.split("Hozzávalók: ");
         *             String[] split2;
         *             Set<String> ingredients = new HashSet<>();
         *             if (split1.length >= 2) {
         *                 split2 = split1[1].split(" Bevásárlólista");
         *                 if (split2.length >= 2) {
         *                     String[] split3 = split2[0].split(" ");
         *                     for (String ingredient : split3) {
         *                         if (!ingredient.matches(".*\\d+.*") && !notIngredients.contains(ingredient)) {
         *                             ingredients.add(ingredient);
         *                         }
         *                     }
         *                 }
         *             }
         *             Set<String> tags = new HashSet<>();
         *             split1 = set.split("Elküldöm ");
         *             if (split1.length >= 2) {
         *                 split2 = split1[1].split(" Ajánlott");
         *                 if (split2.length >= 2) {
         *                     String[] split3 = split2[0].split(" ");
         *                     for (String tag : split3) {
         *                         tags.add(tag.toUpperCase());
         *                     }
         *                 }
         *             }
         *             if (tags.contains("MAGYAR") || tags.contains("MAGYAROS")) {
         *                 try {
         *                     uberData.put(filename.split("\\.")[0], ingredients);
         *                 } catch (ArrayIndexOutOfBoundsException e) {
         *                     System.out.println(filename);
         *                 }
         *             }
         *         } catch (IOException e) {
         *             e.printStackTrace();
         *         }
         *     }
         * }
         * System.out.println("#Receipt: " + filesNames.size() + ", Parsed data: " + uberData.size());
         * try {
         *     FileOutputStream fos = new FileOutputStream("/home/lukibeni/magyaros");
         *     ObjectOutputStream oos = new ObjectOutputStream(fos);
         *     oos.writeObject(uberData);
         *     oos.close();
         *     fos.close();
         * } catch (IOException ioe) {
         *     ioe.printStackTrace();
         * }
         */

        countIngredients();
        System.out.println(printFrequentIngredients());
    }

    /**
     * Replaces {@link #uberData} with the map stored in {@link #DATA_FILE}.
     *
     * <p>When the file is missing, unreadable, or does not contain a {@code Map}, the problem is
     * reported on {@code System.err} and the data already in memory is kept, so the program
     * still runs to completion.
     */
    private static void loadRecipeData() {
        // Both streams are declared as resources so the file handle is released even when the
        // ObjectInputStream constructor itself fails (e.g. on a corrupt stream header).
        try (FileInputStream fileIn = new FileInputStream(DATA_FILE);
                ObjectInputStream objectIn = new ObjectInputStream(fileIn)) {
            // NOTE(review): native Java deserialization must only ever be pointed at a file this
            // program wrote itself; never at data from an untrusted source.
            Object stored = objectIn.readObject();
            if (stored instanceof Map) {
                // Element types cannot be verified because of erasure; the file is expected to
                // hold exactly what the scraping step above serialized.
                @SuppressWarnings("unchecked")
                Map<String, Set<String>> recipes = (Map<String, Set<String>>) stored;
                uberData = recipes;
            } else {
                System.err.println("Unexpected content in " + DATA_FILE + ": "
                        + (stored == null ? "null" : stored.getClass().getName()));
            }
        } catch (IOException | ClassNotFoundException e) {
            System.err.println("Could not load recipe data from " + DATA_FILE);
            e.printStackTrace();
        }
    }

    /** Adds one to {@link #ingredientCount} for every word of every recipe in {@link #uberData}. */
    private static void countIngredients() {
        for (Set<String> recipeWords : uberData.values()) {
            for (String word : recipeWords) {
                Integer seenSoFar = ingredientCount.get(word);
                ingredientCount.put(word, seenSoFar == null ? 1 : seenSoFar + 1);
            }
        }
    }

    /**
     * Prints {@code word: count} for every non-stop-word that occurs more than
     * {@link #MIN_OCCURRENCES} times, ordered by descending count.
     *
     * @return the number of printed words
     */
    private static int printFrequentIngredients() {
        List<Map.Entry<String, Integer>> byCount = new ArrayList<>(ingredientCount.entrySet());
        Collections.sort(byCount, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
                return right.getValue().compareTo(left.getValue());
            }
        });

        int printed = 0;
        for (Map.Entry<String, Integer> entry : byCount) {
            if (entry.getValue() <= MIN_OCCURRENCES) {
                // Sorted descending, so nothing after this entry can pass the threshold.
                break;
            }
            if (!notIngredients.contains(entry.getKey())) {
                ++printed;
                System.out.println(entry.getKey() + ": " + entry.getValue());
            }
        }
        return printed;
    }

    /** (Re)creates {@link #notIngredients} from {@link #STOP_WORDS}. */
    private static void initNotIngredients() {
        notIngredients = new HashSet<>(Arrays.asList(STOP_WORDS));
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement