Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- for(int i = 0 ; i <graph.getNodes().size(); i++) { // Purpose: for each node pair (i, j), accumulate a Soft-TF-IDF style score in sumSoft
- for(int j = i +1; j < graph.getNodes().size();j++) { // j starts at i + 1, so each unordered pair is visited once
- HashSet<String> spaceSet = new HashSet<String>(); // union of the tokens of both nodes; more tokens are added in the k/l loops below
- spaceSet.addAll(graph.getNodes().get(i).getTokenSet());
- spaceSet.addAll(graph.getNodes().get(j).getTokenSet());
- HashSet<String> closeSet = new HashSet<String>(); // tokens a1 of entry i that score above 0.75 against at least one token a2 of entry j
- HashMap<String,Double> closeMapMax = new HashMap<String,Double>(); // close token -> max value found in secondMap; written below but not read in this excerpt
- HashMap<String,Double> secondMap = new HashMap<String, Double>(); // value token of entry j -> highest score recorded for it so far
- for(int k = 0; k < g.getNameAttributes().get(i).getValueToken().size(); k++) { // NOTE(review): uses `g` while the outer loops use `graph` - confirm both refer to the same object
- for(int l = 0; l < g.getNameAttributes().get(j).getValueToken().size(); l++) {
- String a1 = g.getNameAttributes().get(i).getValueToken().get(k); // k-th value token of entry i
- String a2 = g.getNameAttributes().get(j).getValueToken().get(l); // l-th value token of entry j
- spaceSet.add(a1);
- spaceSet.add(a2);
- double jaro = TFIDF.applyJaroWinkler(a1, a2); // score from the project's TFIDF helper; presumably Jaro-Winkler similarity - verify
- if(jaro > 0.75) { // 0.75 is the hard-coded threshold for treating a1 as "close"
- closeSet.add(a1);
- }
- }
- for( String close: closeSet) { // NOTE(review): this loop is nested inside the k loop, so it re-runs for every k - confirm this is intended
- double max; // assigned after the inner l loop
- for(int l = 0; l < g.getNameAttributes().get(j).getValueToken().size(); l++) {
- double jaro = TFIDF.applyJaroWinkler(g.getNameAttributes().get(j).getValueToken().get(l), close); // score of the l-th token of entry j against the close token
- if(secondMap.containsKey(g.getNameAttributes().get(j).getValueToken().get(l))){
- if(secondMap.get(g.getNameAttributes().get(j).getValueToken().get(l)) < jaro){ // replace only when the new score is strictly larger
- secondMap.put(g.getNameAttributes().get(j).getValueToken().get(l), jaro);
- }else continue; // stored score is at least as large; keep it
- }else {
- secondMap.put(g.getNameAttributes().get(j).getValueToken().get(l), jaro); // first score seen for this token
- }
- }
- max = Collections.max(secondMap.values()); // NOTE(review): maximum over the whole of secondMap, which accumulates across all close tokens - not specific to `close`
- closeMapMax.put(close, max);
- }
- } // end of the k loop
- double sumSoft = 0.0; // running sum of the per-term products computed below
- for (String term : closeSet) {
- double tfidfI = g.getNodes().get(i).getTfIdfScore().get(term); // NOTE(review): if getTfIdfScore() is a Map<String, Double>, a missing term unboxes null and throws - verify
- HashMap.Entry<String, Double> maxEntry = null; // will hold the secondMap entry with the largest value
- for (HashMap.Entry<String, Double> entry : secondMap.entrySet()) { // NOTE(review): this scan does not depend on `term`, so the same entry is selected on every iteration
- if (maxEntry == null || entry.getValue().compareTo(maxEntry.getValue()) > 0){
- maxEntry = entry;
- }
- }
- String termJ = maxEntry.getKey(); // token of entry j with the highest recorded score
- double tfidfJ = g.getNodes().get(j).getTfIdfScore().get(termJ); // same unboxing caveat as tfidfI
- double jMax = secondMap.get(termJ); // same value as maxEntry.getValue()
- double unitI = 0.0; // sum of squared tf-idf scores over the tokens of entry i
- double unitJ = 0.0; // sum of squared tf-idf scores over the tokens of entry j
- for(String s : spaceSet) { // NOTE(review): unitI/unitJ do not depend on `term` but are recomputed for each one
- if(g.getNameAttributes().get(i).getValueToken().contains(s)) {
- unitI = unitI + g.getNodes().get(i).getTfIdfScore().get(s)*g.getNodes().get(i).getTfIdfScore().get(s);
- }
- if(g.getNameAttributes().get(j).getValueToken().contains(s)) {
- unitJ = unitJ + g.getNodes().get(j).getTfIdfScore().get(s)*g.getNodes().get(j).getTfIdfScore().get(s);
- }
- }
- sumSoft = sumSoft + (tfidfI / (java.lang.Math.sqrt(unitI))) * (tfidfJ / (java.lang.Math.sqrt(unitJ)))* jMax; // adds (tfidfI / sqrt(unitI)) * (tfidfJ / sqrt(unitJ)) * jMax
- } // end of the term loop; the i and j loops are not closed within this excerpt
- Compare String "RianAir Corp" with "RiadAir Inc"
- Compare Token: RianAir RiadAir 0.9333333333333333
- Compare Token: RianAir Inc 0.4920634920634921
- Compare Token: Corp RiadAir 0.0
- Compare Token: Corp Inc 0.0
- SpaceSet [RiadAir, RianAir, Corp, Inc]
- CLoseSet [RianAir]
- Most Similar Token in Second Node: RiadAir
- RianAir tf-idf-score: 1.0397207708399179
- RiadAir tf-idf-score: 0.4377343686769499
- Similarity between Token 0.9333333333333333
- Soft-TF-IDF Distance: 0.6388541633769426
- Normal Jaro-Winkler-Distance: 0.817929292929293
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement