Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import org.apache.commons.codec.digest.DigestUtils;
- import java.math.BigInteger;
- import java.util.*;
- public class SimHashBuckets {
- public static void main(String[] args) {
- Scanner scanner = new Scanner(System.in);
- int N = scanner.nextInt();
- scanner.nextLine();
- List<String> texts = new ArrayList<String>(N);
- for (int i = 0; i < N; i++) {
- texts.add(simhash(scanner.nextLine()));
- }
- Map<Integer, Set<Integer>> kandidati = new HashMap<Integer, Set<Integer>>();
- kandidati = lsh(kandidati, N, texts);
- // System.out.println(kandidati);
- int Q = scanner.nextInt();
- System.out.println("nextLine " + scanner.nextLine());
- String query;
- for (int i = 0; i <= Q; i++) {
- query = scanner.nextLine();
- int first = Integer.parseInt(query.split(" ")[0]);
- int second = Integer.parseInt(query.split(" ")[1]);
- Set<Integer> similar = kandidati.get(first);
- String t = texts.get(first);
- int slicniTekstovi = 0;
- if (similar != null) {
- for (Integer sim : similar) {
- if (sim != first) {
- String usporediS = texts.get(sim);
- int slicni = 0;
- for (int bit = 0; bit < usporediS.length(); bit++) {
- if (t.charAt(bit) != usporediS.charAt(bit)) {
- slicni++;
- }
- }
- if (slicni <= second) {
- slicniTekstovi++;
- }
- }
- }
- }
- System.out.println(slicniTekstovi);
- }
- }
- private static Map<Integer, Set<Integer>> lsh(Map<Integer, Set<Integer>> kandidati, int n, List<String> texts) {
- for (int pojas = 1; pojas <= 8; pojas++) {
- Map<Integer, Set<Integer>> pretinci = new HashMap<Integer, Set<Integer>>();
- for (int trenutni_dokument = 0; trenutni_dokument < n; trenutni_dokument++) {
- String hash = texts.get(trenutni_dokument);
- // System.out.println(hash);
- int val = hashToInt(pojas, hash);
- // System.out.println(val);
- Set<Integer> tekstovi = new HashSet<Integer>();
- if (pretinci.containsKey(val)) {
- tekstovi = pretinci.get(val);
- for (Integer text_id : tekstovi) {
- if (!kandidati.containsKey(trenutni_dokument)) {
- Set<Integer> set = new HashSet<Integer>();
- kandidati.put(trenutni_dokument, set);
- }
- if (!kandidati.containsKey(text_id)) {
- Set<Integer> set = new HashSet<Integer>();
- kandidati.put(text_id, set);
- }
- kandidati.get(trenutni_dokument).add(text_id);
- kandidati.get(text_id).add(trenutni_dokument);
- // if (kandidati.containsKey(trenutni_dokument) && kandidati.containsKey(text_id)) {
- // kandidati.get(trenutni_dokument).add(text_id);
- // kandidati.get(text_id).add(trenutni_dokument);
- // } else {
- // Set<Integer> set = new HashSet<Integer>();
- // set.add(text_id);
- // kandidati.put(trenutni_dokument, set);
- // set.remove(text_id);
- // set.add(trenutni_dokument);
- // kandidati.put(text_id, set);
- // }
- }
- }
- tekstovi.add(trenutni_dokument);
- pretinci.put(val, tekstovi);
- // System.out.println(pretinci);
- // System.out.println(kandidati);
- }
- }
- return kandidati;
- }
- private static int hashToInt(int pojas, String hash) {
- String bits = "";
- bits = hash.substring((pojas - 1) * 16, pojas * 16);
- return Integer.parseInt(bits, 2);
- }
- private static String simhash(String text) {
- String[] jedinke = text.split(" ");
- int[] sh = new int[128];
- for (String j : jedinke) {
- byte[] digest = DigestUtils.md5(j);
- BigInteger biStr = new BigInteger(digest);
- // String s = biStr.toString(2);
- if (biStr.compareTo(BigInteger.ZERO) < 0) {
- biStr = new BigInteger(1, digest);
- }
- String s = biStr.toString(2);
- s = String.format("%128s", s).replace(' ', '0');
- // System.out.println(s);
- for (int bit = 0; bit < s.length(); bit++) {
- if (s.charAt(bit) == '1') {
- sh[bit] += 1;
- } else {
- sh[bit] -= 1;
- }
- }
- }
- for (int bit = 0; bit < sh.length; bit++) {
- if (sh[bit] >= 0)
- sh[bit] = 1;
- else
- sh[bit] = 0;
- }
- StringBuffer sb = new StringBuffer();
- for (int i = 0; i < sh.length; i++) {
- sb.append(sh[i]);
- }
- // System.out.println(sb.toString());
- return sb.toString();
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement