Guest User

Bloom.cpp

a guest
May 29th, 2015
418
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 10.93 KB | None | 0 0
  1. ######### Bloom.cpp ########
  2. //
  3. //  Bloom.cpp
  4. //
  5. //  Created by Guillaume Rizk on 9/02/12.
  6. //
  7.  
  8. #include <iostream>
  9. #include <stdio.h>
  10. #include <string.h>
  11.  
  12. #include "Bloom.h"
  13.  
  14.  
  15.  
  16. Bloom::Bloom()
  17. {
  18.     //empty default constructor
  19.     nb_elem = 0;
  20.     blooma = NULL;
  21. }
  22.  
  23. BloomCpt::BloomCpt()
  24. {
  25.     //empty default constructor
  26.     nb_elem = 0;
  27.     blooma = NULL;
  28.  
  29. }
  30. void Bloom::setSeed(uint64_t seed)
  31. {
  32.     if(user_seed==0)
  33.     {
  34.         user_seed = seed;
  35.         this->generate_hash_seed(); //regenerate the hash with the new seed
  36.     }
  37.     else{
  38.         fprintf(stderr,"Warning! you should not change the seed a second time!, resuming with previous seed %llu \n",(unsigned long long)user_seed);
  39.     }
  40. }
  41.  
  42. void Bloom::set_number_of_hash_func(int i)
  43. {
  44.     if(i>NSEEDSBLOOM || i<1){
  45.         fprintf(stderr,"%i is not a valid value for number of hash funcs, should be in [1-%i], resuming wild old value %i\n",i,NSEEDSBLOOM,n_hash_func );
  46.         return;
  47.     }  
  48.     n_hash_func = i;
  49. }
  50.  
  51. void Bloom::generate_hash_seed()
  52. {
  53.     unsigned int i;
  54.     for ( i = 0; i < NSEEDSBLOOM; ++i)
  55.     {
  56.         seed_tab[i]= rbase[i];
  57.     }
  58.     for ( i = 0; i < NSEEDSBLOOM; ++i)
  59.     {
  60.         seed_tab[i]= seed_tab[i] * seed_tab[(i+3) % NSEEDSBLOOM] + user_seed ;
  61.     }
  62.    
  63. }
  64.  
  65. #ifdef _largeint
  66. inline uint64_t Bloom::hash_func(LargeInt<KMER_PRECISION> elem, int num_hash)
  67. {
  68.     // hash = XOR_of_series[hash(i-th chunk iof 64 bits)]
  69.     uint64_t result = 0, chunk, mask = ~0;
  70.     LargeInt<KMER_PRECISION> intermediate = elem;
  71.     int i;
  72.     for (i=0;i<KMER_PRECISION;i++)
  73.     {
  74.         chunk = (intermediate & mask).toInt();
  75.         intermediate = intermediate >> 64;
  76.    
  77.         result ^= hash_func(chunk,num_hash);
  78.     }
  79.     return result;
  80. }
  81. #endif
  82.  
  83. #ifdef _ttmath
  84. inline uint64_t Bloom::hash_func(ttmath::UInt<KMER_PRECISION> elem, int num_hash)
  85. {
  86.     // hash = XOR_of_series[hash(i-th chunk iof 64 bits)]
  87.     uint64_t result = 0, to_hash;
  88.     ttmath::UInt<KMER_PRECISION> intermediate = elem;
  89.     uint32_t mask=~0, chunk;
  90.     int i;
  91.     for (i=0;i<KMER_PRECISION/2;i++)
  92.     {
  93.         // retrieve a 64 bits part to hash
  94.         (intermediate & mask).ToInt(chunk);
  95.         to_hash = chunk;
  96.         intermediate >>= 32;
  97.         (intermediate & mask).ToInt(chunk);
  98.         to_hash |= ((uint64_t)chunk) << 32 ;
  99.         intermediate >>= 32;
  100.  
  101.         result ^= hash_func(to_hash,num_hash);
  102.     }
  103.     return result;
  104. }
  105. #endif
  106.  
  107. #ifdef _LP64
  108. inline uint64_t Bloom::hash_func( __uint128_t elem, int num_hash)
  109. {
  110.     // hash(uint128) = ( hash(upper 64 bits) xor hash(lower 64 bits))
  111.     return hash_func((uint64_t)(elem>>64),num_hash) ^ hash_func((uint64_t)(elem&((((__uint128_t)1)<<64)-1)),num_hash);
  112. }
  113. #endif
  114.  
  115.  
  116. inline uint64_t Bloom::hash_func( uint64_t key, int num_hash)
  117. {
  118.     uint64_t hash = seed_tab[num_hash];
  119.     hash ^= (hash <<  7) ^  key * (hash >> 3) ^ (~((hash << 11) + (key ^ (hash >> 5))));
  120.     hash = (~hash) + (hash << 21); // hash = (hash << 21) - hash - 1;
  121.     hash = hash ^ (hash >> 24);
  122.     hash = (hash + (hash << 3)) + (hash << 8); // hash * 265
  123.     hash = hash ^ (hash >> 14);
  124.     hash = (hash + (hash << 2)) + (hash << 4); // hash * 21
  125.     hash = hash ^ (hash >> 28);
  126.     hash = hash + (hash << 31);
  127.     return hash;
  128. }
  129.  
  130.  
  131. //tai is 2^tai_bloom
  132. Bloom::Bloom(int tai_bloom)
  133. {
  134.     n_hash_func = 4 ;//def
  135.     user_seed =0;
  136.     nb_elem = 0;
  137.     tai = (1LL << tai_bloom);
  138.     nchar = tai/8LL;
  139.     blooma =(unsigned char *)  malloc( nchar *sizeof(unsigned char)); // 1 bit per elem
  140.     memset(blooma,0,nchar *sizeof(unsigned char));
  141.     //fprintf(stderr,"malloc Power-of-two bloom %lli MB nchar %llu %llu\n",(long long)((tai/8LL)/1024LL/1024LL),(unsigned long long)nchar,(unsigned long long)(tai/8));
  142.     this->generate_hash_seed();
  143. }
  144.  
  145.  
  146.  
  147.  
  148.  Bloom::Bloom(uint64_t tai_bloom)
  149.  {
  150.      //printf("custom construc \n");
  151.      n_hash_func = 4 ;//def
  152.      user_seed =0;
  153.      nb_elem = 0;
  154.      tai = tai_bloom;
  155.      nchar = (1+tai/8LL);
  156.      blooma =(unsigned char *)  malloc( nchar *sizeof(unsigned char)); // 1 bit per elem
  157.      memset(blooma,0,nchar *sizeof(unsigned char));
  158.      //fprintf(stderr,"malloc bloom %lli MB \n",(tai/8LL)/1024LL/1024LL);
  159.      this->generate_hash_seed();
  160.  }
  161.  
  162.  
  163.  
  164.  
  165.  
  166. // //tai is 2^tai_bloom
  167. BloomCpt::BloomCpt(int tai_bloom)
  168. {
  169.  
  170.     n_hash_func = 2;
  171.     user_seed = 0;
  172.     nb_elem = 0;
  173.     tai = (1LL << tai_bloom);
  174.     blooma =(unsigned char *)  malloc( (tai/2) *sizeof(unsigned char)); //4bits per elem
  175.     memset(blooma,0,(tai/2) *sizeof(unsigned char));
  176.    
  177.     fprintf(stderr,"malloc bloom cpt  %lli MB \n",(tai/2LL)/1024LL/1024LL);
  178.     this->generate_hash_seed();
  179. }
  180.  
  181.  
  182. // //tai is 2^tai_bloom
  183. BloomCpt3::BloomCpt3(int tai_bloom)
  184. {
  185.  
  186.     n_hash_func = 2;
  187.     user_seed = 0;
  188.     nb_elem = 0;
  189.     tai = (1LL << tai_bloom);
  190.     //blooma =(unsigned char *)  malloc( (tai/2) *sizeof(unsigned char)); //4bits per elem
  191.     blooma3 = (uint64_t*)  malloc( ((tai/21)+1) *sizeof(uint64_t)); //3bits per elem, 21 elem per uint64
  192.  
  193.     memset(blooma3,0, ((tai/21)+1) *sizeof(uint64_t));
  194.    
  195.     fprintf(stderr,"malloc bloom cpt64 3 bits    %lli MB \n",8*(tai/21LL)/1024LL/1024LL);
  196.     this->generate_hash_seed();
  197. }
  198.  
  199.  
  200. // //tai is 2^tai_bloom
  201. BloomCpt2::BloomCpt2(int tai_bloom)
  202. {
  203.  
  204.     n_hash_func = 2;
  205.     user_seed = 0;
  206.     nb_elem = 0;
  207.     tai = (1LL << tai_bloom);
  208.     //blooma =(unsigned char *)  malloc( (tai/2) *sizeof(unsigned char)); //4bits per elem
  209.     blooma2 = (uint64_t*)  malloc( (tai/32) *sizeof(uint64_t)); //2bits per elem, 32 elem per uint64
  210.  
  211.     memset(blooma2,0, (tai/32) *sizeof(uint64_t));
  212.    
  213.     fprintf(stderr,"malloc bloom cpt64 2 bits    %lli MB \n",8*(tai/32LL)/1024LL/1024LL);
  214.     this->generate_hash_seed();
  215. }
  216.  
  217.  
  218.  
  219.  
  220.  
  221. Bloom::~Bloom()
  222. {
  223.   if(blooma!=NULL)
  224.     free(blooma);
  225. }
  226.  
  227. BloomCpt3::~BloomCpt3()
  228. {
  229.   if(blooma3!=NULL)
  230.     free(blooma3);
  231. }
  232.  
  233.  
  234. BloomCpt2::~BloomCpt2()
  235. {
  236.   if(blooma2!=NULL)
  237.     free(blooma2);
  238. }
  239.  
  240.  
  241.  
  242.  
  243. void Bloom::dump(char * filename)
  244. {
  245.  FILE *file_data;
  246.  file_data = fopen(filename,"wb");
  247.  fwrite(blooma, sizeof(unsigned char), nchar, file_data); //1+
  248.  printf("bloom dumped \n");
  249.  
  250. }
  251.  
  252.  
  253. void Bloom::load(char * filename)
  254. {
  255.  FILE *file_data;
  256.  file_data = fopen(filename,"rb");
  257.  printf("loading bloom filter from file, nelem %lli \n",nchar);
  258.  fread(blooma, sizeof(unsigned char), nchar, file_data);
  259.  printf("bloom loaded\n");
  260.  
  261.  
  262. }
  263.  
  264. long Bloom::weight()
  265. {
  266.     // return the number of 1's in the Bloom, nibble by nibble
  267.     const unsigned char oneBits[] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
  268.     long weight = 0;
  269.     for(uint64_t index = 0; index < nchar; index++)
  270.     {
  271.         unsigned char current_char = blooma[index];
  272.         weight += oneBits[current_char&0x0f];
  273.         weight += oneBits[current_char>>4];
  274.     }
  275.     return weight;
  276. }
  277.  
  278.  
  279. void Bloom::add(bloom_elem elem)
  280. {
  281.     uint64_t h1;
  282.     int i;
  283.     for(i=0; i<n_hash_func; i++)
  284.     {
  285. #if CUSTOMSIZE
  286.       h1 = hash_func(elem,i) % tai;
  287. #else
  288.       h1 = hash_func(elem,i) & (tai-1);
  289. #endif
  290.         blooma [h1 >> 3] |= bit_mask[h1 & 7];
  291.     }
  292.    
  293.     //nb_elem++;
  294.    
  295. }
  296.  
  297.  
  298.  
  299. int Bloom::contains(bloom_elem elem)
  300. {
  301.   uint64_t h1;
  302.     int i;
  303.     for(i=0; i<n_hash_func; i++)
  304.     {
  305. #if CUSTOMSIZE
  306.       h1 = hash_func(elem,i) % tai;
  307. #else
  308.       h1 = hash_func(elem,i) & (tai-1);
  309. #endif
  310.  
  311.         if ((blooma[h1 >> 3 ] & bit_mask[h1 & 7]) != bit_mask[h1 & 7])
  312.             return 0;
  313.     }
  314.    
  315.    
  316.     return 1;
  317. }
  318.  
  319.  
  320.  
  321. void BloomCpt::add(bloom_elem elem)
  322. {
  323.   uint64_t h1;
  324.     unsigned char val,cpt_per_key;
  325.    
  326.     int i;
  327.     for(i=0; i<n_hash_func; i++)
  328.     {
  329.        
  330.         h1 = hash_func(elem,i) & (tai-1);
  331.         val = blooma [h1 / cpt_per_char];
  332.         cpt_per_key = (val & cpt_mask[h1 & 1]) >> (4* (h1 & 1)) ;
  333.         cpt_per_key++;
  334.         if(cpt_per_key==16) cpt_per_key = 15; //satur at 15
  335.         val  &= ~ cpt_mask[h1 & 1];
  336.         val  |= cpt_per_key << (4* (h1 & 1)) ;
  337.         blooma [h1 / cpt_per_char] = val;
  338.     }
  339.    
  340.    
  341. }
  342.  
  343.  
  344. //9698232370296160
  345. int  BloomCpt::contains_n_occ(bloom_elem elem, int nks)
  346. {
  347.   uint64_t h1;
  348.     unsigned char cpt_per_key;
  349.     int i;
  350.  
  351.     for(i=0; i<n_hash_func; i++)
  352.     {
  353.         h1 = hash_func(elem,i) & (tai-1);
  354.        
  355.         cpt_per_key = (blooma [h1 / cpt_per_char] & cpt_mask[h1 & 1]) >> (4* (h1 & 1));
  356.  
  357.     if(cpt_per_key<nks) return 0;
  358.     }
  359.    
  360.     return 1;
  361.    
  362. }
  363.  
  364.  
  365.  
  366.  
  367.  
  368.  
  369. void BloomCpt3::add(bloom_elem elem)
  370. {
  371.   uint64_t h1,val;
  372.   uint64_t cpt_per_key;
  373.    
  374.     int i;
  375.     //  printf("Add elem \n");
  376.     // printf ("\nadd3 elem %lli \n",elem);
  377.  
  378.     for(i=0; i<n_hash_func; i++)
  379.     {
  380.        
  381.         h1 = hash_func(elem,i) & (tai-1);
  382.     // printf("--h1 %lli %lli  m%016llX\n",h1, h1%21,cpt_mask21[h1 % 21]);
  383.         val = blooma3 [h1 / 21]; //21 elem per uint64
  384.     //if(elem==9698232370296160)    printf("--%016llX\n", val);
  385.  
  386.         cpt_per_key = (val & cpt_mask21[h1 % 21]) >> (3* (h1 %21)) ;
  387.         cpt_per_key++;
  388.         if(cpt_per_key==8) cpt_per_key = 7; //satur at 7
  389.         val  &= ~ cpt_mask21[h1 % 21];
  390.         val  |= cpt_per_key << (3* (h1 % 21)) ;
  391.         blooma3 [h1 / 21] = val;
  392.     //  if(elem==9698232370296160)  printf("--%016llX  %i\n", val,cpt_per_key);
  393.  
  394.     }
  395.    
  396.    
  397. }
  398.  
  399.  
  400. int  BloomCpt3::contains_n_occ(bloom_elem elem, int nks)
  401. {
  402.     uint64_t h1;
  403.     unsigned char cpt_per_key;
  404.     int i;
  405.     //  printf("--contains-- \n");
  406.     // if(elem==9698232370296160) printf ("\nquery3 elem %lli \n",elem);
  407.  
  408.     for(i=0; i<n_hash_func; i++)
  409.     {
  410.         h1 = hash_func(elem,i) & (tai-1);
  411.     //      printf("h1 %lli \n",h1);
  412.  
  413.         cpt_per_key = (blooma3 [h1 / 21] & cpt_mask21[h1 % 21]) >> (3* (h1 % 21));
  414.  
  415.     //printf("%016llX\n", blooma3 [h1 / 21]);
  416.     //printf("cpt %i \n", cpt_per_key);
  417.         //if(elem==9698232370296160) printf("bloocpt3 %i \n", cpt_per_key);
  418.  
  419.         if(cpt_per_key<nks) return 0;
  420.     }
  421.    
  422.     return 1;
  423.    
  424. }
  425.  
  426.  
  427.  
  428.  
  429. void BloomCpt2::add(bloom_elem elem)
  430. {
  431.   uint64_t h1,val;
  432.   uint64_t cpt_per_key;
  433.    
  434.     int i;
  435.     //  printf("Add elem \n");
  436.     // printf ("\nadd3 elem %lli \n",elem);
  437.  
  438.     for(i=0; i<n_hash_func; i++)
  439.     {
  440.        
  441.         h1 = hash_func(elem,i) & (tai-1);
  442.         val = blooma2 [h1 / 32]; //32 elem per uint64
  443.  
  444.         cpt_per_key = (val & cpt_mask32[h1 & 31]) >> (2* (h1 & 31)) ;
  445.         cpt_per_key++;
  446.         if(cpt_per_key==4) cpt_per_key = 3; //satur at 3
  447.         val  &= ~ cpt_mask32[h1 & 31];
  448.         val  |= cpt_per_key << (2* (h1 & 31)) ;
  449.         blooma2 [h1 / 32] = val;
  450.     //  if(elem==9698232370296160)  printf("--%016llX  %i\n", val,cpt_per_key);
  451.  
  452.     }
  453.    
  454.    
  455. }
  456.  
  457. int  BloomCpt2::contains_n_occ(bloom_elem elem, int nks)
  458. {
  459.     uint64_t h1;
  460.     unsigned char cpt_per_key;
  461.     int i;
  462.  
  463.     for(i=0; i<n_hash_func; i++)
  464.     {
  465.         h1 = hash_func(elem,i) & (tai-1);
  466.  
  467.         cpt_per_key = (blooma2 [h1 / 32] & cpt_mask32[h1 & 31]) >> (2* (h1 & 31));
  468.  
  469.  
  470.         if(cpt_per_key<nks) return 0;
  471.     }
  472.    
  473.     return 1;
  474.    
  475. }
Advertisement
Add Comment
Please, Sign In to add comment