Guest User

Untitled

a guest
Oct 6th, 2024
100
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 8.75 KB | None | 0 0
  1. #include <stdlib.h>
  2. #include <stdio.h>
  3. #include <stddef.h>
  4. #include <stdint.h>
  5. #include <sys/stat.h>
  6. #include <sys/mman.h>
  7. #include <fcntl.h>
  8. #include <unistd.h>
  9. #include <dirent.h>
  10. #include <string.h>
  11.  
  12.  
  13. int hash(char *filename, uint8_t *bloomhash);
  14. uint32_t get_chunk_len(uint8_t *data, uint32_t min, uint32_t max, uint8_t threshhold);
  15. uint64_t make_fnv_from_block(uint8_t *block, uint32_t blocksize);
  16. uint32_t bloom_bytes = 256*8; //The actual number of buckets is eight times this. Must be a multiple of 8 (due to __builtin_popcountll)
  17. void diag_show_hash(uint8_t *hash);
  18. float jaccard_simularity(uint64_t *a, uint64_t *b);
  19. void dofile(char *filename, struct stat *statbuf);
  20. void dofolder( char *foldername);
  21. uint8_t *bloomhashes=NULL;
  22. char **filenames=NULL;
  23. uint64_t num_files=0;
  24. uint64_t alloc_files=0;
  25. float match_thresh=0.9;
  26. uint8_t chunking_threshhold=50; //Lower number, bigger chunks. 50 seems about right.
  27.  
  28. #define alloc_chunk 256
  29.  
  30. int main(int argc, char *argv[]){
  31.   uint32_t n;
  32.   for(n=1;n<argc;n++){
  33.     if(argv[n][0] == '-'){
  34.       if(strcmp(argv[n],"--help") == 0){
  35.         printf("Confero: A file comparison program. Through the use of variable length chunking, Bloom filters and Jaccard similarity metric, compares a set of files to find similar ones.\n");
  36.         printf("         Specifically it looks for long strings of data in common between two or more files within a (potentially very large) set.\n");
  37.         printf("         For example, it will find different edits of a document that all descend from a common source. Or versions with different metadate.\n");
  38.         printf("         Most usefully, it does this regardless of format. Text, images, audio, it matters not - it's all just data. Format-agnostic. Though compression may throw it off.\n");
  39.         printf("         Only a problem if two versionf use different types of settings of compression, as it'lll be working on the compressed byte stream.\n\n");
  40.         printf("Usage:   confero [options] <file> [file] [file] ...\n");
  41.         printf("         If you specify a folder it'll process that folder recursively.\n\n");
  42.         printf("Options: -Bn            Set number buckets. Default is -B256. More buckets means more accurate comparisons, at the expense of needing more RAM.\n");
  43.         printf("                        Needs buckets * 8 bytes per file. So default needs 2KiB per file.\n");
  44.         printf("         -Tn            Chunking threshhold. Default -T50. Lower value, more chunks, will identify smaller matches and so more accurate comparisons. But also need more buckets.\n");
  45.         printf("                        Sensible values are 45-55.\n");
  46.         printf("                        If you receive warnings about files being too large to process, increase one or both of these B or T.\n");
  47.         printf("         -Mn            Percentage of simularity to consider a match. Any file-pair with a simularity equal ot greater than this will be output.\n");
  48.         return(1);
  49.       }
  50.       switch(argv[n][1]){
  51.         case 'B':
  52.           bloom_bytes=atoi(argv[n]+2)*8;
  53.           if(!bloom_bytes){
  54.              printf("Invalid B value.\n");
  55.              return(1);
  56.            }
  57.           break;
  58.         case 'T':
  59.           chunking_threshhold=atoi(argv[n]+2);
  60.           if(!chunking_threshhold){
  61.              printf("Invalid T value.\n");
  62.              return(1);
  63.            }
  64.           break;
  65.         case 'M':
  66.           int m=atoi(argv[n]+2);
  67.           if(m<1 || m > 100 ){
  68.              printf("Invalid M value.\n");
  69.              return(1);
  70.           }
  71.           match_thresh=(float)m / 100;
  72.           break;
  73.         default:
  74.           printf("Unknown option %s\n", argv[n]);
  75.           return(1);
  76.       }
  77.     }
  78.   }
  79.  
  80.  
  81.   for(n=1;n<argc;n++){
  82.     if(argv[n][0]!= '-')
  83.       dofolder(argv[n]);
  84.   }
  85.   printf("Read and processed %lu files.\nB=%u T=%u\n", num_files, bloom_bytes/8, chunking_threshhold);
  86.   uint32_t a,b;
  87. /*  for(a=0;a<num_files;a++){
  88.     printf("%s\n", filenames[a]);
  89.     diag_show_hash(bloomhashes+(a*bloom_bytes));
  90.   }*/
  91.   for(a=0;a<num_files;a++){
  92.     for(b=0;b<a;b++){
  93.       float simularity=jaccard_simularity((uint64_t*)(bloomhashes+(a*bloom_bytes)),(uint64_t*)(bloomhashes+(b*bloom_bytes)));
  94.       if(simularity >= match_thresh)
  95.         printf("%f\n  %s\n  %s\n\n", simularity, filenames[a], filenames[b]);
  96.     }
  97.   }
  98. }
  99.  
  100. float jaccard_simularity(uint64_t *a, uint64_t *b){ //Will need to cast the pointer when calling this.
  101.   uint32_t comparisons=bloom_bytes/8;
  102.   uint32_t n;
  103.   uint32_t jac_union=0;
  104.   uint32_t jac_intersection=0;
  105.   for(n=0;n<comparisons;n++){
  106.     jac_union+=__builtin_popcountll(a[n] | b[n]);
  107.     jac_intersection+=__builtin_popcountll(a[n] & b[n]);
  108.   }
  109.   if(jac_union==0 || jac_intersection ==0)
  110.     return(0);
  111.   return((float)jac_intersection / (float)jac_union);
  112. }
  113.  
  114. void diag_show_hash(uint8_t *hash){
  115.   uint32_t n;
  116.   for(n=0;n<bloom_bytes;n++)
  117.     printf("%02X", hash[n]);
  118.   printf("\n");
  119. }
  120.  
  121. void dofile(char *filename, struct stat *statbuf){
  122.   if(alloc_files==num_files){
  123.     alloc_files+=alloc_chunk;
  124.     bloomhashes=realloc(bloomhashes,alloc_files*bloom_bytes);
  125.     filenames=realloc(filenames, alloc_files*sizeof(char *));
  126.   }
  127.   if(!bloomhashes || !filenames){
  128.     fprintf(stderr, "Out of memory.\n");
  129.     exit(1);
  130.   }
  131.   uint8_t *bloomhash=bloomhashes+(num_files*bloom_bytes);
  132.   memset(bloomhash, 0, bloom_bytes);
  133.   int err=hash(filename, bloomhash);
  134.   if(err)
  135.     return;
  136. //  printf("%lu,%s\n", num_files, filename);
  137. //  diag_show_hash(bloomhash);
  138.   filenames[num_files] = malloc(strlen(filename)+1);
  139.   strcpy(filenames[num_files], filename);
  140.   num_files++;
  141. }
  142.  
  143. int hash(char *filename, uint8_t *bloomhash){
  144.   uint32_t min_chunk_size=1024;
  145.   int fd = open (filename, O_RDONLY);
  146.   if(!fd){
  147.     fprintf(stderr, "Error opening file %s\n", filename);
  148.     return(1);
  149.   }
  150.   struct stat file_info;
  151.   if(fstat (fd, &file_info)){
  152.     fprintf(stderr, "Error statting file %s\n", filename);
  153.     close(fd);
  154.     return(1);
  155.   }  
  156.  
  157.   if(file_info.st_size<min_chunk_size){
  158.     close(fd);
  159.     return(2); //
  160.   }
  161.  
  162.  uint8_t *mapped_file = mmap (NULL, file_info.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
  163.   if(!mapped_file){
  164.     fprintf(stderr, "Error mapping file %s\n", filename);
  165.     close(fd);
  166.     return(1);
  167.   }
  168.  
  169.   uint64_t pos=0;  
  170.   uint64_t filesize=file_info.st_size;
  171.   uint64_t bytes_left=filesize;
  172.   uint32_t bitsset=0;
  173.   do{
  174.     uint32_t chunk_len=get_chunk_len(mapped_file+pos, min_chunk_size, bytes_left, chunking_threshhold);
  175.     uint64_t fnvhash=make_fnv_from_block(mapped_file+pos, chunk_len);
  176.     uint32_t bucket=fnvhash % (bloom_bytes*8);
  177.     uint32_t bloombyte=bucket>>3;
  178. //    printf("%09lu:%09lu,%06u:%08lX(%u/%u)\n", pos, bytes_left, chunk_len, fnvhash, bucket,bloombyte);
  179.     uint8_t bit=1;
  180.     bit = bit << (bucket&0x07);
  181. //    printf("Change: %u+%02X\n", bloombyte, bit);
  182.     bloomhash[bloombyte] = bloomhash[bloombyte] | bit;
  183.     bitsset++;
  184.     if(bitsset>(bloom_bytes*4)){
  185.       fprintf(stderr, "%s\n  More than one in two bits set: This file is too large to effectively process at current settings.\n", filename);
  186.       munmap(mapped_file, file_info.st_size);
  187.       close(fd);
  188.       return(3);
  189.     }
  190.     pos+=chunk_len;
  191.     bytes_left=filesize-pos;
  192.   }while(bytes_left);
  193.   munmap(mapped_file, file_info.st_size);
  194.   close(fd);
  195.   return(0);
  196. }
  197.  
  198.  
  199. uint32_t get_chunk_len(uint8_t *data, uint32_t min, uint32_t max, uint8_t threshhold){
  200.   uint64_t limit=1;
  201.   limit = limit << threshhold;
  202.   uint32_t n;
  203.   for(n=min;n<max;n++){
  204.     uint64_t hash=make_fnv_from_block(data+n-8, 8);
  205.     if(hash<limit)
  206.       return(n);
  207.    
  208.   }
  209.   return(max);
  210. }
  211.  
  212. uint64_t make_fnv_from_block(uint8_t *block, uint32_t blocksize){
  213.     uint64_t hash = 14695981039346656037ull;
  214.     for(int n = 0; n < blocksize; n++)
  215.     {
  216.         hash = hash ^ (block[n]);
  217.         hash = hash * 1099511628211; // Magic prime.
  218.     }
  219.     return(hash);
  220. }
  221.  
  222. void dofolder(char *foldername){
  223.   struct stat statbuf;
  224.  
  225.   if (stat(foldername, &statbuf) == -1){
  226.     fprintf(stderr, "Error statting %s\n", foldername);
  227.     return;
  228.   }
  229.   if(S_ISREG(statbuf.st_mode)){
  230.     dofile(foldername, &statbuf);
  231.     return;
  232.   }
  233.   if(! S_ISDIR(statbuf.st_mode))
  234.     return; //What is this non-file, non-folder?
  235.  
  236.   DIR *dp;
  237.   struct dirent *ep;
  238.  
  239.   dp = opendir (foldername);
  240.   if (dp == NULL){
  241.     fprintf(stderr, "Could not read folder %s\n", foldername);
  242.     return;
  243.   }
  244.   while ((ep = readdir(dp)))
  245.   if(ep->d_name[0]!='.'){
  246.     char *filename=malloc((strlen(foldername)+strlen(ep->d_name)+2)*sizeof(char)); //+2: One for the /, one for the null term.
  247.     strcpy(filename, foldername);
  248.     strcat(filename, "/");
  249.     strcat(filename, ep->d_name);
  250.     dofolder(filename);
  251.     free(filename);
  252.   }
  253.   closedir (dp);
  254. }
Advertisement
Add Comment
Please, Sign In to add comment