
dedup.cpp

SMF
Jan 27th, 2015
#include <stdlib.h>
#include <vector>
#include <iostream>
#include <iomanip>
#include <string>
#include <sys/types.h>
#include <dirent.h>
#include <sys/stat.h>
#include <map>
#include <sys/mman.h>
#include <unistd.h>
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <fstream>
#include <stdint.h> //for uint64_t, used by the rdtsc() timing helpers
#include <tgmath.h>
#include <sched.h>
  19.  
  20. //include the libraries needed for sha-1 hashing
  21. #include <openssl/sha.h>
  22.  
  23. //include the libraries needed for the associative container storing
  24. //hashed values. Note that this library is the current version of the old
  25. //hash_map library.
  26. #include <unordered_map>
  27.  
  28. //include libraries for rabin's fingerprints
  29. #include "rabinpoly.h"
  30.  
//IMPORTANT!!!
//For the definitions below, if the minimum chunk size is set too low, large files can create
//far too many chunks. To avoid overflows, this value should be set sufficiently high
//to ensure that we aren't getting too many chunks, and thus, too many hashes.
//We will enforce the creation of chunks between these values.
  36.  
  37. //definitions for use with rabin's fingerprints
  38. #define FINGERPRINT_PT 0xbfe6b8a5bf378d83LL //a constant representation of a polynomial function
  39. //used to get the same results across the filesystem
  40. #define BREAKMARK_VALUE 0x78 //a constant for determining which fingerprint
  41. //value determines a breakpoint
  42. #define MIN_CHUNK_SIZE 8 //4096 //number of minimum bytes for a chunksize
  43. #define MAX_CHUNK_SIZE 8192 //number of maximum bytes for a chunksize
  44. #define WINDOW_SIZE 128 //Sets the size of the sliding window of the data
  45. //that creates a unique rabin fingerprint
  46. #define BUFSIZE 4096 //Set the buffer size for the array that will hold
  47. //the data read in from the file
  48.  
  49. using namespace std;
  50.  
  51. //static const unsigned chunk_size = 65536;
  52. static const unsigned chunk_size = 128;
  53.  
  54.  
  55.  
  56. int hit_max = 0;
  57. int hit_break = 0;
  58. long long total_size_of_data = 0;
  59. long long deduped_size_of_data = 0;
  60. long long compressed_size_of_data = 0;
  61. long long untouched_size_of_data = 0;
  62.  
  63. static const int LIBDO_MAX_BYTES = 256;
  64. static const int LIBDO_BUFFER_LEN = 16384;
  65.  
  66. static int m_token_freqs[LIBDO_MAX_BYTES]; //frequency of each token in sample
  67. static float m_token_probs[LIBDO_MAX_BYTES]; //P(each token appearing)
  68. static int m_num_tokens = 0; //actual number of `seen' tokens, max 256
  69. static float m_maxent = 0.0;
  70. static float m_ratio = 0.0;
  71. static int LIBDISORDER_INITIALIZED = 0;
  72.  
  73. //VECTOR TO SAVE NAME OF FILES TO BE COMPRESSED
  74. vector<string> files_to_compress;
  75.  
  76. ////NOTE!!!!!!
  77. //The only values above that should be changed are:
  78. //MIN_CHUNK_SIZE
  79. //MAX_CHUNK_SIZE
  80. //WINDOW_SIZE
  81. //chunk_size
  82.  
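//Illustrative sketch (assumption, not part of the original paste): the breakpoint
//test used inside dedup_fingerprint(), pulled out as a small helper to show the
//content-defined chunking rule. Since one residue value out of chunk_size triggers
//a break, the expected average chunk length is roughly chunk_size (128) bytes when
//the MIN/MAX clamps are disabled.
static bool is_breakpoint(u_int64_t fingerprint_value)
{
    //A byte position is a chunk boundary when the Rabin fingerprint of the current
    //window, reduced modulo chunk_size, equals the fixed break mark.
    return (fingerprint_value % chunk_size) == BREAKMARK_VALUE;
}
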
  83. //DEFINE FOR RDTSC TIMINGS
  84. #ifdef __i386
  85. __inline__ uint64_t rdtsc() {
  86. uint64_t x;
  87. __asm__ volatile ("rdtsc" : "=A" (x));
  88. return x;
  89. }
  90. #elif __amd64
  91. __inline__ uint64_t rdtsc() {
  92. uint64_t a, d;
  93. __asm__ volatile ("rdtsc" : "=a" (a), "=d" (d));
  94. return (d<<32) | a;
  95. }
  96. #endif
  97.  
  98. //TIMINGS MEASUREMENTS (CYCLES)
  99. uint64_t cycles_compression = 0;
  100. uint64_t cycles_dedup = 0;
  101. uint64_t temptime1 = 0;
  102. uint64_t temptime2 = 0;
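
//Illustrative sketch (assumption, not part of the original paste): the timing
//pattern used later in dedup_fingerprint(), shown in isolation. rdtsc() is read
//before and after a region of interest and the difference is accumulated into a
//per-phase cycle counter.
static void example_timed_region()
{
    uint64_t t_start = rdtsc();        //timestamp before the work
    //... the code being measured would go here ...
    cycles_dedup += rdtsc() - t_start; //attribute the elapsed cycles to the dedup phase
}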
  103.  
  104.  
  105. struct filedata
  106. {
  107. //The name of the file, as found in the filesystem
  108. string filename;
  109. //The size of the file, in bytes
  110. int size;
  111. //The extension the file has. If the file is not one of the most popular
  112. //extensions, or if the file has NO extension, then this string will be "".
  113. string extension;
  114. //The full path of where the file is located in the system
  115. string filepath;
  116. };
  117.  
  118. struct hashdata
  119. {
  120. //The hash itself is saved as unsigned chars. The SHA-1 hash produces
  121. //a 20 byte hash.
  122. vector<unsigned char> hash;
  123. //The size of the file that was hashed, in bytes
  124. int size;
  125. //The extension of this hashed file
  126. string extension;
//The string representation of the hash in a hexadecimal format
  128. string hex;
  129. };
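
//Illustrative sketch (assumption, not part of the original paste): the standard
//OpenSSL streaming pattern for producing the 20-byte SHA-1 digest stored in
//hashdata::hash. Note that SHA1_Init() must run before any SHA1_Update() call.
static void example_sha1_of_buffer(const unsigned char* data, size_t len, hashdata &out)
{
    unsigned char digest[20];             //SHA-1 digests are always 20 bytes
    SHA_CTX ctx;
    SHA1_Init(&ctx);                      //initialize the hashing context
    SHA1_Update(&ctx, data, len);         //may be called repeatedly for streamed input
    SHA1_Final(digest, &ctx);             //finalize and write out the digest
    out.hash.assign(digest, digest + 20); //store the raw digest bytes
    out.size = (int)len;
}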
  130.  
  131. int get_num_tokens()
  132. {
  133. return m_num_tokens;
  134. }
  135.  
  136. float get_max_entropy()
  137. {
  138. return m_maxent;
  139. }
  140.  
  141. float get_entropy_ratio()
  142. {
  143. return m_ratio;
  144. }
  145.  
  146. static void initialize_lib()
  147. {
  148. int i = 0;
  149. if(1==LIBDISORDER_INITIALIZED)
  150. return;
  151.  
  152. m_num_tokens = 0;
  153.  
  154. for(i=0;i<LIBDO_MAX_BYTES;i++)
  155. {
  156. m_token_freqs[i]=0;
  157. m_token_probs[i]=0.0;
  158. }
  159.  
  160. LIBDISORDER_INITIALIZED = 1;
  161. }
  162.  
  163. static void get_token_frequencies(char* buf, long long length)
  164. {
  165. int i=0;
  166. char* itr=NULL;
  167. unsigned char c=0;
  168.  
  169. itr = buf;
  170.  
  171. //reset number of tokens
  172. m_num_tokens = 0;
  173.  
//make sure frequency and probability arrays are cleared
  175. for(i=0;i<LIBDO_MAX_BYTES;i++)
  176. {
  177. m_token_freqs[i] = 0;
  178. m_token_probs[i] = 0.0;
  179. }
  180.  
  181. for(i=0;i<length;i++)
  182. {
  183. c = (unsigned char)*itr;
  184. //assert(0<=c<LIBDO_MAX_BYTES);
  185. m_token_freqs[c]++;
  186. itr++;
  187. }
  188. }
  189.  
  190. static void count_num_tokens()
  191. {
  192. int i = 0;
  193. int counter = 0;
  194. for(i=0;i<LIBDO_MAX_BYTES;i++)
  195. {
  196. if(0!=m_token_freqs[i])
  197. {
  198. counter++;
  199. }
  200. }
  201. m_num_tokens = counter;
  202. return;
  203. }
  204.  
  205. float shannon_H(char* buf, long long length)
  206. {
  207. int i = 0;
  208. float bits = 0.0;
  209. char* itr=NULL; //values of itr should be zero to 255
  210. unsigned char token;
  211. int num_events = 0; //`length' parameter
  212. float freq = 0.0; //loop variable for holding freq from m_token_freq[]
  213. float entropy = 0.0; //running entropy sum
  214.  
  215. if(NULL==buf || 0==length)
  216. return 0.0;
  217.  
  218. if(0==LIBDISORDER_INITIALIZED)
  219. initialize_lib();
  220.  
  221. itr = buf;
  222. m_maxent = 0.0;
  223. m_ratio = 0.0;
  224. num_events = length;
  225. get_token_frequencies(itr, num_events); //modifies m_token_freqs[]
  226. //set m_num_tokens by counting unique m_token_freqs entries
  227. count_num_tokens();
  228.  
  229. if(m_num_tokens>LIBDO_MAX_BYTES)
  230. {
  231. //report error somehow?
  232. return 0.0;
  233. }
  234.  
  235. //iterate through whole m_token_freq array, but only count
  236. //spots that have a registered token (i.e., freq>0)
  237. for(i=0;i<LIBDO_MAX_BYTES;i++)
  238. {
  239. if(0!=m_token_freqs[i])
  240. {
  241. token = i;
  242. freq = ((float)m_token_freqs[token]);
  243. m_token_probs[token] = (freq / ((float)num_events));
  244. entropy += m_token_probs[token] * log2(m_token_probs[token]);
  245. }
  246. }
  247.  
bits = -1.0 * entropy;
m_maxent = log2(m_num_tokens);
//guard against division by zero when the buffer contains a single distinct byte
m_ratio = (m_maxent > 0.0) ? (bits / m_maxent) : 0.0;
  251.  
  252. return bits;
  253. }
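
//Illustrative sketch (assumption, not part of the original paste): calling
//shannon_H() on a small in-memory buffer. The sample data is hypothetical;
//dedup_fingerprint() later computes this per file and compares it against a
//5.0 bits/byte threshold (though in this version every file ends up deduplicated).
static void example_entropy_usage()
{
    char sample[] = "aaaaaaaabbbbbbccdd";               //low-entropy, repetitive data
    float bits = shannon_H(sample, sizeof(sample) - 1); //exclude the trailing '\0'
    cout << "entropy: " << bits << " bits/byte, max: " << get_max_entropy()
         << ", ratio: " << get_entropy_ratio() << endl;
}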
  254.  
  255. void add_to_vector(vector<vector<filedata> > &filesystem, string filename, int size, string path_name)
  256. {
  257. filedata temp;
  258. temp.filename = filename;
  259. temp.size = size;
  260. temp.filepath = path_name;
  261.  
  262. //Check the filename for a period "." Files with a period have extensions.
  263. //unsigned find_index = filename.find(".");
size_t find_index = filename.find_last_of(".");

if(find_index == string::npos) //No period found
  267. {
  268. temp.extension = "";
  269.  
  270. //If nothing is in the vector, just stick it anywhere
  271. if(filesystem.size() == 0)
  272. {
  273. // ***************** debugging comment *****************
  274. // cout << find_index << endl;
  275. // cout << filename << endl;
  276.  
  277. vector<filedata> temp_vector;
  278. temp_vector.push_back(temp);
  279. filesystem.push_back(temp_vector);
  280. }
  281. //If something is in the vector, check the extension (in this case "") to see
  282. //if it matches an already existing vector. If so, stick it there.
  283. //If not, create a new vector row.
  284. else
  285. {
  286. int index = -1;
  287.  
  288. for(int i = 0; i < filesystem.size(); i++)
  289. if(filesystem[i][0].extension == "")
  290. index = i;
  291.  
  292. //No "" extension found
  293. if(index == -1)
  294. {
  295. vector<filedata> temp_vector;
  296. temp_vector.push_back(temp);
  297. filesystem.push_back(temp_vector);
  298. }
  299. //Extension found
  300. else
  301. filesystem[index].push_back(temp);
  302. }
  303. return;
  304. }
  305. else //Period/extension found
  306. {
  307. // ***************** debugging comment *****************
  308. // cout << find_index << endl;
  309. // cout << filename << endl;
  310.  
  311. temp.extension = filename.substr(find_index, (filename.size() - find_index));
  312.  
  313. //If nothing is in the vector, just stick it anywhere
  314. if(filesystem.size() == 0)
  315. {
  316. vector<filedata> temp_vector;
  317. temp_vector.push_back(temp);
  318. filesystem.push_back(temp_vector);
  319. }
  320. //If something is in the vector, check the extension to see
  321. //if it matches an already existing vector. If so, stick it there.
  322. //If not, create a new vector row.
  323. else
  324. {
  325. int index = -1;
  326.  
  327. for(int i = 0; i < filesystem.size(); i++)
  328. if(filesystem[i][0].extension == temp.extension)
  329. index = i;
  330.  
  331. //The extension not found
  332. if(index == -1)
  333. {
  334. vector<filedata> temp_vector;
  335. temp_vector.push_back(temp);
  336. filesystem.push_back(temp_vector);
  337. }
  338. //Extension found
  339. else
  340. filesystem[index].push_back(temp);
  341. }
  342. return;
  343. }
  344. } // add_to_vector
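
//Illustrative sketch (assumption, not part of the original paste): how
//add_to_vector() groups files into one inner vector per extension. The file
//names, sizes, and paths below are hypothetical.
static void example_grouping()
{
    vector<vector<filedata> > fs;
    add_to_vector(fs, "notes.txt", 10, "/tmp/notes.txt");
    add_to_vector(fs, "todo.txt", 20, "/tmp/todo.txt");
    add_to_vector(fs, "photo.jpg", 30, "/tmp/photo.jpg");
    //fs now holds two rows: one for the two ".txt" files and one for ".jpg"
    cout << fs.size() << " extension groups" << endl;
}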
  345.  
  346.  
  347. void recursive_search(string start_directory, vector<vector<filedata> > &filesystem, int &num_files)
  348. {
  349. //This vector holds the names of the files in the directory
  350. vector<string>file_names;
  351.  
  352. //Required declarations for the directory pointer and directory stream
  353. DIR * directoryptr;
  354. struct dirent *direntptr;
  355.  
  356. //Open the directory
  357. directoryptr = opendir(start_directory.c_str());
  358.  
  359. //If the directory open failed, indicate to user
  360. if(directoryptr == NULL)
  361. {
cerr << "Directory " << start_directory << " could not be opened." << endl;
  363. return;
  364. }
  365.  
  366. //While files exist in the directory, read their names and put them into the
  367. //vector holding the file names (vector<string>file_names)
  368. while((direntptr = readdir(directoryptr)) != NULL)
  369. {
  370. string temp_name = (string(direntptr->d_name));
  371.  
  372. //If the file is . or .., it is an up directory or self-directory, and
  373. //can be disregarded
  374. if((temp_name == ".") || (temp_name == ".."))
  375. continue;
  376. else
  377. {
  378. string working_file;
  379.  
  380. //Use stat to get file information from the system
  381. if(start_directory == "/")
  382. working_file = "/" + temp_name;
  383. else
  384. working_file = start_directory + "/" + temp_name;
  385.  
  386. // ***************** debugging comment *****************
  387. // cout << working_file << endl;
  388.  
  389. struct stat file_info;
  390. lstat(working_file.c_str(), &file_info);
  391.  
  392. //Determine if the file is a symbolic link. If so, ignore it.
  393. if(S_ISLNK(file_info.st_mode) == true)
  394. continue;
  395. //Ignore any file that is a pipe
  396. else if(S_ISFIFO(file_info.st_mode) == true)
  397. continue;
  398. //Ignore any file that is a socket
  399. else if(S_ISSOCK(file_info.st_mode) == true)
  400. continue;
  401. //Determine if the file is a directory. If so, call recursive_search
  402. //in this directory. Disregard certain folders.
  403. else if(S_ISDIR(file_info.st_mode) == true)
  404. {
  405. if(temp_name == "proc")
  406. continue;
  407. if(temp_name == "tmp")
  408. continue;
  409. if(temp_name == "project_files")
  410. continue;
  411.  
  412.  
  413. recursive_search(working_file, filesystem, num_files);
  414. }
  415. //If the file is not a directory, determine if it is a regular or
  416. //special file. Disregard special files. Add regular files to our vector.
  417. else if(S_ISREG(file_info.st_mode))
  418. {
  419.  
  420. num_files++;
  421.  
  422. // ***************** debugging comment *****************
  423. // cout << temp_name << endl;
  424.  
  425. add_to_vector(filesystem, temp_name, file_info.st_size, working_file);
  426. }
  427. else
  428. continue;
  429. } // end else
  430. }// end while
  431. closedir(directoryptr);
  432. } // recursive_search
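
//Illustrative sketch (assumption, not part of the original paste): invoking
//recursive_search() on a hypothetical starting directory and reporting how many
//regular files were collected and how many extension groups they fall into.
static void example_scan(const string &root)
{
    vector<vector<filedata> > fs;
    int count = 0;
    recursive_search(root, fs, count);
    cout << count << " files in " << fs.size() << " extension groups" << endl;
}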
  433.  
  434.  
  435.  
  436.  
  437. void dedup_fingerprint(vector<vector<filedata> > &filesystem, vector<vector<hashdata> > &hashes, int &num_hashes)
  438. {
  439. int num_extension_types = filesystem.size();
  440. vector<hashdata> temp;
  441.  
  442.  
  443. //1. Loop through all files in the system.
//2. If a file is equal to or smaller than the minimum chunk size, we just
//   hash the whole file; fingerprinting isn't applicable
//3. For every other file, a sliding window of WINDOW_SIZE bytes is considered
  447. //4. The window moves byte-by-byte through the file, creating
  448. // a rabin fingerprint checksum for this window
  449. //5. If the rabin fingerprint for the window value is equal
  450. // to a predetermined value, that window is a breakpoint
  451. // This window is the last n bytes of this block of data
  452. //6. This data in the newly created block is hashed
  453. //7. Hashes are stored. If this was a full filesystem, if the hash
  454. // matched an existing hash, the data would be a duplicate, and
  455. // only a reference to the original data would be required. Here
  456. // we save all the hashes, then can later determine how many hashes
  457. // are unique and how much data would be saved in a full implementation.
  458.  
  459. //for debugging
  460. int filecount = 0;
  461.  
  462.  
  463. for(int i = 0; i < num_extension_types; i++)
  464. {
  465. for(int j = 0; j < filesystem[i].size(); j++)
  466. {
  467. filecount++;
  468. if(filecount % 1000 == 0)
  469. cout << filecount << " files done." << flush << endl;
  470. // ***************** debugging comment *****************
  471. //cout << "Hashing a file" << endl;
  472. //cout << filesystem[i][j].filepath << endl;
  473.  
  474. //First check the file to see if it's size is 0. If it is,
  475. //we can ignore it, as it won't have any dedup effects
  476.  
  477. struct stat file_info;
  478. lstat(filesystem[i][j].filepath.c_str(), &file_info);
  479. if(file_info.st_size == 0)
  480. continue;
  481.  
  482.  
  483. //open the file being hashed, read-only, binary mode ("rb")
  484. FILE * fileptr = fopen(filesystem[i][j].filepath.c_str(), "rb");
  485.  
  486. //If file opening fails, we can't do anything with this file due to
//some sort of problem (permissions or otherwise)
  488. //Skip this file and go on to the next one
  489. if(fileptr == NULL)
  490. {
  491. continue;
  492. }
  493.  
  494. //Check that the file size is greater than the minimum size we defined for
  495. //fingerprint chunks. If the file is smaller than this minimum size, we won't
  496. //be making fingerprints out of it
  497.  
  498. bool no_fingerprinting = false;
  499. if(file_info.st_size <= MIN_CHUNK_SIZE)
  500. no_fingerprinting = true;
  501.  
  502. if(no_fingerprinting)
  503. {
  504. //debugging
  505. //cout << "NO FINGERPRINT FILE" << endl;
  506.  
SHA_CTX sha_struct;
SHA1_Init(&sha_struct); //the context must be initialized before any update
unsigned char* file_buffer;
file_buffer = new unsigned char[16384];

while(1)
{
int length;
//sizeof(file_buffer) would only give the pointer size, so pass the buffer size explicitly
length = fread(file_buffer, 1, 16384, fileptr);
if(length == 0)
break;
SHA1_Update(&sha_struct, file_buffer, length);
}
  519.  
  520. unsigned char* hash_output;
  521. hash_output = new unsigned char[20];
  522.  
  523. //close the file
  524. int error_message = ferror(fileptr);
  525. fclose(fileptr);
  526. if(error_message)
  527. {
  528. continue;
  529. }
  530. //get the hash from SHA1, via hash_output
  531. SHA1_Final(hash_output, &sha_struct);
  532.  
  533. //store the hash to our vector containing the hashes
  534. hashdata temp_struct;
  535. string tempstring;
  536. temp_struct.extension = filesystem[i][0].extension;
  537. temp_struct.size = file_info.st_size;
  538. for(int k = 0; k < 20; k++)
  539. {
  540. temp_struct.hash.push_back(hash_output[k]);
  541. }
  542.  
  543. temp.push_back(temp_struct);
  544.  
  545. //cleanup variables
  546. delete [] file_buffer;
  547. delete [] hash_output;
  548. num_hashes++;
  549. }
  550. //In the case fingerprinting is required, do so in the else branch
  551. else
  552. {
  553. //debugging
  554. //cout << "FINGERPRINT FILE" << flush << endl;
  555.  
  556. //close the original file opening
  557. fclose(fileptr);
  575. bool continue_dedup = false; //To determine if the file should be deduped
  576. //based on entropy
  577.  
  578. //DETERMINE FILE ENTROPY
  579. //cout << filesystem[i][j].filepath.c_str() << endl;
  580.  
  581.  
  582. fileptr = fopen(filesystem[i][j].filepath.c_str(), "r");
  583. if(fileptr == NULL)
  584. {
  585.  
  586. cout << "ERROR OPENING FILE" << endl;
  587. exit(-1);
  588. }
  589. int fildes = -1;
  590. int ret = 0;
  591. char * itr = NULL;
  592. struct stat stat_fd;
  593. int x = 0;
  594. fildes = fileno(fileptr);
  595. ret = fstat(fildes, &stat_fd);
  596. if(ret == -1)
  597. {
  598. cout << filesystem[i][j].filepath.c_str() << endl;
  599. cout << "ERROR OPENING FILE STATS" << endl;
  600. exit(-1);
  601. }
  602.  
  603. long long entropy_file_size = (long long)stat_fd.st_size;
  604. total_size_of_data += entropy_file_size;
  605. char* buffer = NULL;
  606. buffer = (char*)calloc(entropy_file_size, sizeof(char));
  607. if(buffer == NULL)
  608. {
  609.  
  610. cout << "ERROR ALLOCATING BUFFER" << endl;
  611. exit(-1);
  612. }
  613. itr = buffer;
  614. unsigned char c;
  615. long long entropy_num_bytes_read = 0L;
  616. while(EOF!=(x=fgetc(fileptr)))
  617. {
  618. c = (unsigned char)x;
  619. *itr = c;
  620. itr++;
  621. entropy_num_bytes_read++;
  622. }
float entropy_value = shannon_H(buffer, entropy_num_bytes_read);
free(buffer); //release the entropy buffer once the measurement is done
fclose(fileptr);
  625. if(entropy_value >= 5.0)
  626. {
  627. continue_dedup = true;
  628. deduped_size_of_data += entropy_file_size;
  629. }
  630. //else if((entropy_value >= 5.0) && (entropy_value < 7.0))
  631. //{
  632. // compressed_size_of_data += entropy_file_size;
  633. // files_to_compress.push_back(filesystem[i][j].filepath.c_str());
  634. // //untouched_size_of_data += entropy_file_size;
  635. //}
  636. else
  637. {
  638. continue_dedup = true;
  639. deduped_size_of_data += entropy_file_size;
  640. }
  641.  
  642.  
  643. //If the entropy is not in the valid range to perform dedup,
  644. //skip to the next file
  645. if(continue_dedup == false)
  646. {
  647. continue;
  648. }
  649.  
  650.  
  651.  
  652.  
  653. //////////TIMER START//////////////
  654. temptime1 = rdtsc();
  655. ///////////////////////////////////
  656.  
  657.  
  658.  
  659.  
  660. //Initialize the fingerprinting class
  661. //Since breakpoints are determined using a polynomial (usually random), we
  662. //must use a constant to ensure that all breakpoints are constant across
  663. //the filesystem.
  664.  
  665. //debugging
  666. //cout << "INIT WINDOW FILE" << flush << endl;
  667.  
  668. window rabinfp(FINGERPRINT_PT, WINDOW_SIZE);
  669.  
  670. //Reset the window prior to every run, although probably not needed given
  671. //the locality of the class
  672. rabinfp.reset();
  673.  
  674. //Open the file using the standard open function to get a file descriptor
  675. //instead of a file pointer
  676. int fd = open(filesystem[i][j].filepath.c_str(), O_RDONLY);
  677.  
  678. //if file fails to open, skip to the next file
if (fd < 0) //open() returns -1 on failure; 0 is a valid descriptor
  680. {
  681. continue;
  682. }
  683.  
  684. //Do multiple reads on the file, putting the data into a buffer.
  685. //The data in the buffer will be used a byte at a time to determine
  686. //the breakpoints (chunks) in the file. Breakpoints will be saved
  687. //as integer values denoting how many bytes from the beginning of a
  688. //file a breakpoint exists. These will be saved in a vector.
  689.  
  690. int bytes_from_front = 0;
  691. int bytes_from_last_break = 0;
  692. vector<int> breakpoints;
  693. int count;
  694. char readbuffer[BUFSIZE];
  695.  
  696. //debugging
  697. //cout << "GET BREAKPOINTS" << flush << endl;
  698.  
  699. while((count = read(fd, readbuffer, BUFSIZE)) > 0)
  700. {
  701. //count now indicates how many bytes were read in by the read
  702. //operation. we will read in each byte for 'count' iterations
  703.  
  704. u_int64_t fingerprint_value;
  705.  
  706. for(int k = 0; k < count; k++)
  707. {
  708. //This function call returns the fingerprint value
  709. //as determined by rabin's fingerprinting
  710. fingerprint_value = rabinfp.slide8(readbuffer[k]);
  711.  
  712. //if we have hit the maximum chunk size without finding a
  713. //breakpoint using fingerprints, we make a breakpoint anyway
  714. #if 0
  715. if(bytes_from_last_break == MAX_CHUNK_SIZE)
  716. {
  717. hit_max++;
  718. breakpoints.push_back(bytes_from_front);
  719. bytes_from_front++;
  720. bytes_from_last_break = 0;
  721. }
  722. #endif
  723. //Mod the fingerprint with the chunk size constant. If this result
  724. //equals our constant breakmark, then this could be a breakpoint
  725. //this was an else if
  726.  
  727. if ((fingerprint_value % chunk_size) == BREAKMARK_VALUE)
  728. {
  729.  
  730. hit_break++;
  731. //if the number of bytes is between our range of acceptable
  732. //chunk sizes, we make it a breakpoint
  733. #if 0
  734. if((bytes_from_last_break >= MIN_CHUNK_SIZE) &&
  735. (bytes_from_last_break <= MAX_CHUNK_SIZE))
  736. {
  737.  
  738. #endif
  739. breakpoints.push_back(bytes_from_front);
  740. bytes_from_front++;
  741. bytes_from_last_break = 0;
  742. #if 0
  743. }
  744. else
  745. {
  746. bytes_from_front++;
  747. bytes_from_last_break++;
  748. }
  749. #endif
  750.  
  751. }
  752.  
  753. //This isn't a breakpoint. We've moved one byte, so we increment
  754. //the bytes from the beginning, and the bytes from the last breakpoint
  755. else
  756. {
  757. bytes_from_front++;
  758. bytes_from_last_break++;
  759. }
  760. }//end for
  761. }//end while
  762. //Clear out variables
  763. bytes_from_front = 0;
  764. bytes_from_last_break = 0;
  765.  
  766. //cout << breakpoints.size() << " BREAKPOINTS" << flush << endl;
  767.  
  768. //close file
  769. close(fd);
  770.  
  771. //Now that we have the breakpoints, we know how many chunks are in the file, and
  772. //how much data needs to be read to hash.
  773. int numchunks = breakpoints.size() + 1;
  774.  
  775. //If the file fails to read at this point, something is wrong
  776. fileptr = fopen(filesystem[i][j].filepath.c_str(), "rb");
  777. if(fileptr == NULL)
  778. {
  779. cerr << "Error opening file after chunking. Critical. Terminating.";
  780. exit(1);
  781. }
  782.  
  783. //debugging
  784. //cout << "READ FINGERPRINTS" << flush << endl;
  785.  
  786.  
  787. //Iterate through the chunks in a file, creating a hash for each chunk
  788. int previous_breakpoint = 0;
  789. for(int k = 0; k < numchunks; k++)
  790. {
  791. //cout << "READING CHUNK" << flush << endl;
  792.  
unsigned char* hash_output;
hash_output = new unsigned char[20];
unsigned char* file_buffer;

int chunksize_inbytes;
if(breakpoints.size() == 0)
{
chunksize_inbytes = file_info.st_size;
}
//In the case that this is the last run through the file, the chunk runs from
//the last breakpoint to the end of the file (breakpoints[k] would be out of
//range here)
else if(k == numchunks - 1)
{
chunksize_inbytes = file_info.st_size - previous_breakpoint;
}
else
{
chunksize_inbytes = breakpoints[k] - previous_breakpoint;
}
  813.  
  814. //In the case where the breakpoint is exactly at the end of the file,
  815. //the previous breakpoint will be the file size. Handle this to avoid
  816. //attempting to allocate memory with size 0 or large negatives, as we
  817. //don't actually have another chunk for hashing in this instance
  818. if(chunksize_inbytes <= 0)
  819. {
  820. delete [] hash_output;
  821. continue;
  822. }
  823.  
  824. //cout << "FILESIZE IS: " << file_info.st_size << flush << endl;
  825. //cout << "PREVIOUS BREAKPOINT WAS: " << previous_breakpoint << flush << endl;
  826. //cout << "CHUNKSIZE IS: " << chunksize_inbytes << flush << endl;
  827.  
  828. //cout << "INIT NEW FILE BUFFER[" << chunksize_inbytes << "]" << flush << endl;
  829. file_buffer = new unsigned char[chunksize_inbytes];
  830.  
  831. fread(file_buffer, chunksize_inbytes, 1, fileptr);
  832.  
  833. //Create the struct to hold the data
  834. hashdata temp_struct;
  835. string tempstring;
  836. temp_struct.extension = filesystem[i][0].extension;
  837. temp_struct.size = chunksize_inbytes;
  838.  
  839. //Get the SHA1 hash of the chunk
  840. SHA_CTX sha_struct;
  841. SHA1_Init(&sha_struct);
  842. SHA1_Update(&sha_struct, file_buffer, chunksize_inbytes);
  843. SHA1_Final(hash_output, &sha_struct);
  844.  
  845. //////////TIMER END////////////////
  846. temptime2 = rdtsc();
  847. cycles_dedup += temptime2 - temptime1;
  848. temptime1 = 0;
  849. temptime2 = 0;
  850. ///////////////////////////////////
  851.  
  852. for(int l = 0; l < 20; l++)
  853. temp_struct.hash.push_back(hash_output[l]);
  854.  
  855. temp.push_back(temp_struct);
  856.  
  857. //cout << "DELETE MEM ALLOCATION" << flush << endl;
  858. //Cleanup memory
  859. delete [] hash_output;
  860. delete [] file_buffer;
  861.  
//Change the previous breakpoint to the current breakpoint (skipped on the
//last chunk, where breakpoints[k] would be out of range)
if(k < numchunks - 1)
previous_breakpoint = breakpoints[k];
  865.  
  866. //Update number of hashes
  867. num_hashes++;
  868. //////////TIMER START//////////////
  869. temptime1 = rdtsc();
  870. ///////////////////////////////////
  871. }
  872. //////////TIMER END////////////////
  873. temptime2 = rdtsc();
  874. cycles_dedup += temptime2 - temptime1;
  875. temptime1 = 0;
  876. temptime2 = 0;
  877. ///////////////////////////////////
  878.  
  879. //close the file
  880. fclose(fileptr);
  881.  
  882. }//end else(fingerprinting)
  883.  
  884. }//end for "j"
  885.  
  886. hashes.push_back(temp);
  887. temp.clear();
  888. }//end for "i"
  889. }//end dedup_fingerprint
  890.  
  891.  
  892.  
  893.  
  894. int main()
  895. {
  896. //SET AFFINITY SO PROGRAM ONLY RUNS ON ONE CORE
  897. cpu_set_t mask;
  898. CPU_ZERO(&mask);
  899. CPU_SET(0, &mask);
  900. sched_setaffinity(0, sizeof(mask), &mask);
  901.  
  902. int num_files = 0;
  903. int num_hashes = 0;
  904. unordered_map<string, string> dictionary; //container for collision checking
  905. vector<vector<hashdata> > hashes; //contains hashes, associated file data
  906. vector<vector<filedata> > filesystem; //contains system file data
  907.  
  908. struct results //stores results of deduplication
  909. {
  910. string extension;
  911. int total_hashes;
  912. int num_hash_collisions;
  913. int bytes_saved;
  914. };
  915.  
  916. vector<results> dedup_results; //stores all individual file results info
  917. results systemwide_results; //stores systemwide dedup data
  918. systemwide_results.bytes_saved = 0;
  919. systemwide_results.num_hash_collisions = 0;
  920.  
  921. //Get the files in the system
  922. recursive_search("/home/kwoelfe/dedup/test_programs/dedup_full/test_files", filesystem, num_files);
  923.  
  924. cout <<"Please be patient. Fingerprinting the entire filesystem takes about " << flush << endl;
  925. cout << "30 minutes on a ~6 GB system." << flush << endl << endl;
  926.  
  927. //Create file chunks, create hashes from the chunks
  928. dedup_fingerprint(filesystem, hashes, num_hashes);
  929.  
  930. cout <<"Fingerprinting and hashing is completed." << flush << endl;
  931. cout <<"Please wait while hashes are checked for collisions and results stored." << flush << endl << endl;
  932.  
  933. //Convert the hashes into hex format, for container entries
  934. //Insert hashes into unordered map container to find collisions
  935. //Save results
  936. char hexes[41];
  937. int num_hashes_hexes = 0;
  938. int status = 0;
  939. results temp;
  940.  
  941. for(int i=0; i < hashes.size(); i++)
  942. {
  943. //Initialize the results struct
  944. temp.extension = "";
  945. temp.total_hashes = 0;
  946. temp.bytes_saved = 0;
  947. temp.num_hash_collisions = 0;
  948.  
  949. for(int j = 0; j < hashes[i].size(); j++)
  950. {
  951. for(int k = 0; k < 20; k++)
  952. sprintf(&hexes[k*2], "%02X", hashes[i][j].hash[k]);
  953.  
  954. hashes[i][j].hex = hexes;
  955. num_hashes_hexes++;
  956.  
  957. //Now add data to results
  958. pair<string, string> hash_entries (hashes[i][j].hex, hashes[i][j].extension);
  959. pair<unordered_map<string,string>::const_iterator, bool> return_value =
  960. dictionary.insert(hash_entries);
  961.  
  962. if(return_value.second == false)
  963. {
  964. temp.extension = hashes[i][j].extension;
  965. temp.bytes_saved = temp.bytes_saved + hashes[i][j].size;
  966. temp.num_hash_collisions++;
  967. }
  968.  
  969. if(temp.num_hash_collisions != 0 && j == hashes[i].size() - 1)
  970. {
  971. temp.total_hashes = hashes[i].size();
  972. dedup_results.push_back(temp);
  973. }
  974. }
  975. }
  976.  
  977. cout << "Extension, Total Hashes, Number of Hash Collisions, Bytes Saved " << endl;
  978.  
  979. for(int i = 0; i < dedup_results.size(); i++)
  980. {
systemwide_results.bytes_saved = systemwide_results.bytes_saved +
dedup_results[i].bytes_saved;
  983.  
  984. systemwide_results.num_hash_collisions = systemwide_results.num_hash_collisions +
  985. dedup_results[i].num_hash_collisions;
  986.  
  987. // ***************** debugging comment *****************
  988. cout << dedup_results[i].extension << ", ";
  989. cout << dedup_results[i].total_hashes << ", ";
  990. cout << dedup_results[i].num_hash_collisions << ", ";
  991. cout << dedup_results[i].bytes_saved << endl;
  992.  
  993. }
  994.  
  995. cout << "LIST OF FILES TO UNDERGO COMPRESSION" << endl;
  996. for(int i = 0; i < files_to_compress.size(); i++)
  997. {
  998. cout << files_to_compress[i] << endl;
  999. }
  1000. cout << endl;
  1001.  
  1002. cout << "Files Scanned, " << num_files << endl;
  1003. cout << "Hashes Stored, " << num_hashes << endl;
  1004. cout << "Hash Collisions, " << systemwide_results.num_hash_collisions << endl;
  1005. //cout << "Bytes Saved, " << systemwide_results.bytes_saved << endl;
  1006.  
  1007. cout << "Max chunk: " << hit_max << endl;
  1008. //cout << "Breaks found: " << hit_break << endl;
  1009. cout << "ALL FILES ARE DEDUPED IN THIS VERSION." << endl << endl;
  1010. cout << "Total Data Scanned, " << total_size_of_data << endl;
  1011. cout << "Total Data Deduped, " << deduped_size_of_data << endl;
  1012. cout << "Total Data Ignored, " << untouched_size_of_data << endl;
  1013. cout << "Total Data To Be Compressed, " << compressed_size_of_data << endl;
  1014.  
  1015. cout << endl;
  1016. cout << "Dedup Portion Size (after dedup applied), " << deduped_size_of_data - dedup_results[0].bytes_saved << endl;
  1017. cout << "Cycles Spent Deduplicating, " << cycles_dedup << endl;
  1018. cout << endl;
  1019. return 0;
  1020. }