SHARE
TWEET

Untitled

a guest Sep 18th, 2019 66 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python
  2. # encoding: utf-8
  3.  
  4. import os
  5. import shutil
  6. import sys
  7. import time
  8. import requests
  9. import argparse
  10. import hashlib
  11. import multiprocessing
  12. import csv
  13. from joblib import Parallel, delayed
  14.  
  15.  
  16. def get_hash(file_path, hash_method, block_size=65536):
  17.     method = hash_method.lower().strip()
  18.     try:
  19.         file_content = open(file_path, 'rb').read()
  20.         if method == 'md5':
  21.             return hashlib.md5(file_content).hexdigest()
  22.         elif method == 'sha1':
  23.             return hashlib.sha1(file_content).hexdigest()
  24.         elif method == 'sha224':
  25.             return hashlib.sha224(file_content).hexdigest()
  26.         elif method == 'sha256':
  27.             return hashlib.sha256(file_content).hexdigest()
  28.         elif method == 'sha384':
  29.             return hashlib.sha384(file_content).hexdigest()
  30.         elif method == 'sha512':
  31.             return hashlib.sha512(file_content).hexdigest()
  32.         else:
  33.             print('Unsupported/Unknown hash method')
  34.             return 0
  35.     except:
  36.         try:
  37.             print('Failed to hash using fast mothod1. Try use method2 to hash file: {}'.format(file_path))
  38.             if method == 'md5': d = hashlib.md5()
  39.             elif method == 'sha1': d = hashlib.sha1()
  40.             elif method == 'sha224': d = hashlib.sha224()
  41.             elif method == 'sha256': d = hashlib.sha256()
  42.             elif method == 'sha384': d = hashlib.sha384()
  43.             elif method == 'sha512': d = hashlib.sha512()
  44.             else:
  45.                 print('Unsupported/Unknown hash method')
  46.                 return 0
  47.             # try calc hash for huge file
  48.             with open(file_path, mode='rb') as f:
  49.                 while True:
  50.                     buf = f.read(block_size)
  51.                     if not buf:
  52.                         break
  53.                     d.update(buf)
  54.             print('Successfully hashed the file using method2: {}'.format(file_path))
  55.             return d.hexdigest()
  56.         except:
  57.             print('Failed to hash the file: {}'.format(file_path))
  58.             return 0
  59.  
  60.  
  61. def calc_recursive_dir_size(source):
  62.     try:
  63.         total_size = os.path.getsize(source)
  64.         for item in os.listdir(source):
  65.             item_path = os.path.join(source, item)
  66.             if os.path.isfile(item_path):
  67.                 total_size += os.path.getsize(item_path)
  68.             elif os.path.isdir(item_path):
  69.                 total_size += calc_recursive_dir_size(item_path)
  70.         return total_size
  71.     except:
  72.         print('Failed to calc dir size: {}'.format(source))
  73.         return 0
  74.  
  75.  
  76. def get_dir_metadata(dir_path):
  77.     try:
  78.         st = os.stat(dir_path)
  79.         truesize = calc_recursive_dir_size(dir_path)
  80.         return [truesize, st.st_atime, st.st_mtime, st.st_ctime, dir_path]
  81.     except:
  82.         print('Failed to get dir statistics: {}'.format(dir_path))
  83.         return 0
  84.  
  85.  
  86. def get_file_metadata(file_path):
  87.     try:
  88.         st = os.stat(file_path)
  89.         return [st.st_size, st.st_atime, st.st_mtime, st.st_ctime, path]
  90.     except:
  91.         print('Failed to get file statistics: {}'.format(file_path))
  92.         return 0
  93.  
  94.  
  95. def file_ingest(file_path, md5_only=False):
  96.     if md5_only:
  97.         return [get_hash(file_path=file_path, hash_method='md5'), file_path]
  98.     else:
  99.         return [
  100.             get_hash(file_path=file_path, hash_method='md5'),
  101.             get_hash(file_path=file_path, hash_method='sha256'),
  102.             #get_file_metadata(file_path=file_path),
  103.             file_path
  104.         ]
  105.  
  106.  
  107. def process_sub_files_and_folders(dir_path):
  108.     root = os.path.abspath(dir_path)
  109.     file_paths = []
  110.     dir_paths = []
  111.     for dir_name, subdir_list, file_list in os.walk(root):
  112.         if os.path.isdir(dir_name):
  113.             dir_paths += [dir_name]
  114.         for file in file_list:
  115.             file_path = os.path.join(dir_name, file)
  116.             if os.path.isfile(file_path):
  117.                 file_paths += [file_path]
  118.     return file_paths, dir_paths
  119.  
  120.  
  121. def process_dir(dir_path):
  122.     start = time.time()
  123.     file_paths, dir_paths = process_sub_files_and_folders(dir_path)
  124.     end = time.time()
  125.     print('Process {} completed in {} secs. Extracted:\n\t{} files\n\t{}dirs'.format(dir_path, end-start, len(file_paths), len(dir_paths)))
  126.  
  127.     cores_num = multiprocessing.cpu_count()
  128.  
  129.     start = time.time()
  130.     dirs_results_list = Parallel(n_jobs=cores_num)(delayed(get_dir_metadata)(path) for path in dir_paths)
  131.     end = time.time()
  132.     print('Calc dirs metadata for {} items in {} secs.'.format(len(dirs_results_list), end-start))
  133.  
  134.     start = time.time()
  135.     files_results_list = Parallel(n_jobs=cores_num)(delayed(file_ingest)(path) for path in file_paths)
  136.     end = time.time()
  137.     print('Calc Hashs for {} items in {} secs.'.format(len(files_results_list), end-start))
  138.  
  139.     # convert lists to unique dics
  140.     results_md5s_files_dic = {}
  141.     results_sha256s_files_dic = {}
  142.     for arr in files_results_list:
  143.         curr_md5 = arr[0]
  144.         curr_sha256 = arr[1]
  145.         curr_filepath = arr[3]
  146.         results_md5s_files_dic[curr_md5] = curr_filepath
  147.         results_sha256s_files_dic[curr_sha256] = curr_filepath
  148.     return dirs_results_list, files_results_list, results_md5s_files_dic, results_sha256s_files_dic
  149.  
  150.  
  151. def process_disk(diskpath, resultfilepath, only_md5=False):
  152.     print('START process_disk: {}'.format(process_disk))
  153.     dirsDataList, filesDataList, filesMD5sDic, filesSHA256sDic = process_dir(diskpath)
  154.     print('Total dirs count: {}'. format(len(dirsDataList)))
  155.     print('Total files count: {}'. format(len(filesDataList)))
  156.     print('Total md5s count: {}'. format(len(filesMD5sDic)))
  157.     print('Total sha256s count: {}'. format(len(filesSHA256sDic)))
  158.  
  159.     print('Write results to file: {}'. format(resultfilepath))
  160.     with open(resultfilepath, 'w+') as csv_file:
  161.         field_names = ['type', 'path', 'md5', 'sha256', 'size', 'change_size', 'modified_time', 'access_time']
  162.         writer = csv.DictWriter(csv_file, fieldnames=field_names)
  163.         writer.writeheader()
  164.  
  165.         for dir_data in dirsDataList:
  166.             # result ex: [truesize, st.st_atime, st.st_mtime, st.st_ctime, path]
  167.             writer.writerow({'type': 'folder', 'path': dir_data[1], 'md5': '', 'sha256': '', 'size': dir_data[0]})
  168.         for file_data in filesDataList:
  169.             # result ex: [md5, sha256, path]
  170.             writer.writerow({'type': 'file', 'path': file_data[2], 'md5': file_data[0], 'sha256': file_data[1]})
  171.     print('END process_disk: {}'.format(process_disk))
  172.  
  173.  
  174. def cli_menu():
  175.     parser.add_argument('-v', '--verbose', help='verbose output')
  176.     parser.add_argument('-d', '--diskpath', type=str, help='source disk mount directory')
  177.     parser.add_argument('-o', '--output', type=str, help='destination output csv file path')
  178.     return parser.parse_args()
  179.  
  180.  
  181. def main():
  182.     args = cli_menu()
  183.     if args.diskpath and args.output:
  184.         if not os.path.isdir(args.diskpath):
  185.             print('please provide valid directory disk path')
  186.             exit()
  187.         else:
  188.             process_disk(args.diskpath, args.output, args.md5)
  189.  
  190.  
  191. if __name__ == '__main__':
  192.     main()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top