Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # encoding: utf-8
- import os
- import shutil
- import sys
- import time
- import requests
- import argparse
- import hashlib
- import multiprocessing
- import csv
- from joblib import Parallel, delayed
def get_hash(file_path, hash_method, block_size=65536):
    """Return the hex digest of *file_path* computed with *hash_method*.

    Supported methods (case-insensitive, surrounding whitespace ignored):
    md5, sha1, sha224, sha256, sha384, sha512.

    The file is read in *block_size* chunks so arbitrarily large files can
    be hashed without loading them fully into memory (the original code
    read the whole file first and only fell back to chunking on failure).

    Returns the hex digest string, or 0 (kept for backward compatibility
    with callers that compare against 0) for an unknown method or an
    unreadable file.
    """
    method = hash_method.lower().strip()
    # hashlib.new() replaces the two duplicated if/elif dispatch chains;
    # restrict it to exactly the algorithms the original code accepted.
    if method not in ('md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'):
        print('Unsupported/Unknown hash method')
        return 0
    digest = hashlib.new(method)
    try:
        # Chunked reading works for every file size, so a separate
        # whole-file fast path is unnecessary.
        with open(file_path, mode='rb') as f:
            while True:
                buf = f.read(block_size)
                if not buf:
                    break
                digest.update(buf)
    except OSError:
        # Narrowed from a bare except: only filesystem/IO errors are expected.
        print('Failed to hash the file: {}'.format(file_path))
        return 0
    return digest.hexdigest()
def calc_recursive_dir_size(source):
    """Return the total size in bytes of the directory tree rooted at *source*.

    The total includes each directory entry's own size (os.path.getsize of
    the directory itself), matching the original accounting, plus the sizes
    of all regular files found recursively.

    Returns 0 (logged) if *source* cannot be read.
    """
    try:
        # Start from the directory entry's own on-disk size.
        total_size = os.path.getsize(source)
        for item in os.listdir(source):
            item_path = os.path.join(source, item)
            if os.path.isfile(item_path):
                total_size += os.path.getsize(item_path)
            elif os.path.isdir(item_path):
                total_size += calc_recursive_dir_size(item_path)
        return total_size
    except OSError:
        # Narrowed from a bare except: permission/stat failures land here.
        print('Failed to calc dir size: {}'.format(source))
        return 0
def get_dir_metadata(dir_path):
    """Return metadata for a directory as a 5-element list.

    Layout: [recursive_size, st_atime, st_mtime, st_ctime, dir_path],
    where recursive_size comes from calc_recursive_dir_size.

    Returns 0 (logged) if the directory cannot be stat'ed — callers must
    check for this sentinel before indexing the result.
    """
    try:
        st = os.stat(dir_path)
        truesize = calc_recursive_dir_size(dir_path)
        return [truesize, st.st_atime, st.st_mtime, st.st_ctime, dir_path]
    except OSError:
        # Narrowed from a bare except: only filesystem errors are expected.
        print('Failed to get dir statistics: {}'.format(dir_path))
        return 0
def get_file_metadata(file_path):
    """Return metadata for a regular file as a 5-element list.

    Layout: [st_size, st_atime, st_mtime, st_ctime, file_path] — the same
    shape as get_dir_metadata's result.

    Returns 0 (logged) if the file cannot be stat'ed.
    """
    try:
        st = os.stat(file_path)
        # BUGFIX: the original returned the undefined name `path`,
        # which raised NameError (masked by the bare except) and made
        # this function always return 0.
        return [st.st_size, st.st_atime, st.st_mtime, st.st_ctime, file_path]
    except OSError:
        # Narrowed from a bare except so logic errors are no longer hidden.
        print('Failed to get file statistics: {}'.format(file_path))
        return 0
def file_ingest(file_path, md5_only=False):
    """Hash a single file for the catalog.

    Returns [md5, file_path] when md5_only is true, otherwise
    [md5, sha256, file_path].
    """
    record = [get_hash(file_path=file_path, hash_method='md5')]
    if not md5_only:
        record.append(get_hash(file_path=file_path, hash_method='sha256'))
    record.append(file_path)
    return record
def process_sub_files_and_folders(dir_path):
    """Walk *dir_path* and collect every file and directory beneath it.

    Returns a pair (file_paths, dir_paths) of absolute-path lists; the
    root directory itself is included in dir_paths.
    """
    root = os.path.abspath(dir_path)
    all_files, all_dirs = [], []
    for current_dir, _subdirs, names in os.walk(root):
        # os.walk yields directories top-down; re-check existence/type
        # in case entries vanish or are special files.
        if os.path.isdir(current_dir):
            all_dirs.append(current_dir)
        for name in names:
            candidate = os.path.join(current_dir, name)
            if os.path.isfile(candidate):
                all_files.append(candidate)
    return all_files, all_dirs
def process_dir(dir_path):
    """Enumerate *dir_path*, stat every directory and hash every file in parallel.

    Returns a 4-tuple:
      dirs_results_list          -- per-directory [size, atime, mtime, ctime, path]
                                    (or the 0 sentinel from get_dir_metadata)
      files_results_list         -- per-file [md5, sha256, path]
      results_md5s_files_dic     -- md5 digest -> path (last writer wins on collision)
      results_sha256s_files_dic  -- sha256 digest -> path
    """
    start = time.time()
    file_paths, dir_paths = process_sub_files_and_folders(dir_path)
    end = time.time()
    print('Process {} completed in {} secs. Extracted:\n\t{} files\n\t{}dirs'.format(dir_path, end-start, len(file_paths), len(dir_paths)))
    cores_num = multiprocessing.cpu_count()
    start = time.time()
    dirs_results_list = Parallel(n_jobs=cores_num)(delayed(get_dir_metadata)(path) for path in dir_paths)
    end = time.time()
    print('Calc dirs metadata for {} items in {} secs.'.format(len(dirs_results_list), end-start))
    start = time.time()
    files_results_list = Parallel(n_jobs=cores_num)(delayed(file_ingest)(path) for path in file_paths)
    end = time.time()
    print('Calc Hashs for {} items in {} secs.'.format(len(files_results_list), end-start))
    # Build deduplication dictionaries keyed by digest.
    results_md5s_files_dic = {}
    results_sha256s_files_dic = {}
    for arr in files_results_list:
        curr_md5 = arr[0]
        curr_sha256 = arr[1]
        # BUGFIX: file_ingest returns the 3-element list [md5, sha256, path];
        # the path lives at index 2 — the original read arr[3], which raised
        # IndexError on the very first file.
        curr_filepath = arr[2]
        results_md5s_files_dic[curr_md5] = curr_filepath
        results_sha256s_files_dic[curr_sha256] = curr_filepath
    return dirs_results_list, files_results_list, results_md5s_files_dic, results_sha256s_files_dic
def process_disk(diskpath, resultfilepath, only_md5=False):
    """Hash every file and measure every directory under *diskpath*, writing a CSV report.

    Parameters:
      diskpath       -- root directory (e.g. a mounted disk) to scan
      resultfilepath -- destination CSV file path
      only_md5       -- accepted for backward compatibility; currently unused
                        because process_dir always computes both digests.
                        TODO(review): thread this through to file_ingest.
    """
    # BUGFIX: the original formatted the function object (process_disk)
    # into these START/END messages instead of the path being processed.
    print('START process_disk: {}'.format(diskpath))
    dirsDataList, filesDataList, filesMD5sDic, filesSHA256sDic = process_dir(diskpath)
    print('Total dirs count: {}'. format(len(dirsDataList)))
    print('Total files count: {}'. format(len(filesDataList)))
    print('Total md5s count: {}'. format(len(filesMD5sDic)))
    print('Total sha256s count: {}'. format(len(filesSHA256sDic)))
    print('Write results to file: {}'. format(resultfilepath))
    # newline='' is required by the csv module to avoid blank rows on Windows.
    with open(resultfilepath, 'w+', newline='') as csv_file:
        field_names = ['type', 'path', 'md5', 'sha256', 'size', 'change_size', 'modified_time', 'access_time']
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        writer.writeheader()
        for dir_data in dirsDataList:
            # get_dir_metadata returns the 0 sentinel on failure; skip those
            # instead of crashing on an int.
            if not dir_data:
                continue
            # dir_data layout: [truesize, st_atime, st_mtime, st_ctime, path].
            # BUGFIX: the original wrote dir_data[1] (atime) as the path;
            # the path is at index 4.
            writer.writerow({'type': 'folder', 'path': dir_data[4], 'md5': '', 'sha256': '',
                             'size': dir_data[0], 'modified_time': dir_data[2],
                             'access_time': dir_data[1]})
        for file_data in filesDataList:
            # file_data layout: [md5, sha256, path].
            writer.writerow({'type': 'file', 'path': file_data[2], 'md5': file_data[0], 'sha256': file_data[1]})
    print('END process_disk: {}'.format(diskpath))
def cli_menu():
    """Build the argument parser and parse sys.argv.

    BUGFIX: the original referenced `parser` without ever creating it,
    raising NameError on every run. Also adds the -m/--md5 flag that
    main() reads (args.md5 previously raised AttributeError) and makes
    --verbose a store_true flag instead of a value-taking option.
    """
    parser = argparse.ArgumentParser(
        description='Recursively hash files and measure directories, writing a CSV report.')
    parser.add_argument('-v', '--verbose', action='store_true', help='verbose output')
    parser.add_argument('-d', '--diskpath', type=str, help='source disk mount directory')
    parser.add_argument('-o', '--output', type=str, help='destination output csv file path')
    parser.add_argument('-m', '--md5', action='store_true', help='calculate md5 hashes only')
    return parser.parse_args()
def main():
    """CLI entry point: validate arguments and kick off the disk scan.

    Silently does nothing when -d/-o are not both supplied, preserving the
    original behavior.
    """
    args = cli_menu()
    if args.diskpath and args.output:
        if not os.path.isdir(args.diskpath):
            print('please provide valid directory disk path')
            sys.exit()  # sys.exit instead of the site-provided exit() builtin
        else:
            # getattr guards against an argument namespace without an `md5`
            # attribute (the original parser defined no --md5 option, so
            # args.md5 raised AttributeError here).
            process_disk(args.diskpath, args.output, getattr(args, 'md5', False))
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement