Advertisement
Guest User

Untitled

a guest
Apr 28th, 2017
59
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.85 KB | None | 0 0
  1. from __future__ import print_function
  2. import subprocess
  3. import os
  4. import argparse
  5. import ast
  6. import multiprocessing
  7. import sys
  8. import threading
  9.  
  10.  
# Mapping of logical field names to GNU find -printf format directives.
# Directives that print free text (%p path, %Y type) are wrapped in double
# quotes so every field that find emits is a valid Python literal and can be
# decoded uniformly with ast.literal_eval in SearchDir.search().
# NOTE(review): a path containing a double quote or backslash would break
# that literal_eval round-trip — find does not escape %p output.
FIND_KEYS = {
    'inode': '%i',       # inode number
    'name': '"%p"',      # full path, quoted for literal_eval
    'size_B': '%s',      # file size in bytes
    'disk_kB': '%k',     # disk usage in 1 KB blocks
    'a_time': '%A@',     # last access time, seconds since the epoch
    'c_time': '%C@',     # last status-change time, seconds since the epoch
    'm_time': '%T@',     # last modification time, seconds since the epoch
    'type': '"%Y"',      # file type letter (following symlinks), quoted
    'uid': '%U',         # numeric owner user id
    'gid': '%G',         # numeric owner group id
    'perm': '%m',        # permission bits in octal
}


# Fields reported when a caller does not request a specific set.
DEFAULT_KEYS = ('name', 'inode', 'size_B', 'm_time')
  27.  
  28.  
  29. class SearchDir(object):
  30. """
  31. A directory to search for file metadata
  32. """
  33. def __init__(self, path, recurse=True):
  34. self.path = path
  35. self.recurse = recurse
  36.  
  37. def search(self, find_keys=None):
  38. find_keys = find_keys if find_keys is not None else DEFAULT_KEYS
  39. cmd = [
  40. 'find' if sys.platform != 'darwin' else 'gfind',
  41. '-L', self.path,
  42. ]
  43. if not self.recurse:
  44. cmd += ['-maxdepth', '0']
  45. cmd += [
  46. '-type', 'f',
  47. '-printf', '\t'.join(FIND_KEYS[key] for key in find_keys) + '\n'
  48. ]
  49. finder = subprocess.Popen(
  50. cmd,
  51. stdout=subprocess.PIPE,
  52. )
  53. # print('"' + '" "'.join(cmd) + '"')
  54. try:
  55. for line in finder.stdout:
  56. line = line.decode()
  57. values = line.split('\t')
  58. yield [ast.literal_eval(val.strip()) for val in values]
  59. finally:
  60. finder.terminate()
  61.  
  62. def __repr__(self):
  63. return '%s(%r, recurse=%s)' % (self.__class__.__name__, self.path, self.recurse)
  64.  
  65.  
  66. def find_searchdirs(basepath, max_recurse=1):
  67. """Find all sub-directories to search inside a basepath"""
  68. basepath = basepath.rstrip(os.sep)
  69. dirs = []
  70. for dirpath, dirnames, filenames in os.walk(basepath):
  71. relpath = os.path.relpath(dirpath, basepath)
  72. if relpath == '.':
  73. depth = 0
  74. else:
  75. depth = relpath.count(os.sep) + 1
  76. print('inspecting %r, depth %d, files %d, dirs %d' % (dirpath, depth, len(filenames), len(dirnames)), file=sys.stderr)
  77. # once we have recursed deeply enough, have the entire directory searched as a task
  78. if depth >= max_recurse:
  79. # add all leaf directories for searching
  80. dirs.extend(
  81. SearchDir(os.path.join(dirpath, subdirpath), True) for subdirpath in dirnames
  82. )
  83. dirnames[:] = []
  84. # if there are files at intermediate levels, reap them without recursion
  85. if filenames:
  86. dirs.append(SearchDir(dirpath, False))
  87. return dirs
  88.  
  89.  
  90. def search_dir(target, push_every=100, find_keys=None):
  91. """Search all paths inside target"""
  92. if find_keys is not None:
  93. dir_iter = target.search(find_keys=find_keys)
  94. else:
  95. dir_iter = target.search()
  96. file_buffer = []
  97. for file_data in dir_iter:
  98. file_buffer.append(file_data)
  99. if len(file_buffer) >= push_every:
  100. search_dir.queue.put(file_buffer)
  101. file_buffer = []
  102. if file_buffer:
  103. search_dir.queue.put(file_buffer)
  104.  
  105.  
  106. def pmap(args):
  107. return search_dir(*args)
  108.  
  109.  
  110. def pinit(queue):
  111. search_dir.queue = queue
  112.  
  113.  
  114. def preduce(queue, find_keys):
  115. find_keys = find_keys if find_keys is not None else DEFAULT_KEYS
  116. print(','.join(find_keys))
  117. while True:
  118. file_buffer = queue.get()
  119. if file_buffer is None:
  120. return
  121. for file_data in file_buffer:
  122. line = ','.join(str(item) for item in file_data)
  123. print(line)
  124.  
  125.  
  126. def reap_from_path(path, max_recurse=1, nprocesses=None, push_every=100, find_keys=None):
  127. dirs = find_searchdirs(path, max_recurse=max_recurse)
  128. print('Searching %d directories...' % len(dirs), file=sys.stderr)
  129. queue = multiprocessing.Queue()
  130. pool = multiprocessing.Pool(processes=nprocesses, initializer=pinit, initargs=(queue,))
  131. procs = pool.imap_unordered(
  132. pmap, [
  133. (target, push_every, find_keys) for target in dirs
  134. ],
  135. chunksize=1,
  136. )
  137. reaper_thread = threading.Thread(target=preduce, args=(queue, find_keys))
  138. reaper_thread.daemon = True
  139. reaper_thread.start()
  140. for count, _ in enumerate(procs):
  141. print('%d/%d' % (count + 1, len(dirs)), file=sys.stderr)
  142. queue.put(None)
  143. try:
  144. reaper_thread.join()
  145. except KeyboardInterrupt:
  146. pass
  147.  
if __name__ == '__main__':
    # Command-line entry point: parse options and kick off the parallel reap.
    CLI = argparse.ArgumentParser()
    CLI.add_argument(
        '--path',
        default='.',
        help='Path from which to start searching files'
    )
    CLI.add_argument(
        '-r',
        '--max-recurse',
        default=2,
        type=int,
        help='Maximum recursion depth for parallelization'
    )
    # NOTE(review): --max-output is parsed but never passed anywhere below,
    # so output size is currently unlimited regardless of this flag.  Also,
    # type=int applies only to command-line strings, so the float('inf')
    # default leaves options.max_output as a float when the flag is omitted.
    CLI.add_argument(
        '--max-output',
        default=float('inf'),
        type=int,
        help='Maximum output to produce, in bytes'
    )
    options = CLI.parse_args()
    reap_from_path(path=options.path, max_recurse=options.max_recurse)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement