Advertisement
Guest User

Untitled

a guest
Apr 26th, 2018
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.82 KB | None | 0 0
  1. from __future__ import division, unicode_literals
  2.  
  3. import hashlib
  4. import logging
  5. import os
  6. import shelve
  7.  
  8. from fnmatch import fnmatch
  9.  
  10. from .utils import is_unsplitable, get_root_of_unsplitable
  11.  
  12. logger = logging.getLogger(__name__)
  13.  
  14. class Database(object):
  15. hash_mode_size_varying = 10.0 # 10% size variation from size on disk for the two scan modes
  16. # that allows size to vary
  17.  
  18. def __init__(self, db_file, paths, ignore_files, normal_mode, unsplitable_mode, exact_mode,
  19. hash_name_mode, hash_size_mode, hash_slow_mode):
  20. """
  21. Database used to match files and torrents.
  22. """
  23. self.db = shelve.open(db_file)
  24. self.db_file = db_file
  25. self.paths = paths
  26. self.ignore_files = [self.normalize_filename(x) for x in ignore_files]
  27. self.normal_mode = normal_mode
  28. self.unsplitable_mode = unsplitable_mode
  29. self.exact_mode = exact_mode
  30. self.hash_name_mode = hash_name_mode
  31. self.hash_size_mode = hash_size_mode
  32. self.hash_slow_mode = hash_slow_mode
  33. self.hash_mode = hash_name_mode or hash_size_mode or hash_slow_mode
  34. self.hash_size_table = None
  35.  
  36. def truncate(self):
  37. """
  38. Truncates the database
  39. """
  40. logger.info('Truncated the database')
  41. self.db.close()
  42. self.db = shelve.open(self.db_file, flag='n')
  43.  
  44. def insert_into_database(self, root, f, mode, prefix=None, unsplitable_name=None):
  45. """
  46. Wraps the database insert to catch exceptions
  47. """
  48. try:
  49. self._insert_into_database(root, f, mode, prefix, unsplitable_name)
  50. except UnicodeDecodeError:
  51. logger.error('Failed to insert %r / %r / %r' % (root, f, mode))
  52.  
  53. def _insert_into_database(self, root, f, mode, prefix=None, unsplitable_name=None):
  54. """
  55. Does the actual insertion into the database.
  56. """
  57. path = os.path.abspath(os.path.join(root, f))
  58. if not os.access(path, os.R_OK):
  59. logger.warning('Path %r is not accessible, skipping' % path)
  60. return
  61.  
  62. if mode == 'exact':
  63. key = self.keyify(prefix, f)
  64. if key in self.db:
  65. self.db[key] = self.db[key] + [path]
  66. else:
  67. self.db[key] = [path]
  68. else:
  69. normalized_filename = self.normalize_filename(f)
  70. size = os.path.getsize(path)
  71.  
  72. if mode.startswith('hash_'):
  73. if mode == 'hash_store_name': # the size can vary, name is exact. I.e. filename to path mapping
  74. key = self.keyify(normalized_filename)
  75. elif mode == 'hash_store_size': # the name can vary, size is exact (same db can be used for slow-mo). I.e. size to path mapping
  76. key = str('s:%i' % size)
  77. self.db[key] = self.db.get(key, []) + [path]
  78. else:
  79. if mode == 'unsplitable':
  80. split_root = root.split(os.sep)
  81. p_index = len(split_root) - split_root[::-1].index(unsplitable_name) - 1
  82. p = [self.normalize_filename(x) for x in split_root[p_index:]] + [normalized_filename]
  83.  
  84. key = self.keyify(size, *p)
  85. elif mode == 'normal':
  86. key = self.keyify(size, normalized_filename)
  87.  
  88. if key in self.db: # check if same file
  89. old_inode = os.stat(self.db[key]).st_ino
  90. new_inode = os.stat(path).st_ino
  91. if old_inode != new_inode:
  92. logger.warning('Duplicate key %s and %s' % (path, self.db[key]))
  93.  
  94. self.db[key] = path
  95.  
  96. def skip_file(self, f):
  97. """
  98. Checks if a filename is in the skiplist
  99. """
  100. normalized_filename = self.normalize_filename(f)
  101. for ignore_file in self.ignore_files:
  102. if fnmatch(normalized_filename, ignore_file):
  103. return True
  104. return False
  105.  
  106. def rebuild(self, paths=None):
  107. """
  108. Scans the paths for files and rebuilds the database.
  109. """
  110. if paths:
  111. logger.info('Just adding new paths')
  112. else:
  113. logger.info('Rebuilding database')
  114. self.truncate()
  115. paths = self.paths
  116.  
  117. unsplitable_paths = set()
  118. if self.unsplitable_mode or self.exact_mode:
  119. logger.info('Special modes enabled, doing a preliminary scan')
  120. for root_path in paths:
  121. logger.info('Preliminary scanning %s' % root_path)
  122. for root, dirs, files in os.walk(root_path):
  123. if is_unsplitable(files):
  124. sep_root = root.split(os.sep)
  125. name = get_root_of_unsplitable(root.split(os.sep))
  126. while sep_root[-1] != name:
  127. sep_root.pop()
  128. path = os.path.join(*sep_root)
  129. logger.debug('Found unsplitable path %r' % path)
  130. unsplitable_paths.add(path)
  131. logger.info('Done preliminary scanning %s' % root_path)
  132.  
  133. for root_path in paths:
  134. logger.info('Scanning %s' % root_path)
  135. for root, dirs, files in os.walk(root_path):
  136. unsplitable = False
  137. if self.unsplitable_mode or self.exact_mode:
  138. sep_root = root.split(os.sep)
  139. while sep_root:
  140. if os.path.join(*sep_root) in unsplitable_paths:
  141. break
  142. sep_root.pop()
  143.  
  144. if sep_root:
  145. unsplitable = True
  146. if self.unsplitable_mode:
  147. unsplitable_name = sep_root[-1]
  148. logger.info('Looks like we found a unsplitable release in %r' % (os.sep.join(sep_root)))
  149. for f in files:
  150. self.insert_into_database(root, f, 'unsplitable', unsplitable_name=unsplitable_name)
  151. continue
  152.  
  153. if not unsplitable:
  154. if self.normal_mode:
  155. for f in files:
  156. if self.skip_file(f):
  157. continue
  158.  
  159. self.insert_into_database(root, f, 'normal')
  160.  
  161. if self.exact_mode:
  162. for f in files:
  163. self.insert_into_database(root, f, 'exact', 'f')
  164.  
  165. for d in dirs:
  166. self.insert_into_database(root, d, 'exact', 'd')
  167.  
  168. if self.hash_name_mode or self.hash_size_mode or self.hash_slow_mode:
  169. for f in files:
  170. if self.hash_size_mode or self.hash_slow_mode:
  171. self.insert_into_database(root, f, 'hash_store_size')
  172.  
  173. if self.hash_name_mode:
  174. self.insert_into_database(root, f, 'hash_store_name')
  175.  
  176.  
  177. logger.info('Done scanning %s' % root_path)
  178. self.db.sync()
  179.  
  180. def clear_hash_size_table(self):
  181. """
  182. Clears the hash size table.
  183. """
  184. self.hash_size_table = None
  185.  
  186. def build_hash_size_table(self):
  187. """
  188. Builds a table of all sizes to make lookups faster for varying sizes.
  189. """
  190. if self.hash_size_table is not None:
  191. logger.debug('Hash size table already built, skipping')
  192. return
  193.  
  194. self.hash_size_table = set()
  195. for key in self.db.keys():
  196. if not key.startswith('s:'):
  197. continue
  198.  
  199. _, size = key.split(':')
  200. self.hash_size_table.add(int(size))
  201.  
  202. self.hash_size_table = sorted(self.hash_size_table)
  203.  
  204. def find_hash_varying_size(self, size):
  205. """
  206. Looks for a file with close to size in the database.
  207. The function assumes build_hash_size_table has already been called.
  208.  
  209. Returns a list of paths ordered by how close they are to the size.
  210. """
  211. size_span = size * self.hash_mode_size_varying / 100
  212. min_size_span, max_size_span = size - size_span, size + size_span
  213.  
  214. found_sizes = []
  215. for db_size in self.hash_size_table:
  216. if db_size < min_size_span:
  217. continue
  218.  
  219. if db_size > max_size_span:
  220. break
  221.  
  222. found_sizes.append(db_size)
  223.  
  224. found_sizes = sorted(found_sizes, key=lambda x:abs(x-size))
  225. result = []
  226. for found_size in found_sizes:
  227. key = str('s:%i' % found_size)
  228. result += self.db.get(key, [])
  229.  
  230. return result
  231.  
  232. def find_hash_size(self, size):
  233. """
  234. Looks for a file with exact size in the database.
  235.  
  236. Returns a list of paths.
  237. """
  238. return self.db.get(str('s:%s' % size), [])
  239.  
  240. def find_hash_name(self, f):
  241. """
  242. Looks for a file with name f in the database.
  243.  
  244. Returns a list of paths.
  245. """
  246. key = self.keyify(self.normalize_filename(f))
  247.  
  248. return self.db.get(key, [])
  249.  
  250. def find_unsplitable_file_path(self, rls, f, size):
  251. """
  252. Looks for a file in the database.
  253. """
  254. f = [self.normalize_filename(x) for x in f]
  255. key = self.keyify(size, self.normalize_filename(rls), *f)
  256.  
  257. return self.db.get(key)
  258.  
  259. def find_exact_file_path(self, prefix, rls):
  260. """
  261. Looks for a name in the database.
  262. """
  263. key = self.keyify(prefix, rls)
  264.  
  265. return self.db.get(key)
  266.  
  267. def find_file_path(self, f, size):
  268. """
  269. Looks for a file in the database.
  270. """
  271. key = self.keyify(size, self.normalize_filename(f))
  272.  
  273. return self.db.get(key)
  274.  
  275. def keyify(self, size, *names):
  276. """
  277. Turns a name and size into a key that can be stored in the database.
  278. """
  279. key = '%s|%s' % (size, '|'.join(names))
  280. logger.debug('Keyify: %s' % key)
  281.  
  282. return hashlib.sha256(key.encode('utf-8')).hexdigest()
  283.  
  284. def normalize_filename(self, filename):
  285. """
  286. Normalizes a filename to better detect simlar files.
  287. """
  288. return filename.replace(' ', '_').lower()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement