Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from __future__ import division, unicode_literals

import hashlib
import logging
import os
import shelve
from bisect import bisect_left, bisect_right
from fnmatch import fnmatch

from .utils import is_unsplitable, get_root_of_unsplitable
logger = logging.getLogger(__name__)


class Database(object):
    """
    Shelve-backed index used to match files on disk against torrents.

    Key layout, as produced by the insert methods below:

    * normal mode      -- sha256 of ``size|normalized_name``          -> path
    * unsplitable mode -- sha256 of ``size|release|...|name``         -> path
    * exact mode       -- sha256 of ``prefix|name`` (name as-is)      -> [paths]
    * hash name mode   -- sha256 of ``normalized_name|``              -> [paths]
    * hash size mode   -- the literal string ``s:<size>``             -> [paths]

    The instance can be used as a context manager; leaving the block closes
    the underlying shelf.
    """

    # 10% size variation from size on disk for the two scan modes
    # that allows size to vary
    hash_mode_size_varying = 10.0

    def __init__(self, db_file, paths, ignore_files, normal_mode, unsplitable_mode, exact_mode,
                 hash_name_mode, hash_size_mode, hash_slow_mode):
        """
        Database used to match files and torrents.

        :param db_file: filename handed to ``shelve.open``.
        :param paths: root paths scanned by a full ``rebuild()``.
        :param ignore_files: fnmatch patterns; matching files are left out of
            normal mode (patterns are normalized like filenames).
        :param normal_mode: index files by size and normalized name.
        :param unsplitable_mode: index files below an unsplitable release by
            size and their path relative to the release.
        :param exact_mode: index file and directory names as-is.
        :param hash_name_mode: index name -> paths.
        :param hash_size_mode: index size -> paths.
        :param hash_slow_mode: also stores the size -> paths mapping.
        """
        self.db = shelve.open(db_file)
        self.db_file = db_file
        self.paths = paths
        self.ignore_files = [self.normalize_filename(x) for x in ignore_files]
        self.normal_mode = normal_mode
        self.unsplitable_mode = unsplitable_mode
        self.exact_mode = exact_mode
        self.hash_name_mode = hash_name_mode
        self.hash_size_mode = hash_size_mode
        self.hash_slow_mode = hash_slow_mode
        self.hash_mode = hash_name_mode or hash_size_mode or hash_slow_mode
        self.hash_size_table = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """
        Closes the underlying shelf. The instance must not be used afterwards.
        """
        self.db.close()

    def truncate(self):
        """
        Truncates the database
        """
        self.db.close()
        self.db = shelve.open(self.db_file, flag='n')
        # The cached size table describes the old content, drop it.
        self.hash_size_table = None
        logger.info('Truncated the database')

    def insert_into_database(self, root, f, mode, prefix=None, unsplitable_name=None):
        """
        Wraps the database insert to catch exceptions

        Only ``UnicodeDecodeError`` is swallowed (and logged); anything else
        propagates to the caller.
        """
        try:
            self._insert_into_database(root, f, mode, prefix, unsplitable_name)
        except UnicodeDecodeError:
            logger.error('Failed to insert %r / %r / %r', root, f, mode)

    def _size_key(self, size):
        """
        Returns the database key used for the size -> paths mapping.
        """
        # str() keeps the key a native string under unicode_literals.
        return str('s:%s' % size)

    def _append_path(self, key, path):
        """
        Adds path to the list stored under key, without duplicating it.
        """
        known_paths = self.db.get(key, [])
        if path not in known_paths:
            # Re-assign instead of mutating: the shelf only persists on assignment.
            self.db[key] = known_paths + [path]

    def _insert_into_database(self, root, f, mode, prefix=None, unsplitable_name=None):
        """
        Does the actual insertion into the database.

        :param root: directory containing ``f``.
        :param f: file (or, in exact mode, directory) name.
        :param mode: one of 'exact', 'hash_store_name', 'hash_store_size',
            'unsplitable' or 'normal'.
        :param prefix: first key component in exact mode.
        :param unsplitable_name: release directory name, must be a component
            of ``root`` in unsplitable mode.
        :raises ValueError: if ``mode`` is not one of the known modes.
        """
        path = os.path.abspath(os.path.join(root, f))
        if not os.access(path, os.R_OK):
            logger.warning('Path %r is not accessible, skipping', path)
            return

        if mode == 'exact':
            self._append_path(self.keyify(prefix, f), path)
            return

        normalized_filename = self.normalize_filename(f)
        size = os.path.getsize(path)

        if mode == 'hash_store_name':
            # the size can vary, name is exact. I.e. filename to path mapping
            self._append_path(self.keyify(normalized_filename), path)
            return
        if mode == 'hash_store_size':
            # the name can vary, size is exact (same db can be used for slow-mo).
            # I.e. size to path mapping
            self._append_path(self._size_key(size), path)
            return

        if mode == 'unsplitable':
            split_root = root.split(os.sep)
            # Last occurrence of the release name in root marks where the key path starts.
            p_index = len(split_root) - split_root[::-1].index(unsplitable_name) - 1
            p = [self.normalize_filename(x) for x in split_root[p_index:]] + [normalized_filename]
            key = self.keyify(size, *p)
        elif mode == 'normal':
            key = self.keyify(size, normalized_filename)
        else:
            raise ValueError('Unknown insert mode %r' % (mode,))

        old_path = self.db.get(key)
        if old_path is not None:  # check if same file
            try:
                old_stat = os.stat(old_path)
                new_stat = os.stat(path)
            except OSError:
                # The stored path may have disappeared since it was indexed;
                # the entry is simply replaced below.
                logger.debug('Could not compare %s and %s, replacing entry', path, old_path)
            else:
                if (old_stat.st_dev, old_stat.st_ino) != (new_stat.st_dev, new_stat.st_ino):
                    logger.warning('Duplicate key %s and %s', path, old_path)
        self.db[key] = path

    def skip_file(self, f):
        """
        Checks if a filename is in the skiplist
        """
        normalized_filename = self.normalize_filename(f)
        return any(fnmatch(normalized_filename, ignore_file) for ignore_file in self.ignore_files)

    def _find_unsplitable_paths(self, paths):
        """
        Walks paths and returns the set of directories that are roots of unsplitable releases.
        """
        unsplitable_paths = set()
        for root_path in paths:
            logger.info('Preliminary scanning %s', root_path)
            for root, dirs, files in os.walk(root_path):
                if not is_unsplitable(files):
                    continue

                sep_root = root.split(os.sep)
                # Hand over a copy so the helper cannot alter the list trimmed below.
                name = get_root_of_unsplitable(list(sep_root))
                while sep_root and sep_root[-1] != name:
                    sep_root.pop()
                if not sep_root:
                    logger.warning('Unsplitable root %r is not part of %r, skipping', name, root)
                    continue

                path = os.sep.join(sep_root)
                logger.debug('Found unsplitable path %r', path)
                unsplitable_paths.add(path)
            logger.info('Done preliminary scanning %s', root_path)
        return unsplitable_paths

    def _unsplitable_root(self, root, unsplitable_paths):
        """
        Returns the components of the unsplitable path containing root, or an empty list.
        """
        if not unsplitable_paths:
            return []
        sep_root = root.split(os.sep)
        # Strip trailing components until a known unsplitable path (or nothing) is left.
        while sep_root and os.sep.join(sep_root) not in unsplitable_paths:
            sep_root.pop()
        return sep_root

    def rebuild(self, paths=None):
        """
        Scans the paths for files and rebuilds the database.

        :param paths: if given, only these paths are scanned and added to the
            existing database; otherwise the database is truncated and
            ``self.paths`` is scanned.
        """
        if paths:
            logger.info('Just adding new paths')
        else:
            logger.info('Rebuilding database')
            self.truncate()
            paths = self.paths

        unsplitable_paths = set()
        if self.unsplitable_mode or self.exact_mode:
            logger.info('Special modes enabled, doing a preliminary scan')
            unsplitable_paths = self._find_unsplitable_paths(paths)

        for root_path in paths:
            logger.info('Scanning %s', root_path)
            for root, dirs, files in os.walk(root_path):
                sep_root = self._unsplitable_root(root, unsplitable_paths)

                if sep_root and self.unsplitable_mode:
                    unsplitable_name = sep_root[-1]
                    logger.info('Looks like we found a unsplitable release in %r', os.sep.join(sep_root))
                    for f in files:
                        self.insert_into_database(root, f, 'unsplitable', unsplitable_name=unsplitable_name)
                    # Files of an unsplitable release are not indexed in any other mode.
                    continue

                if not sep_root and self.normal_mode:
                    for f in files:
                        if not self.skip_file(f):
                            self.insert_into_database(root, f, 'normal')

                if self.exact_mode:
                    for f in files:
                        self.insert_into_database(root, f, 'exact', 'f')
                    for d in dirs:
                        self.insert_into_database(root, d, 'exact', 'd')

                if self.hash_name_mode or self.hash_size_mode or self.hash_slow_mode:
                    store_size = self.hash_size_mode or self.hash_slow_mode
                    for f in files:
                        if store_size:
                            self.insert_into_database(root, f, 'hash_store_size')
                        if self.hash_name_mode:
                            self.insert_into_database(root, f, 'hash_store_name')
            logger.info('Done scanning %s', root_path)

        self.db.sync()
        # Sizes may have been added, a previously built table is out of date.
        self.hash_size_table = None

    def clear_hash_size_table(self):
        """
        Clears the hash size table.
        """
        self.hash_size_table = None

    def build_hash_size_table(self):
        """
        Builds a table of all sizes to make lookups faster for varying sizes.

        The table is a sorted list of every size that has an ``s:<size>`` key.
        """
        if self.hash_size_table is not None:
            logger.debug('Hash size table already built, skipping')
            return

        self.hash_size_table = sorted(set(
            int(key[2:]) for key in self.db.keys() if key.startswith('s:')
        ))

    def find_hash_varying_size(self, size):
        """
        Looks for a file with close to size in the database.

        The size table is built on first use if build_hash_size_table has
        not been called yet.

        Returns a list of paths ordered by how close they are to the size.
        """
        if self.hash_size_table is None:
            self.build_hash_size_table()

        size_span = size * self.hash_mode_size_varying / 100
        # The table is sorted, so the inclusive [size - span, size + span] window is one slice.
        low = bisect_left(self.hash_size_table, size - size_span)
        high = bisect_right(self.hash_size_table, size + size_span)
        found_sizes = sorted(self.hash_size_table[low:high], key=lambda x: abs(x - size))

        result = []
        for found_size in found_sizes:
            result.extend(self.db.get(self._size_key(found_size), []))
        return result

    def find_hash_size(self, size):
        """
        Looks for a file with exact size in the database.

        Returns a list of paths.
        """
        return self.db.get(self._size_key(size), [])

    def find_hash_name(self, f):
        """
        Looks for a file with name f in the database.

        Returns a list of paths.
        """
        key = self.keyify(self.normalize_filename(f))
        return self.db.get(key, [])

    def find_unsplitable_file_path(self, rls, f, size):
        """
        Looks for a file in the database.

        :param rls: name of the release directory.
        :param f: sequence of path components of the file below the release.
        :param size: file size.
        :returns: the stored path or None.
        """
        f = [self.normalize_filename(x) for x in f]
        key = self.keyify(size, self.normalize_filename(rls), *f)
        return self.db.get(key)

    def find_exact_file_path(self, prefix, rls):
        """
        Looks for a name in the database.

        Returns the value stored for prefix and name, or None.
        """
        key = self.keyify(prefix, rls)
        return self.db.get(key)

    def find_file_path(self, f, size):
        """
        Looks for a file in the database.

        Returns the path stored for the size and normalized name, or None.
        """
        key = self.keyify(size, self.normalize_filename(f))
        return self.db.get(key)

    def keyify(self, size, *names):
        """
        Turns a name and size into a key that can be stored in the database.

        The parts are joined with ``|`` and the sha256 hex digest of the
        UTF-8 encoded result is returned.
        """
        key = '%s|%s' % (size, '|'.join(names))
        logger.debug('Keyify: %s', key)
        return hashlib.sha256(key.encode('utf-8')).hexdigest()

    def normalize_filename(self, filename):
        """
        Normalizes a filename to better detect similar files.

        Spaces become underscores and the name is lowercased.
        """
        return filename.replace(' ', '_').lower()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement