Advertisement
Guest User

Untitled

a guest
Feb 10th, 2016
48
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.88 KB | None | 0 0
  1. # duplicates_csv.py - duplicate file report as csv
  2.  
  3. import os
  4. import csv
  5. import hashlib
  6. import datetime
  7.  
  8. import sqlalchemy as sa
  9. import sqlalchemy.ext.declarative
  10.  
STARTDIR = '.'                       # root directory to scan for duplicates
FSENCODING = 'latin-1'               # encoding of byte filesystem paths (Python 2)
DBFILE = 'duplicates_csv.sqlite3'    # scratch SQLite database holding file metadata
OUTFILE = 'duplicates.csv'           # destination of the CSV duplicate report

engine = sa.create_engine('sqlite:///%s' % DBFILE)
  17.  
  18.  
  19. class File(sa.ext.declarative.declarative_base()):
  20.  
  21. __tablename__ = 'file'
  22.  
  23. location = sa.Column(sa.Text, primary_key=True)
  24. md5sum = sa.Column(sa.Text, index=True)
  25. size = sa.Column(sa.Integer, nullable=False)
  26. mtime = sa.Column(sa.DateTime, nullable=False)
  27. # sqlite3 string funcs cannot right extract, denormalize
  28. name = sa.Column(sa.Text, nullable=False)
  29. ext = sa.Column(sa.Text, nullable=False)
  30.  
  31. @staticmethod
  32. def getinfo(start, path, encoding=FSENCODING):
  33. location = os.path.relpath(path, start).decode(encoding).replace('\\', '/')
  34. name = os.path.basename(path).decode(encoding)
  35. ext = os.path.splitext(name)[1].lstrip('.')
  36. statinfo = os.stat(path)
  37. size = statinfo.st_size
  38. mtime = datetime.datetime.fromtimestamp(statinfo.st_mtime)
  39. return {'location': location, 'name': name, 'ext': ext, 'size': size, 'mtime': mtime}
  40.  
  41. @classmethod
  42. def from_path(cls, start, path, encoding=FSENCODING):
  43. kwargs = cls.getinfo(start, path, encoding=encoding)
  44. return cls(**kwargs)
  45.  
  46. def __repr__(self):
  47. return '<%s %r>' % (self.__class__.__name__, self.location)
  48.  
  49.  
  50. def md5sum(filename, bufsize=32768):
  51. md = hashlib.md5()
  52. with open(filename, 'rb') as fd:
  53. while True:
  54. data = fd.read(bufsize)
  55. if not data:
  56. break
  57. md.update(data)
  58. return md.hexdigest()
  59.  
  60.  
  61. def build_db(engine=engine, start=STARTDIR, recreate=False, verbose=False):
  62. dbfile = engine.url.database
  63.  
  64. if recreate and os.path.exists(dbfile):
  65. os.remove(dbfile)
  66.  
  67. if os.path.exists(dbfile):
  68. return
  69.  
  70. File.metadata.create_all(engine)
  71.  
  72. with engine.begin() as conn:
  73. insert_fileinfos(conn, start, verbose)
  74.  
  75. with engine.begin() as conn:
  76. add_md5sums(conn, start, verbose)
  77.  
  78.  
  79. def insert_fileinfos(conn, start, verbose):
  80. conn = conn.execution_options(compiled_cache={})
  81. insert_file = sa.insert(File, bind=conn).execute
  82. for root, dirs, files in os.walk(start):
  83. if verbose:
  84. print(root)
  85. values = [File.getinfo(start, os.path.join(root, f)) for f in files]
  86. if values:
  87. insert_file(values)
  88.  
  89.  
  90. def add_md5sums(conn, start, verbose):
  91. conn = conn.execution_options(compiled_cache={})
  92. query = sa.select([File.location])\
  93. .where(File.size.in_(sa.select([File.size])\
  94. .group_by(File.size).having(sa.func.count() > 1)))\
  95. .order_by(File.location)
  96.  
  97. update_file = sa.update(File, bind=conn)\
  98. .where(File.location == sa.bindparam('loc'))\
  99. .values(md5sum=sa.bindparam('md5sum')).execute
  100.  
  101. for location, in conn.execute(query):
  102. if verbose:
  103. print(location)
  104. digest = md5sum(os.path.join(start, location))
  105. update_file(loc=location, md5sum=digest)
  106.  
  107.  
  108. def duplicates_query(by_location=False):
  109. query = sa.select([File])\
  110. .where(File.md5sum.in_(sa.select([File.md5sum])\
  111. .group_by(File.md5sum).having(sa.func.count() > 1)))
  112. if by_location:
  113. query = query.order_by(File.location)
  114. else:
  115. query = query.order_by(File.md5sum, File.location)
  116. return query
  117.  
  118.  
  119. def to_csv(results, filename=OUTFILE, encoding='utf-8', dialect='excel'):
  120. with open(filename, 'wb') as fd:
  121. csvwriter = csv.writer(fd, dialect=dialect)
  122. csvwriter.writerow([k.encode(encoding) for k in results.keys()])
  123. for row in results:
  124. csvwriter.writerow([str(c).encode(encoding) for c in row])
  125.  
  126.  
  127. if __name__ == '__main__':
  128. build_db()
  129. query = duplicates_query()
  130. to_csv(engine.execute(query))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement