Meszias

Removing duplicate files

Nov 11th, 2014
203
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.45 KB | None | 0 0
  1. # This code lists a large number of files (~600). They are compressed
  2. # PostgreSQL dump files from 2 databases. The file name patterns are:
  3. # 20140510-0200_calipso_teste.sql.xz
  4. # 20140608-0300_calipso_sistema.sql.xz
  5. # where the prefix 20140608-0300 means 08/06/2014 at 3:00 am; both share
  6. # the same suffix, .sql.xz
  7.  
  8. # The purpose here is to find which files are identical (a binary comparison is needed; filecmp.cmp can do it)
  9. # and keep the newest of them, since sometimes those databases don't change for a whole day.
  10. # It has to create a list of files to delete.
  11.  
  12. from os import listdir
  13. from os.path import isfile, join, getctime, getsize
  14. import filecmp
  15.  
  16. mypath = '.'
  17. sufix = '.sql.xz'
  18. sufix_len = len(sufix)
  19. prefix_len = 14
  20. onlyfiles = [ f for f in listdir(mypath) if isfile(join(mypath,f)) ]
  21. compares = {}
  22.  
  23. for v in onlyfiles:
  24.     for e in onlyfiles:
  25.         if  e == v  or e in compares.get(v, []) or v in compares.get(e, []) or (e[prefix_len:] != v[prefix_len:]) or (v[-sufix_len:] != sufix or e[-sufix_len:] != sufix):
  26.             #print "Ignore %s with %s" % (v, e)
  27.             continue
  28.  
  29.         print "compare %s with %s" % (v, e)
  30.         if filecmp.cmp(v, e):
  31.             if v in compares:
  32.                 compares[v].add(e)
  33.             elif e in compares:
  34.                 compares[e].add(v)
  35.             else:
  36.                 compares[v] = set()
  37.                 compares[v].add(e)
  38.  
  39. print "all:"
  40. from pprint import pprint
  41. pprint(compares)
Advertisement
Add Comment
Please, Sign In to add comment