# Removing duplicate files

# This code lists a large number of files (~600 files); the files are
# compressed PostgreSQL dump files from 2 databases. The file name patterns are:
# 20140510-0200_calipso_teste.sql.xz
# 20140608-0300_calipso_sistema.sql.xz
# where the prefix 20140608-0300 means 08/06/2014 at 3:00 am; both share the same
# suffix, .sql.xz

# The purpose here is to verify which files are equal (the test has to be binary, and filecmp.cmp can do it)
# and keep the newest of them, since sometimes those databases don't change for a whole day.
# It has to create a list of files to delete.

from os import listdir
from os.path import isfile, join, getctime, getsize
import filecmp

from pprint import pprint

# Directory that is scanned for dump files.
mypath = '.'
# Only files whose name ends with this suffix are considered.
sufix = '.sql.xz'
# Kept for backward compatibility; the suffix test now uses str.endswith.
sufix_len = len(sufix)
# Length of the leading "YYYYMMDD-HHMM_" timestamp in each dump file name.
prefix_len = 14


def group_duplicates(names, directory, suffix, stamp_len):
    """Group byte-identical files under one representative per group.

    Two files are candidates for comparison only when both end with
    *suffix* and their names are equal once the first *stamp_len*
    characters are dropped.

    Args:
        names: file names (not paths) located in *directory*.
        directory: directory the names are relative to.
        suffix: required file name ending.
        stamp_len: number of leading characters to ignore when deciding
            whether two names belong together.

    Returns:
        A dict mapping a representative file name to the set of other file
        names with identical content. Names are processed in sorted order,
        so the representative is the smallest name of its group. Files
        without any duplicate do not appear in the dict.
    """
    duplicates = {}
    # For every name tail: the files seen so far whose contents all differ.
    representatives = {}
    for name in sorted(names):
        if not name.endswith(suffix):
            continue
        known = representatives.setdefault(name[stamp_len:], [])
        for rep in known:
            print("compare %s with %s" % (rep, name))
            # shallow=False forces a real content comparison; the default
            # would trust matching size + mtime without reading the files.
            same = filecmp.cmp(join(directory, rep), join(directory, name),
                               shallow=False)
            if same:
                duplicates.setdefault(rep, set()).add(name)
                break
        else:
            # Differs from every known content: start a new group.
            known.append(name)
    return duplicates


onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
compares = group_duplicates(onlyfiles, mypath, sufix, prefix_len)

print("all:")
pprint(compares)