Advertisement
Pseudobyte

List Dupe Finder

Jan 2nd, 2016
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 11.34 KB | None | 0 0
  1. import re
  2.  
  3. def listJoin(items):
  4.   if (len(items) == 0):
  5.     return ''
  6.   elif (len(items) == 1):
  7.     return items[0]
  8.   elif (len(items) == 2):
  9.     return ' and '.join(items)
  10.   else:
  11.     return ', '.join(items[:-1] + ['and ' + items[-1]])
  12.  
  13.  
  14.  
  15. class Thing(object):
  16.   def __init__(self, name, id):
  17.     object.__init__(self)
  18.     self.name = name
  19.     self.id = id
  20.     self.counts = {}
  21.    
  22.   def count(self, group):
  23.     self.counts[group.id] = self.counts.get(group.id, 0) + 1
  24.  
  25. class Group(object):
  26.   def __init__(self, name, id):
  27.     object.__init__(self)
  28.     self.name = name
  29.     self.id = id
  30.     self.counts = {}
  31.    
  32.   def count(self, thing):
  33.     self.counts[thing.id] = self.counts.get(thing.id, 0) + 1
  34.  
  35. class GroupGroup(object):
  36.   def __init__(self, names= None, contents= None):
  37.     object.__init__(self)
  38.     self.things = {}
  39.     self.groups = {}
  40.  
  41.     self.ruleHandlers = {
  42.       'common': self.getCommons,
  43.       'dupe': self.getDupes,
  44.       'unique': self.getUniques
  45.     }
  46.  
  47.     self.renderHandlers = {
  48.       'short': self.shortReport,
  49.       'verbose': self.verboseReport
  50.     }
  51.  
  52.     self.verboseRenderers = {
  53.       'common': self.verboseCommon,
  54.       'dupe': self.verboseDupe,
  55.       'unique': self.verboseUnique
  56.     }
  57.  
  58.     self.update(names, contents)
  59.  
  60.   def update(self, names= None, contents= None):
  61.     if (names):
  62.       self.addGroups(names)
  63.     if (names and contents):
  64.       self.addThings(contents)
  65.  
  66.   def addGroups(self, names):
  67.     for name in names:
  68.       self.addGroup(name)
  69.  
  70.   def addThings(self, contents):
  71.     for i in range(len(contents)):
  72.       for thing in contents[i]:
  73.         self.addThing(thing, self.groups[i])
  74.  
  75.   def thingName(self, thingid, tag= None):
  76.     name = self.things[thingid].name
  77.     if (tag == None):
  78.       return name
  79.     else:
  80.       return str(tag) + name + str(-tag)
  81.  
  82.   def groupName(self, groupid):
  83.     return self.groups[groupid].name
  84.    
  85.   def addGroup(self, name):
  86.     groupid = len(self.groups)
  87.     self.groups[groupid] = Group(name, groupid)
  88.     return self.groups[groupid]
  89.    
  90.   def addThing(self, name, group):
  91.     thingid = name.lower()
  92.     if (thingid not in self.things):
  93.       self.things[thingid] = Thing(name, thingid)
  94.     thing = self.things[thingid]
  95.     group.count(thing)
  96.     thing.count(group)
  97.     return thing
  98.  
  99.   def getCommons(self, arg):
  100.     commons = {}
  101.     for thing in self.things:
  102.       tcounts = self.things[thing].counts
  103.       if (len(tcounts) in arg):
  104.         commons[thing] = tcounts.keys()
  105.     return commons
  106.  
  107.   def getDupes(self, arg):
  108.     dupes = {}
  109.     for thing in self.things:
  110.       tcounts = self.things[thing].counts
  111.       counts = {x:tcounts[x] for x in tcounts if tcounts[x] in arg}
  112.       if (counts):
  113.         dupes[thing] = counts
  114.     return dupes
  115.  
  116.   def getUniques(self, arg):
  117.     uniques = {}
  118.     for thing in self.things:
  119.       tcounts = self.things[thing].counts
  120.       if (len(tcounts) == 1 and list(tcounts.values())[0] == 1):
  121.         uniques[thing] = list(tcounts.keys())[0]
  122.     return uniques
  123.  
  124.   def buildReport(self, **args):
  125.     data = {}
  126.     for arg in args:
  127.       if (args[arg] is not False and arg in self.ruleHandlers):
  128.         data[arg] = self.ruleHandlers[arg](args[arg])
  129.     report = {}
  130.     for thingid in self.things:
  131.       thingdata = {}
  132.       for rule in data:
  133.         if (thingid in data[rule]):
  134.           thingdata[rule] = data[rule][thingid]
  135.       if (args['all'] and len(thingdata) < len(data)):
  136.         continue
  137.       if (len(thingdata) > 0):
  138.         report[thingid] = thingdata
  139.     return report
  140.  
  141.   def renderReport(self, report, format, **args):
  142.     return self.renderHandlers[format](report, **args)
  143.  
  144.   def shortReport(self, report, **args):
  145.     if (len(report) > 0):
  146.       report = [self.thingName(x, args['tag']) for x in report]
  147.       report.sort()
  148.       return '\n'.join(report)
  149.     else:
  150.       return ''
  151.  
  152.   def verboseCommon(self, commons, **args):
  153.     return 'common to %s' % listJoin([self.groupName(x) for x in commons])
  154.  
  155.   def verboseDupe(self, dupes, **args):
  156.     items = ['%s (%d)' % (self.groupName(x), dupes[x]) for x in dupes]
  157.     return 'duplicated in %s' % listJoin(items)
  158.  
  159.   def verboseUnique(self, unique, **args):
  160.     return 'unique to %s' % self.groupName(unique)
  161.  
  162.   def verboseReport(self, report, **args):
  163.     items = []
  164.     rules = sorted(list(self.verboseRenderers.keys()))
  165.     things = sorted(list(report.keys()), key= lambda x: self.thingName(x))
  166.     for item in things:
  167.       entry = []
  168.       for rule in rules:
  169.         if rule in report[item]:
  170.           entry.append(self.verboseRenderers[rule](report[item][rule], **args))
  171.       entry = self.thingName(item, args['tag']) + ': ' + '; '.join(entry)
  172.       items.append(entry)
  173.     return '\n'.join(items)
  174.  
  175.  
  176.  
  177. def oneItemPerLineCodec(data):
  178.   # strip lines and remove comments
  179.   data = [x.strip() for x in data.split('\n')]
  180.   data = [x for x in data if (x == '' or x[0] != '#')]
  181.  
  182.   # break up groups by blank lines
  183.   groups = []
  184.   while ('' in data):
  185.     i = data.index('')
  186.     if (i != 0):
  187.       groups.append(data[:i])
  188.     data = data[data.index('') + 1:]
  189.   if (data):
  190.     groups.append(data)
  191.  
  192.   # determine named groups
  193.   names = []
  194.   for group in groups:
  195.     if (len(group) > 1 and re.match(r'-+$', group[1])):
  196.       names.append(group[0])
  197.       del group[:2]
  198.     else:
  199.       names.append(None)
  200.  
  201.   # autoname unnamed groups
  202.   c = 1
  203.   for i in range(len(names)):
  204.     if (names[i] is None):
  205.       while (('Group %d' % c) in names): c += 1
  206.       names[i] = 'Group %d' % c
  207.       c += 1
  208.  
  209.   return names, groups
  210.  
  211.  
  212.  
  213. class Tag(object):
  214.   tagRe = [
  215.     (re.compile(r'\[(\w+)\]'), 'bb', True),
  216.     (re.compile(r'\[/(\w+)\]'), 'bb', False),
  217.     (re.compile(r'<(\w+)>'), 'html', True),
  218.     (re.compile(r'</(\w+)>'), 'html', False)
  219.   ]
  220.   renderers = {
  221.     'bb': lambda b, o: '[' +  ('' if o else '/') + b + ']',
  222.     'html': lambda b, o: '<' +  ('' if o else '/') + b + '>'
  223.   }
  224.   def __init__(self, string= None, body= None, kind= None, start= None):
  225.     object.__init__(self)
  226.     if string:
  227.       found = False
  228.       for rule in self.tagRe:
  229.         match = rule[0].match(string)
  230.         if (match):
  231.           self.body = match.group(1)
  232.           self.kind, self.start = rule[1:]
  233.           found = True
  234.           break
  235.       if (not found):
  236.         raise ValueError('unknown tag format')
  237.     else:
  238.       self.body = body
  239.       self.kind = kind
  240.       self.start = start
  241.  
  242.   def __str__(self):
  243.     return self.renderers[self.kind](self.body, self.start)
  244.  
  245.   def __neg__(self):
  246.     return Tag(None, self.body, self.kind, not self.start)
  247.  
  248.  
  249.  
  250. class IntRange(object):
  251.   def __init__(self, start, stop):
  252.     object.__init__(self)
  253.     if (start is None and stop is None):
  254.       raise ValueError('IntRange cannot be unbounded on both sides')
  255.     if (start is not None and stop is not None and start > stop):
  256.       start, stop = stop, start
  257.     self.start = start
  258.     self.stop = stop
  259.  
  260.   def __contains__(self, item):
  261.     if (self.start is not None and item < self.start):
  262.       return False
  263.     if (self.stop is not None and item > self.stop):
  264.       return False
  265.     return True
  266.  
  267.   def __or__(self, other):
  268.     if (other.start in self and other.stop in self):
  269.       return IntRange(self.start, self.stop)
  270.     elif (self.start in other and self.stop in other):
  271.       return IntRange(other.start, other.stop)
  272.     elif (self.start in other):
  273.       return IntRange(other.start, self.stop)
  274.     elif (self.stop in other):
  275.       return IntRange(self.start, other.stop)
  276.     else:
  277.       raise ValueError('ranges do not overlap')
  278.  
  279.   def __repr__(self):
  280.     return 'IntRange(%d,%d)' % (self.start, self.stop)
  281.  
  282. class MultiRange(object):
  283.   rangeRe = re.compile(r'(\d*):(\d*)')
  284.   def __init__(self, string= None):
  285.     object.__init__(self)
  286.     self.ranges = []
  287.     if (string):
  288.       for item in string.split(','):
  289.         match = self.rangeRe.match(item)
  290.         if (match):
  291.           start, stop = [None if x is '' else int(x) for x in match.groups()]
  292.           self.addRange(start, stop)
  293.         else:
  294.           item = int(item)
  295.           self.addRange(item, item)
  296.  
  297.   def addRange(self, start, stop):
  298.     r = IntRange(start, stop)
  299.     rs = self.ranges
  300.     self.ranges = []
  301.     while (len(rs)):
  302.       r2 = rs.pop()
  303.       if (r.start in r2 or r.stop in r2):
  304.         r = r | r2
  305.       else:
  306.         self.ranges.append(r2)
  307.     self.ranges.append(r)
  308.  
  309.   def __contains__(self, item):
  310.     for r in self.ranges:
  311.       if (item in r):
  312.         return True
  313.     return False
  314.  
  315.   def __repr__(self):
  316.     rs = []
  317.     self.ranges.sort(key= lambda x: x.start)
  318.     for r in self.ranges:
  319.       if (r.start == r.stop):
  320.         rs.append('%d' % r.start)
  321.       else:
  322.         rs.append('%d:%d' % tuple('' if x is None else x for x in (r.start, r.stop)))
  323.     return ','.join(rs)
  324.  
  325.  
  326.  
  327. if (__name__ == '__main__'):
  328.   import sys, argparse
  329.  
  330.   parser = argparse.ArgumentParser()
  331.  
  332.   # rule options
  333.   rulegroup = parser.add_argument_group('Filtering Rules')
  334.   rulegroup.add_argument('-c', '--common', help= 'shows items that some number of lists have in common; syntax is comma separated ranges or items, e.g. 2:5  4,6  2:  :3,5  (default: 2:)', nargs= '?', const= '2', default= False, type= MultiRange, metavar= 'vals')
  335.   rulegroup.add_argument('-d', '--dupe', help= 'shows items that are duplicated in at least one list some number of times; syntax is the same as -c (default: 2:)', nargs= '?', const= '2:', default= False, type= MultiRange, metavar= 'vals')
  336.   rulegroup.add_argument('-u', '--unique', action= 'store_true', help= 'shows items that only appear once and in only one list')
  337.   rulegroup.add_argument('-a', '--all', action= 'store_true', help= 'items must meet all other selected rules to appear in reports')
  338.  
  339.   # report format options
  340.   reportgroup = parser.add_mutually_exclusive_group()
  341.   reportgroup.add_argument('-v', '--verbose', action= 'store_const', help= 'generates a verbose report', dest= 'format', const= 'verbose')
  342.   parser.set_defaults(format= 'short')
  343.  
  344.   # other options
  345.   parser.add_argument('-g', '--groups', action= 'store_true', help= 'list all groups present in input files')
  346.   parser.add_argument('-o', '--out', help= 'file to output to instead of printing')
  347.   parser.add_argument('-t', '--tag', help= 'adds bbcode or html tags to wrap names of list items (default: [c]); note that the windows shell is wonky and to an html tag, you need to do it like so: ^<tagname^>', nargs= '?', const= '[c]', default= None, type= Tag)
  348.   parser.add_argument('infiles', help= 'file(s) to read from', metavar= 'filepath', nargs='+')
  349.  
  350.   args = parser.parse_args()
  351.  
  352.   ruleArgs = ['common', 'dupe', 'unique', 'all']
  353.   ruleArgs = {x:vars(args)[x] for x in ruleArgs}
  354.  
  355.   reportArgs = ['tag']
  356.   reportArgs = {x:vars(args)[x] for x in reportArgs}
  357.  
  358.   gg = GroupGroup()
  359.   for path in args.infiles:
  360.     fp = open(path)
  361.     names, groups = oneItemPerLineCodec(fp.read())
  362.     fp.close()
  363.     gg.update(names, groups)
  364.  
  365.   doRender = len([x for x in ruleArgs.values() if x is not False]) >= 1
  366.   if (doRender):
  367.     report = gg.buildReport(**ruleArgs)
  368.     if (report):
  369.       out = open(args.out, 'w') if args.out else sys.stdout
  370.       out.write(gg.renderReport(report, args.format, **reportArgs))
  371.     if (args.out):
  372.       out.close()
  373.  
  374.   if (args.groups):
  375.     for groupid in gg.groups:
  376.       print(gg.groups[groupid].name)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement