Advertisement
mjb

find_invalid_release_dates.py

mjb
Oct 30th, 2019
305
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.95 KB | None | 0 0
  1. """
  2. Find releases with invalid release dates
  3. """
  4.  
  5. # for Python 2 compatibility
  6. from __future__ import print_function
  7.  
  8. import re
  9. from sys import stderr
  10. from parse_discogs_dump import ElementProcessor, process_dump_file
  11.  
  12. # Try our best to write UTF-8 to stdout (the shell may still wrap stdout, though)
  13. try:
  14.     import sys
  15.     sys.stdout.reconfigure(encoding='utf-8')
  16. except AttributeError:
  17.     import codecs
  18.     utf8_writer = codecs.getwriter('UTF-8')
  19.     sys.stdout = utf8_writer(sys.stdout, errors='replace')
  20.  
  21.  
  22. valid_short_date = re.compile(u'(\d{4})$')
  23. valid_long_date = re.compile(u'(\d{4})-(\d{2})-(\d{2})$')
  24.  
  25. class ReleaseDateChecker(ElementProcessor):
  26.     """
  27.     An object which processes release elements:
  28.     For each invalid release date found (if any), print the release URL, a
  29.     comma, and the quoted field value.
  30.     If interrupted, print the parsed element count and last processed element ID.
  31.     """
  32.     def __init__(self):
  33.         self.counter = 0
  34.         self.item_id = None
  35.         self.interesting_element_name = 'release'
  36.  
  37.     def process(self, elem):
  38.         self.counter += 1
  39.         self.item_id = elem.get('id')
  40.         elems = elem.findall('.//released')
  41.         for e in elems:
  42.             date = e.text
  43.             if date is not None and valid_short_date.match(date) is None:
  44.                 match = valid_long_date.match(date)
  45.                 if match is None or int(match.group(2)) < 1:
  46.                     stderr.write('.')
  47.                     stderr.flush()
  48.                     print(u'https://www.discogs.com/release/%s - release date is "%s"' % (self.item_id, date))
  49.  
  50.     def handle_interruption(self, e):
  51.         print('\nInterrupted after %d %ss. Last %s id parsed: %s' % (self.counter, self.interesting_element_name, self.interesting_element_name, self.item_id), file=stderr)
  52.         raise
  53.  
  54.  
  55. # when run from the command line, do this stuff
  56. if __name__ == "__main__":
  57.     from sys import argv
  58.     if len(argv) < 2:
  59.         raise RuntimeError("A dump file path must be provided as the first argument.")
  60.     processor = ReleaseDateChecker()
  61.     stderr.flush()
  62.     process_dump_file(argv[1], processor)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement