Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- Find releases with invalid release dates
- """
- # for Python 2 compatibility
- from __future__ import print_function
- import re
- from sys import stderr
- from parse_discogs_dump import ElementProcessor, process_dump_file
# Try our best to write UTF-8 to stdout (the shell may still wrap stdout, though)
import sys

try:
    # Python 3.7+: switch the existing text stream to UTF-8 in place.
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    # No reconfigure() available: Python 2, or Python 3 before 3.7.
    import codecs
    utf8_writer = codecs.getwriter('UTF-8')
    # The codecs writer hands encoded *bytes* to the stream it wraps. A
    # Python 3 text stream rejects bytes, so wrap its underlying binary
    # buffer when there is one; Python 2's stdout has no .buffer and accepts
    # bytes directly, so it is wrapped as-is.
    sys.stdout = utf8_writer(getattr(sys.stdout, 'buffer', sys.stdout), errors='replace')
# Raw strings keep the regex escape \d from being parsed as an (invalid)
# string escape sequence, which newer Python versions warn about.
# Both patterns are meant to be used with .match(), which anchors at the start;
# the trailing $ anchors at the end (it also tolerates one final newline).

# A bare four-digit year, e.g. "1999".
valid_short_date = re.compile(r'(\d{4})$')
# A full date, e.g. "1999-05-23"; groups are (year, month, day).
valid_long_date = re.compile(r'(\d{4})-(\d{2})-(\d{2})$')
class ReleaseDateChecker(ElementProcessor):
    """
    An object which processes release elements:

    For each invalid release date found (if any), write a progress dot to
    stderr and print a line of the form

        https://www.discogs.com/release/<id> - release date is "<value>"

    A date is accepted when it is either YYYY, or YYYY-MM-DD with a month
    from 01 to 12 and a day from 00 to 31. Empty <released> elements are
    ignored.

    If interrupted, print the parsed element count and last processed element ID.
    """

    def __init__(self):
        # Number of elements handed to process() so far.
        self.counter = 0
        # 'id' attribute of the most recently processed element.
        self.item_id = None
        # Tag name of the elements this processor wants to receive.
        self.interesting_element_name = 'release'

    @staticmethod
    def _is_invalid_date(date):
        """Return True if *date* is neither YYYY nor an acceptable YYYY-MM-DD."""
        if valid_short_date.match(date) is not None:
            return False
        match = valid_long_date.match(date)
        if match is None:
            return True
        month = int(match.group(2))
        day = int(match.group(3))
        # Month 00 is rejected, but day 00 is accepted.
        # NOTE(review): day 00 presumably means "day unknown" - confirm.
        return not 1 <= month <= 12 or day > 31

    def process(self, elem):
        """
        Check every <released> descendant of *elem* and report invalid dates.

        elem is expected to offer the ElementTree element API
        (get / findall / .text on the found children).
        """
        self.counter += 1
        self.item_id = elem.get('id')
        for released in elem.findall('.//released'):
            date = released.text
            # Elements without text carry no date to validate.
            if date is None or not self._is_invalid_date(date):
                continue
            # Progress indicator on stderr, kept separate from the report.
            stderr.write('.')
            stderr.flush()
            print(u'https://www.discogs.com/release/%s - release date is "%s"' % (self.item_id, date))

    def handle_interruption(self, e):
        """
        Report progress on stderr, then re-raise the active exception.

        The bare ``raise`` re-raises the exception currently being handled, so
        this must be called from inside an ``except`` block (the caller is not
        visible here - verify). *e* itself is not used.
        """
        print('\nInterrupted after %d %ss. Last %s id parsed: %s' % (self.counter, self.interesting_element_name, self.interesting_element_name, self.item_id), file=stderr)
        raise
# Command-line entry point: check the dump file named by the first argument.
if __name__ == "__main__":
    from sys import argv

    # The dump file path is the one mandatory positional argument.
    if not argv[1:]:
        raise RuntimeError("A dump file path must be provided as the first argument.")
    dump_path = argv[1]

    checker = ReleaseDateChecker()
    stderr.flush()
    process_dump_file(dump_path, checker)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement