Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python
"""Report fields and subfields that occur more than once within a record.

Usage: pass the collection name as the first command-line argument.

Every record of the collection is scanned.  A "tag" key is the record's
field tag followed by two indicator characters (a blank is substituted for
an empty indicator); a "subfield" key is that tag key followed by the
subfield code.  A key is reported when it appears at least twice inside a
single record (tag) or inside a single field instance (subfield), together
with up to three IDs of records where that happens.
"""
import sys
from collections import defaultdict

from click import progressbar
from invenio.intbitset import intbitset
from invenio.search_engine import get_collection_reclist, get_record


def find_repeatables(recids):
    """Scan the given records for repeated tags and repeated subfield codes.

    :param recids: sized iterable of record IDs (``progressbar`` needs a
        length to draw the bar).
    :returns: ``(repeatable_tags, repeatable_subfields)``, two mappings from
        key to an ``intbitset`` of the record IDs in which the key repeats.
    """
    repeatable_tags = defaultdict(intbitset)
    repeatable_subfields = defaultdict(intbitset)

    with progressbar(recids) as bar:
        for recid in bar:
            record = get_record(recid)
            seen_tags = set()
            for tag in record:
                for field in record[tag]:
                    # NOTE(review): assumes field[0] is the list of
                    # (code, value) subfields and field[1] / field[2] are the
                    # two indicators -- that is how the data is indexed here;
                    # confirm against the record structure documentation.
                    full_tag = tag + (field[1] or ' ') + (field[2] or ' ')
                    if full_tag in seen_tags:
                        repeatable_tags[full_tag].add(recid)
                    seen_tags.add(full_tag)

                    # Subfield repetition is tracked per field instance, so
                    # the set of seen codes starts empty for every field.
                    seen_codes = set()
                    for code, _value in field[0]:
                        if code in seen_codes:
                            repeatable_subfields[full_tag + code].add(recid)
                        seen_codes.add(code)

    return repeatable_tags, repeatable_subfields


def print_report(repeatables):
    """Print one ``key -> id, id, id`` line per key, sorted by key.

    Only the last three record IDs in each set's iteration order are shown
    (presumably the highest IDs -- confirm intbitset's iteration order).
    """
    for key in sorted(repeatables):
        examples = list(repeatables[key])[-3:]
        # Record IDs are integers; str.join() only accepts strings.
        print("%s -> %s" % (key, ", ".join(str(recid) for recid in examples)))


def main(argv):
    """Run the report for the collection named in ``argv[1]``.

    :param argv: full argument vector, program name first.
    :returns: process exit status (0 on success, 2 on a usage error).
    """
    if len(argv) < 2:
        sys.stderr.write("Usage: %s COLLECTION\n" % argv[0])
        return 2

    collection = argv[1]
    recids = list(get_collection_reclist(collection))
    # Process the collection in reverse of the order it is returned in.
    recids.reverse()

    repeatable_tags, repeatable_subfields = find_repeatables(recids)
    print_report(repeatable_tags)
    print_report(repeatable_subfields)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Add Comment
Please, Sign In to add comment