Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import sys
- from invenio.dbquery import run_sql
- from invenio.search_engine import get_tag_name
- from invenio.search_engine import get_collection_reclist
- from invenio.intbitset import intbitset
- collection = sys.argv[1]
- recids = get_collection_reclist(collection)
- for i in range(100):
- for tag in run_sql("SELECT DISTINCT tag FROM bib%02dx ORDER BY tag" % i):
- tag = tag[0]
- sys.stderr.write("\r%s " % tag)
- sys.stderr.flush()
- tag_name = get_tag_name(tag) or get_tag_name(tag[:3]) or get_tag_name(tag[:3] + "__a")
- new_tag = True
- recids_with_value = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx JOIN bib%02dx ON id_bibxxx=id WHERE tag=%%s" % (i, i), (tag, )))
- if not recids_with_value:
- continue
- distinct_values = run_sql("SELECT COUNT(1) FROM bib%02dx WHERE tag=%%s" % i, (tag, ))[0][0]
- print
- msg = "%s (%s), (%s %s records with values) (%s distinct values in general)" % (tag, tag_name, len(recids_with_value), collection, distinct_values)
- msg2 = str(recids_with_value)
- print msg
- print msg2
- print "-" * max(len(msg), len(msg2))
- distinct_values = run_sql("SELECT COUNT(1) FROM bib%02dx WHERE tag=%%s" % i, (tag, ))[0][0]
- if distinct_values > 1000:
- # Too much populated query.
- print "---- Example of values ----"
- example_values = run_sql("SELECT value from bib%02dx WHERE tag=%%s LIMIT %%s" % i, (tag, 10))
- for value in example_values:
- print value[0]
- continue
- limit = 10
- if distinct_values < 30:
- limit = distinct_values
- outliers = run_sql("SELECT value, count(*) AS c, id FROM bibrec_bib%02dx join bib%02dx ON id_bibxxx=id WHERE tag=%%s GROUP BY id ORDER BY c LIMIT %%s" % (i, i), (tag, limit))
- print "---- Good values ----"
- good_values = run_sql("SELECT value, count(*) AS c, id FROM bibrec_bib%02dx join bib%02dx ON id_bibxxx=id WHERE tag=%%s GROUP BY id ORDER BY c desc LIMIT %%s" % (i, i), (tag, 100))
- for value, dummy_count, id in good_values:
- matched_recids = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx WHERE id_bibxxx=%%s" % i, (id, )))
- if matched_recids:
- print "% 10d %s " % (len(matched_recids), value)
- print "---- Outliers ----"
- for value, dummy_count, id in outliers:
- matched_recids = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx WHERE id_bibxxx=%%s" % i, (id, )))
- if matched_recids:
- print "%s (%s %s records): %s" % (value, len(matched_recids), collection, matched_recids)
Add Comment
Please, Sign In to add comment