Guest User

Untitled

a guest
May 3rd, 2016
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.69 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. import sys
  4.  
  5. from invenio.dbquery import run_sql
  6. from invenio.search_engine import get_tag_name
  7. from invenio.search_engine import get_collection_reclist
  8. from invenio.intbitset import intbitset
  9.  
  10. collection = sys.argv[1]
  11.  
  12. recids = get_collection_reclist(collection)
  13.  
  14. for i in range(100):
  15. for tag in run_sql("SELECT DISTINCT tag FROM bib%02dx ORDER BY tag" % i):
  16. tag = tag[0]
  17. sys.stderr.write("\r%s " % tag)
  18. sys.stderr.flush()
  19. tag_name = get_tag_name(tag) or get_tag_name(tag[:3]) or get_tag_name(tag[:3] + "__a")
  20. new_tag = True
  21. recids_with_value = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx JOIN bib%02dx ON id_bibxxx=id WHERE tag=%%s" % (i, i), (tag, )))
  22. if not recids_with_value:
  23. continue
  24. distinct_values = run_sql("SELECT COUNT(1) FROM bib%02dx WHERE tag=%%s" % i, (tag, ))[0][0]
  25. print
  26. msg = "%s (%s), (%s %s records with values) (%s distinct values in general)" % (tag, tag_name, len(recids_with_value), collection, distinct_values)
  27. msg2 = str(recids_with_value)
  28. print msg
  29. print msg2
  30. print "-" * max(len(msg), len(msg2))
  31.  
  32. distinct_values = run_sql("SELECT COUNT(1) FROM bib%02dx WHERE tag=%%s" % i, (tag, ))[0][0]
  33.  
  34. if distinct_values > 1000:
  35. # Too much populated query.
  36. print "---- Example of values ----"
  37. example_values = run_sql("SELECT value from bib%02dx WHERE tag=%%s LIMIT %%s" % i, (tag, 10))
  38. for value in example_values:
  39. print value[0]
  40. continue
  41. limit = 10
  42. if distinct_values < 30:
  43. limit = distinct_values
  44. outliers = run_sql("SELECT value, count(*) AS c, id FROM bibrec_bib%02dx join bib%02dx ON id_bibxxx=id WHERE tag=%%s GROUP BY id ORDER BY c LIMIT %%s" % (i, i), (tag, limit))
  45.  
  46. print "---- Good values ----"
  47. good_values = run_sql("SELECT value, count(*) AS c, id FROM bibrec_bib%02dx join bib%02dx ON id_bibxxx=id WHERE tag=%%s GROUP BY id ORDER BY c desc LIMIT %%s" % (i, i), (tag, 100))
  48. for value, dummy_count, id in good_values:
  49. matched_recids = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx WHERE id_bibxxx=%%s" % i, (id, )))
  50. if matched_recids:
  51. print "% 10d %s " % (len(matched_recids), value)
  52.  
  53. print "---- Outliers ----"
  54. for value, dummy_count, id in outliers:
  55. matched_recids = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx WHERE id_bibxxx=%%s" % i, (id, )))
  56. if matched_recids:
  57. print "%s (%s %s records): %s" % (value, len(matched_recids), collection, matched_recids)
Add Comment
Please, Sign In to add comment