Guest User

Wikidatastats

a guest
Mar 8th, 2013
126
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import bz2, time
  2.  
  3. print 'Calculating Wikidata stats'
  4.  
  5. start_time = time.time()
  6.  
  7. linecount = 0
  8. pagecount = 0
  9. revisioncount = 0
  10. revisionsperitemcount = 0
  11. revisionsperitem = {}
  12. itemcount = 0
  13. itemswithclaims = 0
  14. claimcount = 0
  15. claimsperitem = {}
  16. propertycount = 0
  17. sitelinkcount = 0
  18. labelcount = 0
  19. descriptioncount = 0
  20. item = False
  21. property = True
  22.  
  23. file = bz2.BZ2File('wikidatawiki-20130228-pages-meta-history.xml.bz2')
  24. for line in file :
  25.     linecount += 1
  26.     if linecount % 1000000 == 0 : print linecount / 1000000
  27.  
  28.     if line == '  <page>\n' :
  29.         pagecount += 1
  30.         if item :
  31.             sitelinkcount += len(val['links'])
  32.             labelcount += len(val['label'])
  33.             descriptioncount += len(val['description'])
  34.             if 'claims' in val and len(val['claims']) > 0 :
  35.                 itemswithclaims += 1
  36.                 claimcount += len(val['claims'])
  37.                 if not len(val['claims']) in claimsperitem :
  38.                     claimsperitem[len(val['claims'])] = 0
  39.                 claimsperitem[len(val['claims'])] += 1
  40.             if revisionsperitemcount > 0 :
  41.                 if not revisionsperitemcount in revisionsperitem :
  42.                     revisionsperitem[revisionsperitemcount] = 0
  43.                 revisionsperitem[revisionsperitemcount] += 1
  44.         revisionsperitemcount = 0
  45.         item = False
  46.         property = False
  47.     if line == '    <ns>0</ns>\n' :
  48.         item = True
  49.         itemcount += 1
  50.     if line == '    <ns>120</ns>\n' :
  51.         property = True
  52.         propertycount += 1
  53.     if line == '    <revision>\n' :
  54.         revisioncount += 1
  55.         revisionsperitemcount += 1
  56.     if line.startswith('      <timestamp>') :
  57.         timestamp = line[17:-23]
  58.     if line.startswith('    <title>') :
  59.         title = line[11:-9]
  60.     # checks for anomalies
  61.     if line.startswith('      <text xml:space="preserve">') :
  62.         if item or property :
  63.             if not line.endswith('</text>\n') :
  64.                 print line
  65.             else :
  66.                 content = line[33:-8]
  67.                 content = content.replace('&quot;', '"')
  68.                 val = eval(content)
  69.     #if linecount >= 100000 : break
  70.  
  71. print itemcount, 'items'
  72. print itemswithclaims, 'items with claims'
  73. print claimcount, 'claims'
  74. print 'claims per item', claimsperitem
  75. print propertycount, 'properties'
  76. print sitelinkcount, 'links'
  77. print labelcount, 'labels'
  78. print descriptioncount, 'descriptions'
  79. print pagecount, 'pages'
  80. print revisioncount, 'revisions'
  81. print 'revisions per item', revisionsperitem
  82. print linecount, 'lines'
  83.  
  84. print time.time() - start_time, 'seconds'
  85. print 'Done.'
RAW Paste Data