Advertisement
Guest User

Untitled

a guest
May 21st, 2019
74
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.97 KB | None | 0 0
import codecs
import heapq
import re
from collections import Counter, defaultdict
from typing import Iterable, List, Optional, Tuple
  4.  
  5. filename = "wiki_10GB.txt"
  6. word_blob = defaultdict(int)
  7. num_lines = 0
  8.  
  9. with codecs.open(filename, 'r', encoding='utf-8', errors='ignore') as f:
  10. num_lines = sum(1 for line in f)
  11.  
  12. with codecs.open(filename, 'r', encoding='utf-8', errors='ignore') as f:
  13. for line_nr, line in enumerate(f):
  14. if (line_nr % 100 == 0):
  15. print("\r%d / %d" % (line_nr, num_lines), end = "")
  16. if re.search("^\t*<p>.+</p>$", line):
  17. line2 = re.sub("</?[ipb]>", "", line)
  18. line2 = re.sub("<br />", "", line2)
  19. line2 = re.sub(r"<sup[^>]*>((?!</sup>).)*</sup>", "", line2)
  20. line2 = re.sub(r"<sub[^>]*>((?!</sub>).)*</sub>", "", line2)
  21. line2 = re.sub(r"<a (href|name)[^>]*>((?!</a>).)*</a>", r"\2", line2)
  22. for word in line2.split(" "):
  23. word_blob[word.strip(" .,-!?\n")] += 1
  24.  
  25. print()
  26. d_view = [ (v,k) for k,v in word_blob.items() ]
  27. d_view.sort(reverse = True)
  28. for count, word in d_view[:100]:
  29. print(word, count)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement