Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Print the 100 most frequent words found in the <p> paragraphs of a text dump.

Only lines that consist of a single, optionally tab-indented, ``<p>...</p>``
element are considered.  A small set of inline markup is removed before the
line is split into words.
"""
import codecs
import re
from collections import defaultdict

filename = "wiki_10GB.txt"

# Compiled once at import time; the main loop runs once per input line.
PARAGRAPH_RE = re.compile(r"^\t*<p>.+</p>$")
INLINE_TAG_RE = re.compile(r"</?[ipb]>")
BR_RE = re.compile(r"<br />")
SUP_RE = re.compile(r"<sup[^>]*>((?!</sup>).)*</sup>")
SUB_RE = re.compile(r"<sub[^>]*>((?!</sub>).)*</sub>")
# The link text is captured as ONE group.  A repeated one-character group
# would only retain its last iteration, i.e. the final character of the text.
ANCHOR_RE = re.compile(r"<a (?:href|name)[^>]*>((?:(?!</a>).)*)</a>")

# Characters trimmed from both ends of every whitespace-separated token.
PUNCTUATION = ".,-!?"


def clean_paragraph(line: str) -> str:
    """Return *line* with the markup this script knows about removed.

    ``<i>``, ``<p>``, ``<b>`` (opening and closing) and ``<br />`` tags are
    dropped, ``<sup>``/``<sub>`` elements are removed together with their
    content, and ``<a href|name ...>`` elements are replaced by their text.
    """
    text = INLINE_TAG_RE.sub("", line)
    text = BR_RE.sub("", text)
    text = SUP_RE.sub("", text)
    text = SUB_RE.sub("", text)
    return ANCHOR_RE.sub(r"\1", text)


def count_words(lines, total=None):
    """Count the words of every paragraph line in *lines*.

    Args:
        lines: iterable of text lines (for example an open text file).
        total: optional total number of lines.  When given, a
            ``"<current> / <total>"`` progress indicator is rewritten in
            place on stdout every 100 lines.

    Returns:
        A ``defaultdict(int)`` mapping each word to its number of occurrences.
    """
    counts = defaultdict(int)
    for line_nr, line in enumerate(lines):
        if total is not None and line_nr % 100 == 0:
            print("\r%d / %d" % (line_nr, total), end="")
        if not PARAGRAPH_RE.search(line):
            continue
        # split() without an argument splits on any whitespace run, so
        # repeated spaces, leading tabs and the trailing newline never
        # produce empty or whitespace-prefixed tokens.
        for token in clean_paragraph(line).split():
            word = token.strip(PUNCTUATION)
            if word:  # a token made only of punctuation is not a word
                counts[word] += 1
    return counts


def top_words(counts, limit=100):
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    Ties on the count are ordered by word, descending.
    """
    ranked = sorted(((n, w) for w, n in counts.items()), reverse=True)
    return [(w, n) for n, w in ranked[:limit]]


def count_lines(path):
    """Return the number of lines in the UTF-8 text file at *path*."""
    with codecs.open(path, "r", encoding="utf-8", errors="ignore") as f:
        return sum(1 for _ in f)


def main(path=filename):
    """Count the words in *path* and print the 100 most frequent ones."""
    # First pass over the file only provides the total for the progress line.
    num_lines = count_lines(path)
    with codecs.open(path, "r", encoding="utf-8", errors="ignore") as f:
        word_blob = count_words(f, total=num_lines)
    print()  # terminate the "\r"-rewritten progress line
    for word, count in top_words(word_blob):
        print(word, count)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement