from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from datetime import datetime
from collections import deque
from collections import defaultdict
from collections import OrderedDict
import operator
import os
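# One-time setup (assumption: the NLTK data files are not installed yet). sent_tokenize below
# relies on the 'punkt' sentence model; wordpunct_tokenize is regex-based and needs no data files.
import nltk
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')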
# Loop through all the English Wikipedia article files and store their full paths in a list. 4 minutes.
dir = r'D:\Downloads\Wikipedia\articles'
l = [os.path.join(root, name) for root, _, files in os.walk(dir) for name in files]
t1 = datetime.now()
# For each article (file) loop through all the words and generate unigrams. 1175MB memory use spotted.
# 12 minutes to first output. 4200000: 4:37:24.586706 was last output.
c = 1
d1s = defaultdict(int)
for file in l:
    try:
        with open(file, encoding="utf8") as f_in:
            content = f_in.read()
    except UnicodeDecodeError:
        # Fall back to latin-1 for the few files that are not valid UTF-8.
        with open(file, encoding="latin-1") as f_in:
            content = f_in.read()
    # word_tokenize handles 'n ʼn and ʼn as a single word. wordpunct_tokenize does not.
    words = wordpunct_tokenize(content)
    # Take all the words from the article and count them.
    for word in words:
        d1s[word] += 1
    c += 1
    if c % 200000 == 0:
        t2 = datetime.now()
        print(str(c) + ': ' + str(t2 - t1))
t2 = datetime.now()
print('After unigram: ' + str(t2 - t1))
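# Optional sanity check (a sketch, not in the original paste): peek at the ten most frequent
# unigrams before sorting the whole dict, using a heap so it stays cheap on millions of tokens.
import heapq
print(heapq.nlargest(10, d1s.items(), key=operator.itemgetter(1)))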
t1 = datetime.now()
# Sort the defaultdict in descending order of count and write the unigrams to a file.
# 0:00:27.740082 was output. 3285Mb memory. 165Mb output file.
d1ss = OrderedDict(sorted(d1s.items(), key=operator.itemgetter(1), reverse=True))
with open(r"D:\Downloads\Wikipedia\en_ngram1.txt", mode="w", encoding="utf-8") as f_out:
    # One line per unigram: token, '┼' separator, count.
    for k, v in d1ss.items():
        f_out.write(k + '┼' + str(v) + "\n")
t2 = datetime.now()
print('After unigram write: ' + str(t2 - t1))
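# Optional read-back check (a sketch, not in the original paste). Each line of en_ngram1.txt
# is "token┼count", so it can be parsed back with a single partition on the separator.
check = {}
with open(r"D:\Downloads\Wikipedia\en_ngram1.txt", encoding="utf-8") as f_in:
    for line in f_in:
        token, _, count = line.rstrip("\n").partition('┼')
        check[token] = int(count)
print('Read back ' + str(len(check)) + ' unigrams')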
# Keep only the unigrams that occur at least 20 times; rarer words are ignored when building bigrams.
low_count = 20 - 1
d1s = {}
# d1ss is sorted in descending order of count, so we can stop copying as soon as the
# count drops to low_count (19) or below.
for word, count in d1ss.items():
    if count <= low_count:
        break
    d1s[word] = count
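# Sanity check (a sketch, not in the original paste): the kept dict should match a direct
# filter of the counts with the same threshold, independent of the sorted order.
assert d1s == {word: count for word, count in d1ss.items() if count > low_count}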
t1 = datetime.now()
# For each article (file) loop through all the sentences and generate 2grams. 13GB memory use spotted.
# 17 minutes to first output. 4200000: 4:37:24.586706 was last output.
c = 1
d2s = defaultdict(int)
for file in l:
    try:
        with open(file, encoding="utf8") as f_in:
            content = f_in.read()
    except UnicodeDecodeError:
        # Fall back to latin-1 for the few files that are not valid UTF-8.
        with open(file, encoding="latin-1") as f_in:
            content = f_in.read()
    # Extract the sentences in the file content.
    sentences = deque(sent_tokenize(content))
    # Get all the words for one sentence.
    for sentence in sentences:
        # word_tokenize handles 'n ʼn and ʼn as a single word. wordpunct_tokenize does not.
        words = wordpunct_tokenize(sentence)
        # Count each pair of adjacent words whose unigrams both passed the count threshold.
        for i, word in enumerate(words[:-1]):
            if word in d1s:
                word2 = words[i + 1]
                if word2 in d1s:
                    gram2 = word + ' ' + word2
                    d2s[gram2] += 1
    c += 1
    if c % 200000 == 0:
        t2 = datetime.now()
        print(str(c) + ': ' + str(t2 - t1))
t2 = datetime.now()
print('After bigram: ' + str(t2 - t1))
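# Follow-up sketch (assumption: the paste ends here, and the output path en_ngram2.txt is
# hypothetical): write the bigram counts out in the same "token┼count" format as the unigrams.
t1 = datetime.now()
d2ss = OrderedDict(sorted(d2s.items(), key=operator.itemgetter(1), reverse=True))
with open(r"D:\Downloads\Wikipedia\en_ngram2.txt", mode="w", encoding="utf-8") as f_out:
    for k, v in d2ss.items():
        f_out.write(k + '┼' + str(v) + "\n")
t2 = datetime.now()
print('After bigram write: ' + str(t2 - t1))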