from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from datetime import datetime
from collections import deque
from collections import defaultdict
from collections import OrderedDict
import operator
import os

# Loop through all the English Wikipedia article files and store their path and filename in a list. 4 minutes.
dir = r'D:\Downloads\Wikipedia\articles'
l = [os.path.join(root, name) for root, _, files in os.walk(dir) for name in files]

t1 = datetime.now()

# For each article (file) loop through all the words and generate unigrams. 1175MB memory use spotted.
# 12 minutes to first output. 4200000: 4:37:24.586706 was last output.
c = 1
d1s = defaultdict(int)
for file in l:
    try:
        with open(file, encoding="utf8") as f_in:
            content = f_in.read()
    except UnicodeDecodeError:
        with open(file, encoding="latin-1") as f_in:
            content = f_in.read()
    words = wordpunct_tokenize(content)  # word_tokenize handles 'n ʼn and ʼn as a single word. wordpunct_tokenize does not.
    # Take all the words from the article and count them.
    for i, word in enumerate(words):
        d1s[word] = d1s[word] + 1
    c = c + 1
    if c % 200000 == 0:
        t2 = datetime.now()
        print(str(c) + ': ' + str(t2 - t1))

t2 = datetime.now()
print('After unigram: ' + str(t2 - t1))

t1 = datetime.now()
# Sort the defaultdict in descending order and write the unigrams to a file.
# 0:00:27.740082 was output. 3285MB memory. 165MB output file.
d1ss = OrderedDict(sorted(d1s.items(), key=operator.itemgetter(1), reverse=True))
with open(r"D:\Downloads\Wikipedia\en_ngram1.txt", mode="w", encoding="utf-8") as f_out:
    for k, v in d1ss.items():
        f_out.write(k + '┼' + str(v) + "\n")
t2 = datetime.now()
print('After unigram write: ' + str(t2 - t1))
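
# Sketch (not in the original paste): the unigram file written above can be loaded back
# in a later session instead of redoing the counting pass. Each line has the form
# "<token>┼<count>", so splitting on the '┼' separator recovers the counts.
# Left commented out so it does not add an extra read pass to this run.
# d1_loaded = {}
# with open(r"D:\Downloads\Wikipedia\en_ngram1.txt", encoding="utf-8") as f_in:
#     for line in f_in:
#         token, _, count = line.rstrip("\n").partition('┼')
#         d1_loaded[token] = int(count)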

# Determine the lowest 1gram count we are interested in.
low_count = 20 - 1
d1s = {}
# Get all the 1gram counts as a dict.
for word, count in d1ss.items():
    # Stop adding 1gram counts when we reach the lowest 1gram count.
    if count <= low_count:
        break
    # Add the count to the dict.
    d1s[word] = count

t1 = datetime.now()

# For each article (file) loop through all the sentences and generate 2grams. 13GB memory use spotted.
# 17 minutes to first output. 4200000: 4:37:24.586706 was last output.
c = 1
d2s = defaultdict(int)
for file in l:
    try:
        with open(file, encoding="utf8") as f_in:
            content = f_in.read()
    except UnicodeDecodeError:
        with open(file, encoding="latin-1") as f_in:
            content = f_in.read()
    # Extract the sentences in the file content.
    sentences = deque()
    sentences.extend(sent_tokenize(content))
    # Get all the words for one sentence.
    for sentence in sentences:
        words = wordpunct_tokenize(sentence)  # word_tokenize handles 'n ʼn and ʼn as a single word. wordpunct_tokenize does not.
        # Take all the words from the sentence with high 1gram count that are next to each other and count them.
        for i, word in enumerate(words):
            if word in d1s:
                try:
                    word2 = words[i + 1]
                    if word2 in d1s:
                        gram2 = word + ' ' + word2
                        d2s[gram2] = d2s[gram2] + 1
                except IndexError:
                    pass
    c = c + 1
    if c % 200000 == 0:
        t2 = datetime.now()
        print(str(c) + ': ' + str(t2 - t1))

t2 = datetime.now()
print('After bigram: ' + str(t2 - t1))
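
# The paste stops after counting 2grams. A possible follow-up step, mirroring the
# unigram write above, would be to sort d2s and write it out in the same
# "<gram>┼<count>" format. The en_ngram2.txt filename below is an assumption,
# not part of the original run.
t1 = datetime.now()
d2ss = OrderedDict(sorted(d2s.items(), key=operator.itemgetter(1), reverse=True))
with open(r"D:\Downloads\Wikipedia\en_ngram2.txt", mode="w", encoding="utf-8") as f_out:
    for k, v in d2ss.items():
        f_out.write(k + '┼' + str(v) + "\n")
t2 = datetime.now()
print('After bigram write: ' + str(t2 - t1))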