Advertisement
Guest User

Untitled

a guest
Aug 22nd, 2017
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.74 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # ===========================================
  4. #
  5. # WikiConsolidator reads a directory with .bz2 files generated by WikiExtractor.
  6. # The files are compiled into one .txt file, where each line is a pre-processed
  7. # sentence. The following transformations and filters are applied in the text:
  8. #
  9. # * Commas, dots, quotes and parentheses are removed. Question and exclamation
  10. # marks are kept, but with spaces between them and words. For example:
  11. # "how are you?" becomes "how are you ?"
  12. #
  13. # * Text is converted to lower case. Not sure if this is a good thing, though.
  14. # Let's A/B test in the future!
  15.  
  16. import os
  17. import bz2
  18.  
  19. extract_dir = "/home/luiz/Documents/extracted"
  20. bz2_dirs = os.listdir(extract_dir)
  21. consolidated_file = "./ptbrwiki_consolidated.txt"
  22. consolidated_doc = ""
  23. total_docs = 0
  24.  
  25. def consolidate_bz2_dir(dir):
  26. print("Consolidating directory: [{}]".format(dir))
  27. files = os.listdir(dir)
  28. print("There are {} files in the directory".format(len(files)))
  29. for file in files:
  30. print("Consolidating file {}".format(file))
  31. bz_file = bz2.BZ2File(os.path.join(dir, file))
  32. lines = bz_file.readlines()
  33. doc = ""
  34. for line in lines:
  35. if line.find('<doc') == 0:
  36. doc = ""
  37. elif line.find('</doc>') != -1:
  38. consolidate_document(doc)
  39. else:
  40. doc += line
  41. write_to_consolidated_file()
  42. print "---> {} consolidated documents so far".format(total_docs)
  43.  
  44. def consolidate_document(doc):
  45. global consolidated_doc
  46. global total_docs
  47.  
  48. total_docs += 1
  49. for sentence in doc.split('\n'):
  50. # The first step is to convert the content to lower case
  51. sentence = sentence.decode('utf-8').lower()
  52.  
  53. # Then we're going to remove unwanted characters
  54. sentence = sentence.replace(",", "")
  55. sentence = sentence.replace(".", "")
  56. sentence = sentence.replace("(", "")
  57. sentence = sentence.replace(")", "")
  58. sentence = sentence.replace(":", "")
  59. sentence = sentence.replace(";", "")
  60. sentence = sentence.replace(" - ", "")
  61.  
  62. # Then we put a space between punctuation to separate it from the words.
  63. # We don't want "there?" and "there" to be two separated entities.
  64. sentence = sentence.replace("?", " ? ")
  65. sentence = sentence.replace("!", " ! ")
  66.  
  67. # Then we split by word and make sure the sentence has at least 10 words
  68. words = sentence.split()
  69. if len(words) < 10:
  70. continue
  71.  
  72. consolidated_doc += sentence + "\n"
  73.  
  74. # Writes to the consolidated file the contents of the 'consolidated_doc'
  75. # variable
  76. def write_to_consolidated_file():
  77. global consolidated_doc
  78. with open(consolidated_file, "a") as f:
  79. f.write(consolidated_doc.encode('utf-8'))
  80. consolidated_doc = ""
  81.  
  82. for bz2_dir in bz2_dirs:
  83. consolidate_bz2_dir(os.path.join(extract_dir, bz2_dir))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement