Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- # ===========================================
- #
- # WikiConsolidator reads a directory with .bz2 files generated by WikiExtractor.
- # The files are compiled into one .txt file, where each line is a pre-processed
- # sentence. The following transformations and filters are applied in the text:
- #
- # * Commas, dots, quotes and parentheses are removed. Question and exclamation
- # marks are kept, but with spaces between them and the words. For example:
- # "how are you?" becomes "how are you ?"
- #
- # * Text is converted to lower case. Not sure if this is a good thing, though.
- # Let's A/B test in the future!
import os
import bz2

# Root directory to process. Every entry inside it is passed to
# consolidate_bz2_dir() and is therefore expected to be a sub-directory.
extract_dir = "/home/luiz/Documents/extracted"
# Sorted because os.listdir() returns entries in arbitrary order; sorting keeps
# the order of the consolidated output reproducible between runs.
bz2_dirs = sorted(os.listdir(extract_dir))
# Output file that write_to_consolidated_file() appends to.
consolidated_file = "./ptbrwiki_consolidated.txt"
# Buffer of pre-processed sentences that have not been written to disk yet.
consolidated_doc = ""
# Number of documents handed to consolidate_document() so far.
total_docs = 0
def consolidate_bz2_dir(dir):
    """Consolidate every file listed in ``dir``, opening each as a bz2 archive.

    Each archive is scanned line by line: a line starting with ``<doc`` begins
    a new document, a line containing ``</doc>`` hands the lines collected so
    far to ``consolidate_document``, and any other line is added to the
    current document. After each file the buffered sentences are flushed with
    ``write_to_consolidated_file`` and a progress line is printed.

    Args:
        dir: Path of the directory holding the compressed files. The name
            shadows the builtin but is kept so existing callers keep working.
    """
    print("Consolidating directory: [{}]".format(dir))
    # Sorted because os.listdir() order is arbitrary; this keeps the output
    # order reproducible between runs.
    file_names = sorted(os.listdir(dir))
    print("There are {} files in the directory".format(len(file_names)))
    for file_name in file_names:
        print("Consolidating file {}".format(file_name))
        body_lines = []
        # The context manager closes the archive even if parsing raises, and
        # iterating the handle streams lines instead of loading the whole file.
        with bz2.BZ2File(os.path.join(dir, file_name)) as bz_file:
            for line in bz_file:
                # BZ2File yields bytes, so the markers are bytes literals.
                if line.startswith(b'<doc'):
                    body_lines = []
                elif b'</doc>' in line:
                    consolidate_document(b"".join(body_lines))
                else:
                    body_lines.append(line)
        # Flush once per file so the in-memory buffer stays bounded.
        write_to_consolidated_file()
        print("---> {} consolidated documents so far".format(total_docs))
def consolidate_document(doc):
    """Pre-process one document and append its sentences to the global buffer.

    The document is split on newlines and each line is treated as a sentence.
    A sentence is lower-cased, stripped of ``, . ( ) : ;``, has the `` - ``
    separator collapsed to a single space, and gets spaces placed around
    ``?`` and ``!``. Sentences with fewer than 10 whitespace-separated tokens
    are dropped. Quotes are left untouched.

    Side effects: increments ``total_docs`` by one and appends every kept
    sentence, newline-terminated, to ``consolidated_doc``.

    Args:
        doc: The document body, either UTF-8 encoded bytes or text.
    """
    global consolidated_doc
    global total_docs

    total_docs += 1

    # Decode once at the boundary; the rest of the function works on text.
    if isinstance(doc, bytes):
        doc = doc.decode('utf-8')

    # One translate() pass deletes all unwanted single characters.
    strip_table = {ord(char): None for char in u",.():;"}

    kept = []
    for sentence in doc.split(u'\n'):
        # The first step is to convert the content to lower case and drop
        # the unwanted characters.
        sentence = sentence.lower().translate(strip_table)
        # Collapse the dash separator to one space; replacing it with nothing
        # would glue the two neighbouring words into a single token.
        sentence = sentence.replace(u" - ", u" ")
        # Put spaces around punctuation to separate it from the words.
        # We don't want "there?" and "there" to be two separate entities.
        sentence = sentence.replace(u"?", u" ? ").replace(u"!", u" ! ")
        # Keep only sentences with at least 10 tokens.
        if len(sentence.split()) < 10:
            continue
        kept.append(sentence + u"\n")

    # Extend the global buffer once instead of once per sentence.
    consolidated_doc += u"".join(kept)
# Appends the contents of the 'consolidated_doc' buffer to the consolidated
# file and then empties the buffer.
def write_to_consolidated_file():
    """Append the buffered sentences to ``consolidated_file`` as UTF-8.

    Side effects: opens (creating it if needed) the file named by the global
    ``consolidated_file``, appends the encoded contents of the global
    ``consolidated_doc``, and resets that buffer to an empty string.
    """
    global consolidated_doc
    # Binary append mode, because the buffer is encoded to bytes explicitly:
    # a text-mode handle rejects bytes on Python 3 and may translate newlines
    # on Windows.
    with open(consolidated_file, "ab") as f:
        f.write(consolidated_doc.encode('utf-8'))
    consolidated_doc = ""
# Script entry point: consolidate each entry of the extraction directory in
# turn, passing its full path to consolidate_bz2_dir().
for bz2_dir in bz2_dirs:
    consolidate_bz2_dir(os.path.join(extract_dir, bz2_dir))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement