Advertisement
Guest User

Untitled

a guest
Apr 19th, 2019
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.75 KB | None | 0 0
  1. import io
  2. import time
  3. from datetime import timedelta
  4.  
  5. import gensim
  6.  
  7. if __name__ == '__main__':
  8.  
  9. start_time = time.time()
  10. print('Streaming wiki...')
  11. id_wiki = gensim.corpora.WikiCorpus('idwiki-latest-pages-articles.xml.bz2', lemmatize=False, dictionary={})
  12. article_count = 0
  13.  
  14. with io.open('idwiki.txt', 'w', encoding='utf-8') as wiki_txt:
  15. for text in id_wiki.get_texts():
  16.  
  17. wiki_txt.write(" ".join(text) + '\n')
  18. article_count += 1
  19.  
  20. if article_count % 10000 == 0:
  21. print('{} articles processed'.format(article_count))
  22.  
  23. print('total: {} articles'.format(article_count))
  24.  
  25. finish_time = time.time()
  26. print('Elapsed time: {}'.format(timedelta(seconds=finish_time-start_time)))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement