Advertisement
desdemona

duzo tekstu z wiki

May 1st, 2016
493
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.43 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
import datetime
import io
import re
import string
import sys
import time
import unicodedata
import uuid

try:
    # Python 3 location of urlopen.
    from urllib.request import urlopen
except ImportError:
    # Python 2 fallback (the import this script originally used).
    from urllib import urlopen

from bs4 import BeautifulSoup
  13.  
  14.  
  15. random_wiki_article_url = "https://en.wikipedia.org/wiki/Special:Random"
  16. language_code = "en"
  17.  
  18. content_div_id = "mw-content-text"
  19.  
  20. timestamp = str(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S'))
  21. output_filename = language_code + "_" + timestamp + "_" + str(uuid.uuid1()) + ".txt"
  22. output_dir = "/home/domi/Desktop/wikiscraps/en/"
  23. open(output_dir + output_filename, 'w').close()
  24.  
  25.  
  26.  
  27. for i in range(0, 1000):
  28.     fo = open(output_dir + output_filename, "a")
  29.  
  30.     html = urlopen(random_wiki_article_url).read()
  31.  
  32.     soup = BeautifulSoup(html, "html.parser")
  33.     content = soup.find("div", {"id": content_div_id})
  34.     text = content.getText()
  35.  
  36.     #obrobka tekstu
  37.     text = text.replace('\n', ' ').replace('\r', '')
  38.     text = text.lower()
  39.     text = text.replace("wikipedia", "")
  40.     text = text.replace("–", " ")
  41.     text = text.replace(" v t e ", "")
  42.     regex = re.compile('[%s]' % re.escape(string.punctuation))
  43.     text = regex.sub(' ', text)
  44.     text = ''.join(j for j in text if not j.isdigit())
  45.     while "  " in text:
  46.         text = text.replace("  ", " ")
  47.     print(text)
  48.     print(len(text))
  49.     utf8_text = text.encode('utf-8')
  50.     fo.write(utf8_text)
  51.     fo.close()
  52.  
  53. print("do widzenia dzieciaczki")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement