Advertisement
desdemona

universal wiki scraper

May 25th, 2016
485
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.71 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. import sys
  4. import uuid
  5. import string
  6. import re
  7.  
  8. from urllib import urlopen
  9. from bs4 import BeautifulSoup
  10. import unicodedata
  11. import time
  12. import datetime
  13. import os
  14.  
  15.  
  16. random_wiki_article_url = "https://pl.wikipedia.org/wiki/Specjalna:Losowa_strona"
  17. language_code = "pl"
  18.  
  19. if len(sys.argv) == 3:
  20.     language_code = sys.argv[1]
  21.     random_wiki_article_url = sys.argv[2]
  22.  
  23. content_div_id = "mw-content-text"
  24.  
  25. myrange = 100
  26.  
  27. timestamp = str(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S'))
  28. output_filename = language_code + "_" + timestamp + "_" + str(uuid.uuid1()) + "_" + str(myrange) + ".txt"
  29. output_dir = "/home/domi/Desktop/wikiscraps/" + language_code + "/"
  30.  
  31. if not os.path.exists(output_dir):
  32.     os.makedirs(output_dir)
  33. open(output_dir + output_filename, 'w').close()
  34.  
  35.  
  36. for i in range(0, myrange):
  37.     fo = open(output_dir + output_filename, "a")
  38.  
  39.     html = urlopen(random_wiki_article_url).read()
  40.  
  41.     soup = BeautifulSoup(html, "html.parser")
  42.     content = soup.find("div", {"id": content_div_id})
  43.     text = content.getText()
  44.  
  45.     #obrobka tekstu
  46.     text = text.replace('\n', ' ').replace('\r', '')
  47.     text = text.lower()
  48.     text = text.replace("wikipedia", "")
  49.     text = text.replace("-", " ")
  50.     text = text.replace(" v t e ", "")
  51.     regex = re.compile('[%s]' % re.escape(string.punctuation))
  52.     text = regex.sub(' ', text)
  53.     text = ''.join(j for j in text if not j.isdigit())
  54.     while "  " in text:
  55.         text = text.replace("  ", " ")
  56.  
  57.  
  58.     utf8_text = text.encode('utf-8')
  59.     print(len(text))
  60.     print (utf8_text)
  61.     fo.write(utf8_text + "\n")
  62.     fo.close()
  63.  
  64. print("scrapping ended for language: " + language_code)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement