Advertisement
iamaamir

Wikipedia Scraper

May 25th, 2016
295
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.92 KB | None | 0 0
  1. #Python 3
  2.  
  3. import os
  4. import ssl
  5. import re
  6. from urllib.request import urlopen, Request
  7.  
  8. from bs4 import BeautifulSoup
  9.  
  10.  
  11. class WikiFruitCorpus:
  12.  
  13.     __list_of_fruit_url = ""
  14.     __headers = ""
  15.     __dir_location = os.path.dirname(os.path.realpath(__file__)) + "/"
  16.    
  17.     ssl._create_default_https_context = ssl._create_unverified_context  #A not Recommended solution to SSL ERROR
  18.  
  19.     def __init__(self, header="Mozilla/5.0"):
  20.         self.__list_of_fruit_url = "https://simple.wikipedia.org/wiki/List_of_fruits"
  21.         self.set_headers(header)
  22.  
  23.     def set_headers(self, header):
  24.         self.__headers = {"User-Agent": header}
  25.  
  26.     def get_fruit_list_links(self):
  27.         fruit_list_links = []
  28.         req = Request(self.__list_of_fruit_url, headers=self.__headers)
  29.         page = urlopen(req)
  30.         soup = BeautifulSoup(page,"html.parser")
  31.         scrap_links = soup.findAll("a")
  32.         for link in scrap_links:
  33.             tmp = link.get("href")
  34.             if str(tmp).startswith("/wiki/"):
  35.                 fruit_list_links.append(self.__remove_non_ascii(tmp))
  36.         return fruit_list_links[2:87]
  37.  
  38.     @staticmethod
  39.     def __remove_non_ascii(text):
  40.         return "".join([i if ord(i) < 128 else "" for i in text])
  41.  
  42.     def create_corpus(self, links_list):
  43.         dir_name = "fruit_corpus_simplewiki"
  44.         dir_loc = self.__dir_location + dir_name + "/"
  45.         base_url = "https://simple.wikipedia.org"
  46.  
  47.         if not os.path.exists(dir_loc) :
  48.             try:
  49.                 os.mkdir("fruit_corpus_simplewiki")
  50.             except OSError as e:
  51.                 print(e.message)
  52.  
  53.         for link in links_list:
  54.             fruit_url = base_url + link
  55.             print("Working with %s" % fruit_url)
  56.             soup = BeautifulSoup(urlopen(fruit_url).read(),"html.parser")
  57.             html_content = soup.findAll("p")
  58.             main_content = str(self.__remove_non_ascii("\n".join(["".join(w.text) for w in html_content])))
  59.             filename = link[str(link).rindex("/") + 1: len(link)]
  60.             try:
  61.                 with open(dir_loc + filename + ".txt", "w") as f: #opening file with 'wb' may causes the TypeError
  62.                     main_content = main_content.replace('. ','.\n')
  63.                     main_content = re.sub("\[\d+\]","",main_content)#Remove the tags at the end of the line --> [x]
  64.                     f.write(main_content)
  65.             except IOError as e:
  66.                 print( "Error in writing to file", e.message)
  67.             print("File \"%s.txt\" saved in %s" % (filename, dir_loc))
  68.  
  69.  
  70.  
  71.  
  72.  
  73. def main():
  74.     cor = WikiFruitCorpus()
  75.     header = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) " \
  76.              "Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36"
  77.     cor.set_headers(header)
  78.     fruit_list = cor.get_fruit_list_links()
  79.     cor.create_corpus(fruit_list)
  80.  
  81. if __name__ == "__main__":
  82.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement