# Wikipedia Scraper

#Python 3

import os
import ssl
import re
from urllib.request import urlopen, Request

from bs4 import BeautifulSoup


class WikiFruitCorpus:
    """Build a plain-text corpus of fruit articles from Simple English Wikipedia.

    Typical use: call :meth:`get_fruit_list_links` to collect article paths
    from the "List of fruits" page, then pass them to :meth:`create_corpus`,
    which writes one ``.txt`` file per article into a
    ``fruit_corpus_simplewiki`` directory next to this source file.
    """

    # Name-mangled class-level defaults; the URL and the headers are replaced
    # per instance by __init__ / set_headers.
    __list_of_fruit_url = ""
    __headers = ""
    # Directory that contains this source file, with a trailing slash.
    __dir_location = os.path.dirname(os.path.realpath(__file__)) + "/"

    # WARNING: this runs once when the class body is executed and disables
    # HTTPS certificate verification for every default urllib context in the
    # process. It is a workaround for local SSL errors and is not recommended.
    ssl._create_default_https_context = ssl._create_unverified_context

    def __init__(self, header="Mozilla/5.0"):
        """Point the scraper at the fruit index page and set the User-Agent.

        Args:
            header: Value sent as the ``User-Agent`` HTTP header.
        """
        self.__list_of_fruit_url = "https://simple.wikipedia.org/wiki/List_of_fruits"
        self.set_headers(header)

    def set_headers(self, header):
        """Replace the request headers with a single ``User-Agent`` entry.

        Args:
            header: Value sent as the ``User-Agent`` HTTP header.
        """
        self.__headers = {"User-Agent": header}

    def __fetch_soup(self, url):
        """Download *url* with the configured headers and parse it as HTML.

        The response is closed as soon as its body has been read.
        """
        req = Request(url, headers=self.__headers)
        with urlopen(req) as page:
            return BeautifulSoup(page.read(), "html.parser")

    def get_fruit_list_links(self):
        """Return article paths (``/wiki/...``) found on the fruit index page.

        Returns:
            A list of site-relative paths with non-ASCII characters removed.

        Raises:
            urllib.error.URLError: If the index page cannot be downloaded.
        """
        soup = self.__fetch_soup(self.__list_of_fruit_url)
        fruit_list_links = []
        for anchor in soup.find_all("a"):
            href = anchor.get("href")  # None when the anchor has no href
            if href and href.startswith("/wiki/"):
                fruit_list_links.append(self.__remove_non_ascii(href))
        # NOTE(review): the fixed [2:87] window presumably skips leading
        # non-article links and stops after the last fruit entry; it is tied
        # to the page layout at the time of writing -- confirm before relying
        # on it.
        return fruit_list_links[2:87]

    @staticmethod
    def __remove_non_ascii(text):
        """Return *text* with every character outside the ASCII range dropped."""
        return text.encode("ascii", "ignore").decode("ascii")

    @staticmethod
    def __clean_text(text):
        """Strip ``[n]`` citation markers and put each sentence on its own line.

        Markers are removed before splitting so that a sentence followed by a
        citation (``"red.[1] They"``) is still split after the period.
        """
        text = re.sub(r"\[\d+\]", "", text)
        return text.replace(". ", ".\n")

    def create_corpus(self, links_list):
        """Download each article in *links_list* and save its paragraphs as text.

        Files are written to ``fruit_corpus_simplewiki/`` beside this source
        file, one ``<article>.txt`` per link. Progress and errors are reported
        with ``print``.

        Args:
            links_list: Iterable of site-relative article paths such as
                ``"/wiki/Apple"``.

        Raises:
            urllib.error.URLError: If an article page cannot be downloaded.
        """
        dir_name = "fruit_corpus_simplewiki"
        dir_loc = self.__dir_location + dir_name + "/"
        base_url = "https://simple.wikipedia.org"

        # Create the directory at the same location the files are written to,
        # not relative to the current working directory.
        try:
            os.makedirs(dir_loc, exist_ok=True)
        except OSError as e:
            print(e)
            return  # nothing can be saved without the output directory

        for link in links_list:
            fruit_url = base_url + link
            print("Working with %s" % fruit_url)
            soup = self.__fetch_soup(fruit_url)
            paragraphs = "\n".join(p.text for p in soup.find_all("p"))
            main_content = self.__clean_text(self.__remove_non_ascii(paragraphs))
            filename = link.rsplit("/", 1)[-1]  # last path segment, e.g. "Apple"
            try:
                # Content is pure ASCII here; the explicit encoding keeps the
                # output independent of the platform default.
                with open(dir_loc + filename + ".txt", "w", encoding="utf-8") as f:
                    f.write(main_content)
            except OSError as e:
                print("Error in writing to file", e)
                continue  # do not report a failed file as saved
            print("File \"%s.txt\" saved in %s" % (filename, dir_loc))


def main():
    """Scrape the fruit index and write the article corpus to disk.

    Uses a desktop Chromium User-Agent string for the requests.
    """
    browser_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36"
    )
    scraper = WikiFruitCorpus()
    scraper.set_headers(browser_agent)
    article_links = scraper.get_fruit_list_links()
    scraper.create_corpus(article_links)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()