Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #Python 3
- import os
- import ssl
- import re
- from urllib.request import urlopen, Request
- from bs4 import BeautifulSoup
class WikiFruitCorpus:
    """Build a plain-text corpus of fruit articles from Simple English Wikipedia.

    The scraper reads the "List of fruits" page, collects the ``/wiki/``
    links found on it, downloads each linked article and stores the text of
    its ``<p>`` elements as one ``.txt`` file per article inside a
    ``fruit_corpus_simplewiki`` directory located next to this source file.
    """

    # URL of the index page; the real value is assigned in __init__.
    __list_of_fruit_url = ""
    # HTTP headers sent with every request; replaced by set_headers().
    __headers = ""
    # Directory that contains this source file, with a trailing slash.
    __dir_location = os.path.dirname(os.path.realpath(__file__)) + "/"
    # SECURITY WARNING: this disables TLS certificate verification for every
    # urllib HTTPS request in the whole process, not only for this class.
    # It is kept because existing behaviour depends on it, but the proper fix
    # is to repair the local CA certificate store instead.
    ssl._create_default_https_context = ssl._create_unverified_context

    # Citation markers such as "[12]" that Wikipedia appends to sentences.
    _CITATION_TAG = re.compile(r"\[\d+\]")

    def __init__(self, header="Mozilla/5.0"):
        """Create a scraper.

        Args:
            header: Value of the ``User-Agent`` header sent with requests.
        """
        self.__list_of_fruit_url = "https://simple.wikipedia.org/wiki/List_of_fruits"
        self.set_headers(header)

    def set_headers(self, header):
        """Replace the request headers with a single ``User-Agent`` entry.

        Args:
            header: Value of the ``User-Agent`` header.
        """
        self.__headers = {"User-Agent": header}

    def get_fruit_list_links(self):
        """Return the article links scraped from the fruit index page.

        Every ``<a>`` whose ``href`` starts with ``/wiki/`` is collected (with
        non-ASCII characters removed). Only positions 2 through 86 of that
        list are returned; the slice is hard-coded for the layout of the
        index page.

        Returns:
            A list of site-relative links such as ``"/wiki/Apple"``.
        """
        req = Request(self.__list_of_fruit_url, headers=self.__headers)
        # Parse while the response is open, then release the connection.
        with urlopen(req) as page:
            soup = BeautifulSoup(page, "html.parser")
        hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
        fruit_list_links = [
            self.__remove_non_ascii(href)
            for href in hrefs
            if str(href).startswith("/wiki/")
        ]
        return fruit_list_links[2:87]

    @staticmethod
    def __remove_non_ascii(text):
        """Return *text* with every character outside 7-bit ASCII removed."""
        return "".join(ch for ch in text if ord(ch) < 128)

    @staticmethod
    def _clean_text(text):
        """Strip citation markers and put each sentence on its own line.

        Markers like ``[3]`` are removed *before* sentences are split;
        otherwise a marker sitting between a full stop and the following
        space (``"fruit.[3] It"``) would hide the sentence boundary.
        """
        without_tags = WikiFruitCorpus._CITATION_TAG.sub("", text)
        return without_tags.replace(". ", ".\n")

    def create_corpus(self, links_list):
        """Download every linked article and save its paragraph text.

        Args:
            links_list: Site-relative article links (e.g. ``"/wiki/Apple"``),
                typically the result of :meth:`get_fruit_list_links`.

        The output directory is created if it is missing. A failure to create
        it is reported and aborts the run; a failure to write a single file
        is reported and the remaining links are still processed. Network
        errors raised by ``urlopen`` are not caught here.
        """
        dir_name = "fruit_corpus_simplewiki"
        dir_loc = self.__dir_location + dir_name + "/"
        base_url = "https://simple.wikipedia.org"

        # Create the directory at the same path the files are written to,
        # independent of the current working directory.
        try:
            os.makedirs(dir_loc, exist_ok=True)
        except OSError as e:
            print(e)
            return

        for link in links_list:
            fruit_url = base_url + link
            print("Working with %s" % fruit_url)
            # Send the configured User-Agent, as the index request does.
            req = Request(fruit_url, headers=self.__headers)
            with urlopen(req) as page:
                soup = BeautifulSoup(page.read(), "html.parser")
            paragraphs = "\n".join(p.text for p in soup.find_all("p"))
            main_content = self._clean_text(self.__remove_non_ascii(paragraphs))
            # Last path segment of the link; tolerant of links without "/".
            filename = str(link).rsplit("/", 1)[-1]
            try:
                # Text mode: the content is a str, so "wb" would raise TypeError.
                with open(dir_loc + filename + ".txt", "w") as f:
                    f.write(main_content)
            except IOError as e:
                print("Error in writing to file", e)
            else:
                # Only claim success when the write actually succeeded.
                print("File \"%s.txt\" saved in %s" % (filename, dir_loc))
def main():
    """Scrape the fruit index page and write the article corpus to disk."""
    user_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36"
    )
    scraper = WikiFruitCorpus()
    # Replace the default User-Agent with a full browser-style string.
    scraper.set_headers(user_agent)
    article_links = scraper.get_fruit_list_links()
    scraper.create_corpus(article_links)


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement