Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """***THIS APPLICATION IS UNDER DEVELOPMENT***"""
- """WEVA--Webpage English Vocabulary Assessor
- Evaluates a source text for difficulty of vocabulary and returns qualitative evaluation, related vocabulary statistics, and list of difficult words.
- Difficulty evaluated based on:
- :basic: Basic English words extracted into a set from https://simple.wikipedia.org/wiki/Wikipedia:List_of_1000_basic_words
- :combined: Combined lists of English words extracted into a set from https://simple.wikipedia.org/wiki/Wikipedia:Basic_English_combined_wordlist
- Qualitative evaluation returned is:
- :BASIC: At least 85% of unique words in text are found in basic words set
- :INTERMEDIATE: Between 50% and 84% of unique words in text are found in basic words set, and at least 85% in combined words set
- :CHALLENGING: Fewer than 50% of unique words in text are found in basic words set, and between 50% and 84% in combined words set
- :ADVANCED: Fewer than 50% of unique words in text are found in both basic and combined word sets
- The evaluation also returns percentages, as well as two sets of words:
- :intermediate words: Unique words in text found in combined words set but not in basic words set
- :challenging words: Unique words in text not found in basic or combined words sets
- """
- import bs4
- import requests
- import re
- from string import punctuation, ascii_uppercase, digits
- import os
- import sys
- import pickle
def get_raw_words(url, timeout=30):
    """Fetch a web page and return the set of words found in its tags.

    Every tag whose ``.string`` is non-empty contributes its text. Each text
    is read character by character: punctuation is turned into a word
    separator, and reading stops at the first ASCII uppercase letter or
    digit, so only the part of the text before that character is kept.
    The kept text is then split on whitespace into words.

    :url: url of webpage words are being extracted from
    :timeout: seconds to wait for the server before giving up
    :returns: extracted set of words
    :raises requests.exceptions.RequestException: if the page cannot be
        fetched or the server answers with an error status
    """
    # Bound the wait so a stalled server cannot hang the run forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of harvesting "words" from an HTTP error page.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    tag_strings = [tag.string for tag in soup.find_all() if tag.string]
    word_set = set()
    for text in tag_strings:
        kept = []
        for char in text:
            # NOTE(review): this truncates the text at the first uppercase
            # letter or digit rather than skipping it -- behavior kept from
            # the original; confirm that truncation is what is wanted.
            if char in ascii_uppercase or char in digits:
                break
            kept.append(' ' if char in punctuation else char)
        word_set.update(''.join(kept).split())
    return word_set
def pickle_word_set(url, word_set):
    """Pickle a word set to disk unless a pickle for the same url already exists.

    The file is written to the current directory. Its name is the url with
    every "/" and ":" replaced by "-", followed by the ".p" extension.

    :url: Url of page word set is from
    :word_set: Set of words from Url
    :returns: Status message
    """
    path = "./" + url.replace("/", "-").replace(":", "-") + ".p"
    # os.path.exists() checks the filesystem; the previously used
    # os._exists() is a private helper of the os module and is not a file
    # existence test, so existing pickles were never detected.
    if os.path.exists(path):
        return "File {} exists, skipping.".format(path)
    # NOTE(review): kept from the original; this sets the interpreter-wide
    # recursion limit before pickling -- presumably meant for deeply nested
    # objects, confirm it is still needed for a plain set of words.
    sys.setrecursionlimit(5000)
    with open(path, 'wb') as handle:
        pickle.dump(word_set, handle)
    return "File {} pickled successfully.".format(path)
def set_up_reference():
    """Scrape the basic and combined word list pages and pickle each word set.

    Intended to be executed only once per installation. For each reference
    page, in order (basic first, then combined), the words are extracted,
    handed to the pickling step, and the resulting status message is printed.

    :returns: None
    """
    reference_urls = (
        "https://simple.wikipedia.org/wiki/Wikipedia:List_of_1000_basic_words",
        "https://simple.wikipedia.org/wiki/Wikipedia:Basic_English_combined_wordlist",
    )
    for page_url in reference_urls:
        words = get_raw_words(page_url)
        print(pickle_word_set(page_url, words))
if __name__ == "__main__":
    # Build the reference pickles only when this file is run as a script, so
    # that importing the module does not trigger network requests or disk
    # writes.
    set_up_reference()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement