# WEVA--Webpage English Vocabulary Assessor

"""***THIS APPLICATION IS UNDER DEVELOPMENT***"""

"""WEVA--Webpage English Vocabulary Assessor
Evaluates a source text for difficulty of vocabulary and returns qualitative evaluation, related vocabulary statistics, and list of difficult words.

Difficulty evaluated based on:

    :basic:         Basic English words extracted into a set from https://simple.wikipedia.org/wiki/Wikipedia:List_of_1000_basic_words

    :combined:      Combined lists of English words extracted into a set from https://simple.wikipedia.org/wiki/Wikipedia:Basic_English_combined_wordlist

Qualitative evaluation returned is:

    :BASIC:         At least 85% of unique words in text are found in basic words set

    :INTERMEDIATE:  Between 50% and 84% of unique words in text are found in basic words set, and at least 85% in combined words set

    :CHALLENGING:   Fewer than 50% of unique words in text are found in basic words set, and between 50% and 84% in combined words set

    :ADVANCED:      Fewer than 50% of unique words in text are found in both basic and combined word sets

The evaluation also returns percentages, as well as two sets of words:

    :intermediate words:    Unique words in text found in combined words set but not in basic words set

    :challenging words:     Unique words in text not found in basic or combined words sets
"""

import bs4
import requests
import re
from string import punctuation, ascii_uppercase, digits
import os
import sys
import pickle

# A string is only read up to its first ASCII uppercase letter or digit.
_CUTOFF_CHAR = re.compile(r"[A-Z0-9]")
# Maps every ASCII punctuation character to a space so it acts as a separator.
_PUNCTUATION_TO_SPACE = str.maketrans(punctuation, " " * len(punctuation))


def _extract_words(strings):
    """Return the set of whitespace-separated words found in the usable prefix of each string."""
    words = set()
    for text in strings:
        # Drop everything from the first uppercase letter or digit onward;
        # any partial word in front of that character is kept as-is.
        prefix = _CUTOFF_CHAR.split(text, maxsplit=1)[0]
        words.update(prefix.translate(_PUNCTUATION_TO_SPACE).split())
    return words


def get_raw_words(url, timeout=30):
    """Downloads a webpage and returns the set of words extracted from its element strings.

    Only elements with a single string child contribute text. Each string is
    read up to its first ASCII uppercase letter or digit, punctuation is
    treated as whitespace, and the remainder is split into words.

        :url:       url of webpage words are being extracted from

        :timeout:   seconds to wait for the server before giving up

        :returns:   extracted set of words

        :raises:    requests.HTTPError if the server answers with an error status;
                    other requests exceptions on connection failure or timeout
    """

    # Without a timeout requests may wait on an unresponsive server forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of scraping the words of an error page.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    raw_strings = [item.string for item in soup.find_all() if item.string]
    return _extract_words(raw_strings)


def pickle_word_set(url, word_set):
    """Pickles a word set to disk unless a pickle for the same url already exists, to minimize rescraping of same pages.

    The file is written to the current directory. Its name is the url with
    every "/" and ":" replaced by "-", followed by the ".p" extension.

        :url:       Url of page word set is from

        :word_set:  Set of words from Url

        :returns:   Status message
    """

    path = "./" + url.replace("/", "-") + ".p"
    path = path.replace(":", "-")
    # os.path.exists checks the file system; the private os._exists used
    # before only looks names up in the os module namespace, so the skip
    # branch never triggered and the file was rewritten on every call.
    if os.path.exists(path):
        return "File {} exists, skipping.".format(path)
    # Raises the interpreter-wide recursion limit before pickling (kept from
    # the original implementation).
    sys.setrecursionlimit(5000)
    with open(path, 'wb') as handle:
        pickle.dump(word_set, handle)
    return "File {} pickled successfully.".format(path)


def set_up_reference():
    """Scrapes and pickles the basic and combined reference word lists.

    For each reference page, in order (basic first, then combined), the words
    are extracted, handed to the pickling step, and the resulting status
    message is printed.

        :returns:       None
    """

    reference_urls = (
        "https://simple.wikipedia.org/wiki/Wikipedia:List_of_1000_basic_words",
        "https://simple.wikipedia.org/wiki/Wikipedia:Basic_English_combined_wordlist",
    )
    for page_url in reference_urls:
        words = get_raw_words(page_url)
        print(pickle_word_set(page_url, words))


# Runs at module level, so the reference pages are scraped and pickled
# whenever this file is executed or imported.
# NOTE(review): consider guarding this with `if __name__ == "__main__":` --
# confirm first that nothing relies on the import-time side effect.
set_up_reference()