import bs4
import os
import re
from num2words import num2words
from tqdm import tqdm

def read_file(filepath):
    content = None
    try:
        with open(filepath, 'r') as f:
            content = f.read()
    except IOError as e:
        print(e)
    return content

def append_file(file, data):
    try:
        with open(file, 'a') as f:
            f.write(data)
    except IOError as e:
        print(e)

def process(file):
    filepath = os.path.join(input_dir, file)
    content = read_file(filepath)
    soup = bs4.BeautifulSoup(content, "lxml")
    text = soup.get_text()
    # Keep only alphanumeric characters; collapse everything else to spaces.
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    text = text.split()
    # Spell out digit-only tokens (e.g. "42" -> "forty-two"); lowercase the rest.
    text = [num2words(token) if token.isdigit() else token.lower() for token in text]
    text = " ".join(text)
    append_file(output_file, text)

input_dir = "paper"
output_file = "output.txt"
with open(output_file, 'w'): pass  # empty the file
files = os.listdir(input_dir)
for file in tqdm(files):
    process(file)
print("ready in", output_file)
- """
- Answer for part 2:
- Word error rate and character error rate are common metrics for evaluation.
- An established way is benchmarks like SwitchBoard.
- In domain specific case, you would need the both the voice and the correct transcript to comparing your transcription.
- Using ready sources with both speech and transcript is great, if there are any.
- If not you can try to first convert text to speech and use it for testing, but it will introduce bias to your system.
- In the end, the real test is using it, because only this can show the real-world performance.
- the frequency of words in user generated content can be used as an input for speech-to-text decisions.
- For example, a home assistant understand “holla, hella, holo” as “hello” because it’s more common.
- the system would not correctly less frequent words, a tf-idf scheme can correct this.
- """