# Parsha Stats Calculator - Python

# Tanach Stats Calculator
# By Jonah Lawrence
# (c) TorahCalc.com, all rights reserved
# March 21, 2021
# Results: https://torahcalc.com/info/parsha-stats/

import re
from bs4 import BeautifulSoup

# Folder holding the downloaded text files. The script builds each file path
# by plain string concatenation (x + file name), so keep the trailing slash.
x = "/path/to/x/"  # path to folder containing texts

# "Hebrew without vowels - masoretic spelling" files downloaded from https://www.mechon-mamre.org/
# Each entry is [file name, Hebrew name of the book]. The file name is appended
# to the folder path `x` to open the file and is also the key into `begins`.
# The second element is not read anywhere in the script as shown.
books = [
    ["x01.htm", "בראשית"],
    ["x02.htm", "שמות"],
    ["x03.htm", "ויקרא"],
    ["x04.htm", "במדבר"],
    ["x05.htm", "דברים"],
]

# Maps each file name in `books` to the parshios of that book.
# Each entry is (parsha name, label of the verse where the parsha starts).
# The label must match the text of a <b> element in the file exactly: the
# main loop looks it up with soup.find('b', string=label) to find the start
# of the parsha. Entries must be listed in the order they occur in the text,
# because the main loop only ever walks the paragraphs forward.
# Spaces in a parsha name are replaced with underscores when it is printed.
begins = {
    "x01.htm": [
        ("בראשית", "א,א"),
        ("נוח", "ו,ט"),
        ("לך-לך", "יב,א"),
        ("ויירא", "יח,א"),
        ("חיי שרה", "כג,א"),
        ("תולדות", "כה,יט"),
        ("וייצא", "כח,י"),
        ("וישלח", "לב,ד"),
        ("ויישב", "לז,א"),
        ("מקץ", "מא,א"),
        ("וייגש", "מד,יח"),
        ("ויחי", "מז,כח"),
    ],
    "x02.htm": [
        ("שמות", "א,א"),
        ("ואירא", "ו,ב"),
        ("בוא", "י,א"),
        ("בשלח", "יג,יז"),
        ("יתרו", "יח,א"),
        ("משפטים", "כא,א"),
        ("תרומה", "כה,א"),
        ("תצווה", "כז,כ"),
        ("כי תישא", "ל,יא"),
        ("ויקהל", "לה,א"),
        ("פקודי", "לח,כא"),
    ],
    "x03.htm": [
        ("ויקרא", "א,א"),
        ("צו", "ו,א"),
        ("שמיני", "ט,א"),
        ("תזריע", "יב,א"),
        ("מצורע", "יד,א"),
        ("אחרי מות", "טז,א"),
        ("קדושים", "יט,א"),
        ("אמור", "כא,א"),
        ("בהר סיניי", "כה,א"),
        ("בחוקותיי", "כו,ג"),
    ],
    "x04.htm": [
        ("במדבר", "א,א"),
        ("נשוא", "ד,כא"),
        ("בהעלותך", "ח,א"),
        ("שלח-לך", "יג,א"),
        ("קורח", "טז,א"),
        ("חוקת", "יט,א"),
        ("בלק", "כב,ב"),
        ("פינחס", "כה,י"),
        ("מטות", "ל,ב"),
        ("מסעי", "לג,א"),
    ],
    "x05.htm": [
        ("דברים", "א,א"),
        ("ואתחנן", "ג,כג"),
        ("עקב", "ז,יב"),
        ("ראה", "יא,כו"),
        ("שופטים", "טז,יח"),
        ("כי-תצא", "כא,י"),
        ("כי-תבוא", "כו,א"),
        ("ניצבים", "כט,ט"),
        ("וילך", "לא,א"),
        ("האזינו", "לב,א"),
        ("וזאת הברכה", "לג,א"),
    ],
}

# allows .sub() syntax for substitutions which looks nicer [https://stackoverflow.com/a/33359326/]
class Substitutable(str):
    """A ``str`` with a chainable regular-expression substitution method.

    Instances are constructed exactly like ``str`` (the constructor is
    inherited unchanged), so any object ``str()`` accepts can be wrapped.
    """

    # ``sub`` is a regular method rather than a lambda stored on each
    # instance: a per-instance closure references the string it is attached
    # to, which forms a reference cycle for every object created.
    def sub(self, fro, to):
        """Return a new Substitutable with matches of ``fro`` replaced by ``to``.

        ``fro`` is a regular-expression pattern and ``to`` the replacement;
        both are passed straight to ``re.sub``. The result is again a
        Substitutable, so calls can be chained.
        """
        return Substitutable(re.sub(fro, to, self))

# returns verses, words, and letters in a paragraph element
def count(paragraph):
    """Tally the verses, words and Hebrew letters of one paragraph.

    ``paragraph`` is the paragraph's HTML, given as a string or as any object
    whose ``str()`` form is that markup. One verse is counted per ``<b>``
    verse label found in it.

    Returns a ``(verses, words, letters)`` tuple; a paragraph that contains
    no verse label is skipped and yields ``(0, 0, 0)``.
    """
    verse_label = r"<b>[^<]{3,8}</b>"
    # wrap the markup so the substitutions below can use the .sub() helper
    text = Substitutable(paragraph)

    verses = len(re.findall(verse_label, text))
    if not verses:
        # nothing to count in a paragraph without verse labels
        return 0, 0, 0

    # (pattern, replacement) pairs, applied in this order
    cleanup_steps = (
        (verse_label, " "),  # verse labels
        (r"\{[^\{\}]*\}", " "),  # letters in curly braces
        (r"\([^\(\)]*\)", " "),  # words in parenthesis
        (r"[-;,.:\]]", " "),  # punctuation and upside down nun
        (r"<[^>]*>", ""),  # html tags
        (" +", " "),  # runs of spaces become a single space
    )
    for pattern, replacement in cleanup_steps:
        text = text.sub(pattern, replacement)

    # whatever is separated by whitespace now counts as one word
    words = len(text.split())
    # dropping every non-Hebrew character (spaces included) leaves the letters
    letters = len(text.sub(r"[^\u05D0-\u05EA]", ""))
    return verses, words, letters

# For every book: parse its HTML file, walk the paragraphs once from start to
# end, and print one line per parsha: "<name> <verses> <words> <letters>".
for book in books:
    filename = book[0]
    # read the whole file, then close it before the processing step
    with open(x + filename, "r", encoding="Windows-1255") as file:
        data = file.read()
    # parse html
    soup = BeautifulSoup(data, features="html.parser")
    # get a list of all paragraph elements
    paragraphs = soup.find_all("p")
    # parshios of this book, in order, as (name, label of first verse)
    parshios = begins[filename]
    # index of the first paragraph not yet counted; it carries over from one
    # parsha to the next, so the paragraphs are only ever walked forward.
    # (Kept separate from the book loop variable.)
    paragraph_index = 0
    # go through all parshios
    for j, parsha_entry in enumerate(parshios):
        # reset counters
        verses = 0
        words = 0
        letters = 0
        # find the verse label that opens the next parsha, and the paragraph
        # holding it; the last parsha of a book runs to the end of the file
        if j + 1 < len(parshios):
            next_name, next_label_text = parshios[j + 1]
            next_parsha_label = soup.find("b", string=next_label_text)
            if next_parsha_label is None:
                # without the label the current parsha would silently absorb
                # the rest of the book, so fail loudly instead
                raise ValueError(
                    f"verse label {next_label_text!r} (start of {next_name!r}) "
                    f"not found in {filename}"
                )
            next_parsha_p = next_parsha_label.parent
        else:
            next_parsha_label = None
            next_parsha_p = None
        # count whole paragraphs up to (not including) the boundary paragraph.
        # Identity comparison is used on purpose: "==" / "!=" between
        # BeautifulSoup tags compares their markup, not which object they are.
        while (
            paragraph_index < len(paragraphs)
            and paragraphs[paragraph_index] is not next_parsha_p
        ):
            p_verses, p_words, p_letters = count(paragraphs[paragraph_index])
            verses += p_verses
            words += p_words
            letters += p_letters
            paragraph_index += 1
        # get pesukim in the first paragraph of next parsha still in prev parsha
        if paragraph_index < len(paragraphs):
            boundary = paragraphs[paragraph_index]
            split_at = boundary.contents.index(next_parsha_label)
            before = "".join(str(node) for node in boundary.contents[:split_at])
            # keep only the part from the label onward (as a plain string) so
            # the next parsha starts counting at its own first verse
            paragraphs[paragraph_index] = "".join(
                str(node) for node in boundary.contents[split_at:]
            )
            p_verses, p_words, p_letters = count(before)
            verses += p_verses
            words += p_words
            letters += p_letters
        # print results for current parsha (spaces in the name become "_")
        parsha = parsha_entry[0].replace(" ", "_")
        print(f"{parsha} {verses} {words} {letters}")