# Tanach Stats Calculator - Python

# Tanach Stats Calculator
# By Jonah Lawrence
# (c) TorahCalc.com, all rights reserved
# April 20, 2020
# Results: https://torahcalc.com/info/tanach-stats/

import re
from bs4 import BeautifulSoup

# Folder holding the downloaded text files. Each filename from `books` is
# appended to this string as-is, so it must end with a path separator.
x = "C:/Users/<path>/x/" # path to /x/ folder containing texts

# "Hebrew without vowels - masoretic spelling" files downloaded from https://www.mechon-mamre.org/
# Shmuel A/B, Melachim A/B, Divrei Hayamim A/B, and Ezra/Nechemia were divided manually
#
# Each row is [filename, Hebrew title]. The filename is relative to `x`.
# A row whose filename is the empty string is a section heading: it has no
# file to read and its title is only printed as a label.
# Titles are printed in the results with spaces replaced by underscores.
books = [
    ["", "תורה"],  # Torah
    ["x01.htm", "בראשית"],
    ["x02.htm", "שמות"],
    ["x03.htm", "ויקרא"],
    ["x04.htm", "במדבר"],
    ["x05.htm", "דברים"],

    ["", "נביאים"],  # Neviim
    ["x06.htm", "יהושוע"],
    ["x07.htm", "שופטים"],
    ["x08A.htm", "שמואל א"], # contains first part of "x08.htm"
    ["x08B.htm", "שמואל ב"], # contains second part of "x08.htm"
    ["x09A.htm", "מלכים א"], # contains first part of "x09.htm"
    ["x09B.htm", "מלכים ב"], # contains second part of "x09.htm"
    ["x10.htm", "ישעיהו"],
    ["x11.htm", "ירמיהו"],
    ["x12.htm", "יחזקאל"],
    ["", "תרי_עשר"],  # Trei Asar
    ["x13.htm", "הושע"],
    ["x14.htm", "יואל"],
    ["x15.htm", "עמוס"],
    ["x16.htm", "עובדיה"],
    ["x17.htm", "יונה"],
    ["x18.htm", "מיכה"],
    ["x19.htm", "נחום"],
    ["x20.htm", "חבקוק"],
    ["x21.htm", "צפניה"],
    ["x22.htm", "חגיי"],
    ["x23.htm", "זכריה"],
    ["x24.htm", "מלאכי"],

    ["", "כתובים"],  # Ketuvim
    ["x25A.htm", "דברי הימים א"], # contains first part of "x25.htm"
    ["x25B.htm", "דברי הימים ב"], # contains second part of "x25.htm"
    ["x26.htm", "תהילים"],
    ["x27.htm", "איוב"],
    ["x28.htm", "משלי"],
    ["x29.htm", "רות"],
    ["x30.htm", "שיר השירים"],
    ["x31.htm", "קוהלת"],
    ["x32.htm", "איכה"],
    ["x33.htm", "אסתר"],
    ["x34.htm", "דנייאל"],
    ["x35A.htm", "עזרא"], # contains first part of "x35.htm"
    ["x35B.htm", "נחמיה"], # contains second part of "x35.htm"
]

# allows .sub() syntax for substitutions which looks nicer [https://stackoverflow.com/a/33359326/]
class Substitutable(str):
    """A ``str`` subclass with a chainable regular-expression ``sub`` method.

    Instances are built exactly like ``str`` (same constructor arguments).
    Because ``sub`` returns another ``Substitutable``, several substitutions
    can be chained: ``text.sub(a, b).sub(c, d)``.

    ``sub`` is a regular method rather than a per-instance lambda, so an
    instance does not hold a closure that refers back to itself (which would
    form a reference cycle and leave a ``sub`` entry in every instance
    ``__dict__``).
    """

    def sub(self, fro, to):
        """Return a new ``Substitutable`` with matches of ``fro`` replaced by ``to``.

        Args:
            fro: Pattern to search for, as accepted by ``re.sub`` (a pattern
                string or a compiled pattern).
            to: Replacement, as accepted by ``re.sub``.

        Returns:
            The substituted text wrapped in a new ``Substitutable``; the
            original string is left unchanged.
        """
        return Substitutable(re.sub(fro, to, self))


# Patterns are compiled once here, outside the loops, because every paragraph
# of every book is run through the same set.

# A verse label: a <b> element whose content is 3 to 8 characters long.
_VERSE_LABEL = re.compile(r'<b>[^<]{3,8}</b>')

# Clean-up steps applied, in this order, to the HTML of each paragraph.
# The verse-label step relies on the <b> markup still being present, so it
# runs before the step that strips html tags.
_CLEANUP_STEPS = (
    (_VERSE_LABEL, ' '),                 # remove verse labels
    (re.compile(r'\{[^\{\}]*\}'), ' '),  # remove letters in curly braces
    (re.compile(r'\([^\(\)]*\)'), ' '),  # remove words in parentheses
    (re.compile(r'[-;,.:\]]'), ' '),     # remove punctuation and upside down nun
    (re.compile(r'<[^>]*>'), ''),        # remove html tags
)

# Anything outside U+05D0..U+05EA (this includes spaces) is not counted as a letter.
_NON_HEBREW = re.compile(r'[^\u05D0-\u05EA]')

# For every book print one line: title, verse count, word count, letter count.
for filename, title in books:
    # rows with no filename are section labels: print them and move on
    if filename == '':
        print(title)
        continue

    # read the whole file, then close it before parsing
    with open(x + filename, 'r', encoding="Windows-1255") as file:
        data = file.read()
    soup = BeautifulSoup(data, features="html.parser")

    # counters for the current book
    verses = 0
    words = 0
    letters = 0

    for p in soup.find_all('p'):
        # Substitutable turns the element into its HTML text and gives it .sub()
        text = Substitutable(p)
        # add the verse labels found in this paragraph to the running total
        verses += len(_VERSE_LABEL.findall(text))
        # `verses` is the running total for the book, so this skips only the
        # paragraphs that come before the first verse label. Once a label has
        # been seen, every later paragraph is counted, with or without labels.
        # NOTE(review): the earlier comment here read "if no verses in
        # paragraph, skip"; behaviour is kept as it was -- confirm that
        # counting label-less paragraphs after the first verse is intended.
        if verses < 1:
            continue
        for pattern, replacement in _CLEANUP_STEPS:
            text = text.sub(pattern, replacement)
        # split() treats any run of whitespace as one separator, so runs of
        # spaces do not need to be collapsed first
        words += len(text.split())
        # letters are whatever remains once all non-Hebrew characters are removed
        letters += len(_NON_HEBREW.sub('', text))

    # print results for current book (spaces in the title become underscores)
    print('    '.join([title.replace(' ', '_'), str(verses), str(words), str(letters)]))