Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Tanach Stats Calculator
- # By Jonah Lawrence
- # (c) TorahCalc.com, all rights reserved
- # April 20, 2020
- # Results: https://torahcalc.com/info/tanach-stats/
- import re
- from bs4 import BeautifulSoup
# Path to the /x/ folder containing the texts (must end with a slash, since
# file names are appended to it directly).
x = "C:/Users/<path>/x/"

# "Hebrew without vowels - masoretic spelling" files downloaded from https://www.mechon-mamre.org/
# Shmuel A/B, Malachim A/B, Divrei Hayamim A/B, and Ezra/Nechemia were divided manually
#
# Each row is [file name, Hebrew title]. A row with an empty file name is a
# section label that is printed as-is instead of being processed as a book.
books = [
    ["", "תורה"],  # Torah
    ["x01.htm", "בראשית"],
    ["x02.htm", "שמות"],
    ["x03.htm", "ויקרא"],
    ["x04.htm", "במדבר"],
    ["x05.htm", "דברים"],
    ["", "נביאים"],  # Neviim
    ["x06.htm", "יהושוע"],
    ["x07.htm", "שופטים"],
    ["x08A.htm", "שמואל א"],  # contains first part of "x08.htm"
    ["x08B.htm", "שמואל ב"],  # contains second part of "x08.htm"
    ["x09A.htm", "מלכים א"],  # contains first part of "x09.htm"
    ["x09B.htm", "מלכים ב"],  # contains second part of "x09.htm"
    ["x10.htm", "ישעיהו"],
    ["x11.htm", "ירמיהו"],
    ["x12.htm", "יחזקאל"],
    ["", "תרי_עשר"],  # Trei Asar
    ["x13.htm", "הושע"],
    ["x14.htm", "יואל"],
    ["x15.htm", "עמוס"],
    ["x16.htm", "עובדיה"],
    ["x17.htm", "יונה"],
    ["x18.htm", "מיכה"],
    ["x19.htm", "נחום"],
    ["x20.htm", "חבקוק"],
    ["x21.htm", "צפניה"],
    ["x22.htm", "חגיי"],
    ["x23.htm", "זכריה"],
    ["x24.htm", "מלאכי"],
    ["", "כתובים"],  # Ketuvim
    ["x25A.htm", "דברי הימים א"],  # contains first part of "x25.htm"
    ["x25B.htm", "דברי הימים ב"],  # contains second part of "x25.htm"
    ["x26.htm", "תהילים"],
    ["x27.htm", "איוב"],
    ["x28.htm", "משלי"],
    ["x29.htm", "רות"],
    ["x30.htm", "שיר השירים"],
    ["x31.htm", "קוהלת"],
    ["x32.htm", "איכה"],
    ["x33.htm", "אסתר"],
    ["x34.htm", "דנייאל"],
    ["x35A.htm", "עזרא"],  # contains first part of "x35.htm"
    ["x35B.htm", "נחמיה"],  # contains second part of "x35.htm"
]
# Allows chained .sub() calls for substitutions, which reads nicer than nested
# re.sub() calls [https://stackoverflow.com/a/33359326/]
class Substitutable(str):
    """A ``str`` subclass whose regex substitutions can be chained.

    Construction accepts exactly what ``str`` accepts; the only addition is
    the ``sub`` method.
    """

    def sub(self, fro, to):
        """Return a new Substitutable with matches of ``fro`` replaced by ``to``.

        ``fro`` and ``to`` are handed to ``re.sub`` unchanged as the pattern
        and the replacement. The receiver itself is not modified, so calls
        can be chained: ``s.sub(a, b).sub(c, d)``.
        """
        # A regular method (rather than a lambda stored on each instance)
        # avoids creating an instance <-> closure reference cycle for every
        # intermediate string produced by a chain of substitutions.
        return Substitutable(re.sub(fro, to, self))
# Regex for a verse label as it appears in the paragraph HTML: a <b> element
# holding 3 to 8 characters. It is used both to count verses and to strip the
# labels from the text, so it is defined once to keep the two uses in sync.
verse_label = r'<b>[^<]{3,8}</b>'

# Print one line per book: "<title> <verses> <words> <letters>".
for filename, title in books:
    # rows with no filename are section labels: print them and move on
    if not filename:
        print(title)
        continue

    # reset counters for this book
    verses = 0
    words = 0
    letters = 0

    # get contents of file (decoded as Windows-1255)
    with open(x + filename, 'r', encoding="Windows-1255") as file:
        data = file.read()

    # parse html
    soup = BeautifulSoup(data, features="html.parser")

    # for each paragraph element
    for p in soup.find_all('p'):
        # apply Substitutable class for simple substitution syntax
        # (this converts the parsed element back to its HTML text)
        p = Substitutable(p)

        # add occurrences of verse labels in this paragraph to the book total
        verses += len(re.findall(verse_label, p))

        # Skip paragraphs that come before the first verse label of the book.
        # NOTE(review): this tests the running total, not this paragraph's own
        # count, so once any verse has been seen every later paragraph is
        # counted even if it has no label of its own. Presumably that keeps
        # text that continues a verse across a paragraph break -- confirm
        # against the input files before changing it.
        if verses < 1:
            continue

        # apply modifications
        p = p.sub(
            verse_label, ' '  # remove verse labels
        ).sub(
            r'\{[^\{\}]*\}', ' '  # remove letters in curly braces
        ).sub(
            r'\([^\(\)]*\)', ' '  # remove words in parenthesis
        ).sub(
            r'[-;,.:\]]', ' '  # remove punctuation and upside down nun
        ).sub(
            r'<[^>]*>', ''  # remove html tags
        ).sub(
            ' +', ' '  # replace consecutive spaces with single space
        )

        # count number of words in paragraph
        words += len(p.split())

        # remove all non-Hebrew characters (this also removes the spaces)
        p = p.sub(r'[^\u05D0-\u05EA]', '')

        # count number of letters in paragraph
        letters += len(p)

    # print results for current book (spaces in the title become underscores
    # so each output line has exactly four space-separated fields)
    book = title.replace(' ', '_')
    print(f'{book} {verses} {words} {letters}')
Add Comment
Please, Sign In to add comment