Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Tanach Stats Calculator
# By Jonah Lawrence
# (c) TorahCalc.com, all rights reserved
# March 21, 2021
# Results: https://torahcalc.com/info/parsha-stats/
import os
import re

from bs4 import BeautifulSoup
# path to folder containing texts
x = "/path/to/x/"

# "Hebrew without vowels - masoretic spelling" files downloaded from https://www.mechon-mamre.org/
# Each entry is [file name inside the folder above, Hebrew name of the book].
books = [
    ["x01.htm", "בראשית"],
    ["x02.htm", "שמות"],
    ["x03.htm", "ויקרא"],
    ["x04.htm", "במדבר"],
    ["x05.htm", "דברים"],
]

# For every file: the parshios it contains, in reading order, as
# (parsha name, text of the <b> verse label at which the parsha begins).
# The labels look like "chapter,verse" in Hebrew numerals -- presumably that
# is what they are; the script only ever matches them as opaque strings.
begins = {
    "x01.htm": [
        ("בראשית", "א,א"),
        ("נוח", "ו,ט"),
        ("לך-לך", "יב,א"),
        ("ויירא", "יח,א"),
        ("חיי שרה", "כג,א"),
        ("תולדות", "כה,יט"),
        ("וייצא", "כח,י"),
        ("וישלח", "לב,ד"),
        ("ויישב", "לז,א"),
        ("מקץ", "מא,א"),
        ("וייגש", "מד,יח"),
        ("ויחי", "מז,כח"),
    ],
    "x02.htm": [
        ("שמות", "א,א"),
        ("ואירא", "ו,ב"),
        ("בוא", "י,א"),
        ("בשלח", "יג,יז"),
        ("יתרו", "יח,א"),
        ("משפטים", "כא,א"),
        ("תרומה", "כה,א"),
        ("תצווה", "כז,כ"),
        ("כי תישא", "ל,יא"),
        ("ויקהל", "לה,א"),
        ("פקודי", "לח,כא"),
    ],
    "x03.htm": [
        ("ויקרא", "א,א"),
        ("צו", "ו,א"),
        ("שמיני", "ט,א"),
        ("תזריע", "יב,א"),
        ("מצורע", "יד,א"),
        ("אחרי מות", "טז,א"),
        ("קדושים", "יט,א"),
        ("אמור", "כא,א"),
        ("בהר סיניי", "כה,א"),
        ("בחוקותיי", "כו,ג"),
    ],
    "x04.htm": [
        ("במדבר", "א,א"),
        ("נשוא", "ד,כא"),
        ("בהעלותך", "ח,א"),
        ("שלח-לך", "יג,א"),
        ("קורח", "טז,א"),
        ("חוקת", "יט,א"),
        ("בלק", "כב,ב"),
        ("פינחס", "כה,י"),
        ("מטות", "ל,ב"),
        ("מסעי", "לג,א"),
    ],
    "x05.htm": [
        ("דברים", "א,א"),
        ("ואתחנן", "ג,כג"),
        ("עקב", "ז,יב"),
        ("ראה", "יא,כו"),
        ("שופטים", "טז,יח"),
        ("כי-תצא", "כא,י"),
        ("כי-תבוא", "כו,א"),
        ("ניצבים", "כט,ט"),
        ("וילך", "לא,א"),
        ("האזינו", "לב,א"),
        ("וזאת הברכה", "לג,א"),
    ],
}
# allows .sub() syntax for substitutions which looks nicer [https://stackoverflow.com/a/33359326/]
class Substitutable(str):
    """A ``str`` that can be transformed through chained regex substitutions.

    Construction takes exactly the arguments ``str`` takes. ``sub`` is an
    ordinary method rather than a closure stored on each instance, so
    instances carry no self-referencing attribute.
    """

    def sub(self, fro, to):
        """Return a new Substitutable with every match of ``fro`` replaced by ``to``.

        ``fro`` is a regular-expression pattern and ``to`` the replacement,
        both passed straight to ``re.sub``. The receiver is left unchanged,
        and the result is again a Substitutable so calls can be chained.
        """
        return Substitutable(re.sub(fro, to, self))
# verse labels are <b> elements holding 3 to 8 characters
_VERSE_LABEL_RE = re.compile(r"<b>[^<]{3,8}</b>")

# clean-up substitutions, applied in this order before words are counted
_CLEANUP_STEPS = (
    (_VERSE_LABEL_RE, " "),  # remove verse labels
    (re.compile(r"\{[^\{\}]*\}"), " "),  # remove letters in curly braces
    (re.compile(r"\([^\(\)]*\)"), " "),  # remove words in parenthesis
    (re.compile(r"[-;,.:\]]"), " "),  # remove punctuation and upside down nun
    (re.compile(r"<[^>]*>"), ""),  # remove html tags
)

# everything outside the Hebrew letter range U+05D0..U+05EA (spaces included)
_NON_HEBREW_RE = re.compile(r"[^\u05D0-\u05EA]")


# returns verses, words, and letters in a paragraph element
def count(paragraph):
    """Count the verses, words and letters in one paragraph of markup.

    ``paragraph`` may be a string or any object whose ``str()`` is the
    paragraph's HTML. Verses are the number of verse labels found. Words are
    the whitespace-separated tokens left after labels, brace and parenthesis
    groups, punctuation and tags are stripped; a hyphen counts as a word
    break. Letters are the Hebrew letters remaining in that cleaned text.

    Returns a ``(verses, words, letters)`` tuple. A paragraph with no verse
    label yields ``(0, 0, 0)`` whatever else it contains.
    """
    text = str(paragraph)

    # find occurrences of verse labels in paragraph
    verses = len(_VERSE_LABEL_RE.findall(text))
    # if no verses in paragraph, nothing in it is counted
    if verses < 1:
        return 0, 0, 0

    # apply modifications
    for pattern, replacement in _CLEANUP_STEPS:
        text = pattern.sub(replacement, text)

    # split() already treats any run of whitespace as a single separator
    words = len(text.split())
    # whatever survives removal of non-Hebrew characters is a letter
    letters = len(_NON_HEBREW_RE.sub("", text))

    return verses, words, letters
# Walk each book, split its paragraphs at the parsha boundaries and print one
# line per parsha: "<name> <verses> <words> <letters>".
for filename, _book_name in books:
    # read the whole file, then release the handle before parsing
    with open(os.path.join(x, filename), "r", encoding="Windows-1255") as file:
        data = file.read()
    # parse html
    soup = BeautifulSoup(data, features="html.parser")
    # get a list of all paragraph elements
    paragraphs = soup.find_all("p")
    # parsha names and starting verse labels for this book
    parshios = begins[filename]
    # index of the next paragraph not yet counted; carried from parsha to parsha
    para_idx = 0
    # go through all parshios
    for j, (parsha_name, _start_label) in enumerate(parshios):
        # reset counters
        verses = words = letters = 0
        # find the <b> label that opens the next parsha (there is none after the last)
        next_parsha_label = None
        next_parsha_p = None
        if j + 1 < len(parshios):
            next_start = parshios[j + 1][1]
            next_parsha_label = soup.find("b", string=next_start)
            if next_parsha_label is None:
                # without the boundary the rest of the book would silently be
                # attributed to this parsha, so stop instead of printing wrong totals
                raise ValueError(
                    "verse label %r not found in %s" % (next_start, filename)
                )
            next_parsha_p = next_parsha_label.parent
        # count whole paragraphs up to the one holding the next parsha's first verse;
        # identity is what is meant here, not bs4's value-based tag equality
        while para_idx < len(paragraphs) and paragraphs[para_idx] is not next_parsha_p:
            p_verses, p_words, p_letters = count(paragraphs[para_idx])
            verses += p_verses
            words += p_words
            letters += p_letters
            para_idx += 1
        # get pesukim in the first paragraph of next parsha still in prev parsha
        if para_idx < len(paragraphs):
            boundary = paragraphs[para_idx]
            split_at = boundary.contents.index(next_parsha_label)
            before = "".join(str(node) for node in boundary.contents[:split_at])
            # the remainder stays in the list as plain markup so the next parsha counts it.
            # NOTE: the slot now holds a str, so a second parsha starting in this same
            # paragraph would not be detected.
            paragraphs[para_idx] = "".join(
                str(node) for node in boundary.contents[split_at:]
            )
            p_verses, p_words, p_letters = count(before)
            verses += p_verses
            words += p_words
            letters += p_letters
        # print results for current parsha (spaces in the name become underscores)
        print(parsha_name.replace(" ", "_"), verses, words, letters)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement