Advertisement
eyl327

Parsha Stats Calculator - Python

Mar 21st, 2021
390
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.90 KB | None | 0 0
  1. # Tanach Stats Calculator
  2. # By Jonah Lawrence
  3. # (c) TorahCalc.com, all rights reserved
  4. # March 21, 2021
  5. # Results: https://torahcalc.com/info/parsha-stats/
  6.  
  7. import re
  8. from bs4 import BeautifulSoup
  9.  
# Folder that holds the downloaded text files. The script builds each file
# path by plain string concatenation (x + file name), so this value has to
# end with a path separator.
x = "/path/to/x/"  # path to folder containing texts

# "Hebrew without vowels - masoretic spelling" files downloaded from https://www.mechon-mamre.org/
# Each entry is [file name, Hebrew title of the book]. The file name is what
# gets opened and is also the key used to look the book up in `begins`; the
# title is not read by the code in this file.
books = [
    ["x01.htm", "בראשית"],
    ["x02.htm", "שמות"],
    ["x03.htm", "ויקרא"],
    ["x04.htm", "במדבר"],
    ["x05.htm", "דברים"],
]
  20.  
# Parsha boundaries, keyed by book file name (same names as in `books`).
# Every entry is a tuple of
#   (parsha name, text of the bold verse label on which that parsha starts).
# The label text is matched exactly against the string of a <b> element in
# the parsed file, so it must be spelled the way the source file spells it.
# Entries must stay in the order the parshios occur in the file: the main
# loop consumes paragraphs sequentially and never rewinds.
# The name is what gets printed (spaces replaced by underscores).
begins = {
    "x01.htm": [
        ("בראשית", "א,א"),
        ("נוח", "ו,ט"),
        ("לך-לך", "יב,א"),
        ("ויירא", "יח,א"),
        ("חיי שרה", "כג,א"),
        ("תולדות", "כה,יט"),
        ("וייצא", "כח,י"),
        ("וישלח", "לב,ד"),
        ("ויישב", "לז,א"),
        ("מקץ", "מא,א"),
        ("וייגש", "מד,יח"),
        ("ויחי", "מז,כח"),
    ],
    "x02.htm": [
        ("שמות", "א,א"),
        ("ואירא", "ו,ב"),
        ("בוא", "י,א"),
        ("בשלח", "יג,יז"),
        ("יתרו", "יח,א"),
        ("משפטים", "כא,א"),
        ("תרומה", "כה,א"),
        ("תצווה", "כז,כ"),
        ("כי תישא", "ל,יא"),
        ("ויקהל", "לה,א"),
        ("פקודי", "לח,כא"),
    ],
    "x03.htm": [
        ("ויקרא", "א,א"),
        ("צו", "ו,א"),
        ("שמיני", "ט,א"),
        ("תזריע", "יב,א"),
        ("מצורע", "יד,א"),
        ("אחרי מות", "טז,א"),
        ("קדושים", "יט,א"),
        ("אמור", "כא,א"),
        ("בהר סיניי", "כה,א"),
        ("בחוקותיי", "כו,ג"),
    ],
    "x04.htm": [
        ("במדבר", "א,א"),
        ("נשוא", "ד,כא"),
        ("בהעלותך", "ח,א"),
        ("שלח-לך", "יג,א"),
        ("קורח", "טז,א"),
        ("חוקת", "יט,א"),
        ("בלק", "כב,ב"),
        ("פינחס", "כה,י"),
        ("מטות", "ל,ב"),
        ("מסעי", "לג,א"),
    ],
    "x05.htm": [
        ("דברים", "א,א"),
        ("ואתחנן", "ג,כג"),
        ("עקב", "ז,יב"),
        ("ראה", "יא,כו"),
        ("שופטים", "טז,יח"),
        ("כי-תצא", "כא,י"),
        ("כי-תבוא", "כו,א"),
        ("ניצבים", "כט,ט"),
        ("וילך", "לא,א"),
        ("האזינו", "לב,א"),
        ("וזאת הברכה", "לג,א"),
    ],
}
  87.  
  88. # allows .sub() syntax for substitutions which looks nicer [https://stackoverflow.com/a/33359326/]
  89. class Substitutable(str):
  90.     def __new__(cls, *args, **kwargs):
  91.         newobj = str.__new__(cls, *args, **kwargs)
  92.         newobj.sub = lambda fro, to: Substitutable(re.sub(fro, to, newobj))
  93.         return newobj
  94.  
  95. # returns verses, words, and letters in a paragraph element
  96. def count(paragraph):
  97.     # apply Substitutable class for simple substitution syntax
  98.     p = Substitutable(paragraph)
  99.     # find occurrences of verse labels in paragraph
  100.     verses = len(re.findall(r"<b>[^<]{3,8}</b>", p))
  101.     # if no verses in paragraph, skip to next paragraph
  102.     if verses < 1:
  103.         return 0, 0, 0
  104.     # apply modifications
  105.     p = (
  106.         p.sub(r"<b>[^<]{3,8}</b>", " ")  # remove verse labels
  107.         .sub(r"\{[^\{\}]*\}", " ")  # remove letters in curly braces
  108.         .sub(r"\([^\(\)]*\)", " ")  # remove words in parenthesis
  109.         .sub(r"[-;,.:\]]", " ")  # remove punctuation and upside down nun
  110.         .sub(r"<[^>]*>", "")  # remove html tags
  111.         .sub(" +", " ")  # replace consecutive spaces with single space
  112.     )
  113.     # count number of words in paragraph
  114.     words = len(p.split())
  115.     # remove all non-Hebrew characters (removes spaces)
  116.     p = p.sub(r"[^\u05D0-\u05EA]", "")
  117.     # count number of letters in paragraph
  118.     letters = len(p)
  119.     # return
  120.     return verses, words, letters
  121.  
  122. for i in range(len(books)):
  123.     # open file
  124.     with open(x + books[i][0], "r", encoding="Windows-1255") as file:
  125.         # get contents of file
  126.         data = file.read()
  127.         # parse html
  128.         soup = BeautifulSoup(data, features="html.parser")
  129.         # get a list of all paragraph elements
  130.         paragraphs = soup.find_all("p")
  131.         # split at parshios
  132.         parshios = begins[books[i][0]]
  133.         # current paragraph number
  134.         i = 0
  135.         # go through all parshios
  136.         for j in range(len(parshios)):
  137.             # reset counters
  138.             verses = 0
  139.             words = 0
  140.             letters = 0
  141.             # get next parsha break
  142.             next_parsha_label = soup.find('b', string=parshios[j+1][1]) if j + 1 < len(parshios) else None
  143.             next_parsha_p = next_parsha_label.parent if next_parsha_label is not None else None
  144.             # for each paragraph
  145.             while i < len(paragraphs) and paragraphs[i] != next_parsha_p:
  146.                 p_verses, p_words, p_letters = count(paragraphs[i])
  147.                 verses += p_verses
  148.                 words += p_words
  149.                 letters += p_letters
  150.                 i += 1
  151.             # get pesukim in the first paragraph of next parsha still in prev parsha
  152.             if i < len(paragraphs):
  153.                 next_parsha_first_passuk = paragraphs[i].contents.index(next_parsha_label)
  154.                 before = "".join(str(x) for x in paragraphs[i].contents[:next_parsha_first_passuk])
  155.                 paragraphs[i] = "".join(str(x) for x in paragraphs[i].contents[next_parsha_first_passuk:])
  156.                 p_verses, p_words, p_letters = count(before)
  157.                 verses += p_verses
  158.                 words += p_words
  159.                 letters += p_letters
  160.             # print results for current parsha
  161.             parsha = re.sub(" ", "_", parshios[j][0])
  162.             print(parsha + " " + str(verses) + " " + str(words) + " " + str(letters))
  163.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement