eyl327

Tanach Stats Calculator - Python

Apr 20th, 2020
684
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.42 KB | None | 0 0
  1. # Tanach Stats Calculator
  2. # By Jonah Lawrence
  3. # (c) TorahCalc.com, all rights reserved
  4. # April 20, 2020
  5. # Results: https://torahcalc.com/info/tanach-stats/
  6.  
  7. import re
  8. from bs4 import BeautifulSoup
  9.  
  10. x = "C:/Users/<path>/x/" # path to /x/ folder containing texts
  11.  
  12. # "Hebrew without vowels - masoretic spelling" files downloaded from https://www.mechon-mamre.org/
  13. # Shmuel A/B, Malachim A/B, Divrei Hayamim A/B, and Ezra/Nechemia were divided manually
  14. books = [
  15.     ["", "תורה"],  # Torah
  16.     ["x01.htm", "בראשית"],
  17.     ["x02.htm", "שמות"],
  18.     ["x03.htm", "ויקרא"],
  19.     ["x04.htm", "במדבר"],
  20.     ["x05.htm", "דברים"],
  21.  
  22.     ["", "נביאים"],  # Neviim
  23.     ["x06.htm", "יהושוע"],
  24.     ["x07.htm", "שופטים"],
  25.     ["x08A.htm", "שמואל א"], # contains first part of "x08.htm"
  26.     ["x08B.htm", "שמואל ב"], # contains second part of "x08.htm"
  27.     ["x09A.htm", "מלכים א"], # contains first part of "x09.htm"
  28.     ["x09B.htm", "מלכים ב"], # contains second part of "x09.htm"
  29.     ["x10.htm", "ישעיהו"],
  30.     ["x11.htm", "ירמיהו"],
  31.     ["x12.htm", "יחזקאל"],
  32.     ["", "תרי_עשר"],  # Trei Asar
  33.     ["x13.htm", "הושע"],
  34.     ["x14.htm", "יואל"],
  35.     ["x15.htm", "עמוס"],
  36.     ["x16.htm", "עובדיה"],
  37.     ["x17.htm", "יונה"],
  38.     ["x18.htm", "מיכה"],
  39.     ["x19.htm", "נחום"],
  40.     ["x20.htm", "חבקוק"],
  41.     ["x21.htm", "צפניה"],
  42.     ["x22.htm", "חגיי"],
  43.     ["x23.htm", "זכריה"],
  44.     ["x24.htm", "מלאכי"],
  45.  
  46.     ["", "כתובים"],  # Ketuvim
  47.     ["x25A.htm", "דברי הימים א"], # contains first part of "x25.htm"
  48.     ["x25B.htm", "דברי הימים ב"], # contains second part of "x25.htm"
  49.     ["x26.htm", "תהילים"],
  50.     ["x27.htm", "איוב"],
  51.     ["x28.htm", "משלי"],
  52.     ["x29.htm", "רות"],
  53.     ["x30.htm", "שיר השירים"],
  54.     ["x31.htm", "קוהלת"],
  55.     ["x32.htm", "איכה"],
  56.     ["x33.htm", "אסתר"],
  57.     ["x34.htm", "דנייאל"],
  58.     ["x35A.htm", "עזרא"], # contains first part of "x35.htm"
  59.     ["x35B.htm", "נחמיה"], # contains second part of "x35.htm"
  60. ]
  61.  
  62. # allows .sub() syntax for substitutions which looks nicer [https://stackoverflow.com/a/33359326/]
  63. class Substitutable(str):
  64.     def __new__(cls, *args, **kwargs):
  65.         newobj = str.__new__(cls, *args, **kwargs)
  66.         newobj.sub = lambda fro, to: Substitutable(re.sub(fro, to, newobj))
  67.         return newobj
  68.  
  69.  
  70. for i in range(len(books)):
  71.     # reset counters
  72.     verses = 0
  73.     words = 0
  74.     letters = 0
  75.     # print labels (rows with no filename)
  76.     if (books[i][0] == ''):
  77.         print(books[i][1])
  78.         continue
  79.     # open file
  80.     with open(x+books[i][0], 'r', encoding="Windows-1255") as file:
  81.         # get contents of file
  82.         data = file.read()
  83.         # parse html
  84.         soup = BeautifulSoup(data, features="html.parser")
  85.         # get a list of all paragraph elements
  86.         paragraphs = soup.find_all('p')
  87.         # for each paragraph
  88.         for p in paragraphs:
  89.             # apply Substitutable class for simple substitution syntax
  90.             p = Substitutable(p)
  91.             # find occurrences of verse labels in paragraph
  92.             verses = verses + len(re.findall(r'<b>[^<]{3,8}</b>', p))
  93.             # if no verses in paragraph, skip to next paragraph
  94.             if (verses < 1):
  95.                 continue
  96.             # apply modifications
  97.             p = p.sub(
  98.                 r'<b>[^<]{3,8}</b>', ' ' # remove verse labels
  99.             ).sub(
  100.                 r'\{[^\{\}]*\}', ' ' # remove letters in curly braces
  101.             ).sub(
  102.                 r'\([^\(\)]*\)', ' ' # remove words in parenthesis
  103.             ).sub(
  104.                 r'[-;,.:\]]', ' ' # remove punctuation and upside down nun
  105.             ).sub(
  106.                 r'<[^>]*>', '' # remove html tags
  107.             ).sub(
  108.                 ' +', ' ' # replace consecutive spaces with single space
  109.             )
  110.             # count number of words in paragraph
  111.             words = words + len(p.split())
  112.             # remove all non-Hebrew characters (removes spaces)
  113.             p = p.sub(r'[^\u05D0-\u05EA]', '')
  114.             # count number of letters in paragraph
  115.             letters = letters + len(p)
  116.         # print results for current book
  117.         book = re.sub(' ', '_', books[i][1])
  118.         print(book+'    ' + str(verses)+'    '+str(words)+'    '+str(letters))
Add Comment
Please, Sign In to add comment