Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3.6
- import pdb
- import random
- import argparse
- from pathlib import Path
class WeedLMAO:
    """Convert Project Gutenberg text dumps into Gematria (runic) text.

    Constructing an instance does all the work: either the single file
    given by ``args.single`` is converted, or every regular file directly
    inside ``args.source`` is. Each result is written to ``args.target`` as
    ``<name before the first '-'>.gematria``.
    """

    # Latin letters/digraphs in rune order; entry i pairs with gematria[i].
    latin_fragments = [
        'F', 'U', 'TH', 'O', 'R', 'C', 'G', 'W', 'H', 'N', 'I',
        'J', 'EO', 'P', 'X', 'S', 'T', 'B', 'E', 'M', 'L', 'NG',
        'OE', 'D', 'A', 'AE', 'Y', 'IA', 'EA',
    ]
    gematria = [
        'ᚠ', 'ᚢ', 'ᚦ', 'ᚩ', 'ᚱ', 'ᚳ', 'ᚷ', 'ᚹ', 'ᚻ', 'ᚾ',
        'ᛁ', 'ᛄ', 'ᛇ', 'ᛈ', 'ᛉ', 'ᛋ', 'ᛏ', 'ᛒ', 'ᛖ', 'ᛗ',
        'ᛚ', 'ᛝ', 'ᛟ', 'ᛞ', 'ᚪ', 'ᚫ', 'ᚣ', 'ᛡ', 'ᛠ',
    ]
    latin_to_gematria = dict(zip(latin_fragments, gematria))
    # Letters and fragments without a rune of their own reuse an existing one.
    latin_to_gematria.update({
        'K': 'ᚳ', 'Q': 'ᚳ', 'Z': 'ᛋ', 'ING': 'ᛝ', 'IO': 'ᛡ',
        'V': 'ᚢ'
    })

    # Lines dropped starting at the "*** START OF TH..." marker line.
    gutenberg_header_jump = 30
    # Lines dropped immediately before the "*** END OF TH..." marker line.
    gutenberg_footer_jump = 60

    # Two-letter fragments that map to a single rune. Upper case because the
    # text is upper-cased before matching.
    _digraphs = frozenset(['TH', 'EO', 'NG', 'OE', 'AE', 'IA', 'IO', 'EA'])

    def __init__(self, args):
        """Run the conversion described by the parsed command-line *args*.

        Args:
            args: Namespace with ``source`` and ``target`` (folder paths) and
                ``single`` (an optional path to one file).
        """
        self.root = args.source
        self.target = args.target
        if args.single:
            # Single-file mode: convert only that file, not the whole folder.
            self.process(args.single)
            return
        for f in self.root.glob('*'):
            if f.is_dir():
                continue
            self.process(f)

    def process(self, f):
        """Clean up, translate and write out one source file.

        Args:
            f: Path of the text file to convert.
        """
        lines = self.cleanup(self._read_lines(f))
        # The lines still carry their own newlines, so join without a separator.
        runetext = self.translate_to_gematria("".join(lines))
        new_filename = "%s.gematria" % f.name.split("-")[0]
        new_path = self.target / new_filename
        print("%s ---> %s" % (f, new_path))
        self.target.mkdir(parents=True, exist_ok=True)
        # Runes are not representable in many locale encodings; force UTF-8.
        with new_path.open('w', encoding='utf-8') as fd:
            fd.write(runetext)

    @staticmethod
    def _read_lines(path):
        """Return the lines of *path*, decoding as UTF-8 or else Latin-1."""
        try:
            with path.open(encoding='utf-8') as fd:
                return fd.readlines()
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a character, so this cannot fail.
            with path.open(encoding='latin-1') as fd:
                return fd.readlines()

    def cleanup(self, lines):
        """Strip the Project Gutenberg header and footer from *lines*.

        Header and footer sections are indicated by three stars at the
        beginning of a line. If they are left in, downstream analysis ends
        up treating licence boilerplate as plaintext. Besides the marker
        lines themselves, a margin of neighbouring lines is removed too,
        because unrelated material (glossaries, credits, ...) tends to sit
        right after the header or right before the footer.

        Everything up to ``gutenberg_header_jump`` lines from the start
        marker is dropped, and everything from ``gutenberg_footer_jump``
        lines before the end marker onwards is dropped. A missing marker
        leaves that side untouched. Whitespace inside the returned lines is
        preserved; spaces are only ignored while matching the markers.

        Args:
            lines: List of text lines.

        Returns:
            The remaining lines, as a list.
        """
        start = self.find_gutenberg_start_header(lines)
        # Compare against None: a marker on the very first line has index 0.
        if start is not None:
            lines = lines[start + self.gutenberg_header_jump:]
        end = self.find_gutenberg_end_header(lines)
        if end is not None:
            # Clamp at 0 so a marker near the top cannot become a negative
            # (counted-from-the-end) slice bound.
            lines = lines[:max(end - self.gutenberg_footer_jump, 0)]
        return lines

    @staticmethod
    def _find_marker(lines, marker):
        """Return the index of the first line starting with *marker*.

        The comparison ignores case and spaces. Returns None when no line
        matches.
        """
        for i, line in enumerate(lines):
            if line.upper().replace(" ", "").startswith(marker):
                return i
        return None

    def find_gutenberg_start_header(self, lines):
        """Return the index of the "*** START OF TH..." line, or None."""
        return self._find_marker(lines, '***STARTOFTH')

    def find_gutenberg_end_header(self, lines):
        """Return the index of the "*** END OF TH..." line, or None."""
        return self._find_marker(lines, '***ENDOFTH')

    def translate_to_gematria(self, t):
        """Transliterate the Latin text *t* into Gematria runes.

        The text is upper-cased and scanned left to right. At each position
        ``ING`` is tried first, then the two-letter fragments, then the
        single letter. Characters with no rune (digits, punctuation,
        whitespace, ...) are copied through unchanged.

        Args:
            t: Text to transliterate.

        Returns:
            The transliterated string.
        """
        t = t.upper()
        ltg = self.latin_to_gematria
        digraphs = self._digraphs
        pieces = []
        pos = 0
        end = len(t)
        while pos < end:
            pair = t[pos:pos + 2]
            if t.startswith('ING', pos):
                pieces.append(ltg['ING'])
                pos += 3
            elif pair in digraphs:
                pieces.append(ltg[pair])
                pos += 2
            else:
                char = t[pos]
                pieces.append(ltg.get(char, char))
                pos += 1
        return "".join(pieces)
def main():
    """Parse the command line and run the converter.

    Options: ``-s/--source`` (required folder to traverse), ``-t/--target``
    (required folder for the converted files) and ``-f/--single`` (optional
    single file). All values are parsed as ``Path`` objects and handed to
    ``WeedLMAO``.
    """
    cli = argparse.ArgumentParser()

    # The two mandatory folder options differ only in flags and help text.
    folder_options = (
        ("-s", "--source", "Selects the folder to traverse"),
        ("-t", "--target",
         "Selects folder to dump collected and converted files into"),
    )
    for short_flag, long_flag, description in folder_options:
        cli.add_argument(
            short_flag, long_flag, required=True, type=Path, help=description
        )

    cli.add_argument(
        "-f", "--single", type=Path, help="Select single file for analysis"
    )

    WeedLMAO(cli.parse_args())
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement