Not a member of Pastebin yet?
                        Sign Up,
                        it unlocks many cool features!                    
                - #!/usr/bin/env python3.6
 - import pdb
 - import random
 - import argparse
 - from pathlib import Path
 - class WeedLMAO:
 - latin_fragments = [
 - 'F', 'U', 'TH', 'O', 'R', 'C', 'G', 'W', 'H', 'N', 'I',
 - 'J', 'EO', 'P', 'X', 'S', 'T', 'B', 'E', 'M', 'L', 'NG',
 - 'OE', 'D', 'A', 'AE', 'Y', 'IA', 'EA',
 - ]
 - gematria = [
 - 'ᚠ', 'ᚢ', 'ᚦ', 'ᚩ', 'ᚱ', 'ᚳ', 'ᚷ', 'ᚹ', 'ᚻ', 'ᚾ',
 - 'ᛁ', 'ᛄ', 'ᛇ', 'ᛈ', 'ᛉ', 'ᛋ', 'ᛏ', 'ᛒ', 'ᛖ', 'ᛗ',
 - 'ᛚ', 'ᛝ', 'ᛟ', 'ᛞ', 'ᚪ', 'ᚫ', 'ᚣ', 'ᛡ', 'ᛠ',
 - ]
 - latin_to_gematria = dict(zip(latin_fragments, gematria))
 - latin_to_gematria.update({
 - 'K': 'ᚳ', 'Q': 'ᚳ', 'Z': 'ᛋ', 'ING': 'ᛝ', 'IO': 'ᛡ',
 - 'V': 'ᚢ'
 - })
 - gutenberg_header_jump = 30
 - def __init__(self, args):
 - self.root = args.source
 - self.target = args.target
 - if args.single:
 - self.process(args.single)
 - # file_list = list(self.root.glob('*'))
 - # for f in [random.choice(file_list)]:
 - c = 0
 - for f in self.root.glob('*'):
 - if f.is_dir():
 - continue
 - self.process(f)
 - def process(self, f): # central hub for our processing
 - with f.open() as fd:
 - try:
 - lines = fd.readlines()
 - except Exception as e:
 - pdb.post_mortem()
 - lines = self.cleanup(lines)
 - # runetext = self.translate_to_gematria("\n".join(lines))
 - new_filename = "%s.gematria" % f.name.split("-")[0]
 - new_path = self.target / new_filename
 - print("%s ---> %s" % (f, new_path))
 - with new_path.open('w') as fd:
 - fd.write(runetext)
 - """
 - project gutenberg has annoying header and footer sections, they are
 - indicated by three stars at the beginning of a line. it is definitely
 - advised to clean the dump files up before processing them in any way; in
 - test runs, i forgot to remove them and some programs i ran calculated
 - part of the header text for assumed plaintext. this program strips those
 - headers plus a few lines that follow or preceed, depending on context.
 - sometimes you have annoying glossaries in the end or other unrelated stuff
 - right after the header that you dont care about.
 - my purposes included stripping all whitespaces before processing. for your
 - purposes, i'd advise commenting out the .replace(" ", "") lines in
 - `find_gutenberg_start_header` and `find_gutenberg_end_header` to preserve
 - whitespaces.
 - """
 - def cleanup(self, lines):
 - gsh = self.find_gutenberg_start_header(lines)
 - if gsh:
 - lines = lines[gsh+self.gutenberg_header_jump:]
 - esh = self.find_gutenberg_end_header(lines)
 - if esh:
 - lines = lines[:esh-60]
 - return lines
 - def find_gutenberg_start_header(self, lines):
 - for i, l in enumerate(lines):
 - tl = l.upper().replace(" ", "")
 - if tl.startswith('***STARTOFTH'):
 - return i
 - def find_gutenberg_end_header(self, lines):
 - for i, l in enumerate(lines):
 - tl = l.upper().replace(" ", "")
 - if tl.startswith('***ENDOFTH'):
 - return i
 - def translate_to_gematria(self, t):
 - res = ""
 - skip = 0
 - bigram = ['th', 'eo', 'ng', 'oe', 'ae', 'ia', 'io', 'ea']
 - t = t.upper()
 - ltg = self.latin_to_gematria
 - for i, val in enumerate(t):
 - if skip:
 - skip -= 1
 - continue
 - frag_short = t[i:i+2]
 - frag_long = t[i:i+3]
 - if frag_long == 'ING':
 - res += ltg[frag_long]
 - skip += 2
 - continue
 - elif frag_short in bigram:
 - res += ltg[frag_short]
 - skip += 1
 - continue
 - res += ltg.get(val, val)
 - return res
 - def main():
 - parser = argparse.ArgumentParser()
 - parser.add_argument(
 - "-s", "--source", required=True, type=Path,
 - help="Selects the folder to traverse"
 - )
 - parser.add_argument(
 - "-t", "--target", required=True, type=Path,
 - help="Selects folder to dump collected and converted files into"
 - )
 - parser.add_argument(
 - "-f", "--single", type=Path,
 - help="Select single file for analysis"
 - )
 - args = parser.parse_args()
 - WeedLMAO(args)
 - if __name__ == "__main__":
 - main()
 
Advertisement
 
                    Add Comment                
                
                        Please, Sign In to add comment