Advertisement
Madmouse

generate a simplified set of tokens from arbitrary data

Jul 9th, 2016
218
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.71 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. from __future__ import print_function
  4.  
  5. import re, sys, string
  6.  
  7. files = list(sys.argv)
  8. files.remove(sys.argv[0])
  9.  
  10. if not files:
  11.     print("Usage: %s <list of files>" % (sys.argv[0]))
  12.     exit(-1)
  13.  
  14. def is_number(c):
  15.     try:
  16.         int(c)
  17.         return True
  18.     except ValueError:
  19.         return False
  20.  
  21. def is_alpha(c):
  22.     return c in string.letters
  23.  
  24. def is_upper(c):
  25.     return c in string.uppercase
  26.  
  27. def is_white(c):
  28.     return c in string.whitespace
  29.  
  30.  
  31. def string_tokens(l):
  32.     r = list()
  33.     for s in l:
  34.         token = str()
  35.         for c in s:
  36.             token += "N" if is_number(c) else ("A" if is_upper(c) else "a") if is_alpha(c) else "S" if is_white(c) else "-"
  37.         r.append(token)
  38.     return list(set(r))
  39.  
  40. def group_tokens(l):
  41.     r = list()
  42.     for s in l:
  43.         i = 0
  44.         x = 0
  45.         o = s[0]
  46.         t = list()
  47.         for c in s:
  48.             if c != o:
  49.                 t.append(s[x : i])
  50.                 x = i
  51.             i += 1
  52.             o = c
  53.         if x != i:
  54.             t.append(s[x:])
  55.         r.append(t)
  56.     return r
  57.  
  58.  
  59. if __name__ == "__main__":
  60.     for cf in files:
  61.         result = str()
  62.         with open(cf) as f:
  63.             l = filter(None, f.read().splitlines())
  64.             pass_1 = string_tokens(l)
  65.             pass_2 = group_tokens(pass_1)
  66.            
  67.             pass_1.sort(key = lambda e: len(e))
  68.             pass_2.sort(key = lambda e: len(e))
  69.            
  70.             print("Pass_1 = ")
  71.             for e in pass_1:
  72.                 print("\t%s" % (e))
  73.            
  74.             print("Pass_2 = ")
  75.             for e in pass_2:
  76.                 print("\t%s" % (e))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement