Advertisement
Guest User

Untitled

a guest
Apr 25th, 2017
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.48 KB | None | 0 0
  1. #####################################################################################
  2. #
  3. # Requirements: You'll need Python 3.5.1 or higher to run this
  4. #
  5. # This script will provide you with a basic understanding of the alphanumeric patterns
  6. # which exist in a list. You might get this list from a SQL query or something like
  7. # that.
  8. #
  9. # INPUT: Give this script a file that has a single column of ID type strings.
  10. # EXAMPLE (from command line):
  11. # > python patternEyes.py "c:\temp\id_list.txt"
  12. #
  13. # OUTPUT: Comes in the form of printed output that converts digits to "#" and alpha
  14. # characters to "X". All punctuation stays as it is.
  15. #
  16. # For example, if you want to see if all records are phone numbers, you might expect
  17. # to see something like this:
  18. # (###)-###-####
  19. # But if you also see something like this, you know the data isn't as "clean" as
  20. # you were hoping, requiring further investigation:
  21. # ##-XXX-######
  22. #
  23. #####################################################################################
  24.  
  25. import re, os.path, sys
  26. from collections import defaultdict
  27. from pathlib import Path
  28.  
  29.  
  30. def patternEyes( filePath = r'c:\temp\id_list.txt'):
  31. strings = []
  32. patterns = []
  33. input_file = filePath
  34.  
  35. if os.path.isfile( input_file ):
  36. cp = re.compile(r'[,]')
  37. np = re.compile(r'\d')
  38. ap = re.compile(r'[a-z]', re.IGNORECASE)
  39.  
  40. file = open(input_file, 'r')
  41. for line in file:
  42. strings.extend(line.strip('\n').split(','))
  43. file.close()
  44.  
  45. for string in strings:
  46. nm = np.sub('#', string)
  47. am = ap.sub('X', nm)
  48. patterns.append(am)
  49.  
  50. pattern_counts = defaultdict(int)
  51. for pattern in patterns:
  52. if pattern == '':
  53. pattern_counts['No Data'] += 1
  54. else:
  55. pattern_counts[pattern] += 1
  56.  
  57. pattern_rank = []
  58. for k in sorted(pattern_counts, key=pattern_counts.__getitem__, reverse=True):
  59. pattern_rank.append([k, pattern_counts[k]])
  60.  
  61. print("\nREPORT FOR: {}".format(Path(input_file).resolve()))
  62. print("\n{0:20} | {1:10}".format("PATTERN", "COUNT"))
  63. print("-"*30)
  64. for pattern, count in pattern_rank:
  65. print("{0:20} | {1:10}".format(pattern, str(count)))
  66. else:
  67. print( "\nSorry, there is no file here: {}".format(input_file))
  68.  
  69. def main( inputs ):
  70. if len( inputs ) = 2:
  71. patternEyes( inputs[1] )
  72. else:
  73. patternEyes()
  74.  
  75. if __name__ == "__main__": main( sys.argv )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement