Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #####################################################################################
- #
- # Requirements: You'll need Python 3.5.1 or higher to run this
- #
- # This script will provide you a basic understanding of the alphanumeric patterns
- # which exist in a list. You might get this list from a SQL query or something like
- # that.
- #
- # INPUT: Give this script a text file with one ID-type string per line. If a line
- # contains commas, each comma-separated value is counted as its own string.
- # EXAMPLE (from command line):
- # > python patternEyes.py "c:\temp\id_list.txt"
- #
- # OUTPUT: A printed report listing each pattern found and how many times it occurs.
- # Every digit is shown as "#" and every letter (a-z, any case) as "X". All
- # punctuation stays as it exists.
- #
- # For example, if you want to see if all records are phone numbers, you might expect
- # to see something like this:
- # (###)-###-####
- # But if you also see something like this, you know the data isn't as "clean" as
- # you were hoping, requiring further investigation:
- # ##-XXX-######
- #
- #####################################################################################
- import re, os.path, sys
- from collections import defaultdict
- from pathlib import Path
def _read_values(path):
    """Return every comma-separated value found in the text file at *path*."""
    values = []
    # The context manager closes the handle even if reading raises part-way.
    with open(path, 'r') as handle:
        for line in handle:
            # Only the newline is removed: other whitespace is part of the
            # value and therefore part of its pattern.
            values.extend(line.strip('\n').split(','))
    return values


def _rank_patterns(values):
    """Return (pattern, count) pairs for *values*, most frequent pattern first."""
    # Function-scope import: the file's top-level imports do not include Counter.
    from collections import Counter

    digit_re = re.compile(r'\d')
    letter_re = re.compile(r'[a-z]', re.IGNORECASE)
    counts = Counter()
    for value in values:
        pattern = letter_re.sub('X', digit_re.sub('#', value))
        # Empty values (blank lines, adjacent commas) are reported as 'No Data'.
        counts[pattern or 'No Data'] += 1
    # most_common() sorts by descending count; patterns with equal counts keep
    # the order in which they were first encountered.
    return counts.most_common()


def patternEyes(filePath=r'c:\temp\id_list.txt'):
    """Print a report of the alphanumeric patterns found in a file of ID strings.

    Each line of the file is split on commas.  In every resulting value each
    digit is replaced with "#" and each letter a-z (either case) with "X";
    all other characters are kept as they are.  The distinct patterns are
    then printed with their number of occurrences, most frequent first.

    Args:
        filePath: Path of the text file to analyse.  Defaults to
            c:\\temp\\id_list.txt.

    Returns:
        None.  The report is written to standard output.  If filePath is not
        an existing regular file, a "no file here" message is printed instead.
    """
    # Guard clause: nothing to analyse when the path is not a regular file.
    if not os.path.isfile(filePath):
        print("\nSorry, there is no file here: {}".format(filePath))
        return

    pattern_rank = _rank_patterns(_read_values(filePath))

    print("\nREPORT FOR: {}".format(Path(filePath).resolve()))
    print("\n{0:20} | {1:10}".format("PATTERN", "COUNT"))
    print("-" * 30)
    for pattern, count in pattern_rank:
        # str(count) keeps the count left-aligned within its column.
        print("{0:20} | {1:10}".format(pattern, str(count)))
def main(inputs):
    """Run patternEyes on the file named in an argv-style argument list.

    Args:
        inputs: Argument list as passed from sys.argv, where inputs[0] is the
            script name.  When it holds exactly one further item, that item
            is used as the file path; otherwise patternEyes falls back to its
            own default path.
    """
    # '==' (comparison), not '=' (assignment): the original was a SyntaxError.
    if len(inputs) == 2:
        patternEyes(inputs[1])
    else:
        patternEyes()
# Run the report only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement