Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import os
- import re
- import sys
- import xml.etree.ElementTree as ElementTree
class XMLScrubber(object):
    """Mask the text of selected elements in an XML/HTML file.

    Every ASCII letter and digit in the text of each targeted element is
    replaced by ``MASKED_CHARACTER``; punctuation and whitespace are kept so
    the shape of the original value stays recognisable.
    """

    # Kept byte-identical for backward compatibility, but no longer used for
    # masking: ``$`` also matches just before a trailing newline, so applying
    # this pattern to a lone newline character produced an empty match and
    # injected a spurious mask character in front of it.
    ALPHANUMERIC_REGEX = '^[a-zA-Z0-9]*$'
    MASKED_CHARACTER = 'X'

    # Matches exactly one ASCII letter or digit; compiled once and reused.
    _ALPHANUMERIC_CHAR = re.compile(r'[a-zA-Z0-9]')

    @staticmethod
    def _mask(text):
        """Return ``text`` with every ASCII letter and digit masked."""
        return XMLScrubber._ALPHANUMERIC_CHAR.sub(
            XMLScrubber.MASKED_CHARACTER, text)

    @staticmethod
    def process(input_file_path, out_dir_path, element_xpaths):
        """Scrub one file and write the result into ``out_dir_path``.

        Args:
            input_file_path: Path of the XML file to parse.
            out_dir_path: Existing directory that receives the scrubbed copy;
                the copy keeps the base name of ``input_file_path``.
            element_xpaths: Iterable of ElementTree path expressions. For each
                one, the first matching element has its text masked.

        Expressions that match nothing, and elements without text, are
        skipped: there is nothing to mask in either case.

        Raises:
            xml.etree.ElementTree.ParseError: If the input is not well-formed.
            IOError / OSError: If the input cannot be read or the output
                cannot be written.
        """
        print('Scrubbing request file from {0}'.format(input_file_path))
        html_response_tree = ElementTree.parse(input_file_path)

        for element_xpath in element_xpaths:
            pii_element = html_response_tree.find(element_xpath)
            # find() returns None when nothing matches, and .text is None for
            # empty elements; neither leaves anything to scrub.
            if pii_element is None or not pii_element.text:
                continue
            pii_element.text = XMLScrubber._mask(pii_element.text)

        out_file_path = os.path.join(
            out_dir_path, os.path.basename(input_file_path))
        print('Outputting scrubbed file to {0}'.format(out_file_path))
        html_response_tree.write(out_file_path)
# TODO: need to pass XPath IDs as command line arguments
XPATHS = [".//*[@id=\"Initial_PROJECTED_TO_STATED_RATIO\"]"]


def main(argv):
    """Scrub every file with the requested extension in a directory.

    Args:
        argv: Command-line vector laid out like ``sys.argv``:
            ``[script, file_extension, input_directory, output_directory]``.
            ``file_extension`` is given without the leading dot. Arguments
            beyond the third are ignored.

    Exits with a usage message when fewer than three arguments are supplied.
    """
    if len(argv) < 4:
        script_name = os.path.basename(argv[0]) if argv else 'script'
        sys.exit(
            'Usage: {0} FILE_EXTENSION INPUT_DIRECTORY OUTPUT_DIRECTORY'.format(
                script_name))

    file_extension, input_directory, output_directory = argv[1:4]
    print('File extension: {0}'.format(file_extension))
    print('Input directory: {0}'.format(input_directory))
    print('Output directory: {0}'.format(output_directory))

    wanted_suffix = '.{0}'.format(file_extension)
    for file_name in os.listdir(input_directory):
        if not file_name.endswith(wanted_suffix):
            continue
        full_file_path = os.path.join(input_directory, file_name)
        print('Processing {0}'.format(full_file_path))
        XMLScrubber.process(full_file_path, output_directory, XPATHS)


# Guarded so that importing this module (e.g. to reuse XMLScrubber) does not
# start a batch run against whatever happens to be in sys.argv.
if __name__ == '__main__':
    main(sys.argv)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement