Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import hashlib
# Console display tuning: let wide frames print in full instead of being elided.
pd.options.display.max_columns = 1000
pd.options.display.width = 1000
pd.options.display.max_colwidth = 128

# Columns holding personal data. The order matters: the PII hashing helper
# reads them positionally as first name, last name, date of birth, postcode.
_SENSITIVE_COL_NAMES = [
    'FIRST',
    'LAST',
    'DOB',
    'POSTCODE',
]
def __hash_sensitive_data(first, last, dob, post_code):
    """Return the hex digest for one person's identifying fields.

    The four values are first combined into a single string by
    ``__prep_hash`` and that string is then passed to
    ``__hash_generic_string``.

    Args:
        first: First name.
        last: Last name.
        dob: Date of birth, used verbatim in the combined string.
        post_code: Postcode string.

    Returns:
        The hexadecimal digest string produced by ``__hash_generic_string``.
    """
    return __hash_generic_string(__prep_hash(dob, first, last, post_code))
def __hash_generic_string(user_uuid, salt='SALT_VALUE'):
    """Return the salted SHA-256 hex digest of a string.

    The salt bytes are fed to the hash first, followed by the value, both
    encoded as UTF-8.

    Args:
        user_uuid: The string to hash.
        salt: String mixed in ahead of the value. Defaults to the value the
            script has always used, so existing digests are unchanged;
            callers can pass a different salt without editing this module.

    Returns:
        A 64-character lowercase hexadecimal digest.
    """
    digest = hashlib.sha256(salt.encode('utf-8'))
    digest.update(user_uuid.encode('utf-8'))
    return digest.hexdigest()
def __prep_hash(dob, first, last, post_code):
    """Build the ``first::last::dob::postcode`` string that gets hashed.

    Both names are passed through ``__fix_char_length`` and every space
    character is removed from the postcode; the date of birth is inserted
    unchanged.

    Note the parameter order (date of birth first) differs from the order of
    the fields in the resulting string.

    Returns:
        The four parts joined with ``::`` separators.
    """
    return "{}::{}::{}::{}".format(
        __fix_char_length(first),
        __fix_char_length(last),
        dob,
        post_code.replace(" ", ""),
    )
def __fix_char_length(generic_str, configured_length=5):
    """Return ``generic_str`` cut down to at most ``configured_length`` characters.

    Strings that are already short enough are returned unchanged.

    Args:
        generic_str: The string to truncate.
        configured_length: Maximum number of leading characters to keep.
            Defaults to 5, the length this script has always used.

    Returns:
        The leading ``configured_length`` characters of ``generic_str``.

    Raises:
        ValueError: If ``configured_length`` is negative (a negative slice
            bound would silently trim from the end instead).
    """
    if configured_length < 0:
        raise ValueError('configured_length must be non-negative')
    # Slicing already copes with inputs shorter than the limit, so no
    # explicit length comparison is needed.
    return generic_str[:configured_length]
def __do_pii_hash(df_row):
    """Compute the PII hash for one row.

    Reads the first four columns named in ``_SENSITIVE_COL_NAMES`` from
    ``df_row``, upper-cases each value, and forwards them in that order
    (first, last, date of birth, postcode) to ``__hash_sensitive_data``.

    Args:
        df_row: Mapping-like row whose sensitive values expose ``.upper()``.

    Returns:
        The digest string returned by ``__hash_sensitive_data``.
    """
    upper_values = (
        df_row[_SENSITIVE_COL_NAMES[position]].upper() for position in range(4)
    )
    return __hash_sensitive_data(*upper_values)
def __do_uuid_hash(df_row):
    """Compute the hash of one row's upper-cased ``USER_ID`` value.

    Args:
        df_row: Mapping-like row with a ``USER_ID`` entry exposing ``.upper()``.

    Returns:
        The digest string returned by ``__hash_generic_string``.
    """
    upper_user_id = df_row['USER_ID'].upper()
    return __hash_generic_string(upper_user_id)
def main() -> None:
    """Hash the identifiers in the sensitive CSV and write two stripped copies.

    Steps:
      1. Load ``retro/resources/SENSITIVE-OUTPUT-FULL.csv``.
      2. Add ``HASHED_UUID`` and ``HASHED_PII`` columns, computed row by row.
      3. Drop the columns listed in ``_SENSITIVE_COL_NAMES``, print a preview,
         and write the result (still containing ``USER_ID``).
      4. Additionally drop ``USER_ID`` and write that as a second file.
    """
    source_path = 'retro/resources/SENSITIVE-OUTPUT-FULL.csv'
    frame = pd.read_csv(source_path)

    # The UUID hash is added before the PII hash, which fixes the order of the
    # two new columns in the output files.
    frame['HASHED_UUID'] = frame.apply(__do_uuid_hash, axis=1)
    frame['HASHED_PII'] = frame.apply(__do_pii_hash, axis=1)

    without_pii = frame.drop(columns=_SENSITIVE_COL_NAMES).copy(deep=True)
    print(without_pii.head())
    without_pii.to_csv('retro/resources/AVIVA-RETRO-10K-CLEARSCORE-VERSION.csv', index=False)

    without_user_id = without_pii.drop(columns='USER_ID').copy(deep=True)
    without_user_id.to_csv('retro/resources/AVIVA-RETRO-10K-AVIVA-VERSION.csv', index=False)
if __name__ == '__main__':
    # Run the export only when executed as a script, so that importing this
    # module (e.g. to reuse the hashing helpers) does not read or write files.
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement