Advertisement
say2joe

Bitwarden Duplicate Removal.py

Jan 27th, 2022
1,688
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python3
  2.  
  3. # dedup.py
  4. # Removes duplicates from Bitwarden export .csv
  5. # 2019-02-09 5erif
  6.  
  7. import sys
  8. import hashlib
  9. from urllib.parse import urlparse
  10.  
  11. # Field ordinals in Bitwarden CSV
  12. FOLDER   = 0
  13. FAVORITE = 1
  14. TYPE     = 2
  15. NAME     = 3
  16. NOTES    = 4
  17. FIELDS   = 5
  18. URI      = 6
  19. USERNAME = 7
  20. PASSWORD = 8
  21. TOTP     = 9
  22.  
  23. def main(argv):
  24.  
  25.     if len(argv) < 1:
  26.         print('Missing input file path')
  27.         sys.exit(1)
  28.        
  29.     in_file_path  = argv[0]
  30.     out_file_path = in_file_path[0:(len(in_file_path)-4)]+'_out.csv'
  31.     rem_file_path = in_file_path[0:(len(in_file_path)-4)]+'_rem.csv'
  32.     completed_lines_hash = set()
  33.     line_number   = -1
  34.     write_count   = 0
  35.     cache         = ''
  36.    
  37.     out_file = open(out_file_path, 'w', encoding = 'utf8')
  38.     rem_file = open(rem_file_path, 'w', encoding = 'utf8')
  39.     for line in open(in_file_path, 'r', encoding = 'utf8'):
  40.         line_number += 1
  41.         fields = line.split(',')
  42.         if len(fields) < 10:
  43.             # Add previous line if short
  44.             line = cache.strip('\n') + line
  45.             cache = line
  46.             fields = line.split(',')
  47.             if len(fields) > 9:
  48.                 print(f'Recovered with line {line_number}:\n{line}')
  49.                 cache = ''
  50.             else:
  51.                 print(f'Missing fields in line {line_number}:\n{line}')
  52.                 rem_file.write(line)
  53.                 continue
  54.         else:
  55.             cache = ''
  56.         if line_number != 0:
  57.             domain = urlparse(fields[URI]).netloc
  58.             if len(domain) > 0:
  59.                 fields[URI] = domain
  60.         token = fields[URI] + fields[USERNAME] + fields[PASSWORD]
  61.         hashValue = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest()
  62.         if hashValue not in completed_lines_hash:
  63.             out_file.write(line)
  64.             completed_lines_hash.add(hashValue)
  65.             write_count += 1
  66.         else:
  67.             rem_file.write(line)
  68.             # Uncomment for verbose mode
  69.             # print(f'Skipping duplicate on line {line_number}:\n{line}')
  70.     out_file.close()
  71.     rem_file.close()
  72.    
  73.     dup_count = line_number - write_count
  74.     print(f'\nOutput file: {out_file_path}\n{write_count} unique entries saved')
  75.     print(f'\n{dup_count} duplicates saved to {rem_file_path}')
  76.  
  77. if __name__ == "__main__":
  78.    main(sys.argv[1:])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement