View difference between Paste ID: rGpLdGhE and 7HVQF9TF
SHOW: | | - or go back to the newest paste.
1
#!/usr/bin/env python3
2
3
# dedup.py
4
# Removes duplicates from Bitwarden export .csv
5
# 2019-02-09 5erif
6
7
import sys
8
import hashlib
9
from urllib.parse import urlparse
10
11
# Zero-based column positions of the fields in a Bitwarden CSV row
(
    FOLDER,
    FAVORITE,
    TYPE,
    NAME,
    NOTES,
    FIELDS,
    URI,
    USERNAME,
    PASSWORD,
    TOTP,
) = range(10)
22
23
def main(argv):
    """Remove duplicate entries from a Bitwarden CSV export.

    Reads the CSV file named by ``argv[0]`` and writes two files next to it:
    ``<name>_out.csv`` receives the header row plus the first occurrence of
    every entry, and ``<name>_rem.csv`` receives every row that was dropped
    (duplicates and rows with too few columns). Two rows are duplicates when
    they share the same URI host (or the raw URI text when it has no host
    part), the same username and the same password.

    Args:
        argv: Command-line arguments without the program name; ``argv[0]``
            is the path of the export to process.

    Prints 'Missing input file path' and exits with status 1 when ``argv``
    is empty.
    """
    # Imported locally so the block brings its own new dependencies into
    # scope without touching the module-level import list.
    import csv
    import os

    if len(argv) < 1:
        print('Missing input file path')
        sys.exit(1)

    in_file_path = argv[0]
    # splitext drops whatever extension is present instead of blindly
    # chopping the last four characters of the path.
    base_path = os.path.splitext(in_file_path)[0]
    out_file_path = base_path + '_out.csv'
    rem_file_path = base_path + '_rem.csv'

    seen_keys = set()
    write_count = 0   # unique entries written (header not counted)
    dup_count = 0     # duplicate entries moved to the removed file
    bad_count = 0     # rows with too few columns moved to the removed file

    # newline='' hands line-ending handling to the csv module, which is what
    # lets quoted fields containing commas or line breaks parse as one row.
    with open(in_file_path, 'r', encoding='utf8', newline='') as in_file, \
            open(out_file_path, 'w', encoding='utf8', newline='') as out_file, \
            open(rem_file_path, 'w', encoding='utf8', newline='') as rem_file:
        reader = csv.reader(in_file)
        # Rows end with the platform line separator, as text-mode writes do.
        out_writer = csv.writer(out_file, lineterminator=os.linesep)
        rem_writer = csv.writer(rem_file, lineterminator=os.linesep)

        # The first row is the column header: always kept, never compared.
        header = next(reader, None)
        if header is not None:
            out_writer.writerow(header)

        for row in reader:
            if not row:
                # Blank line between records; nothing to keep or report.
                continue

            if len(row) <= TOTP:
                row_text = ','.join(row)
                print(f'Missing fields in line {reader.line_num}:\n{row_text}')
                rem_writer.writerow(row)
                bad_count += 1
                continue

            uri = row[URI]
            try:
                host = urlparse(uri).netloc
            except ValueError:
                # Malformed URL: fall back to comparing the raw text.
                host = ''

            # A tuple key keeps the three parts separate, so differently
            # split values cannot collide the way a concatenation can.
            key = (host or uri, row[USERNAME], row[PASSWORD])
            if key in seen_keys:
                rem_writer.writerow(row)
                dup_count += 1
            else:
                seen_keys.add(key)
                # The row is written unmodified, with its full URI.
                out_writer.writerow(row)
                write_count += 1

    print(f'\nOutput file: {out_file_path}\n{write_count} unique entries saved')
    print(f'\n{dup_count} duplicates saved to {rem_file_path}')
    if bad_count:
        print(f'{bad_count} rows with missing fields saved to {rem_file_path}')
77
# Run as a script: pass the command-line arguments (program name dropped)
# to main().
if __name__ == '__main__':
    main(sys.argv[1:])