# Untitled script: merges the configured CSV files and writes one row per unique URL.

# Author: Cezary Zukowski (cezary.zukowski@gmail.com)
import csv
import os
import sys

# Input CSV files; each name is appended to `input_path` when it is opened.
# (For a quick experiment, swap in a single file, e.g. ['test.csv'].)
file_names = [
    'BIP-URLs-by-province-district-city.csv',
    'Homepage-URLs-by-province-district-city.csv',
]

# Name of the CSV file that receives the de-duplicated rows
# (created inside `output_path`).
unique_urls_file_name = 'unique_urls.csv'

# CONFIGURATION
# Both directories are relative paths; '' can be used for the current directory.
input_path = '../sample_input_files/'
output_path = '../output_files/'
encoding = 'utf-8'
delimiters = ';'

# The 'xyz' dialect describes the layout of the CSV files:
# * fields are separated by `delimiters` (';')
# * fields are quoted with '"'
csv.register_dialect('xyz', delimiter=delimiters, quotechar='"')

# Shared state that is filled in while the input files are scanned.
T = set()              # every URL value encountered so far
unique_url_list = []   # the first row seen for each distinct URL
duplicates = 0         # rows skipped because their URL was already in T

# Columns of the output file: the union of every input header, kept in the
# order in which the column names were first seen. Collecting them here
# (instead of reusing the last reader's header) keeps the writer working when
# the input files do not share identical columns or when a file is missing.
output_fieldnames = []

for fn in file_names:
    # os.path.join copes with `input_path` being written with or without a
    # trailing separator, and with it being empty.
    full_path = os.path.join(input_path, fn)
    try:
        # newline='' is what the csv module expects for files it parses, so
        # that line breaks embedded in quoted fields reach the parser intact.
        # The `with` statement closes the file; no explicit .close() needed.
        with open(full_path, newline='', encoding=encoding) as csv_file:
            dreader = csv.DictReader(csv_file, dialect='xyz')

            # Reading .fieldnames consumes the header row; it is None when
            # the file is empty.
            for name in dreader.fieldnames or []:
                if name not in output_fieldnames:
                    output_fieldnames.append(name)

            for row in dreader:
                url = row['URL']
                if url in T:
                    duplicates += 1
                    # print() already ends the line; the extra "\n" leaves a
                    # blank line after each report. "\n" (not os.linesep) is
                    # the right terminator for a text stream on any platform.
                    print("Duplicated URL: {0}".format(url) + "\n")
                else:
                    T.add(url)
                    unique_url_list.append(row)

    except FileNotFoundError as err:
        # A missing input file is reported and skipped; the remaining files
        # are still processed. The arguments are listed in the same order as
        # the labels in the message.
        print(("Could not find file: '{0}'. Check whether the path is correct. " + "\n" +
               "Full error message: \"{1}\"." + "\n" +
               "err.args: \"{2}\"" + "\n" +
               "err.errno: \"{3}\"" + "\n" +
               "err.filename: '{4}'" + "\n" +
               "err.strerror: '{5}'" + "\n"
               ).format(full_path,
                        err,
                        err.args,
                        err.errno,
                        err.filename,
                        err.strerror)
              )
    except Exception as err:
        # Anything else (e.g. a KeyError when a file has no 'URL' column) is
        # announced with its exception class and then propagated unchanged.
        print("Unexpected error: ", type(err))
        raise

print("Total number of unique URLs: {}".format(len(unique_url_list)))
print("Total number of duplicated URLs: {}".format(duplicates))

# Write the entire list of the unique URLs to a CSV file.
full_path = os.path.join(output_path, unique_urls_file_name)

if not output_fieldnames:
    # No header was read from any input file (all missing or empty), so there
    # is nothing meaningful to write.
    print("No input header could be read; skipping output file '{0}'.".format(full_path))
else:
    # Create the output directory on first use; '' means the current directory.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # newline has to be set to '', so that the csv writer controls the line
    # endings and the formatting is correct.
    with open(full_path, 'w', newline='', encoding=encoding) as fout:
        dwriter = csv.DictWriter(fout, dialect='xyz', fieldnames=output_fieldnames)
        dwriter.writeheader()
        # Columns a row does not have are written as empty fields.
        dwriter.writerows(unique_url_list)