Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Author: Cezary Zukowski (cezary.zukowski@gmail.com)
#
# Merge the rows of several CSV files into a single CSV file, keeping only
# the first row encountered for each distinct value of the 'URL' column.
# Progress and totals are reported on standard output.

import csv
import os
import sys

# List of input filenames
file_names = ['BIP-URLs-by-province-district-city.csv', 'Homepage-URLs-by-province-district-city.csv']

# Output filename
unique_urls_file_name = 'unique_urls.csv'

# CONFIGURATION:
input_path = '../sample_input_files/'
output_path = '../output_files/'
encoding = 'utf-8'
delimiters = ';'

# Create 'xyz' dialect with:
# * the configured delimiter (';')
# * '"' as a quote character
csv.register_dialect('xyz', delimiter=delimiters, quotechar='"')

# Global variables
T = set()                # every URL value seen so far, for O(1) duplicate checks
unique_url_list = []     # first row seen for each URL, in input order
duplicates = 0           # number of rows skipped because their URL was already seen
output_fieldnames = []   # ordered union of the header columns of all files read

for fn in file_names:
    full_path = os.path.join(input_path, fn)
    try:
        # The `with` statement closes the file automatically.
        # newline='' is what the csv module asks for, so that newlines
        # embedded in quoted fields are interpreted correctly.
        with open(full_path, newline='', encoding=encoding) as csv_file:
            dreader = csv.DictReader(csv_file, dialect='xyz')
            # Remember every column name, so the output header covers all
            # input files and not only the last one that was opened.
            # `fieldnames` is None for a file without a header row.
            for name in dreader.fieldnames or []:
                if name not in output_fieldnames:
                    output_fieldnames.append(name)
            for row in dreader:
                url = row['URL']
                if url in T:
                    duplicates += 1
                    # '\n' (not os.linesep): stdout is a text stream and
                    # translates the newline itself.
                    print("Duplicated URL: {0}\n".format(url))
                else:
                    T.add(url)
                    unique_url_list.append(row)
    except FileNotFoundError as err:
        # A missing input file is reported and skipped; the remaining
        # files are still processed.
        print(("Could not find file: '{0}'. Check whether the path is correct. \n"
               "Full error message: \"{1}\".\n"
               "err.args: \"{2}\"\n"
               "err.errno: \"{3}\"\n"
               "err.filename: '{4}'\n"
               "err.strerror: '{5}'\n"
               ).format(full_path,
                        err,
                        err.args,
                        err.errno,
                        err.filename,
                        err.strerror)
              )
    except Exception:
        # sys.exc_info()[0] is the class of the exception being handled.
        # Report it, then let the error propagate unchanged.
        print("Unexpected error: ", sys.exc_info()[0])
        raise

print("Total number of unique URLs: {}".format(len(unique_url_list)))
print("Total number of duplicated URLs: {}".format(duplicates))

# Write the entire list of the unique URLs to a CSV file.
full_path = os.path.join(output_path, unique_urls_file_name)
if output_fieldnames:
    # newline has to be set to '', so that formatting is correct
    with open(full_path, 'w', newline='', encoding=encoding) as fout:
        dwriter = csv.DictWriter(fout, dialect='xyz', fieldnames=output_fieldnames)
        dwriter.writeheader()
        # Columns that a row's source file did not have are written as
        # empty fields (DictWriter's default `restval`).
        dwriter.writerows(unique_url_list)
else:
    # No header was read from any input file, so there is nothing to write.
    print("No input header could be read; '{0}' was not written.".format(full_path))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement