Advertisement
Guest User

Untitled

a guest
Jul 22nd, 2017
61
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.03 KB | None | 0 0
  1. # Author: Cezary Zukowski (cezary.zukowski@gmail.com)
  2. import csv
  3. import os
  4. import sys
  5.  
  6. # List of input filenames
  7. file_names = ['BIP-URLs-by-province-district-city.csv', 'Homepage-URLs-by-province-district-city.csv']
  8. # file_names = ['test.csv'];
  9.  
  10. # Output filename
  11. unique_urls_file_name = 'unique_urls.csv'
  12.  
  13. # CONFIGURATION:
  14. input_path = '../sample_input_files/'
  15. output_path = '../output_files/'
  16. # path = ''
  17. encoding = 'utf-8'
  18. delimiters = ';'
  19. # Create 'xyz' dialect with:
  20. # * ';' as a delimiter
  21. # * '"' as a quote character
  22. csv.register_dialect('xyz', delimiter=';', quotechar='"')
  23.  
  24. # Global variables
  25. T = set()
  26. unique_url_list = []
  27. duplicates = 0
  28.  
  29. for fn in file_names:
  30.     # open() implements __exit__ which closes a file automatically,
  31.     # therefore, calling .close() on a file would be unimportant/redundant/unnecessary/pointless.
  32.     full_path = input_path + fn
  33.     try:
  34.         with open(full_path, encoding=encoding) as csv_file:
  35.             dreader = csv.DictReader(csv_file, dialect='xyz')
  36.             for row in dreader:
  37.                 # print(row['URL'])
  38.                 if row['URL'] in T:
  39.                     duplicates += 1
  40.                     print("Duplicated URL: {0}".format(row['URL']) + os.linesep)
  41.                 else:
  42.                     T.add(row['URL'])
  43.                     unique_url_list.append(row)
  44.  
  45.     except FileNotFoundError as err:
  46.         print(("Could not find file: '{0}'. Check whether the path is correct. " + os.linesep +
  47.                "Full error message: \"{1}\"." + os.linesep +
  48.                "err.args: \"{2}\"" + os.linesep +
  49.                "err.errno: \"{3}\"" + os.linesep +
  50.                "err.filename: '{4}'" + os.linesep +
  51.                "err.strerror: '{5}'" + os.linesep
  52.                ).format(full_path,
  53.                         err,
  54.                         err.errno,
  55.                         err.args,
  56.                         err.filename,
  57.                         err.strerror)
  58.               )
  59.     except:
  60.         # What do `sys.exec_info()` and `sys.exec_info()[0]` return?
  61.         print("Unexpected error: ", sys.exc_info()[0])
  62.         raise
  63.  
  64. print("Total number of unique URLs: {}".format(len(unique_url_list)))
  65. print("Total number of duplicated URLs: {}".format(duplicates))
  66.  
  67. # Write the entire list of the unique URLs to a CSV file.
  68. full_path = output_path + unique_urls_file_name
  69. # for field in dreader.fieldnames:
  70. #     print("Field name: "+field+os.linesep)
  71.  
  72. # print("Type of dreader.fieldnames: "+str(type(dreader.fieldnames)))
  73.  
  74. #print(dreader.fieldnames)
  75. #print(unique_url_list[0])
  76.  
  77. # newline has to be set to '', so that formatting is correct
  78. with open(full_path, 'w', newline='', encoding=encoding) as fout:
  79.     # dwriter = csv.DictWriter(fout, dialect='xyz', fieldnames=dreader.fieldnames)
  80.     # dwriter = csv.DictWriter(fout, dialect='xyz', fieldnames=dreader.fieldnames)
  81.     dwriter = csv.DictWriter(fout, dialect='xyz', fieldnames=dreader.fieldnames,)
  82.     dwriter.writeheader()
  83.     for row in unique_url_list:
  84.         dwriter.writerow(row)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement