daily pastebin goal
8%
SHARE
TWEET

Untitled

a guest Mar 22nd, 2019 51 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
import argparse
import bz2
import csv
import itertools
import multiprocessing as mp
import operator
import os
import shutil

import numpy as np
import pandas as pd

  12. # initiate argument parser and args
  13. parser = argparse.ArgumentParser()
  14. parser.add_argument("-i", "--input", help="specify input filename",
  15.         type=str, required=True)
  16. parser.add_argument("-o", "--output", help="specify output filename",
  17.         type=str, required=True)
  18. parser.add_argument("-t", "--type",
  19.         help="specify the format to convert to", type=str,
  20.         required=True, default="wide", choices=["wide", "long"])
  21. parser.add_argument("-d", "--dir", type=str, default="matrixconv_temp",
  22.         help="temp directory for intermediate files")
  23. parser.add_argument("-c", "--cores", type=int, default=4,
  24.         help="number of cores to pivot with")
  25.  
  26. # save args as vars
  27. args = parser.parse_args()
  28. input_fname = args.input
  29. output_fname = args.output
  30. mat_type = args.type
  31. temp_dir = os.path.join(args.dir, "")
  32. n_cores = args.cores
  33.  
  34. # long to wide conversion
  35. if mat_type == "wide":
  36.     dtypes = {0:np.int, 1:np.int, 2:np.float64}
  37.     col_names = ["origin", "destination", "agg_cost"]
  38.  
  39.     # Mini function to pivot each origin csv to wide
  40.     def convert_row(file):
  41.         pd.read_csv(file, dtype=dtypes, names=col_names) \
  42.         .pivot(index="origin", columns="destination", values="agg_cost") \
  43.         .reindex(columns=destinations) \
  44.         .to_csv(file)
  45.  
  46.     # Create a new temporary directory to store intermediate files
  47.     try:
  48.         os.makedirs(temp_dir)
  49.     except FileExistsError:
  50.         pass
  51.  
  52.     # Extract a csv for each unique origin id
  53.     print("Parsing individual origin files...")
  54.     for key, rows in itertools.groupby(csv.reader(bz2.open(input_fname, "rt")),
  55.             operator.itemgetter(0)):
  56.         with open(temp_dir + "%s.csv" % key, "w") as output:
  57.             for row in rows:
  58.                 output.write(",".join(row[0:3]) + "\n")
  59.  
  60.     # Get a list of all files created minus header file
  61.     if os.path.isfile(temp_dir + "origin.csv"):
  62.         os.remove(temp_dir + "origin.csv")
  63.         files = [os.path.join(temp_dir, f) for f in os.listdir(temp_dir) if
  64.                 os.path.isfile(os.path.join(temp_dir, f)) and f != "origin.csv"]
  65.  
  66.     # Use pandas to load a list of all unique destinations from the input file
  67.     print("Getting list of unique destinations...")
  68.     destinations = pd.read_csv(
  69.             input_fname,
  70.             usecols=["destination"],
  71.             dtype=dtypes,
  72.             squeeze=True).unique()
  73.  
  74.       # Execute pivoting of all origins in parallel
  75.     print("Converting origin files from long to wide...")
  76.     pool = mp.Pool(n_cores)
  77.     results = pool.map(convert_row, files)
  78.  
  79.     # Combine all of the pivoted files into one output file
  80.     print("Concatenating pivoted origin files...")
  81.     with open(output_fname, 'wb') as outfile:
  82.         for i, file in enumerate(files):
  83.             with open(file, 'rb') as infile:
  84.                 if i != 0:
  85.                     infile.readline()
  86.                 shutil.copyfileobj(infile, outfile)
  87.  
  88.     # Delete temp directory and files
  89.     try:
  90.         shutil.rmtree(temp_dir)
  91.     except:
  92.         pass
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top