Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pyarrow as pa
- import os
- from csv import writer
- from sys import argv
- import numpy as np
def get_data(filepath):
    """Deserialize and return the object stored in ``filepath``.

    The file is opened as a read-only pyarrow memory map, read from its
    first byte into a single buffer, and that buffer is handed to
    ``pa.deserialize``.

    Args:
        filepath: Path of the serialized file to load.

    Returns:
        Whatever ``pa.deserialize`` produces for the file's contents
        (callers in this file treat it as an array with a ``shape``).
    """
    # NOTE(review): pyarrow.deserialize is deprecated in newer pyarrow
    # releases -- confirm the installed version still provides it.
    with pa.memory_map(filepath, 'rb') as source:
        source.seek(0)
        return pa.deserialize(source.read_buffer())
def write_csv_iter(data, destination_path, file):
    """Write the non-zero elements of ``data`` to a CSV file, one per row.

    The array is walked element by element with ``np.nditer``. Every
    element that compares unequal to zero produces one row made of its
    index along each axis followed by its value (``row, col, value`` for a
    2-D array). An array that has elements but no non-zero ones still
    produces an (empty) file.

    Args:
        data: NumPy array to export.
        destination_path: Path of the CSV file to create; an existing file
            is overwritten.
        file: Source name, used only in the "skipping" message.
    """
    # ``size`` is the real element count. Summing the shape would report an
    # array such as (0, 5) as non-empty, and np.nditer refuses zero-sized
    # operands.
    if data.size == 0:
        print("Skipping {} as it is empty".format(file))
        return

    it = np.nditer(data, flags=['multi_index'])
    # newline='' lets the csv module manage line endings itself; without it
    # platforms that translate '\n' end up with blank lines between rows.
    with open(destination_path, 'w', newline='') as writefile:
        writ = writer(writefile)
        while not it.finished:
            if it[0] != 0:
                writ.writerow(list(it.multi_index) + [it[0]])
            it.iternext()
def write_csv_nonzero(data, destination_path, file):
    """Write the non-zero elements of ``data`` to a CSV file, one per row.

    The positions of the non-zero elements are located with ``np.nonzero``
    and each one becomes a row made of its index along every axis followed
    by its value (``row, col, value`` for a 2-D array). When there are no
    non-zero elements no file is created and a message is printed instead.

    Args:
        data: NumPy array to export.
        destination_path: Path of the CSV file to create; an existing file
            is overwritten.
        file: Source name, used only in the "skipping" message.
    """
    indices = np.nonzero(data)
    # The index arrays already say how many non-zero elements exist, so a
    # second scan of the array with np.count_nonzero is unnecessary.
    if len(indices[0]) == 0:
        print("Skipping {} as it is empty".format(file))
        return

    # Indexing with the tuple returned by np.nonzero gathers all matching
    # values in one step, in the same order as the index arrays.
    values = data[indices]
    # newline='' lets the csv module manage line endings itself; without it
    # platforms that translate '\n' end up with blank lines between rows.
    with open(destination_path, 'w', newline='') as writefile:
        writer(writefile).writerows(zip(*indices, values))
def convert_pyarrow_to_csv(path):
    """Convert every ``output_*`` file in ``path`` to CSV.

    Results are written to ``<path>/converted/<stem>.csv``, where ``<stem>``
    is the source file name without its extension. Files whose CSV already
    exists are skipped, so the function can be re-run to pick up new files.

    Args:
        path: Directory containing the serialized ``output_*`` files.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    converted_dir = os.path.join(path, 'converted')
    # Attempt the creation and tolerate "already there" rather than testing
    # for existence first, which leaves a window between check and mkdir.
    try:
        os.mkdir(converted_dir)
    except FileExistsError:
        pass

    # Sorted so that the processing order (and the log) is reproducible.
    for name in sorted(os.listdir(path)):
        source_path = os.path.join(path, name)
        # Only regular files with the expected prefix are inputs; a
        # directory that happens to match the prefix cannot be loaded.
        if not name.startswith('output_') or not os.path.isfile(source_path):
            continue

        destination_path = os.path.join(
            converted_dir, os.path.splitext(name)[0] + '.csv')
        if os.path.exists(destination_path):
            continue

        print("Converting {}".format(name))
        data = get_data(source_path)
        print("Array {} has dimensions {}".format(name, data.shape))
        write_csv_nonzero(data, destination_path, name)
if __name__ == '__main__':
    # Validate the command line explicitly: an ``assert`` is removed when
    # Python runs with -O, which would let a bad invocation fall through to
    # an IndexError. SystemExit prints the usage text and exits non-zero.
    if len(argv) != 2:
        raise SystemExit("Usage: python process.py <path_to_results_dir>")
    convert_pyarrow_to_csv(argv[1])
Add Comment
Please, Sign In to add comment