Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- import sys
- import numpy as np
- import scipy.signal
- import glob
- import matplotlib.pyplot as plt
- import os
- import random
- import sys
- from pathlib import Path
- import NumpyAvro as npAvro
- from sklearn.utils import shuffle
# --- Dataset discovery and loading ------------------------------------------
# Fix: `seed` was referenced below but never defined (NameError at runtime).
# Define it once so the shuffled file order is reproducible across runs.
seed = 42

data_set_folder = "/data/integrals/realize-job-809888bc-2281-49e0-b65a-a3b23d9942a/"

# Deterministically shuffled listing of all dataset shards in the job folder.
data_set_files = shuffle(glob.glob(data_set_folder + "*.ds"), random_state=seed)

# If data is still downloading use this to filter files that are not completely fetched
expected_file_size = 7372941
print(len(data_set_files))
#data_set_files = list(filter(lambda f: os.path.getsize(f) == expected_file_size, data_set_files))
print(len(data_set_files))

# (The original re-assigned data_set_folder to the identical string here; the
# dead duplicate assignment has been removed.)
#exp_data_folder = "/data/integrals/expdata/expdata/HIGHEST_PEAK_OUTSIDE_ADDED_IMP_REGIONS/expdata_16k/"
#exp_data_folder = "/data/integrals/expdata/expdata/HIGHEST_PEAK_OUTSIDE_PREDEFINED_IMP_REGIONS/expdata_16k/"

# Second, *sorted* listing of the same folder, used for stable indexed access.
rlz_data_files = glob.glob(data_set_folder + "*.ds")
rlz_data_files.sort()
print(len(rlz_data_files))
for file in rlz_data_files:
    print(file)

# Pick the first realization file; uncomment the next line to pick at random.
file_idx = 0
#file_idx = random.randint(0, len(data_set_files)-1)
rlz_data_file = rlz_data_files[file_idx]

# Load the selected shard via the project-local Avro reader.
# Assumes data_set is an indexable pair of numpy arrays — TODO confirm against
# NumpyAvro.read_data_set.
data_set = npAvro.read_data_set(rlz_data_file)
#keys = open("/data/integrals/expdata/expdata/HIGHEST_PEAK/expdata_keys/00.txt", "r").read().splitlines()
#print(keys[1])
print(file_idx, rlz_data_file)
# Fix: this was a bare expression (notebook display idiom) that is a no-op in
# a plain script; print it so the sanity-check output is actually visible.
print(type(data_set[0]), data_set[0].shape, type(data_set[1]), data_set[1].shape)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement