Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import time
- import numpy as np
- from dataclasses import dataclass
@dataclass
class RandIter:
    """Re-iterable collection of pseudo-random integers sampled once up front.

    On construction, ``size`` integers are drawn from a NumPy generator seeded
    with ``seed`` (``low`` inclusive, ``high`` exclusive by NumPy's default)
    and stored as a plain Python list. Every iteration replays that same list.
    """

    low: int  # lower bound passed to Generator.integers
    high: int  # upper bound passed to Generator.integers
    size: int  # number of values to draw; also what len() reports
    seed: int  # seed for np.random.default_rng

    def __post_init__(self):
        # Sample eagerly so repeated iteration yields identical values.
        generator = np.random.default_rng(self.seed)
        draws = generator.integers(
            low=self.low,
            high=self.high,
            size=self.size,
        )
        self._sampled_values = draws.tolist()

    def __iter__(self):
        """Return a fresh iterator over the pre-sampled values."""
        return iter(self._sampled_values)

    def __len__(self):
        """Return the requested sample count."""
        return self.size
def bench(bc, keys, n):
    """Print the average time taken by ``bc[key]`` for every key in ``keys``.

    Args:
        bc: Any object that supports ``bc[key]`` for the given keys.
        keys: Iterable of keys; each one is looked up ``n`` times in a row.
        n: Number of repetitions per key. Must be at least 1.

    Raises:
        ValueError: If ``n`` is smaller than 1, since an average over zero
            (or a negative number of) lookups is undefined.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    for key in keys:
        # perf_counter() is monotonic and uses the highest-resolution clock
        # available; time.time() can jump on system clock adjustments and is
        # too coarse on some platforms for sub-millisecond measurements.
        start = time.perf_counter()
        for _ in range(n):
            bc[key]
        # Stop the clock before formatting so str(key) is not timed.
        elapsed = time.perf_counter() - start
        print("Avg access time key={:<50}:\t{:.3f}ms".format(str(key), elapsed / n * 1000))
if __name__ == "__main__":
    import os

    # Set before `datasets` is imported -- presumably so the library skips
    # its TensorFlow integration; confirm against the datasets docs.
    os.environ["USE_TF"] = "0"

    import datasets as ds

    ds.logging.set_verbosity_error()

    bc = ds.load_dataset("bookcorpus", split="train")
    # bc = ds.concatenate_datasets([bc] * 10)

    n = 100  # lookups per key
    total = len(bc)

    # One key of each kind: a single index near the start, the last index,
    # a contiguous range at the tail, and 1024 seeded random indices.
    keys = [
        1,
        total - 1,
        range(total - 1024, total),
        RandIter(0, total, 1024, 42),
    ]
    print(f"Loaded dataset '{bc.info.builder_name}', len={len(bc)}, nbytes={bc.data.nbytes}\n")

    unshuffled_banner = "=" * 25 + " Querying unshuffled bookcorpus " + "=" * 25
    print("\n" + unshuffled_banner + "\n")
    bench(bc, keys, n)

    shuffled_banner = "=" * 26 + " Querying shuffled bookcorpus " + "=" * 26
    print("\n" + shuffled_banner + "\n")
    # Re-run the same keys against a shuffled (seed 42) view of the dataset.
    bc = bc.shuffle(42)
    bench(bc, keys, n)
Advertisement
Add Comment
Please, Sign In to add comment