Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Benchmark single-example access on a memory-mapped Arrow file.

The script writes an Arrow IPC file of roughly 1GB (1024 copies of a table
holding 1024 strings of 1024 characters each), memory-maps it back as one
table, and prints the average time needed to read one "text" value at ten
evenly spaced row positions.
"""
import time
from pathlib import Path

import pyarrow as pa
from tqdm.auto import tqdm

# define paths
tmp_path = Path() / "tmp"
tmp_path.mkdir(exist_ok=True)
arrow_file_path = tmp_path / "test.arrow"

# write table
schema = pa.schema({"text": pa.string()})
table1MB = pa.Table.from_pydict({"text": ["a" * 1024] * 1024})
# The context manager closes the writer (finalising the file) even if a
# write fails part-way through.
with pa.RecordBatchFileWriter(str(arrow_file_path), schema=schema) as writer:
    for _ in tqdm(range(1024)):  # write 1GB
        writer.write_table(table1MB)

# memory map the table
mmapped_table1GB = pa.ipc.open_file(pa.memory_map(str(arrow_file_path))).read_all()

# run benchmark
n = 10  # number of evenly spaced positions to probe
repeats = 10  # accesses averaged per position
num_rows = len(mmapped_table1GB)
for step in range(n):
    # Row located at step/n of the way through the table.
    row_index = int(step / n * num_rows)
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short intervals; time.time() can jump and may be too coarse for
    # microsecond-scale timings.
    _start = time.perf_counter()
    for _ in range(repeats):
        # The column lookup is deliberately kept inside the timed loop: the
        # benchmark measures the full `table["text"][index]` access.
        mmapped_table1GB["text"][row_index]
    average_microseconds = (time.perf_counter() - _start) * 1e6 / repeats
    print(f"Time to access example at i={row_index / num_rows * 100:.0f}%\t: {average_microseconds:.1f}μs")

# Results (from a previous run)
# Time to access example at i=0% : 6.7μs
# Time to access example at i=10% : 7.2μs
# Time to access example at i=20% : 9.1μs
# Time to access example at i=30% : 11.4μs
# Time to access example at i=40% : 13.8μs
# Time to access example at i=50% : 16.2μs
# Time to access example at i=60% : 18.7μs
# Time to access example at i=70% : 21.1μs
# Time to access example at i=80% : 26.8μs
# Time to access example at i=90% : 25.2μs
# NOTE(review): access time grows with the row position; presumably the column
# is stored as many chunks that are scanned in order — confirm against pyarrow.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement