hdf5ファイルの読み書き比較【WIP】

最近HDF5ファイルを触る機会があったので、チャンクや圧縮などをどういう設定で使うのがよいか検証してみました。

記事は作成中です…

データ作成
ベンチマーク

データ作成

とりあえず乱数でデータ作成。

実際のデータがある場合は、そちらで検証してみると面白いかもしれません。

import h5py
import numpy as np

# >>>>>>>>>>>>>>>>>>>>>>>
# basic
# >>>>>>>>>>>>>>>>>>>>>>>
# Make the baseline HDF5 file (no explicit chunking, no compression).
#
# Sample the pixels directly as uint8 in [0, 255]. Casting the output of
# np.random.rand() (floats in [0, 1)) to uint8 truncates every value to 0,
# so all files would hold constant data and the size/speed comparison would
# be skewed; it also allocates a ~1.5 GB float64 temporary first.
# A fixed seed keeps the generated files reproducible between runs.
# NOTE: uniformly random bytes barely compress; use real images to judge
# the compression variants realistically.
data = np.random.default_rng(0).integers(
    0, 256, size=(1000, 256, 256, 3), dtype=np.uint8
)

with h5py.File('data.hdf5', 'w', libver='latest') as f:
    f.create_dataset('images', data=data)


# Read the file back and report the dataset shape and chunk layout.
with h5py.File('data.hdf5', 'r') as f:
    images = f['images']
    print("shape:", images.shape)
    print("chunks:", images.chunks)

# >>>>>>>>>>>>>>>>>>>>>>>
# optional
# >>>>>>>>>>>>>>>>>>>>>>>
# Chunk shapes: one image per chunk, and 16 images per chunk.
chunks_1 = (1, 256, 256, 3)
chunks_16 = (16, 256, 256, 3)

# Output file name -> extra create_dataset() options for that variant.
# Every variant stores the same `data` array under the 'images' key.
variants = {
    # with chunks (1 image per chunk)
    'data_chunks.hdf5': dict(chunks=chunks_1),
    # with chunks (16 images per chunk)
    'data_chunks16.hdf5': dict(chunks=chunks_16),
    # with auto-chunk: h5py picks the chunk shape
    # (the author's note records (63, 32, 32, 1) for this data)
    'data_chunks_auto.hdf5': dict(chunks=True),
    # with compression
    'data_compressed.hdf5': dict(chunks=chunks_1, compression='gzip'),
    # with compression, 16 images per chunk
    'data_compressed16.hdf5': dict(chunks=chunks_16, compression='gzip'),
    # with compression + shuffle filter
    'data_compressed_shuffle.hdf5': dict(
        chunks=chunks_1, compression='gzip', shuffle=True
    ),
}

for filename, options in variants.items():
    with h5py.File(filename, 'w', libver='latest') as f:
        f.create_dataset('images', data=data, **options)

ベンチマーク

使い方：辞書 FILES（ラベル → ファイルパス）を編集してから実行してください。

import h5py
import numpy as np
import time
import os


# Benchmark targets: label shown in the report -> path of the HDF5 file.
# Edit this mapping to choose which files are measured.
FILES = dict(
    basic="data.hdf5",
    chunks="data_chunks.hdf5",
    chunks16="data_chunks16.hdf5",
    chunks_auto="data_chunks_auto.hdf5",
    compressed="data_compressed.hdf5",
    compressed16="data_compressed16.hdf5",
    compressed_shuffle="data_compressed_shuffle.hdf5",
)

# Chunk-cache presets: label -> keyword arguments forwarded to h5py.File
# (None means "pass nothing").
CACHE_PRESETS = {
    # h5py / HDF5 defaults
    "default": None,
    # 2 MB cache (if chunks are large, adjust to about 1-2 chunks' worth);
    # slot count is a prime
    "nocache_like": dict(
        rdcc_nbytes=2 << 20,
        rdcc_nslots=1019,
        rdcc_w0=1.0,
    ),
    # 32 MB cache; slot count is a prime (on the larger side)
    "reuse_32mb": dict(
        rdcc_nbytes=32 << 20,
        rdcc_nslots=20011,
        rdcc_w0=0.0,
    ),
    # 256 MB cache; slot count is a prime (considerably larger)
    "reuse_256mb": dict(
        rdcc_nbytes=256 << 20,
        rdcc_nslots=150001,
        rdcc_w0=0.0,
    ),
}


# Dataset dimensions used by the benchmark: image count, height, width, channels.
N = 1000
H = 256
W = 256
C = 3
# Number of images fetched per batched read.
BATCH = 16
# Number of single-image random reads per measurement.
N_RANDOM = 1000

# Fixed-seed generator; the single-image access pattern is drawn once here.
rng = np.random.default_rng(seed=0)
random_indices = rng.integers(low=0, high=N, size=N_RANDOM)


def _open_file(path, cache_opts):
    """
    Open ``path`` read-only with h5py and return the file object.

    cache_opts:
        None -> open with the h5py defaults
        dict -> keyword arguments (the rdcc_* settings) forwarded to h5py.File
    """
    extra_kwargs = {} if cache_opts is None else cache_opts
    return h5py.File(path, "r", **extra_kwargs)


def measure_full_read(path, cache_opts=None):
    """
    Time reading the entire "images" dataset of ``path`` into memory.

    The file is opened and read twice; the first pass is a warm-up and only
    the duration of the second pass (in seconds, including opening and
    closing the file) is returned.
    """

    def timed_pass():
        started = time.perf_counter()
        with _open_file(path, cache_opts) as h5:
            _ = h5["images"][:]
        return time.perf_counter() - started

    timed_pass()  # warm-up, result discarded
    return timed_pass()


def measure_random_single(path, cache_opts=None):
    """
    Time reading single images one by one at the positions in ``random_indices``.

    The whole access sequence runs twice; the first pass is a warm-up and
    only the duration of the second pass (in seconds, including opening and
    closing the file) is returned.
    """

    def timed_pass():
        started = time.perf_counter()
        with _open_file(path, cache_opts) as h5:
            images = h5["images"]
            for index in random_indices:
                _ = images[index]
        return time.perf_counter() - started

    timed_pass()  # warm-up, result discarded
    return timed_pass()


def measure_batch_read(path, cache_opts=None):
    """
    Time reading contiguous batches of ``BATCH`` images at random offsets.

    ``N_RANDOM // BATCH`` batch start positions are drawn, and the whole
    sequence of slice reads runs twice; the first pass is a warm-up and only
    the duration of the second pass (in seconds, including opening and
    closing the file) is returned.

    path: path of the HDF5 file containing an "images" dataset.
    cache_opts: None or a dict of keyword arguments passed on to _open_file.
    """
    # Use a dedicated fixed-seed generator instead of the shared module-level
    # one: drawing from shared state would give every call different start
    # positions, so files and cache presets would be timed on different
    # access patterns and their results would not be comparable.
    local_rng = np.random.default_rng(0)
    # The upper bound is exclusive, so "+ 1" makes the last valid start
    # (N - BATCH, whose batch ends exactly at N) reachable.
    starts = local_rng.integers(0, N - BATCH + 1, size=N_RANDOM // BATCH)

    times = []
    for _ in range(2):  # first pass is a warm-up
        t0 = time.perf_counter()
        with _open_file(path, cache_opts) as f:
            dset = f["images"]
            for s in starts:
                _ = dset[s : s + BATCH]
        times.append(time.perf_counter() - t0)
    return times[1]


def file_size_mb(path):
    """Return the on-disk size of ``path`` in mebibytes (bytes / 1024**2)."""
    bytes_per_mb = 1024 * 1024
    size_in_bytes = os.stat(path).st_size
    return size_in_bytes / bytes_per_mb


# Run every measurement for every file / cache-preset combination and print
# a plain-text report.
for name, path in FILES.items():
    print("\n==============================")
    print(f"=== FILE: {name} ===")

    size_mb = file_size_mb(path)
    print(f"file size : {size_mb:.2f} MB")

    # Uncompressed payload size in MB, taken from the dataset itself
    # (element count x bytes per element). Hard-coding 4 bytes per element
    # would overstate the MB/s figure for data that is not 4 bytes wide.
    with h5py.File(path, "r") as f:
        dset = f["images"]
        total_mb = dset.size * dset.dtype.itemsize / (1024 ** 2)

    for cache_name, cache_opts in CACHE_PRESETS.items():
        print(f"\n--- cache preset: {cache_name} ---")

        t_full = measure_full_read(path, cache_opts)
        t_rand = measure_random_single(path, cache_opts)
        t_batch = measure_batch_read(path, cache_opts)

        print(f"full read      : {t_full:.4f} s  ({total_mb / t_full:.1f} MB/s)")
        print(f"random single  : {t_rand:.4f} s  ({N_RANDOM / t_rand:.1f} imgs/s)")
        print(f"batch read     : {t_batch:.4f} s  ({(N_RANDOM / BATCH) / t_batch:.1f} batches/s)")