""" Check how long it takes to iterate through an entire dataset on different disk hardware / filesystems. """ def main(): import ubelt as ub import kwcoco import platform from watch.utils import util_hardware hostname = platform.node() print('hostname = {!r}'.format(hostname)) # dvc_dpath = ub.Path('$HOME/data/dvc-repos/smart_watch_dvc').expand() dvc_dpath = ub.Path('$HOME/data/dvc-repos/smart_watch_dvc-hdd').expand() # dvc_dpath = ub.Path('$HOME/data/dvc-repos/smart_watch_dvc-ssd').expand() coco_fpath = dvc_dpath / 'Cropped-Drop3-TA1-2022-03-10/data.kwcoco.json' system_info = util_hardware.get_cpu_mem_info() disk_info = util_hardware.disk_info_of_path(coco_fpath) print('disk_info = {!r}'.format(disk_info)) start_timestamp = ub.timestamp() measures = [] import sys args = repr(sys.argv) benchmark_info = { 'type': 'process', 'properties': { 'name': 'benchmark', 'args': args, 'system_info': system_info, 'disk_info': disk_info, 'hostname': hostname, 'start_timestamp': start_timestamp, 'coco_fpath': coco_fpath, } } with ub.Timer('load_coco') as timer: coco_dset = kwcoco.CocoDataset(coco_fpath) measures.append({ 'type': 'measure', 'properties': { 'label': 'load_coco', 'seconds': timer.elapsed, 'size': coco_fpath.stat().st_size, } }) # import kwarray gids = list(coco_dset.images()) coco_img_list = list(coco_dset.images().coco_images) if 1: import kwarray gids = kwarray.shuffle(gids, rng=0)[0:100] fpaths = [] for gid in gids: coco_img = coco_dset.coco_image(gid) for fpath in coco_img.iter_image_filepaths(): fpath = ub.Path(fpath) fpaths.append(fpath) ## with ub.Timer('iter_image_exists') as timer: prog = ub.ProgIter(fpaths) for fpath in prog: fpath.exists() measures.append({ 'type': 'measure', 'properties': { 'label': timer.label, 'seconds': timer.elapsed, 'num_images': len(gids), } }) ## with ub.Timer('iter_image_stat') as timer: prog = ub.ProgIter(fpaths) for fpath in prog: fpath.stat() measures.append({ 'type': 'measure', 'properties': { 'label': timer.label, 'seconds': timer.elapsed, 'num_images': len(gids), 'Hz': prog._iters_per_second, } }) ## with ub.Timer('iter_image_load') as timer: prog = ub.ProgIter(coco_img_list) errors = [] per_img_times = [] for coco_img in prog: for stream in coco_img.channels.streams(): with ub.Timer() as t2: delayed = coco_img.imdelay(stream, space='asset') try: delayed.finalize() except Exception as ex: errors.append(ex) pass per_img_times.append(t2.elapsed) measures.append({ 'type': 'measure', 'properties': { 'label': timer.label, 'seconds': timer.elapsed, 'num_images': len(gids), 'num_errors': len(errors), 'per_img_times': per_img_times, } }) ## with ub.Timer('iter_image_load_second_pass') as timer: prog = ub.ProgIter(coco_img_list) errors = [] per_img_times = [] for coco_img in prog: for stream in coco_img.channels.streams(): with ub.Timer() as t2: delayed = coco_img.imdelay(stream, space='asset') try: delayed.finalize() except Exception as ex: errors.append(ex) pass per_img_times.append(t2.elapsed) measures.append({ 'type': 'measure', 'properties': { 'label': timer.label, 'seconds': timer.elapsed, 'num_images': len(gids), 'num_errors': len(errors), 'per_img_times': per_img_times, } }) end_timestamp = ub.timestamp() benchmark_info['end_timestamp'] = end_timestamp benchmark = { 'info': [benchmark_info], 'measures': measures, } for m in measures: if m['properties']['label'] == 'iter_image_load': print(m['properties']['label']) import pandas as pd try: print(pd.DataFrame({'seconds': m['properties']['per_img_times']}).describe().T) except Exception: pass if m['properties']['label'] == 'iter_image_load_second_pass': print(m['properties']['label']) import pandas as pd try: print(pd.DataFrame({'seconds': m['properties']['per_img_times']}).describe().T) except Exception: pass print('disk_info = {!r}'.format(disk_info)) # Save based on timestamp import safer import json from kwcoco.util import util_json benchmark = util_json.ensure_json_serializable(benchmark) fpath = ub.Path(f'benchmark-filesys-iteration-crop-{ub.timestamp()}.json') with safer.open(fpath, 'w', temp_file=True) as f: json.dump(benchmark, f) def gathering(): import ubelt as ub import kwplot kwplot.autompl() sns = kwplot.autosns() sns.set() suffixes = [ '', 'data/dvc-repos/smart_watch_dvc', 'data/dvc-repos/smart_watch_dvc-hdd', 'code/watch', ] remotes = [ ub.Path('$HOME/remote/namek').expand(), ub.Path('$HOME/remote/toothbrush').expand(), ub.Path('$HOME/remote/ooo').expand(), ub.Path('$HOME/remote/horologic').expand(), ] candidates = [] for remote in remotes: for suff in suffixes: dpath = remote / suff print(f'{dpath=}') candidates += list(dpath.glob('benchmark-filesys-*')) datas = [] for fpath in ub.ProgIter(candidates): import json with open(fpath, 'r') as file: data = json.load(file) data['fpath'] = str(fpath) datas.append(data) rows = [] trial = 0 for data in ub.ProgIter(datas): data['info'] try: info_prop = data['info'][0]['properties'] except KeyError: info_prop = data['info'][0] fs = info_prop['disk_info']['filesystem'] disks = '-'.join(info_prop['disk_info']['names']) cpu = info_prop['system_info']['cpu_info']['brand_raw'] import pint ureg = pint.UnitRegistry() ram = int((info_prop['system_info']['mem_info']['total'] * ureg.bytes).to('GiB').m) common = { 'cpu': cpu, 'filesystem': fs, 'hostname': info_prop['hostname'], 'disks': disks, 'ram': ram, } for m in data['measures']: trial += 1 prop = m['properties'] label = prop[ 'label'] if 'per_img_times' in prop: for time in m['properties']['per_img_times']: row = { 'label': label, 'trial': trial, 'per_img_time': time, **common, } rows.append(row) else: row = { 'label': label, 'trial': trial, 'seconds': prop['seconds'], **common, } rows.append(row) for row in ub.ProgIter(rows): key = '{}-{}-{}'.format(row['filesystem'], row['disks'], row['ram']) row['key'] = key import pandas as pd df = pd.DataFrame(rows) load_df = df[(df['label'] == 'iter_image_load') | (df['label'] == 'iter_image_load_second_pass')] ax = sns.violinplot(data=load_df, x='trial', y='per_img_time', # scale="width", scale="area", inner="quartile", hue='key', cut=0) ax.set_yscale('log') ax.set_title('Per Image Loading Time') if __name__ == '__main__': """ CommandLine: python ~/code/watch/dev/check_filesys_iteration_time.py """ main()