import xdev def debug_feature_loading(): """ profile loading steps in ndsampler / kwcoco """ import xdev from watch.tasks.fusion.datamodules.kwcoco_video_data import KWCocoVideoDataset # NOQA import ndsampler import kwcoco import ubelt as ub from watch.utils.util_data import find_smart_dvc_dpath dvc_dpath = find_smart_dvc_dpath() print(ub.urepr(list((dvc_dpath / 'drop1-S2-L8-aligned').glob('combo*.json')))) coco_fpath = dvc_dpath / 'drop1-S2-L8-aligned/combo_train_data.kwcoco.json' coco_dset = kwcoco.CocoDataset(coco_fpath) sampler = ndsampler.CocoSampler(coco_dset) sample_shape = (3, 96, 96) channels = 'blue|green|red|nir|swir16' channels = 'blue' from watch.cli import watch_coco_stats watch_coco_stats.coco_watch_stats(coco_dset) channels = 'blue|inv_shared30' #channels = 'rice_field|cropland|water|inland_water|river_or_stream|sebkha|snow_or_ice_field|bare_ground|sand_dune|built_up|grassland|brush|forest|wetland|road' #channels = 'matseg_0|matseg_1|matseg_2|matseg_3|matseg_4' self = KWCocoVideoDataset(sampler, sample_shape=sample_shape, channels=channels, neg_to_pos_ratio=1.0) print('self = {!r}'.format(self)) # _ = xdev.profile_now(self.compute_dataset_stats)(num=10, num_workers=4, batch_size=1) # Profile the getitem step (points to load_sample as the bottleneck) if 0: _ = xdev.profile_now(self.__getitem__)(0) tr = self.new_sample_grid['targets'][0] tr['channels'] = self.channels print('tr = {!r}'.format(tr)) # Profile load_sample itself (points to sampler._load_slice as bottleneck) if 0: _ = xdev.profile_now(self.sampler.load_sample)(tr) pad = None padkw = {} # Several longer lived places exist here # Try with and without experimental loading if 0: tr['use_experimental_loader'] = 0 with ub.Timer() as t1: item1 = xdev.profile_now(self.sampler._load_slice)(tr, pad, padkw) tr['use_experimental_loader'] = 1 with ub.Timer() as t2: item2 = xdev.profile_now(self.sampler._load_slice)(tr, pad, padkw) print('t1.elapsed = {!r}'.format(t1.elapsed)) print('t2.elapsed = {!r}'.format(t2.elapsed)) # -- inspect outputs item1['im'].shape item1['im'].sum() print(item1['im'].mean(axis=(1, 2))) item2['im'].shape print(item2['im'].mean(axis=(1, 2))) if 0: import timerit print('start experimental loader timing') ti = timerit.Timerit(1, bestof=1, verbose=3) for timer in ti.reset('time'): with timer: tr['use_experimental_loader'] = 0 (self.sampler._load_slice)(tr, pad, padkw) for timer in ti.reset('time'): with timer: tr['use_experimental_loader'] = 1 (self.sampler._load_slice)(tr, pad, padkw) print('done experimental loader timing') # Dig into the experimental loader time_gids = tr['gids'] gid = time_gids[0] dset = coco_dset space_slice = tr['space_slice'] from kwcoco import channel_spec request_chanspec = channel_spec.ChannelSpec.coerce(channels) if 0: obj = dset.imgs[2]['auxiliary'][0] xdev.profile_now(dset._delay_load_imglike)(obj) from kwcoco.channel_spec import FusedChannelSpec xdev.profile_now(FusedChannelSpec.coerce)(obj['channels']) xdev.profile_now(FusedChannelSpec.parse)(obj['channels']) xdev.profile_now(FusedChannelSpec.coerce('a').normalize)() xdev.profile_now(FusedChannelSpec.coerce('a').normalize)() with ub.Timer() as t3: delayed_frame = xdev.profile_now(dset.delayed_load)(gid, space='video') with ub.Timer() as t4: delayed_frame = xdev.profile_now(dset.delayed_load)(gid, space='video', channels=request_chanspec) print('t3.elapsed = {!r}'.format(t3.elapsed)) print('t4.elapsed = {!r}'.format(t4.elapsed)) delayed_frame = xdev.profile_now(delayed_frame.delayed_crop)(space_slice) delayed_frame = dset.delayed_load(gid, space='video') delayed_frame = delayed_frame.crop(space_slice) # if not all_chan: # delayed_frame = delayed_frame.take_channels(request_chanspec) # xr_frame = delayed_frame.finalize(as_xarray=True) # space_frames.append(xr_frame) for i in range(100): # Not a huge time difference in the actual delayed load, which seems correct gid = 2 with ub.Timer() as tshrd: shared_delayed = (coco_dset.delayed_load)(gid=gid, channels='inv_shared30') with ub.Timer() as tblue: blue_delayed = (coco_dset.delayed_load)(gid=gid, channels='blue') print('---') print('shared tshrd.elapsed = {!r}'.format(tshrd.elapsed)) print('blue tblue.elapsed = {!r}'.format(tblue.elapsed)) # The crop operation is also fine print('---') space_slice = (slice(0, 96, None), slice(0, 96, None)) gid = 2 with ub.Timer() as tshrd: shared_crop = shared_delayed.delayed_crop(space_slice) with ub.Timer() as tblue: blue_crop = blue_delayed.delayed_crop(space_slice) print('shared tshrd.elapsed = {!r}'.format(tshrd.elapsed)) print('blue tblue.elapsed = {!r}'.format(tblue.elapsed)) # FIRST run of finalize is about 100x slower. print('---') with ub.Timer() as tblue: blue_final = blue_crop.finalize() with ub.Timer() as tshrd: shared_final = shared_crop.finalize() print('shared tshrd.elapsed = {!r}'.format(tshrd.elapsed)) print('blue tblue.elapsed = {!r}'.format(tblue.elapsed)) # SUBSEQUENT runs seem to be hitting a cache print('---') with ub.Timer() as tshrd: shared_final = shared_crop.finalize() with ub.Timer() as tblue: blue_final = blue_crop.finalize() print('shared tshrd.elapsed = {!r}'.format(tshrd.elapsed)) print('blue tblue.elapsed = {!r}'.format(tblue.elapsed)) if 1: from os.path import join from kwcoco.util.util_delayed_poc import LazyGDalFrameFile aux_blue = [aux for aux in coco_dset.imgs[gid]['auxiliary'] if 'blue' in aux['channels']][0] blue_fpath = join(dset.bundle_dpath, aux_blue['file_name']) blue_frame = LazyGDalFrameFile(blue_fpath) sl_blue = (slice(0, 147, None), slice(0, 147, None), slice(None, None, None)) aux_shared = [aux for aux in coco_dset.imgs[gid]['auxiliary'] if 'inv_shared1' in aux['channels']][0] shared_fpath = join(dset.bundle_dpath, aux_shared['file_name']) shared_frame = LazyGDalFrameFile(shared_fpath) sl_shared = (slice(0, 147, None), slice(0, 147, None), [29]) print('blue_fpath = {!r}'.format(blue_fpath)) print('shared_fpath = {!r}'.format(shared_fpath)) import timerit ti = timerit.Timerit(1, bestof=10, verbose=2) for timer in ti.reset('blue'): with timer: blue_frame[sl_blue] for timer in ti.reset('shared'): with timer: shared_frame[sl_shared] for timer in ti.reset('shared'): with timer: shared_frame[sl_shared] for timer in ti.reset('blue'): with timer: blue_frame[sl_blue] @xdev.profile def simple_debug_blue(): simple_debug_feature_loading('blue') @xdev.profile def simple_debug_shared(): simple_debug_feature_loading('inv_shared30') @xdev.profile def simple_debug_feature_loading(channels): """ Method to profile a single run of feature loading for some channel selection """ from watch.tasks.fusion.datamodules.kwcoco_video_data import KWCocoVideoDataset # NOQA import ndsampler import kwcoco from watch.utils.util_data import find_smart_dvc_dpath dvc_dpath = find_smart_dvc_dpath() coco_fpath = dvc_dpath / 'drop1-S2-L8-aligned/combo_train_data.kwcoco.json' coco_dset = kwcoco.CocoDataset(coco_fpath) sampler = ndsampler.CocoSampler(coco_dset) sample_shape = (3, 96, 96) self = KWCocoVideoDataset(sampler, sample_shape=sample_shape, channels=channels) self[0] self[1] self[2] self[3] if __name__ == '__main__': """ CommandLine: python ~/code/watch/dev/debug_kwcoco_feature_loading.py --profile """ simple_debug_blue() simple_debug_shared() # simple_debug_feature_loading('inv_shared30') # debug_feature_loading()