import ubelt as ub try: import xdev profile = xdev.profile except Exception: profile = ub.identity def benchmark_models(): # https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html # TODO: profile attention_impl import netharn as nh # from watch.tasks.fusion import datamodules import torch.profiler from watch.tasks.fusion.architectures import transformer from watch.tasks.fusion.methods.channelwise_transformer import MultimodalTransformer # from torch.profiler import profile, ProfilerActivity, record_function # datamodule = datamodules.KWCocoVideoDataModule( # train_dataset='special:vidshapes8', num_workers=0) # datamodule.setup('fit') # loader = datamodule.train_dataloader() # batch = next(iter(loader)) #self = MultimodalTransformer(arch_name='smt_it_joint_p8') # frames = batch[0]['frames'] # collate_images = torch.cat([frame['modes']['r|g|b'][None, :].float() for frame in frames], dim=0) device = nh.XPU.coerce('cpu').main_device device = nh.XPU.coerce('gpu').main_device #device = nh.XPU.coerce('auto').main_device # images = collate_images[None, :].to(device) input_grid = list(ub.named_product({ 'S': [32, 64, 96, 128], # 'T': [2, 3, 5, 9], # 'T': [2, 5, 9], # 'T': [2, 5, 9], # 'T': [2], 'T': [2, 9, 11, 17], # 'M': [3, 5, 7, 11, 13, 32, 64, 128, 256], 'M': [3, 5, 7, 11, 13, 32, 64], })) # coco_fpath = ub.expandpath('$HOME/data/work/toy_change/vidshapes_msi_train/data.kwcoco.json') import watch from watch.tasks.fusion import datamodules if 1: coco_dset = watch.coerce_kwcoco('watch-msi') channels = "B11,r|g|b,B1|B8|B11" datamodule = datamodules.KWCocoVideoDataModule( train_dataset=coco_dset, chip_size=224, batch_size=1, time_steps=3, channels=channels, normalize_inputs=True, neg_to_pos_ratio=0, num_workers='avail/2', true_multimodal=True, ) datamodule.setup('fit') # TODO batch = next(iter(datamodule.train_dataloader())) # NOQA encoder_info = transformer.encoder_configs.copy() import pandas as pd df = pd.DataFrame(encoder_info).T df['num_atten_mechs'] = df['axes'].map(len) df['axes'] = df['axes'].map(lambda x: '.'.join([ ''.join([p[0] for p in part]) for part in x ])) df['default_shape'] = df['default_shape'].map(lambda x: ''.join([c[0] for c in x])) df.drop('axes', axis=1) df.index.name = 'arch_name' # df = df.drop('axes', axis=1) # df = df.drop('default_shape', axis=1) flags = ( (df['n_layers'] >= 12) & (df['n_layers'] < 24) & (df['num_atten_mechs'] == 1) ) df_subset = df[flags] print(df_subset.to_string()) df = df[df['n_heads'] >= 8] chosen_arch = [ # 'smt_it_stm_p8', 'smt_it_joint_p8', # 'smt_it_hwtm_p8', # 'smt_it_joint_n12', ] # chosen_arch = df_subset.index.values.tolist() # all_arch_names = list(transformer.encoder_configs.keys()) model_basis = { 'arch_name': chosen_arch, 'squash_modes': [True, False], 'window_size': [8, 4], 'attention_impl': [ 'exact', 'performer' # 'reformer' ], } model_grid = list(ub.named_product(model_basis)) import itertools as it bench_grid = list(it.product(model_grid, input_grid)) rows = [] nicerows = [] self = None images = None # train_prof = None output = None torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() torch.cuda.reset_max_memory_allocated() # Pure memory benchmarks for modelkw, inputkw in ub.ProgIter(bench_grid, verbose=3): # for arch_name in ['smt_it_stm_p8']: M = inputkw['M'] T = inputkw['T'] S = inputkw['S'] row = {} row.update(inputkw) row.update(modelkw) row.update(ub.dict_isect(transformer.encoder_configs[modelkw['arch_name']], {'n_layers', 'embedding_size', 'n_heads'})) # Get dummy input images = torch.rand(1, T, M, S, S).to(device) errored = False try: self = MultimodalTransformer(input_channels=M, **modelkw) num_params = nh.util.number_of_parameters(self) self = self.to(device) optim = torch.optim.SGD(self.parameters(), lr=1e-9) # with torch.profiler.profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=True) as train_prof: # with torch.profiler.profile(activities=[ProfilerActivity.CUDA], record_shapes=False, profile_memory=True) as train_prof: # with record_function(f"train_{arch_name}"): optim.zero_grad() output = self(images)['change'] output.sum().backward() optim.step() # total_memory = sum(event.cuda_memory_usage for event in train_prof.events()) # total_mem_str = xdev.byte_str(total_memory) # print(total_mem_str) row.update({ 'num_params': num_params, }) mem_stats = ({ 'max_mem_alloc': torch.cuda.max_memory_allocated(), 'mem_alloc': torch.cuda.memory_allocated(), 'mem_reserve': torch.cuda.memory_reserved(), 'max_mem_reserve': torch.cuda.max_memory_reserved(), }) row.update(mem_stats) except RuntimeError: errored = True pass except AssertionError: errored = True pass self = None images = None # train_prof = None output = None optim = None torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() torch.cuda.reset_max_memory_allocated() if not errored: rows.append(row) nicerow = row.copy() nicestats = {k + '_str': xdev.byte_str(v, unit='GB') if isinstance(v, int) else v for k, v in mem_stats.items()} nicerow.update(nicestats) nicerows.append(nicerow) print(nicerow) import pandas as pd df = (pd.DataFrame(nicerows)) df = df.sort_values('max_mem_alloc') df['max_mem_alloc_str'] = df['max_mem_alloc'].map(lambda x: xdev.byte_str(x, 'GB')) print(df.to_string()) df_subset = df[df.arch_name.apply(lambda x: 'joint' in x)] import kwplot kwplot.autompl() sns = kwplot.autosns() from matplotlib.colors import LogNorm fnum = 0 grouper = list(model_basis.keys()) for k, subdf in df.groupby(grouper): fnum += 1 print('') print('k = {!r}'.format(k)) piv = subdf.pivot(['S', 'T'], ['M', 'num_params'], ['max_mem_alloc_str']) print(piv) fig = kwplot.figure(fnum=fnum) fig.set_size_inches(8.69, 4.93) ax = fig.gca() piv = piv.droplevel((0, 2), axis=1) d = piv.applymap(lambda x: float(x.split(' ')[0]) if isinstance(x, str) else x) arch_cfg = transformer.encoder_configs[k[0]] cfg = ub.dzip(grouper, k) cfg.update(ub.dict_isect(arch_cfg, {'n_layers', 'n_heats', 'embedding_size'})) title = ub.urepr(cfg, compact=1, sort=0) sns.heatmap(d, annot=piv, ax=ax, fmt='s', norm=LogNorm(vmin=1, vmax=24), annot_kws={'size': 8}, cbar_kws={'label': 'memory', 'pad': 0.001}) # ax.set_xticklabels(ax.get_xticklabels(), rotation=45) ax.figure.subplots_adjust(bottom=0.2) ax.set_title(title) ax.set_xlabel('Number of Modes (M)') ax.set_ylabel('Space (S) Time (S) dims') # ax.figure # import timerit # ti = timerit.Timerit(3, bestof=1, verbose=2) # # # for arch_name in ['smt_it_stm_p8', 'smt_it_joint_p8', 'smt_it_hwtm_p8']: # print('====') # # self = MultimodalTransformer(arch_name=arch_name, input_channels=datamodule.input_channels) # num_params = nh.util.number_of_parameters(self) # print('arch_name = {!r}'.format(arch_name)) # print('num_params = {!r}'.format(num_params)) # print('running') # self = self.to(device) # output = self(images) # for timer in ti.reset(f'inference-{arch_name}'): # torch.cuda.synchronize() # with timer: # output = self(images)['change'] # torch.cuda.synchronize() # for timer in ti.reset(f'train-{arch_name}'): # torch.cuda.synchronize() # with timer: # output = self(images)['change'] # output.sum().backward() # torch.cuda.synchronize() # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=True) as pred_prof: # with record_function(f"pred_{arch_name}"): # output = self(images)['change'] # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=True) as train_prof: # with record_function(f"train_{arch_name}"): # output = self(images)['change'] # output.sum().backward() # print('arch_name = {!r}'.format(arch_name)) # print('num_params = {!r}'.format(num_params)) # print(pred_prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) # print(train_prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) # total_memory = sum(event.cuda_memory_usage for event in train_prof.events()) # total_mem_str = xdev.byte_str(total_memory) # print(total_mem_str) if __name__ == '__main__': """ CommandLine: python ~/code/watch/watch/tasks/fusion/experiments/crall/benchmark_models.py """ benchmark_models()