#!/usr/bin/env python3
"""
Combine kwcoco files with different "auxiliary" / "asset" features into a
single kwcoco file.
"""
import ubelt as ub
import scriptconfig as scfg


class CocoCombineFeatures(scfg.DataConfig):
    """
    Combine kwcoco files with different "auxiliary" / "asset" features into a
    single kwcoco file.

    The names of the kwcoco images in all of the input ``src`` datasets must be
    the same.

    TODO:
        - [ ] This might go in kwcoco proper? This could be folded into "union"
    """
    src = scfg.Value([], nargs='+', position=1, help=ub.paragraph(
        '''
        Paths to the input kwcoco datasets. The first one will be the "base"
        '''))

    dst = scfg.Value(None, help=ub.paragraph(
        '''
        Path to the destination combined kwcoco dataset to write.
        '''))

    io_workers = scfg.Value('avail', help=ub.paragraph(
        '''
        Number of workers used to read multiple datasets. Can be numeric or a
        string code like "avail", which uses all available CPUs.
        '''))

    absolute = scfg.Value(False, isflag=True, help=ub.paragraph(
        '''
        if True, reroot all inputs to use absolute paths
        '''))


def _missing_channel_hist(channel_specs):
    """
    Count, per channel code, how many images lack that channel.

    Args:
        channel_specs (List[Set]): one set of channel codes per image.

    Returns:
        Dict: maps each channel code that is absent from at least one image to
        the number of images it is absent from. Empty when every image has
        every channel, or when ``channel_specs`` is empty.
    """
    # ``set().union(*specs)`` is well defined for an empty list, whereas the
    # unbound ``set.union(*specs)`` raises a TypeError when given no arguments.
    all_channels = set().union(*channel_specs)
    missing_hist = {}
    for spec in channel_specs:
        for chan in all_channels - spec:
            missing_hist[chan] = missing_hist.get(chan, 0) + 1
    return missing_hist


def main(cmdline=True, **kwargs):
    """
    Load the ``src`` datasets, merge their assets into the first one, report
    which channels are still missing from which images, and write ``dst``.

    Args:
        cmdline (bool | int): forwarded to ``CocoCombineFeatures.cli``.
        **kwargs: configuration overrides, passed to
            ``CocoCombineFeatures.cli`` as ``data``.

    Example:
        >>> from geowatch.cli import coco_combine_features
        >>> import geowatch
        >>> dset = geowatch.coerce_kwcoco('geowatch-msi')
        >>> dpath = ub.Path.appdir('geowatch/tests/combine_fetures').ensuredir()
        >>> # Breakup the data into two parts with different features
        >>> dset1 = dset.copy()
        >>> dset2 = dset.copy()
        >>> dset1.fpath = dpath / 'part1.kwcoco.json'
        >>> dset2.fpath = dpath / 'part2.kwcoco.json'
        >>> # Remove all but the first asset from dset1
        >>> for coco_img in dset1.images().coco_images:
        ...     del coco_img.img['auxiliary'][1:]
        >>> # Remove the first asset from dset2
        >>> for coco_img in dset2.images().coco_images:
        ...     del coco_img.img['auxiliary'][0]
        >>> dset1.dump()
        >>> dset2.dump()
        >>> from geowatch.utils import kwcoco_extensions
        >>> chan_stats0 = kwcoco_extensions.coco_channel_stats(dset)['chan_hist']
        >>> chan_stats1 = kwcoco_extensions.coco_channel_stats(dset1)['chan_hist']
        >>> chan_stats2 = kwcoco_extensions.coco_channel_stats(dset2)['chan_hist']
        >>> assert chan_stats1 != chan_stats0, 'channels should be different'
        >>> # Combining the two modified kwcoco files should result in the original
        >>> dst_fpath = dpath / 'combo.kwcoco.json'
        >>> kwargs = {
        >>>     'src': [str(dset1.fpath), str(dset2.fpath)],
        >>>     'dst': str(dst_fpath),
        >>> }
        >>> cmdline = 0
        >>> coco_combine_features.main(cmdline=cmdline, **kwargs)
        >>> dst_dset = geowatch.coerce_kwcoco(dst_fpath)
        >>> chan_stats3 = kwcoco_extensions.coco_channel_stats(dst_dset)['chan_hist']
        >>> assert chan_stats3 == chan_stats0, (
        >>>     'combine features should have the same as the original dset')

    Example:
        >>> # xdoctest: +REQUIRES(env:DVC_DPATH)
        >>> # xdoctest: +SKIP
        >>> # drop1-S2-L8-aligned-old deprecated
        >>> from geowatch.cli.coco_combine_features import *  # NOQA
        >>> import os
        >>> _default = ub.expandpath('$HOME/data/dvc-repos/smart_watch_dvc')
        >>> dvc_dpath = ub.Path(os.environ.get('DVC_DPATH', _default))
        >>> fpath1 = dvc_dpath / 'drop1-S2-L8-aligned/data.kwcoco.json'
        >>> #fpath1 = dvc_dpath / 'drop1-S2-L8-aligned-old/data.kwcoco.json'
        >>> fpath2 = dvc_dpath / 'drop1-S2-L8-aligned-old/uky_invariants.kwcoco.json'
        >>> fpath3 = dvc_dpath / 'drop1-S2-L8-aligned/_testcombo.kwcoco.json'
        >>> assert fpath1.exists()
        >>> assert fpath2.exists()
        >>> cmdline = False
        >>> kwargs = {
        >>>     'src': [str(fpath1), str(fpath2)],
        >>>     'dst': str(fpath3),
        >>> }
        >>> main(cmdline, **kwargs)
    """
    import kwcoco
    config = CocoCombineFeatures.cli(data=kwargs, cmdline=cmdline)
    import rich
    rich.print(ub.urepr(config))

    dset_iter = kwcoco.CocoDataset.coerce_multiple(
        config.src, workers=config.io_workers)

    dset_list = []
    for dset in dset_iter:
        if config['absolute']:
            dset.reroot(absolute=True)
        dset_list.append(dset)

    # The first dataset is the "base" that receives assets from the others.
    src_dsets = dset_list[1:]
    dst_dset = dset_list[0]
    dst_dset.fpath = config['dst']
    dst_dset = combine_auxiliary_features(dst_dset, src_dsets)

    # Check which images have which features (did we miss any?)
    channel_specs = []
    prog = ub.ProgIter(dst_dset.index.imgs.values(),
                       total=dst_dset.n_images,
                       desc='checking features')
    for dst_img in prog:
        # An image without any 'auxiliary' entry simply has no channels; it
        # must not crash this diagnostic before the result is written.
        assets = dst_img.get('auxiliary') or []
        channel_specs.append({aux.get('channels') for aux in assets})

    missing_hist = _missing_channel_hist(channel_specs)
    if missing_hist:
        print('missing_hist = {!r}'.format(missing_hist))

    print('dump dst_dset.fpath = {!r}'.format(dst_dset.fpath))
    dst_dset.dump(newlines=True)


def combine_auxiliary_features(dst_dset, src_dsets):
    """
    Copies all non-existing assets from ``src_dsets`` into ``dst_dset``.

    Updates each image in ``dst_dset`` with all non-existing asset (as
    determined by the 'channels' attribute) in each corresponding image in each
    ``src_dsets``. Images are matched between datasets by name (see
    :func:`associate_images`); images without a match are left untouched.

    Args:
        dst_dset (kwcoco.CocoDataset): modified inplace
        src_dsets (List[kwcoco.CocoDataset]):

    Returns:
        kwcoco.CocoDataset: returns input ``dst_dset``.

    Example:
        >>> from geowatch.cli.coco_combine_features import *  # NOQA
        >>> import kwcoco
        >>> base = kwcoco.CocoDataset.demo('vidshapes8-multispectral')
        >>> dset1 = base.copy()
        >>> dset2 = base.copy()
        >>> dset3 = base.copy()
        >>> dset4 = base.copy()
        >>> for img in dset1.index.imgs.values():
        >>>     del img['auxiliary'][0::3]
        >>> for img in dset2.index.imgs.values():
        >>>     del img['auxiliary'][1::3]
        >>> dset2.remove_images([2, 3])
        >>> for img in dset3.index.imgs.values():
        >>>     del img['auxiliary'][2::3]
        >>> dset3.remove_images([2, 3])
        >>> for img in dset4.index.imgs.values():
        >>>     del img['auxiliary'][0::2]
        >>> dset4.remove_images([2, 3])
        >>> dst_dset = dset1
        >>> src_dsets = [dset2, dset3, dset4]
        >>> for img in dset1.index.imgs.values():
        ...     assert len(img['auxiliary']) != 5
        >>> dst_dset = combine_auxiliary_features(dst_dset, src_dsets)
        >>> lens1 = list(map(len, dset1.images(set(dset1.imgs) - {2, 3}).lookup('auxiliary')))
        >>> assert ub.allsame([5] + lens1)
        >>> lens2 = list(map(len, dset1.images({2, 3}).lookup('auxiliary')))
        >>> assert ub.allsame([3] + lens2)
    """
    for src_dset in src_dsets:
        gids1, gids2, report = associate_images(dst_dset, src_dset)
        print('report = {!r}'.format(report))
        for gid1, gid2 in zip(gids1, gids2):
            dst_img = dst_dset.index.imgs[gid1]
            src_img = src_dset.index.imgs[gid2]
            # Internal sanity check: the pairing is done by name.
            assert src_img['name'] == dst_img['name']

            # A source image without assets contributes nothing.
            src_auxiliary = src_img.get('auxiliary')
            if src_auxiliary is None:
                src_auxiliary = []

            # Make sure the destination owns a list that can be appended to.
            dst_auxiliary = dst_img.get('auxiliary')
            if dst_auxiliary is None:
                dst_auxiliary = dst_img['auxiliary'] = []

            have_channels = {aux.get('channels') for aux in dst_auxiliary}
            for src_aux in src_auxiliary:
                channels = src_aux['channels']
                if channels not in have_channels:
                    # Track the addition so that a later asset with the same
                    # channels is not added a second time.
                    have_channels.add(channels)
                    dst_auxiliary.append(src_aux)
    return dst_dset


def associate_images(dset1, dset2):
    """
    Get image ids for images in two datasets that share the same name.

    This is a heuristic for getting pairs of images that correspond between
    two datasets.

    Args:
        dset1 (kwcoco.CocoDataset):
        dset2 (kwcoco.CocoDataset):

    Returns:
        Tuple[List[int], List[int], Dict]:
            ``gids1`` and ``gids2`` are parallel lists: ``gids1[i]`` is the id
            in ``dset1`` and ``gids2[i]`` the id in ``dset2`` of an image with
            a name common to both. The final dict reports the number of
            common names and the number of names present in only one of the
            datasets (``num_name_missing1`` for names only in ``dset1``,
            ``num_name_missing2`` for names only in ``dset2``).
    """
    name_to_img1 = dset1.index.name_to_img
    name_to_img2 = dset2.index.name_to_img

    dset1_img_names = set(name_to_img1)
    dset2_img_names = set(name_to_img2)
    common_names = dset1_img_names & dset2_img_names

    report = {
        'num_name_common': len(common_names),
        'num_name_missing1': len(dset1_img_names - common_names),
        'num_name_missing2': len(dset2_img_names - common_names),
    }

    # Both comprehensions walk the same unmodified set, so the two lists are
    # index-aligned.
    gids1 = [name_to_img1[name]['id'] for name in common_names]
    gids2 = [name_to_img2[name]['id'] for name in common_names]
    return gids1, gids2, report


if __name__ == '__main__':
    """
    CommandLine:
        python ~/code/watch/geowatch/cli/coco_combine_features.py
        python -m geowatch.cli.coco_combine_features
    """
    main(cmdline=True)