import ubelt as ub
import os


def find_linked_files(dpath):
    """
    Walk ``dpath`` and collect every symlink together with the path it
    resolves to.
    """
    items = []
    prog = ub.ProgIter(desc='walking')
    prog.begin()
    for r, ds, fs in os.walk(dpath):
        r = ub.Path(r)
        prog.update(1)
        for f in fs:
            fpath = r / f
            if fpath.is_symlink():
                real_fpath = fpath.readlink()
                items.append({'fpath': fpath, 'real_fpath': real_fpath})
    prog.end()
    return items


def main():
    """
    We want to see (1) if there are symlinks that point to the same data in
    different datasets, and (2) if there are files that are not referenced by
    the kwcoco data.
    """
    import watch
    dvc_dpath = watch.find_smart_dvc_dpath().augment(suffix='-hdd')
    dpath1 = dvc_dpath / 'Drop2-Aligned-TA1-2022-02-15'
    dpath2 = dvc_dpath / 'Aligned-Drop3-TA1-2022-03-10'

    linked_files1 = find_linked_files(dpath1)
    linked_files2 = find_linked_files(dpath2)

    # DVC cache files are named by their content hash, so the basename of the
    # resolved symlink serves as an md5 key.
    for item in linked_files2:
        item['md5'] = item['real_fpath'].name
    for item in linked_files1:
        item['md5'] = item['real_fpath'].name

    md5_to_items1 = ub.group_items(linked_files1, key=lambda x: x['md5'])
    md5_to_items2 = ub.group_items(linked_files2, key=lambda x: x['md5'])

    # Hashes that appear in both datasets, i.e. the same underlying data is
    # linked from both drops.
    common = set(md5_to_items1) & set(md5_to_items2)
    for md5 in common:
        items1 = md5_to_items1[md5]  # NOQA
        items2 = md5_to_items2[md5]  # NOQA

    # Hashes that are linked more than once within a single dataset.
    md5_dup1 = {k: v for k, v in ub.map_vals(len, md5_to_items1).items() if v > 1}
    md5_dup2 = {k: v for k, v in ub.map_vals(len, md5_to_items2).items() if v > 1}

    # Leftover interactive lookups (kept for reference; the hash is truncated
    # and these would raise a KeyError if executed):
    # md5_dup1['']
    # md5_to_items1['4a67c5b4326903f6ca36a34bd8d531']

    for k in md5_dup2:
        items = md5_to_items2[k]
        print('items = {}'.format(ub.urepr(items, nl=1)))


def find_unregistered_files():
    """
    Check which symlinked files in a dataset directory are not registered in
    the kwcoco manifest, and summarize how much space they take.
    """
    import watch
    import kwcoco
    dvc_dpath = watch.find_smart_dvc_dpath().augment(suffix='-hdd')
    dpath = dvc_dpath / 'Aligned-Drop3-TA1-2022-03-10'
    coco_fpath = dpath / 'data.kwcoco.json'
    coco_dset = kwcoco.CocoDataset(coco_fpath)

    linked_files = find_linked_files(dpath)
    for item in linked_files:
        item['md5'] = item['real_fpath'].name

    md5_to_items = ub.group_items(linked_files, key=lambda x: x['md5'])
    fpath_to_md5 = {x['fpath']: x['md5'] for x in linked_files}

    on_disk_files = {f['fpath'] for f in linked_files}

    # Gather every image file path referenced by the kwcoco dataset.
    in_kwcoco_files = set()
    for coco_img in coco_dset.images().coco_images:
        in_kwcoco_files.update(list(map(ub.Path, coco_img.iter_image_filepaths())))

    # Find all the files that are not registered in the kwcoco data
    print(len(in_kwcoco_files))
    print(len(on_disk_files))
    common = on_disk_files & in_kwcoco_files
    print(len(common))
    unregistered_files = on_disk_files - in_kwcoco_files

    total_unreg_bytes = 0
    for unreg_fpath in ub.ProgIter(unregistered_files):
        total_unreg_bytes += unreg_fpath.lstat().st_size

    import pint
    reg = pint.UnitRegistry()
    total_unreg_size = (total_unreg_bytes * reg.Unit('bytes')).to('megabytes')
    print('total_unreg_size = {!r}'.format(total_unreg_size))

    # Check whether each unregistered file shares its hash with another link
    # (i.e. whether removing the link would actually free cache space).
    for unreg_fpath in unregistered_files:
        md5 = fpath_to_md5[unreg_fpath]
        others = md5_to_items[md5]
        if len(others) > 1:
            print('others = {!r}'.format(others))
            print('has others')
        else:
            print('no others')

    # Accumulate the size of each unique underlying file, annotated with the
    # region and sensor parsed from its path relative to the dataset root.
    total_bytes = 0
    unique_items = []
    for md5, items in ub.ProgIter(md5_to_items.items()):
        item = items[0]
        fpath = item['fpath']
        rel = fpath.relative_to(dpath)
        region, sensor = rel.parts[0:2]
        stat = item['real_fpath'].stat()
        item['st_size'] = stat.st_size
        item['sensor'] = sensor
        item['region'] = region
        unique_items.append(item)
        total_bytes += stat.st_size

    total_size = (total_bytes * reg.Unit('bytes')).to('terabytes')
    print('total_size = {!r}'.format(total_size))

    def bytes_to_gigabytes(x):
        return (x * reg.bytes).to('gigabytes').m

    # Summarize on-disk size per sensor in gigabytes.
    import pandas as pd
    df = pd.DataFrame(unique_items)
    print(df.groupby('sensor')['st_size'].sum().apply(bytes_to_gigabytes))
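

# The original script defines no entry point, so the guard below is a minimal
# sketch (an assumption, not part of the source) showing how the two ad-hoc
# analyses might be invoked together from the command line.
if __name__ == '__main__':
    # First compare symlink targets across the two drops ...
    main()
    # ... then audit which files on disk are not registered in the kwcoco data.
    find_unregistered_files()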