#!/usr/bin/env python3
r"""
Usage:

    ANNOTATIONS_DPATH=$HOME/data/dvc-repos/smart_data_dvc-ssd/annotations/drop6
    python ~/code/watch/dev/poc/compare_iarpa_metrics_versions.py \
        --roi CH_R001 \
        --sm_dir $HOME/data/dvc-repos/smart_expt_dvc/_airflow/preeval14_batch_v28/CH_R001/kit_fixups_v2/cropped_site_models_fixed \
        --gt_dir $ANNOTATIONS_DPATH/site_models \
        --rm_dir $ANNOTATIONS_DPATH/region_models

    ANNOTATIONS_DPATH=$HOME/data/dvc-repos/smart_data_dvc-ssd/annotations/drop6
    python ~/code/watch/dev/poc/compare_iarpa_metrics_versions.py \
        --roi KR_R001 \
        --sm_dir $HOME/data/dvc-repos/smart_expt_dvc/_airflow/preeval14_batch_v28/KR_R001/kit_fixups_v2/cropped_site_models_fixed \
        --gt_dir $ANNOTATIONS_DPATH/site_models \
        --rm_dir $ANNOTATIONS_DPATH/region_models \
        --versions \
            kitware-main-2023-07-25 \
            new_kit_speedups_2023-08-01 \
            make-database-reqs-optional

Requirements:
    pip install ubelt scriptconfig rich cmd_queue pandas
"""
import scriptconfig as scfg
import ubelt as ub
import cmd_queue
import itertools as it
import pandas as pd
import rich
import numbers
import numpy as np


class CompareIarpaMetricsVersionsCLI(scfg.DataConfig):
    """
    Install, run, and compare the results of multiple versions of the metrics
    code.

    This script will create virtual environments for different versions of the
    IARPA metrics and inspect the differences between them.
    """
    roi = scfg.Value(None, help='Region name')
    gt_dir = scfg.Value(None, help='Path to true sites directory')
    rm_dir = scfg.Value(None, help='Path to true regions directory')
    sm_dir = scfg.Value(None, help='Path to predicted site models directory')
    repo_dpath = scfg.Value(None, help='Path to a canonical checkout of the metrics code. Inferred if possible')
    performer = scfg.Value('kit', help='Passed to the metrics code')
    tmux_workers = scfg.Value(4, help='Number of tmux workers')
    versions = scfg.Value([
        'kitware-main-2023-07-25',
        'new_kit_speedups_2023-08-01',
        'make-database-reqs-optional',
    ], nargs='+', help='Multiple git hashes or branch names of the metrics repo to test')

    @classmethod
    def main(cls, cmdline=1, **kwargs):
        """
        Example:
            >>> # xdoctest: +SKIP
            >>> from compare_iarpa_metrics_versions import *  # NOQA
            >>> cmdline = 0
            >>> kwargs = dict()
            >>> cls = CompareIarpaMetricsVersionsCLI
            >>> cls.main(cmdline=cmdline, **kwargs)
        """
        config = cls.cli(cmdline=cmdline, data=kwargs, strict=True)
        rich.print('config = ' + ub.urepr(config, nl=1))

        roi = config.roi
        performer = config.performer
        gt_dir = ub.Path(config.gt_dir)
        rm_dir = ub.Path(config.rm_dir)
        sm_dir = ub.Path(config.sm_dir)
        assert gt_dir.exists()
        assert rm_dir.exists()
        assert sm_dir.exists()
        versions = config.versions

        # Ensure we have a venv for each requested version with a standalone
        # checkout of the code installed.
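        # Each returned item is a dict with the keys 'version',
        # 'commit_hashid', 'clone_dpath', and 'venv_dpath'.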
        repo_infos = setup_versioned_repos(versions, repo_dpath=config.repo_dpath)

        # Setup a command queue which will execute all of the metrics commands
        # in parallel
        queue = cmd_queue.Queue.create(
            backend='tmux', size=min(len(versions), config.tmux_workers))
        requests = []
        for repo_info in repo_infos:
            # Build an evaluation command specific to each version of the code
            sm_hashid = ub.hash_data(sm_dir)[0:8]
            output_dir = (repo_info['clone_dpath'] / f'output_{roi}_{sm_hashid}')
            venv_dpath = repo_info['venv_dpath']
            metrics_command = ub.codeblock(
                fr'''
                source {venv_dpath}/bin/activate &&
                python -m iarpa_smart_metrics.run_evaluation \
                    --roi {roi} \
                    --gt_dir {gt_dir} \
                    --rm_dir {rm_dir} \
                    --sm_dir {sm_dir} \
                    --output_dir "{output_dir}" \
                    --activity overall \
                    --performer={performer} \
                    --eval_num=0 --eval_run_num=0 \
                    --sequestered_id '' \
                    --serial --no-viz-slices \
                    --no-viz-region \
                    --no-viz-slices \
                    --no-viz-detection-table --no-viz-comparison-table --no-viz-associate-metrics --no-viz-activity-metrics
                ''')
            request = repo_info.copy()
            request['metrics_command'] = metrics_command
            request['output_dir'] = output_dir
            if not output_dir.exists():
                queue.submit(metrics_command)
            requests.append(request)

        queue.print_commands()
        queue.run()
        rich.print('repo_infos = {}'.format(ub.urepr(repo_infos, nl=2)))

        # Load up all metric outputs and compare them.
        outputs = {}
        for repo_info in requests:
            hashid = repo_info['commit_hashid']
            output_dir = repo_info['output_dir']
            print(f'Load results: output_dir={output_dir}')
            outputs[hashid] = load_metric_results(output_dir)

        # Compare each pair of outputs
        for out_hashid1, out_hashid2 in it.combinations(outputs, 2):
            outs1 = outputs[out_hashid1]
            outs2 = outputs[out_hashid2]
            print('----')
            rich.print(f'Compare: {out_hashid1} and {out_hashid2}')
            status = compare_output_pair(outs1, outs2)
            if status['n_common'] == 0:
                rich.print(f'[red]{out_hashid1} and {out_hashid2} have NO COMMON FILES')
            elif status['has_major_difference']:
                rich.print(f'[red]{out_hashid1} and {out_hashid2} are NOT the same')
            elif status['has_minor_difference']:
                rich.print(f'[yellow]{out_hashid1} and {out_hashid2} are very close')
            else:
                rich.print(f'[green]{out_hashid1} and {out_hashid2} are exactly the same')
            rich.print('status = {}'.format(ub.urepr(status, nl=-1)))


def setup_versioned_repos(versions, repo_dpath=None):
    """
    Setup a standalone virtual environment and clone for each specified branch
    of the metrics repo, and install the repo into that venv.
    """
    import iarpa_smart_metrics
    if repo_dpath is None:
        repo_dpath = ub.Path(iarpa_smart_metrics.__file__).parent.parent
    else:
        repo_dpath = ub.Path(repo_dpath)
    src_gitdir = repo_dpath / '.git'
    assert src_gitdir.exists()

    # Temporary dir where we will store the clones and their venvs
    clone_base = ub.Path.appdir('cache', 'iarpa_smart_metrics', 'compare-versions')

    # Resolve each requested version to a short commit hash
    repo_infos = []
    for version in versions:
        info = ub.cmd(f'git rev-parse {version}', cwd=repo_dpath, check=1)
        commit_hashid = info.stdout.strip()[0:8]
        repo_infos.append({
            'version': version,
            'commit_hashid': commit_hashid,
        })

    # Install different versions of the code to different virtual environments.
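    # Each version gets its own clone (reset to the requested commit) and its
    # own venv, with a pinned pandas and an editable install of that clone.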
    for repo_info in repo_infos:
        commit_hashid = repo_info['commit_hashid']
        clone_dpath = clone_base / f'metrics_{commit_hashid}'
        venv_dpath = clone_dpath / f'venv_{commit_hashid}'
        repo_info['venv_dpath'] = venv_dpath
        repo_info['clone_dpath'] = clone_dpath
        clone_dpath.parent.ensuredir()

        if not clone_dpath.exists():
            ub.cmd(f'git clone {src_gitdir} {clone_dpath}', verbose=3)
            ub.cmd(f'git reset --hard {commit_hashid}', verbose=3, cwd=clone_dpath)

        if not venv_dpath.exists():
            ub.cmd(f'python -m venv {venv_dpath}', verbose=3)

        # Skip if we already installed the package
        if not list(clone_dpath.glob('*.egg-info')):
            # Hack to ensure specific versions
            ub.cmd(f"bash -c 'source {venv_dpath}/bin/activate && pip install pandas==1.5.3'",
                   shell=True, cwd=clone_dpath, verbose=3, check=True)
            ub.cmd(f"bash -c 'source {venv_dpath}/bin/activate && pip install -e .[runtime-strict]'",
                   shell=True, cwd=clone_dpath, verbose=3, check=True)
    return repo_infos


def load_metric_results(output_dir):
    """
    Get a list of all files and their hashes in an output directory
    """
    fpaths = {}
    for r, ds, fs in output_dir.walk():
        for f in fs:
            # Skip images
            if not f.endswith('.png'):
                fpath = r / f
                rel_fpath = fpath.relative_to(output_dir)
                fpaths[rel_fpath] = {
                    'fpath': fpath,
                    'hash': ub.hash_file(fpath),
                }
    return fpaths


def compare_output_pair(outs1, outs2):
    """
    Given a precomputed list of files / hashes in two directories, compare the
    directory contents and return information about how many major / minor
    differences there were.
    """
    fnames1 = set(outs1.keys())
    fnames2 = set(outs2.keys())
    common_fnames = fnames1 & fnames2
    unpaired_fnames = fnames1 ^ fnames2
    exactsame_fnames = []
    hasdiff_fnames = []
    errors = []
    diffs = {
        'ac_phase_table': {
            'major': 0,
            'minor': 0,
            'rows_checked': 0,
            'files_checked': 0,
            'exact_same': 0,
        },
        'other_csv': {
            'major': 0,
            'minor': 0,
            'items_checked': 0,
            'files_checked': 0,
            'exact_same': 0,
        },
    }
    if fnames1 != fnames2:
        errors.append('Versions did not produce the exact same output file names')

    for fname in common_fnames:
        info1 = outs1[fname]
        info2 = outs2[fname]
        fpath1 = info1['fpath']
        fpath2 = info2['fpath']

        # The type of diff we do depends on the specific file
        if fpath1.name == 'ac_phase_table.csv':
            diff_key = 'ac_phase_table'
        elif fpath1.name.endswith('.csv'):
            diff_key = 'other_csv'
        else:
            diff_key = NotImplemented
            raise NotImplementedError

        if info1['hash'] == info2['hash']:
            exactsame_fnames.append(fname)
            diffs[diff_key]['exact_same'] += 1
        else:
            hasdiff_fnames.append(fname)
            if diff_key == 'ac_phase_table':
                _this_diff = compare_ac_phase_table(fpath1, fpath2)
                for k, v in _this_diff.items():
                    diffs[diff_key][k] += v
            elif fpath1.name.endswith('.csv'):
                _this_diff = compare_generic_csv(fpath1, fpath2)
                if 'error' in _this_diff:
                    errors.append(_this_diff.pop('error'))
                for k, v in _this_diff.items():
                    diffs[diff_key][k] += v
            else:
                raise NotImplementedError('TODO: handle if this comes up')
                print(f'{fpath1=}')
                t1 = fpath1.read_text()
                t2 = fpath2.read_text()
                print(difftext(t1, t2, colored=True))
                raise Exception

    # Summarize how different the two output dirs are
    status = {}
    status['n_common'] = len(common_fnames)
    status['n_unpaired'] = len(unpaired_fnames)
    status['diffs'] = diffs
    status['n_exact_same'] = len(exactsame_fnames)
    status['n_some_diff'] = len(hasdiff_fnames)

    has_major_difference = False
    has_minor_difference = False
    if status['n_unpaired'] > 0:
        has_major_difference = True
    if status['n_some_diff'] > 0:
        has_minor_difference = True
    for subdiff in diffs.values():
        if subdiff['major'] > 0:
            has_major_difference = True
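        # A nonzero "minor" count means values differed, but only within
        # floating point tolerance or by subsite ordering.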
        if subdiff['minor'] > 0:
            has_minor_difference = True
    if len(errors):
        has_major_difference = True
    if has_major_difference:
        has_minor_difference = True
    status['has_major_difference'] = has_major_difference
    status['has_minor_difference'] = has_minor_difference
    status['errors'] = errors
    return status


def compare_generic_csv(fpath1, fpath2):
    """
    Basic comparison of CSVs
    """
    df1 = pd.read_csv(fpath1)
    df2 = pd.read_csv(fpath2)
    isna1 = df1.isna()
    isna2 = df2.isna()
    # Flag cells that disagree (treating NaN == NaN as agreement)
    flags = ~((df1 == df2) | (isna1 & isna2))
    this_diff = {
        'files_checked': 1,
        'items_checked': df1.values.size,
    }
    if flags.values.any():
        bad_rows1 = df1[flags.any(axis=1)]
        bad_rows2 = df2[flags.any(axis=1)]
        # Check to see if the difference is a minor floating point
        # difference issue
        is_both_nan = (bad_rows1.isna() & bad_rows2.isna())
        is_bad_item = (bad_rows1 != bad_rows2) & ~is_both_nan
        bad_values1 = bad_rows1.values[is_bad_item]
        bad_values2 = bad_rows2.values[is_bad_item]
        _major_diffs = 0
        _minor_diffs = 0
        NUMBER = (numbers.Number, np.number)
        for v1, v2 in zip(bad_values1, bad_values2):
            if isinstance(v1, NUMBER) and isinstance(v2, NUMBER):
                if v1 != v2:
                    if np.isclose(v1, v2):
                        _minor_diffs += 1
                    else:
                        _major_diffs += 1
            else:
                _major_diffs += 1
        this_diff['minor'] = _minor_diffs
        this_diff['major'] = _major_diffs
        if _major_diffs:
            msg = f'Has {len(bad_rows2)} / {len(df2)} rows with major differences'
            print('=========')
            print('BAD ROWS:', msg)
            print('=========')
            print(f'{fpath1}')
            print(f'{fpath2}')
            print(bad_rows1)
            print('---')
            print(bad_rows2)
            print('---')
            a = bad_rows1.to_string()
            b = bad_rows2.to_string()
            print('Diff:')
            print(difftext(a, b, colored=True))
            this_diff['error'] = msg
    return this_diff


def compare_ac_phase_table(fpath1, fpath2):
    df1 = pd.read_csv(fpath1)
    df2 = pd.read_csv(fpath2)
    assert (df1.columns == df2.columns).all()
    assert (df1['date'] == df2['date']).all()
    this_diff = {
        'files_checked': 1,
    }

    def normalize_row(row):
        new_row = {}
        for k, v in row.items():
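            # Cells containing 'vs.' pair up site / subsite identifiers;
            # normalize them so that ordering alone does not count as a diff.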
            if isinstance(v, str) and 'vs.' in v:
                parts = v.split('vs.')
                import ast
                sets = []
                for p in parts:
                    try:
                        p = ast.literal_eval(p)
                    except Exception:
                        p = p.strip()
                    # Remove ordering of subsites
                    p = '_'.join(sorted(p.split('_')))
                    sets.append(p)
                v = sets
            new_row[k] = v
        return new_row

    records1 = df1.to_dict('records')
    records2 = df2.to_dict('records')

    major_diffs = 0
    minor_diffs = 0
    rows_checked = 0
    for row1, row2 in zip(records1, records2):
        rows_checked += 1
        if row1 != row2:
            norm_row1 = normalize_row(row1)
            norm_row2 = normalize_row(row2)
            if norm_row1 == norm_row2:
                minor_diffs += 1
            else:
                print('ERROR:')
                print('norm_row1 = {}'.format(ub.urepr(norm_row1, nl=1)))
                print('norm_row2 = {}'.format(ub.urepr(norm_row2, nl=1)))
                a = ub.urepr(norm_row1, nl=1)
                b = ub.urepr(norm_row2, nl=1)
                print('Diff:')
                print(difftext(a, b, colored=True))
                major_diffs += 1

    this_diff.update({
        'rows_checked': rows_checked,
        'minor': minor_diffs,
        'major': major_diffs,
    })
    return this_diff


def difftext(text1, text2, context_lines=0, ignore_whitespace=False,
             colored=False):
    r"""
    Uses difflib to return a difference string between two similar texts

    Args:
        text1 (str): old text
        text2 (str): new text
        context_lines (int): number of lines of unchanged context
        ignore_whitespace (bool): if True, strip trailing whitespace before diffing
        colored (bool): if True, highlight the diff

    Returns:
        str: formatted difference text message

    References:
        http://www.java2s.com/Code/Python/Utility/IntelligentdiffbetweentextfilesTimPeters.htm

    Example:
        >>> # build test data
        >>> text1 = 'one\ntwo\nthree'
        >>> text2 = 'one\ntwo\nfive'
        >>> # execute function
        >>> result = difftext(text1, text2)
        >>> # verify results
        >>> print(result)
        - three
        + five

    Example:
        >>> # build test data
        >>> text1 = 'one\ntwo\nthree\n3.1\n3.14\n3.1415\npi\n3.4\n3.5\n4'
        >>> text2 = 'one\ntwo\nfive\n3.1\n3.14\n3.1415\npi\n3.4\n4'
        >>> # execute function
        >>> context_lines = 1
        >>> result = difftext(text1, text2, context_lines, colored=True)
        >>> # verify results
        >>> print(result)
    """
    import ubelt as ub
    import difflib
    text1 = ub.ensure_unicode(text1)
    text2 = ub.ensure_unicode(text2)
    text1_lines = text1.splitlines()
    text2_lines = text2.splitlines()
    if ignore_whitespace:
        text1_lines = [t.rstrip() for t in text1_lines]
        text2_lines = [t.rstrip() for t in text2_lines]
        ndiff_kw = dict(linejunk=difflib.IS_LINE_JUNK,
                        charjunk=difflib.IS_CHARACTER_JUNK)
    else:
        ndiff_kw = {}
    all_diff_lines = list(difflib.ndiff(text1_lines, text2_lines, **ndiff_kw))

    if context_lines is None:
        diff_lines = all_diff_lines
    else:
        # boolean for every line if it is marked or not
        ismarked_list = [len(line) > 0 and line[0] in '+-?'
                         for line in all_diff_lines]
        # flag lines that are within context_lines away from a diff line
        isvalid_list = ismarked_list[:]
        for i in range(1, context_lines + 1):
            isvalid_list[:-i] = list(map(any, zip(
                isvalid_list[:-i], ismarked_list[i:])))
            isvalid_list[i:] = list(map(any, zip(
                isvalid_list[i:], ismarked_list[:-i])))

        USE_BREAK_LINE = True
        if USE_BREAK_LINE:
            # insert a visual break when there is a break in context
            diff_lines = []
            prev = False
            visual_break = '\n <... FILTERED CONTEXT ...> \n'
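            # NOTE: the visual break is currently disabled by the `if False`
            # guard in the loop below; only the flagged context lines are kept.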
            # print(isvalid_list)
            for line, valid in zip(all_diff_lines, isvalid_list):
                if valid:
                    diff_lines.append(line)
                elif prev:
                    if False:
                        diff_lines.append(visual_break)
                prev = valid
        else:
            diff_lines = list(ub.compress(all_diff_lines, isvalid_list))
    text = '\n'.join(diff_lines)
    if colored:
        text = ub.highlight_code(text, lexer_name='diff')
    return text


__cli__ = CompareIarpaMetricsVersionsCLI
main = __cli__.main

if __name__ == '__main__':
    """
    CommandLine:
        python ~/code/watch/dev/poc/compare_iarpa_metrics_versions.py
        python -m compare_iarpa_metrics_versions
    """
    main()