def best_separating_threshold(x, y):
    """
    Find the score threshold on ``x`` that maximizes F1 against the boolean
    labels ``y``. Returns the best F1 score and the threshold achieving it.
    """
    from kwcoco.metrics.confusion_vectors import BinaryConfusionVectors
    import kwarray
    data = kwarray.DataFrameArray({
        'is_true': y.values,
        'pred_score': x.values,
    })
    bincfsn_vec = BinaryConfusionVectors(data)
    dct = bincfsn_vec.measures().asdict()
    f1_score, f1_thresh = dct['_max_f1']
    return f1_score, f1_thresh


def find_relevant_stuff():
    # import watch
    from watch.mlops.aggregate import AggregateLoader
    import ubelt as ub
    target = '/home/joncrall/remote/namek/data/dvc-repos/smart_expt_dvc/_ac_static_small_baseline_namek_v1'
    # expt_dvc_dpath = watch.find_dvc_dpath(tags='phase2_expt', hardware='auto')
    load_kwargs = {
        'target': [target],
        'pipeline': 'sc',
        'io_workers': 'avail',
    }
    with ub.Timer('load'):
        loader = AggregateLoader(**load_kwargs)
        eval_type_to_agg = loader.coerce_aggregators()
    _agg = eval_type_to_agg['sc_poly_eval']
    agg = _agg.filterto(models=['Drop7-Cropped2GSD_SC_bgrn_gnt_split6_V84_epoch17_step1548'])
    agg.build_macro_tables()
    best = agg.report_best()
    top_param_hashid = list(best.top_param_lut.keys())[-1]
    _agg = _agg.filterto(param_hashids=[top_param_hashid])
    poly_fpaths = _agg.index['fpath'].to_list()

    # Manually run confusion analysis
    confusion_paths = []
    for p in poly_fpaths:
        p = ub.Path(p)
        print(p.parent)
        pred_dpath = p.parent / 'confusion_analysis/confusion_groups/pred/'
        if pred_dpath.exists():
            confusion_paths.append(pred_dpath)

    sites = []
    from watch.geoannots import geomodels
    for dpath in confusion_paths:
        sites.extend(list(geomodels.SiteModel.coerce_multiple(dpath, workers=8)))

    site_score_analysis(sites)


def relevant_stuff_case1():
    dpath = '/home/joncrall/remote/toothbrush/data/dvc-repos/smart_expt_dvc/_ac_static_small_baseline_v1/eval/flat/sc_poly_eval/sc_poly_eval_id_372c0c95/confusion_analysis/confusion_groups/pred/'
    from watch.geoannots import geomodels
    sites = list(geomodels.SiteModel.coerce_multiple(dpath, workers=8))
    site_score_analysis(sites)
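
# A minimal, self-contained sketch (not part of the original experiment):
# sanity-checks ``best_separating_threshold`` on synthetic scores where the
# positive class is shifted higher, so the recovered threshold should land
# between the two score modes. The function name is hypothetical; it assumes
# the same kwcoco/kwarray stack imported above is available.
def demo_best_separating_threshold():
    import numpy as np
    import pandas as pd
    rng = np.random.RandomState(0)
    y = pd.Series(rng.rand(300) > 0.5)
    # positives score near 0.7, negatives near 0.3, with gaussian overlap
    x = pd.Series(np.where(y, 0.7, 0.3) + rng.randn(300) * 0.1)
    f1_score, f1_thresh = best_separating_threshold(x, y)
    print(f'best f1={f1_score:.3f} at threshold={f1_thresh:.3f}')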
def site_score_analysis(sites):
    """
    Given confusion-annotated site models, build per-observation and per-site
    score tables, then search for the scalar feature (and threshold) that
    best separates true-positive from false-positive sites.
    """
    import pandas as pd
    import ubelt as ub
    from watch.cli.run_tracker import smooth_observation_scores
    # import geowatch_tpl
    # from watch.utils import util_gis
    # shapestats = geowatch_tpl.import_submodule('shapestats')

    # Collapse fine-grained confusion categories into simple labels
    simple_types = {
        'unhandled_cfsn_system_rejected': 'negative',
        'sm_pos_match': 'positive',
        'sm_completely_wrong': 'negative',
        'sm_ignore': 'ignore',
    }
    import numpy as np
    grid = list(ub.named_product({
        # 'smooth_mode': ['conv3', 'ewma'],
        'smooth_mode': ['ewma'],
        # 'smoothing': [0.6, 0.63, 0.65, 0.66, 0.7],
        'smoothing': [0.66],
    }))
    meta_rows = []
    for grid_item in grid:
        obs_rows = []
        site_rows = []
        for site in ub.ProgIter(sites):
            confusion = site.header['properties']['cache']['confusion']
            cfsn_type = confusion['type']
            ideal_label = simple_types[cfsn_type]
            if ideal_label == 'ignore':
                continue
            positive = (ideal_label == 'positive')
            cfsn_status = confusion.get('te_association_status', 'tn')
            if cfsn_status == 'tn':
                assert cfsn_type == 'unhandled_cfsn_system_rejected'
            if cfsn_status == '0':
                if cfsn_type == 'sm_pos_match':
                    cfsn_status = 'tp'
                else:
                    # Sanity check: a '0' association status should only
                    # occur for positive matches.
                    raise Exception(f'unexpected confusion type: {cfsn_type}')
            observations = list(site.observations())
            # current_phase = observations['current_phase']
            obs_multi_scores = [obs['properties']['cache']['raw_multi_scores']
                                for obs in observations]
            assert all(len(ms) == 1 for ms in obs_multi_scores)
            obs_scores = [ms[0] for ms in obs_multi_scores]
            raw_scores = pd.DataFrame(obs_scores)
            smooth_observation_scores(
                observations, smoothing=grid_item['smoothing'],
                smooth_mode=grid_item['smooth_mode'])
            smooth_scores = pd.DataFrame([
                obs['properties']['cache']['smooth_scores']
                for obs in observations])

            from kwutil.util_time import coerce_datetime
            start_date = coerce_datetime(observations[0]['properties']['observation_date'])
            for idx, obs in enumerate(observations):
                obs_time = coerce_datetime(obs['properties']['observation_date'])
                s1 = obs['properties']['cache']['raw_multi_scores'][0]
                s2 = obs['properties']['cache']['smooth_scores']
                s1 = {'raw_' + k: v for k, v in s1.items()}
                s2 = {'smooth_' + k: v for k, v in s2.items()}
                row = s1 | s2
                row['site_id'] = site.site_id
                row['index'] = idx
                duration = obs_time - start_date
                row.update({
                    'positive': positive,
                    'region_id': site.region_id,
                    'cfsn_status': cfsn_status,
                    'duration': duration,
                })
                obs_rows.append(row)

            curr_labels = [o['properties']['current_phase'] for o in observations]
            active_labels = {'Site Preparation', 'Active Construction'}
            flags = [lbl in active_labels for lbl in curr_labels]

            # site_crs84_gdf = site.pandas_site()
            # site_utm_gdf = util_gis.project_gdf_to_local_utm(site_crs84_gdf, mode=1)
            # df = site_utm_gdf
            # import kwimage
            # obox_whs = [kwimage.MultiPolygon.from_shapely(s).oriented_bounding_box().extent
            #             for s in df.geometry]
            # df['obox_major'] = [max(e) for e in obox_whs]
            # df['obox_minor'] = [min(e) for e in obox_whs]
            # df['major_obox_ratio'] = df['obox_major'] / df['obox_minor']
            # df['area'] = df.geometry.area
            # df['isoperimetric_quotient'] = df.geometry.apply(shapestats.ipq)
            # df['boundary_amplitude'] = df.geometry.apply(shapestats.compactness.boundary_amplitude)
            # df['eig_seitzinger'] = df.geometry.apply(shapestats.compactness.eig_seitzinger)

            # subscores = raw_scores[flags]
            # if subscores.isna().any():
            #     raise Exception
            site_rows.append({
                # 'rt_area': np.sqrt(df.iloc[0]['area']),
                # 'isoperimetric_quotient': df['isoperimetric_quotient'].iloc[0],
                # 'boundary_amplitude': df['boundary_amplitude'].iloc[0],
                # 'eig_seitzinger': df['eig_seitzinger'].iloc[0],
                # 'major_obox_ratio': df['major_obox_ratio'].iloc[0],
                'frac_active': np.mean(flags),
                'smooth_salient_mean': smooth_scores['ac_salient'].mean(),
                'smooth_salient_max': smooth_scores['ac_salient'].max(),
                'raw_salient_mean': raw_scores['ac_salient'].mean(),
                'raw_salient_max': raw_scores['ac_salient'].max(),
                'raw_salient_min': raw_scores['ac_salient'].min(),
                'active_max': raw_scores['Active Construction'].max(),
                'active_mean': raw_scores['Active Construction'].mean(),
                'siteprep_max': raw_scores['Site Preparation'].max(),
                'siteprep_mean': raw_scores['Site Preparation'].mean(),
                # 'cfsn_type': cfsn_type,
                'positive': positive,
                'region_id': site.region_id,
                'cfsn_status': cfsn_status,
            })

        import kwimage
        palette = {
            'tp': kwimage.Color.coerce('kitware_green').as01(),
            'fp': kwimage.Color.coerce('kitware_red').as01(),
            'tn': kwimage.Color.coerce('kitware_blue').as01(),
        }

        if 0:
            # Plot the smoothed salient score over time for each site
            import kwplot
            sns = kwplot.autosns()
            obs_df = pd.DataFrame(obs_rows)
            fig = kwplot.figure(fnum=1, doclf=True)
            ax = fig.gca()
            ax.cla()
            for _, grp in ub.ProgIter(list(obs_df.groupby('site_id'))):
                sns.lineplot(
                    data=grp, x='duration', y='smooth_ac_salient',
                    hue='cfsn_status', legend=False, ax=ax, palette=palette)

        scalars = pd.DataFrame(site_rows)
        if 0:
            scalars = scalars[scalars['region_id'] == 'KR_R002']
        scalars = scalars.fillna(0)

        y = scalars['positive']
        X = scalars.drop(['positive', 'region_id', 'cfsn_status'], axis=1)

        # Find the most informative feature
        rows = []
        for k in X.columns:
            x = X[k]
            best_f1, best_thresh = best_separating_threshold(x, y)
            rows.append({
                'key': k,
                'best_f1': best_f1,
                'best_thresh': best_thresh,
            })
        separators = pd.DataFrame(rows).sort_values('best_f1')
        separators = separators.set_index('key')
        print(separators)

        meta_row = separators.loc['smooth_salient_max'].to_dict()
        meta_row.update(grid_item)
        meta_rows.append(meta_row)

    print(pd.DataFrame(meta_rows).sort_values('best_f1'))

    if 0:
        # Do it per region
        rows = []
        for group_key, group in scalars.groupby('region_id'):
            y = group['positive']
            X = group.drop(['positive', 'region_id', 'cfsn_status'], axis=1)
            for k in X.columns:
                x = X[k]
                best_f1, best_thresh = best_separating_threshold(x, y)
                rows.append({
                    'key': k,
                    'best_f1': best_f1,
                    'best_thresh': best_thresh,
                    'group_key': group_key,
                })
        separators = pd.DataFrame(rows).sort_values('best_f1')
        separators = separators.set_index('key')
        print(separators)

        piv = separators.reset_index().pivot(
            index=['group_key'], columns=['key'], values=['best_thresh'])
        print(piv.T)
        print(piv.T.loc[piv.std(axis=0).T.sort_values().index])

    import kwplot
    sns = kwplot.autosns()
    # sep1 = separators.iloc[-1]
    # sep2 = separators.iloc[-2]
    # key1 = sep1.name
    # key2 = sep2.name
    key1 = 'smooth_salient_max'
    key2 = 'frac_active'
    sep1 = separators.loc[key1]
    # sep2 = separators.loc[key2]
    fig = kwplot.figure(fnum=1, doclf=True)
    ax = fig.gca()
    ax.cla()
    sns.scatterplot(
        data=scalars,
        # x='siteprep_max',
        # y='siteprep_mean',
        x=key1,
        y=key2,
        hue='cfsn_status',
        ax=ax,
        palette=palette,
        style='region_id',
    )
    # Draw the best separating threshold for key1 as a vertical line
    thresh = sep1['best_thresh']
    ymin, ymax = ax.get_ylim()
    ax.plot([thresh, thresh], [ymin, ymax], '-')
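
# Illustrative sketch only: the real smoothing lives in
# ``watch.cli.run_tracker.smooth_observation_scores`` and mutates observation
# dictionaries in-place. This hypothetical toy shows what an 'ewma' mode with
# smoothing=0.66 (the grid value above) plausibly does to one score sequence;
# it is an assumption, not the library implementation.
def _demo_ewma_smoothing():
    import numpy as np
    scores = np.array([0.1, 0.2, 0.9, 0.8, 0.85, 0.3])
    alpha = 0.66  # analogous to the 'smoothing' grid parameter
    smoothed = []
    prev = scores[0]
    for s in scores:
        # standard exponentially weighted moving average update
        prev = alpha * prev + (1 - alpha) * s
        smoothed.append(prev)
    print(np.round(smoothed, 3))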
def foo(scalars):
    """
    Throwaway experiment: can a random forest separate positive from negative
    sites using the per-site scalar features?
    """
    import sklearn.ensemble
    y = scalars['positive']
    # Drop the target and non-numeric bookkeeping columns before fitting
    X = scalars.drop(['positive', 'region_id', 'cfsn_status'], axis=1)
    rf = sklearn.ensemble.RandomForestClassifier().fit(X, y)
    y_hat = rf.predict(X)
    # Mask of correctly predicted positives (on the training data itself)
    true_pos = y.values & y_hat
    print('num true positives:', true_pos.sum())

    # from sklearn.ensemble import RandomForestRegressor
    # from sklearn.feature_selection import SelectFromModel
    from sklearn.model_selection import GridSearchCV
    # from sklearn.model_selection import LeaveOneOut
    # from sklearn.model_selection import cross_val_score
    # from sklearn.pipeline import make_pipeline
    # feature_selector = SelectFromModel(
    #     RandomForestRegressor(n_jobs=-1), threshold="mean"
    # )
    # pipe = make_pipeline(
    #     feature_selector, RandomForestRegressor(n_jobs=-1)
    # )
    # param_grid = {
    #     # define the grid of the random-forest for the feature selection
    #     "selectfrommodel__estimator__n_estimators": [10, 20],
    #     "selectfrommodel__estimator__max_depth": [3, 5],
    #     # define the grid of the random-forest for the prediction
    #     "randomforestregressor__n_estimators": [10, 20],
    #     "randomforestregressor__max_depth": [5, 8],
    # }
    # grid_search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1, cv=3)
    # # You can use the LOO in this way. Be aware that this is not good
    # # practice; it leads to large variance when evaluating your model.
    # # scores = cross_val_score(pipe, X, y, cv=LeaveOneOut(), error_score='raise')
    # scores = cross_val_score(grid_search, X, y, cv=2, error_score='raise')
    # scores.mean()

    # https://stats.stackexchange.com/questions/568695/using-k-fold-cross-validation-of-random-forest-how-many-samples-are-used-to-cre
    param_grid = {
        'max_depth': [2, 3, 4, 5],
        # 'criterion': ['absolute_error', 'squared_error'],
        'n_estimators': [5, 10, 20, 30, 40, 50, 60, 70, 80, 100, 150, 200],
    }
    # Create a base model
    rfCV = sklearn.ensemble.RandomForestClassifier(random_state=1099)
    # Instantiate the grid search model
    regCV = GridSearchCV(
        estimator=rfCV, cv=5, param_grid=param_grid,
        n_jobs=-1, verbose=2, return_train_score=True)
    # Fit the grid search to the data
    out = regCV.fit(X, y)
    print('best CV score:', out.best_score_)
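
# Hypothetical follow-up to ``foo`` (not in the original script): ``foo``
# scores the forest on the same rows it was fit on, which overstates
# accuracy. Nesting the grid search inside an outer cross-validation gives a
# less biased estimate. Only standard sklearn APIs are used; the function
# name and grid values are illustrative.
def _demo_nested_cv(scalars):
    import sklearn.ensemble
    from sklearn.model_selection import GridSearchCV, cross_val_score
    y = scalars['positive']
    X = scalars.drop(['positive', 'region_id', 'cfsn_status'], axis=1)
    grid_search = GridSearchCV(
        sklearn.ensemble.RandomForestClassifier(random_state=1099),
        param_grid={'max_depth': [2, 3, 4], 'n_estimators': [10, 50, 100]},
        cv=3, n_jobs=-1)
    # Each outer fold fits the grid search on train data and scores held-out data
    scores = cross_val_score(grid_search, X, y, cv=5, error_score='raise')
    print(f'nested-cv accuracy: {scores.mean():.3f} +/- {scores.std():.3f}')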