"""
Limited, but similar functionality to ~/code/netharn/netharn/cli/manage_runs.py
to remove batch visualizations that take up a lot of space.
"""
import pint
import ubelt as ub

ureg = pint.UnitRegistry()


class ImageDirectory(ub.NiceRepr):
    def __init__(self, dpath):
        self.dpath = dpath
        self.fpaths = None
        self.stats = None
        self.sizes = None
        self.total_mb = None
        self.total_bytes = None

    def __nice__(self):
        return f'{self.total_mb} - len(self.fpaths)'

    def build(self):
        fpaths = sorted(self.dpath.glob('*.jpg'))
        self.fpaths = fpaths
        self.stats = []
        sizes = []
        for fpath in fpaths:
            stat = fpath.stat()
            self.stats.append(stat)
            sizes.append(stat.st_size * ureg.bytes)

        self.sizes = sizes
        total_bytes = sum(sizes)
        total_mb = total_bytes.to('megabytes')
        self.total_mb = total_mb
        self.total_bytes = total_bytes
        return self

    def select_removers(self, max_size, keep_atleast=0):
        """
        Ignore:
            dpath = ub.Path('/home/local/KHQ/jon.crall/remote/horologic/data/dvc-repos/smart_expt_dvc/training/horologic/jon.crall/Aligned-Drop4-2022-08-08-TA1-S2-L8-ACC/runs/Drop4_BAS_BGR_15GSD_multihead_perceiver_V008/lightning_logs/version_0/monitor/validate/batch')
            self = imgdir = ImageDirectory(dpath).build()
            max_size = 64 * ureg.megabytes
            remove_fpaths, info = self.select_removers(max_size, keep_atleast=4)
            for p in remove_fpaths:
                p.delete()
        """
        total = 0
        all_indexes = set(range(len(self.fpaths)))
        keep_idxs = []

        # for idx in generate_indexes(len(self.fpaths)):
        for idx in farthest_from_previous(0, len(self.fpaths)):
            total += self.sizes[idx]
            if total > max_size and len(keep_idxs) >= keep_atleast:
                break
            keep_idxs.append(idx)

        remove_idxs = sorted(all_indexes - set(keep_idxs))
        remove_fpaths = list(ub.take(self.fpaths, remove_idxs))
        remove_sizes = list(ub.take(self.sizes, remove_idxs))
        keep_sizes = list(ub.take(self.sizes, keep_idxs))

        info = {}

        zero = 0 * ureg.bytes
        info['remove_size'] = sum(remove_sizes, start=zero).to('megabytes')
        info['keep_size'] = sum(keep_sizes, start=zero).to('megabytes')
        info['keep_num'] = len(keep_idxs)
        info['remove_num'] = len(remove_idxs)
        return remove_fpaths, info


def main():
    # dpath = '/data/projects/smart/smart_watch_dvc/training/horologic/jon.crall/Drop1-20201117/runs/SC_smt_it_stm_p8_newanns_weighted_mat6raw6_v41/lightning_logs/version_0/monitor/train/batch'
    dpath = '/data/projects/smart/smart_watch_dvc/training/horologic/jon.crall/Drop1-20201117/runs/SC_smt_it_stm_p8_newanns_weighted_mat6raw6_v41/lightning_logs/version_0/monitor/train/batch'
    # training_dpath = ub.Path('/data/projects/smart/smart_watch_dvc/training')
    training_dpath = ub.Path('/home/local/KHQ/jon.crall/remote/horologic/data/dvc-repos/smart_expt_dvc/training')
    training_dpath = ub.Path('/home/local/KHQ/jon.crall/remote/horologic/data/dvc-repos/smart_watch_dvc/training')

    # run_dpaths = list(training_dpath.glob('*/*/*/runs'))
    # lightning_root_dpaths = list(training_dpath.glob('*/*/*/runs/*/lightning_logs'))
    # lightning_ver_dpaths = list(training_dpath.glob('*/*/*/runs/*/lightning_logs/version_*'))

    sanity_dpaths = list(training_dpath.glob('*/*/*/runs/*/lightning_logs/version_*/monitor/sanity_check'))
    for p in sanity_dpaths:
        p.delete()
    batch_dpaths = list(training_dpath.glob('*/*/*/runs/*/lightning_logs/version_*/monitor/*/batch'))

    # batch_dpaths = list(training_dpath.glob('*/*/*/runs/*/lightning_logs/version_*/monitor/train/batch'))
    # batch_dpaths = list(training_dpath.glob('*/*/*/runs/*/lightning_logs/version_*/monitor/validate/batch'))
    # batch_dpaths = list(training_dpath.glob('*/*/*/runs/*/lightning_logs/version_*/monitor/validate/sanity_check'))

    image_dirs = []
    for dpath in ub.ProgIter(batch_dpaths, verbose=3):
        imgdir = ImageDirectory(dpath).build()
        print(f'imgdir={imgdir}')
        image_dirs.append(imgdir)

    total_mb = 0 * ureg.bytes
    for imgdir in image_dirs:
        total_mb += imgdir.total_mb
    total_gb = total_mb.to('gigabytes')
    print(f'total_gb={total_gb}')

    max_size = 64 * ureg.megabytes

    remove_infos = []
    all_remove_fpaths = []

    for imgdir in image_dirs:
        self = imgdir
        remove_fpaths, info = self.select_removers(max_size, keep_atleast=4)
        all_remove_fpaths.extend(remove_fpaths)
        remove_infos.append(info)

    total_keep = sum([g['keep_size'] for g in remove_infos], start=0 * ureg.bytes).to('gigabytes')
    total_remove = sum([g['remove_size'] for g in remove_infos], start=0 * ureg.bytes).to('gigabytes')
    print(f'total_keep={total_keep}')
    print(f'total_remove={total_remove}')

    for fpath in ub.ProgIter(all_remove_fpaths):
        fpath.delete()

    # Checkpoint cleanup
    to_remove = []
    checkpoint_fpaths = list(training_dpath.glob('*/*/*/runs/*/lightning_logs/version_*/checkpoints/*.ckpt'))
    groups = ub.group_items(checkpoint_fpaths, key=lambda x: x.parent)
    for k, vs in groups.items():
        vers = [p for p in vs if '-v' in p.stem]
        if len(vers):
            subgroups = ub.group_items(vers, lambda p: p.stem.split('-v')[0])
            for _, subs in subgroups.items():
                if len(subs) > 1:
                    for s in sorted(subs)[1:]:
                        to_remove.append(s)

    remove_bytes = sum([p.stat().st_size for p in to_remove])
    import xdev as xd
    print('removing: ' + xd.byte_str(remove_bytes))
    for p in to_remove:
        p.delete()


def generate_indexes(total):
    """
    E.g. if total is 10, should generate something like:

    0, 9, 4, 2, 6, 1, 5, 3, 7, 8

    To visualize the pattern

      |0|1|2|3|4|5|6|7|8|9|
    0 |x| | | | | | | | | |
    1 |.| | | | | | | | |x|
    2 |.| | | |x| | | | |.|
    3 |.| |x| |.| | | | |.|
    4 |.| |.| |.| |x| | |.|
    6 |.|x|.| |.| |.| | |.|
    8 |.|.|.| |.|x|.| | |.|
    7 |.|.|.|x|.|.|.| | |.|
    5 |.|.|.|.|.|.|.|x| |.|
    9 |.|.|.|.|.|.|.|.|x|.|

    Example:
        >>> total = 10
        >>> gen = generate_indexes(total)
        >>> result = list(gen)
        >>> assert set(result) == set(range(total))
        >>> print(result)
        [0, 9, 4, 2, 6, 1, 5, 3, 7, 8]
    """
    start = 0
    stop = total
    yield stop - 1
    yield start
    yield from generate_midpoints(start, stop - 1)


def generate_midpoints(start, stop):
    import itertools as it
    mid = (start + stop) // 2

    if start == stop or start == mid:
        return

    yield mid

    left_start = start
    left_stop = mid
    left_gen = generate_midpoints(left_start, left_stop)

    right_start = mid
    right_stop = stop
    right_gen = generate_midpoints(right_start, right_stop)

    for a, b in it.zip_longest(left_gen, right_gen):
        if a is not None:
            yield a
        if b is not None:
            yield b


def alt(total):
    import math
    digits = math.ceil(math.log2(total))
    ordered = list(range(total))
    fmtstr = '{:0' + str(digits) + 'b}'
    binary = [fmtstr.format(x) for x in ordered]
    munged = sorted([b[::-1] for b in binary])
    new_order = [int(b[::-1], 2) for b in munged]
    return new_order


def generate_all(start, stop):
    yield stop - 1
    yield from generate_from_starts(start, stop - 1)


def generate_from_starts(start, stop):
    import itertools as it
    mid = (start + stop) // 2
    if start == mid:
        yield start
    else:
        for a, b in it.zip_longest(
                generate_from_starts(start, mid),
                generate_from_starts(mid, stop)
        ):
            if a is not None:
                yield a
            if b is not None:
                yield b


def farthest_from_previous(start: int, stop: int):
    """
    Given a ordered list of items, incrementally yield indexes such that each
    new index maximizes the distance to all other previously chosen indexes.

    Args:
        start (int): The inclusive starting index (typically 0)
        stop (int): The exclusive maximum index (typically ``len(items)``)

    Yields:
        int: the next chosen index in the series

    References:
        .. [CSSE_167943] https://cs.stackexchange.com/questions/167943/is-this-knapsack-variant-named-studied-online-algorithm-for-farthest-from-pr

    Example:
        >>> total = 10
        >>> start, stop = 0, 10
        >>> gen = farthest_from_previous(start, stop)
        >>> result = list(gen)
        >>> assert set(result) == set(range(start, stop))
        >>> print(result)
        [9, 0, 5, 2, 7, 1, 6, 3, 8, 4]

        list(farthest_from_previous(0, 4))
    """
    import itertools as it

    def from_starts(start: int, stop: int):
        if start < stop:
            low_mid: int = (start + stop) // 2
            high_mid: int = (start + stop + 1) // 2

            left_gen = from_starts(start, low_mid)
            right_gen = from_starts(high_mid, stop)

            pairgen = it.zip_longest(left_gen, right_gen)
            flatgen = it.chain.from_iterable(pairgen)
            filtgen = filter(lambda x: x is not None, flatgen)
            yield from filtgen
            if low_mid < high_mid:
                yield low_mid
    if start < stop:
        yield stop - 1
        yield from from_starts(start, stop - 1)


def __farthest_from_previous_writeup__():
    """
    Use case:

        I have a directory of ordered images images that were generated to
        visualize neural network training iterations. I create one of these
        directories every time I train a network.

        These visualizations can start to take up too much disk space, and
        removing some percent of them would free up a lot of space, but still
        leave some of the visualizations in case I wanted to go back and
        inspect an old run. So the question is: which of these images do I
        keep? By incrementally generating "furthest from previous" indexes and
        checking if the total size exceeds some threshold, I can stop, keep all
        files corresponding to generated indexes, and remove the rest.

        To formally talk about this problem we will refer to files as "items",
        and the file size will be the "weight" of each "item".

        In general, if we can take N items, we should pick items equally spaced
        (or as close to it as possible). But imagine we can choose items within
        a total weight constraint.
        I.e. sum(item['weight'] for item in items) < W

        We can formulate this as an optimization problem:

            # this is not complete...

            Let N = the number of candidate items
            Let W = the maximum weight allowed
            Let x[i] = be an indicator variable if the i-th item is taken
            Let w[i] = be the weight of the i-th item

            objective:

                Consider each pair of nodes with indexes i < j where
                both x[i] > 0 and x[j] > 0 and not
                any(x[k] for k in range(i, j + 1)).
                These are chosen neighbors. Call them neighbs

                Take:
                    max_dist = max((j - i) for i, j in neighbs)
                    min_dist = min((j - i) for i, j in neighbs)
                    total_chosen = sum(x[i] for i in range(N)
                    diff_delta = max_dist - min_dist

                Minimize:
                    # We want to choose as many points as possible such that
                    # the difference between the furthest pair of neighbors and
                    # closest pair of neighbors is minimized
                    # q: does total_chosen need a multiplier
                    #    such that its always the secondary objective?
                    diff_delta - total_chosen

                # todo: nicer formulation of objective.
                # basic idea: distribute chosen points uniformly

            constraint:

                # Total weight is within allowance
                sum(w[i] * x[i] for in range(N)) <= W

        #### The idea of these paragraphs is to motivate the restricted
        #### version of the problem where this heuristic is optimal.

        A specific variant of the above problem is the case where you only have
        one shot to decide if you want to remove an item. We can keep as many
        items as we want, but we can only query the weight of the item once,
        and after you do you have to decide if you keep or delete everything
        else.

        Such a constraint minimizes the number of filesystem operations you
        have to perform, which greatly speeds up the procedure.

        If we keep any of the images, we probably want to see what the network
        was doing at different points in the training process. If we can only
        keep one image, it should be the last one: see what the end state was
        like. If we can keep two, then we want to take the first one as well,
        so we can see what the network looked like at the start of training. If
        we can take three, then perhaps we should take the previous two and
        then one as far away from either of them as possible, so take the
        middle one if the number of items is odd, otherwise pick one of the two
        equidistant items.

        This motivates a heuristic to obtain a feasible solution to the above
        objective. It will not optimize the original objective in all cases,
        but in many cases it will.
    """