# https://stackoverflow.com/questions/13403069/how-to-find-out-which-files-take-up-the-most-space-in-git-repo
#
# Rank every object reachable from any ref by size, largest first, and keep
# only the 20 biggest. Output columns: <object id> <type> <size in bytes>.
git rev-list --objects --all \
  | cut -d ' ' -f 1 \
  | git cat-file --batch-check \
  | sort -k 3nr \
  | head -n 20

# Note that 2986e51827c9d86b651fe1cb6de8ed9c8842b614 is a very large object


# Show every commit, on any ref, whose diff adds or removes the given object
git log --find-object=fcba2687fb86ab911222bb73b6ff0b69fbf24527 --all

git log --find-object=2986e51827c9d86b651fe1cb6de8ed9c8842b614 --all


# This object seems to correspond to the file
# watch/tasks/super_res/model_zoo/swinir/003_realSR_BSRGAN_DFOWMFC_s64w8_SwinIR-L_x4_GAN.pth


# Purge one file from every commit on the current branch that is not yet
# reachable from main. Only the main..HEAD range is rewritten, so this is
# sufficient only while the file has not been merged into main; once it has,
# the history of the entire repo needs to be rewritten instead.

env FILTER_BRANCH_SQUELCH_WARNING=1 \
  git filter-branch --force --prune-empty \
  --index-filter 'git rm -r -f --cached --ignore-unmatch -- watch/tasks/super_res/model_zoo/swinir/003_realSR_BSRGAN_DFOWMFC_s64w8_SwinIR-L_x4_GAN.pth' \
  main..HEAD


# POC bloat report with better python interaction.
#
# Prints (1) a mapping of paths touched by the commits that add or remove any
# of the largest objects, and (2) a table of the 20 largest objects in the
# repo. Requires the third-party ubelt, xdev and pandas packages.
#
# The program is passed via -c inside single quotes, so it must never contain
# a literal single quote; awkward characters are built with chr() instead.
python -c 'if 1:

    import ubelt as ub
    import xdev

    # Characters that cannot be written literally inside this single-quoted
    # shell string (or inside an f-string) are produced with chr().
    lbrace = chr(123)
    rbrace = chr(125)
    squote = chr(39)
    newline = chr(10)

    # Number of objects to report, and the size (in bytes) above which an
    # object is traced back to the commits that touch it.
    # NOTE(review): the byte threshold is a magic number kept from the POC.
    top_n = 20
    min_bytes = 874274

    # List every reachable object as "<id> <type> <size>", largest first.
    awk_cmd = f"awk {squote}{lbrace}print $1{rbrace}{squote}"
    info = ub.cmd(f"git rev-list --all --objects | {awk_cmd} | git cat-file --batch-check | sort -k3nr", shell=True)

    rows = []
    for line in info.stdout.split(newline):
        parts = line.split()
        # Skip blank lines and entries without a numeric size, such as the
        # "<id> missing" lines that cat-file prints for unavailable objects.
        if len(parts) != 3 or not parts[2].isdigit():
            continue
        obj_id, obj_type, num_bytes = parts[0], parts[1], int(parts[2])
        rows.append({
            "obj_id": obj_id,
            "type": obj_type,
            "num_bytes": num_bytes,
            "size": xdev.byte_str(num_bytes),
        })
        if len(rows) >= top_n:
            break

    # git prints paths relative to the top of the work tree, so resolve them
    # against it rather than the current directory when checking the disk.
    repo_root = ub.Path(ub.cmd("git rev-parse --show-toplevel").stdout.strip())

    big_paths = ub.ddict(dict)
    for row in rows:
        if row["num_bytes"] <= min_bytes:
            continue
        obj_id = row["obj_id"]
        # --format=%H yields one bare commit hash per line, independent of any
        # user-configured log format or decoration.
        obj_info = ub.cmd(f"git log --all --find-object={obj_id} --format=%H")
        for commit_id in obj_info.stdout.split(newline):
            commit_id = commit_id.strip()
            if not commit_id:
                continue
            # --root makes the initial commit show up as a creation event;
            # without it diff-tree prints nothing for a parentless commit.
            # NOTE(review): this lists every path changed by the commit, not
            # only the path that holds the big object.
            commit_info = ub.cmd(f"git diff-tree --root --no-commit-id --name-only -r {commit_id}")
            for p in commit_info.stdout.split(newline):
                if not p.strip():
                    continue
                p = ub.Path(p)
                prow = big_paths[p]
                prow["path"] = p
                prow.setdefault("commit_ids", [])
                prow.setdefault("obj_ids", [])
                prow["obj_ids"].append(obj_id)
                prow["commit_ids"].append(commit_id)
                # Size of the file currently checked out (if any), which is
                # not necessarily the size of the historical object.
                on_disk = repo_root / p
                if on_disk.exists():
                    prow["num_bytes"] = on_disk.stat().st_size
                    prow["size"] = xdev.byte_str(prow["num_bytes"])

    # Paths with no known on-disk size sort first (key -1).
    big_paths = ub.udict(big_paths)
    big_paths = big_paths.sorted_values(key=lambda x: x.get("num_bytes", -1))
    print(ub.urepr(big_paths))

    # Imported late so the path report above is still printed when pandas is
    # not installed.
    import pandas as pd
    df = pd.DataFrame(rows)
    print(df)

'


# Show every commit, on any ref, whose diff adds or removes this object
git log --find-object=5f365d826723ab12df839d89aec6f0170242aa72 --all

# Print the path of every file changed by a commit (recursing into
# subdirectories), one per line and without the leading commit id line
git diff-tree -r --name-only --no-commit-id 25646375cc337ba4089878fe21d7627da75f341b