Skip to main content

add a script to find big commits

ID
9bee71f
date
2023-05-28 04:03:02+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
5eb4752
message
add a script to find big commits
changed files
2 files, 83 additions

Changed files

git/README.md (3154) → git/README.md (3393)

diff --git a/git/README.md b/git/README.md
index b7c11fa..8f2f38c 100644
--- a/git/README.md
+++ b/git/README.md
@@ -28,6 +28,15 @@ These scripts are all shortcuts for using [Git], mostly designed to let me do my
   </dd>
 
   <dt>
+    <a href="https://github.com/alexwlchan/scripts/blob/main/git/find_big_commits">
+      <code>find_big_commits</code>
+    </a>
+  </dt>
+  <dd>
+    print the size of the biggest files (blobs) in the Git history, and the total size of the <code>.git</code> folder.
+  </dd>
+
+  <dt>
     <a href="https://github.com/alexwlchan/scripts/blob/main/git/gb">
       <code>gb [name]</code>
     </a>

git/find_big_commits (0) → git/find_big_commits (2072)

diff --git a/git/find_big_commits b/git/find_big_commits
new file mode 100755
index 0000000..17db086
--- /dev/null
+++ b/git/find_big_commits
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+"""
+Print the size of every file (blob) of at least 1 KiB in the Git history,
+smallest first, followed by the total size of the .git folder.
+
+This is based on a Stack Overflow answer by raphinesse [1], with a bunch of
+extra formatting and the total of the .git folder printed also.
+
+[1]: https://stackoverflow.com/a/42544963/1558022
+
+"""
+
+import os
+import subprocess
+
+import humanize  # humanize==4.4.0
+import termcolor  # termcolor==2.1.1
+
+
+def get_blobs():
+    """
+    Generate an entry for every blob reachable from any ref in the repo.
+
+    Each entry is a dict with three keys:
+
+    *   ``commit_id`` -- the object ID (hash) of the blob.  Despite the name,
+        this identifies the blob itself rather than a commit.
+    *   ``size`` -- the size of the blob in bytes, as reported by
+        ``git cat-file``.
+    *   ``filename`` -- the path which ``git rev-list --objects`` printed
+        alongside the blob.
+
+    Raises ``subprocess.CalledProcessError`` if the shell pipeline exits
+    with a non-zero status.
+    """
+    # `git rev-list --objects --all` prints "<object ID> <path>" for every
+    # object reachable from any ref.  `git cat-file --batch-check` looks up
+    # each ID, and %(rest) echoes back whatever followed the ID on the input
+    # line -- here, the path.
+    output = subprocess.check_output(
+        "git rev-list --objects --all | "
+        "git cat-file --batch-check='%(objecttype)\t%(objectname)\t%(objectsize)\t%(rest)'",
+        shell=True,
+    )
+
+    for line in output.decode("utf8").splitlines():
+        # The path is the last field and may itself contain tabs, so only
+        # split on the first three separators.
+        object_type, object_name, object_size, rest = line.split("\t", 3)
+
+        if object_type == "blob":
+            yield {"commit_id": object_name, "size": int(object_size), "filename": rest}
+
+
+def get_file_paths_under(root=".", *, suffix=""):
+    """
+    Generates the paths to every file under ``root``.
+
+    If ``suffix`` is given, only files whose name ends with that suffix are
+    included.  The comparison is case-insensitive, so ``suffix=".txt"`` and
+    ``suffix=".TXT"`` both match ``notes.txt`` and ``NOTES.TXT``.
+
+    Raises ``ValueError`` if ``root`` is not a directory.
+    """
+    if not os.path.isdir(root):
+        raise ValueError(f"Cannot find files under non-existent directory: {root!r}")
+
+    # Filenames are lowercased before comparison, so the suffix must be
+    # lowercased too or a suffix with capitals could never match.
+    suffix = suffix.lower()
+
+    for dirpath, _, filenames in os.walk(root):
+        for f in filenames:
+            path = os.path.join(dirpath, f)
+
+            # os.walk() reports every non-directory entry as a filename, so
+            # skip anything that isn't a real file (e.g. a dangling symlink).
+            if os.path.isfile(path) and f.lower().endswith(suffix):
+                yield path
+
+
+def get_git_folder_size():
+    """
+    Returns the total size, in bytes, of every file in the Git directory
+    of the current repository.
+
+    Raises ``subprocess.CalledProcessError`` if ``git rev-parse`` fails,
+    e.g. when run outside a Git repository.
+    """
+    # Ask Git where its directory is, rather than assuming it's a folder
+    # called `.git` at the top of the working tree.  In a submodule or a
+    # linked worktree `.git` is a file that points elsewhere, and a bare
+    # repository has no working tree at all.
+    #
+    # NOTE: --absolute-git-dir needs Git 2.13 or later.
+    git_dir = (
+        subprocess.check_output(["git", "rev-parse", "--absolute-git-dir"])
+        .decode("utf8")
+        .strip()
+    )
+
+    return sum(os.path.getsize(p) for p in get_file_paths_under(git_dir))
+
+
+if __name__ == "__main__":
+    # Ignore anything smaller than 1 KiB, then order smallest-first so the
+    # biggest blobs are the last thing printed.
+    large_blobs = sorted(
+        (blob for blob in get_blobs() if blob["size"] >= 1024),
+        key=lambda blob: blob["size"],
+    )
+
+    for blob in large_blobs:
+        short_id = blob["commit_id"][:7]
+        size_label = humanize.naturalsize(blob["size"]).rjust(10)
+        print(short_id, size_label, "  ", blob["filename"])
+
+    # Summary row for the whole .git folder, highlighted in blue.  The seven
+    # spaces stand in for the abbreviated ID shown in the rows above.
+    total_label = humanize.naturalsize(get_git_folder_size()).rjust(10)
+    print(
+        " " * 7,
+        termcolor.colored(total_label, "blue"),
+        "  ",
+        termcolor.colored(".git", "blue"),
+    )