Skip to main content

Add my ‘images_only_pdf’ script

ID
168410b
date
2022-09-26 17:49:17+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
55b2b9b
message
Add my 'images_only_pdf' script
changed files
1 file, 42 additions

Changed files

images_only_pdf (0) → images_only_pdf (1339)

diff --git a/images_only_pdf b/images_only_pdf
new file mode 100755
index 0000000..6f662c3
--- /dev/null
+++ b/images_only_pdf
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""
+This script takes a PDF, and creates a new PDF with just the images
+filling the page.
+
+It's working around a behaviour of the "Scan Document" feature in the
+iOS Notes app – when you export the scan as PDF, it adds large white
+borders around the images which is precisely what I don't want.
+"""
+
+import os
+import sys
+
+import fitz  # PyMuPDF
+
+
+if __name__ == "__main__":
+    try:
+        path = sys.argv[1]
+    except IndexError:
+        sys.exit(f"Usage: {__file__} <PATH>")
+
+    with fitz.open(path) as pdf_original, fitz.open() as pdf_new:
+        for page_number, page in enumerate(pdf_original, start=1):
+            for image_number, image in enumerate(page.getImageList(), start=1):
+                # Get the XREF of the image
+                xref = image[0]
+
+                # Extract the image bytes
+                base_image = pdf_original.extractImage(xref)
+
+                new_page = pdf_new.new_page(
+                    width=base_image["width"], height=base_image["height"]
+                )
+                rect = fitz.Rect(0.0, 0.0, base_image["width"], base_image["height"])
+                new_page.insertImage(rect, stream=base_image["image"])
+
+        out_path = path.replace(".pdf", "-noimages.pdf")
+        assert path != out_path
+
+        pdf_new.save(out_path)
+        print(out_path)