Skip to main content

add more explanation

ID
2018c02
date
2022-12-31 12:17:37+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
df5b7d6
message
add more explanation
changed files
1 file, 73 additions

Changed files

get_all_live_text (0) → get_all_live_text (2178)

diff --git a/get_all_live_text b/get_all_live_text
new file mode 100755
index 0000000..fd67d7f
--- /dev/null
+++ b/get_all_live_text
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+"""
+Get OCR'd text for all the images in a directory using Live Text.
+
+See https://alexwlchan.net/2022/12/live-text-script/
+
+== Usage ==
+
+Pass the name of the directory you want to scan as a single argument:
+
+    $ python3 get_all_live_text ~/screenshots
+
+It will create a JSON file `live_text.json` in the top of the directory with
+the text for every image it finds.  So the command above would create a file
+at `~/screenshots/live_text.json`.
+
+There will be one line for each image, for example:
+
+    {"path": "railway-sign.jpg", "text": ["Passengers must", "not pass this point", "or cross the line"]}
+    {"path": "dancers.jpg", "text": []}
+
+The script runs incrementally, so if you add more images later, you can
+re-run it to get the text for just the new images.
+
+"""
+
+import json
+import os
+import subprocess
+import sys
+
+
+def get_file_paths_under(root=".", *, suffix=""):
+    """Generates the paths to every file under ``root``."""
+    if not os.path.isdir(root):
+        raise ValueError(f"Cannot find files under non-existent directory: {root!r}")
+
+    for dirpath, _, filenames in os.walk(root):
+        for f in filenames:
+            if os.path.isfile(os.path.join(dirpath, f)) and f.lower().endswith(suffix):
+                yield os.path.join(dirpath, f)
+
+
+def get_text(path):
+    try:
+        return json.loads(subprocess.check_output(["get_live_text", path]))
+    except subprocess.CalledProcessError:
+        return None
+
+
+if __name__ == "__main__":
+    try:
+        root = sys.argv[1]
+    except IndexError:
+        sys.exit(f"Usage: {__file__} <PATH>")
+
+    out_path = os.path.join(root, "live_text.json")
+
+    try:
+        seen_paths = {json.loads(line)["path"] for line in open(out_path)}
+    except FileNotFoundError:
+        seen_paths = set()
+
+    for path in get_file_paths_under(root):
+        if path in seen_paths:
+            continue
+        if os.path.basename(path).startswith("."):
+            continue
+        if path.lower().endswith((".gif", ".pdf", ".webarchive")):
+            continue
+        with open(out_path, "a") as outfile:
+            outfile.write(json.dumps({"path": path, "text": get_text(path)}) + "\n")
+