add more explanation
- ID: 2018c02
- date: 2022-12-31 12:17:37+00:00
- author: Alex Chan <alex@alexwlchan.net>
- parent: df5b7d6
- message: add more explanation
- changed files: 1 file, 73 additions
Changed files
get_all_live_text (0) → get_all_live_text (2178)
diff --git a/get_all_live_text b/get_all_live_text
new file mode 100755
index 0000000..fd67d7f
--- /dev/null
+++ b/get_all_live_text
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+"""
+Get OCR'd text for all the images in a directory using Live Text.
+
+See https://alexwlchan.net/2022/12/live-text-script/
+
+== Usage ==
+
+Pass the name of the directory you want to scan as a single argument:
+
+ $ python3 get_all_live_text ~/screenshots
+
+It will create a JSON file `live_text.json` at the top of the directory with
+the text for every image it finds. So the command above would create a file
+at `~/screenshots/live_text.json`.
+
+There will be one line for each image, for example:
+
+ {"path": "railway-sign.jpg", "text": ["Passengers must", "not pass this point", "or cross the line"]}
+ {"path": "dancers.jpg", "text": []}
+
+The script runs incrementally, so if you add more images later, you can
+re-run to just get the text for all the new images.
+
+"""
+
+import json
+import os
+import subprocess
+import sys
+
+
+def get_file_paths_under(root=".", *, suffix=""):
+ """Generates the paths to every file under ``root``."""
+ if not os.path.isdir(root):
+ raise ValueError(f"Cannot find files under non-existent directory: {root!r}")
+
+ for dirpath, _, filenames in os.walk(root):
+ for f in filenames:
+ if os.path.isfile(os.path.join(dirpath, f)) and f.lower().endswith(suffix):
+ yield os.path.join(dirpath, f)
+
+
+def get_text(path):
+ try:
+ return json.loads(subprocess.check_output(["get_live_text", path]))
+ except subprocess.CalledProcessError:
+ return None
+
+
+if __name__ == "__main__":
+ try:
+ root = sys.argv[1]
+ except IndexError:
+ sys.exit(f"Usage: {__file__} <PATH>")
+
+ out_path = os.path.join(root, "live_text.json")
+
+ try:
+ seen_paths = {json.loads(line)["path"] for line in open(out_path)}
+ except FileNotFoundError:
+ seen_paths = set()
+
+ for path in get_file_paths_under(root):
+ if path in seen_paths:
+ continue
+ if os.path.basename(path).startswith("."):
+ continue
+ if path.lower().endswith((".gif", ".pdf", ".webarchive")):
+ continue
+ with open(out_path, "a") as outfile:
+ outfile.write(json.dumps({"path": path, "text": get_text(path)}) + "\n")
+