#!/usr/bin/env python3
"""
Get OCR'd text for all the images in a directory using Live Text.

See https://alexwlchan.net/2022/live-text-script/

== Usage ==

Pass the name of the directory you want to scan as a single argument:

    $ python3 get_all_live_text ~/screenshots

It will create a JSON file `live_text.json` in the top of the directory
with the text for every image it finds.  So the command above would
create a file at `~/screenshots/live_text.json`.

There will be one line for each image, for example:

    {"path": "railway-sign.jpg", "text": ["Passengers must", "not pass this point", "or cross the line"]}
    {"path": "dancers.jpg", "text": []}

The recorded path is the directory argument joined with the file's
location underneath it, and "text" is null when `get_live_text` exits
with an error for that file.

The script runs incrementally, so if you add more images later, you can
re-run to just get the text for all the new images.  Already-processed
images are recognised by comparing path strings exactly, so pass the
directory the same way on every run.
"""

import json
import os
import subprocess
import sys

try:
    from tqdm import tqdm
except ImportError:

    def tqdm(s):
        """Fallback used when tqdm is not installed: return ``s`` unchanged."""
        return s


def get_file_paths_under(root):
    """Generates the paths to every file under ``root``.

    Raises ValueError (on first iteration) if ``root`` is not a directory.
    """
    if not os.path.isdir(root):
        raise ValueError(f"Cannot find files under non-existent directory: {root!r}")

    for dirpath, _, filenames in os.walk(root):
        for f in filenames:
            path = os.path.join(dirpath, f)
            # Skip entries that are not regular files by the time we look
            # at them (e.g. broken symlinks).
            if os.path.isfile(path):
                yield path


def get_static_image_paths_under(root):
    """Generates the paths under ``root`` that should be sent for OCR.

    Skips files whose name starts with a dot, and files ending in
    ``.gif``, ``.pdf`` or ``.webarchive`` (case-insensitive).  Everything
    else is yielded; there is no check that the file is really an image.
    """
    for path in get_file_paths_under(root):
        if os.path.basename(path).startswith("."):
            continue

        if path.lower().endswith((".gif", ".pdf", ".webarchive")):
            continue

        yield path


def get_text(path):
    """Returns the parsed JSON output of ``get_live_text`` for ``path``.

    Returns None if the ``get_live_text`` command exits with a non-zero
    status.
    """
    try:
        return json.loads(subprocess.check_output(["get_live_text", path]))
    except subprocess.CalledProcessError:
        return None


def _load_seen_paths(out_path):
    """Returns the set of paths already recorded in ``out_path``.

    Returns an empty set if the file does not exist yet.
    """
    try:
        with open(out_path, encoding="utf-8") as infile:
            # Ignore blank lines so a stray newline doesn't abort the run.
            return {json.loads(line)["path"] for line in infile if line.strip()}
    except FileNotFoundError:
        return set()


def main(argv=None):
    """Scans the directory named in ``argv[1]`` and appends new results.

    ``argv`` defaults to ``sys.argv``.  Exits with a usage message if no
    directory argument is given.
    """
    if argv is None:
        argv = sys.argv

    try:
        root = argv[1]
    except IndexError:
        sys.exit(f"Usage: {__file__} <DIRECTORY>")

    out_path = os.path.join(root, "live_text.json")
    seen_paths = _load_seen_paths(out_path)

    # The list is built up front so tqdm knows the total, and so the scan
    # is finished before we start writing into the same directory.
    for path in tqdm(list(get_static_image_paths_under(root))):
        # The results file lives inside ``root``; don't try to OCR it.
        if path == out_path:
            continue

        if path in seen_paths:
            continue

        text = get_text(path)

        # Append and close after every image so results already gathered
        # survive an interrupted run.
        with open(out_path, "a", encoding="utf-8") as outfile:
            outfile.write(json.dumps({"path": path, "text": text}) + "\n")


if __name__ == "__main__":
    main()