Skip to main content

toss in tqdm for get_all_live_text

ID
bce046d
date
2022-12-31 13:17:10+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
e2e475c
message
toss in tqdm for get_all_live_text
changed files
1 file, 19 additions, 7 deletions

Changed files

get_all_live_text (2178) → get_all_live_text (2341)

diff --git a/get_all_live_text b/get_all_live_text
index 673d45e..2b70176 100755
--- a/get_all_live_text
+++ b/get_all_live_text
@@ -29,18 +29,33 @@ import os
 import subprocess
 import sys
 
+try:
+    from tqdm import tqdm
+except ImportError:
+    def tqdm(s):
+        return s
 
-def get_file_paths_under(root=".", *, suffix=""):
+
+def get_file_paths_under(root):
     """Generates the paths to every file under ``root``."""
     if not os.path.isdir(root):
         raise ValueError(f"Cannot find files under non-existent directory: {root!r}")
 
     for dirpath, _, filenames in os.walk(root):
         for f in filenames:
-            if os.path.isfile(os.path.join(dirpath, f)) and f.lower().endswith(suffix):
+            if os.path.isfile(os.path.join(dirpath, f)):
                 yield os.path.join(dirpath, f)
 
 
+def get_static_image_paths_under(root):
+    for path in get_file_paths_under(root):
+        if os.path.basename(path).startswith("."):
+            continue
+        if path.lower().endswith((".gif", ".pdf", ".webarchive")):
+            continue
+        yield path
+
+
 def get_text(path):
     try:
         return json.loads(subprocess.check_output(["get_live_text", path]))
@@ -61,13 +76,10 @@ if __name__ == "__main__":
     except FileNotFoundError:
         seen_paths = set()
 
-    for path in get_file_paths_under(root):
+    for path in tqdm(list(get_static_image_paths_under(root))):
         if path in seen_paths:
             continue
-        if os.path.basename(path).startswith("."):
-            continue
-        if path.lower().endswith((".gif", ".pdf", ".webarchive")):
-            continue
+
         with open(out_path, "a") as outfile:
             outfile.write(json.dumps({"path": path, "text": get_text(path)}) + "\n")