Skip to main content

Fix up the info.json file after download

ID
785a5bc
date
2024-02-16 07:19:31+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
08c1bad
message
Fix up the info.json file after download
changed files
1 file, 32 additions

Changed files

web/save_youtube_videos.py (5294) → web/save_youtube_videos.py (6100)

diff --git a/web/save_youtube_videos.py b/web/save_youtube_videos.py
index fd228ae..c63e163 100755
--- a/web/save_youtube_videos.py
+++ b/web/save_youtube_videos.py
@@ -134,6 +134,33 @@ def classify_file_type(
     raise ValueError(f"Unrecognised filename: {filename}")
 
 
+def fix_info_json(path: pathlib.Path) -> None:
+    """
+    Tidy up the contents of the info.json file.
+    """
+    with open(path) as in_file:
+        data = json.load(in_file)
+
+    # These are a couple of fields which are very large, don't contain
+    # much useful metadata, and point to transient URLs that don't work
+    # later.
+    for key in (
+        "formats",
+        "automatic_captions",
+        "thumbnails",
+        "heatmap",
+        "_format_sort_fields",
+        "subtitles",
+    ):
+        if key in data:
+            del data[key]
+
+    json_string = json.dumps(data, indent=2)
+
+    with open(path, "w") as out_file:
+        out_file.write(json_string)
+
+
 @log_result("https://youtube.com/watch?v={video_id}")
 def download_video(*, video_id, download_root):
     uploader = get_uploader(video_id=video_id, db_path=download_root / "uploaders.db")
@@ -177,6 +204,11 @@ def download_video(*, video_id, download_root):
     try:
         youtube_dl(*cmd, cwd=download_dir)
         print(download_dir)
+
+        for f in os.listdir(download_dir):
+            if f.endswith(".info.json"):
+                fix_info_json(download_dir / f)
+
         return "downloaded"
     except subprocess.CalledProcessError as err:  # pragma: no cover
         print(f"Unable to download {video_url}: {err}", file=sys.stderr)