3yt-dlp_alexwlchan is a personal wrapper around yt-dlp that downloads a video
4with my preferred settings.
7from datetime import datetime, timezone
9from pathlib import Path
14from typing import Any, TypedDict
17from chives.fetch import download_image
18from chives.media import create_video_entity, VideoEntity
19from yt_dlp import YoutubeDL
20from yt_dlp.networking.exceptions import HTTPError as YouTubeDLHTTPError
21from yt_dlp.utils import DownloadError
25 # Print progress output to stderr, not stdout
28 # Download the thumbnail
29 "writethumbnail": True,
31 # Download subtitles, or YouTube's automatic subtitles if there
33 "writesubtitles": True,
35 # Ignore AI-upscaled videos from YouTube.
36 # See https://alexwlchan.net/til/2025/ignore-ai-scaled-videos/
37 "format": "bestvideo*[format_id!*=-sr]+bestaudio/best[format_id!*=-sr]",
39 # Download video files as MP4 and thumbnails as JPEG, or convert
40 # to those formats if they aren't the best available.
41 "format_sort": ["res", "ext:mp4:m4a"],
44 "key": "FFmpegVideoConvertor",
45 "preferedformat": "mp4",
48 "key": "FFmpegThumbnailsConvertor",
def normalise_url(url: str) -> str:
    """
    Remove unnecessary tracking parameters from a URL.

    For URLs whose host is exactly ``www.youtube.com``, every query
    parameter except the video ID (``v``) is dropped, and so is any
    fragment.  All other URLs are returned unchanged.
    """
    u = urllib.parse.urlsplit(url)

    # If it's a YouTube URL, remove all query parameters except video ID (v)
    if u.netloc == "www.youtube.com":
        qs = [(k, v) for k, v in urllib.parse.parse_qsl(u.query) if k == "v"]
        query = urllib.parse.urlencode(qs)
        return urllib.parse.urlunsplit((u.scheme, u.netloc, u.path, query, ""))

    # Anything that isn't a YouTube URL is passed through as-is.
    return url
72def get_youtube_avatar(tmp_dir: Path, channel_url: str) -> Path:
74 Download the avatar of a YouTube channel.
77 # Print progress output to stderr, not stdout
80 # Don't download every page of results for the channel.
82 # This tells yt-dlp that we're only interested in the first video,
83 # which is technically a lie because we don't care about any videos,
84 # but it has the desired effect.
85 "playlist_items": "0",
88 # Get the URL of the YouTube avatar.
89 with YoutubeDL(ydl_opts) as ydl:
90 channel_info: Any = ydl.extract_info(channel_url, download=False)
92 thumbnails = channel_info["thumbnails"]
93 best_thumbnail = next(t for t in thumbnails if t["id"] == "avatar_uncropped")
94 thumbnail_url = best_thumbnail["url"]
96 # Work out the base filename, e.g. "https://www.youtube.com/@networkrail"
97 # becomes "networkrail"
98 u = urllib.parse.urlsplit(channel_url)
99 basename = u.path.split("/")[1].replace("@", "")
101 return download_image(url=thumbnail_url, out_prefix=tmp_dir / basename)
104def get_instagram_avatar(tmp_dir: Path, uploader_name: str) -> Path:
106 Download the avatar of an Instagram channel.
108 # Somewhere gallery-dl is caching avatars, so the URL it returns
109 # can have an expired signature. For now, just clear the cache before
110 # every run to flush any stale data.
111 subprocess.check_call(["gallery-dl", "--clear-cache", "ALL"])
113 output = subprocess.check_output(
117 f"https://www.instagram.com/{uploader_name}/avatar",
118 "--cookies-from-browser",
123 avatar_url = output.strip()
125 return download_image(url=avatar_url, out_prefix=tmp_dir / uploader_name)
128class UploaderInfo(TypedDict):
130 Information about a video's uploader.
139class VideoInfo(TypedDict):
141 Information about a downloaded video.
151 subtitle_path: Path | None
153 uploader: UploaderInfo
158def cleanup_paths(dir_path: Path) -> None:
160 For every file in `dir_path`, remove URL-unsafe characters from
163 for p in dir_path.iterdir():
175 new_name = new_name.replace(old, new)
177 new_name = re.sub(r"\s+", " ", new_name).strip()
179 if old_name == new_name:
182 assert not (dir_path / new_name).exists(), new_name
183 p.move(dir_path / new_name)
186def download_video(url: str) -> VideoInfo:
188 Download a video with yt-dlp and return metadata about the video.
190 # Download all the videos to a temp directory; this allows the caller
191 # to decide exactly where they want the video later.
192 tmp_dir = Path(tempfile.mkdtemp())
193 ydl_opts["outtmpl"] = str(tmp_dir / "%(title)s [%(id)s].%(ext)s")
195 with YoutubeDL(ydl_opts) as ydl:
196 video_info: Any = ydl.extract_info(url)
198 # Try to download automatic subtitles for a YouTube video.
200 # If you try to download autosubs for a video which doesn't have any,
201 YouTube returns an HTTP 429 "Too Many Requests" error. Ignore this
202 # error, but raise all others.
203 if video_info["extractor"] == "youtube" and not any(
204 p.suffix == ".vtt" for p in tmp_dir.iterdir()
206 ydl_auto_subtitle_opts: Any = {
208 "writeautomaticsub": True,
209 "skip_download": True,
210 "outtmpl": ydl_opts["outtmpl"],
212 with YoutubeDL(ydl_auto_subtitle_opts) as ydl:
214 ydl.extract_info(url)
215 except DownloadError as e:
217 e.exc_info is not None
218 and isinstance(e.exc_info[1], YouTubeDLHTTPError)
219 and e.exc_info[1].status == 429
225 cleanup_paths(tmp_dir)
227 video_path = next(p for p in tmp_dir.iterdir() if p.suffix == ".mp4")
228 thumbnail_path = next(p for p in tmp_dir.iterdir() if p.suffix == ".jpg")
230 subtitle_path = next(p for p in tmp_dir.iterdir() if p.suffix == ".vtt")
231 except StopIteration:
234 uploader: UploaderInfo
236 if video_info["extractor"] == "youtube":
239 "id": video_info["uploader_id"],
240 "name": video_info["uploader"],
241 "url": video_info["uploader_url"],
242 "avatar_path": get_youtube_avatar(tmp_dir, video_info["uploader_url"]),
244 elif video_info["extractor"] == "Instagram":
247 "id": video_info["uploader_id"],
248 "name": video_info["uploader"],
249 "url": f"https://www.instagram.com/{video_info['channel']}/",
250 "avatar_path": get_instagram_avatar(
251 tmp_dir, uploader_name=video_info["channel"]
255 sys.exit(f"Unsupported extractor: {video_info['extractor']}")
257 date_uploaded = datetime.fromtimestamp(video_info["timestamp"], tz=timezone.utc)
260 "id": video_info["id"],
262 "title": video_info["title"],
263 "description": video_info["description"],
264 "date_uploaded": date_uploaded.isoformat().replace("+00:00", "Z"),
265 "video_path": video_path,
266 "thumbnail_path": thumbnail_path,
267 "subtitle_path": subtitle_path,
268 "uploader": uploader,
269 "entity": create_video_entity(
271 poster_path=thumbnail_path,
272 subtitles_path=subtitle_path,
274 background="#222222",
276 "folder_path": tmp_dir,
class PathEncoder(json.JSONEncoder):
    """
    Custom JSON encoder that encodes paths as a string.
    """

    def default(self, o: Any) -> Any:
        """
        Encode paths as a string; everything else uses the default encoder.

        Paths are written out in their absolute form.
        """
        if isinstance(o, Path):
            return str(o.absolute())

        return super().default(o)
296if __name__ == "__main__":
298 url = normalise_url(sys.argv[1])
300 sys.exit(f"Usage: {__file__} URL")
302 video_info = download_video(url)
304 json_string = json.dumps(video_info, indent=2, cls=PathEncoder, ensure_ascii=False)