# yt-dlp_alexwlchan.py

1#!/usr/bin/env python3
2"""
3yt-dlp_alexwlchan is a personal wrapper around yt-dlp that downloads a video
4with my preferred settings.
5"""
7from datetime import datetime, timezone
8import json
9from pathlib import Path
10import re
11import subprocess
12import sys
13import tempfile
14from typing import Any, TypedDict
15import urllib.parse
17from chives.fetch import download_image
18from chives.media import create_video_entity, VideoEntity
19from yt_dlp import YoutubeDL
20from yt_dlp.networking.exceptions import HTTPError as YouTubeDLHTTPError
21from yt_dlp.utils import DownloadError
# Shared yt-dlp options used by download_video().
#
# NOTE: this is module-level mutable state — download_video() adds an
# "outtmpl" key to it at runtime.
ydl_opts: Any = {
    # Print progress output to stderr, not stdout
    "logtostderr": True,
    #
    # Download the thumbnail
    "writethumbnail": True,
    #
    # Download subtitles, or YouTube's automatic subtitles if there
    # aren't any.
    "writesubtitles": True,
    #
    # Ignore AI-upscaled videos from YouTube.
    # See https://alexwlchan.net/til/2025/ignore-ai-scaled-videos/
    "format": "bestvideo*[format_id!*=-sr]+bestaudio/best[format_id!*=-sr]",
    #
    # Download video files as MP4 and thumbnails as JPEG, or convert
    # to those formats if they aren't the best available.
    "format_sort": ["res", "ext:mp4:m4a"],
    "postprocessors": [
        {
            "key": "FFmpegVideoConvertor",
            "preferedformat": "mp4",
        },
        {
            "key": "FFmpegThumbnailsConvertor",
            "format": "jpg",
            "when": "before_dl",
        },
    ],
}
def normalise_url(url: str) -> str:
    """
    Remove unnecessary tracking parameters from a URL.
    """
    parts = urllib.parse.urlsplit(url)

    # Anything that isn't a YouTube URL passes through untouched.
    if parts.netloc != "www.youtube.com":
        return url

    # For YouTube, keep only the video ID (the "v" parameter) and drop
    # the fragment and every other query parameter.
    kept = [
        (key, value)
        for key, value in urllib.parse.parse_qsl(parts.query)
        if key == "v"
    ]
    query = urllib.parse.urlencode(kept)

    return urllib.parse.urlunsplit((parts.scheme, parts.netloc, parts.path, query, ""))
def get_youtube_avatar(tmp_dir: Path, channel_url: str) -> Path:
    """
    Download the avatar of a YouTube channel.
    """
    opts: Any = {
        # Print progress output to stderr, not stdout
        "logtostderr": True,
        #
        # Don't download every page of results for the channel.
        #
        # Telling yt-dlp we only care about the first video is
        # technically a lie -- we don't care about any videos -- but it
        # stops it paginating through the whole channel.
        "playlist_items": "0",
    }

    # Fetch the channel metadata (without downloading anything) and pick
    # out the URL of the uncropped avatar thumbnail.
    with YoutubeDL(opts) as ydl:
        channel_info: Any = ydl.extract_info(channel_url, download=False)

    avatar = next(
        t for t in channel_info["thumbnails"] if t["id"] == "avatar_uncropped"
    )

    # Derive the base filename from the channel handle, e.g.
    # "https://www.youtube.com/@networkrail" becomes "networkrail".
    channel_path = urllib.parse.urlsplit(channel_url).path
    basename = channel_path.split("/")[1].replace("@", "")

    return download_image(url=avatar["url"], out_prefix=tmp_dir / basename)
def get_instagram_avatar(tmp_dir: Path, uploader_name: str) -> Path:
    """
    Download the avatar of an Instagram channel.
    """
    # gallery-dl caches avatars somewhere, so the URL it returns can
    # carry an expired signature.  For now, flush the whole cache before
    # every run to get rid of any stale data.
    subprocess.check_call(["gallery-dl", "--clear-cache", "ALL"])

    # Ask gallery-dl for the avatar URL (it prints it to stdout).
    avatar_url = subprocess.check_output(
        [
            "gallery-dl",
            "--get-urls",
            f"https://www.instagram.com/{uploader_name}/avatar",
            "--cookies-from-browser",
            "firefox",
        ],
        text=True,
    ).strip()

    return download_image(url=avatar_url, out_prefix=tmp_dir / uploader_name)
class UploaderInfo(TypedDict):
    """
    Information about a video's uploader.
    """

    # Uploader ID as reported by the yt-dlp extractor.
    id: str
    # Human-readable uploader name.
    name: str
    # URL of the uploader's channel/profile page.
    url: str
    # Local path to the downloaded avatar image.
    avatar_path: Path
class VideoInfo(TypedDict):
    """
    Information about a downloaded video.
    """

    # Video ID as reported by the yt-dlp extractor.
    id: str
    # The (normalised) URL the video was downloaded from.
    url: str
    title: str
    description: str
    # Upload time as an ISO 8601 string in UTC, with a "Z" suffix.
    date_uploaded: str
    # Local path to the downloaded .mp4 file.
    video_path: Path
    # Local path to the downloaded .jpg thumbnail.
    thumbnail_path: Path
    # Local path to the .vtt subtitles, if any were downloaded.
    subtitle_path: Path | None
    # The temporary directory that holds all the downloaded files.
    folder_path: Path
    uploader: UploaderInfo
    entity: VideoEntity
    # Which site the video came from: "youtube" or "instagram".
    site: str
def cleanup_paths(dir_path: Path) -> None:
    """
    For every file in `dir_path`, remove URL-unsafe characters from
    the filenames.

    Raises FileExistsError if a cleaned-up name would collide with an
    existing file in `dir_path`.
    """
    # Characters to replace or strip.  "\u29f8" (big solidus) is what
    # yt-dlp substitutes for "/" in titles; "\u201d" is a curly right
    # double quote (the literal was garbled in transit -- confirm the
    # exact character against the original source).
    replacements = [
        ("#", " "),
        ("?", " "),
        ("\u29f8", "-"),
        (":", "-"),
        ("|", "-"),
        ("\u201d", ""),
    ]

    for p in dir_path.iterdir():
        old_name = p.name
        new_name = p.name

        for old, new in replacements:
            new_name = new_name.replace(old, new)

        # Collapse any runs of whitespace left behind by the replacements.
        new_name = re.sub(r"\s+", " ", new_name).strip()

        if old_name == new_name:
            continue

        # Fail loudly rather than silently overwrite another file.
        # (An explicit raise, unlike `assert`, survives `python -O`.)
        target = dir_path / new_name
        if target.exists():
            raise FileExistsError(target)

        # Path.rename is the portable same-directory rename; Path.move
        # only exists on Python 3.14+.
        p.rename(target)
def download_video(url: str) -> VideoInfo:
    """
    Download a video with yt-dlp and return metadata about the video.

    Supports YouTube and Instagram; exits the process for any other
    extractor.
    """
    # Download all the videos to a temp directory; this allows the caller
    # to decide exactly where they want the video later.
    tmp_dir = Path(tempfile.mkdtemp())

    # NOTE(review): this mutates the module-level ``ydl_opts`` dict.
    ydl_opts["outtmpl"] = str(tmp_dir / "%(title)s [%(id)s].%(ext)s")

    with YoutubeDL(ydl_opts) as ydl:
        video_info: Any = ydl.extract_info(url)

    # Try to download automatic subtitles for a YouTube video, but only
    # if no regular subtitles (.vtt) were downloaded above.
    #
    # If you try to download autosubs for a video which doesn't have any,
    # YouTube returns an HTTP 429 "Too Many Requests" error. Ignore this
    # error, but raise all others.
    if video_info["extractor"] == "youtube" and not any(
        p.suffix == ".vtt" for p in tmp_dir.iterdir()
    ):
        ydl_auto_subtitle_opts: Any = {
            "logtostderr": True,
            "writeautomaticsub": True,
            "skip_download": True,
            "outtmpl": ydl_opts["outtmpl"],
        }
        with YoutubeDL(ydl_auto_subtitle_opts) as ydl:
            try:
                ydl.extract_info(url)
            except DownloadError as e:
                if (
                    e.exc_info is not None
                    and isinstance(e.exc_info[1], YouTubeDLHTTPError)
                    and e.exc_info[1].status == 429
                ):
                    pass
                else:
                    raise

    # Make the downloaded filenames URL-safe.
    cleanup_paths(tmp_dir)

    # Locate the downloaded files: the video and thumbnail must exist,
    # but subtitles are optional.
    video_path = next(p for p in tmp_dir.iterdir() if p.suffix == ".mp4")
    thumbnail_path = next(p for p in tmp_dir.iterdir() if p.suffix == ".jpg")
    try:
        subtitle_path = next(p for p in tmp_dir.iterdir() if p.suffix == ".vtt")
    except StopIteration:
        subtitle_path = None

    uploader: UploaderInfo

    # Build the uploader metadata, including a downloaded avatar image,
    # in a site-specific way.
    if video_info["extractor"] == "youtube":
        site = "youtube"
        uploader = {
            "id": video_info["uploader_id"],
            "name": video_info["uploader"],
            "url": video_info["uploader_url"],
            "avatar_path": get_youtube_avatar(tmp_dir, video_info["uploader_url"]),
        }
    elif video_info["extractor"] == "Instagram":
        site = "instagram"
        uploader = {
            "id": video_info["uploader_id"],
            "name": video_info["uploader"],
            "url": f"https://www.instagram.com/{video_info['channel']}/",
            "avatar_path": get_instagram_avatar(
                tmp_dir, uploader_name=video_info["channel"]
            ),
        }
    else:
        sys.exit(f"Unsupported extractor: {video_info['extractor']}")

    # Render the upload time as UTC; the "+00:00" offset is swapped for
    # a "Z" suffix when serialised below.
    date_uploaded = datetime.fromtimestamp(video_info["timestamp"], tz=timezone.utc)

    return {
        "id": video_info["id"],
        "url": url,
        "title": video_info["title"],
        "description": video_info["description"],
        "date_uploaded": date_uploaded.isoformat().replace("+00:00", "Z"),
        "video_path": video_path,
        "thumbnail_path": thumbnail_path,
        "subtitle_path": subtitle_path,
        "uploader": uploader,
        "entity": create_video_entity(
            video_path,
            poster_path=thumbnail_path,
            subtitles_path=subtitle_path,
            source_url=url,
            background="#222222",
        ),
        "folder_path": tmp_dir,
        "site": site,
    }
class PathEncoder(json.JSONEncoder):
    """
    Custom JSON encoder that encodes paths as a string.
    """

    def default(self, o: Any) -> Any:
        """
        Encode paths as a string; everything else uses the default encoder.
        """
        if not isinstance(o, Path):
            # Let the base encoder raise TypeError for anything we
            # don't know how to serialise.
            return super().default(o)

        return str(o.absolute())
if __name__ == "__main__":
    # Exactly one argument (the URL) is required.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {__file__} URL")

    info = download_video(normalise_url(sys.argv[1]))

    # Print the metadata as JSON, with paths rendered as strings.
    output = json.dumps(info, indent=2, cls=PathEncoder, ensure_ascii=False)
    print(output)