Skip to main content

Add a save_flickr script

ID
5e6645c
date
2023-08-05 08:51:42+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
f344649
message
Add a save_flickr script
changed files
2 files, 148 additions

Changed files

images/README.md (4491) → images/README.md (4715)

diff --git a/images/README.md b/images/README.md
index 7ced139..c173fbd 100644
--- a/images/README.md
+++ b/images/README.md
@@ -106,6 +106,15 @@ These scripts are for working with images and other visual material.
   </dd>
 
   <dt>
+    <a href="https://github.com/alexwlchan/scripts/blob/main/images/save_flickr">
+      <code>save_flickr [FLICKR_URL]</code>
+    </a>
+  </dt>
+  <dd>
+    saves a single image from Flickr, plus some metadata.
+  </dd>
+
+  <dt>
     <a href="https://github.com/alexwlchan/scripts/blob/main/images/save_xkcd">
       <code>save_xkcd [COMIC_NUMBER]</code>
     </a>

images/save_flickr (0) → images/save_flickr (3364)

diff --git a/images/save_flickr b/images/save_flickr
new file mode 100755
index 0000000..20d9b71
--- /dev/null
+++ b/images/save_flickr
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""
+Downloads and saves a single Flickr image, plus a bit of metadata.
+
+I'm not using this to create a complete archive of Flickr but to create
+a mini-library of my personal favourites.
+"""
+
+import datetime as dt
+import json
+import pathlib
+import shutil
+import sys
+import tempfile
+
+import bs4
+import hyperlink
+import urllib3
+import xmltodict  # pip3 install xmltodict==0.12.0
+
+
+# Root folder for saved photos.  The script creates one sub-folder per
+# Flickr creator underneath it.
+BACKUP_ROOT = pathlib.Path("/Volumes/Media (Sapphire)") / "backups" / "flickr"
+
+
+def get_canonical_url(url):
+    """
+    Follow HTTP redirects from ``url`` and return the final URL.
+
+    Anything from ``/in/`` onwards is dropped from ``url`` before the first
+    request.  Redirects are followed by hand (``redirect=False``), one hop
+    at a time, until a response arrives without a ``Location`` header; the
+    URL that produced that response is returned.
+
+    Raises ``ValueError`` if a redirect points at a URL that has already
+    been requested.
+    """
+    from urllib.parse import urljoin
+
+    http = urllib3.PoolManager()
+
+    url = url.split("/in/")[0]
+
+    # A set holding the URL itself.  ``set(url)`` would instead build a set
+    # of the URL's individual characters, so the check below would
+    # practically never match.
+    seen_urls = {url}
+
+    while True:
+        resp = http.request(
+            "GET", url,
+            redirect=False,
+            headers={"User-Agent": "urllib3"}
+        )
+
+        location = resp.headers.get("Location")
+        if location is None:
+            return url
+
+        # ``Location`` may be relative; resolve it against the current URL.
+        # An absolute ``Location`` is returned by ``urljoin`` unchanged.
+        url = urljoin(url, location)
+
+        if url in seen_urls:
+            raise ValueError(f"Circular redirect: {url}")
+
+        seen_urls.add(url)
+
+
+def build_url(base, params):
+    """
+    Return ``base`` as a ``hyperlink.URL`` with each entry of ``params``
+    applied to it through ``URL.set``.
+    """
+    result = hyperlink.URL.from_text(base)
+    for name in params:
+        result = result.set(name, params[name])
+    return result
+
+
+def get_oembed_data(canonical_url):
+    """
+    Fetch the oEmbed record for a Flickr photo page.
+
+    Calls Flickr's oEmbed endpoint with ``canonical_url`` as the ``url``
+    query parameter, parses the XML response, and returns the contents of
+    its top-level ``oembed`` element.
+
+    Fails an assertion if the endpoint doesn't respond with HTTP 200.
+    """
+    http = urllib3.PoolManager()
+
+    # Use the function's own argument here, not a module-level ``url``
+    # that only exists when the file is run as a script.
+    request_url = build_url(
+        "https://www.flickr.com/services/oembed/",
+        params={"url": canonical_url}
+    )
+
+    oembed_resp = http.request("GET", str(request_url))
+    assert oembed_resp.status == 200, oembed_resp.status
+    return xmltodict.parse(oembed_resp.data)["oembed"]
+
+
+def get_description(url):
+    """
+    Fetch the page at ``url`` and return the ``content`` attribute of its
+    ``<meta name="description">`` tag.
+
+    Fails an assertion if the page doesn't respond with HTTP 200.
+    """
+    pool = urllib3.PoolManager()
+    page = pool.request("GET", url)
+    assert page.status == 200, page.status
+
+    meta_tag = bs4.BeautifulSoup(page.data, "html.parser").find(
+        "meta", attrs={"name": "description"}
+    )
+    return meta_tag.attrs["content"]
+
+
+def save_image(out_dir, oembed_data):
+    """
+    Download the image named by ``oembed_data["url"]`` into ``out_dir``.
+
+    The file keeps the last path segment of the image URL as its name.
+    The response body is streamed to disk in chunks rather than being
+    loaded into memory in one go.
+
+    Fails an assertion if the image request doesn't return HTTP 200.
+    """
+    http = urllib3.PoolManager()
+
+    img_url = oembed_data["url"]
+    filename = hyperlink.URL.from_text(img_url).path[-1]
+
+    # See https://stackoverflow.com/q/17285464/1558022
+    resp = http.request("GET", img_url, preload_content=False)
+
+    try:
+        # Don't write an error page to disk as if it were the image.
+        assert resp.status == 200, resp.status
+
+        with (out_dir / filename).open("wb") as out:
+            for chunk in resp.stream(64 * 1024):
+                out.write(chunk)
+    finally:
+        # Hand the connection back to the pool even if the download fails.
+        resp.release_conn()
+
+
+# Script entry point: save one Flickr photo and a JSON file of metadata
+# into a per-creator folder under BACKUP_ROOT, then print that folder.
+if __name__ == "__main__":
+    # The Flickr URL is the only command-line argument.  ``url`` is bound
+    # at module level, so it is visible as a global to the functions above.
+    try:
+        url = sys.argv[1]
+    except IndexError:
+        sys.exit(f"Usage: {__file__} <FLICKR_URL>")
+
+    canonical_url = get_canonical_url(url)
+    oembed_data = get_oembed_data(canonical_url)
+
+    # Strip slashes from both ends so the last path segment isn't empty.
+    parsed_url = hyperlink.URL.from_text(canonical_url.strip("/"))
+
+    # Sanity checks: the path must start with "photos" and end with a
+    # numeric segment, which is used as the photo's ID.
+    assert parsed_url.path[0] == "photos", parsed_url
+
+    flickr_id = parsed_url.path[-1]
+    assert flickr_id.isnumeric()
+
+    author_name = oembed_data["author_name"]
+
+    # The last path segment of the author URL names the creator's folder.
+    author_url = oembed_data["author_url"].strip("/")
+    creator = hyperlink.URL.from_text(author_url).path[-1]
+
+    # No ``parents=True``: this fails if BACKUP_ROOT itself doesn't exist.
+    backup_dir = BACKUP_ROOT / creator
+    backup_dir.mkdir(exist_ok=True)
+
+    description = get_description(canonical_url)
+
+    # "saved_at" is a naive local timestamp (no timezone information).
+    flickr_info = {
+        "url": url,
+        "canonical_url": canonical_url,
+        "saved_at": dt.datetime.now().isoformat(),
+        "description": description,
+        "oembed_data": oembed_data
+    }
+
+    json_string = json.dumps(flickr_info, indent=2, sort_keys=True)
+
+    # The metadata is written first, as <flickr_id>.json; the image is
+    # downloaded afterwards.
+    (backup_dir / f"{flickr_id}.json").write_text(json_string)
+
+    save_image(backup_dir, oembed_data)
+
+    print(backup_dir)