Skip to main content

Add my Pinboard bookmark script

ID
e85aa81
date
2024-02-14 22:46:26+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
c595ce3
message
Add my Pinboard bookmark script
changed files
2 files, 190 additions, 5 deletions

Changed files

web/README.md (3467) → web/README.md (3919)

diff --git a/web/README.md b/web/README.md
index 5ada617..0a6518f 100644
--- a/web/README.md
+++ b/web/README.md
@@ -36,6 +36,12 @@ scripts = [
         """
     },
     {
+        "name": "save_pinboard_bookmarks.py",
+        "description": """
+        save a complete copy of all my Pinboard bookmarks, including my archive backups.
+        """
+    },
+    {
         "name": "save_tumblr_likes.py",
         "description": """
         save a copy of all the posts I've liked on Tumblr to my backup drive.
@@ -87,6 +93,15 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
   </dd>
 
   <dt>
+    <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_pinboard_bookmarks.py">
+      <code>save_pinboard_bookmarks.py</code>
+    </a>
+  </dt>
+  <dd>
+    save a complete copy of all my Pinboard bookmarks, including my archive backups.
+  </dd>
+
+  <dt>
     <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_tumblr_likes.py">
       <code>save_tumblr_likes.py</code>
     </a>
@@ -113,4 +128,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
     this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
   </dd>
 </dl>
-<!-- [[[end]]] (checksum: 0d41d07b707085af6913de92b942a152) -->
+<!-- [[[end]]] (checksum: e326ff2ac898ceecc4bddd204f9318b2) -->

web/save_pinboard_bookmarks.py (1058) → web/save_pinboard_bookmarks.py (5924)

diff --git a/web/save_pinboard_bookmarks.py b/web/save_pinboard_bookmarks.py
index 03739c8..07bc366 100755
--- a/web/save_pinboard_bookmarks.py
+++ b/web/save_pinboard_bookmarks.py
@@ -1,9 +1,16 @@
 #!/usr/bin/env python3
 
+import contextlib
 import datetime
 import json
+import os
 import pathlib
+import shutil
+import subprocess
+import tarfile
+import tempfile
 
+import bs4
 import httpx
 import keyring
 
@@ -11,6 +18,15 @@ import keyring
 BACKUP_ROOT = pathlib.Path("/Volumes/Media (Sapphire)/backups/pinboard")
 
 
+def write_to_file(name: str, contents: str) -> None:
+    """
+    Write a string to a text file in the backup folder, and log that
+    you're doing it.
+
+    ``name`` is a filename relative to ``BACKUP_ROOT``; an existing file
+    with the same name is overwritten.
+    """
+    path = BACKUP_ROOT / name
+    print(f"    ~> {path}")
+
+    # Always write UTF-8 rather than the locale's default encoding, so
+    # the backup doesn't depend on how the machine is configured and
+    # non-ASCII text in the contents can't cause an encoding error.
+    path.write_text(contents, encoding="utf-8")
+
+
 def get_bookmarks_json(username: str, password: str) -> str:
     """
     Call the Pinboard API to get a complete list of my bookmarks.
@@ -30,16 +46,170 @@ def get_bookmarks_json(username: str, password: str) -> str:
     return json_string
 
 
+def get_cache_ids(username: str, password: str) -> dict[str, str]:
+    """
+    Get a list of cache IDs for bookmarks in my account.
+
+    These are the URLs where Pinboard takes archived snapshots of
+    web pages, e.g. https://pinboard.in/cached/1234567890/
+
+    Returns a dict (bookmarked URL) -> (cache ID).  Bookmarks which
+    don't have a "cached" link on the page are left out.
+
+    Raises httpx.HTTPStatusError if Pinboard returns an error response.
+    """
+    cache_ids: dict[str, str] = {}
+
+    # Use the client as a context manager so its connections are closed
+    # when we're done, even if one of the requests fails part-way through.
+    with httpx.Client(follow_redirects=True) as client:
+        # Start by logging in to Pinboard, so we have the appropriate
+        # cookies in our session.
+        resp = client.post(
+            "https://pinboard.in/auth/",
+            data={"username": username, "password": password},
+        )
+        resp.raise_for_status()
+
+        # Now start fetching cache IDs from my account.
+        url = f"https://pinboard.in/u:{username}"
+
+        while True:
+            print(f"    ... fetching cache IDs from {url}")
+            resp = client.get(url, params={"per_page": "160"})
+            resp.raise_for_status()
+
+            soup = bs4.BeautifulSoup(resp.text, "html.parser")
+
+            # The structure of the page is of the form:
+            #
+            #     <div id="bookmarks">
+            #       <div class="bookmark">
+            #         <a class="bookmark_title" href="http://example.net">Example</a>
+            #         <a class="cached" href="/cached/1234567890/">☑</a>
+            #         …
+            #
+            bookmarks_div = soup.find("div", attrs={"id": "bookmarks"})
+            bookmarks = bookmarks_div.find_all("div", attrs={"class": "bookmark"})
+
+            for b in bookmarks:
+                href = b.find("a", attrs={"class": "bookmark_title"}).attrs["href"]
+                cache_link = b.find("a", attrs={"class": "cached"})
+
+                # Not every bookmark has an archived snapshot.
+                if cache_link is None:
+                    continue
+
+                # The href is "/cached/<id>/", so the ID is the
+                # second-to-last component when split on slashes.
+                cache_ids[href] = cache_link.attrs["href"].split("/")[-2]
+
+            # The pagination link, if present, will be something like:
+            #
+            #      <a class="next_prev" href="/u:alexwlchan/before:1234">« earlier</a>
+            #
+            # A page may have no pagination links at all, or more than
+            # one of them, so look through every link for the "earlier"
+            # one rather than assuming it exists and comes first.
+            earlier_link = next(
+                (
+                    a
+                    for a in soup.find_all("a", attrs={"class": "next_prev"})
+                    if "earlier" in a.text
+                ),
+                None,
+            )
+
+            if earlier_link is None:
+                break
+
+            url = "https://pinboard.in" + earlier_link.attrs["href"]
+
+    return cache_ids
+
+
+def wget(*args):
+    """
+    Run ``wget`` with the given command-line arguments.
+
+    Anything wget prints to stdout is discarded, and its exit code
+    is ignored.
+    """
+    command = ["wget", *args]
+    subprocess.call(command, stdout=subprocess.DEVNULL)
+
+
+@contextlib.contextmanager
+def wget_context(username: str, password: str):
+    """
+    Log in to Pinboard with wget, and keep the session cookies around
+    for the duration of the ``with`` block.
+
+    While the block is running, the working directory is a temporary
+    directory containing ``pinboard-cookies.txt``, which later wget
+    calls can pass to ``--load-cookies``.  When the block exits -- normally
+    or with an exception -- the cookies are deleted and the original
+    working directory is restored.
+    """
+    # Imported here so this function doesn't depend on a file-level import.
+    import urllib.parse
+
+    previous_cwd = os.getcwd()
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        cookies_path = os.path.join(tmpdir, "pinboard-cookies.txt")
+        os.chdir(tmpdir)
+
+        try:
+            # NOTE: the credentials are passed as a command-line argument,
+            # so they're visible to other local users in the process list
+            # while this wget call is running.
+            wget(
+                "--save-cookies",
+                "pinboard-cookies.txt",
+                "--keep-session-cookies",
+                "--delete-after",
+                "--output-file",
+                "-",
+                "--post-data",
+                # URL-encode the form body, so a username or password
+                # containing characters like "&", "=" or "%" is sent
+                # correctly.
+                urllib.parse.urlencode({"username": username, "password": password}),
+                "https://pinboard.in/auth/",
+            )
+
+            yield
+        finally:
+            # Remove the cookies as soon as we're done with them; they
+            # may never have been written if the login request failed.
+            with contextlib.suppress(FileNotFoundError):
+                os.unlink(cookies_path)
+
+            # Step back out of the temporary directory before it gets
+            # deleted, so the process isn't left in a directory that
+            # no longer exists.
+            os.chdir(previous_cwd)
+
+
+def download_single_archive(url: str, cache_id: str) -> None:
+    """
+    Download Pinboard's archived copy of a single bookmark, and save it
+    as ``archive/<first character of ID>/<ID>.tar.gz`` under ``BACKUP_ROOT``.
+
+    ``url`` is the bookmarked URL (only used for logging); ``cache_id``
+    is the ID from the bookmark's ``/cached/<ID>/`` link.
+
+    This passes the relative path ``pinboard-cookies.txt`` to wget's
+    ``--load-cookies``, so it has to be called while that file is in the
+    current working directory, i.e. inside ``wget_context()``.
+    """
+    cache_dir = BACKUP_ROOT / "archive" / cache_id[0] / cache_id
+    cache_path = cache_dir.with_suffix(".tar.gz")
+
+    # If the archive is already downloaded, there's nothing to do.
+    if cache_path.exists():
+        return
+
+    print(f"    ... saving https://pinboard.in/cached/{cache_id}/")
+    print(f"        {url}")
+
+    # Otherwise, start downloading the archive into a temporary directory.
+    # Clear any pending downloads first.
+    #
+    # Creating the directory and then deleting it again makes sure the
+    # parent folders exist, and that nothing is left over from an
+    # earlier, interrupted run.
+    tmp_dir = cache_dir.with_suffix(".tmp")
+    tmp_dir.mkdir(parents=True, exist_ok=True)
+
+    shutil.rmtree(tmp_dir)
+
+    wget(
+        "--adjust-extension",
+        "--span-hosts",
+        "--no-verbose",
+        "--convert-links",
+        "--page-requisites",
+        "--no-directories",
+        "-e",
+        "robots=off",
+        "--load-cookies",
+        "pinboard-cookies.txt",
+        "--output-file",
+        "-",
+        "--directory-prefix",
+        str(tmp_dir),
+        f"https://pinboard.in/cached/{cache_id}/",
+    )
+
+    # Build the tarball under a temporary name, and only move it into
+    # place once it's complete.  If we wrote straight to the final path,
+    # a failed or interrupted run would leave a truncated archive there,
+    # and the "already downloaded" check above would skip it forever.
+    partial_path = cache_path.with_name(cache_path.name + ".partial")
+
+    try:
+        with tarfile.open(partial_path, "w:gz") as tf:
+            tf.add(tmp_dir, arcname=cache_id)
+    except BaseException:
+        # Tidy up the incomplete tarball, then let the error propagate.
+        partial_path.unlink(missing_ok=True)
+        raise
+
+    partial_path.replace(cache_path)
+
+    print(f"        {cache_path}")
+
+    shutil.rmtree(tmp_dir)
+
+
 if __name__ == "__main__":
     username = "alexwlchan"
 
     password = keyring.get_password("pinboard", "password")
     assert password is not None
 
-    json_string = get_bookmarks_json(username, password)
-
     now = datetime.date.today().strftime("%Y-%m-%d")
 
+    # Step 1: save the bookmark metadata, as both a dated snapshot and
+    # an always-current copy.
+    print("*** Getting a JSON copy of my bookmarks data")
+    json_string = get_bookmarks_json(username, password)
+
     for name in (f"bookmarks.{now}.json", "bookmarks.json"):
-        with open(BACKUP_ROOT / name, "w") as outfile:
-            outfile.write(json_string)
+        write_to_file(name, contents=json_string)
+
+    print("")
+
+    # Step 2: save the mapping (bookmarked URL) -> (cache ID).
+    print("*** Getting a list of cache IDs")
+    all_cache_ids = get_cache_ids(username, password)
+
+    for name in (f"cache_ids.{now}.json", "cache_ids.json"):
+        write_to_file(name, contents=json.dumps(all_cache_ids))
+
+    # Read the cache IDs back from the file we just wrote, closing the
+    # file handle once it's been parsed.
+    with open(BACKUP_ROOT / "cache_ids.json") as infile:
+        all_cache_ids = json.load(infile)
+
+    print("")
+
+    # Step 3: download any archived pages we haven't already saved.
+    print("*** Saving archive files using wget")
+
+    with wget_context(username, password):
+        for url, cache_id in all_cache_ids.items():
+            download_single_archive(url, cache_id)