Merge pull request #13 from alexwlchan/save-pinboard-bookmarks

ID

56686e1

date

2024-02-14 23:28:21+00:00

author

Alex Chan <alex@alexwlchan.net>

parents

e95f0ad, 0deb9c3

message

Merge pull request #13 from alexwlchan/save-pinboard-bookmarks

[WIP] Add my script for backing up Pinboard bookmarks

changed files

5 files, 363 additions, 1 deletion

.github/workflows/test.yml
web/README.md
web/save_ao3_links.py
web/save_pinboard_bookmarks.py
web/test_save_ao3_links.py

Changed files

.github/workflows/test.yml (1190) → .github/workflows/test.yml (1233)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 3171640..6356a05 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -44,3 +44,4 @@ jobs:
         py.test aws/test_s3tree.py
         py.test text/test_fix_twitter_thread.py
         py.test textexpander/test_get_mastodon_text.py
+        py.test web/test_save_ao3_links.py

web/README.md (3467) → web/README.md (4353)

diff --git a/web/README.md b/web/README.md
index 5ada617..491c4a6 100644
--- a/web/README.md
+++ b/web/README.md
@@ -36,6 +36,18 @@ scripts = [
         """
     },
     {
+        "usage": "save_ao3_links.py [URL...]",
+        "description": """
+        save a copy of a story on AO3, including exports in every available format.
+        """
+    },
+    {
+        "name": "save_pinboard_bookmarks.py",
+        "description": """
+        save a complete copy of all my Pinboard bookmarks, including my archive backups.
+        """
+    },
+    {
         "name": "save_tumblr_likes.py",
         "description": """
         save a copy of all the posts I've liked on Tumblr to my backup drive.
@@ -87,6 +99,24 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
   </dd>
 
   <dt>
+    <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_ao3_links.py">
+      <code>save_ao3_links.py [URL...]</code>
+    </a>
+  </dt>
+  <dd>
+    save a copy of a story on AO3, including exports in every available format.
+  </dd>
+
+  <dt>
+    <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_pinboard_bookmarks.py">
+      <code>save_pinboard_bookmarks.py</code>
+    </a>
+  </dt>
+  <dd>
+    save a complete copy of all my Pinboard bookmarks, including my archive backups.
+  </dd>
+
+  <dt>
     <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_tumblr_likes.py">
       <code>save_tumblr_likes.py</code>
     </a>
@@ -113,4 +143,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
     this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
   </dd>
 </dl>
-<!-- [[[end]]] (checksum: 0d41d07b707085af6913de92b942a152) -->
+<!-- [[[end]]] (checksum: a4f4aaedc92d2ce7e499f50a87c39d22) -->

web/save_ao3_links.py (0) → web/save_ao3_links.py (2160)

diff --git a/web/save_ao3_links.py b/web/save_ao3_links.py
new file mode 100755
index 0000000..b1d9f37
--- /dev/null
+++ b/web/save_ao3_links.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+
+import os
+import pathlib
+import shutil
+import subprocess
+import sys
+import tarfile
+
+import hyperlink
+
+
+BACKUP_ROOT = pathlib.Path("/Volumes/Media (Sapphire)/backups/ao3")
+
+
+def get_ao3_id(url: str) -> str:
+    # e.g. 'https://archiveofourown.org/works/1234' ~> '1234'
+    u = hyperlink.DecodedURL.from_text(url)
+
+    if u.path[0] == "works" and u.path[1].isnumeric():
+        return u.path[1]
+    else:
+        raise ValueError(url)
+
+
+def save_ao3_url(url: str):
+    ao3_id = get_ao3_id(url)
+
+    # Check if the fic is already downloaded -- if it is, nothing to do.
+    if any(
+        name.startswith(f"{ao3_id}-") and name.endswith(".tar.gz")
+        for name in os.listdir(BACKUP_ROOT)
+    ):
+        return
+
+    print(f"Saving {url}...")
+
+    # Otherwise, create a temporary directory for the download.
+    #
+    # Delete any partial downloads first.
+    tmp_dir = BACKUP_ROOT / f"{ao3_id}.tmp"
+
+    try:
+        shutil.rmtree(tmp_dir)
+    except FileNotFoundError:
+        pass
+
+    for ext in ["azw", "epub", "mobi", "pdf", "html"]:
+        wget(
+            "--no-verbose",
+            "--output-file",
+            "-",
+            # The Content-Disposition header is sent by the server to say
+            # what the file "should" be called.  By telling wget to respect this,
+            # it means we can request "a.html", the header from AO3 will specify
+            # the correct filename (including the fic title), and the file will
+            # be named correctly.
+            "--content-disposition",
+            "--directory-prefix",
+            tmp_dir,
+            f"https://archiveofourown.org/downloads/{ao3_id}/a.{ext}",
+        )
+
+    try:
+        title = os.listdir(tmp_dir)[0].rsplit(".")[0]
+    except FileNotFoundError:
+        return
+
+    out_path = BACKUP_ROOT / f"{ao3_id}-{title}.tar.gz"
+
+    with tarfile.open(out_path, "w:gz") as tf:
+        tf.add(tmp_dir, arcname=ao3_id)
+
+    shutil.rmtree(tmp_dir)
+
+    print(f" ~> {out_path}")
+
+
+def wget(*args):
+    subprocess.call(["wget"] + list(args), stdout=subprocess.DEVNULL)
+
+
+if __name__ == "__main__":
+    for url in sys.argv[1:]:
+        save_ao3_url(url)

web/save_pinboard_bookmarks.py (0) → web/save_pinboard_bookmarks.py (6277)

diff --git a/web/save_pinboard_bookmarks.py b/web/save_pinboard_bookmarks.py
new file mode 100755
index 0000000..1d41c80
--- /dev/null
+++ b/web/save_pinboard_bookmarks.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+
+import contextlib
+import datetime
+import json
+import os
+import pathlib
+import shutil
+import subprocess
+import tarfile
+import tempfile
+
+import bs4
+import httpx
+import keyring
+
+
+BACKUP_ROOT = pathlib.Path("/Volumes/Media (Sapphire)/backups/pinboard")
+
+
+def write_to_file(name: str, contents: str) -> None:
+    """
+    Write a string to a text file, and log that you're doing it.
+    """
+    path = BACKUP_ROOT / name
+    print(f"    ~> {path}")
+    path.write_text(contents)
+
+
+def get_bookmarks_data(username: str, password: str) -> str:
+    """
+    Call the Pinboard API to get a complete list of my bookmarks.
+
+    Return the parsed JSON response; the caller pretty-prints it.
+    """
+    resp = httpx.get(
+        "https://api.pinboard.in/v1/posts/all",
+        params={"format": "json"},
+        auth=(username, password),
+    )
+
+    resp.raise_for_status()
+
+    return resp.json()
+
+
+def get_cache_ids(username: str, password: str) -> dict[str, str]:
+    """
+    Get a list of cache IDs for bookmarks in my account.
+
+    These are the URLs where Pinboard takes archived snapshots of
+    web pages, e.g. https://pinboard.in/cached/1234567890/
+
+    Returns a dict (bookmarked URL) -> (cache ID).
+    """
+    # Start by logging in to Pinboard, so we have the appropriate
+    # cookies in our session.
+    client = httpx.Client(follow_redirects=True)
+
+    resp = client.post(
+        "https://pinboard.in/auth/", data={"username": username, "password": password}
+    )
+    resp.raise_for_status()
+
+    # Now start fetching cache IDs from my account.
+    cache_ids: dict[str, str] = {}
+
+    url = f"https://pinboard.in/u:{username}"
+
+    while True:
+        print(f"    ... fetching cache IDs from {url}")
+        resp = client.get(url, params={"per_page": "160"})
+        resp.raise_for_status()
+
+        soup = bs4.BeautifulSoup(resp.text, "html.parser")
+
+        # The structure of the page is of the form:
+        #
+        #     <div id="bookmarks">
+        #       <div class="bookmark">
+        #         <a class="bookmark_title" href="http://example.net">Example</a>
+        #         <a class="cached" href="/cached/1234567890/">☑</a>
+        #         …
+        #
+        bookmarks_div = soup.find("div", attrs={"id": "bookmarks"})
+        bookmarks = bookmarks_div.find_all("div", attrs={"class": "bookmark"})
+
+        for b in bookmarks:
+            href = b.find("a", attrs={"class": "bookmark_title"}).attrs["href"]
+            cache_link = b.find("a", attrs={"class": "cached"})
+
+            if cache_link is None:
+                continue
+
+            this_cache_id = cache_link.attrs["href"].split("/")[-2]
+
+            cache_ids[href] = this_cache_id
+
+        # The pagination link, if present, will be something like:
+        #
+        #      <a class="next_prev" href="/u:alexwlchan/before:1234">« earlier</a>
+        #
+        pagination_link = soup.find("a", attrs={"class": "next_prev"})
+
+        if "earlier" not in pagination_link.text:
+            break
+
+        url = "https://pinboard.in" + pagination_link.attrs["href"]
+
+    return cache_ids
+
+
+def wget(*args):
+    subprocess.call(["wget"] + list(args), stdout=subprocess.DEVNULL)
+
+
+@contextlib.contextmanager
+def wget_context(username: str, password: str):
+    with tempfile.TemporaryDirectory() as tmpdir:
+        os.chdir(tmpdir)
+
+        wget(
+            "--save-cookies",
+            "pinboard-cookies.txt",
+            "--keep-session-cookies",
+            "--delete-after",
+            "--output-file",
+            "-",
+            "--post-data",
+            f"username={username}&password={password}",
+            "https://pinboard.in/auth/",
+        )
+
+        yield
+
+        os.unlink("pinboard-cookies.txt")
+
+
+def download_single_archive(url: str, cache_id: str):
+    cache_dir = BACKUP_ROOT / "archive" / cache_id[0] / cache_id
+    cache_path = cache_dir.with_suffix(".tar.gz")
+
+    # If the archive is already downloaded, there's nothing to do.
+    if cache_path.exists():
+        return
+
+    print(f"    ... saving https://pinboard.in/cached/{cache_id}/")
+    print(f"        {url}")
+
+    # Otherwise, start downloading the archive into a temporary directory.
+    # Clear any pending downloads first.
+    tmp_dir = cache_dir.with_suffix(".tmp")
+    tmp_dir.mkdir(parents=True, exist_ok=True)
+
+    shutil.rmtree(tmp_dir)
+
+    wget(
+        "--adjust-extension",
+        "--span-hosts",
+        "--no-verbose",
+        "--convert-links",
+        "--page-requisites",
+        "--no-directories",
+        "-e",
+        "robots=off",
+        "--load-cookies",
+        "pinboard-cookies.txt",
+        "--output-file",
+        "-",
+        "--directory-prefix",
+        str(tmp_dir),
+        f"https://pinboard.in/cached/{cache_id}/",
+    )
+
+    with tarfile.open(cache_path, "w:gz") as tf:
+        tf.add(tmp_dir, arcname=cache_id)
+
+    print(f"        {cache_path}")
+
+    shutil.rmtree(tmp_dir)
+
+
+if __name__ == "__main__":
+    username = "alexwlchan"
+
+    password = keyring.get_password("pinboard", "password")
+    assert password is not None
+
+    now = datetime.date.today().strftime("%Y-%m-%d")
+
+    print("*** Getting a JSON copy of my bookmarks data")
+    bookmarks = get_bookmarks_data(username, password)
+    json_string = json.dumps(bookmarks, indent=2, sort_keys=True)
+
+    for name in (f"bookmarks.{now}.json", "bookmarks.json"):
+        write_to_file(name, contents=json_string)
+
+    print("")
+
+    # print("*** Getting a list of cache IDs")
+    # all_cache_ids = get_cache_ids(username, password)
+    #
+    # for name in (f"cache_ids.{now}.json", "cache_ids.json"):
+    #     write_to_file(name, contents=json.dumps(all_cache_ids))
+    #
+    # all_cache_ids = json.load(open(BACKUP_ROOT / "cache_ids.json"))
+    #
+    # print("")
+    #
+    # print("*** Saving archive files using wget")
+    #
+    # with wget_context(username, password):
+    #     for url, cache_id in all_cache_ids.items():
+    #         download_single_archive(url, cache_id)
+    #
+    # print("")
+
+    print("*** Saving stories from AO3")
+
+    ao3_urls = [b["href"] for b in bookmarks if "archiveofourown.org" in b["href"]]
+
+    subprocess.check_call(
+        [
+            "python3",
+            "/Users/alexwlchan/repos/scripts/web/save_ao3_links.py",
+        ]
+        + ao3_urls
+    )

web/test_save_ao3_links.py (0) → web/test_save_ao3_links.py (451)

diff --git a/web/test_save_ao3_links.py b/web/test_save_ao3_links.py
new file mode 100644
index 0000000..b3b5b92
--- /dev/null
+++ b/web/test_save_ao3_links.py
@@ -0,0 +1,18 @@
+import pytest
+
+from save_ao3_links import get_ao3_id
+
+
+@pytest.mark.parametrize(
+    ["url", "ao3_id"],
+    [
+        ("https://archiveofourown.org/works/1234", "1234"),
+        ("https://archiveofourown.org/works/1234?view_adult=true", "1234"),
+        (
+            "https://archiveofourown.org/works/1234/chapters/5678?view_adult=true",
+            "1234",
+        ),
+    ],
+)
+def test_get_ao3_id(url, ao3_id):
+    assert get_ao3_id(url) == ao3_id