Add the AO3 script

ID

7ccd420

date

2024-02-14 23:10:17+00:00

author

Alex Chan <alex@alexwlchan.net>

parent

38ac6ec

message

Add the AO3 script

changed files

3 files, 123 additions, 21 deletions

web/README.md
web/save_ao3_links.py
web/save_pinboard_bookmarks.py

Changed files

web/README.md (3919) → web/README.md (4353)

diff --git a/web/README.md b/web/README.md
index 0a6518f..491c4a6 100644
--- a/web/README.md
+++ b/web/README.md
@@ -36,6 +36,12 @@ scripts = [
         """
     },
     {
+        "usage": "save_ao3_links.py [URL...]",
+        "description": """
+        save a copy of a story on AO3, including exports in every available format.
+        """
+    },
+    {
         "name": "save_pinboard_bookmarks.py",
         "description": """
         save a complete copy of all my Pinboard bookmarks, including my archive backups.
@@ -93,6 +99,15 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
   </dd>
 
   <dt>
+    <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_ao3_links.py">
+      <code>save_ao3_links.py [URL...]</code>
+    </a>
+  </dt>
+  <dd>
+    save a copy of a story on AO3, including exports in every available format.
+  </dd>
+
+  <dt>
     <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_pinboard_bookmarks.py">
       <code>save_pinboard_bookmarks.py</code>
     </a>
@@ -128,4 +143,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
     this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
   </dd>
 </dl>
-<!-- [[[end]]] (checksum: e326ff2ac898ceecc4bddd204f9318b2) -->
+<!-- [[[end]]] (checksum: a4f4aaedc92d2ce7e499f50a87c39d22) -->

web/save_ao3_links.py (0) → web/save_ao3_links.py (1945)

diff --git a/web/save_ao3_links.py b/web/save_ao3_links.py
new file mode 100755
index 0000000..a8512f0
--- /dev/null
+++ b/web/save_ao3_links.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+
+import os
+import pathlib
+import shutil
+import subprocess
+import sys
+import tarfile
+
+
+BACKUP_ROOT = pathlib.Path("/Volumes/Media (Sapphire)/backups/ao3")
+
+
+def save_ao3_url(url: str):
+    # e.g. 'https://archiveofourown.org/works/1234' ~> '1234'
+    ao3_id = url.split("/")[-1]
+
+    # Check if the fic is already downloaded -- if it is, nothing to do.
+    if any(
+        name.startswith(f"{ao3_id}-") and name.endswith(".tar.gz")
+        for name in os.listdir(BACKUP_ROOT)
+    ):
+        return
+
+    print(f"Saving {url}...")
+
+    # Otherwise, create a temporary directory for the download.
+    #
+    # Delete any partial downloads first.
+    tmp_dir = BACKUP_ROOT / f"{ao3_id}.tmp"
+
+    try:
+        shutil.rmtree(tmp_dir)
+    except FileNotFoundError:
+        pass
+
+    for ext in ["azw", "epub", "mobi", "pdf", "html"]:
+        wget(
+            "--no-verbose",
+            "--output-file",
+            "-",
+            # The Content-Disposition header is sent by the server to say
+            # what the file "should" be called.  By telling wget to respect this,
+            # it means we can request "a.html", the header from AO3 will specify
+            # the correct filename (including the fic title), and the file will
+            # be named correctly.
+            "--content-disposition",
+            "--directory-prefix",
+            tmp_dir,
+            f"https://archiveofourown.org/downloads/{ao3_id}/a.{ext}",
+        )
+
+    try:
+        title = os.listdir(tmp_dir)[0].rsplit(".")[0]
+    except FileNotFoundError:
+        return
+
+    out_path = BACKUP_ROOT / f"{ao3_id}-{title}.tar.gz"
+
+    with tarfile.open(out_path, "w:gz") as tf:
+        tf.add(tmp_dir, arcname=ao3_id)
+
+    shutil.rmtree(tmp_dir)
+
+    print(f" ~> {out_path}")
+
+
+def wget(*args):
+    subprocess.call(["wget"] + list(args), stdout=subprocess.DEVNULL)
+
+
+if __name__ == "__main__":
+    for url in sys.argv[1:]:
+        save_ao3_url(url)

web/save_pinboard_bookmarks.py (5923) → web/save_pinboard_bookmarks.py (6277)

diff --git a/web/save_pinboard_bookmarks.py b/web/save_pinboard_bookmarks.py
index 33f7dc2..1d41c80 100755
--- a/web/save_pinboard_bookmarks.py
+++ b/web/save_pinboard_bookmarks.py
@@ -27,7 +27,7 @@ def write_to_file(name: str, contents: str) -> None:
     path.write_text(contents)
 
 
-def get_bookmarks_json(username: str, password: str) -> str:
+def get_bookmarks_data(username: str, password: str) -> str:
     """
     Call the Pinboard API to get a complete list of my bookmarks.
 
@@ -41,9 +41,7 @@ def get_bookmarks_json(username: str, password: str) -> str:
 
     resp.raise_for_status()
 
-    json_string = json.dumps(resp.json(), indent=2, sort_keys=True)
-
-    return json_string
+    return resp.json()
 
 
 def get_cache_ids(username: str, password: str) -> dict[str, str]:
@@ -191,25 +189,40 @@ if __name__ == "__main__":
     now = datetime.date.today().strftime("%Y-%m-%d")
 
     print("*** Getting a JSON copy of my bookmarks data")
-    json_string = get_bookmarks_json(username, password)
+    bookmarks = get_bookmarks_data(username, password)
+    json_string = json.dumps(bookmarks, indent=2, sort_keys=True)
 
     for name in (f"bookmarks.{now}.json", "bookmarks.json"):
         write_to_file(name, contents=json_string)
 
     print("")
 
-    print("*** Getting a list of cache IDs")
-    all_cache_ids = get_cache_ids(username, password)
-
-    for name in (f"cache_ids.{now}.json", "cache_ids.json"):
-        write_to_file(name, contents=json.dumps(all_cache_ids))
-
-    all_cache_ids = json.load(open(BACKUP_ROOT / "cache_ids.json"))
-
-    print("")
-
-    print("*** Saving archive files using wget")
-
-    with wget_context(username, password):
-        for url, cache_id in all_cache_ids.items():
-            download_single_archive(url, cache_id)
+    # print("*** Getting a list of cache IDs")
+    # all_cache_ids = get_cache_ids(username, password)
+    #
+    # for name in (f"cache_ids.{now}.json", "cache_ids.json"):
+    #     write_to_file(name, contents=json.dumps(all_cache_ids))
+    #
+    # all_cache_ids = json.load(open(BACKUP_ROOT / "cache_ids.json"))
+    #
+    # print("")
+    #
+    # print("*** Saving archive files using wget")
+    #
+    # with wget_context(username, password):
+    #     for url, cache_id in all_cache_ids.items():
+    #         download_single_archive(url, cache_id)
+    #
+    # print("")
+
+    print("*** Saving stories from AO3")
+
+    ao3_urls = [b["href"] for b in bookmarks if "archiveofourown.org" in b["href"]]
+
+    subprocess.check_call(
+        [
+            "python3",
+            "/Users/alexwlchan/repos/scripts/web/save_ao3_links.py",
+        ]
+        + ao3_urls
+    )