Add a script for tidying up my Twitter threads
- ID
4b4a354- date
2023-12-17 21:03:53+00:00- author
Alex Chan <alex@alexwlchan.net>- parent
98bb938- message
Add a script for tidying up my Twitter threads- changed files
Changed files
.github/workflows/test.yml (1068) → .github/workflows/test.yml (1116)
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e58febd..be9b28e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -38,4 +38,5 @@ jobs:
- name: Run tests
run: |
py.test aws/test_s3tree.py
+ py.test text/test_fix_twitter_thread.py
py.test textexpander/test_get_mastodon_text.py
text/README.md (2891) → text/README.md (3182)
diff --git a/text/README.md b/text/README.md
index 436749b..7877713 100644
--- a/text/README.md
+++ b/text/README.md
@@ -15,6 +15,14 @@ These are utilities for manipulating streams of text; I consider them in a simil
This is meant to fill a gap between the Unix utilities <code>head</code> and <code>tail</code>.
</dd>
+ <dt>
+ <a href="https://github.com/alexwlchan/scripts/blob/main/text/fix_twitter_thread.py">
+ <code>fix_twitter_thread.py [PATH]</code>
+ </a>
+ </dt>
+ <dd>
+ when I copy/paste a Twitter thread into Obsidian, this does some initial tidying up of the formatting for me.
+ </dd>
<dt>
<a href="https://github.com/alexwlchan/scripts/blob/main/text/midline">
text/fix_twitter_thread.py (0) → text/fix_twitter_thread.py (5137)
diff --git a/text/fix_twitter_thread.py b/text/fix_twitter_thread.py
new file mode 100755
index 0000000..05051fa
--- /dev/null
+++ b/text/fix_twitter_thread.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+
+import datetime
+import os
+import re
+import sys
+
+import httpx
+import termcolor
+
+
+# A regex to find links to somebody's Twitter profile
+#
+# e.g. (https://twitter.com/BooksandChokers)
+PROFILE_URL_RE = re.compile(r"\(https://twitter\.com/(?P<username>[^\)^/]+)\)")
+
+
+def fix_emoji(text: str) -> str:
+ """
+ Replace any Twemoji in the text with the actual emoji characters.
+ """
+ # A regex to find emoji characters which have been replaced with twimg's
+ #
+ # e.g. 
+ # e.g. 
+ emoji_twimg_re = re.compile(
+ r"!\[(?P<emoji>[^\]]+)\]"
+ r'\(https://abs\-0\.twimg\.com/emoji/v2/svg/[0-9a-f\-]+\.svg(?: "[A-Za-z]+")?\)'
+ )
+
+ return emoji_twimg_re.sub(repl=r"\g<emoji>", string=text)
+
+
+def fix_hashtags(text: str) -> str:
+ """
+ Replace any hashtag links in the text with literal text.
+ """
+ # A regex to find hashtag links
+ #
+ # e.g. [#akindofspark](https://twitter.com/hashtag/akindofspark?src=hashtag_click)
+ hashtag_re = re.compile(
+ r"\[#(?P<hashtag>[a-zA-Z][a-zA-Z0-9]+)\]"
+ r"\(https://twitter\.com/hashtag/[a-zA-Z][a-zA-Z0-9]+\?src=hashtag_click\)"
+ )
+
+ return hashtag_re.sub(repl=r"\\#\g<hashtag>", string=text)
+
+
+def get_profile_photo_re(handle: str) -> re.Pattern:
+ # A regex to find links to somebody's profile that uses their profile
+ # picture in the body of the link, e.g.
+ #
+ # [
+ #
+ # 
+ #
+ # ](https://twitter.com/BooksandChokers)
+ #
+ return re.compile(
+ r"\[\n"
+ r"\n"
+ r"!\[\]\(https://pbs\.twimg\.com/profile_images/[0-9]+/[a-zA-Z0-9_\.]+\)\n"
+ r"\n"
+ r"\]\(https://twitter\.com/" + handle + r"\)\n"
+ )
+
+
+def remove_profile_links(text: str, /, *, handle: str) -> str:
+ # Remove any profile links that are on a single line, e.g.
+ #
+ # [Elle McNicoll ✍🏻📚](https://twitter.com/BooksandChokers)
+ text = re.sub(
+ r"\[[^\]]+\]\(https://twitter\.com/" + handle + r"\)" + "\xa0\n",
+ repl="",
+ string=text,
+ )
+
+ # Remove any profile links that are spread across multiple lines, e.g.
+ #
+ # [
+ #
+ # @BooksandChokers
+ #
+ # ](https://twitter.com/BooksandChokers)
+ #
+ # .
+ text = re.sub(
+ r"\[\n\n@"
+ + handle
+ + r"\n\n\]\(https://twitter\.com/"
+ + handle
+ + r"\)\n\n(?:·\n)?",
+ repl="",
+ string=text,
+ )
+
+ return text
+
+
+def remove_individual_tweet_links(text: str, /, *, handle: str) -> str:
+ # Remove any links to individual tweets in the thread,
+ #
+ # e.g. [10 Jun](https://twitter.com/BooksandChokers/status/1667617023801839616)
+ return re.sub(
+ r"\[[0-9]+ [A-Z][a-z]+\]"
+ r"\(https://twitter\.com/" + handle + r"/status/[0-9]+\)\n",
+ repl="",
+ string=text,
+ )
+
+
+def download_images(text: str, /, *, handle: str) -> str:
+ # Download images and save them to Obsidian
+
+ image_match = re.compile(
+ r"!\[(?P<alt_text>[^\]]*)\]"
+ r"\((?P<url>https://pbs\.twimg\.com/media/(?P<media_id>[a-zA-Z0-9]+)\?format=(?P<format>jpg))\&name=(?P<size>small|medium)\)"
+ )
+
+ for m in image_match.finditer(text):
+ url = m.group("url")
+ out_name = m.group("media_id") + "." + m.group("format")
+ out_path = os.path.join(
+ os.environ["HOME"],
+ "textfiles",
+ "Attachments",
+ str(datetime.datetime.now().year),
+ out_name,
+ )
+
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
+
+ resp = httpx.get(url)
+ resp.raise_for_status()
+
+ with open(out_path, "xb") as f:
+ f.write(resp.content)
+
+ alt_text = m.group("alt_text")
+
+ if alt_text != "Image":
+ text = text.replace(m.group(0), f"![[{out_name}|{alt_text}]]")
+ else:
+ text = text.replace(m.group(0), f"![[{out_name}]]")
+
+ # Remove the lingering bits from the image link
+ text = text.replace("[\n", "")
+ text = re.sub(
+ r"\]\(https://twitter\.com/" + handle + r"/status/[0-9]+/photo/[0-9]\)\n",
+ repl="",
+ string=text,
+ )
+
+ return text
+
+
+if __name__ == "__main__":
+ try:
+ path = sys.argv[1]
+ except IndexError:
+ sys.exit(f"Usage: {__file__} <PATH>")
+
+ new_lines = []
+ last_line = None
+
+ with open(path) as in_file:
+ text = in_file.read()
+
+ handle = re.search(PROFILE_URL_RE, text).group("username")
+ print("Detected handle as", termcolor.colored(handle, "blue"))
+
+ profile_photo_re = get_profile_photo_re(handle)
+ text = profile_photo_re.sub(repl="", string=text)
+
+ text = fix_emoji(text)
+ text = fix_hashtags(text)
+ text = remove_profile_links(text, handle=handle)
+ text = remove_individual_tweet_links(text, handle=handle)
+ text = download_images(text, handle=handle)
+ text = re.sub("\n\n", "\n", text)
+
+ with open(path, "w") as out_file:
+ out_file.write(text)
text/test_fix_twitter_thread.py (0) → text/test_fix_twitter_thread.py (754)
diff --git a/text/test_fix_twitter_thread.py b/text/test_fix_twitter_thread.py
new file mode 100644
index 0000000..fa9759e
--- /dev/null
+++ b/text/test_fix_twitter_thread.py
@@ -0,0 +1,27 @@
+import pytest
+
+from fix_twitter_thread import fix_emoji, remove_profile_links
+
+
+@pytest.mark.parametrize(
+ ["input", "output"],
+ [
+ ("", "✍🏻"),
+ (
+ '',
+ "✍🏻📚",
+ ),
+ ],
+)
+def test_fix_emoji(input: str, output: str) -> None:
+ assert fix_emoji(input) == output
+
+
+def test_remove_profile_links() -> None:
+ assert (
+ remove_profile_links(
+ "[Elle McNicoll ✍🏻📚](https://twitter.com/BooksandChokers) \n",
+ handle="BooksandChokers",
+ )
+ == ""
+ )