Skip to main content

text: remove the smartypants dependency

ID
7452067
date
2026-05-09 07:11:09+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
d7cced9
message
text: remove the `smartypants` dependency
changed files
8 files, 366 additions, 43 deletions

Changed files

CHANGELOG.md (4622) → CHANGELOG.md (4700)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 43514f7..9fae1a2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # CHANGELOG
 
+## v38 - 2026-05-09
+
+Remove the `smartypants` dependency from `chives.text`.
+
 ## v37 - 2026-05-09
 
 Remove the `rapidfuzz` dependency from `chives.static_site_tests`.

dev_requirements.txt (1235) → dev_requirements.txt (1189)

diff --git a/dev_requirements.txt b/dev_requirements.txt
index 21e5b23..6e133fb 100644
--- a/dev_requirements.txt
+++ b/dev_requirements.txt
@@ -2,27 +2,27 @@
 #    uv pip compile dev_requirements.in --output-file=dev_requirements.txt --exclude-newer=P7D --exclude-newer-package alexwlchan-chives=false
 -e file:.
     # via -r dev_requirements.in
-certifi==2026.2.25
+certifi==2026.4.22
     # via alexwlchan-chives
 coverage==7.13.5
     # via pytest-cov
-greenlet==3.3.2
+greenlet==3.5.0
     # via playwright
 iniconfig==2.3.0
     # via pytest
-librt==0.8.1
+librt==0.9.0
     # via mypy
-mypy==1.20.0
+mypy==1.20.2
     # via -r dev_requirements.in
 mypy-extensions==1.1.0
     # via mypy
-packaging==26.0
+packaging==26.2
     # via pytest
-pathspec==1.0.4
+pathspec==1.1.1
     # via mypy
 pillow==12.2.0
     # via alexwlchan-chives
-playwright==1.58.0
+playwright==1.59.0
     # via alexwlchan-chives
 pluggy==1.6.0
     # via
@@ -32,7 +32,7 @@ pyee==13.0.1
     # via playwright
 pygments==2.20.0
     # via pytest
-pytest==9.0.2
+pytest==9.0.3
     # via
     #   alexwlchan-chives
     #   pytest-cov
@@ -43,10 +43,8 @@ pytest-vcr==1.0.2
     # via -r dev_requirements.in
 pyyaml==6.0.3
     # via vcrpy
-ruff==0.15.9
+ruff==0.15.12
     # via -r dev_requirements.in
-smartypants==2.0.2
-    # via alexwlchan-chives
 typing-extensions==4.15.0
     # via
     #   mypy

pyproject.toml (1497) → pyproject.toml (1474)

diff --git a/pyproject.toml b/pyproject.toml
index 6581b3a..042f2bf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,6 @@ license = "MIT"
 fetch = ["certifi"]
 media = ["Pillow"]
 static_site_tests = ["playwright", "pytest"]
-text = ["smartypants"]
 urls = ["certifi"]
 
 [project.urls]

src/chives/__init__.py (391) → src/chives/__init__.py (391)

diff --git a/src/chives/__init__.py b/src/chives/__init__.py
index 6a441e1..d67a6f0 100644
--- a/src/chives/__init__.py
+++ b/src/chives/__init__.py
@@ -11,4 +11,4 @@ I share across multiple sites.
 
 """
 
-__version__ = "37"
+__version__ = "38"

src/chives/smartypants.py (0) → src/chives/smartypants.py (10516)

diff --git a/src/chives/smartypants.py b/src/chives/smartypants.py
new file mode 100644
index 0000000..9224f63
--- /dev/null
+++ b/src/chives/smartypants.py
@@ -0,0 +1,319 @@
+"""
+A Python implementation of SmartyPants, a tool for adding "smart punctuation"
+to text -- for example, curly quotes and smart dashes.
+
+This is based on SmartyPants.pl, a BSD-licensed Perl script written
+by John Gruber. See https://daringfireball.net/projects/smartypants/
+
+It is also based on SmartyPants.py, a Python fork of SmartyPants
+maintained by Chad Miller, Yu-Jie Lin, Leo Hemsted, and Justin Mayer.
+See https://pypi.org/project/smartypants/
+
+This module is not intended to be used directly -- use `chives.text` instead.
+"""
+
+from collections.abc import Iterator
+import re
+from typing import Literal, NamedTuple
+
+
+def smartypants(text: str) -> str:
+    """
+    Add smart punctuation to a piece of text.
+    """
+    skipped_tag_stack: list[str] = []
+    in_pre = False
+    result: list[str] = []
+
+    # Preserve context for single-character quote tokens. Remember the last
+    # character of the previous token, so we can curl them correctly.
+    prev_token_last_char = ""
+
+    for token in tokenize(text):
+        if token.type == "tag":
+            # Don't mess with quotes inside some tags.  This does not handle
+            # self <closing/> tags!
+            result.append(token.value)
+
+            if skip_match := TAGS_TO_SKIP_RE.match(token.value):
+                is_closing = bool(skip_match.group("closing"))
+                tag_name = skip_match.group("tag_name").lower()
+
+                if not is_closing:
+                    skipped_tag_stack.append(tag_name)
+                    in_pre = True
+                else:
+                    if skipped_tag_stack:
+                        if tag_name == skipped_tag_stack[-1]:
+                            skipped_tag_stack.pop()
+                        else:  # pragma: no cover
+                            assert 0
+                            pass
+                            # This close doesn't match the open.  This isn't
+                            # XHTML.  We should barf here.
+                    else:  # pragma: no cover
+                        pass
+                    if not skipped_tag_stack:
+                        in_pre = False
+
+        else:
+            text = token.value
+            # Remember the last character of this token before processing.
+            last_char = text[-1:]
+            if not in_pre:
+                # Process escaped characters; they shouldn't have smart
+                # punctuation added.
+                text = re.sub(r"\\\\", "&#92;", text)
+                text = re.sub(r'\\"', "&#34;", text)
+                text = re.sub(r"\\'", "&#39;", text)
+                text = re.sub(r"\\\.", "&#46;", text)
+                text = re.sub(r"\\-", "&#45;", text)
+                text = re.sub(r"\\`", "&#96;", text)
+
+                # Convert quote entities back to regular quotes
+                text = re.sub("&quot;", '"', text)
+
+                # Convert dashes
+                text = re.sub("---", "—", text)
+                text = re.sub("--", "–", text)
+
+                # Convert ellipses
+                text = re.sub(r"\.\.\.", "…", text)
+                text = re.sub(r"\. \. \.", "…", text)
+
+                if text == "'":
+                    # Special case: single-character ' token
+                    if re.match(r"\S", prev_token_last_char):  # pragma: no cover
+                        text = "’"
+                    else:
+                        text = "‘"
+                elif text == '"':
+                    # Special case: single-character " token
+                    if re.match(r"\S", prev_token_last_char):  # pragma: no cover
+                        text = "”"
+                    else:
+                        text = "“"
+
+                else:
+                    text = convert_quotes(text)
+
+                # Convert entities
+                CTBL = {
+                    "&#8211;": "–",
+                    "&#8212;": "—",
+                    "&#8216;": "‘",
+                    "&#8217;": "’",
+                    "&#8220;": "“",
+                    "&#8221;": "”",
+                    "&#8230;": "…",
+                }
+
+                for k, v in CTBL.items():
+                    text = text.replace(k, v)
+
+            prev_token_last_char = last_char
+            result.append(text)
+
+    return "".join(result)
+
+
+def convert_quotes(text: str) -> str:
+    """
+    Convert quotes in *text* into HTML curly quote entities.
+
+    This is based on a function of the same name from the Python
+    SmartyPants.py library.
+
+    TODO: Use named entities.
+    """
+    punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""
+
+    # Special case if the very first character is a quote followed by
+    # punctuation at a non-word-break. Close the quotes by brute force:
+    text = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), "&#8217;", text)
+    text = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), "&#8221;", text)
+
+    # Special case for double sets of quotes, e.g.:
+    #   <p>He said, "'Quoted' words in a larger quote."</p>
+    text = re.sub(r""""'(?=\w)""", "&#8220;&#8216;", text)
+    text = re.sub(r"""'"(?=\w)""", "&#8216;&#8220;", text)
+
+    # Special case for decade abbreviations (the '80s):
+    text = re.sub(r"""\b'(?=\d{2}s)""", "&#8217;", text)
+
+    close_class = r"[^\ \t\r\n\[\{\(\-]"
+    dec_dashes = "&#8211;|&#8212;"
+
+    # Get most opening single quotes:
+    opening_single_quotes_regex = re.compile(
+        r"""
+            (
+                \s          |   # a whitespace char, or
+                &nbsp;      |   # a non-breaking space entity, or
+                --          |   # dashes, or
+                &[mn]dash;  |   # named dash entities
+                %s          |   # or decimal entities
+                &\#x201[34];    # or hex
+            )
+            '                 # the quote
+            (?=\w)            # followed by a word character
+            """
+        % (dec_dashes,),
+        re.VERBOSE,
+    )
+    text = opening_single_quotes_regex.sub(r"\1&#8216;", text)
+
+    closing_single_quotes_regex = re.compile(
+        r"""
+            (%s)
+            '
+            (?!\s | s\b | \d)
+            """
+        % (close_class,),
+        re.VERBOSE,
+    )
+    text = closing_single_quotes_regex.sub(r"\1&#8217;", text)
+
+    closing_single_quotes_regex = re.compile(
+        r"""
+            (%s)
+            '
+            (\s | s\b)
+            """
+        % (close_class,),
+        re.VERBOSE,
+    )
+    text = closing_single_quotes_regex.sub(r"\1&#8217;\2", text)
+
+    # Any remaining single quotes should be opening ones:
+    text = re.sub("'", "&#8216;", text)
+
+    # Get most opening double quotes:
+    opening_double_quotes_regex = re.compile(
+        r"""
+            (
+                \s          |   # a whitespace char, or
+                &nbsp;      |   # a non-breaking space entity, or
+                --          |   # dashes, or
+                &[mn]dash;  |   # named dash entities
+                %s          |   # or decimal entities
+                &\#x201[34];    # or hex
+            )
+            "                 # the quote
+            (?=\w)            # followed by a word character
+            """
+        % (dec_dashes,),
+        re.VERBOSE,
+    )
+    text = opening_double_quotes_regex.sub(r"\1&#8220;", text)
+
+    # Double closing quotes:
+    closing_double_quotes_regex = re.compile(
+        r"""
+            #(%s)?   # character that indicates the quote should be closing
+            "
+            (?=\s)
+            """
+        % (close_class,),
+        re.VERBOSE,
+    )
+    text = closing_double_quotes_regex.sub("&#8221;", text)
+
+    closing_double_quotes_regex = re.compile(
+        r"""
+            ^
+            "
+            (?=%s)
+            """
+        % (punct_class,),
+        re.VERBOSE,
+    )
+    text = closing_double_quotes_regex.sub("&#8221;", text)
+
+    closing_double_quotes_regex = re.compile(
+        r"""
+            (%s)   # character that indicates the quote should be closing
+            "
+            """
+        % (close_class,),
+        re.VERBOSE,
+    )
+    text = closing_double_quotes_regex.sub(r"\1&#8221;", text)
+
+    # Any remaining quotes should be opening ones.
+    text = re.sub('"', "&#8220;", text)
+
+    return text
+
+
+# This regex matches opening or closing HTML tags for any of these tag names.
+tags_to_skip = "|".join(["pre", "samp", "code", "tt", "kbd", "script", "style", "math"])
+TAGS_TO_SKIP_RE = re.compile(
+    r"<(?P<closing>/)?(?P<tag_name>%s)[^>]*>" % tags_to_skip, re.IGNORECASE
+)
+
+
+class Token(NamedTuple):
+    """
+    A token in the input text.
+
+    A token is either:
+
+    -   a tag, possibly with nested tags, such as `<a href="<example>">`, or
+    -   a run of text between tags.
+
+    """
+
+    type: Literal["text", "tag"]
+    value: str
+
+
+def tokenize(text: str) -> Iterator[Token]:
+    """
+    Find all the tokens in the input text.
+
+    This is based on two existing functions:
+
+    *   The `_tokenize()` subroutine from Brad Choate's MTRegex plugin.
+        http://www.bradchoate.com/past/mtregex.php
+    *   The `_tokenize()` function from the SmartyPants.py library.
+        https://github.com/justinmayer/smartypants.py/blob/main/smartypants.py
+    """
+    # Matches text outside an HTML tag followed by a comment or a tag.
+    tag_soup = re.compile(
+        r"(?P<text>[^<]*)"
+        r"(?P<tag><!--(?P<comment>.*?)--\s*>|<[^>]*>)",
+        re.DOTALL,
+    )
+
+    previous_end = 0
+
+    while match := tag_soup.match(text, previous_end):
+        if match.group("text"):
+            yield Token(type="text", value=match.group("text"))
+
+        # If the text of a comment contains "--", it is not a valid HTML
+        # comment, so it should be treated as plain text and converted.
+        #
+        # In HTML4 [1]:
+        #   [...] Authors should avoid putting two or more adjacent hyphens
+        #   inside comments.
+        #
+        # In HTML5 [2]:
+        #   [...] the comment may have text, with the additional restriction
+        #   that the text must not [...], nor contain two consecutive U+002D
+        #   HYPHEN-MINUS characters (--)
+        #
+        # [1]: http://www.w3.org/TR/REC-html40/intro/sgmltut.html#h-3.2.4
+        # [2]: http://www.w3.org/TR/html5/syntax.html#comments
+        tag = match.group("tag")
+        if match.group("comment") and "--" in match.group("comment"):
+            yield Token(type="text", value=tag)
+        else:
+            yield Token(type="tag", value=tag)
+
+        previous_end = match.end()
+        print(previous_end)
+
+    if previous_end < len(text):
+        yield Token(type="text", value=text[previous_end:])

src/chives/text.py (721) → src/chives/text.py (215)

diff --git a/src/chives/text.py b/src/chives/text.py
index 2ef2465..8b461a7 100644
--- a/src/chives/text.py
+++ b/src/chives/text.py
@@ -2,31 +2,11 @@
 Functions for dealing with text.
 """
 
-import functools
+from chives.smartypants import smartypants
 
-import smartypants
 
-
-@functools.cache
 def smartify(text: str) -> str:
     """
     Add curly quotes and smart dashes to a string.
     """
-    attrs = (
-        # normal quotes (" and ') to curly ones
-        smartypants.Attr.q
-        |
-        # quote entities (&quot;) to curly quotes
-        smartypants.Attr.w
-        |
-        # typewriter dashes (--) to en-dashes and dashes (---) to em-dashes
-        smartypants.Attr.D
-        |
-        # dashes (...) to ellipses
-        smartypants.Attr.e
-        |
-        # output Unicode chars instead of numeric character references
-        smartypants.Attr.u
-    )
-
-    return smartypants.smartypants(text, attrs)
+    return smartypants(text)

tests/stubs/smartypants.pyi (119) → tests/stubs/smartypants.pyi (0)

diff --git a/tests/stubs/smartypants.pyi b/tests/stubs/smartypants.pyi
deleted file mode 100644
index 766b445..0000000
--- a/tests/stubs/smartypants.pyi
+++ /dev/null
@@ -1,8 +0,0 @@
-class Attr:
-    q: int
-    D: int
-    e: int
-    u: int
-    w: int
-
-def smartypants(text: str, attrs: int) -> str: ...

tests/test_text.py (674) → tests/test_text.py (1470)

diff --git a/tests/test_text.py b/tests/test_text.py
index 22e137e..187244b 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -10,11 +10,21 @@ from chives.text import smartify
 @pytest.mark.parametrize(
     "text, expected",
     [
+        ("<", "<"),
+        ("<0", "<0"),
+        ("--", "–"),
+        ("---", "—"),
+        ("&quot;", "“"),
+        ("'", "‘"),
         ("Isn't it delightful -- she said", "Isn’t it delightful – she said"),
         ("Are you ... sure?", "Are you … sure?"),
         ("<h2>Isn't it delightful?</h2>", "<h2>Isn’t it delightful?</h2>"),
         ("<li>Isn't it delightful?</li>", "<li>Isn’t it delightful?</li>"),
         ("<p>&quot;It's nice&quot;, he said</p>", "<p>“It’s nice”, he said</p>"),
+        (
+            "<!-- this -- is -- not -- a -- valid -- comment -->",
+            "<!– this – is – not – a – valid – comment –>",
+        ),
     ],
 )
 def test_smartify(text: str, expected: str) -> None:
@@ -23,3 +33,24 @@ def test_smartify(text: str, expected: str) -> None:
     """
     actual = smartify(text)
     assert actual == expected
+
+    assert smartify(actual) == actual
+
+
+@pytest.mark.parametrize(
+    "text",
+    [
+        "<",
+        "<0",
+        '<a href="https://example.com">example.com</a>',
+        '<pre>print("hello world")</pre>',
+        '<pre><code data-lang="python">print("hello world")</code></pre>',
+        "<br/>",
+        "</br>",
+    ],
+)
+def test_is_unchanged_by_smartify(text: str) -> None:
+    """
+    Test these strings are unaffected by "smart" punctuation.
+    """
+    assert smartify(text) == text