Skip to main content

Merge pull request #12 from alexwlchan/fuzzy-tag-matching

ID
ab7365b
date
2025-12-06 14:02:40+00:00
author
Alex Chan <alex@alexwlchan.net>
parents
bed1321, 44efa84
message
Merge pull request #12 from alexwlchan/fuzzy-tag-matching

Add standard tests for fuzzy tag matching
changed files
6 files, 93 additions, 11 deletions

Changed files

CHANGELOG.md (1415) → CHANGELOG.md (1497)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index eb88e56..8fde25b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # CHANGELOG
 
+## v12 - 2025-12-06
+
+Add checks for fuzzy tag matching to `StaticSiteTestSuite`.
+
 ## v11 - 2025-12-06
 
 Add a new class `StaticSiteTestSuite` which runs my standard set of tests for a static site, e.g. checking every file is saved, checking timestamps use the correct format.

dev_requirements.txt (2600) → dev_requirements.txt (2646)

diff --git a/dev_requirements.txt b/dev_requirements.txt
index 03365bb..565f627 100644
--- a/dev_requirements.txt
+++ b/dev_requirements.txt
@@ -47,7 +47,7 @@ javascript-data-files==1.4.1
     # via alexwlchan-chives
 keyring==25.7.0
     # via twine
-librt==0.7.0
+librt==0.7.2
     # via mypy
 markdown-it-py==4.0.0
     # via rich
@@ -99,6 +99,8 @@ pytest-vcr==1.0.2
     # via silver-nitrate
 pyyaml==6.0.3
     # via vcrpy
+rapidfuzz==3.14.3
+    # via alexwlchan-chives
 readme-renderer==44.0
     # via twine
 requests==2.32.5

pyproject.toml (1345) → pyproject.toml (1358)

diff --git a/pyproject.toml b/pyproject.toml
index 5387fa3..a7b7379 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,7 @@ license = "MIT"
 
 [project.optional-dependencies]
 media = ["Pillow"]
-static_site_tests = ["javascript-data-files[typed]", "pytest"]
+static_site_tests = ["javascript-data-files[typed]", "pytest", "rapidfuzz"]
 urls = ["httpx", "hyperlink"]
 
 [project.urls]

src/chives/__init__.py (391) → src/chives/__init__.py (391)

diff --git a/src/chives/__init__.py b/src/chives/__init__.py
index 5654aa7..a276d0a 100644
--- a/src/chives/__init__.py
+++ b/src/chives/__init__.py
@@ -11,4 +11,4 @@ I share across multiple sites.
 
 """
 
-__version__ = "11"
+__version__ = "12"

src/chives/static_site_tests.py (5255) → src/chives/static_site_tests.py (6462)

diff --git a/src/chives/static_site_tests.py b/src/chives/static_site_tests.py
index 92d9bf9..2a1712f 100644
--- a/src/chives/static_site_tests.py
+++ b/src/chives/static_site_tests.py
@@ -3,7 +3,10 @@ Defines a set of common tests and test helpers used for all my static sites.
 """
 
 from abc import ABC, abstractmethod
+import collections
+from collections.abc import Iterator
 import glob
+import itertools
 import os
 from pathlib import Path
 import subprocess
@@ -11,6 +14,7 @@ from typing import cast, TypedDict, TypeVar
 
 from javascript_data_files import read_typed_js
 import pytest
+from rapidfuzz import fuzz
 
 from chives.dates import date_matches_any_format, find_all_dates
 from chives.media import is_av1_video
@@ -51,6 +55,16 @@ class StaticSiteTestSuite[M](ABC):
         """
         ...
 
+    @abstractmethod
+    def list_tags_in_metadata(self, metadata: M) -> Iterator[str]:
+        """
+        Returns all the tags used in the metadata, once for every usage.
+
+        For example, if three documents use the same tag, the tag will
+        be returned three times.
+        """
+        ...
+
     def test_no_uncommitted_git_changes(self, site_root: Path) -> None:
         """
         There are no changes which haven't been committed to Git.
@@ -175,3 +189,28 @@ class StaticSiteTestSuite[M](ABC):
         }
 
         assert bad_date_strings == set()
+
+    @staticmethod
+    def find_similar_pairs(tags: dict[str, int]) -> Iterator[tuple[str, str]]:
+        """
+        Find similar tag pairs (t1, t2), t1 < t2, with `fuzz.ratio` above 80.
+        """
+        for t1, t2 in itertools.combinations(sorted(tags), 2):
+            if fuzz.ratio(t1, t2) > 80:
+                yield (t1, t2)
+
+    known_similar_tags: set[tuple[str, str]] = set()
+
+    def test_no_similar_tags(self, metadata: M) -> None:
+        """
+        There are no similar/misspelt tags, except pairs in `known_similar_tags`.
+        """
+        tags = collections.Counter(self.list_tags_in_metadata(metadata))
+
+        bad_tags = [
+            f"{t1} ({tags[t1]}) / {t2} ({tags[t2]})"
+            for t1, t2 in self.find_similar_pairs(tags)
+            if (t1, t2) not in self.known_similar_tags
+        ]
+
+        assert bad_tags == []

tests/test_static_site_tests.py (5808) → tests/test_static_site_tests.py (6784)

diff --git a/tests/test_static_site_tests.py b/tests/test_static_site_tests.py
index 9a70c38..c3c41d2 100644
--- a/tests/test_static_site_tests.py
+++ b/tests/test_static_site_tests.py
@@ -2,6 +2,7 @@
 Tests for `chives.static_site_tests`.
 """
 
+from collections.abc import Iterator
 from pathlib import Path
 import shutil
 import subprocess
@@ -24,7 +25,11 @@ def site_root(tmp_path: Path) -> Path:
 
 
 def create_test_suite[M](
-    site_root: Path, metadata: M, paths_in_metadata: set[Path]
+    site_root: Path,
+    metadata: M,
+    *,
+    paths_in_metadata: set[Path] | None = None,
+    tags_in_metadata: set[str] | None = None,
 ) -> StaticSiteTestSuite[M]:
     """
     Create a new instance of StaticSiteTestSuite with the hard-coded data
@@ -39,7 +44,10 @@ def create_test_suite[M](
             return metadata
 
         def list_paths_in_metadata(self, metadata: M) -> set[Path]:
-            return paths_in_metadata
+            return paths_in_metadata or set()
+
+        def list_tags_in_metadata(self, metadata: M) -> Iterator[str]:
+            yield from (tags_in_metadata or set())
 
     return TestSuite()
 
@@ -87,7 +95,7 @@ def test_checks_for_git_changes(site_root: Path) -> None:
     """
     The tests check that there are no uncommitted Git changes.
     """
-    t = create_test_suite(site_root, metadata=[1, 2, 3], paths_in_metadata=set())
+    t = create_test_suite(site_root, metadata=[1, 2, 3])
 
     # Initially this should fail, because there isn't a Git repo in
     # the folder.
@@ -114,7 +122,7 @@ def test_checks_for_url_safe_paths(site_root: Path) -> None:
     """
     The tests check for URL-safe paths.
     """
-    t = create_test_suite(site_root, metadata=[1, 2, 3], paths_in_metadata=set())
+    t = create_test_suite(site_root, metadata=[1, 2, 3])
 
     # This should pass trivially when the site is empty.
     t.test_every_path_is_url_safe(site_root)
@@ -141,7 +149,7 @@ def test_checks_for_av1_videos(site_root: Path) -> None:
     """
     The tests check for AV1-encoded videos.
     """
-    t = create_test_suite(site_root, metadata=[1, 2, 3], paths_in_metadata=set())
+    t = create_test_suite(site_root, metadata=[1, 2, 3])
 
     # This should pass trivially when the site is empty.
     t.test_no_videos_are_av1(site_root)
@@ -168,17 +176,46 @@ def test_checks_for_date_formats(site_root: Path) -> None:
     """
     # Check a site with correct metadata
     metadata1 = {"date_saved": "2025-12-06"}
-    t1 = create_test_suite(site_root, metadata1, paths_in_metadata=set())
+    t1 = create_test_suite(site_root, metadata1)
     t1.test_all_timestamps_are_consistent(metadata1)
 
     # Check a site with incorrect metadata
     metadata2 = {"date_saved": "AAAA-BB-CC"}
-    t2 = create_test_suite(site_root, metadata2, paths_in_metadata=set())
+    t2 = create_test_suite(site_root, metadata2)
     with pytest.raises(AssertionError):
         t2.test_all_timestamps_are_consistent(metadata2)
 
     # Check we can override the timestamp format
     metadata3 = {"date_saved": "AAAA-BB-CC"}
-    t3 = create_test_suite(site_root, metadata=metadata3, paths_in_metadata=set())
+    t3 = create_test_suite(site_root, metadata=metadata3)
     t3.date_formats.append("AAAA-BB-CC")
     t3.test_all_timestamps_are_consistent(metadata3)
+
+
+def test_checks_for_similar_tags(site_root: Path) -> None:
+    """
+    The tests check for similar and misspelt tags.
+    """
+    metadata = [1, 2, 3]
+
+    # Check a site with distinct tags.
+    t1 = create_test_suite(
+        site_root, metadata, tags_in_metadata={"red", "green", "blue"}
+    )
+    t1.test_no_similar_tags(metadata)
+
+    # Check a site with similar tags.
+    t2 = create_test_suite(
+        site_root, metadata, tags_in_metadata={"red robot", "rod robot", "rid robot"}
+    )
+    with pytest.raises(AssertionError):
+        t2.test_no_similar_tags(metadata)
+
+    # Check a site with similar tags, but marked as known-similar.
+    t3 = create_test_suite(
+        site_root,
+        metadata,
+        tags_in_metadata={"red robot", "rod robot", "green", "blue"},
+    )
+    t3.known_similar_tags = {("red robot", "rod robot")}
+    t3.test_no_similar_tags(metadata)