Add standard tests for fuzzy tag matching
- ID: 44efa84
- date: 2025-12-06 14:01:17+00:00
- author: Alex Chan <alex@alexwlchan.net>
- parent: bed1321
- message: Add standard tests for fuzzy tag matching
- changed files: 6 files, 93 additions, 11 deletions
Changed files
CHANGELOG.md (1415) → CHANGELOG.md (1497)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index eb88e56..8fde25b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
# CHANGELOG
+## v12 - 2025-12-06
+
+Add checks for fuzzy tag matching to `StaticSiteTestSuite`.
+
## v11 - 2025-12-06
Add a new class `StaticSiteTestSuite` which runs my standard set of tests for a static site, e.g. checking every file is saved, checking timestamps use the correct format.
dev_requirements.txt (2600) → dev_requirements.txt (2646)
diff --git a/dev_requirements.txt b/dev_requirements.txt
index 03365bb..565f627 100644
--- a/dev_requirements.txt
+++ b/dev_requirements.txt
@@ -47,7 +47,7 @@ javascript-data-files==1.4.1
# via alexwlchan-chives
keyring==25.7.0
# via twine
-librt==0.7.0
+librt==0.7.2
# via mypy
markdown-it-py==4.0.0
# via rich
@@ -99,6 +99,8 @@ pytest-vcr==1.0.2
# via silver-nitrate
pyyaml==6.0.3
# via vcrpy
+rapidfuzz==3.14.3
+ # via alexwlchan-chives
readme-renderer==44.0
# via twine
requests==2.32.5
pyproject.toml (1345) → pyproject.toml (1358)
diff --git a/pyproject.toml b/pyproject.toml
index 5387fa3..a7b7379 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,7 @@ license = "MIT"
[project.optional-dependencies]
media = ["Pillow"]
-static_site_tests = ["javascript-data-files[typed]", "pytest"]
+static_site_tests = ["javascript-data-files[typed]", "pytest", "rapidfuzz"]
urls = ["httpx", "hyperlink"]
[project.urls]
src/chives/__init__.py (391) → src/chives/__init__.py (391)
diff --git a/src/chives/__init__.py b/src/chives/__init__.py
index 5654aa7..a276d0a 100644
--- a/src/chives/__init__.py
+++ b/src/chives/__init__.py
@@ -11,4 +11,4 @@ I share across multiple sites.
"""
-__version__ = "11"
+__version__ = "12"
src/chives/static_site_tests.py (5255) → src/chives/static_site_tests.py (6462)
diff --git a/src/chives/static_site_tests.py b/src/chives/static_site_tests.py
index 92d9bf9..2a1712f 100644
--- a/src/chives/static_site_tests.py
+++ b/src/chives/static_site_tests.py
@@ -3,7 +3,10 @@ Defines a set of common tests and test helpers used for all my static sites.
"""
from abc import ABC, abstractmethod
+import collections
+from collections.abc import Iterator
import glob
+import itertools
import os
from pathlib import Path
import subprocess
@@ -11,6 +14,7 @@ from typing import cast, TypedDict, TypeVar
from javascript_data_files import read_typed_js
import pytest
+from rapidfuzz import fuzz
from chives.dates import date_matches_any_format, find_all_dates
from chives.media import is_av1_video
@@ -51,6 +55,16 @@ class StaticSiteTestSuite[M](ABC):
"""
...
+ @abstractmethod
+ def list_tags_in_metadata(self, metadata: M) -> Iterator[str]:
+ """
+ Returns all the tags used in the metadata, once for every usage.
+
+ For example, if three documents use the same tag, the tag will
+ be returned three times.
+ """
+ ...
+
def test_no_uncommitted_git_changes(self, site_root: Path) -> None:
"""
There are no changes which haven't been committed to Git.
@@ -175,3 +189,28 @@ class StaticSiteTestSuite[M](ABC):
}
assert bad_date_strings == set()
+
+ @staticmethod
+ def find_similar_pairs(tags: dict[str, int]) -> Iterator[tuple[str, str]]:
+ """
+ Find pairs of similar-looking tags in the collection `tags`.
+ """
+ for t1, t2 in itertools.combinations(sorted(tags), 2):
+ if fuzz.ratio(t1, t2) > 80:
+ yield (t1, t2)
+
+ known_similar_tags: set[tuple[str, str]] = set()
+
+ def test_no_similar_tags(self, metadata: M) -> None:
+ """
+ There are no similar/misspelt tags.
+ """
+ tags = collections.Counter(self.list_tags_in_metadata(metadata))
+
+ bad_tags = [
+ f"{t1} ({tags[t1]}) / {t2} ({tags[t2]})"
+ for t1, t2 in self.find_similar_pairs(tags)
+ if (t1, t2) not in self.known_similar_tags
+ ]
+
+ assert bad_tags == []
tests/test_static_site_tests.py (5808) → tests/test_static_site_tests.py (6784)
diff --git a/tests/test_static_site_tests.py b/tests/test_static_site_tests.py
index 9a70c38..c3c41d2 100644
--- a/tests/test_static_site_tests.py
+++ b/tests/test_static_site_tests.py
@@ -2,6 +2,7 @@
Tests for `chives.static_site_tests`.
"""
+from collections.abc import Iterator
from pathlib import Path
import shutil
import subprocess
@@ -24,7 +25,11 @@ def site_root(tmp_path: Path) -> Path:
def create_test_suite[M](
- site_root: Path, metadata: M, paths_in_metadata: set[Path]
+ site_root: Path,
+ metadata: M,
+ *,
+ paths_in_metadata: set[Path] | None = None,
+ tags_in_metadata: set[str] | None = None,
) -> StaticSiteTestSuite[M]:
"""
Create a new instance of StaticSiteTestSuite with the hard-coded data
@@ -39,7 +44,10 @@ def create_test_suite[M](
return metadata
def list_paths_in_metadata(self, metadata: M) -> set[Path]:
- return paths_in_metadata
+ return paths_in_metadata or set()
+
+ def list_tags_in_metadata(self, metadata: M) -> Iterator[str]:
+ yield from (tags_in_metadata or set())
return TestSuite()
@@ -87,7 +95,7 @@ def test_checks_for_git_changes(site_root: Path) -> None:
"""
The tests check that there are no uncommitted Git changes.
"""
- t = create_test_suite(site_root, metadata=[1, 2, 3], paths_in_metadata=set())
+ t = create_test_suite(site_root, metadata=[1, 2, 3])
# Initially this should fail, because there isn't a Git repo in
# the folder.
@@ -114,7 +122,7 @@ def test_checks_for_url_safe_paths(site_root: Path) -> None:
"""
The tests check for URL-safe paths.
"""
- t = create_test_suite(site_root, metadata=[1, 2, 3], paths_in_metadata=set())
+ t = create_test_suite(site_root, metadata=[1, 2, 3])
# This should pass trivially when the site is empty.
t.test_every_path_is_url_safe(site_root)
@@ -141,7 +149,7 @@ def test_checks_for_av1_videos(site_root: Path) -> None:
"""
The tests check for AV1-encoded videos.
"""
- t = create_test_suite(site_root, metadata=[1, 2, 3], paths_in_metadata=set())
+ t = create_test_suite(site_root, metadata=[1, 2, 3])
# This should pass trivially when the site is empty.
t.test_no_videos_are_av1(site_root)
@@ -168,17 +176,46 @@ def test_checks_for_date_formats(site_root: Path) -> None:
"""
# Check a site with correct metadata
metadata1 = {"date_saved": "2025-12-06"}
- t1 = create_test_suite(site_root, metadata1, paths_in_metadata=set())
+ t1 = create_test_suite(site_root, metadata1)
t1.test_all_timestamps_are_consistent(metadata1)
# Check a site with incorrect metadata
metadata2 = {"date_saved": "AAAA-BB-CC"}
- t2 = create_test_suite(site_root, metadata2, paths_in_metadata=set())
+ t2 = create_test_suite(site_root, metadata2)
with pytest.raises(AssertionError):
t2.test_all_timestamps_are_consistent(metadata2)
# Check we can override the timestamp format
metadata3 = {"date_saved": "AAAA-BB-CC"}
- t3 = create_test_suite(site_root, metadata=metadata3, paths_in_metadata=set())
+ t3 = create_test_suite(site_root, metadata=metadata3)
t3.date_formats.append("AAAA-BB-CC")
t3.test_all_timestamps_are_consistent(metadata3)
+
+
+def test_checks_for_similar_tags(site_root: Path) -> None:
+ """
+ The tests check for similar and misspelt tags.
+ """
+ metadata = [1, 2, 3]
+
+ # Check a site with distinct tags.
+ t1 = create_test_suite(
+ site_root, metadata, tags_in_metadata={"red", "green", "blue"}
+ )
+ t1.test_no_similar_tags(metadata)
+
+ # Check a site with similar tags.
+ t2 = create_test_suite(
+ site_root, metadata, tags_in_metadata={"red robot", "rod robot", "rid robot"}
+ )
+ with pytest.raises(AssertionError):
+ t2.test_no_similar_tags(metadata)
+
+ # Check a site with similar tags, but marked as known-similar.
+ t3 = create_test_suite(
+ site_root,
+ metadata,
+ tags_in_metadata={"red robot", "rod robot", "green", "blue"},
+ )
+ t3.known_similar_tags = {("red robot", "rod robot")}
+ t3.test_no_similar_tags(metadata)