Files
alfred/tests/domain/test_subtitle_knowledge.py
T
francwa e07c9ec77b chore: sprint cleanup — language unification, parser unification, fossils removal
Several weeks of work accumulated without being committed. Grouped here for
clarity; see CHANGELOG.md [Unreleased] for the user-facing summary.

Highlights
----------

P1 #2 — ISO 639-2/B canonical migration
- New Language VO + LanguageRegistry (alfred/domain/shared/knowledge/).
- iso_languages.yaml as single source of truth for language codes.
- SubtitleKnowledgeBase now delegates lookup to LanguageRegistry; subtitles.yaml
  only declares subtitle-specific tokens (vostfr, vf, vff, …).
- SubtitlePreferences default → ["fre", "eng"]; subtitle filenames written as
  {iso639_2b}.srt (legacy fr.srt still read via alias).
- Scanner: dropped _LANG_KEYWORDS / _SDH_TOKENS / _FORCED_TOKENS /
  SUBTITLE_EXTENSIONS hardcoded dicts.
- Fixed: 'hi' token no longer marks SDH (conflicted with Hindi alias).
- Added settings.min_movie_size_bytes (was a module constant).

P1 #3 — Release parser unification + data-driven tokenizer
- parse_release() is now the single source of truth for release-name parsing.
- alfred/knowledge/release/separators.yaml declares the token separators used
  by the tokenizer (., space, [, ], (, ), _). New conventions can be added
  without code changes.
- Tokenizer now splits on any configured separator instead of name.split('.').
  Releases like 'The Father (2020) [1080p] [WEBRip] [5.1] [YTS.MX]' parse via
  the direct path without sanitization fallback.
- Site-tag extraction always runs first; well-formedness only rejects truly
  forbidden chars.
- _parse_season_episode() extended with NxNN / NxNNxNN alt forms.
- Removed dead helpers: _sanitize, _normalize.

Domain cleanup
- Deleted fossil services with zero production callers:
    alfred/domain/movies/services.py
    alfred/domain/tv_shows/services.py
    alfred/domain/subtitles/services.py (replaced by subtitles/services/ package)
    alfred/domain/subtitles/repositories.py
- Split monolithic subtitle services into a package (identifier, matcher,
  placer, pattern_detector, utils) + dedicated knowledge/ package.
- MediaInfo split into dedicated package (alfred/domain/shared/media/:
  audio, video, subtitle, info, matching).

Persistence cleanup
- Removed dead JSON repositories (movie/subtitle/tvshow_repository.py).

Tests
- Major expansion of the test suite organized to mirror the source tree.
- Removed obsolete *_edge_cases test files superseded by structured tests.
- Suite: 990 passed, 8 skipped.

Misc
- .gitignore: exclude env_backup/ and *.bak.
- Adjustments across agent/llm, app.py, application/filesystem, and
  infrastructure/filesystem to align with the new domain layout.
2026-05-17 23:38:00 +02:00

282 lines
9.9 KiB
Python

"""Tests for ``alfred.domain.subtitles.knowledge`` (loader + base).

Covers:

- ``TestMerge`` — the internal ``_merge`` deep-merge function:
  scalar override, dict merge, list extension+dedup.
- ``TestLoader`` — builtin loads alone, learned overlays add tokens,
  learned-only pattern is picked up, missing files don't crash.
- ``TestKnowledgeBase`` — typed view: formats / languages /
  type-token lookup, default rules, ``patterns_for_group``.

Uses ``monkeypatch`` to override the module-level ``_BUILTIN_ROOT`` and
``_LEARNED_ROOT`` constants so we can drive the loader from a temp dir.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from alfred.domain.subtitles.knowledge import loader as loader_mod
from alfred.domain.subtitles.knowledge.base import SubtitleKnowledgeBase
from alfred.domain.subtitles.knowledge.loader import KnowledgeLoader, _merge
from alfred.domain.subtitles.value_objects import (
ScanStrategy,
SubtitleType,
TypeDetectionMethod,
)
# --------------------------------------------------------------------------- #
# _merge — pure dict merger #
# --------------------------------------------------------------------------- #
class TestMerge:
    """Expectations for ``_merge``: one merge rule is pinned per test."""

    def test_scalar_override(self):
        # Same key, scalar on both sides: the second argument's value is kept.
        merged = _merge({"a": 1}, {"a": 2})
        assert merged == {"a": 2}

    def test_new_key_added(self):
        # A key present only in the second argument is added to the result.
        merged = _merge({"a": 1}, {"b": 2})
        assert merged == {"a": 1, "b": 2}

    def test_nested_dict_merged(self):
        # Nested dicts are combined key by key rather than replaced.
        merged = _merge({"a": {"x": 1}}, {"a": {"y": 2}})
        assert merged == {"a": {"x": 1, "y": 2}}

    def test_list_extended_and_deduped(self):
        # Lists are concatenated; values already present are not repeated.
        merged = _merge({"a": [1, 2]}, {"a": [2, 3]})
        assert merged == {"a": [1, 2, 3]}

    def test_list_preserves_order(self):
        # Existing items keep their position; new items are appended after.
        merged = _merge({"a": ["x", "y"]}, {"a": ["z", "x"]})
        assert merged == {"a": ["x", "y", "z"]}

    def test_type_mismatch_override_wins(self):
        # If shapes differ, override replaces wholesale.
        merged = _merge({"a": [1, 2]}, {"a": {"new": True}})
        assert merged == {"a": {"new": True}}
# --------------------------------------------------------------------------- #
# Loader helpers #
# --------------------------------------------------------------------------- #
def _write(path: Path, content: str) -> None:
    """Write ``content`` to ``path`` as UTF-8, creating parent directories.

    Missing parent directories are created first; an existing file at
    ``path`` is overwritten.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(content)
@pytest.fixture
def isolated_loader(tmp_path: Path, monkeypatch):
    """Point the loader module's root constants at fresh temp directories.

    Creates ``builtin`` and ``learned`` directories under ``tmp_path`` and
    patches ``_BUILTIN_ROOT`` / ``_LEARNED_ROOT`` on the loader module to
    them. Returns the ``(builtin, learned)`` pair of paths.
    """
    roots = {
        "_BUILTIN_ROOT": tmp_path / "builtin",
        "_LEARNED_ROOT": tmp_path / "learned",
    }
    for directory in roots.values():
        directory.mkdir()
    for attr_name, directory in roots.items():
        monkeypatch.setattr(loader_mod, attr_name, directory)
    return roots["_BUILTIN_ROOT"], roots["_LEARNED_ROOT"]
class TestLoader:
    """Tests for ``KnowledgeLoader`` driven from the temp-dir roots.

    Every test takes the ``isolated_loader`` fixture so the loader reads
    only the YAML files written by the test itself.
    """

    def test_builtin_only(self, isolated_loader):
        """A builtin ``subtitles.yaml`` alone is returned as written."""
        builtin, _ = isolated_loader
        # ``tokens`` must be indented deeper than ``fra``; at the same
        # indent YAML parses them as sibling keys and ``fra`` becomes null.
        _write(
            builtin / "subtitles.yaml",
            "languages:\n  fra:\n    tokens: [fr, fre]\n",
        )
        ldr = KnowledgeLoader()
        assert ldr.subtitles()["languages"]["fra"]["tokens"] == ["fr", "fre"]

    def test_learned_adds_tokens_additively(self, isolated_loader):
        """Learned tokens are appended after the builtin ones."""
        builtin, learned = isolated_loader
        _write(
            builtin / "subtitles.yaml",
            "languages:\n  fra:\n    tokens: [fr, fre]\n",
        )
        _write(
            learned / "subtitles_learned.yaml",
            "languages:\n  fra:\n    tokens: [vff, custom]\n",
        )
        ldr = KnowledgeLoader()
        tokens = ldr.subtitles()["languages"]["fra"]["tokens"]
        assert tokens == ["fr", "fre", "vff", "custom"]

    def test_missing_files_dont_crash(self, isolated_loader):
        """With empty roots every accessor returns an empty dict."""
        # No files written → loader still produces empty structures.
        ldr = KnowledgeLoader()
        assert ldr.subtitles() == {}
        assert ldr.patterns() == {}
        assert ldr.release_groups() == {}

    def test_builtin_pattern_loaded(self, isolated_loader):
        """A pattern file under ``patterns/`` is exposed by its ``id``."""
        builtin, _ = isolated_loader
        _write(
            builtin / "patterns" / "adjacent.yaml",
            "id: adjacent\nscan_strategy: adjacent\ndescription: test\n",
        )
        ldr = KnowledgeLoader()
        assert "adjacent" in ldr.patterns()
        assert ldr.pattern("adjacent")["scan_strategy"] == "adjacent"

    def test_learned_pattern_overlays_builtin(self, isolated_loader):
        """A learned pattern with the same id is merged over the builtin."""
        builtin, learned = isolated_loader
        _write(
            builtin / "patterns" / "p.yaml",
            "id: p\nscan_strategy: flat\ndescription: old\n",
        )
        _write(
            learned / "patterns" / "p.yaml",
            "id: p\ndescription: new\n",
        )
        ldr = KnowledgeLoader()
        # learned replaces scalar 'description', keeps scan_strategy from builtin
        assert ldr.pattern("p")["description"] == "new"
        assert ldr.pattern("p")["scan_strategy"] == "flat"

    def test_learned_only_pattern_added(self, isolated_loader):
        """A pattern that exists only in the learned root is still listed."""
        _, learned = isolated_loader
        _write(
            learned / "patterns" / "neo.yaml",
            "id: neo\nscan_strategy: embedded\n",
        )
        ldr = KnowledgeLoader()
        assert "neo" in ldr.patterns()

    def test_release_group_case_insensitive_lookup(self, isolated_loader):
        """``release_group`` ignores case; unknown names yield ``None``."""
        builtin, _ = isolated_loader
        _write(
            builtin / "release_groups" / "kontrast.yaml",
            "name: KONTRAST\nknown_patterns: [adjacent]\n",
        )
        ldr = KnowledgeLoader()
        # Stored under "KONTRAST" but case-insensitive match must work.
        assert ldr.release_group("kontrast") is not None
        assert ldr.release_group("Kontrast")["name"] == "KONTRAST"
        assert ldr.release_group("unknown_group") is None

    def test_pattern_id_falls_back_to_filename(self, isolated_loader):
        """A pattern file with no ``id`` is keyed by its file stem."""
        # File without 'id' field — uses the stem.
        builtin, _ = isolated_loader
        _write(
            builtin / "patterns" / "no_id.yaml",
            "scan_strategy: adjacent\n",
        )
        ldr = KnowledgeLoader()
        assert "no_id" in ldr.patterns()
# --------------------------------------------------------------------------- #
# SubtitleKnowledgeBase #
# --------------------------------------------------------------------------- #
class TestKnowledgeBase:
    """Tests for the typed ``SubtitleKnowledgeBase`` view over loaded YAML.

    The ``kb`` fixture writes a small builtin knowledge tree (formats,
    subtitle-specific language tokens, types, defaults, two patterns and
    one release group) into the isolated roots, then builds the base.
    """

    @pytest.fixture
    def kb(self, isolated_loader):
        """Build a ``SubtitleKnowledgeBase`` from a minimal builtin tree."""
        builtin, _ = isolated_loader
        # The YAML is assembled from explicitly indented lines so the
        # nesting cannot be lost to whitespace normalisation.
        # NOTE(review): placing ``format_priority`` and ``min_confidence``
        # under ``defaults`` is reconstructed from the key order — confirm
        # against the real builtin subtitles.yaml.
        subtitles_yaml = "\n".join(
            [
                "formats:",
                "  srt:",
                '    extensions: [".srt"]',
                '    description: "SubRip"',
                "  ass:",
                '    extensions: [".ass", ".ssa"]',
                "language_tokens:",
                '  fre: ["vostfr"]',
                "types:",
                "  sdh:",
                '    tokens: ["sdh", "cc"]',
                "  forced:",
                '    tokens: ["forced"]',
                "defaults:",
                '  languages: ["fre"]',
                '  formats: ["srt"]',
                '  types: ["standard"]',
                '  format_priority: ["srt"]',
                "  min_confidence: 0.8",
                "",
            ]
        )
        _write(builtin / "subtitles.yaml", subtitles_yaml)
        _write(
            builtin / "patterns" / "adj.yaml",
            "id: adj\nscan_strategy: adjacent\ndescription: d\n",
        )
        _write(
            builtin / "patterns" / "bad.yaml",
            # invalid scan_strategy → skipped at build time
            "id: bad\nscan_strategy: not_a_real_strategy\n",
        )
        _write(
            builtin / "release_groups" / "group_a.yaml",
            "name: GroupA\nknown_patterns: [adj]\n",
        )
        return SubtitleKnowledgeBase()

    def test_formats_loaded(self, kb):
        """Formats are keyed by id and resolvable from any of their extensions."""
        formats = kb.formats()
        # Separate asserts so a failure names the missing format.
        assert "srt" in formats
        assert "ass" in formats
        assert kb.format_for_extension(".srt").id == "srt"
        assert kb.format_for_extension(".ssa").id == "ass"
        assert kb.format_for_extension(".unknown") is None

    def test_known_extensions_aggregates(self, kb):
        """``known_extensions`` contains every extension of every format."""
        exts = kb.known_extensions()
        for extension in (".srt", ".ass", ".ssa"):
            assert extension in exts

    def test_language_for_token(self, kb):
        """Generic language tokens resolve to canonical codes; unknown ones don't."""
        # Canonical ISO 639-2/B codes are sourced from LanguageRegistry.
        assert kb.language_for_token("french").code == "fre"
        assert kb.language_for_token("FR").code == "fre"
        assert kb.language_for_token("xxx") is None
        assert kb.is_known_lang_token("eng") is True
        assert kb.is_known_lang_token("ghost") is False

    def test_subtitle_specific_token_recognized(self, kb):
        """A token declared under ``language_tokens`` resolves to its language."""
        # ``vostfr`` is subtitle-specific and lives in subtitles.yaml's
        # ``language_tokens`` block — still resolves to canonical "fre".
        assert kb.language_for_token("vostfr").code == "fre"

    def test_type_for_token(self, kb):
        """Type tokens map to ``SubtitleType`` members, case-insensitively."""
        assert kb.type_for_token("sdh") == SubtitleType.SDH
        assert kb.type_for_token("FORCED") == SubtitleType.FORCED
        assert kb.type_for_token("nope") is None
        # 'hi' must NOT be a SDH token any more (it collides with Hindi).
        assert kb.is_known_type_token("hi") is False
        assert kb.is_known_type_token("cc") is True

    def test_default_rules(self, kb):
        """``default_rules`` reflects the ``defaults`` block of the fixture."""
        rules = kb.default_rules()
        assert rules.preferred_languages == ["fre"]
        assert rules.preferred_formats == ["srt"]
        assert rules.min_confidence == 0.8

    def test_patterns_valid_kept_invalid_skipped(self, kb):
        """Patterns with an unknown ``scan_strategy`` are left out."""
        patterns = kb.patterns()
        assert "adj" in patterns
        # 'bad' had an invalid scan_strategy → quietly dropped.
        assert "bad" not in patterns

    def test_pattern_typed_view(self, kb):
        """A pattern exposes enum-typed fields.

        The fixture's ``adj`` pattern declares no type-detection setting,
        so ``TOKEN_IN_NAME`` is the value expected when it is omitted.
        """
        pattern = kb.pattern("adj")
        assert pattern.scan_strategy == ScanStrategy.ADJACENT
        assert pattern.type_detection == TypeDetectionMethod.TOKEN_IN_NAME

    def test_patterns_for_group(self, kb):
        """A known group yields its declared patterns; unknown groups yield []."""
        group_patterns = kb.patterns_for_group("GroupA")
        # Comparing the id list checks both the count and the identity.
        assert [p.id for p in group_patterns] == ["adj"]
        assert kb.patterns_for_group("unknown") == []

    def test_reload_picks_up_changes(self, kb, isolated_loader):
        """Files written after construction become visible after ``reload``."""
        # Add a new pattern, reload, check it's visible.
        builtin, _ = isolated_loader
        _write(
            builtin / "patterns" / "new.yaml",
            "id: new\nscan_strategy: flat\n",
        )
        kb.reload()
        assert "new" in kb.patterns()