chore: sprint cleanup — language unification, parser unification, fossils removal
Several weeks of work accumulated without being committed. Grouped here
for clarity; see CHANGELOG.md [Unreleased] for the user-facing summary.

Highlights
----------

P1 #2 — ISO 639-2/B canonical migration
- New Language VO + LanguageRegistry (alfred/domain/shared/knowledge/).
- iso_languages.yaml as single source of truth for language codes.
- SubtitleKnowledgeBase now delegates lookup to LanguageRegistry;
  subtitles.yaml only declares subtitle-specific tokens (vostfr, vf,
  vff, …).
- SubtitlePreferences default → ["fre", "eng"]; subtitle filenames
  written as {iso639_2b}.srt (legacy fr.srt still read via alias).
- Scanner: dropped _LANG_KEYWORDS / _SDH_TOKENS / _FORCED_TOKENS /
  SUBTITLE_EXTENSIONS hardcoded dicts.
- Fixed: 'hi' token no longer marks SDH (conflicted with Hindi alias).
- Added settings.min_movie_size_bytes (was a module constant).

P1 #3 — Release parser unification + data-driven tokenizer
- parse_release() is now the single source of truth for release-name
  parsing.
- alfred/knowledge/release/separators.yaml declares the token separators
  used by the tokenizer (., space, [, ], (, ), _). New conventions can be
  added without code changes.
- Tokenizer now splits on any configured separator instead of
  name.split('.'). Releases like
  'The Father (2020) [1080p] [WEBRip] [5.1] [YTS.MX]' parse via the
  direct path without sanitization fallback.
- Site-tag extraction always runs first; well-formedness only rejects
  truly forbidden chars.
- _parse_season_episode() extended with NxNN / NxNNxNN alt forms.
- Removed dead helpers: _sanitize, _normalize.

Domain cleanup
- Deleted fossil services with zero production callers:
    alfred/domain/movies/services.py
    alfred/domain/tv_shows/services.py
    alfred/domain/subtitles/services.py (replaced by subtitles/services/ package)
    alfred/domain/subtitles/repositories.py
- Split monolithic subtitle services into a package (identifier, matcher,
  placer, pattern_detector, utils) + dedicated knowledge/ package.
- MediaInfo split into dedicated package (alfred/domain/shared/media/:
  audio, video, subtitle, info, matching).

Persistence cleanup
- Removed dead JSON repositories (movie/subtitle/tvshow_repository.py).

Tests
- Major expansion of the test suite organized to mirror the source tree.
- Removed obsolete *_edge_cases test files superseded by structured tests.
- Suite: 990 passed, 8 skipped.

Misc
- .gitignore: exclude env_backup/ and *.bak.
- Adjustments across agent/llm, app.py, application/filesystem, and
  infrastructure/filesystem to align with the new domain layout.
This commit is contained in:
@@ -0,0 +1,281 @@
|
||||
"""Tests for ``alfred.domain.subtitles.knowledge`` (loader + base).
|
||||
|
||||
Covers:

- ``TestMerge`` — the internal ``_merge`` deep-merge function:
  scalar override, dict merge, list extension+dedup.
- ``TestLoader`` — builtin loads alone, learned overlays add tokens,
  learned-only pattern is picked up, missing files don't crash.
- ``TestKnowledgeBase`` — typed view: formats / languages /
  type-token lookup, default rules, ``patterns_for_group``.

Uses ``monkeypatch`` to override the module-level ``_BUILTIN_ROOT`` and
``_LEARNED_ROOT`` constants so we can drive the loader from a temp dir.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from alfred.domain.subtitles.knowledge import loader as loader_mod
|
||||
from alfred.domain.subtitles.knowledge.base import SubtitleKnowledgeBase
|
||||
from alfred.domain.subtitles.knowledge.loader import KnowledgeLoader, _merge
|
||||
from alfred.domain.subtitles.value_objects import (
|
||||
ScanStrategy,
|
||||
SubtitleType,
|
||||
TypeDetectionMethod,
|
||||
)
|
||||
|
||||
# --------------------------------------------------------------------------- #
# _merge — pure dict merger                                                   #
# --------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class TestMerge:
    """Checks for ``_merge``, called as ``_merge(base, override)``."""

    def test_scalar_override(self):
        """A scalar present on both sides takes the override's value."""
        merged = _merge({"a": 1}, {"a": 2})
        assert merged == {"a": 2}

    def test_new_key_added(self):
        """A key that only the override has is added next to the base keys."""
        merged = _merge({"a": 1}, {"b": 2})
        assert merged == {"a": 1, "b": 2}

    def test_nested_dict_merged(self):
        """Nested dicts are combined key by key instead of being replaced."""
        base = {"a": {"x": 1}}
        override = {"a": {"y": 2}}
        assert _merge(base, override) == {"a": {"x": 1, "y": 2}}

    def test_list_extended_and_deduped(self):
        """Lists are concatenated and the repeated item appears only once."""
        base = {"a": [1, 2]}
        override = {"a": [2, 3]}
        assert _merge(base, override) == {"a": [1, 2, 3]}

    def test_list_preserves_order(self):
        """Base items keep their position; new override items are appended."""
        base = {"a": ["x", "y"]}
        override = {"a": ["z", "x"]}
        assert _merge(base, override) == {"a": ["x", "y", "z"]}

    def test_type_mismatch_override_wins(self):
        """A list on one side and a dict on the other: the override is kept."""
        # If shapes differ, override replaces wholesale.
        base = {"a": [1, 2]}
        override = {"a": {"new": True}}
        assert _merge(base, override) == {"a": {"new": True}}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
# Loader helpers                                                              #
# --------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
def _write(path: Path, content: str) -> None:
    """Write *content* to *path* as UTF-8 text.

    Missing parent directories are created first, so callers can target
    nested paths such as ``root / "patterns" / "x.yaml"`` directly.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(content)
|
||||
|
||||
|
||||
@pytest.fixture
def isolated_loader(tmp_path: Path, monkeypatch):
    """Point the loader module's root constants at fresh temp directories.

    Creates ``builtin/`` and ``learned/`` under ``tmp_path``, patches
    ``loader_mod._BUILTIN_ROOT`` and ``loader_mod._LEARNED_ROOT`` to them,
    and returns the pair as ``(builtin, learned)``.
    """
    builtin_root = tmp_path / "builtin"
    learned_root = tmp_path / "learned"

    # Create both directories before patching either constant.
    for root in (builtin_root, learned_root):
        root.mkdir()

    patches = (
        ("_BUILTIN_ROOT", builtin_root),
        ("_LEARNED_ROOT", learned_root),
    )
    for attr_name, root in patches:
        monkeypatch.setattr(loader_mod, attr_name, root)

    return builtin_root, learned_root
|
||||
|
||||
|
||||
class TestLoader:
    """Exercise ``KnowledgeLoader`` against temporary builtin/learned roots.

    Each test takes the ``isolated_loader`` fixture, which supplies the
    ``(builtin, learned)`` directories, and writes only the YAML files it
    needs with ``_write`` before constructing the loader.
    """

    def test_builtin_only(self, isolated_loader):
        """A builtin ``subtitles.yaml`` on its own is exposed by ``subtitles()``."""
        builtin, _ = isolated_loader
        _write(
            builtin / "subtitles.yaml",
            # ``tokens`` has to be nested under ``fra`` (two-space steps):
            # at equal indentation it would be a sibling key and the lookup
            # below could not reach it.
            "languages:\n  fra:\n    tokens: [fr, fre]\n",
        )
        ldr = KnowledgeLoader()
        assert ldr.subtitles()["languages"]["fra"]["tokens"] == ["fr", "fre"]

    def test_learned_adds_tokens_additively(self, isolated_loader):
        """Learned tokens are appended after the builtin ones."""
        builtin, learned = isolated_loader
        _write(
            builtin / "subtitles.yaml",
            "languages:\n  fra:\n    tokens: [fr, fre]\n",
        )
        _write(
            learned / "subtitles_learned.yaml",
            "languages:\n  fra:\n    tokens: [vff, custom]\n",
        )
        ldr = KnowledgeLoader()
        tokens = ldr.subtitles()["languages"]["fra"]["tokens"]
        assert tokens == ["fr", "fre", "vff", "custom"]

    def test_missing_files_dont_crash(self, isolated_loader):
        """With nothing on disk, all three accessors return empty dicts."""
        # No files written → loader still produces empty structures.
        ldr = KnowledgeLoader()
        assert ldr.subtitles() == {}
        assert ldr.patterns() == {}
        assert ldr.release_groups() == {}

    def test_builtin_pattern_loaded(self, isolated_loader):
        """A file under ``patterns/`` is listed and retrievable by its id."""
        builtin, _ = isolated_loader
        _write(
            builtin / "patterns" / "adjacent.yaml",
            "id: adjacent\nscan_strategy: adjacent\ndescription: test\n",
        )
        ldr = KnowledgeLoader()
        assert "adjacent" in ldr.patterns()
        assert ldr.pattern("adjacent")["scan_strategy"] == "adjacent"

    def test_learned_pattern_overlays_builtin(self, isolated_loader):
        """A learned pattern with the same id overlays the builtin one."""
        builtin, learned = isolated_loader
        _write(
            builtin / "patterns" / "p.yaml",
            "id: p\nscan_strategy: flat\ndescription: old\n",
        )
        _write(
            learned / "patterns" / "p.yaml",
            "id: p\ndescription: new\n",
        )
        ldr = KnowledgeLoader()
        # learned replaces scalar 'description', keeps scan_strategy from builtin
        assert ldr.pattern("p")["description"] == "new"
        assert ldr.pattern("p")["scan_strategy"] == "flat"

    def test_learned_only_pattern_added(self, isolated_loader):
        """A pattern that exists only in the learned tree is still listed."""
        _, learned = isolated_loader
        _write(
            learned / "patterns" / "neo.yaml",
            "id: neo\nscan_strategy: embedded\n",
        )
        ldr = KnowledgeLoader()
        assert "neo" in ldr.patterns()

    def test_release_group_case_insensitive_lookup(self, isolated_loader):
        """``release_group`` matches regardless of case; unknown names give None."""
        builtin, _ = isolated_loader
        _write(
            builtin / "release_groups" / "kontrast.yaml",
            "name: KONTRAST\nknown_patterns: [adjacent]\n",
        )
        ldr = KnowledgeLoader()
        # Stored under "KONTRAST" but case-insensitive match must work.
        assert ldr.release_group("kontrast") is not None
        assert ldr.release_group("Kontrast")["name"] == "KONTRAST"
        assert ldr.release_group("unknown_group") is None

    def test_pattern_id_falls_back_to_filename(self, isolated_loader):
        """A pattern file with no ``id`` field is keyed by its file stem."""
        # File without 'id' field — uses the stem.
        builtin, _ = isolated_loader
        _write(
            builtin / "patterns" / "no_id.yaml",
            "scan_strategy: adjacent\n",
        )
        ldr = KnowledgeLoader()
        assert "no_id" in ldr.patterns()
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
# SubtitleKnowledgeBase                                                       #
# --------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class TestKnowledgeBase:
    """Checks for the typed view exposed by ``SubtitleKnowledgeBase``.

    All tests share the ``kb`` fixture, which builds the knowledge base
    over a small builtin tree written into the ``isolated_loader``
    directories.
    """

    @pytest.fixture
    def kb(self, isolated_loader):
        """Return a ``SubtitleKnowledgeBase`` built from a minimal builtin tree.

        The tree holds one ``subtitles.yaml``, a valid pattern (``adj``),
        a pattern with an unknown scan strategy (``bad``) and one release
        group (``GroupA``) that references ``adj``.
        """
        builtin, _ = isolated_loader
        # The YAML is spelled out line by line so the nesting is explicit.
        # With every key at column 0 the document would contain duplicate
        # top-level keys (``formats``, ``types``, ``extensions``, ``tokens``)
        # and no hierarchy at all.
        # NOTE(review): ``format_priority`` and ``min_confidence`` are placed
        # under ``defaults`` because they directly follow its entries —
        # confirm against the keys ``default_rules()`` actually reads.
        _write(
            builtin / "subtitles.yaml",
            "formats:\n"
            "  srt:\n"
            '    extensions: [".srt"]\n'
            '    description: "SubRip"\n'
            "  ass:\n"
            '    extensions: [".ass", ".ssa"]\n'
            "language_tokens:\n"
            '  fre: ["vostfr"]\n'
            "types:\n"
            "  sdh:\n"
            '    tokens: ["sdh", "cc"]\n'
            "  forced:\n"
            '    tokens: ["forced"]\n'
            "defaults:\n"
            '  languages: ["fre"]\n'
            '  formats: ["srt"]\n'
            '  types: ["standard"]\n'
            '  format_priority: ["srt"]\n'
            "  min_confidence: 0.8\n",
        )
        _write(
            builtin / "patterns" / "adj.yaml",
            "id: adj\nscan_strategy: adjacent\ndescription: d\n",
        )
        _write(
            builtin / "patterns" / "bad.yaml",
            # invalid scan_strategy → skipped at build time
            "id: bad\nscan_strategy: not_a_real_strategy\n",
        )
        _write(
            builtin / "release_groups" / "group_a.yaml",
            "name: GroupA\nknown_patterns: [adj]\n",
        )
        return SubtitleKnowledgeBase()

    def test_formats_loaded(self, kb):
        """Both formats are listed and every declared extension maps to its format."""
        formats = kb.formats()
        assert "srt" in formats and "ass" in formats
        assert kb.format_for_extension(".srt").id == "srt"
        assert kb.format_for_extension(".ssa").id == "ass"
        assert kb.format_for_extension(".unknown") is None

    def test_known_extensions_aggregates(self, kb):
        """``known_extensions`` contains the extensions of every format."""
        exts = kb.known_extensions()
        assert ".srt" in exts and ".ass" in exts and ".ssa" in exts

    def test_language_for_token(self, kb):
        """Language tokens resolve to canonical codes; unknown tokens give None."""
        # Canonical ISO 639-2/B codes are sourced from LanguageRegistry.
        assert kb.language_for_token("french").code == "fre"
        assert kb.language_for_token("FR").code == "fre"
        assert kb.language_for_token("xxx") is None
        assert kb.is_known_lang_token("eng") is True
        assert kb.is_known_lang_token("ghost") is False

    def test_subtitle_specific_token_recognized(self, kb):
        """A token declared only in ``language_tokens`` still resolves."""
        # ``vostfr`` is subtitle-specific and lives in subtitles.yaml's
        # ``language_tokens`` block — still resolves to canonical "fre".
        assert kb.language_for_token("vostfr").code == "fre"

    def test_type_for_token(self, kb):
        """Type tokens map to ``SubtitleType`` members, whatever their case."""
        assert kb.type_for_token("sdh") == SubtitleType.SDH
        assert kb.type_for_token("FORCED") == SubtitleType.FORCED
        assert kb.type_for_token("nope") is None
        # 'hi' must NOT be a SDH token any more (it collides with Hindi).
        assert kb.is_known_type_token("hi") is False
        assert kb.is_known_type_token("cc") is True

    def test_default_rules(self, kb):
        """``default_rules`` reflects the values written by the fixture."""
        r = kb.default_rules()
        assert r.preferred_languages == ["fre"]
        assert r.preferred_formats == ["srt"]
        assert r.min_confidence == 0.8

    def test_patterns_valid_kept_invalid_skipped(self, kb):
        """The valid pattern is exposed; the one with a bad strategy is not."""
        patterns = kb.patterns()
        assert "adj" in patterns
        # 'bad' had an invalid scan_strategy → quietly dropped.
        assert "bad" not in patterns

    def test_pattern_typed_view(self, kb):
        """``pattern`` returns typed enum members for a loaded pattern."""
        p = kb.pattern("adj")
        assert p.scan_strategy == ScanStrategy.ADJACENT
        # adj.yaml declares no type detection, so this is whatever value the
        # knowledge base assigns when the field is absent.
        assert p.type_detection == TypeDetectionMethod.TOKEN_IN_NAME

    def test_patterns_for_group(self, kb):
        """A known group yields its patterns; an unknown group yields ``[]``."""
        ps = kb.patterns_for_group("GroupA")
        assert len(ps) == 1 and ps[0].id == "adj"
        assert kb.patterns_for_group("unknown") == []

    def test_reload_picks_up_changes(self, kb, isolated_loader):
        """A pattern file added after construction appears after ``reload()``."""
        # Add a new pattern, reload, check it's visible.
        builtin, _ = isolated_loader
        _write(
            builtin / "patterns" / "new.yaml",
            "id: new\nscan_strategy: flat\n",
        )
        kb.reload()
        assert "new" in kb.patterns()
|
||||
Reference in New Issue
Block a user