chore: sprint cleanup — language unification, parser unification, fossils removal

Several weeks of work accumulated without being committed. Grouped here for
clarity; see CHANGELOG.md [Unreleased] for the user-facing summary.

Highlights
----------

P1 #2 — ISO 639-2/B canonical migration
- New Language VO + LanguageRegistry (alfred/domain/shared/knowledge/).
- iso_languages.yaml as single source of truth for language codes.
- SubtitleKnowledgeBase now delegates lookup to LanguageRegistry; subtitles.yaml
  only declares subtitle-specific tokens (vostfr, vf, vff, …).
- SubtitlePreferences default → ["fre", "eng"]; subtitle filenames written as
  {iso639_2b}.srt (legacy fr.srt still read via alias).
- Scanner: dropped _LANG_KEYWORDS / _SDH_TOKENS / _FORCED_TOKENS /
  SUBTITLE_EXTENSIONS hardcoded dicts.
- Fixed: 'hi' token no longer marks SDH (conflicted with Hindi alias).
- Added settings.min_movie_size_bytes (was a module constant).

P1 #3 — Release parser unification + data-driven tokenizer
- parse_release() is now the single source of truth for release-name parsing.
- alfred/knowledge/release/separators.yaml declares the token separators used
  by the tokenizer (., space, [, ], (, ), _). New conventions can be added
  without code changes.
- Tokenizer now splits on any configured separator instead of name.split('.').
  Releases like 'The Father (2020) [1080p] [WEBRip] [5.1] [YTS.MX]' parse via
  the direct path without sanitization fallback.
- Site-tag extraction always runs first; well-formedness only rejects truly
  forbidden chars.
- _parse_season_episode() extended with NxNN / NxNNxNN alt forms.
- Removed dead helpers: _sanitize, _normalize.

Domain cleanup
- Deleted fossil services with zero production callers:
    alfred/domain/movies/services.py
    alfred/domain/tv_shows/services.py
    alfred/domain/subtitles/services.py (replaced by subtitles/services/ package)
    alfred/domain/subtitles/repositories.py
- Split monolithic subtitle services into a package (identifier, matcher,
  placer, pattern_detector, utils) + dedicated knowledge/ package.
- MediaInfo split into dedicated package (alfred/domain/shared/media/:
  audio, video, subtitle, info, matching).

Persistence cleanup
- Removed dead JSON repositories (movie/subtitle/tvshow_repository.py).

Tests
- Major expansion of the test suite organized to mirror the source tree.
- Removed obsolete *_edge_cases test files superseded by structured tests.
- Suite: 990 passed, 8 skipped.

Misc
- .gitignore: exclude env_backup/ and *.bak.
- Adjustments across agent/llm, app.py, application/filesystem, and
  infrastructure/filesystem to align with the new domain layout.
This commit is contained in:
2026-05-17 23:38:00 +02:00
parent ba6f016d49
commit e07c9ec77b
99 changed files with 8833 additions and 6533 deletions
+283
View File
@@ -0,0 +1,283 @@
"""Tests for ``alfred.domain.release`` — release-name parser.
Covers the public surface used by the resolver / move pipeline:
- ``parse_release`` — well-formed scene names (TV episodes, season packs,
movies), site-tagged names, malformed names recovered via sanitization,
and irrecoverable names that fall back to ``media_type="unknown"``.
- ``ParsedRelease`` — derived properties (``is_season_pack``,
``show_folder_name``, ``season_folder_name``, ``episode_filename``,
``movie_folder_name``, ``movie_filename``) including the Windows-forbidden
character sanitizer and the episode-stripping helper for season folders.
These tests exercise the parser end-to-end through real YAML knowledge
files; no monkeypatching of the knowledge layer is performed.
"""
from __future__ import annotations
import pytest
from alfred.domain.release.services import parse_release
from alfred.domain.release.value_objects import ParsedRelease
class TestParseTVEpisode:
"""Single-episode TV releases."""
def test_basic_tv_episode(self):
r = parse_release("Oz.S03E01.1080p.WEBRip.x265-KONTRAST")
assert r.title == "Oz"
assert r.season == 3
assert r.episode == 1
assert r.episode_end is None
assert r.quality == "1080p"
assert r.source == "WEBRip"
assert r.codec == "x265"
assert r.group == "KONTRAST"
assert r.media_type == "tv_show"
assert r.parse_path == "direct"
assert r.is_season_pack is False
def test_multi_episode(self):
r = parse_release("Archer.S14E09E10.1080p.WEB.x265-GRP")
assert r.season == 14
assert r.episode == 9
assert r.episode_end == 10
def test_nxnn_alt_form(self):
# Alt season/episode form: 1x05 instead of S01E05.
r = parse_release("Some.Show.1x05.720p.HDTV.x264-GRP")
assert r.season == 1
assert r.episode == 5
assert r.episode_end is None
assert r.media_type == "tv_show"
def test_nxnnxnn_multi_episode_alt_form(self):
r = parse_release("Some.Show.2x07x08.1080p.WEB.x265-GRP")
assert r.season == 2
assert r.episode == 7
assert r.episode_end == 8
def test_season_pack(self):
r = parse_release("Oz.S03.1080p.WEBRip.x265-KONTRAST")
assert r.season == 3
assert r.episode is None
assert r.is_season_pack is True
assert r.media_type == "tv_show"
class TestParseMovie:
"""Movie releases."""
def test_basic_movie(self):
r = parse_release("Inception.2010.1080p.BluRay.x264-GROUP")
assert r.title == "Inception"
assert r.year == 2010
assert r.season is None
assert r.episode is None
assert r.quality == "1080p"
assert r.source == "BluRay"
assert r.codec == "x264"
assert r.group == "GROUP"
assert r.media_type == "movie"
def test_movie_multi_word_title(self):
r = parse_release("The.Dark.Knight.2008.2160p.UHD.BluRay.x265-TERMINAL")
assert r.title == "The.Dark.Knight"
assert r.year == 2008
assert r.quality == "2160p"
def test_movie_without_year_still_movie_if_tech_present(self):
r = parse_release("UntitledFilm.1080p.WEBRip.x264-GRP")
# No season, no year, but tech markers → still movie
assert r.media_type == "movie"
assert r.year is None
class TestParseEdgeCases:
"""Site tags, malformed names, and unknown media types."""
def test_site_tag_prefix_stripped(self):
r = parse_release("[ OxTorrent.vc ] The.Title.S01E01.1080p.WEB.x265-GRP")
assert r.site_tag == "OxTorrent.vc"
assert r.parse_path == "sanitized"
assert r.season == 1
assert r.episode == 1
def test_site_tag_suffix_stripped(self):
r = parse_release("The.Title.S01E01.1080p.WEB.x265-NTb[TGx]")
assert r.site_tag == "TGx"
# Suffix-tagged names are well-formed (only [] in tag → after strip clean)
assert r.season == 1
def test_irrecoverably_malformed(self):
# @ is a forbidden char and not stripped by _sanitize → stays malformed
r = parse_release("foo@bar@baz")
assert r.media_type == "unknown"
assert r.parse_path == "ai"
assert r.group == "UNKNOWN"
def test_empty_unknown_when_no_evidence(self):
r = parse_release("Some.Random.Title")
# No season, no year, no tech markers → unknown
assert r.media_type == "unknown"
def test_missing_group_defaults_to_unknown(self):
r = parse_release("Movie.2020.1080p.WEBRip.x265")
# No "-GROUP" suffix → group = "UNKNOWN"
assert r.group == "UNKNOWN"
def test_yts_bracket_release(self):
# YTS-style: spaces, parens for year, multiple bracketed tech tokens.
# The tokenizer must handle ' ', '(', ')', '[', ']' transparently.
r = parse_release("The Father (2020) [1080p] [WEBRip] [5.1] [YTS.MX]")
assert r.title == "The.Father"
assert r.year == 2020
assert r.quality == "1080p"
assert r.source == "WEBRip"
assert r.audio_channels == "5.1"
assert r.media_type == "movie"
def test_human_friendly_spaces(self):
# Spaces as separators (no brackets).
r = parse_release("Inception 2010 1080p BluRay x264-GROUP")
assert r.title == "Inception"
assert r.year == 2010
assert r.quality == "1080p"
assert r.codec == "x264"
assert r.group == "GROUP"
assert r.media_type == "movie"
def test_underscore_separators(self):
# Old usenet style: underscores between tokens.
r = parse_release("Some_Show_S01E01_1080p_WEB_x265-GRP")
assert r.season == 1
assert r.episode == 1
assert r.quality == "1080p"
assert r.group == "GRP"
class TestParseAudioVideoEdition:
"""Audio, video metadata, edition extraction."""
def test_audio_codec_and_channels(self):
r = parse_release("Movie.2020.1080p.BluRay.DTS.5.1.x264-GRP")
assert r.audio_channels == "5.1"
def test_language_token(self):
r = parse_release("Movie.2020.MULTI.1080p.WEBRip.x265-GRP")
assert "MULTI" in r.languages
def test_edition_token(self):
r = parse_release("Movie.2020.UNRATED.1080p.BluRay.x264-GRP")
assert r.edition == "UNRATED"
class TestParsedReleaseFolderNames:
"""Helpers that build filesystem-safe folder/filenames."""
def _parsed_tv(self) -> ParsedRelease:
return parse_release("Oz.S03E01.1080p.WEBRip.x265-KONTRAST")
def _parsed_movie(self) -> ParsedRelease:
return parse_release("Inception.2010.1080p.BluRay.x264-GROUP")
def test_show_folder_name(self):
r = self._parsed_tv()
assert r.show_folder_name("Oz", 1997) == "Oz.1997.1080p.WEBRip.x265-KONTRAST"
def test_show_folder_name_strips_windows_chars(self):
r = self._parsed_tv()
# Colons and question marks are Windows-forbidden — must be stripped.
result = r.show_folder_name("Oz: The Series?", 1997)
assert ":" not in result
assert "?" not in result
def test_season_folder_name_strips_episode(self):
r = self._parsed_tv()
# Episode token Exx is stripped, Sxx stays
result = r.season_folder_name()
assert "S03" in result
assert "E01" not in result
def test_season_folder_name_multi_episode(self):
r = parse_release("Archer.S14E09E10E11.1080p.WEB.x265-GRP")
result = r.season_folder_name()
assert "S14" in result
assert "E09" not in result
assert "E10" not in result
assert "E11" not in result
def test_episode_filename_with_title(self):
r = self._parsed_tv()
fname = r.episode_filename("The Routine", "mkv")
assert fname.endswith(".mkv")
assert "S03E01" in fname
assert "The.Routine" in fname
assert "KONTRAST" in fname
def test_episode_filename_without_title(self):
r = self._parsed_tv()
fname = r.episode_filename(None, "mkv")
assert fname.endswith(".mkv")
assert "S03E01" in fname
def test_episode_filename_strips_ext_dot(self):
r = self._parsed_tv()
# Whether the caller passes "mkv" or ".mkv", we get a single dot.
a = r.episode_filename(None, "mkv")
b = r.episode_filename(None, ".mkv")
assert a == b
assert "..mkv" not in a
def test_movie_folder_name(self):
r = self._parsed_movie()
assert (
r.movie_folder_name("Inception", 2010)
== "Inception.2010.1080p.BluRay.x264-GROUP"
)
def test_movie_filename(self):
r = self._parsed_movie()
assert (
r.movie_filename("Inception", 2010, "mkv")
== "Inception.2010.1080p.BluRay.x264-GROUP.mkv"
)
class TestParsedReleaseInvariants:
"""Structural invariants of ParsedRelease."""
def test_raw_is_preserved(self):
raw = "Oz.S03E01.1080p.WEBRip.x265-KONTRAST"
r = parse_release(raw)
assert r.raw == raw
def test_languages_defaults_to_empty_list_not_none(self):
r = parse_release("Movie.2020.1080p.BluRay.x264-GRP")
# __post_init__ ensures languages is a list, never None
assert r.languages == []
def test_tech_string_joined(self):
r = parse_release("Movie.2020.1080p.BluRay.x264-GRP")
assert r.tech_string == "1080p.BluRay.x264"
def test_tech_string_partial(self):
# Codec-only release (no quality/source): tech_string == codec
r = parse_release("Show.S01E01.x265-GRP")
assert r.tech_string == "x265"
assert r.codec == "x265"
assert r.quality is None
assert r.source is None
@pytest.mark.parametrize(
"name,expected_type",
[
("Show.S01E01.1080p.WEB.x265-GRP", "tv_show"),
("Movie.2020.1080p.BluRay.x264-GRP", "movie"),
("Random.Title.With.Nothing", "unknown"),
],
)
def test_media_type_inference(self, name, expected_type):
assert parse_release(name).media_type == expected_type