chore: sprint cleanup — language unification, parser unification, fossils removal
Several weeks of work accumulated without being committed. Grouped here for clarity; see CHANGELOG.md [Unreleased] for the user-facing summary.

Highlights
----------

P1 #2 — ISO 639-2/B canonical migration
- New Language VO + LanguageRegistry (alfred/domain/shared/knowledge/).
- iso_languages.yaml as single source of truth for language codes.
- SubtitleKnowledgeBase now delegates lookup to LanguageRegistry; subtitles.yaml only declares subtitle-specific tokens (vostfr, vf, vff, …).
- SubtitlePreferences default → ["fre", "eng"]; subtitle filenames written as {iso639_2b}.srt (legacy fr.srt still read via alias).
- Scanner: dropped _LANG_KEYWORDS / _SDH_TOKENS / _FORCED_TOKENS / SUBTITLE_EXTENSIONS hardcoded dicts.
- Fixed: 'hi' token no longer marks SDH (conflicted with Hindi alias).
- Added settings.min_movie_size_bytes (was a module constant).

P1 #3 — Release parser unification + data-driven tokenizer
- parse_release() is now the single source of truth for release-name parsing.
- alfred/knowledge/release/separators.yaml declares the token separators used by the tokenizer (., space, [, ], (, ), _). New conventions can be added without code changes.
- Tokenizer now splits on any configured separator instead of name.split('.'). Releases like 'The Father (2020) [1080p] [WEBRip] [5.1] [YTS.MX]' parse via the direct path without sanitization fallback.
- Site-tag extraction always runs first; well-formedness only rejects truly forbidden chars.
- _parse_season_episode() extended with NxNN / NxNNxNN alt forms.
- Removed dead helpers: _sanitize, _normalize.

Domain cleanup
- Deleted fossil services with zero production callers:
    alfred/domain/movies/services.py
    alfred/domain/tv_shows/services.py
    alfred/domain/subtitles/services.py (replaced by subtitles/services/ package)
    alfred/domain/subtitles/repositories.py
- Split monolithic subtitle services into a package (identifier, matcher, placer, pattern_detector, utils) + dedicated knowledge/ package.
- MediaInfo split into dedicated package (alfred/domain/shared/media/: audio, video, subtitle, info, matching).

Persistence cleanup
- Removed dead JSON repositories (movie/subtitle/tvshow_repository.py).

Tests
- Major expansion of the test suite organized to mirror the source tree.
- Removed obsolete *_edge_cases test files superseded by structured tests.
- Suite: 990 passed, 8 skipped.

Misc
- .gitignore: exclude env_backup/ and *.bak.
- Adjustments across agent/llm, app.py, application/filesystem, and infrastructure/filesystem to align with the new domain layout.
This commit is contained in:
@@ -7,7 +7,7 @@ import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from ...shared.value_objects import ImdbId
|
||||
from ..entities import MediaSubtitleMetadata, SubtitleTrack
|
||||
from ..entities import MediaSubtitleMetadata, SubtitleCandidate
|
||||
from ..knowledge.base import SubtitleKnowledgeBase
|
||||
from ..value_objects import ScanStrategy, SubtitlePattern, SubtitleType
|
||||
|
||||
@@ -91,7 +91,7 @@ class SubtitleIdentifier:
|
||||
# Embedded tracks — ffprobe
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _scan_embedded(self, video_path: Path) -> list[SubtitleTrack]:
|
||||
def _scan_embedded(self, video_path: Path) -> list[SubtitleCandidate]:
|
||||
if not video_path.exists():
|
||||
return []
|
||||
try:
|
||||
@@ -139,7 +139,7 @@ class SubtitleIdentifier:
|
||||
stype = SubtitleType.STANDARD
|
||||
|
||||
tracks.append(
|
||||
SubtitleTrack(
|
||||
SubtitleCandidate(
|
||||
language=lang,
|
||||
format=None,
|
||||
subtitle_type=stype,
|
||||
@@ -159,7 +159,7 @@ class SubtitleIdentifier:
|
||||
|
||||
def _scan_external(
|
||||
self, video_path: Path, pattern: SubtitlePattern
|
||||
) -> list[SubtitleTrack]:
|
||||
) -> list[SubtitleCandidate]:
|
||||
strategy = pattern.scan_strategy
|
||||
episode_stem: str | None = None
|
||||
|
||||
@@ -238,7 +238,7 @@ class SubtitleIdentifier:
|
||||
paths: list[Path],
|
||||
pattern: SubtitlePattern,
|
||||
episode_stem: str | None = None,
|
||||
) -> list[SubtitleTrack]:
|
||||
) -> list[SubtitleCandidate]:
|
||||
tracks = []
|
||||
for path in paths:
|
||||
track = self._classify_single(path, episode_stem=episode_stem)
|
||||
@@ -253,7 +253,7 @@ class SubtitleIdentifier:
|
||||
|
||||
def _classify_single(
|
||||
self, path: Path, episode_stem: str | None = None
|
||||
) -> SubtitleTrack:
|
||||
) -> SubtitleCandidate:
|
||||
fmt = self.kb.format_for_extension(path.suffix)
|
||||
tokens = (
|
||||
_tokenize_suffix(path.stem, episode_stem)
|
||||
@@ -290,7 +290,7 @@ class SubtitleIdentifier:
|
||||
size_kb = path.stat().st_size / 1024 if path.exists() else None
|
||||
entry_count = _count_entries(path) if path.exists() else None
|
||||
|
||||
return SubtitleTrack(
|
||||
return SubtitleCandidate(
|
||||
language=language,
|
||||
format=fmt,
|
||||
subtitle_type=subtitle_type,
|
||||
@@ -302,7 +302,7 @@ class SubtitleIdentifier:
|
||||
raw_tokens=tokens,
|
||||
)
|
||||
|
||||
def _disambiguate_by_size(self, tracks: list[SubtitleTrack]) -> list[SubtitleTrack]:
|
||||
def _disambiguate_by_size(self, tracks: list[SubtitleCandidate]) -> list[SubtitleCandidate]:
|
||||
"""
|
||||
When multiple tracks share the same language and type is UNKNOWN/STANDARD,
|
||||
the one with the most entries (lines) is SDH, the smallest is FORCED if
|
||||
@@ -312,7 +312,7 @@ class SubtitleIdentifier:
|
||||
"""
|
||||
|
||||
# Group by language code
|
||||
lang_groups: dict[str, list[SubtitleTrack]] = {}
|
||||
lang_groups: dict[str, list[SubtitleCandidate]] = {}
|
||||
for track in tracks:
|
||||
key = track.language.code if track.language else "__unknown__"
|
||||
lang_groups.setdefault(key, []).append(track)
|
||||
@@ -341,6 +341,6 @@ class SubtitleIdentifier:
|
||||
|
||||
return result
|
||||
|
||||
def _set_type(self, track: SubtitleTrack, stype: SubtitleType) -> None:
|
||||
def _set_type(self, track: SubtitleCandidate, stype: SubtitleType) -> None:
|
||||
"""Mutate track type in-place."""
|
||||
track.subtitle_type = stype
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import logging
|
||||
|
||||
from ..entities import SubtitleTrack
|
||||
from ..entities import SubtitleCandidate
|
||||
from ..value_objects import SubtitleMatchingRules
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -10,7 +10,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class SubtitleMatcher:
|
||||
"""
|
||||
Filters a list of SubtitleTrack against effective SubtitleMatchingRules.
|
||||
Filters a list of SubtitleCandidate against effective SubtitleMatchingRules.
|
||||
|
||||
Returns matched tracks (pass all filters, confidence >= min_confidence)
|
||||
and unresolved tracks (need user clarification).
|
||||
@@ -21,14 +21,14 @@ class SubtitleMatcher:
|
||||
|
||||
def match(
|
||||
self,
|
||||
tracks: list[SubtitleTrack],
|
||||
tracks: list[SubtitleCandidate],
|
||||
rules: SubtitleMatchingRules,
|
||||
) -> tuple[list[SubtitleTrack], list[SubtitleTrack]]:
|
||||
) -> tuple[list[SubtitleCandidate], list[SubtitleCandidate]]:
|
||||
"""
|
||||
Returns (matched, unresolved).
|
||||
"""
|
||||
matched: list[SubtitleTrack] = []
|
||||
unresolved: list[SubtitleTrack] = []
|
||||
matched: list[SubtitleCandidate] = []
|
||||
unresolved: list[SubtitleCandidate] = []
|
||||
|
||||
for track in tracks:
|
||||
if track.is_embedded:
|
||||
@@ -51,7 +51,7 @@ class SubtitleMatcher:
|
||||
return matched, unresolved
|
||||
|
||||
def _passes_filters(
|
||||
self, track: SubtitleTrack, rules: SubtitleMatchingRules
|
||||
self, track: SubtitleCandidate, rules: SubtitleMatchingRules
|
||||
) -> bool:
|
||||
# Language filter
|
||||
if rules.preferred_languages:
|
||||
@@ -76,14 +76,14 @@ class SubtitleMatcher:
|
||||
|
||||
def _resolve_conflicts(
|
||||
self,
|
||||
tracks: list[SubtitleTrack],
|
||||
tracks: list[SubtitleCandidate],
|
||||
rules: SubtitleMatchingRules,
|
||||
) -> list[SubtitleTrack]:
|
||||
) -> list[SubtitleCandidate]:
|
||||
"""
|
||||
When multiple tracks have same language + type, keep only the best one
|
||||
according to format_priority. If no format_priority applies, keep the first.
|
||||
"""
|
||||
seen: dict[tuple, SubtitleTrack] = {}
|
||||
seen: dict[tuple, SubtitleCandidate] = {}
|
||||
|
||||
for track in tracks:
|
||||
lang = track.language.code if track.language else None
|
||||
@@ -106,8 +106,8 @@ class SubtitleMatcher:
|
||||
|
||||
def _prefer(
|
||||
self,
|
||||
candidate: SubtitleTrack,
|
||||
existing: SubtitleTrack,
|
||||
candidate: SubtitleCandidate,
|
||||
existing: SubtitleCandidate,
|
||||
format_priority: list[str],
|
||||
) -> bool:
|
||||
"""Return True if candidate is preferable to existing."""
|
||||
|
||||
@@ -5,12 +5,12 @@ import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from ..entities import SubtitleTrack
|
||||
from ..entities import SubtitleCandidate
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _build_dest_name(track: SubtitleTrack, video_stem: str) -> str:
|
||||
def _build_dest_name(track: SubtitleCandidate, video_stem: str) -> str:
|
||||
"""
|
||||
Build the destination filename for a subtitle track.
|
||||
|
||||
@@ -42,7 +42,7 @@ class PlacedTrack:
|
||||
@dataclass
|
||||
class PlaceResult:
|
||||
placed: list[PlacedTrack]
|
||||
skipped: list[tuple[SubtitleTrack, str]] # (track, reason)
|
||||
skipped: list[tuple[SubtitleCandidate, str]] # (track, reason)
|
||||
|
||||
@property
|
||||
def placed_count(self) -> int:
|
||||
@@ -55,7 +55,7 @@ class PlaceResult:
|
||||
|
||||
class SubtitlePlacer:
|
||||
"""
|
||||
Hard-links matched SubtitleTrack files next to a destination video.
|
||||
Hard-links matched SubtitleCandidate files next to a destination video.
|
||||
|
||||
Uses the same hard-link strategy as FileManager.copy_file:
|
||||
instant, no data duplication, qBittorrent keeps seeding.
|
||||
@@ -65,11 +65,11 @@ class SubtitlePlacer:
|
||||
|
||||
def place(
|
||||
self,
|
||||
tracks: list[SubtitleTrack],
|
||||
tracks: list[SubtitleCandidate],
|
||||
destination_video: Path,
|
||||
) -> PlaceResult:
|
||||
placed: list[PlacedTrack] = []
|
||||
skipped: list[tuple[SubtitleTrack, str]] = []
|
||||
skipped: list[tuple[SubtitleCandidate, str]] = []
|
||||
|
||||
dest_dir = destination_video.parent
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
"""Subtitle service utilities."""
|
||||
|
||||
from ..entities import SubtitleTrack
|
||||
from ..entities import SubtitleCandidate
|
||||
|
||||
|
||||
def available_subtitles(tracks: list[SubtitleTrack]) -> list[SubtitleTrack]:
|
||||
def available_subtitles(tracks: list[SubtitleCandidate]) -> list[SubtitleCandidate]:
|
||||
"""
|
||||
Return the distinct subtitle tracks available, deduped by (language, type).
|
||||
|
||||
@@ -11,7 +11,7 @@ def available_subtitles(tracks: list[SubtitleTrack]) -> list[SubtitleTrack]:
|
||||
preferences — e.g. eng, eng.sdh, fra all show up as separate entries.
|
||||
"""
|
||||
seen: set[tuple] = set()
|
||||
result: list[SubtitleTrack] = []
|
||||
result: list[SubtitleCandidate] = []
|
||||
for track in tracks:
|
||||
lang = track.language.code if track.language else None
|
||||
key = (lang, track.subtitle_type)
|
||||
|
||||
Reference in New Issue
Block a user