refactor(subtitles): drop dead scanner module

SubtitleScanner was an earlier iteration superseded by SubtitleIdentifier
and never imported in production code (only by its own tests). Removing
both keeps the bounded context clean and shrinks the surface.
This commit is contained in:
2026-05-19 14:17:15 +02:00
parent f6eef59fca
commit eb8995cfc3
2 changed files with 0 additions and 439 deletions
-207
View File
@@ -1,207 +0,0 @@
"""SubtitleScanner — inspects local subtitle files and filters them per user preferences.
Given a video file path, the scanner:
1. Looks for subtitle files in the same directory as the video.
2. Optionally also inspects a Subs/ subfolder adjacent to the video.
3. Classifies each file (language, SDH, forced) from its filename, delegating
all token knowledge to SubtitleKnowledgeBase (which itself merges
LanguageRegistry + subtitle-specific tokens from subtitles.yaml).
4. Filters according to SubtitlePreferences (languages, min_size_kb, keep_sdh,
keep_forced).
5. Returns a list of SubtitleCandidate — one per file that passes the filter,
with the destination filename already computed.
Filename classification heuristics
-----------------------------------
We parse the stem of each subtitle file looking for known patterns:
fre.srt → lang=fre, sdh=False, forced=False
fre.sdh.srt → lang=fre, sdh=True
fre.forced.srt → lang=fre, forced=True
Breaking.Bad.S01E01.French.srt → lang=fre (alias match via LanguageRegistry)
Breaking.Bad.S01E01.VOSTFR.srt → lang=fre (subtitle-specific token)
ISO 639-2/B codes are used throughout (matching the project-wide canonical form
from iso_languages.yaml — what ffprobe emits).
Output naming convention (matches SubtitlePreferences docstring):
{lang}.srt
{lang}.sdh.srt
{lang}.forced.srt
"""
import logging
import re
from dataclasses import dataclass
from pathlib import Path
from .knowledge.base import SubtitleKnowledgeBase
from .value_objects import SubtitleType
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Separator pattern used to tokenize a filename stem: any run of dots,
# whitespace, underscores or hyphens (e.g. "fre-forced" -> "fre", "forced").
_TOKEN_SPLIT = re.compile(r"[\.\s_\-]+")
@dataclass
class SubtitleCandidate:
    """Describe one subtitle file that was accepted by the filters."""

    # Location of the subtitle file on disk.
    source_path: Path
    # ISO 639-2/B code, e.g. "fre".
    language: str
    # True when the file was recognised as an SDH track.
    is_sdh: bool
    # True when the file was recognised as a forced track.
    is_forced: bool
    # File extension, e.g. ".srt" (leading dots are ignored when naming).
    extension: str

    @property
    def destination_name(self) -> str:
        """
        Build the target filename following the naming convention:
            {lang}.srt
            {lang}.sdh.srt
            {lang}.forced.srt

        Only one marker is ever emitted: when both ``is_sdh`` and
        ``is_forced`` are set, the SDH marker takes precedence.
        """
        if self.is_sdh:
            marker = ".sdh"
        elif self.is_forced:
            marker = ".forced"
        else:
            marker = ""
        bare_extension = self.extension.lstrip(".")
        return f"{self.language}{marker}.{bare_extension}"
# Shared knowledge-base instance. It stays None until _kb() is first called,
# so importing this module does not trigger any YAML loading.
_KB: SubtitleKnowledgeBase | None = None


def _kb() -> SubtitleKnowledgeBase:
    """Return the module-wide SubtitleKnowledgeBase, creating it on first use."""
    global _KB  # noqa: PLW0603 — intentional lazy module-level cache
    if _KB is not None:
        return _KB
    _KB = SubtitleKnowledgeBase()
    return _KB
def _classify(path: Path) -> tuple[str | None, bool, bool]:
    """
    Derive ``(language_code, is_sdh, is_forced)`` from a subtitle filename.

    The lowercased stem is split into tokens, which are examined left to right.
    Until a language has been found, each token is first tried as a language
    token; the first match fixes ``language_code`` and that token is not
    inspected any further. Every other token is checked for an SDH or forced
    marker.

    ``language_code`` is the ``code`` attribute of the matched language
    (e.g. ``"fre"``), or ``None`` when no token names a language.
    """
    knowledge = _kb()

    code: str | None = None
    sdh = False
    forced = False

    # filter(None, ...) discards the empty strings produced by leading or
    # trailing separators in the stem.
    for word in filter(None, _TOKEN_SPLIT.split(path.stem.lower())):
        if code is None:
            match = knowledge.language_for_token(word)
            if match is not None:
                code = match.code
                continue

        kind = knowledge.type_for_token(word)
        if kind is SubtitleType.SDH:
            sdh = True
        elif kind is SubtitleType.FORCED:
            forced = True

    return code, sdh, forced
class SubtitleScanner:
    """
    Locate subtitle files stored alongside a video and keep those that match
    the configured preferences.

    Usage:
        scanner = SubtitleScanner(
            languages=["fre", "eng"], min_size_kb=0, keep_sdh=True, keep_forced=True
        )
        candidates = scanner.scan(video_path)
        # Each candidate has .source_path and .destination_name
    """

    def __init__(
        self, languages: list[str], min_size_kb: int, keep_sdh: bool, keep_forced: bool
    ):
        """
        Store the filtering preferences.

        Args:
            languages: Language codes to keep; stored lowercased and compared
                against the code produced by ``_classify``.
            min_size_kb: Files strictly smaller than this many KB are dropped.
            keep_sdh: Whether files flagged as SDH are kept.
            keep_forced: Whether files flagged as forced are kept.
        """
        self.languages = [code.lower() for code in languages]
        self.min_size_kb = min_size_kb
        self.keep_sdh = keep_sdh
        self.keep_forced = keep_forced
        self._kb = _kb()
        # Lowercased so the comparison with a lowercased path suffix is
        # case-insensitive.
        self._subtitle_extensions = {
            ext.lower() for ext in self._kb.known_extensions()
        }

    def scan(self, video_path: Path) -> list[SubtitleCandidate]:
        """
        Collect every subtitle candidate near the video that passes the filters.

        Looks in:
          - the directory containing the video (flat siblings)
          - a ``Subs/`` subfolder of that directory, when it exists

        Within each directory, entries are visited in sorted order.
        """
        video_dir = video_path.parent
        roots = [video_dir]

        subs_dir = video_dir / "Subs"
        if subs_dir.is_dir():
            roots.append(subs_dir)
            logger.debug(f"SubtitleScanner: found Subs/ folder at {subs_dir}")

        candidates: list[SubtitleCandidate] = []
        for root in roots:
            for entry in sorted(root.iterdir()):
                # Only regular files with a known subtitle extension qualify.
                if (
                    entry.is_file()
                    and entry.suffix.lower() in self._subtitle_extensions
                ):
                    accepted = self._evaluate(entry)
                    if accepted is not None:
                        candidates.append(accepted)

        logger.info(
            f"SubtitleScanner: {len(candidates)} candidate(s) found for {video_path.name}"
        )
        return candidates

    def _evaluate(self, path: Path) -> SubtitleCandidate | None:
        """Run one subtitle file through every filter; None means "drop it"."""
        # The size check comes first so undersized files are never classified.
        size_kb = path.stat().st_size / 1024
        if self.min_size_kb > size_kb:
            logger.debug(
                f"SubtitleScanner: skip {path.name} (too small: {size_kb:.1f} KB)"
            )
            return None

        language, is_sdh, is_forced = _classify(path)
        if not self._passes_preferences(path, language, is_sdh, is_forced):
            return None

        return SubtitleCandidate(
            source_path=path,
            language=language,
            is_sdh=is_sdh,
            is_forced=is_forced,
            extension=path.suffix.lower(),
        )

    def _passes_preferences(
        self, path: Path, language: str | None, is_sdh: bool, is_forced: bool
    ) -> bool:
        """Check language, SDH and forced preferences, logging the first rejection."""
        if language is None:
            logger.debug(f"SubtitleScanner: skip {path.name} (language unknown)")
            return False

        if language not in self.languages:
            logger.debug(
                f"SubtitleScanner: skip {path.name} (language '{language}' not in prefs)"
            )
            return False

        if is_sdh and not self.keep_sdh:
            logger.debug(f"SubtitleScanner: skip {path.name} (SDH not wanted)")
            return False

        if is_forced and not self.keep_forced:
            logger.debug(f"SubtitleScanner: skip {path.name} (forced not wanted)")
            return False

        return True
-232
View File
@@ -1,232 +0,0 @@
"""Tests for SubtitleScanner and _classify helper."""
from pathlib import Path
from alfred.domain.subtitles.scanner import (
SubtitleCandidate,
SubtitleScanner,
_classify,
)
# ---------------------------------------------------------------------------
# _classify — unit tests for the filename parser
# ---------------------------------------------------------------------------
class TestClassify:
    """Unit tests for the filename parser ``_classify``."""

    @staticmethod
    def _classify_file(directory, filename):
        """Create an empty file named *filename* in *directory* and classify it."""
        target = directory / filename
        target.write_text("")
        return _classify(target)

    def test_iso_lang_code_639_1_alias(self, tmp_path):
        # ``fr`` is an alias of the canonical ISO 639-2/B code ``fre``.
        code, sdh, forced = self._classify_file(tmp_path, "fr.srt")
        assert code == "fre"
        assert not sdh
        assert not forced

    def test_english_keyword(self, tmp_path):
        code, _, _ = self._classify_file(tmp_path, "english.srt")
        assert code == "eng"

    def test_french_keyword(self, tmp_path):
        code, _, _ = self._classify_file(tmp_path, "Show.S01E01.French.srt")
        assert code == "fre"

    def test_vostfr_is_french(self, tmp_path):
        code, _, _ = self._classify_file(tmp_path, "Show.S01E01.VOSTFR.srt")
        assert code == "fre"

    def test_sdh_token(self, tmp_path):
        code, sdh, _ = self._classify_file(tmp_path, "fre.sdh.srt")
        assert code == "fre"
        assert sdh

    def test_hi_no_longer_marks_sdh(self, tmp_path):
        # ``hi`` is the ISO 639-1 alias for Hindi, so it must no longer flag a
        # file as SDH (regression test for the former SDH/Hindi token
        # collision). SDH is signalled by ``sdh`` / ``cc`` / ``hearing``.
        code, sdh, _ = self._classify_file(tmp_path, "en.hi.srt")
        assert code == "eng"
        assert not sdh

    def test_forced_token(self, tmp_path):
        _, _, forced = self._classify_file(tmp_path, "fre.forced.srt")
        assert forced

    def test_unknown_language_returns_none(self, tmp_path):
        code, _, _ = self._classify_file(tmp_path, "Show.S01E01.720p.srt")
        assert code is None

    def test_dot_separator(self, tmp_path):
        code, sdh, _ = self._classify_file(tmp_path, "fre.sdh.srt")
        assert code == "fre"
        assert sdh

    def test_hyphen_separator(self, tmp_path):
        code, _, forced = self._classify_file(tmp_path, "fre-forced.srt")
        assert code == "fre"
        assert forced
# ---------------------------------------------------------------------------
# SubtitleCandidate.destination_name
# ---------------------------------------------------------------------------
class TestSubtitleCandidateDestinationName:
    """Checks the filename produced by ``SubtitleCandidate.destination_name``."""

    def _make(self, lang="fre", is_sdh=False, is_forced=False, ext=".srt", path=None):
        """Build a candidate, defaulting to a plain French ``.srt`` file."""
        source = path or Path("/fake/fre.srt")
        return SubtitleCandidate(
            source_path=source,
            language=lang,
            is_sdh=is_sdh,
            is_forced=is_forced,
            extension=ext,
        )

    def test_standard(self):
        candidate = self._make()
        assert candidate.destination_name == "fre.srt"

    def test_sdh(self):
        candidate = self._make(is_sdh=True)
        assert candidate.destination_name == "fre.sdh.srt"

    def test_forced(self):
        candidate = self._make(is_forced=True)
        assert candidate.destination_name == "fre.forced.srt"

    def test_ass_extension(self):
        candidate = self._make(ext=".ass")
        assert candidate.destination_name == "fre.ass"

    def test_english_standard(self):
        candidate = self._make(lang="eng")
        assert candidate.destination_name == "eng.srt"
# ---------------------------------------------------------------------------
# SubtitleScanner — integration with real filesystem
# ---------------------------------------------------------------------------
class TestSubtitleScanner:
    """Integration tests running ``SubtitleScanner`` against a real temp directory."""

    def _scanner(self, languages=None, min_size_kb=0, keep_sdh=True, keep_forced=True):
        """Build a scanner; by default it accepts French and English, any size."""
        wanted = languages or ["fre", "eng"]
        return SubtitleScanner(
            languages=wanted,
            min_size_kb=min_size_kb,
            keep_sdh=keep_sdh,
            keep_forced=keep_forced,
        )

    def _video(self, tmp_path):
        """Create a dummy video file the scanner can be pointed at."""
        movie = tmp_path / "Movie.mkv"
        movie.write_bytes(b"video")
        return movie

    def test_finds_adjacent_subtitle(self, tmp_path):
        movie = self._video(tmp_path)
        tmp_path.joinpath("fre.srt").write_text("subtitle content")

        found = self._scanner().scan(movie)

        assert [c.language for c in found] == ["fre"]

    def test_finds_adjacent_subtitle_legacy_639_1(self, tmp_path):
        # Existing media libraries: ``fr.srt`` must still be recognised as
        # French and reported under the canonical code ``fre`` — this covers
        # libraries written before the ISO 639-2/B migration.
        movie = self._video(tmp_path)
        tmp_path.joinpath("fr.srt").write_text("subtitle content")

        found = self._scanner().scan(movie)

        assert [c.language for c in found] == ["fre"]

    def test_finds_multiple_languages(self, tmp_path):
        movie = self._video(tmp_path)
        tmp_path.joinpath("fre.srt").write_text("fr subtitle")
        tmp_path.joinpath("eng.srt").write_text("en subtitle")

        found = self._scanner().scan(movie)

        assert {"eng", "fre"} == {c.language for c in found}

    def test_scans_subs_subfolder(self, tmp_path):
        movie = self._video(tmp_path)
        subs_dir = tmp_path / "Subs"
        subs_dir.mkdir()
        subs_dir.joinpath("fre.srt").write_text("subtitle")

        found = self._scanner().scan(movie)

        assert "fre" in [c.language for c in found]

    def test_filters_unknown_language(self, tmp_path):
        movie = self._video(tmp_path)
        tmp_path.joinpath("unknown.srt").write_text("subtitle")

        found = self._scanner().scan(movie)

        assert found == []

    def test_filters_wrong_language(self, tmp_path):
        movie = self._video(tmp_path)
        tmp_path.joinpath("ger.srt").write_text("german subtitle")

        found = self._scanner(languages=["fre"]).scan(movie)

        assert found == []

    def test_filters_too_small_file(self, tmp_path):
        movie = self._video(tmp_path)
        # A single byte is far below any min_size_kb threshold.
        tmp_path.joinpath("fre.srt").write_bytes(b"x")

        found = self._scanner(min_size_kb=10).scan(movie)

        assert found == []

    def test_filters_sdh_when_not_wanted(self, tmp_path):
        movie = self._video(tmp_path)
        tmp_path.joinpath("fre.sdh.srt").write_text("sdh subtitle")

        found = self._scanner(keep_sdh=False).scan(movie)

        assert found == []

    def test_filters_forced_when_not_wanted(self, tmp_path):
        movie = self._video(tmp_path)
        tmp_path.joinpath("fre.forced.srt").write_text("forced subtitle")

        found = self._scanner(keep_forced=False).scan(movie)

        assert found == []

    def test_keeps_sdh_when_wanted(self, tmp_path):
        movie = self._video(tmp_path)
        tmp_path.joinpath("fre.sdh.srt").write_text("sdh subtitle")

        found = self._scanner(keep_sdh=True).scan(movie)

        assert len(found) == 1
        assert found[0].is_sdh

    def test_ignores_non_subtitle_files(self, tmp_path):
        movie = self._video(tmp_path)
        tmp_path.joinpath("fre.nfo").write_text("nfo file")
        tmp_path.joinpath("fre.jpg").write_bytes(b"image")

        found = self._scanner().scan(movie)

        assert found == []

    def test_returns_empty_when_no_subtitles(self, tmp_path):
        movie = self._video(tmp_path)

        found = self._scanner().scan(movie)

        assert found == []