Files
alfred/alfred/infrastructure/knowledge/subtitles/base.py
T
francwa ced72547f7 refactor(knowledge): extract YAML loaders from domain to infrastructure
The domain layer no longer reads YAML files. All knowledge loaders move
from `alfred/domain/*/knowledge/` to `alfred/infrastructure/knowledge/`:

  domain/release/knowledge.py
    → infrastructure/knowledge/release.py
  domain/shared/knowledge/language_registry.py
    → infrastructure/knowledge/language_registry.py
  domain/subtitles/knowledge/{loader,base}.py
    → infrastructure/knowledge/subtitles/{loader,base}.py

Callers in domain/release/{services,value_objects}.py,
domain/subtitles/{aggregates,services/*}.py, and
application/filesystem/manage_subtitles.py updated to absolute imports.
Re-exports of KnowledgeLoader/SubtitleKnowledgeBase from
domain/subtitles/__init__.py dropped (no shim per project convention).
Tests follow the moved targets.
2026-05-19 14:35:18 +02:00

184 lines
6.6 KiB
Python

"""SubtitleKnowledgeBase — parsed, typed view of the loaded knowledge."""
import logging
from alfred.infrastructure.knowledge.language_registry import LanguageRegistry
from alfred.domain.subtitles.value_objects import (
ScanStrategy,
SubtitleFormat,
SubtitleLanguage,
SubtitleMatchingRules,
SubtitlePattern,
SubtitleType,
TypeDetectionMethod,
)
from .loader import KnowledgeLoader
logger = logging.getLogger(__name__)
class SubtitleKnowledgeBase:
    """
    Typed access to subtitle knowledge (formats, types, languages, patterns).

    Built from KnowledgeLoader — call kb.reload() to pick up newly learned
    entries without restarting.

    Sections or entries whose value is ``None`` (e.g. a key present in the
    source data with nothing under it) are treated like missing ones.
    """

    def __init__(
        self,
        loader: KnowledgeLoader | None = None,
        language_registry: LanguageRegistry | None = None,
    ) -> None:
        """
        Build the knowledge base.

        Args:
            loader: Source of the raw subtitle/pattern/release-group data.
                A default ``KnowledgeLoader()`` is created when omitted.
            language_registry: Canonical language source. A default
                ``LanguageRegistry()`` is created when omitted.
        """
        # Explicit ``is None`` checks: an injected collaborator that happens
        # to be falsy (e.g. an empty registry defining ``__len__``) must not
        # be silently swapped for the default one.
        self._loader = loader if loader is not None else KnowledgeLoader()
        self._language_registry = (
            language_registry if language_registry is not None else LanguageRegistry()
        )
        self._build()

    # --- Construction ---

    def _build(self) -> None:
        """(Re)populate every lookup table from the loader and the registry."""
        data = self._loader.subtitles() or {}
        self._build_formats(data)
        self._build_languages(data)
        self._build_types(data)
        self._build_defaults(data)
        self._build_patterns()

    def _build_formats(self, data: dict) -> None:
        """Project the ``formats`` section into ``SubtitleFormat`` objects."""
        self._formats: dict[str, SubtitleFormat] = {}
        for fid, fdata in (data.get("formats") or {}).items():
            fdata = fdata or {}
            self._formats[fid] = SubtitleFormat(
                id=fid,
                extensions=fdata.get("extensions") or [],
                description=fdata.get("description") or "",
            )

    def _build_languages(self, data: dict) -> None:
        """
        Build the language table and the reverse token → language-code map.

        Languages are sourced primarily from the canonical LanguageRegistry
        (alfred/knowledge/iso_languages.yaml — ISO 639-2/B). Subtitle-specific
        tokens (VOSTFR, VF, VFF…) are merged on top from the
        ``language_tokens`` section of the subtitle data.
        """
        subtitle_extras: dict[str, list[str]] = {
            code: list(tokens or [])
            for code, tokens in (data.get("language_tokens") or {}).items()
        }

        self._languages: dict[str, SubtitleLanguage] = {}
        self._lang_token_map: dict[str, str] = {}

        for language in self._language_registry.all():
            # The code itself always comes first; every other token is
            # lower-cased before the duplicate check so that stored tokens
            # agree with the (lower-cased) keys of the reverse map.
            tokens: list[str] = [language.iso]
            candidates = (
                language.english_name,
                language.native_name,
                *language.aliases,
                *subtitle_extras.get(language.iso, []),
            )
            for candidate in candidates:
                token = candidate.lower()
                if token not in tokens:
                    tokens.append(token)
            self._register_language(language.iso, tokens)

        # Subtitle-specific tokens for languages NOT in the canonical registry
        # are still honored: register them as a minimal SubtitleLanguage.
        for code, extras in subtitle_extras.items():
            if code in self._languages:
                continue
            self._register_language(code, [code] + [e.lower() for e in extras])

    def _register_language(self, code: str, tokens: list[str]) -> None:
        """Store one language and index each of its tokens (lower-cased)."""
        self._languages[code] = SubtitleLanguage(code=code, tokens=tokens)
        for token in tokens:
            # Later registrations win when two languages share a token.
            self._lang_token_map[token.lower()] = code

    def _build_types(self, data: dict) -> None:
        """
        Build the reverse token → ``SubtitleType`` map from ``types``.

        NOTE(review): ``SubtitleType(type_id)`` is not guarded here, so an
        id it rejects propagates to the caller (unlike patterns, which are
        skipped with a warning) — confirm that is intended.
        """
        self._type_token_map: dict[str, SubtitleType] = {}
        for type_id, tdata in (data.get("types") or {}).items():
            stype = SubtitleType(type_id)
            for token in (tdata or {}).get("tokens") or []:
                self._type_token_map[token.lower()] = stype

    def _build_defaults(self, data: dict) -> None:
        """Build the default matching rules from the ``defaults`` section."""
        d = data.get("defaults") or {}
        # Plain ``.get(key, fallback)`` on purpose: an explicitly empty list
        # in the data must stay empty rather than fall back to the default.
        self._default_rules = SubtitleMatchingRules(
            preferred_languages=d.get("languages", ["fre", "eng"]),
            preferred_formats=d.get("formats", ["srt"]),
            allowed_types=d.get("types", ["standard", "forced"]),
            format_priority=d.get("format_priority", ["srt", "ass"]),
            min_confidence=d.get("min_confidence", 0.7),
        )

    def _build_patterns(self) -> None:
        """
        Build ``SubtitlePattern`` objects from the loader's patterns.

        A pattern whose construction raises ``ValueError`` is logged and
        skipped so that one bad entry does not prevent loading the others.
        """
        self._patterns: dict[str, SubtitlePattern] = {}
        for pid, pdata in (self._loader.patterns() or {}).items():
            pdata = pdata or {}
            type_detection = pdata.get("type_detection") or {}
            try:
                self._patterns[pid] = SubtitlePattern(
                    id=pid,
                    description=pdata.get("description") or "",
                    scan_strategy=ScanStrategy(pdata.get("scan_strategy", "adjacent")),
                    root_folder=pdata.get("root_folder"),
                    type_detection=TypeDetectionMethod(
                        type_detection.get("method", "token_in_name")
                    ),
                    version=pdata.get("version", "1.0"),
                )
            except ValueError as e:
                logger.warning(
                    "SubtitleKnowledgeBase: skipping pattern '%s': %s", pid, e
                )

    def reload(self) -> None:
        """Rebuild every table from a freshly created ``KnowledgeLoader``."""
        # NOTE(review): this replaces a loader injected through __init__ with
        # a default-constructed one — confirm that is acceptable for callers
        # that pass a custom loader.
        self._loader = KnowledgeLoader()
        self._build()
        logger.info("SubtitleKnowledgeBase: reloaded")

    # --- Defaults ---

    def default_rules(self) -> SubtitleMatchingRules:
        """Return the default matching rules."""
        return self._default_rules

    # --- Formats ---

    def formats(self) -> dict[str, SubtitleFormat]:
        """Return the known formats keyed by format id (internal dict, not a copy)."""
        return self._formats

    def format_for_extension(self, ext: str) -> SubtitleFormat | None:
        """Return the first format whose ``matches_extension(ext)`` is true, else None."""
        return next(
            (fmt for fmt in self._formats.values() if fmt.matches_extension(ext)),
            None,
        )

    def known_extensions(self) -> set[str]:
        """Return the union of the extensions of every known format."""
        return {ext for fmt in self._formats.values() for ext in fmt.extensions}

    # --- Languages ---

    def languages(self) -> dict[str, SubtitleLanguage]:
        """Return the known languages keyed by code (internal dict, not a copy)."""
        return self._languages

    def language_for_token(self, token: str) -> SubtitleLanguage | None:
        """Return the language registered for ``token`` (case-insensitive), else None."""
        code = self._lang_token_map.get(token.lower())
        return self._languages.get(code) if code else None

    def is_known_lang_token(self, token: str) -> bool:
        """Tell whether ``token`` (case-insensitive) maps to a language."""
        return token.lower() in self._lang_token_map

    # --- Types ---

    def type_for_token(self, token: str) -> SubtitleType | None:
        """Return the subtitle type registered for ``token`` (case-insensitive), else None."""
        return self._type_token_map.get(token.lower())

    def is_known_type_token(self, token: str) -> bool:
        """Tell whether ``token`` (case-insensitive) maps to a subtitle type."""
        return token.lower() in self._type_token_map

    # --- Patterns ---

    def patterns(self) -> dict[str, SubtitlePattern]:
        """Return the loaded patterns keyed by pattern id (internal dict, not a copy)."""
        return self._patterns

    def pattern(self, pattern_id: str) -> SubtitlePattern | None:
        """Return the pattern with the given id, or None when unknown."""
        return self._patterns.get(pattern_id)

    def patterns_for_group(self, group_name: str) -> list[SubtitlePattern]:
        """
        Return the loaded patterns listed under a release group's
        ``known_patterns``, in listed order.

        Unknown groups yield an empty list; pattern ids that were not loaded
        (missing or skipped at build time) are ignored.
        """
        group = self._loader.release_group(group_name)
        if not group:
            return []
        return [
            self._patterns[pid]
            for pid in group.get("known_patterns") or []
            if pid in self._patterns
        ]