"""SubtitleKnowledgeBase — parsed, typed view of the loaded knowledge.""" import logging from ...shared.knowledge.language_registry import LanguageRegistry from ..value_objects import ( ScanStrategy, SubtitleFormat, SubtitleLanguage, SubtitleMatchingRules, SubtitlePattern, SubtitleType, TypeDetectionMethod, ) from .loader import KnowledgeLoader logger = logging.getLogger(__name__) class SubtitleKnowledgeBase: """ Typed access to subtitle knowledge (formats, types, languages, patterns). Built from KnowledgeLoader — call kb.reload() to pick up newly learned entries without restarting. """ def __init__( self, loader: KnowledgeLoader | None = None, language_registry: LanguageRegistry | None = None, ): self._loader = loader or KnowledgeLoader() self._language_registry = language_registry or LanguageRegistry() self._build() def _build(self) -> None: # noqa: PLR0912 — straight-line YAML projection data = self._loader.subtitles() self._formats: dict[str, SubtitleFormat] = {} for fid, fdata in data.get("formats", {}).items(): self._formats[fid] = SubtitleFormat( id=fid, extensions=fdata.get("extensions", []), description=fdata.get("description", ""), ) # Languages are sourced primarily from the canonical LanguageRegistry # (alfred/knowledge/iso_languages.yaml — ISO 639-2/B). Subtitle-specific # tokens (VOSTFR, VF, VFF…) are merged on top from subtitles.yaml's # ``language_tokens`` section. subtitle_extras: dict[str, list[str]] = { code: list(tokens or []) for code, tokens in (data.get("language_tokens", {}) or {}).items() } self._languages: dict[str, SubtitleLanguage] = {} self._lang_token_map: dict[str, str] = {} for language in self._language_registry.all(): tokens: list[str] = [language.iso, language.english_name.lower()] if language.native_name.lower() not in tokens: tokens.append(language.native_name.lower()) for alias in language.aliases: if alias not in tokens: tokens.append(alias) for extra in subtitle_extras.get(language.iso, []): if extra.lower() not in tokens: tokens.append(extra.lower()) self._languages[language.iso] = SubtitleLanguage( code=language.iso, tokens=tokens, ) for token in tokens: self._lang_token_map[token.lower()] = language.iso # Subtitle-specific tokens for languages NOT in the canonical registry # are still honored: register them as a minimal SubtitleLanguage. for code, extras in subtitle_extras.items(): if code in self._languages: continue tokens = [code] + [e.lower() for e in extras] self._languages[code] = SubtitleLanguage(code=code, tokens=tokens) for token in tokens: self._lang_token_map[token.lower()] = code # Build reverse token → type map self._type_token_map: dict[str, SubtitleType] = {} for type_id, tdata in data.get("types", {}).items(): stype = SubtitleType(type_id) for token in tdata.get("tokens", []): self._type_token_map[token.lower()] = stype d = data.get("defaults", {}) self._default_rules = SubtitleMatchingRules( preferred_languages=d.get("languages", ["fre", "eng"]), preferred_formats=d.get("formats", ["srt"]), allowed_types=d.get("types", ["standard", "forced"]), format_priority=d.get("format_priority", ["srt", "ass"]), min_confidence=d.get("min_confidence", 0.7), ) self._patterns: dict[str, SubtitlePattern] = {} for pid, pdata in self._loader.patterns().items(): try: self._patterns[pid] = SubtitlePattern( id=pid, description=pdata.get("description", ""), scan_strategy=ScanStrategy(pdata.get("scan_strategy", "adjacent")), root_folder=pdata.get("root_folder"), type_detection=TypeDetectionMethod( pdata.get("type_detection", {}).get("method", "token_in_name") ), version=pdata.get("version", "1.0"), ) except ValueError as e: logger.warning(f"SubtitleKnowledgeBase: skipping pattern '{pid}': {e}") def reload(self) -> None: self._loader = KnowledgeLoader() self._build() logger.info("SubtitleKnowledgeBase: reloaded") # --- Defaults --- def default_rules(self) -> SubtitleMatchingRules: return self._default_rules # --- Formats --- def formats(self) -> dict[str, SubtitleFormat]: return self._formats def format_for_extension(self, ext: str) -> SubtitleFormat | None: for fmt in self._formats.values(): if fmt.matches_extension(ext): return fmt return None def known_extensions(self) -> set[str]: exts = set() for fmt in self._formats.values(): exts.update(fmt.extensions) return exts # --- Languages --- def languages(self) -> dict[str, SubtitleLanguage]: return self._languages def language_for_token(self, token: str) -> SubtitleLanguage | None: code = self._lang_token_map.get(token.lower()) return self._languages.get(code) if code else None def is_known_lang_token(self, token: str) -> bool: return token.lower() in self._lang_token_map # --- Types --- def type_for_token(self, token: str) -> SubtitleType | None: return self._type_token_map.get(token.lower()) def is_known_type_token(self, token: str) -> bool: return token.lower() in self._type_token_map # --- Patterns --- def patterns(self) -> dict[str, SubtitlePattern]: return self._patterns def pattern(self, pattern_id: str) -> SubtitlePattern | None: return self._patterns.get(pattern_id) def patterns_for_group(self, group_name: str) -> list[SubtitlePattern]: group = self._loader.release_group(group_name) if not group: return [] return [ self._patterns[pid] for pid in group.get("known_patterns", []) if pid in self._patterns ]