# alfred/alfred/domain/subtitles/services/pattern_detector.py

"""PatternDetector — discovers the subtitle structure of a release folder."""

import logging
from pathlib import Path

from alfred.infrastructure.knowledge.subtitles.base import SubtitleKnowledgeBase

from ...shared.ports import FilesystemScanner, MediaProber
from ..value_objects import ScanStrategy, SubtitlePattern

logger = logging.getLogger(__name__)


class PatternDetector:
    """
    Inspects a release folder and returns the best matching known pattern,
    plus a confidence score and a description of what was found.

    Used for "pattern discovery" — when we don't yet know which pattern
    a release follows. The result is proposed to the user for confirmation.
    """

    # Name of the dedicated subtitle directory looked for next to the sample
    # video and at the release root.
    SUBS_DIRNAME = "Subs"

    # A pattern whose best score is below this value is not reported as
    # detected (the score itself is still returned as the confidence).
    MIN_CONFIDENCE = 0.4

    def __init__(
        self,
        kb: SubtitleKnowledgeBase,
        prober: MediaProber,
        scanner: FilesystemScanner,
    ):
        """
        Store the collaborators used during detection.

        kb      -- supplies known subtitle extensions, language tokens and
                   the known patterns to score.
        prober  -- lists the subtitle streams embedded in a video file.
        scanner -- lists the entries of a directory.
        """
        self.kb = kb
        self.prober = prober
        self.scanner = scanner

    def detect(self, release_root: Path, sample_video: Path) -> dict:
        """
        Analyse the release folder and return:
        {
            "detected": SubtitlePattern | None,   # None when no pattern reaches MIN_CONFIDENCE
            "confidence": float,                  # best score, even when nothing is detected
            "description": str,                   # human-readable description of what was found
            "candidate_pattern_ids": list[str],   # [detected.id], or [] when nothing is detected
            "raw_findings": dict,                 # the structural facts gathered by _inspect()
        }
        """
        findings = self._inspect(release_root, sample_video)
        best, confidence = self._match_pattern(findings)

        return {
            "detected": best,
            "confidence": confidence,
            "description": self._describe(findings),
            "candidate_pattern_ids": [best.id] if best else [],
            "raw_findings": findings,
        }

    def _has_embedded_subtitles(self, video_path: Path) -> bool:
        """Return True when the prober reports at least one subtitle stream."""
        return len(self.prober.list_subtitle_streams(video_path)) > 0

    def _has_lang_token(self, stem: str) -> bool:
        """Return True when any "."/"_"-separated token of *stem* is a known language token."""
        return any(
            self.kb.is_known_lang_token(token.lower())
            for token in stem.replace("_", ".").split(".")
        )

    def _inspect(self, release_root: Path, sample_video: Path) -> dict:
        """
        Gather structural facts about the release.

        Looks for a non-empty Subs/ directory (first next to the sample
        video, then at the release root) and classifies it as "flat" or
        "episode_subfolder". Only when no such directory is found are
        subtitle files sitting next to the video looked for.
        """
        known_exts = self.kb.known_extensions()
        findings: dict = {
            "has_subs_folder": False,
            "subs_strategy": None,  # "flat" | "episode_subfolder"
            "subs_root": None,
            "adjacent_subs": False,
            "has_embedded": self._has_embedded_subtitles(sample_video),
            "files_per_episode": 0,
            "has_lang_tokens": False,
            "has_numeric_prefix": False,
        }

        def is_subtitle(entry) -> bool:
            return entry.is_file and entry.suffix.lower() in known_exts

        # Check for Subs/ folder — adjacent or at release root. When the video
        # sits directly in the release root both candidates are the same
        # path, so it is only scanned once.
        candidates = [sample_video.parent / self.SUBS_DIRNAME]
        root_candidate = release_root / self.SUBS_DIRNAME
        if root_candidate not in candidates:
            candidates.append(root_candidate)

        for subs_candidate in candidates:
            children = self.scanner.scan_dir(subs_candidate)
            if not children:
                continue

            findings["has_subs_folder"] = True
            findings["subs_root"] = str(subs_candidate)

            sub_files = [c for c in children if is_subtitle(c)]
            sub_dirs = [c for c in children if c.is_dir]

            if sub_dirs and not sub_files:
                # Only sub-directories: one folder per episode. The first
                # sub-directory is used as the sample for counts and naming.
                findings["subs_strategy"] = "episode_subfolder"
                sample_files = [
                    f
                    for f in self.scanner.scan_dir(sub_dirs[0].path)
                    if is_subtitle(f)
                ]
                # The numeric prefix is only looked for in this layout.
                findings["has_numeric_prefix"] = any(
                    f.stem.split("_")[0].isdigit() for f in sample_files
                )
            else:
                # Anything else (subtitle files directly in Subs/, or no
                # sub-directories at all) is treated as a flat layout.
                findings["subs_strategy"] = "flat"
                sample_files = sub_files

            findings["files_per_episode"] = len(sample_files)
            findings["has_lang_tokens"] = any(
                self._has_lang_token(f.stem) for f in sample_files
            )
            # The first non-empty candidate wins.
            break

        # Check adjacent subs (next to the video)
        if not findings["has_subs_folder"]:
            adjacent = [
                e
                for e in self.scanner.scan_dir(sample_video.parent)
                if is_subtitle(e)
            ]
            if adjacent:
                findings["adjacent_subs"] = True
                findings["files_per_episode"] = len(adjacent)

        return findings

    def _match_pattern(self, findings: dict) -> tuple[SubtitlePattern | None, float]:
        """
        Score all known patterns against the findings.

        Returns (pattern, score) for the highest-scoring pattern; on a tie
        the pattern that comes first in kb.patterens() iteration order wins.
        Returns (None, score) when the best score is below MIN_CONFIDENCE and
        (None, 0.0) when the knowledge base has no patterns.
        """
        scored = [
            (self._score(pattern, findings), pattern)
            for pattern in self.kb.patterns().values()
        ]
        if not scored:
            return None, 0.0

        # max() returns the first maximal item, matching the ordering of a
        # stable descending sort without sorting the whole list.
        best_score, best_pattern = max(scored, key=lambda pair: pair[0])

        if best_score < self.MIN_CONFIDENCE:
            return None, best_score

        return best_pattern, best_score

    def _score(self, pattern: SubtitlePattern, findings: dict) -> float:
        """
        Return a 0.0–1.0 match score for this pattern against the findings.

        The score is points earned divided by points available for the
        pattern's scan strategy; an unrecognised strategy scores 0.0.
        """
        score = 0.0
        total = 0.0

        strategy = pattern.scan_strategy

        if strategy == ScanStrategy.EMBEDDED:
            total += 1
            if findings.get("has_embedded"):
                score += 1.0
            # Bonus when there are no external subtitles at all. The
            # denominator only grows when the bonus is awarded.
            # NOTE(review): so embedded + external subs still scores 1.0 —
            # confirm this is intended.
            if not findings.get("has_subs_folder") and not findings.get(
                "adjacent_subs"
            ):
                score += 0.5
                total += 0.5

        elif strategy == ScanStrategy.EPISODE_SUBFOLDER:
            total += 3
            if findings.get("has_subs_folder"):
                score += 1.0
            if findings.get("subs_strategy") == "episode_subfolder":
                score += 2.0

        elif strategy == ScanStrategy.FLAT:
            total += 2
            if findings.get("has_subs_folder"):
                score += 1.0
            if findings.get("subs_strategy") == "flat":
                score += 1.0

        elif strategy == ScanStrategy.ADJACENT:
            total += 2
            if findings.get("adjacent_subs"):
                score += 1.0
            # NOTE(review): a release with no subtitles at all still earns
            # this point and scores 0.5, which is above MIN_CONFIDENCE —
            # confirm this is intended.
            if not findings.get("has_subs_folder"):
                score += 1.0

        return score / total if total > 0 else 0.0

    def _describe(self, findings: dict) -> str:
        """Build a one-line, human-readable summary of the findings, joined with " — "."""
        parts = []
        if findings.get("has_subs_folder"):
            strategy = findings.get("subs_strategy", "?")
            n = findings.get("files_per_episode", 0)
            parts.append(f"Subs/ folder found ({strategy}), ~{n} file(s) per episode")
            if findings.get("has_numeric_prefix"):
                parts.append("files have numeric prefix (e.g. 2_English.srt)")
            if findings.get("has_lang_tokens"):
                parts.append("language tokens found in filenames")
        elif findings.get("adjacent_subs"):
            parts.append("subtitle files adjacent to video")
        else:
            parts.append("no external subtitle files found")

        if findings.get("has_embedded"):
            parts.append("embedded tracks detected")

        # Exactly one of the three branches above always appends, so parts
        # is never empty here.
        return " — ".join(parts)