88f156b7a4
The old name conflated 'might become a placed subtitle' with 'what a scan pass produced'. The class is the output of a scan/identify pass — language/format may still be None while classification is in progress, confidence reflects classifier certainty, raw_tokens holds filename fragments under analysis. SubtitleScanResult says that directly. Pure rename + refreshed docstring; no behavior change. Touches the domain entity, the matcher/identifier/utils services, the manage_subtitles use case, the placer, the metadata store, the shared-media cross-ref comment, and 7 test modules.
121 lines
4.0 KiB
Python
121 lines
4.0 KiB
Python
"""SubtitleMatcher — filters tracks against resolved rules."""
|
|
|
|
import logging
|
|
|
|
from ..entities import SubtitleScanResult
|
|
from ..value_objects import SubtitleMatchingRules
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class SubtitleMatcher:
    """
    Filters a list of SubtitleScanResult against effective SubtitleMatchingRules.

    Returns matched tracks (pass all filters, confidence >= min_confidence)
    and unresolved tracks (need user clarification). Embedded tracks are
    skipped entirely: they end up in neither list.

    Conflict resolution: when two tracks share the same language + type,
    format_priority decides which one to keep.
    """

    def match(
        self,
        tracks: list[SubtitleScanResult],
        rules: SubtitleMatchingRules,
    ) -> tuple[list[SubtitleScanResult], list[SubtitleScanResult]]:
        """
        Split tracks into those accepted by the rules and those needing review.

        Args:
            tracks: Scan results to evaluate, in caller order.
            rules: Effective matching rules (min_confidence, filters,
                format_priority).

        Returns:
            (matched, unresolved). ``unresolved`` holds tracks with no language
            or with confidence below ``rules.min_confidence``. ``matched`` holds
            the remaining tracks that pass every filter, reduced to one track
            per (language, type) pair.
        """
        matched: list[SubtitleScanResult] = []
        unresolved: list[SubtitleScanResult] = []

        for track in tracks:
            if track.is_embedded:
                continue

            # Unidentified or low-confidence tracks are set aside rather than
            # filtered, so they are never silently dropped by the rules below.
            if track.language is None or track.confidence < rules.min_confidence:
                unresolved.append(track)
                continue

            if not self._passes_filters(track, rules):
                logger.debug("SubtitleMatcher: filtered out %s", track)
                continue

            matched.append(track)

        matched = self._resolve_conflicts(matched, rules)
        logger.info(
            "SubtitleMatcher: %d matched, %d unresolved",
            len(matched),
            len(unresolved),
        )
        return matched, unresolved

    def _passes_filters(
        self, track: SubtitleScanResult, rules: SubtitleMatchingRules
    ) -> bool:
        """
        Return True if the track satisfies every filter that the rules enable.

        A filter whose rule collection is empty is treated as disabled.
        """
        # Language filter
        if rules.preferred_languages:
            if not track.language:
                return False
            if track.language.code not in rules.preferred_languages:
                return False

        # Format filter (only for external files)
        if rules.preferred_formats and not track.is_embedded:
            if not track.format:
                return False
            if track.format.id not in rules.preferred_formats:
                return False

        # Type filter
        if rules.allowed_types:
            if track.subtitle_type.value not in rules.allowed_types:
                return False

        return True

    def _resolve_conflicts(
        self,
        tracks: list[SubtitleScanResult],
        rules: SubtitleMatchingRules,
    ) -> list[SubtitleScanResult]:
        """
        When multiple tracks have same language + type, keep only the best one
        according to format_priority. If no format_priority applies, keep the first.

        The result is ordered by the first appearance of each (language, type)
        pair in ``tracks``; a later, preferred track takes over that position.
        """
        seen: dict[tuple, SubtitleScanResult] = {}

        for track in tracks:
            lang = track.language.code if track.language else None
            key = (lang, track.subtitle_type.value)

            existing = seen.get(key)
            if existing is None:
                seen[key] = track
                continue

            if self._prefer(track, existing, rules.format_priority):
                # Tracks without a format are labelled 'embedded' in the log.
                logger.debug(
                    "SubtitleMatcher: conflict %s — preferring %s over %s",
                    key,
                    track.format.id if track.format else "embedded",
                    existing.format.id if existing.format else "embedded",
                )
                seen[key] = track

        return list(seen.values())

    def _prefer(
        self,
        candidate: SubtitleScanResult,
        existing: SubtitleScanResult,
        format_priority: list[str],
    ) -> bool:
        """
        Return True if candidate is preferable to existing.

        A lower position in ``format_priority`` wins. Ties, and an empty
        priority list, keep the existing track.
        """
        if not format_priority:
            return False
        candidate_rank = self._format_rank(candidate, format_priority)
        existing_rank = self._format_rank(existing, format_priority)
        return candidate_rank < existing_rank

    @staticmethod
    def _format_rank(track: SubtitleScanResult, format_priority: list[str]) -> int:
        """
        Return the track's position in format_priority (lower is better).

        Formats not listed, including a missing format, rank just past the end
        of the list, so a listed format always wins however long the list is.
        """
        fmt = track.format.id if track.format else ""
        try:
            return format_priority.index(fmt)
        except ValueError:
            return len(format_priority)
|