Merge branch 'feat/parser-phase-d'
This commit is contained in:
@@ -15,6 +15,37 @@ callers).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Fixed
|
||||
|
||||
- **Multi-episode chain (e.g. `S14E09E10E11`) now collapses to a full
|
||||
range.** The parser previously captured `episode=9, episode_end=10`
|
||||
and dropped E11+. It now returns `episode=first, episode_end=last`,
|
||||
with intermediate values implied. Fixture
|
||||
`shitty/archer_multi_episode/` updated from anti-regression-of-bug
|
||||
to anti-regression-of-fix.
|
||||
- **Apostrophes in titles no longer push the release through the AI
|
||||
fallback.** `Honey.Don't.2025.2160p.WEBRip.DSNP.DV.HDR.x265-Amen`
|
||||
previously parsed with `parse_path="ai"` and everything UNKNOWN
|
||||
because `'` is in the forbidden-chars list. Apostrophes are now
|
||||
pre-stripped before the well-formed check, so the parse completes
|
||||
normally (`title=Honey.Dont, year=2025, quality=2160p, ...`); only
|
||||
the title text loses its apostrophe. `parse_path` becomes
|
||||
`sanitized` to surface the cleanup. Side win: PoP fixture
|
||||
`the_prodigy_full_chaos/` also moves from total failure to a
|
||||
partially-correct parse (year, source, codec extracted).
|
||||
- **Season-range markers (`Sxx-yy`) are now recognized as
|
||||
`tv_complete`.** `Der.Tatortreiniger.S01-06.GERMAN...` previously
|
||||
parsed as `media_type=movie` with `S01-06` glued onto the title.
|
||||
The parser now recognizes the range, sets `season=first`,
|
||||
`media_type=tv_complete`, and removes the marker from the title.
|
||||
`is_season_pack` flips to `true`.
|
||||
- **Pure-punctuation TITLE tokens are dropped at assembly.** Releases
|
||||
with surrounding ` - ` separators (`Vinyl - 1x01 - FHD`) previously
|
||||
produced `title="Vinyl.-"`. Such tokens (a stray dash, a wide pipe
|
||||
`|` (U+FF5C), …) carry no title content and are now filtered out. Side
|
||||
effect: PoP fixture `khruangbin_yt_wide_pipe/` also benefits — the
|
||||
YouTube wide-pipe no longer leaks into the title.
|
||||
|
||||
### Added
|
||||
|
||||
- **`LanguageRepository` port** in `alfred.domain.shared.ports`. Structural
|
||||
|
||||
@@ -91,14 +91,17 @@ def tokenize(name: str, kb: ReleaseKnowledge) -> tuple[list[Token], str | None]:
|
||||
|
||||
|
||||
def _parse_season_episode(text: str) -> tuple[int, int | None, int | None] | None:
|
||||
"""Parse a single token as ``SxxExx`` / ``SxxExxExx`` / ``Sxx`` / ``NxNN``.
|
||||
"""Parse a single token as ``SxxExx`` / ``SxxExxExx`` / ``Sxx`` /
|
||||
``Sxx-yy`` (season range) / ``NxNN``.
|
||||
|
||||
Returns ``(season, episode, episode_end)`` or ``None`` if the token
|
||||
is not a season/episode marker.
|
||||
is not a season/episode marker. For ``Sxx-yy``, returns the first
|
||||
season with no episode info — the caller is expected to detect the
|
||||
range form and promote ``media_type`` to ``tv_complete`` separately.
|
||||
"""
|
||||
upper = text.upper()
|
||||
|
||||
# SxxExx form
|
||||
# SxxExx form (and Sxx, Sxx-yy)
|
||||
if len(upper) >= 3 and upper[0] == "S" and upper[1:3].isdigit():
|
||||
season = int(upper[1:3])
|
||||
rest = upper[3:]
|
||||
@@ -106,6 +109,15 @@ def _parse_season_episode(text: str) -> tuple[int, int | None, int | None] | Non
|
||||
if not rest:
|
||||
return season, None, None
|
||||
|
||||
# Sxx-yy season-range form: capture the first season, treat as a
|
||||
# complete-series marker (no episode info).
|
||||
if (
|
||||
len(rest) == 3
|
||||
and rest[0] == "-"
|
||||
and rest[1:3].isdigit()
|
||||
):
|
||||
return season, None, None
|
||||
|
||||
episodes: list[int] = []
|
||||
while rest.startswith("E") and len(rest) >= 3 and rest[1:3].isdigit():
|
||||
episodes.append(int(rest[1:3]))
|
||||
@@ -113,7 +125,9 @@ def _parse_season_episode(text: str) -> tuple[int, int | None, int | None] | Non
|
||||
|
||||
if not episodes:
|
||||
return None
|
||||
return season, episodes[0], episodes[1] if len(episodes) >= 2 else None
|
||||
# For chained multi-episode markers (E09E10E11), the range is the
|
||||
# first → last episode. Intermediate values are implied.
|
||||
return season, episodes[0], episodes[-1] if len(episodes) >= 2 else None
|
||||
|
||||
# NxNN form
|
||||
if "X" in upper:
|
||||
@@ -616,7 +630,14 @@ def assemble(
|
||||
layer in additional fields (``parse_path``, ``raw``, …) before
|
||||
instantiation.
|
||||
"""
|
||||
title_parts = [t.text for t in annotated if t.role is TokenRole.TITLE]
|
||||
# Pure-punctuation tokens (e.g. a stray "-" left by ` - ` separators in
|
||||
# human-friendly release names) carry no title content and would leak
|
||||
# into the joined title as ``"Show.-.Episode"``. Drop them here.
|
||||
title_parts = [
|
||||
t.text
|
||||
for t in annotated
|
||||
if t.role is TokenRole.TITLE and any(c.isalnum() for c in t.text)
|
||||
]
|
||||
title = ".".join(title_parts) if title_parts else (
|
||||
annotated[0].text if annotated else raw_name
|
||||
)
|
||||
@@ -636,6 +657,7 @@ def assemble(
|
||||
edition: str | None = None
|
||||
distributor: str | None = None
|
||||
languages: list[str] = []
|
||||
is_season_range = False
|
||||
|
||||
for tok in annotated:
|
||||
# Skip non-primary members of a multi-token sequence.
|
||||
@@ -649,6 +671,16 @@ def assemble(
|
||||
parsed = _parse_season_episode(tok.text)
|
||||
if parsed is not None:
|
||||
season, episode, episode_end = parsed
|
||||
# Detect Sxx-yy range form to flag it as a multi-season pack.
|
||||
upper = tok.text.upper()
|
||||
if (
|
||||
len(upper) == 6
|
||||
and upper[0] == "S"
|
||||
and upper[1:3].isdigit()
|
||||
and upper[3] == "-"
|
||||
and upper[4:6].isdigit()
|
||||
):
|
||||
is_season_range = True
|
||||
elif role is TokenRole.RESOLUTION:
|
||||
quality = tok.text
|
||||
elif role is TokenRole.SOURCE:
|
||||
@@ -696,6 +728,8 @@ def assemble(
|
||||
media_type = "documentary"
|
||||
elif upper_tokens & concert_tokens:
|
||||
media_type = "concert"
|
||||
elif is_season_range:
|
||||
media_type = "tv_complete"
|
||||
elif (
|
||||
edition in {"COMPLETE", "INTEGRALE", "COLLECTION"}
|
||||
or upper_tokens & integrale_tokens
|
||||
|
||||
@@ -46,7 +46,16 @@ def parse_release(
|
||||
"""
|
||||
parse_path = ParsePath.DIRECT.value
|
||||
|
||||
clean, site_tag = _v2.strip_site_tag(name)
|
||||
# Apostrophes inside titles ("Don't", "L'avare") are common and should
|
||||
# not push the release through the AI fallback. Strip them up front so
|
||||
# both strip_site_tag and tokenize see "Dont" / "Lavare", which is good
|
||||
# enough for token-level matching. The raw name is preserved on the VO.
|
||||
working_name = name
|
||||
if "'" in working_name:
|
||||
working_name = working_name.replace("'", "")
|
||||
parse_path = ParsePath.SANITIZED.value
|
||||
|
||||
clean, site_tag = _v2.strip_site_tag(working_name)
|
||||
if site_tag is not None:
|
||||
parse_path = ParsePath.SANITIZED.value
|
||||
|
||||
@@ -77,7 +86,7 @@ def parse_release(
|
||||
)
|
||||
return parsed, report
|
||||
|
||||
tokens, v2_tag = _v2.tokenize(name, kb)
|
||||
tokens, v2_tag = _v2.tokenize(working_name, kb)
|
||||
annotated = _v2.annotate(tokens, kb)
|
||||
fields = _v2.assemble(annotated, v2_tag, name, kb)
|
||||
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
release_name: "Khruangbin | Austin City Limits Music Festival 2024 | Full Set [V_-7WWPPeBs].webm"
|
||||
|
||||
# yt-dlp slug: UTF-8 wide pipe '|' (U+FF5C, not the ASCII '|'), trailing
|
||||
# YouTube video ID in brackets, .webm extension. Parser extracts the year
|
||||
# (2024) correctly but mistakes the YouTube ID '7WWPPeBs' for a release
|
||||
# group, and the wide pipe survives the tokenizer (not a separator).
|
||||
# YouTube video ID in brackets, .webm extension. The wide pipe survives
|
||||
# the tokenizer (not a separator) but is now dropped at title assembly
|
||||
# (pure-punctuation TITLE tokens carry no content). Year (2024) parses
|
||||
# correctly; the YouTube ID '7WWPPeBs' is still mistaken for a release
|
||||
# group (separate gap, see PoP backlog).
|
||||
# This is a concert recording — closer to "live music" than "movie", but
|
||||
# media_type=movie is the current degenerate best guess.
|
||||
parsed:
|
||||
title: "Khruangbin.|.Austin.City.Limits.Music.Festival"
|
||||
title: "Khruangbin.Austin.City.Limits.Music.Festival"
|
||||
year: 2024
|
||||
season: null
|
||||
episode: null
|
||||
|
||||
+16
-18
@@ -1,28 +1,26 @@
|
||||
release_name: "The Prodigy World's on Fire 2011 Blu-ray Remux 1080i AVC DTS-HD MA 5.1 - KRaLiMaRKo.mkv"
|
||||
|
||||
# Apocalypse case combining every horror:
|
||||
# - Unescaped apostrophe ("World's") → forces parse_path="ai" fallback
|
||||
# - Spaces AND dashes used as separators inconsistently
|
||||
# - "Blu-ray" with a dash (vs. canonical BluRay)
|
||||
# - "1080i" interlaced flag (not 1080p)
|
||||
# - "DTS-HD MA 5.1" multi-word audio codec
|
||||
# - " - GROUP.mkv" trailing format (space-dash-space before group)
|
||||
# Apocalypse case combining every horror — partially tamed by the
|
||||
# apostrophe fix. Remaining gaps (still PoP-worthy):
|
||||
# - "1080i" interlaced flag (not in quality KB)
|
||||
# - "Blu-ray" with a dash (vs. canonical BluRay) — recognized as source
|
||||
# but with the dash form
|
||||
# - "DTS-HD MA 5.1" multi-word audio codec — the trailing "HD" leaks
|
||||
# into the group
|
||||
# - Trailing .mkv extension survives in title
|
||||
# Result: total degeneration — UNKNOWN across the board, title=raw input.
|
||||
# Once the apostrophe + multi-word-audio + 1080i are handled this fixture
|
||||
# should be revisited. For now: anti-regression of the failure shape.
|
||||
# - " - GROUP" trailing format (space-dash-space before group)
|
||||
parsed:
|
||||
title: "The Prodigy World's on Fire 2011 Blu-ray Remux 1080i AVC DTS-HD MA 5.1 - KRaLiMaRKo.mkv"
|
||||
year: null
|
||||
title: "The.Prodigy.Worlds.on.Fire"
|
||||
year: 2011
|
||||
season: null
|
||||
episode: null
|
||||
quality: null
|
||||
source: null
|
||||
codec: null
|
||||
group: "UNKNOWN"
|
||||
tech_string: ""
|
||||
media_type: "unknown"
|
||||
parse_path: "ai"
|
||||
source: "Blu-ray"
|
||||
codec: "AVC"
|
||||
group: "HD"
|
||||
tech_string: "Blu-ray.AVC"
|
||||
media_type: "movie"
|
||||
parse_path: "sanitized"
|
||||
is_season_pack: false
|
||||
|
||||
tree:
|
||||
|
||||
@@ -1,14 +1,13 @@
|
||||
release_name: "Archer.S14E09E10E11.1080p.WEB.h264-ETHEL"
|
||||
|
||||
# Tech debt: triple-episode chain (E09E10E11) — current parser captures
|
||||
# episode=9 and episode_end=10, but E11 is lost. Anti-regression: lock in
|
||||
# the partial behavior so any future improvement is intentional.
|
||||
# Triple-episode chain (E09E10E11) — the parser collapses the chain to a
|
||||
# range (episode=first, episode_end=last). Intermediate values are implied.
|
||||
parsed:
|
||||
title: "Archer"
|
||||
year: null
|
||||
season: 14
|
||||
episode: 9
|
||||
episode_end: 10
|
||||
episode_end: 11
|
||||
quality: "1080p"
|
||||
source: "WEB"
|
||||
codec: "h264"
|
||||
|
||||
+14
-13
@@ -1,21 +1,22 @@
|
||||
release_name: "Honey.Don't.2025.2160p.WEBRip.DSNP.DV.HDR.x265.EAC3.5.1-Amen"
|
||||
|
||||
# Tech debt: the unescaped apostrophe in "Don't" pushes the whole release
|
||||
# through the AI fallback path (parse_path="ai") and the parse degenerates to
|
||||
# UNKNOWN across the board. Anti-regression here — once the tokenizer learns
|
||||
# to handle apostrophes, this fixture should be revisited.
|
||||
# Apostrophes inside titles ("Don't", "L'avare") used to push the release
|
||||
# through the AI fallback (parse_path="ai", everything UNKNOWN). They are
|
||||
# now pre-stripped before the well-formed check and tokenization, so the parse
|
||||
# completes normally — only the title text loses its apostrophe
|
||||
# ("Honey.Dont").
|
||||
parsed:
|
||||
title: "Honey.Don't.2025.2160p.WEBRip.DSNP.DV.HDR.x265.EAC3.5.1-Amen"
|
||||
year: null
|
||||
title: "Honey.Dont"
|
||||
year: 2025
|
||||
season: null
|
||||
episode: null
|
||||
quality: null
|
||||
source: null
|
||||
codec: null
|
||||
group: "UNKNOWN"
|
||||
tech_string: ""
|
||||
media_type: "unknown"
|
||||
parse_path: "ai"
|
||||
quality: "2160p"
|
||||
source: "WEBRip"
|
||||
codec: "x265"
|
||||
group: "Amen"
|
||||
tech_string: "2160p.WEBRip.x265"
|
||||
media_type: "movie"
|
||||
parse_path: "sanitized"
|
||||
is_season_pack: false
|
||||
|
||||
tree:
|
||||
|
||||
+7
-7
@@ -1,22 +1,22 @@
|
||||
release_name: "Der.Tatortreiniger.S01-06.GERMAN.1080p.WEB.x264-WAYNE"
|
||||
|
||||
# Tech debt: range syntax 'S01-06' is not recognized as TV — falls through
|
||||
# to media_type=movie with the range glued onto the title. Captured here so a
|
||||
# future range-aware parser change is intentional.
|
||||
# Range syntax 'S01-06' is now recognized as a season-range marker:
|
||||
# season=1 (first of the range), media_type=tv_complete, and the token
|
||||
# no longer leaks into the title.
|
||||
parsed:
|
||||
title: "Der.Tatortreiniger.S01-06"
|
||||
title: "Der.Tatortreiniger"
|
||||
year: null
|
||||
season: null
|
||||
season: 1
|
||||
episode: null
|
||||
quality: "1080p"
|
||||
source: "WEB"
|
||||
codec: "x264"
|
||||
group: "WAYNE"
|
||||
tech_string: "1080p.WEB.x264"
|
||||
media_type: "movie"
|
||||
media_type: "tv_complete"
|
||||
languages: ["GERMAN"]
|
||||
parse_path: "direct"
|
||||
is_season_pack: false
|
||||
is_season_pack: true
|
||||
|
||||
tree:
|
||||
- "Der.Tatortreiniger.S01-06.GERMAN.1080p.WEB.x264-WAYNE/"
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
release_name: "Vinyl - 1x01 - FHD"
|
||||
|
||||
# Tech debt: surrounding ' - ' separators leave a stray '-' token attached
|
||||
# to the title ("Vinyl.-"). NxNN form correctly identifies S01E01; everything
|
||||
# tech-side empty (no quality token in KB — "FHD" not yet known). Anti-regression: lock in
|
||||
# the current degenerate title so a future fix is intentional.
|
||||
# Surrounding ' - ' separators in human-friendly release names left stray
|
||||
# '-' tokens attached to the title. They are now dropped at assembly time
|
||||
# (pure-punctuation TITLE tokens carry no content). NxNN form correctly
|
||||
# identifies S01E01; tech-side stays empty (no quality token in KB — "FHD"
|
||||
# not yet known).
|
||||
parsed:
|
||||
title: "Vinyl.-"
|
||||
title: "Vinyl"
|
||||
year: null
|
||||
season: 1
|
||||
episode: 1
|
||||
|
||||
Reference in New Issue
Block a user