diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d70d2d..eb5ae81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,16 @@ callers). with intermediate values implied. Fixture `shitty/archer_multi_episode/` updated from anti-regression-of-bug to anti-regression-of-fix. +- **Apostrophes in titles no longer push the release through the AI + fallback.** `Honey.Don't.2025.2160p.WEBRip.DSNP.DV.HDR.x265.EAC3.5.1-Amen` + previously parsed with `parse_path="ai"` and everything UNKNOWN + because `'` is in the forbidden-chars list. Apostrophes are now + pre-stripped before the well-formed check, so the parse completes + normally (`title=Honey.Dont, year=2025, quality=2160p, ...`); only + the title text loses its apostrophe. `parse_path` becomes + `sanitized` to surface the cleanup. Side win: the path-of-pain (PoP) fixture + `the_prodigy_full_chaos/` also moves from total failure to a + partially correct parse (year, source, codec extracted). - **Season-range markers (`Sxx-yy`) are now recognized as `tv_complete`.** `Der.Tatortreiniger.S01-06.GERMAN...` previously parsed as `media_type=movie` with `S01-06` glued onto the title. diff --git a/alfred/domain/release/services.py b/alfred/domain/release/services.py index d8ba8e3..a13b989 100644 --- a/alfred/domain/release/services.py +++ b/alfred/domain/release/services.py @@ -46,7 +46,16 @@ def parse_release( """ parse_path = ParsePath.DIRECT.value - clean, site_tag = _v2.strip_site_tag(name) + # Apostrophes inside titles ("Don't", "L'avare") are common and should + # not push the release through the AI fallback. Strip them up front so + # both strip_site_tag and tokenize see "Dont" / "Lavare", which is good + # enough for token-level matching. The raw name is preserved on the VO. 
+ working_name = name + if "'" in working_name: + working_name = working_name.replace("'", "") + parse_path = ParsePath.SANITIZED.value + + clean, site_tag = _v2.strip_site_tag(working_name) if site_tag is not None: parse_path = ParsePath.SANITIZED.value @@ -77,7 +86,7 @@ def parse_release( ) return parsed, report - tokens, v2_tag = _v2.tokenize(name, kb) + tokens, v2_tag = _v2.tokenize(working_name, kb) annotated = _v2.annotate(tokens, kb) fields = _v2.assemble(annotated, v2_tag, name, kb) diff --git a/tests/fixtures/releases/path_of_pain/the_prodigy_full_chaos/expected.yaml b/tests/fixtures/releases/path_of_pain/the_prodigy_full_chaos/expected.yaml index 091e757..c943d90 100644 --- a/tests/fixtures/releases/path_of_pain/the_prodigy_full_chaos/expected.yaml +++ b/tests/fixtures/releases/path_of_pain/the_prodigy_full_chaos/expected.yaml @@ -1,28 +1,26 @@ release_name: "The Prodigy World's on Fire 2011 Blu-ray Remux 1080i AVC DTS-HD MA 5.1 - KRaLiMaRKo.mkv" -# Apocalypse case combining every horror: -# - Unescaped apostrophe ("World's") → forces parse_path="ai" fallback -# - Spaces AND dashes used as separators inconsistently -# - "Blu-ray" with a dash (vs. canonical BluRay) -# - "1080i" interlaced flag (not 1080p) -# - "DTS-HD MA 5.1" multi-word audio codec -# - " - GROUP.mkv" trailing format (space-dash-space before group) +# Apocalypse case combining every horror — partially tamed by the +# apostrophe fix. Remaining gaps (still PoP-worthy): +# - "1080i" interlaced flag (not in quality KB) +# - "Blu-ray" with a dash (vs. canonical BluRay) — recognized as source +# but with the dash form +# - "DTS-HD MA 5.1" multi-word audio codec — the trailing "HD" leaks +# into the group # - Trailing .mkv extension survives in title -# Result: total degeneration — UNKNOWN across the board, title=raw input. -# Once the apostrophe + multi-word-audio + 1080i are handled this fixture -# should be revisited. For now: anti-regression of the failure shape. 
+# - " - GROUP" trailing format (space-dash-space before group) parsed: - title: "The Prodigy World's on Fire 2011 Blu-ray Remux 1080i AVC DTS-HD MA 5.1 - KRaLiMaRKo.mkv" - year: null + title: "The.Prodigy.Worlds.on.Fire" + year: 2011 season: null episode: null quality: null - source: null - codec: null - group: "UNKNOWN" - tech_string: "" - media_type: "unknown" - parse_path: "ai" + source: "Blu-ray" + codec: "AVC" + group: "HD" + tech_string: "Blu-ray.AVC" + media_type: "movie" + parse_path: "sanitized" is_season_pack: false tree: diff --git a/tests/fixtures/releases/shitty/honey_uhd_hdr/expected.yaml b/tests/fixtures/releases/shitty/honey_uhd_hdr/expected.yaml index ff6f975..1b73c7c 100644 --- a/tests/fixtures/releases/shitty/honey_uhd_hdr/expected.yaml +++ b/tests/fixtures/releases/shitty/honey_uhd_hdr/expected.yaml @@ -1,21 +1,22 @@ release_name: "Honey.Don't.2025.2160p.WEBRip.DSNP.DV.HDR.x265.EAC3.5.1-Amen" -# Tech debt: the unescaped apostrophe in "Don't" pushes the whole release -# through the AI fallback path (parse_path="ai") and the parse degenerates to -# UNKNOWN across the board. Anti-regression here — once the tokenizer learns -# to handle apostrophes, this fixture should be revisited. +# Apostrophes inside titles ("Don't", "L'avare") used to push the release +# through the AI fallback (parse_path="ai", everything UNKNOWN). They are +# now pre-stripped before the well-formed check and tokenization, so the parse +# completes normally — only the title text loses its apostrophe +# ("Honey.Dont"). parsed: - title: "Honey.Don't.2025.2160p.WEBRip.DSNP.DV.HDR.x265.EAC3.5.1-Amen" - year: null + title: "Honey.Dont" + year: 2025 season: null episode: null - quality: null - source: null - codec: null - group: "UNKNOWN" - tech_string: "" - media_type: "unknown" - parse_path: "ai" + quality: "2160p" + source: "WEBRip" + codec: "x265" + group: "Amen" + tech_string: "2160p.WEBRip.x265" + media_type: "movie" + parse_path: "sanitized" is_season_pack: false tree: