"""Parse comic book archive names using the simple 'parse' parser."""
from __future__ import annotations
from calendar import month_abbr
from copy import copy
from pathlib import Path
from pprint import pformat
from sys import maxsize
from typing import TYPE_CHECKING
from comicfn2dict.log import print_log_header
from comicfn2dict.regex import (
ACRONYM_TRAIL_DOT_RE,
ALPHA_MONTH_RANGE_RE,
BOOK_VOLUME_RE,
BY_AUTHOR_RE,
DASH_SEPARATOR_RE,
ISSUE_BEGIN_RE,
ISSUE_END_RE,
ISSUE_LETTER_RE,
ISSUE_NUMBER_RE,
ISSUE_WITH_COUNT_RE,
LETTER_DOT_RE,
MONTH_FIRST_DATE_RE,
ORIGINAL_FORMAT_NAKED_RE,
ORIGINAL_FORMAT_SCAN_INFO_ADJACENT_RE,
ORIGINAL_FORMAT_SCAN_INFO_RE,
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
PUBLISHER_AMBIGUOUS_RE,
PUBLISHER_AMBIGUOUS_TOKEN_RE,
PUBLISHER_UNAMBIGUOUS_RE,
PUBLISHER_UNAMBIGUOUS_TOKEN_RE,
REGEX_SUBS,
REMAINDER_PAREN_GROUPS_RE,
REMAINING_GROUP_RE,
SCAN_INFO_SECONDARY_RE,
TITLE_PAREN_RE,
TOKEN_DELIMITER,
VOLUME_RE,
VOLUME_WITH_COUNT_RE,
WORD_NUMBER_TO_DIGIT,
YEAR_END_RE,
YEAR_FIRST_DATE_RE,
YEAR_TOKEN_RE,
)
if TYPE_CHECKING:
    # Only used in annotations, which stay unevaluated thanks to
    # ``from __future__ import annotations``.
    from re import Match, Pattern

# The keys that together make up a complete date.
_DATE_KEYS: frozenset[str] = frozenset({"year", "month", "day"})
# Keys handed out, in this order, to the tokens left after all other parsing.
_REMAINING_GROUP_KEYS = ("series", "title")
# Keys whose position a title candidate is compared against.
# Ordered by commonness.
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume", "month")
class ComicFilenameParser:
    """
    Parse a filename metadata into a dict.

    Parsing is destructive on ``_unparsed_path``: each ``_parse_*`` step
    searches what is left of the filename, stores the named regex groups it
    finds in ``metadata`` and (unless told otherwise) cuts the matched text
    out, leaving ``TOKEN_DELIMITER`` between the surviving fragments.
    ``path`` keeps the untouched filename so positions can be looked up later.
    """

    def __init__(self, path: str | Path, verbose: int = 0) -> None:
        """
        Initialize.

        Args:
            path: Filename or path; only the final path component is parsed.
            verbose: Any value above 0 turns on the debug output of ``_log``.
        """
        self._debug: bool = verbose > 0
        # munge path
        if isinstance(path, str):
            path = path.strip()
        # Path.name is already a str, and str is immutable, so neither a
        # str() conversion nor a copy is needed.
        self.path: str = Path(path).name.strip()
        self.metadata: dict[str, str | tuple[str, ...]] = {}
        self._unparsed_path: str = self.path
        # Memo for path_index(); keyed by metadata *value*, not by key.
        self._path_indexes: dict[str, int] = {}

    def path_index(self, key: str, default: int = -1) -> int:
        """
        Lazily retrieve and memoize the key's location in the path.

        Args:
            key: Metadata key whose value is located in the original filename.
            default: Returned when ``key`` is "remainders" or has no value.

        Returns:
            Position of the value in ``self.path``. "ext" is searched from the
            right, everything else from the left. ``str.find`` semantics
            apply, so a value that is not present verbatim yields -1.
        """
        if key == "remainders":
            return default
        value: str = self.metadata.get(key, "")  # pyright: ignore[reportAssignmentType], # ty: ignore[invalid-assignment]
        if not value:
            return default
        if value not in self._path_indexes:
            # This is fragile, but it's difficult to calculate the original
            # position at match time from the ever changing _unparsed_path.
            index = self.path.rfind(value) if key == "ext" else self.path.find(value)
            self._path_indexes[value] = index
        return self._path_indexes[value]

    def _log(self, label: str) -> None:
        """Print the unparsed path and every metadata value with its index."""
        if not self._debug:
            return
        print_log_header(label)
        combined = {
            key: (value, self.path_index(key)) for key, value in self.metadata.items()
        }
        print("  " + self._unparsed_path)  # noqa: T201
        print("  " + pformat(combined))  # noqa: T201

    def _parse_ext(self) -> None:
        """Pop the extension from the pathname."""
        path = Path(self._unparsed_path)
        suffix = path.suffix
        if not suffix:
            return
        data = path.name.removesuffix(suffix)
        ext = suffix.lstrip(".")
        self.metadata["ext"] = ext
        self._unparsed_path = data

    def _clean_dividers(self) -> None:
        """Replace non space dividers and clean extra spaces out of string."""
        data = self._unparsed_path
        # Simple substitutions
        for regex, (replacement, count) in REGEX_SUBS.items():
            data = regex.sub(replacement, data, count=count).strip()
        self._unparsed_path = data.strip()
        self._log("After Clean Path")

    def _parse_items_update_metadata(
        self, matches: Match, exclude: str, *, require_all: bool, first_only: bool
    ) -> bool:
        """
        Update Metadata.

        Args:
            matches: Regex match whose named groups become metadata entries.
            exclude: A group whose value equals this string is skipped.
            require_all: Reject the whole match if any group is empty.
            first_only: Stop after the first group that yields a value.

        Returns:
            True if at least one value was stored in ``self.metadata``.
        """
        matched_metadata = {}
        for key, value in matches.groupdict().items():
            # The exclusion test comes first, so with the default exclude=""
            # an empty-string group is skipped before require_all is checked.
            if value == exclude:
                continue
            if not value:
                if require_all:
                    return False
                continue
            matched_metadata[key] = value
            if first_only:
                break
        if not matched_metadata:
            return False
        self.metadata.update(matched_metadata)
        return True

    def _parse_items_pop_tokens(self, regex: Pattern, *, first_only: bool) -> None:
        """Pop tokens from unparsed path."""
        # For re.sub, count=0 means "replace every occurrence".
        count = 1 if first_only else 0
        marked_str = regex.sub(TOKEN_DELIMITER, self._unparsed_path, count=count)
        # Re-join the surviving fragments, dropping the ones that are blank.
        parts = [
            token for part in marked_str.split(TOKEN_DELIMITER) if (token := part.strip())
        ]
        self._unparsed_path = TOKEN_DELIMITER.join(parts)

    def _parse_items(
        self,
        regex: Pattern,
        exclude: str = "",
        *,
        require_all: bool = False,
        first_only: bool = False,
        pop: bool = True,
    ) -> None:
        """
        Parse a value from the data list into metadata and alter the data list.

        Args:
            regex: Pattern with named groups, searched in ``_unparsed_path``.
            exclude: Group value to ignore.
            require_all: Only accept the match if every group has a value.
            first_only: Keep only the first captured group and, when popping,
                remove only the first occurrence of the pattern.
            pop: Remove the matched text from ``_unparsed_path`` on success.
        """
        # Match
        matches = regex.search(self._unparsed_path)
        if not matches:
            return
        if not self._parse_items_update_metadata(
            matches, exclude, require_all=require_all, first_only=first_only
        ):
            return
        if pop:
            self._parse_items_pop_tokens(regex, first_only=first_only)

    def _parse_issue(self) -> None:
        """Parse Issue."""
        self._parse_items(ISSUE_NUMBER_RE)
        if "issue" not in self.metadata:
            self._parse_items(ISSUE_WITH_COUNT_RE)
        if "issue" not in self.metadata:
            # Letter-only issues like "#Omega" or "#Alpha" — only fires when
            # no digit-bearing issue regex matched.
            self._parse_items(ISSUE_LETTER_RE)
        self._log("After Issue")

    def _parse_volume(self) -> None:
        """Parse Volume."""
        self._parse_items(VOLUME_RE)
        if "volume" not in self.metadata:
            self._parse_items(VOLUME_WITH_COUNT_RE)
        self._log("After Volume")

    def _alpha_month_to_numeric(self) -> None:
        """Translate alpha_month to numeric month."""
        alpha_month: str = self.metadata.pop("alpha_month", "")  # pyright: ignore[reportAssignmentType], # ty: ignore[invalid-assignment]
        if not alpha_month:
            return
        alpha_month = alpha_month.capitalize()
        # month_abbr[0] is an empty placeholder, hence the `abbr and` guard.
        # NOTE: calendar.month_abbr follows the current locale.
        for index, abbr in enumerate(month_abbr):
            if abbr and alpha_month.startswith(abbr):
                self.metadata["month"] = f"{index:02d}"
                break

    def _parse_dates(self) -> None:
        """Parse date schemes."""
        # Discard second month of alpha month ranges.
        self._unparsed_path = ALPHA_MONTH_RANGE_RE.sub(r"\1", self._unparsed_path)

        # Month first date
        self._parse_items(MONTH_FIRST_DATE_RE)
        self._alpha_month_to_numeric()

        # Year first date
        if _DATE_KEYS - self.metadata.keys():
            self._parse_items(YEAR_FIRST_DATE_RE)
            self._alpha_month_to_numeric()

        if "year" not in self.metadata:
            self._parse_items(YEAR_TOKEN_RE, first_only=True)
            first_year = self.metadata.get("year", "")
            # A second year will be the real year.
            # Move the first year to volume, unless a volume is already known.
            if first_year and "volume" not in self.metadata:
                self._parse_items(YEAR_TOKEN_RE)
                if self.metadata.get("year", "") != first_year:
                    self.metadata["volume"] = first_year
        self._log("After Date")

    def _parse_format_and_scan_info(self) -> None:
        """Format & Scan Info."""
        # Try adjacent "(format) (scan_info)" pairs first so compound formats
        # like "(digital-mobile) (Empire)" don't get split as
        # format=digital + scan_info=mobile by the combined regex.
        self._parse_items(
            ORIGINAL_FORMAT_SCAN_INFO_ADJACENT_RE,
            require_all=True,
        )
        if "original_format" not in self.metadata:
            self._parse_items(
                ORIGINAL_FORMAT_SCAN_INFO_RE,
                require_all=True,
            )
        if "original_format" not in self.metadata:
            self._parse_items(
                ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
            )
        self._parse_items(SCAN_INFO_SECONDARY_RE)
        # The secondary scan info is only a fallback: it is always removed
        # from metadata and promoted to scan_info only if none was found.
        if (
            scan_info_secondary := self.metadata.pop("secondary_scan_info", "")
        ) and "scan_info" not in self.metadata:
            self.metadata["scan_info"] = scan_info_secondary
        self._log("After original_format & scan_info")

    def _parse_paren_subtitle(self) -> None:
        """
        Promote a Title Case paren group to a title (FCBD-style subtitle).

        Only fires when there's a single remaining paren group, to avoid
        misclassifying scan_info releaser groups like "(Shadowcat-Empire)"
        that follow another paren.
        """
        if "title" in self.metadata:
            return
        if self._unparsed_path.count("(") != 1:
            return
        self._parse_items(TITLE_PAREN_RE, first_only=True)
        self._log("After paren subtitle")

    def _parse_remainder_paren_groups(self) -> None:
        """Remove extraneous paren groups."""
        self._parse_items(REMAINDER_PAREN_GROUPS_RE)
        remainders: str = self.metadata.get("remainders", "")  # pyright: ignore[reportAssignmentType], # ty: ignore[invalid-assignment]
        if remainders:
            # Remainders are stored as a tuple so more can be added later.
            self.metadata["remainders"] = (remainders,)
        self._log("After parsing remainder paren and bracket groups")

    def _parse_ends_of_remaining_tokens(self) -> None:
        """Parse volume, year and issue left on the ends of remaining tokens."""
        # Volume left on the end of string tokens
        if "volume" not in self.metadata:
            self._parse_items(BOOK_VOLUME_RE)
            # BOOK_VOLUME_RE accepts word-number volumes ("Book One"); convert
            # them to digit strings so downstream consumers see "1" not "one".
            volume = self.metadata.get("volume", "")
            if isinstance(volume, str) and (
                digit := WORD_NUMBER_TO_DIGIT.get(volume.lower())
            ):
                self.metadata["volume"] = digit
        self._log("After Volume on end of token")

        # Years left on the end of string tokens
        year_end_matched = False
        if "year" not in self.metadata:
            # pop=False: the year is recorded but left in the unparsed path.
            self._parse_items(YEAR_END_RE, pop=False)
            year_end_matched = "year" in self.metadata
        self._log("After Year on end of token")

        # Issue left on the end of string tokens
        if "issue" not in self.metadata and not year_end_matched:
            exclude: str = self.metadata.get("year", "")  # pyright: ignore[reportAssignmentType], # ty: ignore[invalid-assignment]
            self._parse_items(ISSUE_END_RE, exclude=exclude)
        if "issue" not in self.metadata:
            self._parse_items(ISSUE_BEGIN_RE)
        self._log("After Issue on ends of tokens")

    def _parse_publisher(self) -> None:
        """Parse Publisher."""
        # Pop publisher tokens so they don't end up as titles, but only if
        # other tokens remain — otherwise the publisher IS the series
        # (e.g. "Marvel #001 (2020).cbz").
        if TOKEN_DELIMITER in self._unparsed_path:
            self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE, first_only=True)
            if "publisher" not in self.metadata:
                self._parse_items(PUBLISHER_AMBIGUOUS_TOKEN_RE, first_only=True)
        # Fall back to recording a publisher without removing it from the path.
        if "publisher" not in self.metadata:
            self._parse_items(PUBLISHER_UNAMBIGUOUS_RE, pop=False, first_only=True)
        if "publisher" not in self.metadata:
            self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True)
        self._log("After publisher")

    def _is_at_title_position(self, value: str) -> bool:
        """
        Title is in correct position.

        Args:
            value: Candidate title text, located in the original filename.

        Returns:
            True if the candidate is not before the series, not after the
            format or scan info, and after at least one preceding key.
        """
        title_index = self.path.find(value)

        # Titles must come after series but before format and scan_info
        if (
            title_index < self.path_index("series")
            or title_index > self.path_index("original_format", maxsize)
            or title_index > self.path_index("scan_info", maxsize)
        ):
            return False

        # Titles must be after the series and one other token.
        # A key missing from metadata reports index -1, so it counts as
        # preceding any candidate that was actually found in the path.
        return any(
            title_index > self.path_index(preceding_key)
            for preceding_key in _TITLE_PRECEDING_KEYS
        )

    def _grouping_operators_strip(self, value: str) -> str:
        """Strip spaces and parens."""
        # Each kind of wrapper is peeled in a fixed order, with whitespace
        # trimmed after every step.
        value = value.strip()
        value = value.strip("()").strip()
        value = value.strip("-").strip()
        value = value.strip(",").strip()
        value = value.strip("'").strip()
        return value.strip('"').strip()

    def _parse_series_and_title_token(
        self, remaining_key_index: int, tokens: list[str]
    ) -> str:
        """
        Parse one series or title token.

        Args:
            remaining_key_index: Index into ``_REMAINING_GROUP_KEYS``.
            tokens: Pending tokens; the first one is consumed unless the key
                is already present in metadata.

        Returns:
            The token if it could not be used for the key, otherwise "".
        """
        key = _REMAINING_GROUP_KEYS[remaining_key_index]
        if key in self.metadata:
            return ""
        token = tokens.pop(0)
        match = REMAINING_GROUP_RE.search(token)
        if not match:
            return token
        value = match.group()
        if key == "title":
            if not self._is_at_title_position(value):
                return token
            # Parse titles that are really formats as formats.
            if "original_format" not in self.metadata and (
                match := ORIGINAL_FORMAT_NAKED_RE.fullmatch(value)
            ):
                self.metadata["original_format"] = match.group()
                return ""
        # Acronyms produce overlapping matches (A.X.E. needs two passes)
        while (new := LETTER_DOT_RE.sub(r"\1 \2", value)) != value:
            value = new
        # Drop trailing acronym dot ("A X E." -> "A X E", "S H I E L D." ->
        # "S H I E L D"). Keeps "Dr.", "Inc.", "vs." since those aren't
        # whitespace-bounded single letters.
        value = ACRONYM_TRAIL_DOT_RE.sub(r"\1\2\3", value)
        # Drop "by Author1 (& Author2)" attribution from series names.
        if key == "series":
            value = BY_AUTHOR_RE.sub("", value)
        value = self._grouping_operators_strip(value)
        if value:
            self.metadata[key] = value
        return ""

    def _parse_series_and_title(self) -> None:
        """Assign series and title."""
        if not self._unparsed_path:
            return
        tokens = self._unparsed_path.split(TOKEN_DELIMITER)

        # Promote a single dash separator in the only remaining token to a
        # series/title boundary. Catches the common convention where the
        # canonical ":" is replaced with " - " (or "word- ") because
        # filesystems disallow ":". Restricted to the single-token case so
        # multi-dash co-headlining like "Aquaman - Green Arrow - Deep Target"
        # stays in the series and so a later token already destined for the
        # title isn't displaced.
        if "title" not in self.metadata and len(tokens) == 1:
            matches = DASH_SEPARATOR_RE.findall(tokens[0])
            if len(matches) == 1:
                head, tail = DASH_SEPARATOR_RE.split(tokens[0], maxsplit=1)
                tokens = [head, tail]

        remaining_key_index = 0
        unused_tokens = []
        while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
            unused_token = self._parse_series_and_title_token(
                remaining_key_index, tokens
            )
            if unused_token:
                unused_tokens.append(unused_token)
            remaining_key_index += 1

        # Joining an empty list already yields "".
        self._unparsed_path = " ".join(unused_tokens)
        self._log("After Series & Title")

    def _add_remainders(self) -> None:
        """Add Remainders."""
        leftovers = [
            remainder
            for token in self._unparsed_path.split(TOKEN_DELIMITER)
            if (remainder := token.strip())
        ]
        if leftovers:
            # New leftovers go in front of any remainders recorded earlier.
            self.metadata["remainders"] = (
                *leftovers,
                *self.metadata.get("remainders", ()),
            )

    def parse(self) -> dict[str, str | tuple[str, ...]]:
        """
        Parse the filename with a hierarchy of regexes.

        Returns:
            ``self.metadata`` (the parser's own dict, not a copy), filled in
            by the parsing steps below in order.
        """
        self._log("Init")
        self._parse_ext()
        self._clean_dividers()
        self._parse_issue()
        self._parse_volume()
        self._parse_dates()
        self._parse_format_and_scan_info()
        self._parse_paren_subtitle()
        self._parse_remainder_paren_groups()
        self._parse_ends_of_remaining_tokens()
        self._parse_publisher()
        self._parse_series_and_title()

        # Copy volume into issue if it's all we have.
        if "issue" not in self.metadata and "volume" in self.metadata:
            self.metadata["issue"] = self.metadata["volume"]
            self._log("Using volume for issue.")
        self._log("After issue can be volume")

        self._add_remainders()

        return self.metadata
def comicfn2dict(
    path: str | Path, verbose: int = 0
) -> dict[str, str | tuple[str, ...]]:
    """
    Simplify the API.

    Args:
        path: Filename or path of the comic archive to parse.
        verbose: Any value above 0 turns on the parser's debug output.

    Returns:
        The metadata dict produced by ``ComicFilenameParser.parse``.
    """
    parser = ComicFilenameParser(path, verbose=verbose)
    return parser.parse()