comicfn2dict.parse - comicfn2dict

[docs] module comicfn2dict.parse
"""Parse comic book archive names using the simple 'parse' parser."""from __future__ importannotationsfrom calendar importmonth_abbrfrom copy importcopyfrom pathlib importPathfrom pprint importpformatfrom sys importmaxsizefrom typing importTYPE_CHECKINGfrom comicfn2dict.log importprint_log_headerfrom comicfn2dict.regex import(ACRONYM_TRAIL_DOT_RE,ALPHA_MONTH_RANGE_RE,BOOK_VOLUME_RE,BY_AUTHOR_RE,DASH_SEPARATOR_RE,ISSUE_BEGIN_RE,ISSUE_END_RE,ISSUE_LETTER_RE,ISSUE_NUMBER_RE,ISSUE_WITH_COUNT_RE,LETTER_DOT_RE,MONTH_FIRST_DATE_RE,ORIGINAL_FORMAT_NAKED_RE,ORIGINAL_FORMAT_SCAN_INFO_ADJACENT_RE,ORIGINAL_FORMAT_SCAN_INFO_RE,ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,PUBLISHER_AMBIGUOUS_RE,PUBLISHER_AMBIGUOUS_TOKEN_RE,PUBLISHER_UNAMBIGUOUS_RE,PUBLISHER_UNAMBIGUOUS_TOKEN_RE,REGEX_SUBS,REMAINDER_PAREN_GROUPS_RE,REMAINING_GROUP_RE,SCAN_INFO_SECONDARY_RE,TITLE_PAREN_RE,TOKEN_DELIMITER,VOLUME_RE,VOLUME_WITH_COUNT_RE,WORD_NUMBER_TO_DIGIT,YEAR_END_RE,YEAR_FIRST_DATE_RE,YEAR_TOKEN_RE,)ifTYPE_CHECKING:from re importMatch,Pattern_DATE_KEYS:frozenset[str]=frozenset({"year","month","day"})_REMAINING_GROUP_KEYS=("series","title")# Ordered by commonness._TITLE_PRECEDING_KEYS=("issue","year","volume","month")class ComicFilenameParser:[docs]
    """Parse a filename metadata into a dict."""def path_index(self,key:str,default:int=-1)->int:[docs]
        """Lazily retrieve and memoize the key's location in the path."""ifkey=="remainders":returndefaultvalue:str=self.metadata.get(key,"")# pyright: ignore[reportAssignmentType], # ty: ignore[invalid-assignment]ifnotvalue:returndefaultifvaluenotinself._path_indexes:# This is fragile, but it's difficult to calculate the original#     position at match time from the ever changing _unparsed_path.index=self.path.rfind(value)ifkey=="ext"elseself.path.find(value)self._path_indexes[value]=indexreturnself._path_indexes[value]def _log(self,label:str)->None:ifnotself._debug:returnprint_log_header(label)combined={}forkeyinself.metadata:combined[key]=(self.metadata.get(key),self.path_index(key))print("  "+self._unparsed_path)# noqa: T201print("  "+pformat(combined))# noqa: T201def _parse_ext(self)->None:        """Pop the extension from the pathname."""path=Path(self._unparsed_path)suffix=path.suffixifnotsuffix:returndata=path.name.removesuffix(suffix)ext=suffix.lstrip(".")self.metadata["ext"]=extself._unparsed_path=datadef _clean_dividers(self)->None:        """Replace non space dividers and clean extra spaces out of string."""data=self._unparsed_path# Simple substitutionsforregex,pairinREGEX_SUBS.items():replacement,count=pairdata=regex.sub(replacement,data,count=count).strip()self._unparsed_path=data.strip()self._log("After Clean Path")def _parse_items_update_metadata(self,matches:Match,exclude:str,*,require_all:bool,first_only:bool)->bool:        """Update Metadata."""matched_metadata={}forkey,valueinmatches.groupdict().items():ifvalue==exclude:continueifnotvalue:ifrequire_all:returnFalsecontinuematched_metadata[key]=valueiffirst_only:breakifnotmatched_metadata:returnFalseself.metadata.update(matched_metadata)returnTruedef _parse_items_pop_tokens(self,regex:Pattern,*,first_only:bool)->None:        """Pop tokens from unparsed path."""count=1iffirst_onlyelse0marked_str=regex.sub(TOKEN_DELIMITER,self._unparsed_path,count=count)parts=[]forpartinmarked_str.split(TOKEN_DELIMITER):token=part.strip()iftoken:parts.append(token)self._unparsed_path=TOKEN_DELIMITER.join(parts)def _parse_items(self,regex:Pattern,exclude:str="",*,require_all:bool=False,first_only:bool=False,pop:bool=True,)->None:        """Parse a value from the data list into metadata and alter the data list."""# Matchmatches=regex.search(self._unparsed_path)ifnotmatches:returnifnotself._parse_items_update_metadata(matches,exclude,require_all=require_all,first_only=first_only):returnifpop:self._parse_items_pop_tokens(regex,first_only=first_only)def _parse_issue(self)->None:        """Parse Issue."""self._parse_items(ISSUE_NUMBER_RE)if"issue"notinself.metadata:self._parse_items(ISSUE_WITH_COUNT_RE)if"issue"notinself.metadata:# Letter-only issues like "#Omega" or "#Alpha" — only fires when# no digit-bearing issue regex matched.self._parse_items(ISSUE_LETTER_RE)self._log("After Issue")def _parse_volume(self)->None:        """Parse Volume."""self._parse_items(VOLUME_RE)if"volume"notinself.metadata:self._parse_items(VOLUME_WITH_COUNT_RE)self._log("After Volume")def _alpha_month_to_numeric(self)->None:        """Translate alpha_month to numeric month."""alpha_month:str=self.metadata.pop("alpha_month","")# pyright: ignore[reportAssignmentType], # ty: ignore[invalid-assignment]ifalpha_month:alpha_month=alpha_month.capitalize()forindex,abbrinenumerate(month_abbr):ifabbrandalpha_month.startswith(abbr):month=f"{index:02d}"self.metadata["month"]=monthbreakdef _parse_dates(self)->None:        """Parse date schemes."""# Discard second month of alpha month ranges.self._unparsed_path=ALPHA_MONTH_RANGE_RE.sub(r"\1",self._unparsed_path)# Month first dateself._parse_items(MONTH_FIRST_DATE_RE)self._alpha_month_to_numeric()# Year first dateif_DATE_KEYS-self.metadata.keys():self._parse_items(YEAR_FIRST_DATE_RE)self._alpha_month_to_numeric()if"year"notinself.metadata:self._parse_items(YEAR_TOKEN_RE,first_only=True)if"volume"inself.metadata:return# A second year will be the real year.# Move the first year to volumeifvolume:=self.metadata.get("year",""):self._parse_items(YEAR_TOKEN_RE)ifself.metadata.get("year","")!=volume:self.metadata["volume"]=volumeself._log("After Date")def _parse_format_and_scan_info(self)->None:        """Format & Scan Info."""# Try adjacent "(format) (scan_info)" pairs first so compound formats# like "(digital-mobile) (Empire)" don't get split as# format=digital + scan_info=mobile by the combined regex.self._parse_items(ORIGINAL_FORMAT_SCAN_INFO_ADJACENT_RE,require_all=True,)if"original_format"notinself.metadata:self._parse_items(ORIGINAL_FORMAT_SCAN_INFO_RE,require_all=True,)if"original_format"notinself.metadata:self._parse_items(ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,)self._parse_items(SCAN_INFO_SECONDARY_RE)if(scan_info_secondary:=self.metadata.pop("secondary_scan_info",""))and"scan_info"notinself.metadata:self.metadata["scan_info"]=scan_info_secondaryself._log("After original_format & scan_info")def _parse_paren_subtitle(self)->None:        """        Promote a Title Case paren group to a title (FCBD-style subtitle).        Only fires when there's a single remaining paren group, to avoid        misclassifying scan_info releaser groups like "(Shadowcat-Empire)"        that follow another paren.        """if"title"inself.metadata:returnifself._unparsed_path.count("(")!=1:returnself._parse_items(TITLE_PAREN_RE,first_only=True)self._log("After paren subtitle")def _parse_remainder_paren_groups(self)->None:        """Remove extraneous paren groups."""self._parse_items(REMAINDER_PAREN_GROUPS_RE)remainders:str=self.metadata.get("remainders","")# pyright: ignore[reportAssignmentType], # ty: ignore[invalid-assignment]ifremainders:self.metadata["remainders"]=(remainders,)self._log("After parsing remainder paren and bracket groups")def _parse_ends_of_remaining_tokens(self)->None:# Volume left on the end of string tokensif"volume"notinself.metadata:self._parse_items(BOOK_VOLUME_RE)# BOOK_VOLUME_RE accepts word-number volumes ("Book One"); convert# them to digit strings so downstream consumers see "1" not "one".volume=self.metadata.get("volume","")ifisinstance(volume,str)and(digit:=WORD_NUMBER_TO_DIGIT.get(volume.lower())):self.metadata["volume"]=digitself._log("After original_format & scan_info")# Years left on the end of string tokensyear_end_matched=Falseif"year"notinself.metadata:self._parse_items(YEAR_END_RE,pop=False)year_end_matched="year"inself.metadataself._log("After Year on end of token")# Issue left on the end of string tokensif"issue"notinself.metadataandnotyear_end_matched:exclude:str=self.metadata.get("year","")# pyright: ignore[reportAssignmentType], # ty: ignore[invalid-assignment]self._parse_items(ISSUE_END_RE,exclude=exclude)if"issue"notinself.metadata:self._parse_items(ISSUE_BEGIN_RE)self._log("After Issue on ends of tokens")def _parse_publisher(self)->None:        """Parse Publisher."""# Pop publisher tokens so they don't end up as titles, but only if# other tokens remain — otherwise the publisher IS the series# (e.g. "Marvel #001 (2020).cbz").ifTOKEN_DELIMITERinself._unparsed_path:self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE,first_only=True)if"publisher"notinself.metadata:self._parse_items(PUBLISHER_AMBIGUOUS_TOKEN_RE,first_only=True)if"publisher"notinself.metadata:self._parse_items(PUBLISHER_UNAMBIGUOUS_RE,pop=False,first_only=True)if"publisher"notinself.metadata:self._parse_items(PUBLISHER_AMBIGUOUS_RE,pop=False,first_only=True)self._log("After publisher")def _is_at_title_position(self,value:str)->bool:        """Title is in correct position."""title_index=self.path.find(value)# Titles must come after series but before format and scan_infoif(title_index<self.path_index("series")ortitle_index>self.path_index("original_format",maxsize)ortitle_index>self.path_index("scan_info",maxsize)):returnFalse# Titles must be after the series and one other token.title_ok=Falseother_tokens_exist=Falseforpreceding_keyin_TITLE_PRECEDING_KEYS:other_tokens_exist=Trueiftitle_index>self.path_index(preceding_key):title_ok=Truebreakreturntitle_okornotother_tokens_existdef _grouping_operators_strip(self,value:str)->str:        """Strip spaces and parens."""value=value.strip()value=value.strip("()").strip()value=value.strip("-").strip()value=value.strip(",").strip()value=value.strip("'").strip()returnvalue.strip('"').strip()def _parse_series_and_title_token(self,remaining_key_index:int,tokens:list[str])->str:        """Parse one series or title token."""key=_REMAINING_GROUP_KEYS[remaining_key_index]ifkeyinself.metadata:return""token=tokens.pop(0)match=REMAINING_GROUP_RE.search(token)ifnotmatch:returntokenvalue=match.group()ifkey=="title":ifnotself._is_at_title_position(value):returntoken# Parse titles that are really formats as formats.if"original_format"notinself.metadataand(match:=ORIGINAL_FORMAT_NAKED_RE.fullmatch(value)):self.metadata["original_format"]=match.group()return""# Acronyms produce overlapping matches (A.X.E. needs two passes)while(new:=LETTER_DOT_RE.sub(r"\1 \2",value))!=value:value=new# Drop trailing acronym dot ("A X E." -> "A X E", "S H I E L D." -># "S H I E L D"). Keeps "Dr.", "Inc.", "vs." since those aren't# whitespace-bounded single letters.value=ACRONYM_TRAIL_DOT_RE.sub(r"\1\2\3",value)# Drop "by Author1 (& Author2)" attribution from series names.ifkey=="series":value=BY_AUTHOR_RE.sub("",value)value=self._grouping_operators_strip(value)ifvalue:self.metadata[key]=valuereturn""def _parse_series_and_title(self)->None:        """Assign series and title."""ifnotself._unparsed_path:returntokens=self._unparsed_path.split(TOKEN_DELIMITER)# Promote a single dash separator in the only remaining token to a# series/title boundary. Catches the common convention where the# canonical ":" is replaced with " - " (or "word- ") because# filesystems disallow ":". Restricted to the single-token case so# multi-dash co-headlining like "Aquaman - Green Arrow - Deep Target"# stays in the series and so a later token already destined for the# title isn't displaced.if"title"notinself.metadataandlen(tokens)==1:matches=DASH_SEPARATOR_RE.findall(tokens[0])iflen(matches)==1:head,tail=DASH_SEPARATOR_RE.split(tokens[0],maxsplit=1)tokens=[head,tail]remaining_key_index=0unused_tokens=[]whiletokensandremaining_key_index<len(_REMAINING_GROUP_KEYS):unused_token=self._parse_series_and_title_token(remaining_key_index,tokens)ifunused_token:unused_tokens.append(unused_token)remaining_key_index+=1self._unparsed_path=" ".join(unused_tokens)ifunused_tokenselse""self._log("After Series & Title")def _add_remainders(self)->None:        """Add Remainders."""remainders=[]fortokeninself._unparsed_path.split(TOKEN_DELIMITER):remainder=token.strip()ifremainder:remainders.append(remainder)ifremainders:self.metadata["remainders"]=tuple(remainders+list(self.metadata.get("remainders",[])))def parse(self)->dict[str,str|tuple[str,...]]:[docs]
        """Parse the filename with a hierarchy of regexes."""self._log("Init")self._parse_ext()self._clean_dividers()self._parse_issue()self._parse_volume()self._parse_dates()self._parse_format_and_scan_info()self._parse_paren_subtitle()self._parse_remainder_paren_groups()self._parse_ends_of_remaining_tokens()self._parse_publisher()self._parse_series_and_title()# Copy volume into issue if it's all we have.if"issue"notinself.metadataand"volume"inself.metadata:self.metadata["issue"]=self.metadata["volume"]self._log("Using volume for issue.")self._log("After issue can be volume")self._add_remainders()returnself.metadatadef __init__(self,path:str|Path,verbose:int=0)->None:        """Initialize."""self._debug:bool=verbose>0# munge pathifisinstance(path,str):path=path.strip()p_path=Path(path)self.path=str(p_path.name).strip()self.metadata:dict[str,str|tuple[str,...]]={}self._unparsed_path=copy(self.path)self._path_indexes:dict[str,int]={}def comicfn2dict([docs]
path:str|Path,verbose:int=0)->dict[str,str|tuple[str,...]]:    """Simplfily the API."""parser=ComicFilenameParser(path,verbose=verbose)returnparser.parse()
Keys	Action
`?`	Open this help
`n`	Next page
`p`	Previous page
`s`	Search
comicfn2dict.parse¶