"""Parsers for Obsidian metadata files.""" from dataclasses import dataclass import emoji import regex as re from obsidian_metadata.models.enums import Wrapping @dataclass class Parser: """Regex parsers for Obsidian metadata files. All methods return a list of matches """ # Reusable regex patterns internal_link = r"\[\[[^\[\]]*?\]\]" # An Obsidian link of the form [[]] chars_not_in_tags = r"\u2000-\u206F\u2E00-\u2E7F'!\"#\$%&\(\)\*+,\.:;<=>?@\^`\{\|\}~\[\]\\\s" # Compiled regex patterns tag = re.compile( r""" (?: (?:^|\s|\\{2}) # If tarts with newline, space, or "\\"" (?P\#[^\u2000-\u206F\u2E00-\u2E7F'!\"\#\$%&\(\)\*+,\.:;<=>?@\^`\{\|\}~\[\]\\\s]+) # capture tag | # Else (?:(?<= \#[^\u2000-\u206F\u2E00-\u2E7F'!\"\#\$%&\(\)\*+,\.:;<=>?@\^`\{\|\}~\[\]\\\s]+ )) # if lookbehind is a tag (?P\#[^\u2000-\u206F\u2E00-\u2E7F'!\"\#\$%&\(\)\*+,\.:;<=>?@\^`\{\|\}~\[\]\\\s]+) # capture tag | # Else (*FAIL) ) """, re.X, ) frontmatter_complete = re.compile(r"^\s*(?P---.*?---)", flags=re.DOTALL) frontmatter_data = re.compile( r"(?P^\s*---)(?P.*?)(?P---)", flags=re.DOTALL ) code_block = re.compile(r"```.*?```", flags=re.DOTALL) inline_code = re.compile(r"(?\[)(?!\[) # Open bracket (?P[0-9\p{Letter}\w\s_/-;\*\~`]+?) # Find key (?.*?) # Value (?\])(?!\]) # Close bracket | # Else if opening wrapper is a parenthesis (?\()(?!\() # Open parens (?P[0-9\p{Letter}\w\s_/-;\*\~`]+?) # Find key (?.*?) # Value (?\))(?!\)) # Close parenthesis ) | # Else grab entire line (?P[0-9\p{Letter}\w\s_/-;\*\~`]+?) # Find key (?.*) # Value ) """, re.X | re.I, ) top_with_header = re.compile( r"""^\s* # Start of note (?P # Capture the top of the note .* # Anything above the first header \#+[ ].*?[\r\n] # Full header, if it exists ) # End capture group """, flags=re.DOTALL | re.X, ) validate_key_text = re.compile(r"[^-_\w\d\/\*\u263a-\U0001f999]") validate_tag_text = re.compile(r"[ \|,;:\*\(\)\[\]\\\.\n#&]") def return_inline_metadata(self, line: str) -> list[tuple[str, str, Wrapping]] | None: """Return a list of metadata matches for a single line. Args: line (str): The text to search. Returns: list[tuple[str, str, Wrapping]] | None: A list of tuples containing the key, value, and wrapping type. """ sep = r"(? str | None: """Return a list of metadata matches. Args: text (str): The text to search. data_only (bool, optional): If True, only return the frontmatter data and strip the "---" lines from the returned string. Defaults to False Returns: str | None: The frontmatter block, or None if no frontmatter is found. """ if data_only: result = self.frontmatter_data.search(text) else: result = self.frontmatter_complete.search(text) if result: return result.group("frontmatter").strip() return None def return_tags(self, text: str) -> list[str]: """Return a list of tags. Args: text (str): The text to search. Returns: list[str]: A list of tags. """ return [ t.group("tag") for t in self.tag.finditer(text) if not re.match(r"^#[0-9]+$", t.group("tag")) ] def return_top_with_header(self, text: str) -> str: """Returns the top content of a string until the end of the first markdown header found. Args: text (str): The text to search. Returns: str: The top content of the string. """ result = self.top_with_header.search(text) if result: return result.group("top") return None def strip_frontmatter(self, text: str, data_only: bool = False) -> str: """Strip frontmatter from a string. Args: text (str): The text to search. data_only (bool, optional): If True, only strip the frontmatter data and leave the '---' lines. Defaults to False """ if data_only: return self.frontmatter_data.sub(r"\g\n\g", text) return self.frontmatter_complete.sub("", text) def strip_code_blocks(self, text: str) -> str: """Strip code blocks from a string.""" return self.code_block.sub("", text) def strip_inline_code(self, text: str) -> str: """Strip inline code from a string.""" return self.inline_code.sub("", text)