"""Parsers for Obsidian metadata files."""
from dataclasses import dataclass
import emoji
import regex as re
from obsidian_metadata.models.enums import Wrapping
@dataclass
class Parser:
    """Regex parsers for Obsidian metadata files.

    The compiled patterns are stored as class attributes and the methods of
    this class apply them to note text. ``re`` in this module is the
    third-party ``regex`` package, which the ``tag`` pattern relies on
    (repeated group names, variable-length lookbehind and ``(*FAIL)``).
    """

    # Reusable regex patterns
    internal_link = r"\[\[[^\[\]]*?\]\]"  # An Obsidian link of the form [[link]]
    # Characters that may not appear inside a tag. The ``tag`` pattern below
    # spells out the same character class inline, with ``#`` escaped.
    chars_not_in_tags = r"\u2000-\u206F\u2E00-\u2E7F'!\"#\$%&\(\)\*+,\.:;<=>?@\^`\{\|\}~\[\]\\\s"

    # Compiled regex patterns

    # A tag is "#" followed by one or more allowed characters. It is captured
    # in the group named "tag" (read via ``.group("tag")``) when it either
    # follows start-of-text / whitespace / two backslashes, or directly
    # follows another tag. Both branches use the same group name.
    tag = re.compile(
        r"""
        (?:
            (?:^|\s|\\{2})  # If tarts with newline, space, or "\\""
            (?P<tag>\#[^\u2000-\u206F\u2E00-\u2E7F'!\"\#\$%&\(\)\*+,\.:;<=>?@\^`\{\|\}~\[\]\\\s]+)  # capture tag
            |  # Else
            (?:(?<=
                \#[^\u2000-\u206F\u2E00-\u2E7F'!\"\#\$%&\(\)\*+,\.:;<=>?@\^`\{\|\}~\[\]\\\s]+
            ))  # if lookbehind is a tag
            (?P<tag>\#[^\u2000-\u206F\u2E00-\u2E7F'!\"\#\$%&\(\)\*+,\.:;<=>?@\^`\{\|\}~\[\]\\\s]+)  # capture tag
            |  # Else
            (*FAIL)
        )
        """,
        re.X,
    )

    # The whole leading frontmatter block, including both "---" fences, in
    # the group named "frontmatter".
    frontmatter_complete = re.compile(r"^\s*(?P<frontmatter>---.*?---)", flags=re.DOTALL)

    # The same block split into three groups: opening fence, data, closing
    # fence. Only the "frontmatter" (data) group is read by name elsewhere.
    # NOTE(review): the outer names "open"/"close" are a reconstruction of
    # names that were missing from this text - confirm against history.
    frontmatter_data = re.compile(
        r"(?P<open>^\s*---)(?P<frontmatter>.*?)(?P<close>---)", flags=re.DOTALL
    )

    # A fenced code block: three backticks, anything (including newlines),
    # up to the next three backticks.
    code_block = re.compile(r"```.*?```", flags=re.DOTALL)
# NOTE(review): this statement is damaged and cannot be parsed as written.
# The raw string opened on the next line is never closed on that line, and
# the lines that follow (down to the closing `""",` / `re.X | re.I,`) read
# like the tail of a separate verbose pattern that matches key/value pairs
# wrapped in brackets, wrapped in parentheses, or spanning a whole line.
# The real `inline_code` pattern, the start of that second pattern and all
# of its group names are missing from this text - restore both definitions
# from version control rather than editing this fragment by hand.
inline_code = re.compile(r"(?\[)(?!\[) # Open bracket
(?P[0-9\p{Letter}\w\s_/-;\*\~`]+?) # Find key
(?.*?) # Value
(?\])(?!\]) # Close bracket
| # Else if opening wrapper is a parenthesis
(?\()(?!\() # Open parens
(?P[0-9\p{Letter}\w\s_/-;\*\~`]+?) # Find key
(?.*?) # Value
(?\))(?!\)) # Close parenthesis
)
| # Else grab entire line
(?P[0-9\p{Letter}\w\s_/-;\*\~`]+?) # Find key
(?.*) # Value
)
""",
re.X | re.I,
)
# Leading part of a note, from the first non-whitespace character through the
# end of the first Markdown header line ("#"s, a space, text, line break).
# The captured text is exposed as the group named "top". The prefix before
# the header is matched lazily so the capture stops at the FIRST header
# rather than running on to the last one in the note.
top_with_header = re.compile(
    r"""^\s*                # Start of note
    (?P<top>                # Capture the top of the note
        .*?                 # Anything above the first header
        \#+[ ].*?[\r\n]     # Full header, if it exists
    )                       # End capture group
    """,
    flags=re.DOTALL | re.X,
)

# Matches any single character that is NOT a hyphen, underscore, word
# character, digit, "/", "*" or a code point in U+263A..U+1F999.
validate_key_text = re.compile(r"[^-_\w\d\/\*\u263a-\U0001f999]")

# Matches any single character from the listed set: space, "|", ",", ";",
# ":", "*", parentheses, square brackets, backslash, ".", newline, "#", "&".
validate_tag_text = re.compile(r"[ \|,;:\*\(\)\[\]\\\.\n#&]")
def return_inline_metadata(self, line: str) -> list[tuple[str, str, Wrapping]] | None:
"""Return a list of metadata matches for a single line.
Args:
line (str): The text to search.
Returns:
list[tuple[str, str, Wrapping]] | None: A list of tuples containing the key, value, and wrapping type.
"""
# NOTE(review): the next line is truncated. The raw string assigned to `sep`
# is never closed and the text jumps straight to the end of a different
# signature (`... str | None:`). The rest of return_inline_metadata's body
# and the `def` line of the following method are missing from this text.
# Everything below belongs to that second method: it searches with
# self.frontmatter_data (data_only) or self.frontmatter_complete and returns
# the stripped "frontmatter" group, or None when nothing matches. Its summary
# line still says "a list of metadata matches", which does not match that
# body. Restore both methods from version control.
sep = r"(? str | None:
"""Return a list of metadata matches.
Args:
text (str): The text to search.
data_only (bool, optional): If True, only return the frontmatter data and strip the "---" lines from the returned string. Defaults to False
Returns:
str | None: The frontmatter block, or None if no frontmatter is found.
"""
if data_only:
result = self.frontmatter_data.search(text)
else:
result = self.frontmatter_complete.search(text)
if result:
return result.group("frontmatter").strip()
return None
def return_tags(self, text: str) -> list[str]:
    """Collect the tags found in a piece of text.

    Every match of ``self.tag`` contributes its ``tag`` group, in order of
    appearance. Matches that are a ``#`` followed only by digits are skipped.

    Args:
        text (str): The text to search.

    Returns:
        list[str]: The tags that were found (possibly empty).
    """
    found: list[str] = []
    for hit in self.tag.finditer(text):
        candidate = hit.group("tag")
        # Purely numeric matches such as "#123" are not reported.
        if re.match(r"^#[0-9]+$", candidate):
            continue
        found.append(candidate)
    return found
def return_top_with_header(self, text: str) -> str:
"""Returns the top content of a string until the end of the first markdown header found.
Args:
text (str): The text to search.
Returns:
str: The top content of the string.
"""
result = self.top_with_header.search(text)
if result:
return result.group("top")
return None
def strip_frontmatter(self, text: str, data_only: bool = False) -> str:
    """Strip frontmatter from a string.

    Args:
        text (str): The text to search.
        data_only (bool, optional): If True, only strip the frontmatter data
            and leave the '---' lines. Defaults to False

    Returns:
        str: The text with the frontmatter removed. Text that does not match
            the frontmatter pattern is returned unchanged.
    """
    if data_only:
        # self.frontmatter_data has three groups: opening fence, data,
        # closing fence. Keep groups 1 and 3 and drop the data between them.
        # Numbered references are used so this does not depend on how the
        # groups are named.
        return self.frontmatter_data.sub(r"\g<1>\n\g<3>", text)
    return self.frontmatter_complete.sub("", text)
def strip_code_blocks(self, text: str) -> str:
    """Remove every match of ``self.code_block`` from the text.

    Args:
        text (str): The text to clean.

    Returns:
        str: The text with each matched code block replaced by nothing.
    """
    without_blocks = self.code_block.sub("", text)
    return without_blocks
def strip_inline_code(self, text: str) -> str:
    """Remove every match of ``self.inline_code`` from the text.

    Args:
        text (str): The text to clean.

    Returns:
        str: The text with each matched inline-code span replaced by nothing.
    """
    without_inline = self.inline_code.sub("", text)
    return without_inline