Source code for gitmatch

"""
Gitignore-style path matching

``gitmatch`` provides ``gitignore``-style pattern matching of file paths.
Simply pass in a sequence of ``gitignore`` patterns and you'll get back an
object for testing whether a given relative path matches the patterns.

Visit <https://github.com/jwodder/gitmatch> or <https://gitmatch.rtfd.io> for
more information.
"""

from __future__ import annotations
from collections.abc import Iterable
from dataclasses import asdict, dataclass
import os
from pathlib import PurePosixPath, PureWindowsPath
import posixpath
import re
from typing import Any, AnyStr, Generic, Optional

__version__ = "0.1.0"
__author__ = "John Thorvald Wodder II"
__author_email__ = "gitmatch@varonathe.org"
__license__ = "MIT"
__url__ = "https://github.com/jwodder/gitmatch"

__all__ = [
    "Gitignore",
    "InvalidPathError",
    "InvalidPatternError",
    "Match",
    "Pattern",
    "Regex",
    "compile",
    "pattern2regex",
]


[docs]@dataclass class Gitignore(Generic[AnyStr]): """A collection of compiled gitignore patterns""" #: :meta private: patterns: list[Pattern[AnyStr]]
[docs] def match( self, path: AnyStr | os.PathLike[AnyStr], is_dir: bool = False ) -> Optional[Match[AnyStr]]: """ Test whether the given relative path matches the collection of patterns. If ``is_dir`` is true or if ``path`` ends in a slash, ``path`` is treated as a path to a directory; otherwise, it treated as a path to a file. If on Windows and ``path`` is not an instance of `pathlib.PurePosixPath`, or if on any OS and ``path`` is an instance of `pathlib.PureWindowsPath`, any backslashes in ``path`` will be converted to forward slashes before matching. If a match is found, a `Match` object is returned containing information about the matching pattern and the path or portion thereof that matched. The `Match` object may be either "truthy" or "falsy" depending on whether the matching pattern is negative or not. If none of the patterns match the path, `match()` returns `None`. Hence, if you're just interested in whether the patterns say the path should be gitignored, call `bool()` on the result or use it in a boolean context like an ``if ... :`` line. :raises InvalidPathError: If ``path`` is empty, is absolute, is not normalized (aside from an optional trailing slash), contains a NUL character, or starts with ``..``. """ orig = path path = os.fspath(path) if isinstance(path, str): NUL = "\0" SLASH = "/" SEP = os.sep WINSEP = "\\" CURDIR = "." PARDIR = ".." else: NUL = b"\0" SLASH = b"/" SEP = os.sep.encode("us-ascii") WINSEP = b"\\" CURDIR = b"." PARDIR = b".." if not path: raise InvalidPathError("Empty path", orig) if NUL in path: raise InvalidPathError("Path contains NUL byte", orig) if os.path.isabs(path): raise InvalidPathError("Path is not relative", orig) if SEP != SLASH and not isinstance(orig, PurePosixPath): path = path.replace(SEP, SLASH) elif isinstance(orig, PureWindowsPath): path = path.replace(WINSEP, SLASH) if path.endswith(SLASH): is_dir = True path = path[:-1] if posixpath.normpath(path) != path: raise InvalidPathError("Path is not normalized", orig) if path.split(SLASH)[0] == PARDIR: raise InvalidPathError("Path cannot begin with '..'", orig) if path == CURDIR: return None for p in pathway(path): for pat in reversed(self.patterns): if pat.match(p, is_dir=(is_dir if p == path else True)): if not pat.negative: return Match(pat, p) elif p == path: return Match(pat, p) else: break return None
[docs]@dataclass class Match(Generic[AnyStr]): """ Information about a successful match of a path against a pattern. A `Match` is truthy if the pattern was not negative and falsy otherwise. """ #: The compiled `Pattern` object that matched the path pattern_obj: Pattern[AnyStr] #: The path that matched. This may be a parent path of the value passed to #: `~Gitignore.match()`. path: AnyStr @property def pattern(self) -> AnyStr: """ The original gitignore pattern provided to `compile()`, with trailing spaces stripped """ return self.pattern_obj.pattern def __bool__(self) -> bool: return not self.pattern_obj.negative
[docs]@dataclass class Pattern(Generic[AnyStr]): """A compiled gitignore pattern""" #: The original gitignore pattern provided to `compile()`, with trailing #: spaces stripped pattern: AnyStr #: A compiled regular expression pattern regex: re.Pattern[AnyStr] #: Whether the pattern is negative or not negative: bool #: Whether the pattern only matches directories dir_only: bool #: Whether the pattern is case-insensitive ignorecase: bool
[docs] def match(self, path: AnyStr, is_dir: bool = False) -> bool: """ Test whether the pattern matches the given path. ``path`` is assumed to be a relative, normalized, ``/``-separated path. If ``is_dir`` is true, the path is assumed to refer to a directory; otherwise, it is assumed to refer to a file. Unlike `Gitignore.match()`, this method only tests ``path`` itself, not any of its parent paths. """ if self.dir_only and not is_dir: return False return bool(self.regex.fullmatch(path))
[docs]@dataclass class Regex(Generic[AnyStr]): """A gitignore pattern that has been converted to a regular expression""" #: The original gitignore pattern provided to `compile()`, with trailing #: spaces stripped pattern: AnyStr #: The regular expression equivalent of the pattern regex: AnyStr #: Whether the pattern is negative or not negative: bool #: Whether the pattern only matches directories dir_only: bool #: Whether the pattern is case-insensitive ignorecase: bool
[docs] def compile(self) -> Pattern[AnyStr]: """Compile the regular expression""" return Pattern( pattern=self.pattern, regex=re.compile(self.regex), negative=self.negative, dir_only=self.dir_only, ignorecase=self.ignorecase, )
[docs]def compile(patterns: Iterable[AnyStr], ignorecase: bool = False) -> Gitignore[AnyStr]: """ Compile a collection of gitignore patterns into a `Gitignore` instance. Any invalid or empty patterns are discarded. Trailing newlines are stripped from the patterns before compiling, so you can compile a pre-existing :file:`.gitignore` file by simply doing: .. code:: python with open("path/to/.gitignore") as fp: gi = gitmatch.compile(fp) :param patterns: an iterable of gitignore patterns :param bool ignorecase: Whether the patterns should match case-insensitively """ compiled_patterns: list[Pattern[AnyStr]] = [] for pat in patterns: try: regex = pattern2regex(chomp(pat), ignorecase=ignorecase) except InvalidPatternError: continue if regex is None: continue compiled_patterns.append(regex.compile()) return Gitignore(compiled_patterns)
@dataclass class ParserStrs(Generic[AnyStr]): """ A collection of either `str` or `bytes` constants used by `pattern2regex()` """ posix_classes: dict[AnyStr, AnyStr] parser: re.Pattern[AnyStr] range_parser: re.Pattern[AnyStr] octothorpe: AnyStr bang: AnyStr slash: AnyStr start: AnyStr istart: AnyStr end: AnyStr leading_globstar_slash: re.Pattern[AnyStr] is_anchored: re.Pattern[AnyStr] unanchored_start: AnyStr slash_globstar: AnyStr slash_globstar_slash: AnyStr globstar_slash: AnyStr qm: AnyStr star: AnyStr openrange: AnyStr caret: AnyStr close_bracket: AnyStr close_bracket_in_range: re.Pattern[AnyStr] hyphen: AnyStr def encode(self: ParserStrs[str]) -> ParserStrs[bytes]: return ParserStrs( **{name: self.encode_field(value) for name, value in asdict(self).items()} ) @staticmethod def encode_field(value: Any) -> Any: if isinstance(value, str): return value.encode("us-ascii") elif isinstance(value, re.Pattern): return re.compile( value.pattern.encode("us-ascii"), flags=value.flags & ~re.U ) elif isinstance(value, dict): return { k.encode("us-ascii"): v.encode("us-ascii") for k, v in value.items() } else: raise TypeError(value) # pragma: no cover PARSER_STRS = ParserStrs( posix_classes={ "alpha": r"A-Za-z", "alnum": r"A-Za-z0-9", "blank": r" \t", "cntrl": r"\0-\x1F\x7F", "digit": r"0-9", "graph": r"!-\~", "lower": r"a-z", "print": r" -\~", "punct": r"!-/:-@[-`{-\~", "space": r"\t\n\r ", "upper": r"A-Z", "xdigit": r"0-9A-Fa-f", }, parser=re.compile( r""" (?P<slash_globstar>/\*\*\Z) |(?P<slash_globstar_slash>/\*\*(/\*\*)*/) |(?P<globstar_slash>\*\*/(\*\*/)*) |(?P<qm>\?) |(?P<star>\*\*?) |(?P<openrange>\[) |(?P<char>\x5C[^\0]|[^\0\x5C]) """, flags=re.X, ), range_parser=re.compile( r""" (?P<left>\x5C[^\0]|[^\0\x5C])-(?P<right>\x5C[^\0]|[^\0\x5C\x5D]) |\[:(?P<posix_class>[^\]]*):\] |(?P<char>\x5C[^\0]|[^\0\x5C\x5D]) |(?P<end>\]) """, flags=re.X, ), octothorpe="#", bang="!", slash="/", start=r"(?a:", istart=r"(?ai:", end=r")", leading_globstar_slash=re.compile(r"\*\*/(?:\*\*/)*"), is_anchored=re.compile(r"^/|/."), unanchored_start=r"(?:[^/\0]+/)*", slash_globstar=r"(?:(?:/[^/\0]+)+/?|/)", slash_globstar_slash=r"/(?:[^/\0]+/)*", globstar_slash=r"(?:[^/\0]*/)?(?:[^/\0]+/)*", qm=r"[^/\0]", star=r"[^/\0]*", openrange=r"(?![/\0])[", caret="^", close_bracket="]", close_bracket_in_range=re.compile(r"\](?!-[^\]])"), hyphen="-", ) PARSER_BYTES = PARSER_STRS.encode()
[docs]def pattern2regex(pattern: AnyStr, ignorecase: bool = False) -> Optional[Regex[AnyStr]]: """ Convert a gitignore pattern to a regular expression and return a `Regex` object. If the pattern is empty or a comment, returns `None`. :param pattern: a gitignore pattern :param bool ignorecase: Whether the pattern should match case-insensitively :raises InvalidPatternError: If the given pattern is invalid """ strs: ParserStrs if isinstance(pattern, str): strs = PARSER_STRS else: strs = PARSER_BYTES orig = pattern pattern = source = trim_trailing_spaces(pattern) if pattern.startswith(strs.octothorpe): return None if pattern.startswith(strs.bang): negative = True pattern = pattern[1:] if not pattern: return None else: negative = False if pattern.endswith(strs.slash): dir_only = True pattern = pattern[:-1] else: dir_only = False if not pattern: return None pos = 0 regex = strs.istart if ignorecase else strs.start m = strs.leading_globstar_slash.match(pattern) if m or not strs.is_anchored.search(pattern): regex += strs.unanchored_start if m: pos += m.end() if not m and pattern.startswith(strs.slash): pos += 1 while pos < len(pattern): m = strs.parser.match(pattern, pos) if not m: raise InvalidPatternError(orig) pos += m.end() - m.start() if m["slash_globstar"] is not None: regex += strs.slash_globstar elif m["slash_globstar_slash"] is not None: regex += strs.slash_globstar_slash elif m["globstar_slash"] is not None: regex += strs.globstar_slash elif m["qm"] is not None: regex += strs.qm elif m["star"] is not None: regex += strs.star elif m["openrange"] is not None: regex += strs.openrange if pattern[pos : pos + 1] in (strs.caret, strs.bang): regex += strs.caret pos += 1 if strs.close_bracket_in_range.match(pattern, pos=pos): regex += strs.close_bracket pos += 1 while True: m = strs.range_parser.match(pattern, pos) if not m: raise InvalidPatternError(orig) pos += m.end() - m.start() if m["left"] is not None: lchar = m["left"][-1:] rchar = m["right"][-1:] if ord(lchar) > ord(rchar): raise InvalidPatternError(orig) regex += re.escape(lchar) + strs.hyphen + re.escape(rchar) elif m["posix_class"] is not None: try: regex += strs.posix_classes[m["posix_class"]] except KeyError: raise InvalidPatternError(orig) elif m["char"] is not None: regex += re.escape(m["char"][-1:]) elif m["end"] is not None: regex += strs.close_bracket break else: raise AssertionError( "Unhandled pattern structure" ) # pragma: no cover elif m["char"] is not None: regex += re.escape(m["char"][-1:]) else: raise AssertionError("Unhandled pattern structure") # pragma: no cover regex += strs.end return Regex( pattern=source, regex=regex, negative=negative, dir_only=dir_only, ignorecase=ignorecase, )
[docs]class InvalidPathError(ValueError): """Raised by `Gitignore.match()` when given an invalid path""" def __init__( self, msg: str, path: str | bytes | os.PathLike[str] | os.PathLike[bytes] ) -> None: #: A description of the problem with the path self.msg = msg #: The invalid path self.path = path def __str__(self) -> str: return f"{self.msg}: {self.path!r}"
[docs]class InvalidPatternError(ValueError): """Raised by `pattern2regex()` when given an invalid pattern""" def __init__(self, pattern: str | bytes) -> None: #: The invalid pattern self.pattern = pattern def __str__(self) -> str: return f"Invalid gitignore pattern: {self.pattern!r}"
def pathway(path: AnyStr) -> list[AnyStr]: """ Return a list of parent paths of ``path`` (not including the root) plus ``path`` itself """ pway: list[AnyStr] = [] while path: pway.append(path) path = posixpath.dirname(path) pway.reverse() return pway TRIM_RGX = r"(?<!\\)(?P<keep>(?:\\\\)*(\\[ \t])?)[ \t]*\Z" TRIM_STR = re.compile(TRIM_RGX) TRIM_BYTES = re.compile(TRIM_RGX.encode("us-ascii")) def trim_trailing_spaces(s: AnyStr) -> AnyStr: """Remove trailing unescaped space and tab characters from ``s``""" if isinstance(s, str): rgx = TRIM_STR keep = r"\g<keep>" else: rgx = TRIM_BYTES keep = rb"\g<keep>" return rgx.sub(keep, s) def chomp(s: AnyStr) -> AnyStr: """Remove trailing newline, if any""" if s and ord(s[-1:]) == 10: s = s[:-1] if s and ord(s[-1:]) == 13: s = s[:-1] return s