Source code for gitmatch

"""
Gitignore-style path matching

``gitmatch`` provides ``gitignore``-style pattern matching of file paths.
Simply pass in a sequence of ``gitignore`` patterns and you'll get back an
object for testing whether a given relative path matches the patterns.

Visit <https://github.com/jwodder/gitmatch> or <https://gitmatch.rtfd.io> for
more information.
"""

from __future__ import annotations
from collections.abc import Iterable
from dataclasses import asdict, dataclass
import os
import os.path
from pathlib import PurePosixPath, PureWindowsPath
import posixpath
import re
import sys
from typing import Any, AnyStr, Generic

# Package metadata
__version__ = "0.3.0"
__author__ = "John Thorvald Wodder II"
__author_email__ = "gitmatch@varonathe.org"
__license__ = "MIT"
__url__ = "https://github.com/jwodder/gitmatch"

# Public names of this module
__all__ = [
    "Gitignore",
    "InvalidPathError",
    "InvalidPatternError",
    "Match",
    "Pattern",
    "Regex",
    "compile",
    "pattern2regex",
]

# True when ``os.name`` reports the Windows ("nt") platform
ON_WINDOWS = os.name == "nt"


@dataclass
class Gitignore(Generic[AnyStr]):
    """A collection of compiled gitignore patterns"""

    #: :meta private:
    patterns: list[Pattern[AnyStr]]

    def match(
        self, path: AnyStr | os.PathLike[AnyStr], is_dir: bool = False
    ) -> Match[AnyStr] | None:
        """
        Test whether the given relative path matches the collection of
        patterns.  If ``is_dir`` is true or if ``path`` ends in a slash,
        ``path`` is treated as a path to a directory; otherwise, it is treated
        as a path to a file.

        If on Windows and ``path`` is not an instance of
        `pathlib.PurePosixPath`, or if on any OS and ``path`` is an instance of
        `pathlib.PureWindowsPath`, any backslashes in ``path`` will be
        converted to forward slashes before matching.

        If a match is found, a `Match` object is returned containing
        information about the matching pattern and the path or portion thereof
        that matched.  The `Match` object may be either "truthy" or "falsy"
        depending on whether the matching pattern is negative or not.  If none
        of the patterns match the path, `match()` returns `None`.  Hence, if
        you're just interested in whether the patterns say the path should be
        gitignored, call `bool()` on the result or use it in a boolean context
        like an ``if ... :`` line.

        :raises InvalidPathError:
            If ``path`` is empty, is absolute, has an anchor (Windows paths
            only), is not normalized (aside from an optional trailing slash),
            contains a NUL character, or starts with ``..``
        """
        orig = path
        path = os.fspath(path)
        # Pick str or bytes constants to agree with the type of the path.
        if isinstance(path, str):
            NUL = "\0"
            SLASH = "/"
            SEP = os.sep
            WINSEP = "\\"
            CURDIR = "."
            PARDIR = ".."
        else:
            NUL = b"\0"
            SLASH = b"/"
            SEP = os.sep.encode("us-ascii")
            WINSEP = b"\\"
            CURDIR = b"."
            PARDIR = b".."
        if not path:
            raise InvalidPathError("Empty path", orig)
        if NUL in path:
            raise InvalidPathError("Path contains NUL byte", orig)
        # Check the *original* object rather than the fspath()'d string:
        # is_complex_path() inspects the anchor of PureWindowsPath instances,
        # which is the only way an anchored Windows path gets rejected when
        # running on a non-Windows OS.
        if is_complex_path(orig):
            raise InvalidPathError("Path is not relative", orig)
        if SEP != SLASH and not isinstance(orig, PurePosixPath):
            path = path.replace(SEP, SLASH)
        elif isinstance(orig, PureWindowsPath):
            path = path.replace(WINSEP, SLASH)
        if path.endswith(SLASH):
            # A trailing slash marks the path as a directory
            is_dir = True
            path = path[:-1]
        if posixpath.normpath(path) != path:
            raise InvalidPathError("Path is not normalized", orig)
        if path.split(SLASH)[0] == PARDIR:
            raise InvalidPathError("Path cannot begin with '..'", orig)
        if path == CURDIR:
            return None
        # Test each leading portion of the path, shortest first.  Every
        # portion other than the full path is necessarily a directory.
        for p in pathway(path):
            # Later patterns take precedence, so scan them in reverse.
            for pat in reversed(self.patterns):
                if pat.match(p, is_dir=(is_dir if p == path else True)):
                    if not pat.negative:
                        return Match(pat, p)
                    elif p == path:
                        return Match(pat, p)
                    else:
                        # A negative pattern decided this parent path; stop
                        # scanning patterns and move on to the next portion.
                        break
        return None
@dataclass
class Match(Generic[AnyStr]):
    """
    The result of a path successfully matching a pattern.

    In a boolean context, a `Match` is true when the matching pattern is not
    negative and false when it is negative.
    """

    #: The compiled `Pattern` object that matched the path
    pattern_obj: Pattern[AnyStr]

    #: The path that matched.  This may be a parent path of the value passed to
    #: `~Gitignore.match()`.
    path: AnyStr

    @property
    def pattern(self) -> AnyStr:
        """
        The original gitignore pattern provided to `compile()`, with trailing
        spaces stripped
        """
        matched = self.pattern_obj
        return matched.pattern

    def __bool__(self) -> bool:
        # A match against a negative pattern is falsy.
        if self.pattern_obj.negative:
            return False
        return True
@dataclass
class Pattern(Generic[AnyStr]):
    """A gitignore pattern together with its compiled regular expression"""

    #: The original gitignore pattern provided to `compile()`, with trailing
    #: spaces stripped
    pattern: AnyStr

    #: A compiled regular expression pattern
    regex: re.Pattern[AnyStr]

    #: Whether the pattern is negative or not
    negative: bool

    #: Whether the pattern only matches directories
    dir_only: bool

    #: Whether the pattern is case-insensitive
    ignorecase: bool

    def match(self, path: AnyStr, is_dir: bool = False) -> bool:
        """
        Report whether ``path`` is matched by this pattern.

        ``path`` is assumed to be a relative, normalized, ``/``-separated
        path.  A true ``is_dir`` means the path refers to a directory; a false
        one means it refers to a file.

        Only ``path`` itself is tested; unlike `Gitignore.match()`, parent
        paths are not considered.
        """
        if is_dir or not self.dir_only:
            # The whole path must be consumed by the regex.
            return self.regex.fullmatch(path) is not None
        # Directory-only patterns never match files.
        return False
@dataclass
class Regex(Generic[AnyStr]):
    """
    A gitignore pattern translated to regular expression source, not yet
    compiled
    """

    #: The original gitignore pattern provided to `compile()`, with trailing
    #: spaces stripped
    pattern: AnyStr

    #: The regular expression equivalent of the pattern
    regex: AnyStr

    #: Whether the pattern is negative or not
    negative: bool

    #: Whether the pattern only matches directories
    dir_only: bool

    #: Whether the pattern is case-insensitive
    ignorecase: bool

    def compile(self) -> Pattern[AnyStr]:  # noqa: A003
        """
        Compile the regular expression source and return the resulting
        `Pattern`, carrying over all other fields unchanged
        """
        compiled = re.compile(self.regex)
        return Pattern(
            pattern=self.pattern,
            regex=compiled,
            negative=self.negative,
            dir_only=self.dir_only,
            ignorecase=self.ignorecase,
        )
def compile(  # noqa: A001
    patterns: Iterable[AnyStr], ignorecase: bool = False
) -> Gitignore[AnyStr]:
    """
    Build a `Gitignore` instance from a collection of gitignore patterns.
    Patterns that are invalid or empty are silently skipped.

    A trailing newline on each pattern is removed before it is compiled, so an
    existing :file:`.gitignore` file can be compiled directly from its file
    object:

    .. code:: python

        with open("path/to/.gitignore") as fp:
            gi = gitmatch.compile(fp)

    :param patterns: an iterable of gitignore patterns
    :param bool ignorecase:
        Whether the patterns should match case-insensitively
    """
    accepted: list[Pattern[AnyStr]] = []
    for line in patterns:
        try:
            rgx = pattern2regex(chomp(line), ignorecase=ignorecase)
        except InvalidPatternError:
            # Invalid patterns are discarded rather than reported.
            rgx = None
        if rgx is not None:
            accepted.append(rgx.compile())
    return Gitignore(accepted)
@dataclass
class ParserStrs(Generic[AnyStr]):
    """
    The constants (all `str` or all `bytes`) that `pattern2regex()` uses for
    parsing patterns and emitting regular expression fragments
    """

    posix_classes: dict[AnyStr, AnyStr]
    parser: re.Pattern[AnyStr]
    range_parser: re.Pattern[AnyStr]
    octothorpe: AnyStr
    bang: AnyStr
    slash: AnyStr
    start: AnyStr
    istart: AnyStr
    end: AnyStr
    leading_globstar_slash: re.Pattern[AnyStr]
    is_anchored: re.Pattern[AnyStr]
    unanchored_start: AnyStr
    slash_globstar: AnyStr
    slash_globstar_slash: AnyStr
    qm: AnyStr
    star: AnyStr
    openrange: AnyStr
    caret: AnyStr
    close_bracket: AnyStr
    close_bracket_in_range: re.Pattern[AnyStr]
    hyphen: AnyStr

    def encode(self: ParserStrs[str]) -> ParserStrs[bytes]:
        """Return a copy of this instance with every field converted to bytes"""
        converted = {}
        for name, value in asdict(self).items():
            converted[name] = self.encode_field(value)
        return ParserStrs(**converted)

    @staticmethod
    def encode_field(value: Any) -> Any:
        """
        Convert a single field value (a `str`, a compiled `str` regex, or a
        `dict` of `str` to `str`) to its bytes counterpart
        """
        if isinstance(value, dict):
            return {
                k.encode("us-ascii"): v.encode("us-ascii") for k, v in value.items()
            }
        if isinstance(value, re.Pattern):
            # Bytes patterns cannot carry the UNICODE flag, so mask it out.
            bytes_source = value.pattern.encode("us-ascii")
            return re.compile(bytes_source, flags=value.flags & ~re.U)
        if isinstance(value, str):
            return value.encode("us-ascii")
        raise TypeError(value)  # pragma: no cover


# `str` flavor of the parsing constants
PARSER_STRS = ParserStrs(
    # Character-class contents substituted for ``[:name:]`` inside brackets
    posix_classes={
        "alpha": r"A-Za-z",
        "alnum": r"A-Za-z0-9",
        "blank": r" \t",
        "cntrl": r"\0-\x1F\x7F",
        "digit": r"0-9",
        "graph": r"!-\~",
        "lower": r"a-z",
        "print": r" -\~",
        "punct": r"!-/:-@[-`{-\~",
        "space": r"\t\n\r ",
        "upper": r"A-Z",
        "xdigit": r"0-9A-Fa-f",
    },
    # Tokenizer for the main body of a pattern
    parser=re.compile(
        r"""
            (?P<slash_globstar>/\*\*\Z)
            |(?P<slash_globstar_slash>/\*\*(/\*\*)*/)
            |(?P<qm>\?)
            |(?P<star>\*\*?)
            |(?P<openrange>\[)
            |(?P<char>\x5C[^\0]|[^\0\x5C])
        """,
        flags=re.X,
    ),
    # Tokenizer for the inside of a bracket expression
    range_parser=re.compile(
        r"""
            (?P<left>\x5C[^\0]|[^\0\x5C])-(?P<right>\x5C[^\0]|[^\0\x5C\x5D])
            |\[:(?P<posix_class>[^\]]*):\]
            |(?P<char>\x5C[^\0]|[^\0\x5C\x5D])
            |(?P<end>\])
        """,
        flags=re.X,
    ),
    octothorpe="#",
    bang="!",
    slash="/",
    start=r"(?a:",
    istart=r"(?ai:",
    end=r")",
    leading_globstar_slash=re.compile(r"\*\*/(?:\*\*/)*"),
    is_anchored=re.compile(r"^/|/."),
    unanchored_start=r"(?:[^/\0]+/)*",
    slash_globstar=r"(?:(?:/[^/\0]+)+/?|/)",
    slash_globstar_slash=r"/(?:[^/\0]+/)*",
    qm=r"[^/\0]",
    star=r"[^/\0]*",
    openrange=r"(?![/\0])[",
    caret="^",
    close_bracket="]",
    close_bracket_in_range=re.compile(r"\](?!-[^\]])"),
    hyphen="-",
)

# `bytes` flavor of the parsing constants, derived from the `str` flavor
PARSER_BYTES = PARSER_STRS.encode()
def _range2regex(
    strs: ParserStrs, pattern: AnyStr, pos: int, orig: AnyStr
) -> tuple[AnyStr, int]:
    """
    Translate the bracket expression in ``pattern`` whose opening ``[`` ends
    just before index ``pos``.  Returns the regular expression text for the
    whole bracket expression and the index just past its closing ``]``.

    :raises InvalidPatternError:
        If the bracket expression is unterminated or otherwise malformed;
        ``orig`` is the pattern reported in the error
    """
    out = strs.openrange
    # A leading "^" or "!" negates the set.
    if pattern[pos : pos + 1] in (strs.caret, strs.bang):
        out += strs.caret
        pos += 1
    # A "]" in first position is a literal member, not the terminator.
    if strs.close_bracket_in_range.match(pattern, pos=pos):
        out += strs.close_bracket
        pos += 1
    while True:
        item = strs.range_parser.match(pattern, pos)
        if item is None:
            raise InvalidPatternError(orig)
        pos = item.end()
        if item["left"] is not None:
            # For an escaped endpoint, only the character after the backslash
            # counts.
            low = item["left"][-1:]
            high = item["right"][-1:]
            if ord(low) > ord(high):
                raise InvalidPatternError(orig)
            out += re.escape(low) + strs.hyphen + re.escape(high)
        elif item["posix_class"] is not None:
            try:
                out += strs.posix_classes[item["posix_class"]]
            except KeyError:
                raise InvalidPatternError(orig)
        elif item["char"] is not None:
            out += re.escape(item["char"][-1:])
        elif item["end"] is not None:
            return out + strs.close_bracket, pos
        else:
            raise AssertionError(
                "Unhandled pattern structure"
            )  # pragma: no cover


def pattern2regex(pattern: AnyStr, ignorecase: bool = False) -> Regex[AnyStr] | None:
    """
    Translate a gitignore pattern into a regular expression, returned as a
    `Regex` object.  `None` is returned for an empty pattern or a comment.

    :param pattern: a gitignore pattern
    :param bool ignorecase:
        Whether the pattern should match case-insensitively
    :raises InvalidPatternError: If the given pattern is invalid
    """
    strs: ParserStrs
    strs = PARSER_STRS if isinstance(pattern, str) else PARSER_BYTES
    orig = pattern
    source = trim_trailing_spaces(pattern)
    body = source
    if body.startswith(strs.octothorpe):
        return None
    # A leading "!" makes the pattern negative.
    negative = body.startswith(strs.bang)
    if negative:
        body = body[1:]
    # A trailing "/" restricts the pattern to directories.
    dir_only = body.endswith(strs.slash)
    if dir_only:
        body = body[:-1]
    if not body:
        return None
    regex = strs.istart if ignorecase else strs.start
    lead = strs.leading_globstar_slash.match(body)
    # Patterns with leading "**/" or with no slash at the start or in the
    # middle may match at any directory depth.
    if lead is not None or strs.is_anchored.search(body) is None:
        regex += strs.unanchored_start
    if lead is not None:
        pos = lead.end()
    elif body.startswith(strs.slash):
        pos = 1
    else:
        pos = 0
    # Tokens that translate to one fixed regex fragment
    fixed_tokens = (
        ("slash_globstar", strs.slash_globstar),
        ("slash_globstar_slash", strs.slash_globstar_slash),
        ("qm", strs.qm),
        ("star", strs.star),
    )
    total = len(body)
    while pos < total:
        tok = strs.parser.match(body, pos)
        if tok is None:
            raise InvalidPatternError(orig)
        pos = tok.end()
        for name, fragment in fixed_tokens:
            if tok[name] is not None:
                regex += fragment
                break
        else:
            if tok["openrange"] is not None:
                bracket, pos = _range2regex(strs, body, pos, orig)
                regex += bracket
            elif tok["char"] is not None:
                regex += re.escape(tok["char"][-1:])
            else:
                raise AssertionError(
                    "Unhandled pattern structure"
                )  # pragma: no cover
    regex += strs.end
    return Regex(
        pattern=source,
        regex=regex,
        negative=negative,
        dir_only=dir_only,
        ignorecase=ignorecase,
    )
class InvalidPathError(ValueError):
    """Error raised by `Gitignore.match()` for a path it cannot accept"""

    def __init__(
        self, msg: str, path: str | bytes | os.PathLike[str] | os.PathLike[bytes]
    ) -> None:
        super().__init__(msg, path)
        #: A description of the problem with the path
        self.msg = msg
        #: The invalid path
        self.path = path

    def __str__(self) -> str:
        rendered = f"{self.msg}: {self.path!r}"
        return rendered
class InvalidPatternError(ValueError):
    """Error raised by `pattern2regex()` for a pattern it cannot translate"""

    def __init__(self, pattern: str | bytes) -> None:
        super().__init__(pattern)
        #: The invalid pattern
        self.pattern = pattern

    def __str__(self) -> str:
        rendered = f"Invalid gitignore pattern: {self.pattern!r}"
        return rendered
def pathway(path: AnyStr) -> list[AnyStr]:
    """
    Return a list of parent paths of ``path`` (not including the root) plus
    ``path`` itself, ordered from shortest to longest.

    ``path`` is split with `posixpath.dirname()`, so it is expected to be
    ``/``-separated.  An empty path yields an empty list.
    """
    pway: list[AnyStr] = []
    while path:
        parent = posixpath.dirname(path)
        if parent == path:
            # dirname() of a root ("/" or "//") is the root itself; without
            # this check an absolute path would loop forever.
            break
        pway.append(path)
        path = parent
    pway.reverse()
    return pway


# Matches any run of trailing spaces/tabs, capturing in ``keep`` the preceding
# backslash sequence (pairs of backslashes, optionally followed by one
# backslash-escaped space or tab) that has to be retained.
TRIM_RGX = r"(?<!\\)(?P<keep>(?:\\\\)*(\\[ \t])?)[ \t]*\Z"
TRIM_STR = re.compile(TRIM_RGX)
TRIM_BYTES = re.compile(TRIM_RGX.encode("us-ascii"))


def trim_trailing_spaces(s: AnyStr) -> AnyStr:
    """Remove trailing unescaped space and tab characters from ``s``"""
    if isinstance(s, str):
        rgx = TRIM_STR
        keep = r"\g<keep>"
    else:
        rgx = TRIM_BYTES
        keep = rb"\g<keep>"
    return rgx.sub(keep, s)


def chomp(s: AnyStr) -> AnyStr:
    """
    Remove trailing newline, if any.

    A final LF is removed if present, and then a final CR is removed if
    present, so ``"\\n"``, ``"\\r\\n"`` and a lone ``"\\r"`` are all stripped.
    """
    # ord() of a one-item slice works for both str and bytes.
    if s and ord(s[-1:]) == 10:
        s = s[:-1]
    if s and ord(s[-1:]) == 13:
        s = s[:-1]
    return s


def is_complex_path(path: AnyStr | os.PathLike[AnyStr]) -> bool:
    """
    Returns true if `path` is absolute or (Windows only) contains any parts
    other than a directory and/or file path

    When not running on Windows, a `pathlib.PureWindowsPath` instance is
    additionally considered complex if it has a non-empty anchor.
    """
    if os.path.isabs(path):
        return True
    elif ON_WINDOWS:
        if sys.version_info[:2] >= (3, 13):
            # isabs() changed in Python 3.13 to not regard paths starting with
            # just one (back)slash as absolute on Windows
            p = os.fsencode(path)
            if re.match(rb"/(?!/)|\\(?!\\)", p):
                # <https://github.com/python/mypy/issues/18210>
                return True  # type: ignore[no-any-return]
        return bool(os.path.splitdrive(path)[0])
    elif isinstance(path, PureWindowsPath):
        return bool(path.anchor)
    else:
        return False