"""
Gitignore-style path matching
``gitmatch`` provides ``gitignore``-style pattern matching of file paths.
Simply pass in a sequence of ``gitignore`` patterns and you'll get back an
object for testing whether a given relative path matches the patterns.
Visit <https://github.com/jwodder/gitmatch> or <https://gitmatch.rtfd.io> for
more information.
"""
from __future__ import annotations
from collections.abc import Iterable
from dataclasses import asdict, dataclass
import os
import os.path
from pathlib import PurePosixPath, PureWindowsPath
import posixpath
import re
import sys
from typing import Any, AnyStr, Generic
# Package metadata
__version__ = "0.3.0"
__author__ = "John Thorvald Wodder II"
__author_email__ = "gitmatch@varonathe.org"
__license__ = "MIT"
__url__ = "https://github.com/jwodder/gitmatch"
# Public names exported by a star-import of this module
__all__ = [
"Gitignore",
"InvalidPathError",
"InvalidPatternError",
"Match",
"Pattern",
"Regex",
"compile",
"pattern2regex",
]
# True when `os.name` reports "nt", i.e. when running on Windows
ON_WINDOWS = os.name == "nt"
@dataclass
class Gitignore(Generic[AnyStr]):
    """A collection of compiled gitignore patterns"""

    #: :meta private:
    patterns: list[Pattern[AnyStr]]

    def match(
        self, path: AnyStr | os.PathLike[AnyStr], is_dir: bool = False
    ) -> Match[AnyStr] | None:
        """
        Test whether the given relative path matches the collection of
        patterns.  If ``is_dir`` is true or if ``path`` ends in a slash,
        ``path`` is treated as a path to a directory; otherwise, it is treated
        as a path to a file.

        If on Windows and ``path`` is not an instance of
        `pathlib.PurePosixPath`, or if on any OS and ``path`` is an instance of
        `pathlib.PureWindowsPath`, any backslashes in ``path`` will be
        converted to forward slashes before matching.

        If a match is found, a `Match` object is returned containing
        information about the matching pattern and the path or portion thereof
        that matched.  The `Match` object may be either "truthy" or "falsy"
        depending on whether the matching pattern is negative or not.  If none
        of the patterns match the path, `match()` returns `None`.  Hence, if
        you're just interested in whether the patterns say the path should be
        gitignored, call `bool()` on the result or use it in a boolean context
        like an ``if ... :`` line.

        :raises InvalidPathError:
            If ``path`` is empty, is absolute, has an anchor (Windows paths
            only), is not normalized (aside from an optional trailing slash),
            contains a NUL character, or starts with ``..``
        """
        orig = path
        path = os.fspath(path)
        # Pick str or bytes constants to match the type of the path
        if isinstance(path, str):
            NUL = "\0"
            SLASH = "/"
            SEP = os.sep
            WINSEP = "\\"
            CURDIR = "."
            PARDIR = ".."
        else:
            NUL = b"\0"
            SLASH = b"/"
            SEP = os.sep.encode("us-ascii")
            WINSEP = b"\\"
            CURDIR = b"."
            PARDIR = b".."
        if not path:
            raise InvalidPathError("Empty path", orig)
        if NUL in path:
            raise InvalidPathError("Path contains NUL byte", orig)
        # Check the caller's original object rather than the fspath()'d
        # string: is_complex_path() needs to see a PureWindowsPath instance in
        # order to reject anchored Windows paths on non-Windows hosts.
        if is_complex_path(orig):
            raise InvalidPathError("Path is not relative", orig)
        if SEP != SLASH and not isinstance(orig, PurePosixPath):
            path = path.replace(SEP, SLASH)
        elif isinstance(orig, PureWindowsPath):
            path = path.replace(WINSEP, SLASH)
        if path.endswith(SLASH):
            # A trailing slash marks the path as a directory
            is_dir = True
            path = path[:-1]
        if posixpath.normpath(path) != path:
            raise InvalidPathError("Path is not normalized", orig)
        if path.split(SLASH)[0] == PARDIR:
            raise InvalidPathError("Path cannot begin with '..'", orig)
        if path == CURDIR:
            return None
        # Test each parent path (outermost first) and finally the path itself.
        # Patterns are tried last-to-first, so a later pattern takes
        # precedence over an earlier one.
        for p in pathway(path):
            for pat in reversed(self.patterns):
                # Every proper parent of ``path`` is necessarily a directory
                if pat.match(p, is_dir=(is_dir if p == path else True)):
                    if not pat.negative:
                        return Match(pat, p)
                    elif p == path:
                        return Match(pat, p)
                    else:
                        # A negative match on a parent path only ends the
                        # search at this level; move on to the next path
                        break
        return None
@dataclass
class Match(Generic[AnyStr]):
    """
    The outcome of a path successfully matching a pattern.  In a boolean
    context, a `Match` is true when the matching pattern is not negative and
    false when it is negative.
    """

    #: The compiled `Pattern` object that the path matched
    pattern_obj: Pattern[AnyStr]

    #: The path that matched.  This may be a parent path of the value passed to
    #: `~Gitignore.match()`.
    path: AnyStr

    @property
    def pattern(self) -> AnyStr:
        """
        The text of the matching gitignore pattern, as stored on the
        underlying `Pattern` object
        """
        compiled = self.pattern_obj
        return compiled.pattern

    def __bool__(self) -> bool:
        is_negative = self.pattern_obj.negative
        return not is_negative
@dataclass
class Pattern(Generic[AnyStr]):
    """A single gitignore pattern in compiled form"""

    #: The original gitignore pattern provided to `compile()`, with trailing
    #: spaces stripped
    pattern: AnyStr

    #: The compiled regular expression that paths are tested against
    regex: re.Pattern[AnyStr]

    #: Whether the pattern is negative or not
    negative: bool

    #: Whether the pattern only matches directories
    dir_only: bool

    #: Whether the pattern is case-insensitive
    ignorecase: bool

    def match(self, path: AnyStr, is_dir: bool = False) -> bool:
        """
        Report whether ``path`` matches this pattern.  ``path`` is assumed to
        be a relative, normalized, ``/``-separated path.  ``is_dir`` states
        whether the path refers to a directory (true) or a file (false).

        Only ``path`` itself is tested; unlike `Gitignore.match()`, none of
        its parent paths are considered.
        """
        if is_dir or not self.dir_only:
            # The regex must cover the entire path, not just a prefix
            return self.regex.fullmatch(path) is not None
        # Directory-only patterns never match files
        return False
@dataclass
class Regex(Generic[AnyStr]):
    """A gitignore pattern translated to regular expression source text"""

    #: The original gitignore pattern provided to `compile()`, with trailing
    #: spaces stripped
    pattern: AnyStr

    #: The regular expression equivalent of the pattern
    regex: AnyStr

    #: Whether the pattern is negative or not
    negative: bool

    #: Whether the pattern only matches directories
    dir_only: bool

    #: Whether the pattern is case-insensitive
    ignorecase: bool

    def compile(self) -> Pattern[AnyStr]:  # noqa: A003
        """
        Compile the regular expression source and return a `Pattern` carrying
        the same metadata as this object
        """
        compiled = re.compile(self.regex)
        return Pattern(
            pattern=self.pattern,
            regex=compiled,
            negative=self.negative,
            dir_only=self.dir_only,
            ignorecase=self.ignorecase,
        )
def compile(  # noqa: A001
    patterns: Iterable[AnyStr], ignorecase: bool = False
) -> Gitignore[AnyStr]:
    """
    Build a `Gitignore` instance from a collection of gitignore patterns.
    Patterns that are invalid, empty, or comments are silently skipped.

    A trailing newline on a pattern is removed before it is compiled, so an
    existing :file:`.gitignore` file can be compiled directly:

    .. code:: python

        with open("path/to/.gitignore") as fp:
            gi = gitmatch.compile(fp)

    :param patterns: an iterable of gitignore patterns
    :param bool ignorecase:
        Whether the patterns should match case-insensitively
    """
    compiled_patterns: list[Pattern[AnyStr]] = []
    for line in patterns:
        try:
            rgx = pattern2regex(chomp(line), ignorecase=ignorecase)
        except InvalidPatternError:
            # Invalid patterns are dropped rather than reported
            rgx = None
        if rgx is not None:
            compiled_patterns.append(rgx.compile())
    return Gitignore(compiled_patterns)
@dataclass
class ParserStrs(Generic[AnyStr]):
    """
    The string constants and compiled regexes that `pattern2regex()` needs,
    all of a single type (either `str` or `bytes`)
    """

    posix_classes: dict[AnyStr, AnyStr]
    parser: re.Pattern[AnyStr]
    range_parser: re.Pattern[AnyStr]
    octothorpe: AnyStr
    bang: AnyStr
    slash: AnyStr
    start: AnyStr
    istart: AnyStr
    end: AnyStr
    leading_globstar_slash: re.Pattern[AnyStr]
    is_anchored: re.Pattern[AnyStr]
    unanchored_start: AnyStr
    slash_globstar: AnyStr
    slash_globstar_slash: AnyStr
    qm: AnyStr
    star: AnyStr
    openrange: AnyStr
    caret: AnyStr
    close_bracket: AnyStr
    close_bracket_in_range: re.Pattern[AnyStr]
    hyphen: AnyStr

    def encode(self: ParserStrs[str]) -> ParserStrs[bytes]:
        """Return a `bytes` copy of this collection of `str` constants"""
        encoded = {}
        for name, value in asdict(self).items():
            encoded[name] = self.encode_field(value)
        return ParserStrs(**encoded)

    @staticmethod
    def encode_field(value: Any) -> Any:
        """
        Convert one field value (a `str`, a compiled `str` regex, or a
        `dict` of `str` to `str`) to its ASCII-encoded `bytes` equivalent
        """
        if isinstance(value, str):
            return value.encode("us-ascii")
        if isinstance(value, re.Pattern):
            bytes_source = value.pattern.encode("us-ascii")
            # The UNICODE flag of a compiled str pattern is not accepted when
            # compiling a bytes pattern, so mask it out
            return re.compile(bytes_source, flags=value.flags & ~re.U)
        if isinstance(value, dict):
            return {
                k.encode("us-ascii"): v.encode("us-ascii") for k, v in value.items()
            }
        raise TypeError(value)  # pragma: no cover
# The `str` constants and compiled regexes consulted by `pattern2regex()` when
# it is given a `str` pattern
PARSER_STRS = ParserStrs(
# Character-class contents emitted for each POSIX class name written as
# `[:name:]` inside a bracket expression
posix_classes={
"alpha": r"A-Za-z",
"alnum": r"A-Za-z0-9",
"blank": r" \t",
"cntrl": r"\0-\x1F\x7F",
"digit": r"0-9",
"graph": r"!-\~",
"lower": r"a-z",
"print": r" -\~",
"punct": r"!-/:-@[-`{-\~",
"space": r"\t\n\r ",
"upper": r"A-Z",
"xdigit": r"0-9A-Fa-f",
},
# Tokenizer for pattern text outside of bracket expressions.  Alternatives,
# in order: a `/**` at the very end of the pattern; one or more `/**`
# followed by `/`; `?`; `*` or `**`; an opening `[`; and a single non-NUL
# character, optionally preceded by a backslash (`\x5C`).
parser=re.compile(
r"""
(?P<slash_globstar>/\*\*\Z)
|(?P<slash_globstar_slash>/\*\*(/\*\*)*/)
|(?P<qm>\?)
|(?P<star>\*\*?)
|(?P<openrange>\[)
|(?P<char>\x5C[^\0]|[^\0\x5C])
""",
flags=re.X,
),
# Tokenizer for the inside of a bracket expression.  Alternatives, in
# order: an `x-y` range; a `[:name:]` POSIX class; a single character
# (optionally backslash-escaped); and the closing `]` (`\x5D`).
range_parser=re.compile(
r"""
(?P<left>\x5C[^\0]|[^\0\x5C])-(?P<right>\x5C[^\0]|[^\0\x5C\x5D])
|\[:(?P<posix_class>[^\]]*):\]
|(?P<char>\x5C[^\0]|[^\0\x5C\x5D])
|(?P<end>\])
""",
flags=re.X,
),
# Pattern-syntax characters: comment marker, negation marker, and path
# separator
octothorpe="#",
bang="!",
slash="/",
# Opening of the inline-flag group that wraps the whole generated regex:
# ASCII-only (`a`), plus case-insensitive (`i`) for `istart`; `end` closes it
start=r"(?a:",
istart=r"(?ai:",
end=r")",
# Matches one or more leading `**/`
leading_globstar_slash=re.compile(r"\*\*/(?:\*\*/)*"),
# Matches a leading slash or a slash followed by another character;
# `pattern2regex()` searches for this after removing any trailing slash
is_anchored=re.compile(r"^/|/."),
# Regex prefix emitted for patterns that are not anchored: zero or more
# complete leading path components
unanchored_start=r"(?:[^/\0]+/)*",
# Regex fragments emitted for a trailing `/**` and for `/**/`
slash_globstar=r"(?:(?:/[^/\0]+)+/?|/)",
slash_globstar_slash=r"/(?:[^/\0]+/)*",
# Regex fragments emitted for `?` and for `*`/`**`: neither crosses a slash
# or matches NUL
qm=r"[^/\0]",
star=r"[^/\0]*",
# Opens a character class, guarded by a lookahead so that the class can
# never match a slash or NUL
openrange=r"(?![/\0])[",
caret="^",
close_bracket="]",
# Matches a `]` that is not the left end of a `]-x` range; used to detect a
# literal `]` at the start of a bracket expression
close_bracket_in_range=re.compile(r"\](?!-[^\]])"),
hyphen="-",
)
# The `bytes` counterpart of PARSER_STRS, used for `bytes` patterns
PARSER_BYTES = PARSER_STRS.encode()
def _bracket2regex(
    pattern: AnyStr, pos: int, strs: ParserStrs[AnyStr], orig: AnyStr
) -> tuple[AnyStr, int]:
    """
    Translate the bracket expression in ``pattern`` whose opening ``[`` ends
    just before index ``pos``.  Returns the regex character class and the
    index just past the closing ``]``.

    :raises InvalidPatternError:
        If the bracket expression is unterminated, contains a reversed range,
        or names an unknown POSIX class
    """
    regex = strs.openrange
    # A leading "^" or "!" negates the class
    if pattern[pos : pos + 1] in (strs.caret, strs.bang):
        regex += strs.caret
        pos += 1
    # A "]" in first position is a literal member rather than the terminator
    if strs.close_bracket_in_range.match(pattern, pos=pos):
        regex += strs.close_bracket
        pos += 1
    while True:
        m = strs.range_parser.match(pattern, pos)
        if not m:
            raise InvalidPatternError(orig)
        pos = m.end()
        if m["left"] is not None:
            # Taking the last character drops any escaping backslash
            lchar = m["left"][-1:]
            rchar = m["right"][-1:]
            if ord(lchar) > ord(rchar):
                raise InvalidPatternError(orig)
            regex += re.escape(lchar) + strs.hyphen + re.escape(rchar)
        elif m["posix_class"] is not None:
            try:
                regex += strs.posix_classes[m["posix_class"]]
            except KeyError:
                # The lookup failure is an implementation detail; keep it out
                # of the traceback
                raise InvalidPatternError(orig) from None
        elif m["char"] is not None:
            regex += re.escape(m["char"][-1:])
        elif m["end"] is not None:
            return regex + strs.close_bracket, pos
        else:
            raise AssertionError(
                "Unhandled pattern structure"
            )  # pragma: no cover


def pattern2regex(pattern: AnyStr, ignorecase: bool = False) -> Regex[AnyStr] | None:
    """
    Convert a gitignore pattern to a regular expression and return a `Regex`
    object.  If the pattern is empty or a comment, returns `None`.

    :param pattern: a gitignore pattern
    :param bool ignorecase: Whether the pattern should match case-insensitively
    :raises InvalidPatternError: If the given pattern is invalid
    """
    strs: ParserStrs
    if isinstance(pattern, str):
        strs = PARSER_STRS
    else:
        strs = PARSER_BYTES
    orig = pattern
    # ``source`` (the pattern minus trailing spaces) is what gets recorded on
    # the returned Regex; ``pattern`` is whittled down further below
    pattern = source = trim_trailing_spaces(pattern)
    if pattern.startswith(strs.octothorpe):
        return None
    negative = pattern.startswith(strs.bang)
    if negative:
        pattern = pattern[1:]
        if not pattern:
            return None
    dir_only = pattern.endswith(strs.slash)
    if dir_only:
        pattern = pattern[:-1]
    if not pattern:
        return None
    pos = 0
    regex = strs.istart if ignorecase else strs.start
    m = strs.leading_globstar_slash.match(pattern)
    # Patterns that start with "**/" or that are not anchored may match
    # beneath any number of leading directories
    if m or not strs.is_anchored.search(pattern):
        regex += strs.unanchored_start
    if m:
        # Skip the leading "**/"; unanchored_start stands in for it
        pos = m.end()
    elif pattern.startswith(strs.slash):
        # Skip the anchoring slash
        pos = 1
    while pos < len(pattern):
        m = strs.parser.match(pattern, pos)
        if not m:
            raise InvalidPatternError(orig)
        pos = m.end()
        if m["slash_globstar"] is not None:
            regex += strs.slash_globstar
        elif m["slash_globstar_slash"] is not None:
            regex += strs.slash_globstar_slash
        elif m["qm"] is not None:
            regex += strs.qm
        elif m["star"] is not None:
            regex += strs.star
        elif m["openrange"] is not None:
            char_class, pos = _bracket2regex(pattern, pos, strs, orig)
            regex += char_class
        elif m["char"] is not None:
            # Taking the last character drops any escaping backslash
            regex += re.escape(m["char"][-1:])
        else:
            raise AssertionError("Unhandled pattern structure")  # pragma: no cover
    regex += strs.end
    return Regex(
        pattern=source,
        regex=regex,
        negative=negative,
        dir_only=dir_only,
        ignorecase=ignorecase,
    )
class InvalidPathError(ValueError):
    """The exception `Gitignore.match()` raises for a path it cannot accept"""

    def __init__(
        self, msg: str, path: str | bytes | os.PathLike[str] | os.PathLike[bytes]
    ) -> None:
        super().__init__(msg, path)
        #: A description of the problem with the path
        self.msg = msg
        #: The invalid path
        self.path = path

    def __str__(self) -> str:
        return "{}: {!r}".format(self.msg, self.path)
class InvalidPatternError(ValueError):
    """The exception `pattern2regex()` raises for a pattern it cannot parse"""

    def __init__(self, pattern: str | bytes) -> None:
        super().__init__(pattern)
        #: The invalid pattern
        self.pattern = pattern

    def __str__(self) -> str:
        return "Invalid gitignore pattern: {!r}".format(self.pattern)
def pathway(path: AnyStr) -> list[AnyStr]:
    """
    Return a list of parent paths of ``path`` (not including the root) plus
    ``path`` itself, ordered from the outermost parent to ``path``.

    ``path`` is treated as a ``/``-separated path.  An empty path yields an
    empty list.
    """
    pway: list[AnyStr] = []
    while path:
        parent = posixpath.dirname(path)
        if parent == path:
            # dirname() of a root ("/" or "//") is the root itself; stop here
            # so that the root is left out and the loop always terminates
            break
        pway.append(path)
        path = parent
    # Paths were collected innermost-first
    pway.reverse()
    return pway
# Matches the run of spaces/tabs at the end of a string.  The ``keep`` group
# captures any backslash pairs and one backslash-escaped space or tab that
# come immediately before that run, so that they survive the substitution.
TRIM_RGX = r"(?<!\\)(?P<keep>(?:\\\\)*(\\[ \t])?)[ \t]*\Z"
TRIM_STR = re.compile(TRIM_RGX)
TRIM_BYTES = re.compile(TRIM_RGX.encode("us-ascii"))


def trim_trailing_spaces(s: AnyStr) -> AnyStr:
    """
    Return ``s`` without its trailing space and tab characters, leaving a
    backslash-escaped space or tab in place
    """
    if isinstance(s, str):
        return TRIM_STR.sub(r"\g<keep>", s)
    return TRIM_BYTES.sub(rb"\g<keep>", s)
def chomp(s: AnyStr) -> AnyStr:
    """
    Strip one trailing line feed, if present, and then one trailing carriage
    return, if present
    """
    # 10 is LF and 13 is CR; comparing code points works for str and bytes
    for code in (10, 13):
        if s and ord(s[-1:]) == code:
            s = s[:-1]
    return s
def is_complex_path(path: AnyStr | os.PathLike[AnyStr]) -> bool:
    """
    Report whether ``path`` is absolute or, for Windows paths, carries a
    drive or other anchor in addition to its directory/file components
    """
    if os.path.isabs(path):
        return True
    if ON_WINDOWS:
        # isabs() changed in Python 3.13 to not regard paths starting with
        # just one (back)slash as absolute on Windows, so check for a single
        # leading separator explicitly
        if sys.version_info[:2] >= (3, 13) and re.match(
            rb"/(?!/)|\\(?!\\)", os.fsencode(path)
        ):
            # <https://github.com/python/mypy/issues/18210>
            return True  # type: ignore[no-any-return]
        drive = os.path.splitdrive(path)[0]
        return bool(drive)
    if isinstance(path, PureWindowsPath):
        # A Windows-flavored path object being examined on a non-Windows host
        return bool(path.anchor)
    return False