bumpver/src/pycalver/v2patterns.py
Manuel Barkhau 2f421daf16 avoid duplicate pattern groups
Since patterns are always wrapped in a named group anyway,
we don't need to do another unnamed group also. This makes
the regular expressions more readable.
2020-10-03 18:04:08 +00:00

335 lines
10 KiB
Python

# This file is part of the pycalver project
# https://github.com/mbarkhau/pycalver
#
# Copyright (c) 2018-2020 Manuel Barkhau (mbarkhau@gmail.com) - MIT License
# SPDX-License-Identifier: MIT
"""Compose Regular Expressions from Patterns.
>>> pattern = compile_pattern("vYYYY0M.BUILD[-RELEASE]")
>>> version_info = pattern.regexp.match("v201712.0123-alpha")
>>> assert version_info.groupdict() == {
... "year_y" : "2017",
... "month" : "12",
... "bid" : "0123",
... "tag" : "alpha",
... }
>>>
>>> version_info = pattern.regexp.match("201712.1234")
>>> assert version_info is None
>>> version_info = pattern.regexp.match("v201713.1234")
>>> assert version_info is None
>>> version_info = pattern.regexp.match("v201712.1234")
>>> assert version_info.groupdict() == {
... "year_y" : "2017",
... "month" : "12",
... "bid" : "1234",
... "tag" : None,
... }
"""
import re
import typing as typ
import logging
from . import utils
from .patterns import RE_PATTERN_ESCAPES
from .patterns import Pattern
logger = logging.getLogger("pycalver.v2patterns")
# NOTE (mb 2020-09-17): For patterns with different options '(AAA|BB|C)', the
# patterns with more digits should be first/left of those with fewer digits:
#
# good: (?:1[0-2]|[1-9])
# bad: (?:[1-9]|1[0-2])
#
# This ensures that the longest match is done for a pattern.
#
# This implies that patterns for smaller numbers sometimes must be right of
# those for larger numbers. To be consistent we use this ordering not
# sometimes but always (even though in theory it wouldn't matter):
#
# good: (?:3[0-1]|[1-2][0-9]|[1-9])
# bad: (?:[1-2][0-9]|3[0-1]|[1-9])
# Regular expression fragment for each pattern part name. The fragments are
# bare alternations (no surrounding group); _replace_pattern_parts wraps each
# of them in a named group "(?P<field>...)" before the regexp is compiled.
# Within each alternation, options that match more digits come first so that
# the longest match wins.
PART_PATTERNS = {
    # Based on calver.org
    'YYYY': r"[1-9][0-9]{3}",
    'YY'  : r"[1-9][0-9]?",
    '0Y'  : r"[0-9]{2}",
    'GGGG': r"[1-9][0-9]{3}",
    'GG'  : r"[1-9][0-9]?",
    '0G'  : r"[0-9]{2}",
    'Q'   : r"[1-4]",
    'MM'  : r"1[0-2]|[1-9]",
    '0M'  : r"1[0-2]|0[1-9]",
    'DD'  : r"3[0-1]|[1-2][0-9]|[1-9]",
    '0D'  : r"3[0-1]|[1-2][0-9]|0[1-9]",
    'JJJ' : r"36[0-6]|3[0-5][0-9]|[1-2][0-9][0-9]|[1-9][0-9]|[1-9]",
    '00J' : r"36[0-6]|3[0-5][0-9]|[1-2][0-9][0-9]|0[1-9][0-9]|00[1-9]",
    # week numbering parts
    # NOTE(review): WW/UU presumably carry strftime("%W")/strftime("%U")
    #   values (confirm against the code that fills week_w/week_u). Those
    #   directives range over 00..53: a year that ends on a Sunday has
    #   %U == 53 on Dec 31st (e.g. 2017-12-31), and a year that ends on a
    #   Monday has %W == 53. The upper bound is therefore 53, not 52.
    'WW': r"5[0-3]|[1-4][0-9]|[0-9]",
    '0W': r"5[0-3]|[0-4][0-9]",
    'UU': r"5[0-3]|[1-4][0-9]|[0-9]",
    '0U': r"5[0-3]|[0-4][0-9]",
    'VV': r"5[0-3]|[1-4][0-9]|[1-9]",
    '0V': r"5[0-3]|[1-4][0-9]|0[1-9]",
    # non calver parts
    'MAJOR'  : r"[0-9]+",
    'MINOR'  : r"[0-9]+",
    'PATCH'  : r"[0-9]+",
    'BUILD'  : r"[0-9]+",
    'BLD'    : r"[1-9][0-9]*",
    'RELEASE': r"preview|final|alpha|beta|post|rc",
    'PYTAG'  : r"post|rc|a|b",
    'NUM'    : r"[0-9]+",
}
# Name of the regexp group (field) that each pattern part is captured into.
# Several parts that only differ in their formatting share one field.
PATTERN_PART_FIELDS = {
    # calendar year
    "YYYY": "year_y",
    "YY": "year_y",
    "0Y": "year_y",
    # year used together with week numbers
    "GGGG": "year_g",
    "GG": "year_g",
    "0G": "year_g",
    # quarter, month, day of month, day of year
    "Q": "quarter",
    "MM": "month",
    "0M": "month",
    "DD": "dom",
    "0D": "dom",
    "JJJ": "doy",
    "00J": "doy",
    # non calver parts
    "MAJOR": "major",
    "MINOR": "minor",
    "PATCH": "patch",
    "BUILD": "bid",
    "BLD": "bid",
    "RELEASE": "tag",
    "PYTAG": "pytag",
    "NUM": "num",
    # week numbering parts
    "WW": "week_w",
    "0W": "week_w",
    "UU": "week_u",
    "0U": "week_u",
    "VV": "week_v",
    "0V": "week_v",
}
# Replacements applied by _convert_to_pep440: each key is a pattern part and
# the value is the part that takes its place in the PEP440 flavoured pattern
# (zero padded parts -> their unpadded counterpart, RELEASE -> PYTAG).
PEP440_PART_SUBSTITUTIONS = {
    # week numbers
    "0W": "WW",
    "0U": "UU",
    "0V": "VV",
    # month, day of month, day of year
    "0M": "MM",
    "0D": "DD",
    "00J": "JJJ",
    # non calver parts
    "BUILD": "BLD",
    "RELEASE": "PYTAG",
}
# A field value is accepted either as text or as an integer.
FieldValue = typ.Union[str, int]


def _zero_padded(val: FieldValue, width: int) -> str:
    """Convert val to an int and render it with at least `width` digits."""
    return f"{int(val):0{width}}"


def _last_two_digits(year: FieldValue) -> int:
    """Return the integer made of the last two characters of str(year)."""
    return int(str(year)[-2:])


def _fmt_num(val: FieldValue) -> str:
    """Render the value unchanged, as text."""
    return str(val)


def _fmt_bld(val: FieldValue) -> str:
    """Render the value as an integer, which drops any leading zeros."""
    return str(int(val))


def _fmt_yy(year_y: FieldValue) -> str:
    """Render the last two digits of the year without zero padding."""
    return str(_last_two_digits(year_y))


def _fmt_0y(year_y: FieldValue) -> str:
    """Render the last two digits of the year, zero padded to two digits."""
    return _zero_padded(_last_two_digits(year_y), 2)


def _fmt_gg(year_g: FieldValue) -> str:
    """Render the last two digits of the year_g value without zero padding."""
    return str(_last_two_digits(year_g))


def _fmt_0g(year_g: FieldValue) -> str:
    """Render the last two digits of the year_g value, zero padded to two digits."""
    return _zero_padded(_last_two_digits(year_g), 2)


def _fmt_0m(month: FieldValue) -> str:
    """Render the month zero padded to two digits."""
    return _zero_padded(month, 2)


def _fmt_0d(dom: FieldValue) -> str:
    """Render the day of month zero padded to two digits."""
    return _zero_padded(dom, 2)


def _fmt_00j(doy: FieldValue) -> str:
    """Render the day of year zero padded to three digits."""
    return _zero_padded(doy, 3)


def _fmt_0w(week_w: FieldValue) -> str:
    """Render the week_w value zero padded to two digits."""
    return _zero_padded(week_w, 2)


def _fmt_0u(week_u: FieldValue) -> str:
    """Render the week_u value zero padded to two digits."""
    return _zero_padded(week_u, 2)


def _fmt_0v(week_v: FieldValue) -> str:
    """Render the week_v value zero padded to two digits."""
    return _zero_padded(week_v, 2)
# Formatter for each pattern part: a callable that turns a field value into
# the text for that part. Unpadded parts use the plain _fmt_num, zero padded
# and two digit year parts use their dedicated formatter.
PART_FORMATS: typ.Dict[str, typ.Callable[[FieldValue], str]] = {
    # year parts
    "YYYY": _fmt_num,
    "YY": _fmt_yy,
    "0Y": _fmt_0y,
    "GGGG": _fmt_num,
    "GG": _fmt_gg,
    "0G": _fmt_0g,
    # quarter, month, day of month, day of year
    "Q": _fmt_num,
    "MM": _fmt_num,
    "0M": _fmt_0m,
    "DD": _fmt_num,
    "0D": _fmt_0d,
    "JJJ": _fmt_num,
    "00J": _fmt_00j,
    # non calver parts
    "MAJOR": _fmt_num,
    "MINOR": _fmt_num,
    "PATCH": _fmt_num,
    "BUILD": _fmt_num,
    "BLD": _fmt_bld,
    "RELEASE": _fmt_num,
    "PYTAG": _fmt_num,
    "NUM": _fmt_num,
    # week numbering parts
    "WW": _fmt_num,
    "0W": _fmt_0w,
    "UU": _fmt_num,
    "0U": _fmt_0u,
    "VV": _fmt_num,
    "0V": _fmt_0v,
}
def _convert_to_pep440(version_pattern: str) -> str:
    """Derive the PEP440 flavoured pattern that corresponds to version_pattern.

    A leading "v", escaped brackets and every character other than
    letters, digits, "." and "[" / "]" are dropped. Parts listed in
    PEP440_PART_SUBSTITUTIONS are then swapped for their replacement, and
    the result always ends up containing "PYTAGNUM".
    """
    # NOTE (mb 2020-09-20): This does not support some
    #   corner cases as specified in PEP440, in particular
    #   related to post and dev releases.
    result = version_pattern[1:] if version_pattern.startswith("v") else version_pattern
    result = result.replace(r"\[", "").replace(r"\]", "")
    result = re.sub(r"[^a-zA-Z0-9\.\[\]]", "", result)

    # Longer part names are handled before shorter ones.
    for part_name in sorted(PATTERN_PART_FIELDS, key=len, reverse=True):
        if part_name not in version_pattern or part_name not in PEP440_PART_SUBSTITUTIONS:
            continue

        substitution = PEP440_PART_SUBSTITUTIONS[part_name]
        if part_name in ('RELEASE', 'PYTAG'):
            # non-numerical parts are always substituted
            result = result.replace(part_name, substitution)
            continue

        # Numerical parts are only substituted when their first occurrence
        # is at the very start or directly after a ".".
        first_idx = result.find(part_name)
        if first_idx == 0 or result[first_idx - 1] == ".":
            result = result.replace(part_name, substitution)

    # PYTAG and NUM must be adjacent and also be the last (optional) part
    if 'PYTAGNUM' in result:
        return result

    for leftover in ("PYTAG", "NUM", "[]"):
        result = result.replace(leftover, "")
    return result + "[PYTAGNUM]"
def normalize_pattern(version_pattern: str, raw_pattern: str) -> str:
    """Expand the placeholders of raw_pattern.

    "{version}" is replaced by version_pattern itself and
    "{pep440_version}" by the PEP440 conversion of version_pattern.
    A raw_pattern without placeholders is returned unchanged.
    """
    expanded = raw_pattern.replace("{version}", version_pattern)
    if "{pep440_version}" not in expanded:
        # Nothing left to expand, so the conversion is not needed.
        return expanded
    return expanded.replace("{pep440_version}", _convert_to_pep440(version_pattern))
def _replace_pattern_parts(pattern: str) -> str:
    """Turn an escaped pattern into the source text of a regular expression.

    Unescaped "[" and "]" become the start and end of an optional
    non-capturing group, and the first occurrence of every known part
    name is replaced by a named group built from PART_PATTERNS and
    PATTERN_PART_FIELDS.
    """
    # The pattern is escaped, so that everything besides the format
    # string variables is treated literally.
    #
    # Each substitution also consumes the character in front of the
    # bracket, so directly adjacent brackets need another pass. Repeat
    # until a pass changes nothing.
    num_replaced = 1
    while num_replaced:
        pattern, num_open = re.subn(r"([^\\]|^)\[", r"\1(?:", pattern)
        pattern, num_close = re.subn(r"([^\\]|^)\]", r"\1)?", pattern)
        num_replaced = num_open + num_close

    # Candidate replacements, keyed by (-end, -length of the part name).
    candidates: typ.Dict[typ.Tuple[int, int], typ.Tuple[int, int, str]] = {}
    for part_name, part_regex in PART_PATTERNS.items():
        start = pattern.find(part_name)
        if start < 0:
            continue
        field = PATTERN_PART_FIELDS[part_name]
        end = start + len(part_name)
        candidates[(-end, -len(part_name))] = (start, end, f"(?P<{field}>{part_regex})")

    # NOTE (mb 2020-09-17): The sorting is done so that we process items:
    #   - right before left
    #   - longer before shorter
    result = pattern
    leftmost_start = len(pattern) + 1
    for sort_key in sorted(candidates):
        start, end, named_group = candidates[sort_key]
        if end > leftmost_start:
            # reaches into a span that was already replaced
            continue
        result = result[:start] + named_group + result[end:]
        leftmost_start = start
    return result
def _compile_pattern_re(normalized_pattern: str) -> typ.Pattern[str]:
    """Compile a normalized pattern into a regular expression object.

    Every (char, escaped) pair of RE_PATTERN_ESCAPES is applied, apart
    from the brackets and the backslash, then the pattern parts are
    replaced and the result is compiled.
    """
    escaped = normalized_pattern
    for char, replacement in RE_PATTERN_ESCAPES:
        # [] braces are used for optional parts, such as [-RELEASE]/[-beta]
        # and need to be escaped manually.
        if char in "[]\\":
            continue
        # escape it so it is a literal in the re pattern
        escaped = escaped.replace(char, replacement)
    return re.compile(_replace_pattern_parts(escaped))
@utils.memo
def compile_pattern(version_pattern: str, raw_pattern: typ.Optional[str] = None) -> Pattern:
    """Build the Pattern for raw_pattern (or for version_pattern itself).

    When raw_pattern is None, version_pattern is used as the raw pattern.
    The returned Pattern is constructed from version_pattern, the
    normalized pattern and the compiled regular expression, in that order.
    """
    effective_raw = raw_pattern
    if effective_raw is None:
        effective_raw = version_pattern
    normalized = normalize_pattern(version_pattern, effective_raw)
    return Pattern(version_pattern, normalized, _compile_pattern_re(normalized))
def compile_patterns(version_pattern: str, raw_patterns: typ.List[str]) -> typ.List[Pattern]:
    """Compile every entry of raw_patterns against version_pattern, keeping order."""
    compiled = [compile_pattern(version_pattern, raw) for raw in raw_patterns]
    return compiled