diff --git a/pkgs/sane-scripts/src/sane-date-math b/pkgs/sane-scripts/src/sane-date-math index b0bf12b0..95ed81cb 100755 --- a/pkgs/sane-scripts/src/sane-date-math +++ b/pkgs/sane-scripts/src/sane-date-math @@ -2,8 +2,8 @@ # i just went overboard playing around with parsers, is all. # use this like `./sane-date-math 'today - 5d'` -# of course, it handles parenthesizes and operator precedence, so you can do sillier things like -# `./sane-date-math ' today - (3*4+1 - ((0)) ) *7d '` +# of course, it handles parentheses and operator precedence/associativity, so you can do sillier things like +# `./sane-date-math ' today - (1+3 *4 - ((0)) ) *7d '` import abc @@ -15,8 +15,6 @@ class Token: self.c = c def __repr__(self) -> str: - if self == EOF: - return "" return f"{self.c!r}" def __str__(self) -> str: @@ -25,7 +23,6 @@ class Token: def __eq__(self, other: 'Token') -> bool: return self.c == other.c -EOF = Token('\x05') PLUS = Token('+') MINUS = Token('-') ASTERISK = Token('*') @@ -40,349 +37,235 @@ ALPHA = ALPHA_LOWER + ALPHA_UPPER ALPHA_UNDER = ALPHA + [UNDERSCORE] ALPHA_NUM_UNDER = ALPHA_UNDER + DIGITS +class ParserContext: + def feed(self, token: Token) -> 'ParserContext': + return None # can't ingest the token + def upgrade(self) -> 'ParserContext': + return None # no upgrade path -# TODO: should be enum -class ParseCode: - # return if the parser cannot parse the provided token - HALT = 0 - # return if the parser is already "complete" and the token should be yielded to the outer context instead - YIELD = 1 - # return is the parser successfully consumed the provided token and parsing should continue - CONTINUE = 2 - -class ParserContext(metaclass=abc.ABCMeta): - @abc.abstractmethod - def feed(self, token: Token) -> ParseCode: - """ - possibly ingests this token, modifying internal state, - and providing instruction to the outer parser layer on - how to proceed. - """ - pass - - def context(self) -> 'ParserContext': - """ - hack to make type-level "Productions" compatible with instance-level "ParserContext"s. - """ - return self - - def destructure(self) -> object: - """ - destructure the outer layer of this ParserContext to obtain access to whatever state it captured. - e.g. Then([A, Choice([B, C])]) destructures first to [A, Choice([B, C])]. - it's not recursive; the inner layers must be manually destructured. - """ - return self - -class Then(ParserContext): +class Parser: """ - primitive combinator: given a sequence of parser constructs, parse the input - using the first parser until that parser yields, then parse using the second - parser, and so on. + LR parser. + keeps exactly one root item, and for each input token + feeds it to the root, possibly "upgrading" the root N times + before it's able to be fed. """ - def __init__(self, items: list): - self.items = [i.context() for i in items] + def __init__(self, root: ParserContext): + self.root = root + + def feed(self, token: Token) -> bool: + new_root = self.root.feed(token) + if new_root is not None: + self.root = new_root + return True + else: + # root can't directly accept this item. + # "upgrade" it and try again. + new_root = self.root.upgrade() + if new_root is None: return False + self.root = new_root + return self.feed(token) + + def complete(self) -> ParserContext: + # upgrade the root as far as possible before returning + root = None + new_root = self.root + while new_root is not None: + root = new_root + new_root = root.upgrade() + + return root + +class ReprParserContext(ParserContext): + """ helper that gives a good default repr to most contexts """ + def __init__(self, items: list = None): + self.items = items if items is not None else [] def __repr__(self) -> str: - return f"Then({self.items!r})" + return f'{self.__class__.__name__}({self.items!r})' - def __str__(self) -> str: - return str(self.items) - def feed(self, token: Token) -> ParseCode: - # we expect parser contexts to be fused: once they YIELD, - # they should yield on all future calls as well - for i in self.items: - result = i.feed(token) - if result != ParseCode.YIELD: return result - else: - # all items are done parsing; so are we! - return ParseCode.YIELD +class BaseContext(ReprParserContext): + """ empty context; initial state of the parser """ + def feed(self, token: Token) -> ParserContext: + if token == SPACE: + return self + if token == OPEN_PAREN: + return ParenContext(BaseContext()) + if token in DIGITS: + return IntegerContext([token]) + if token in ALPHA_UNDER: + return IdentifierContext([token]) - def destructure(self) -> list: - return self.items +class IdentifierContext(ReprParserContext): + """ context is an identifier like `today` """ + def __init__(self, tokens: list): + super().__init__(tokens) + self.tokens = tokens -class Choice(ParserContext): + def feed(self, token: Token) -> ParserContext: + if token in ALPHA_NUM_UNDER: + return IdentifierContext(self.tokens + [token]) + + def upgrade(self) -> ParserContext: + return StrongValueContext(self) + +class IntegerContext(ReprParserContext): + """ context is an integer like `45` """ + def __init__(self, tokens: list): + super().__init__(tokens) + self.tokens = tokens + + def feed(self, token: Token) -> ParserContext: + if token in DIGITS: + return IntegerContext(self.tokens + [token]) + if token == Token('d'): + return DurationContext(self) + + def upgrade(self) -> ParserContext: + # can't continue the integer; it becomes a value + return StrongValueContext(self) + +class DurationContext(ReprParserContext): + """ context is a duration like `14d` """ + def __init__(self, value: IntegerContext): + super().__init__([value]) + self.value = value + + def upgrade(self) -> ParserContext: + return StrongValueContext(self) + +class BaseValueContext(ReprParserContext): + """ abstract base for types that can be used in compound expressions """ + def __init__(self, value: ParserContext): + super().__init__([value]) + self.value = value + + def feed(self, token: Token) -> ParserContext: + if token == SPACE: + return self + +class StrongValueContext(BaseValueContext): """ - primitive combinator: try each parser in order and use the first match. - NB: there's no lookahead. whichever parser is able to parse the first token - is used for the entire stream. + in the context of operators, a strong value is something which prefers + to not be grabbed by a lhs value. + + so for example, strong values have the opportunity to initiate a multiply operation before the lhs closes an addition operation that this strong value is a part of """ - def __init__(self, choices: list): - self.choices = choices - self.active = None + def feed(self, token: Token) -> ParserContext: + if token == ASTERISK: + return BinaryOpContext(self, token, BaseContext()) + return super().feed(token) - def __repr__(self) -> str: - return f"Choice({self.choices!r})" + def upgrade(self) -> ParserContext: + return WeakValueContext(self.value) - def __str__(self) -> str: - if self.active is not None: - return str(self.active) - else: - return repr(self) +class WeakValueContext(BaseValueContext): + def feed(self, token: Token) -> ParserContext: + if token == PLUS: + return BinaryOpContext(self, token, BaseContext()) + if token == MINUS: + return BinaryOpContext(self, token, BaseContext()) - def feed(self, token: Token) -> ParseCode: - if self.active is not None: - return self.active.feed(token) + return super().feed(token) - for choice in self.choices: - item = choice.context() - result = item.feed(token) - if result is not ParseCode.HALT: - self.active = item - return result +class BinaryOpContext(ReprParserContext): + """ context for a binary operation. the LHS and operator are parsed, but the rhs may not yet contain a value """ + def __init__(self, lhs: BaseValueContext, oper: Token, rhs: ParserContext): + super().__init__([lhs, oper, rhs]) + self.lhs = lhs + self.oper = oper + self.rhs = rhs - return ParseCode.HALT # no matches + @property + def precedence_class(self) -> type: + if self.oper in [PLUS, MINUS]: + return WeakValueContext + if self.oper == ASTERISK: + return StrongValueContext - def destructure(self) -> ParserContext: - return self.active + def feed(self, token: Token) -> ParserContext: + new_rhs = self.rhs.feed(token) + if new_rhs is not None: + return BinaryOpContext(self.lhs, self.oper, new_rhs) -class WantToken(ParserContext): - """ - match a single token out of a list of valid tokens - """ - def __init__(self, want: list): - self.has = None - self.want = [want] if isinstance(want, Token) else want + def upgrade(self) -> ParserContext: + new_rhs = self.rhs.upgrade() + if new_rhs is None: return None - def __repr__(self) -> str: - return f"WantToken({self.want!r})" + # upgrade self once the rhs has reach the required precedence compatible with this operator + new_self = BinaryOpContext(self.lhs, self.oper, new_rhs) + if isinstance(new_rhs, self.precedence_class): + return StrongValueContext(self) # close the operation - def feed(self, token: Token) -> ParseCode: - if self.has is not None: return ParseCode.YIELD - if token in self.want: - self.has = token - return ParseCode.CONTINUE - return ParseCode.HALT + return new_self - def destructure(self) -> Token: - return self.has +class ParenContext(ReprParserContext): + """ context for a value contained within parentheses """ + def __init__(self, inner: ParserContext): + super().__init__([inner]) + self.inner = inner -class Empty(ParserContext): - """ - used as a terminal to allow for constructs like `optional` - """ - def feed(self, token: Token) -> ParseCode: - return ParseCode.YIELD + def feed(self, token: Token) -> ParserContext: + new_inner = self.inner.feed(token) + if new_inner is not None: + return ParenContext(new_inner) - def destructure(self) -> None: - return None + if token == CLOSE_PAREN and isinstance(self.inner, WeakValueContext): + return StrongValueContext(self) -def optional(context: ParserContext) -> ParserContext: - return Choice([context, Empty()]) + def upgrade(self) -> ParserContext: + new_inner = self.inner.upgrade() + if new_inner is not None: + return ParenContext(new_inner) -## "Productions" sit on top of these base ParserContexts in order to give names to -## large token sequences and to "reduce" them into AST types more intelligently. - -class ProductionContext(ParserContext): - """ - this adapts from the Production system of specification to the ParserContext system. - this is instantiated for high-level productions where we specify a grammar - and then parse "all in one go", sealing away incomplete state, and converting - the parsed tokens into actually useful abstractions (like signed numbers). - """ - def __init__(self, production: 'Production', context: ParserContext = None): - self.production = production - self.context = context if context is not None else production.grammar() - - def __repr__(self) -> str: - return f"ProductionContext({self.production!r}, {self.context!r})" - - def __str__(self) -> str: - return str(self.context) - - def feed(self, token: Token) -> ParseCode: - return self.context.feed(token) - - def reduce_inner(self, inner: ParserContext): - if isinstance(inner, ProductionContext): - return inner.reduce() # easy - elif isinstance(inner, ParserContext): - return self.reduce_inner(inner.destructure()) - elif isinstance(inner, list): # happens via unpacking of Then objects - return [self.reduce_inner(i) for i in inner] - else: - return inner - - def reduce(self) -> object: - # XXX this ends up being a leaf -> root reduction, - # which generally makes it harder to achieve detailed control when nesting. - return self.production.reduce(self.reduce_inner(self.context)) - -class Production: - """ - non-generic, likely multi-token productions, - specified in terms of other Productions and the above primitives - """ - def grammar(self) -> ParserContext: - raise NotImplementedError() - - def context(self) -> ParserContext: - return ProductionContext(self) - - def reduce(self, inner: object) -> object: - """ - use to construct the outer types out of already-converted inner types. - e.g. Number = Then([optional(Minus), Digits, optional(Suffix)]) - gets called with reduce([a, b, c]), where a is the already reduced `optional(Minus)`, - i.e. `None` or whatever type corresponds to the Minus token. - """ - return inner - -class DigitProduction(Production): - """ one digit token """ - def grammar(self) -> ParserContext: - return WantToken(DIGITS) - - def reduce(self, inner: Token) -> int: - return int(inner.c) - -class IntProduction(Production): - """ multi-digit integer """ - def grammar(self) -> ParserContext: - return Then([ - DigitProduction(), - optional(IntProduction()), - ]) - - def reduce(self, inner: list) -> int: - # TODO: wrong associativity - leading, trailing = inner - if trailing is None: - return leading - else: - return leading*10 + trailing - -class DurationOrIntProduction(Production): - # due to a lack of lookahead, we combine duration and int parsing into one production - # because a duration shares a complete int as prefix - def grammar(self) -> ParserContext: - return Then([ - IntProduction(), - optional(WantToken(Token('d'))), - ]) - - def reduce(self, inner: list) -> 'Literal': - value, suffix = inner - if suffix is None: - return Literal(value) - else: - return Literal(timedelta(value)) - -class Whitespace(Production): - def grammar(self) -> ParserContext: - return Then([ - WantToken(SPACE), - optional(Whitespace()), - ]) - -class ParenthesizedExpr(Production): - def grammar(self) -> ParserContext: - return Then([ - WantToken(OPEN_PAREN), - Expr(), - WantToken(CLOSE_PAREN), - ]) - - def reduce(self, inner: list) -> 'AstItem': - open, expr, close = inner - return expr - -class IdentifierTail(Production): - def grammar(self) -> ParserContext: - return Then([ - WantToken(ALPHA_NUM_UNDER), - optional(IdentifierTail()), - ]) - - -class Identifier(Production): - """ variable-style identifier, e.g. 'TODAY' """ - def grammar(self) -> ParserContext: - return Then([ - WantToken(ALPHA_UNDER), - optional(IdentifierTail()), - ]) - - def reduce(self, inner: list) -> 'Literal': - # fold the tokens into a string - first, rest = inner - head = first.c - while rest is not None: - next, rest = rest - head += next.c - return Variable(head) - -class UnaryExpr(Production): - """ some expression which does not invoke any operators at the outermost level """ - def grammar(self) -> ParserContext: - return Then([ - optional(Whitespace()), - Choice([ - DurationOrIntProduction(), - Identifier(), - ParenthesizedExpr(), - ]), - optional(Whitespace()), - ]) - - def reduce(self, inner: list): - # drop the whitespace - leading, primary, trailing = inner - return primary - -class ExprRHS(Production): - """ right hand side of a binary operation """ - def grammar(self) -> ParserContext: - return Then([ - Choice([WantToken(ASTERISK), WantToken(PLUS), WantToken(MINUS)]), - # remaining, is just another `Expr`, but we need to keep the fields expanded here to control precedence. - UnaryExpr(), - Choice([ExprRHS(), Empty()]), - ]) - -class Expr(Production): - """ this is the top-level production """ - def grammar(self) -> ParserContext: - return Then([ - UnaryExpr(), - Choice([ExprRHS(), Empty()]) - ]) - - def reduce(self, inner: list): - lhs, rhs = inner - if rhs is None: return lhs - - # convert the whole right-hand-side of the tree, iteratively. - oper, rhs, rhs_next = rhs - if oper == ASTERISK: - # multiplication has high precedence and we grab the adjacent token ASAP - lhs = MulOp(lhs, rhs) - if rhs_next is not None: - lhs = self.reduce([lhs, rhs_next]) - else: - # reduce the rhs and *then* apply this operator - if rhs_next is not None: - rhs = self.reduce([rhs, rhs_next]) - - if oper == PLUS: - lhs = AddOp(lhs, rhs) - elif oper == MINUS: - lhs = SubOp(lhs, rhs) - - return lhs - - -## parsed productions are `reduce`d to more useful `AstItem` items which we use -## for the actual evaluation/computation +## AstItems are produced from a ParserContext input +## ParserContext parse outputs are translated into `AstItem`s before evaluation +## so that we can operate on a higher-level tree that directly encodes native values like integers class AstItem(metaclass=abc.ABCMeta): @abc.abstractmethod def eval(self, context: dict): pass + @staticmethod + def decode_item(p: ParserContext) -> 'AstItem': + if isinstance(p, IntegerContext): + return Literal(AstItem.decode_integer(p)) + if isinstance(p, DurationContext): + return Literal(timedelta(AstItem.decode_integer(p.value))) + if isinstance(p, IdentifierContext): + return Variable(AstItem.decode_identifier(p)) + if isinstance(p, BaseValueContext): + return AstItem.decode_item(p.value) + if isinstance(p, BinaryOpContext): + return AstItem.decode_bin_op( + p.oper.c, + AstItem.decode_item(p.lhs), + AstItem.decode_item(p.rhs) + ) + if isinstance(p, ParenContext): + return AstItem.decode_item(p.inner) + + @staticmethod + def decode_integer(p: IntegerContext) -> int: + return int(''.join(t.c for t in p.tokens)) + + @staticmethod + def decode_identifier(p: IdentifierContext) -> str: + return ''.join(t.c for t in p.tokens) + + @staticmethod + def decode_bin_op(ty: str, lhs: 'AstItem', rhs: 'AstItem') -> 'BinaryOp': + if ty == '+': + return AddOp(lhs, rhs) + if ty == '-': + return SubOp(lhs, rhs) + if ty == '*': + return MulOp(lhs, rhs) + class Literal(AstItem): def __init__(self, v): self.v = v @@ -429,32 +312,33 @@ class MulOp(BinaryOp): def eval(self, context: dict): return self.lhs.eval(context) * self.rhs.eval(context) + +## toplevel routine. tokenize -> parse -> decode to AST -> evaluate + def tokenize(stream: str) -> list: return [Token(char) for char in stream] -def parse(ty: Production, tokens: list) -> AstItem: - ctx = Then([ty, Empty()]) +def parse(tokens: list) -> ParserContext: + parser = Parser(BaseContext()) for i, t in enumerate(tokens): - result = ctx.feed(t) + result = parser.feed(t) # print(f"i={i}; t={t}; state: {ctx!r}") - assert result == ParseCode.CONTINUE, f"unexpected token '{t}' at {i}; state: {ctx!r}" + assert result, f"unexpected token '{t}' at {i}; state: {parser.complete()!r}" - # feed a trailing EOF which no production should consume. - # this either drives the context to a HALT state, if it's expecting - # some specific other token, or YIELD if it's happy for the stream to be closed. - assert ctx.feed(EOF) == ParseCode.YIELD, f"incomplete expression: {ctx!r}" - - return ctx.destructure()[0].reduce() + return parser.complete() def evaluate(expr: str) -> object: tok = tokenize(expr) - expr = parse(Expr(), tok) - print(expr) + parse_tree = parse(tok) + print(parse_tree) + ast = AstItem.decode_item(parse_tree) + print(ast) + env = dict( today=datetime.now() ) - return expr.eval(env) + return ast.eval(env) if __name__ == '__main__':