sane-date-math: convert to LR parser

2022-12-24 05:08:17 +00:00 · 2022-12-24 05:08:17 +00:00 · 16fa1e0eda
commit 16fa1e0eda
parent 51a96525d9
1 changed files with 212 additions and 328 deletions
--- a/pkgs/sane-scripts/src/sane-date-math
+++ b/pkgs/sane-scripts/src/sane-date-math
@ -2,8 +2,8 @@
 # i just went overboard playing around with parsers, is all.
 # use this like `./sane-date-math 'today - 5d'`
-# of course, it handles parenthesizes and operator precedence, so you can do sillier things like
+# of course, it handles parentheses and operator precedence/associativity, so you can do sillier things like
-# `./sane-date-math '  today - (3*4+1 - ((0)) ) *7d '`
+# `./sane-date-math '  today - (1+3 *4 - ((0)) ) *7d '`
 import abc
@ -15,8 +15,6 @@ class Token:
        self.c = c
    def __repr__(self) -> str:
        """ `<EOF>` for the end-of-input sentinel; otherwise the repr of the wrapped character """
        return "<EOF>" if self == EOF else f"{self.c!r}"
    def __str__(self) -> str:
@ -25,7 +23,6 @@ class Token:
    def __eq__(self, other: 'Token') -> bool:
        return self.c == other.c
 EOF = Token('\x05')
 PLUS = Token('+')
 MINUS = Token('-')
 ASTERISK = Token('*')
@ -40,349 +37,235 @@ ALPHA = ALPHA_LOWER + ALPHA_UPPER
 ALPHA_UNDER = ALPHA + [UNDERSCORE]
 ALPHA_NUM_UNDER = ALPHA_UNDER + DIGITS
 class ParserContext:
    """
    base parser state: accepts no token and has no upgrade.
    both methods signal "no" by returning None.
    """
    def feed(self, token: Token) -> 'ParserContext':
        """ offer `token` to this context; the base context never accepts """
        return None  # can't ingest the token
    def upgrade(self) -> 'ParserContext':
        """ ask for the next form of this context; the base context has none """
        return None  # no upgrade path
-# TODO: should be enum
+class Parser:
 class ParseCode:
    """ status codes; used as the annotated return type of `feed` on the abstract ParserContext """
    # returned if the parser cannot parse the provided token
    HALT = 0
    # returned if the parser is already "complete" and the token should be yielded to the outer context instead
    YIELD = 1
    # returned if the parser successfully consumed the provided token and parsing should continue
    CONTINUE = 2
 class ParserContext(metaclass=abc.ABCMeta):
    """
    abstract parser state. implementors must provide `feed`;
    `context` and `destructure` default to handing back the instance itself.
    """
    @abc.abstractmethod
    def feed(self, token: Token) -> ParseCode:
        """
        possibly ingests this token, modifying internal state,
        and providing instruction to the outer parser layer on
        how to proceed.
        """
        pass
    def context(self) -> 'ParserContext':
        """
        hack to make type-level "Productions" compatible with instance-level "ParserContext"s.
        a ParserContext already is a context, so it returns itself.
        """
        return self
    def destructure(self) -> object:
        """
        destructure the outer layer of this ParserContext to obtain access to whatever state it captured.
        e.g. Then([A, Choice([B, C])]) destructures first to [A, Choice([B, C])].
        it's not recursive; the inner layers must be manually destructured.
        the default captures nothing and returns the context itself.
        """
        return self
 class Then(ParserContext):
    """
-    primitive combinator: given a sequence of parser constructs, parse the input
+    LR parser.
-    using the first parser until that parser yields, then parse using the second
+    keeps exactly one root item, and for each input token
-    parser, and so on.
+    feeds it to the root, possibly "upgrading" the root N times
    before it's able to be fed.
    """
-    def __init__(self, items: list):
+    def __init__(self, root: ParserContext):
-        self.items = [i.context() for i in items]
+        self.root = root
    def feed(self, token: Token) -> bool:
        """
        push one token into the parse.
        the root is upgraded as many times as it takes for some form of it to accept the token.
        returns True once the token was consumed, or False if the root ran out of
        upgrades first (the root stays at its last upgraded form in that case).
        """
        while True:
            fed = self.root.feed(token)
            if fed is not None:
                self.root = fed
                return True
            # the current root can't take this token directly; retry with its upgraded form.
            upgraded = self.root.upgrade()
            if upgraded is None:
                return False
            self.root = upgraded
    def complete(self) -> ParserContext:
        """
        return the most-upgraded form of the root: follow `upgrade()` until it gives None.
        `self.root` itself is left untouched.
        """
        current = self.root
        if current is None:
            return None
        while True:
            upgraded = current.upgrade()
            if upgraded is None:
                return current
            current = upgraded
 class ReprParserContext(ParserContext):
    """ helper that gives a good default repr to most contexts """
    def __init__(self, items: list = None):
        self.items = items if items is not None else []
    def __repr__(self) -> str:
-        return f"Then({self.items!r})"
+        return f'{self.__class__.__name__}({self.items!r})'
    def __str__(self) -> str:
        return str(self.items)
-    def feed(self, token: Token) -> ParseCode:
+class BaseContext(ReprParserContext):
-        # we expect parser contexts to be fused: once they YIELD,
+    """ empty context; initial state of the parser """
-        # they should yield on all future calls as well
+    def feed(self, token: Token) -> ParserContext:
-        for i in self.items:
+        if token == SPACE:
-            result = i.feed(token)
+            return self
-            if result != ParseCode.YIELD: return result
+        if token == OPEN_PAREN:
-        else:
+            return ParenContext(BaseContext())
-            # all items are done parsing; so are we!
+        if token in DIGITS:
-            return ParseCode.YIELD
+            return IntegerContext([token])
        if token in ALPHA_UNDER:
            return IdentifierContext([token])
-    def destructure(self) -> list:
+class IdentifierContext(ReprParserContext):
-        return self.items
+    """ context is an identifier like `today` """
    def __init__(self, tokens: list):
        super().__init__(tokens)
        self.tokens = tokens
-class Choice(ParserContext):
+    def feed(self, token: Token) -> ParserContext:
        if token in ALPHA_NUM_UNDER:
            return IdentifierContext(self.tokens + [token])
    def upgrade(self) -> ParserContext:
        return StrongValueContext(self)
 class IntegerContext(ReprParserContext):
    """ an integer literal that is still being read, e.g. `45` """
    def __init__(self, tokens: list):
        super().__init__(tokens)
        self.tokens = tokens
    def feed(self, token: Token) -> ParserContext:
        """ another digit extends the integer; a `d` suffix turns it into a duration; anything else is refused """
        if token in DIGITS:
            longer = self.tokens + [token]
            return IntegerContext(longer)
        elif token == Token('d'):
            return DurationContext(self)
        else:
            return None
    def upgrade(self) -> ParserContext:
        """ the integer can't grow any further, so it becomes a value """
        as_value = StrongValueContext(self)
        return as_value
 class DurationContext(ReprParserContext):
    """ a duration: an integer that was followed by the `d` suffix, e.g. `14d` """
    def __init__(self, value: IntegerContext):
        wrapped = [value]
        super().__init__(wrapped)
        self.value = value
    def upgrade(self) -> ParserContext:
        """ a duration is already complete, so its only upgrade is to become a value """
        as_value = StrongValueContext(self)
        return as_value
 class BaseValueContext(ReprParserContext):
    """ shared base for wrappers around something usable as an operand in a compound expression """
    def __init__(self, value: ParserContext):
        wrapped = [value]
        super().__init__(wrapped)
        self.value = value
    def feed(self, token: Token) -> ParserContext:
        """ whitespace after a value is absorbed without changing it; every other token is refused """
        return self if token == SPACE else None
 class StrongValueContext(BaseValueContext):
    """
-    primitive combinator: try each parser in order and use the first match.
+    in the context of operators, a strong value is something which prefers
-    NB: there's no lookahead. whichever parser is able to parse the first token
+    to not be grabbed by a lhs value.
-    is used for the entire stream.
+
    so for example, strong values have the opportunity to initiate a multiply operation before the lhs closes an addition operation that this strong value is a part of
    """
-    def __init__(self, choices: list):
+    def feed(self, token: Token) -> ParserContext:
-        self.choices = choices
+        if token == ASTERISK:
-        self.active = None
+            return BinaryOpContext(self, token, BaseContext())
        return super().feed(token)
-    def __repr__(self) -> str:
+    def upgrade(self) -> ParserContext:
-        return f"Choice({self.choices!r})"
+        return WeakValueContext(self.value)
-    def __str__(self) -> str:
+class WeakValueContext(BaseValueContext):
-        if self.active is not None:
+    def feed(self, token: Token) -> ParserContext:
-            return str(self.active)
+        if token == PLUS:
-        else:
+            return BinaryOpContext(self, token, BaseContext())
-            return repr(self)
+        if token == MINUS:
            return BinaryOpContext(self, token, BaseContext())
-    def feed(self, token: Token) -> ParseCode:
+        return super().feed(token)
        if self.active is not None:
            return self.active.feed(token)
-        for choice in self.choices:
+class BinaryOpContext(ReprParserContext):
-            item = choice.context()
+    """ context for a binary operation. the LHS and operator are parsed, but the rhs may not yet contain a value """
-            result = item.feed(token)
+    def __init__(self, lhs: BaseValueContext, oper: Token, rhs: ParserContext):
-            if result is not ParseCode.HALT:
+        super().__init__([lhs, oper, rhs])
-                self.active = item
+        self.lhs = lhs
-                return result
+        self.oper = oper
        self.rhs = rhs
-        return ParseCode.HALT  # no matches
+    @property
    def precedence_class(self) -> type:
        if self.oper in [PLUS, MINUS]:
            return WeakValueContext
        if self.oper == ASTERISK:
            return StrongValueContext
-    def destructure(self) -> ParserContext:
+    def feed(self, token: Token) -> ParserContext:
-        return self.active
+        new_rhs = self.rhs.feed(token)
        if new_rhs is not None:
            return BinaryOpContext(self.lhs, self.oper, new_rhs)
-class WantToken(ParserContext):
+    def upgrade(self) -> ParserContext:
-    """
+        new_rhs = self.rhs.upgrade()
-    match a single token out of a list of valid tokens
+        if new_rhs is None: return None
    """
    def __init__(self, want: list):
        self.has = None
        self.want = [want] if isinstance(want, Token) else want
-    def __repr__(self) -> str:
+        # upgrade self once the rhs has reached the precedence required by this operator
-        return f"WantToken({self.want!r})"
+        new_self = BinaryOpContext(self.lhs, self.oper, new_rhs)
        if isinstance(new_rhs, self.precedence_class):
            return StrongValueContext(self)  # close the operation
-    def feed(self, token: Token) -> ParseCode:
+        return new_self
        if self.has is not None: return ParseCode.YIELD
        if token in self.want:
            self.has = token
            return ParseCode.CONTINUE
        return ParseCode.HALT
-    def destructure(self) -> Token:
+class ParenContext(ReprParserContext):
-        return self.has
+    """ context for a value contained within parentheses """
    def __init__(self, inner: ParserContext):
        super().__init__([inner])
        self.inner = inner
-class Empty(ParserContext):
+    def feed(self, token: Token) -> ParserContext:
-    """
+        new_inner = self.inner.feed(token)
-    used as a terminal to allow for constructs like `optional`
+        if new_inner is not None:
-    """
+            return ParenContext(new_inner)
    def feed(self, token: Token) -> ParseCode:
        return ParseCode.YIELD
-    def destructure(self) -> None:
+        if token == CLOSE_PAREN and isinstance(self.inner, WeakValueContext):
-        return None
+            return StrongValueContext(self)
-def optional(context: ParserContext) -> ParserContext:
+    def upgrade(self) -> ParserContext:
-    return Choice([context, Empty()])
+        new_inner = self.inner.upgrade()
        if new_inner is not None:
            return ParenContext(new_inner)
-## "Productions" sit on top of these base ParserContexts in order to give names to
+## AstItems are produced from a ParserContext input
-## large token sequences and to "reduce" them into AST types more intelligently.
+## ParserContext parse outputs are translated into `AstItem`s before evaluation
-
+## so that we can operate on a higher-level tree that directly encodes native values like integers
 class ProductionContext(ParserContext):
    """
    this adapts from the Production system of specification to the ParserContext system.
    this is instantiated for high-level productions where we specify a grammar
    and then parse "all in one go", sealing away incomplete state, and converting
    the parsed tokens into actually useful abstractions (like signed numbers).
    """
    def __init__(self, production: 'Production', context: ParserContext = None):
        self.production = production
        # build the grammar lazily, only when no ready-made context was supplied.
        # NOTE(review): this attribute has the same name as the `context()` method on the
        # abstract ParserContext in this file; on an instance the attribute wins, so calling
        # `.context()` on a ProductionContext would fail -- confirm nothing does that.
        self.context = context if context is not None else production.grammar()
    def __repr__(self) -> str:
        return f"ProductionContext({self.production!r}, {self.context!r})"
    def __str__(self) -> str:
        return str(self.context)
    def feed(self, token: Token) -> ParseCode:
        # tokens go straight to the wrapped grammar context
        return self.context.feed(token)
    def reduce_inner(self, inner: ParserContext):
        """
        recursively convert whatever the grammar captured into reduced values.
        the ProductionContext test has to come first: a ProductionContext is also a ParserContext.
        """
        if isinstance(inner, ProductionContext):
            return inner.reduce()  # easy
        elif isinstance(inner, ParserContext):
            # NOTE(review): the default `destructure` returns the context itself, so this would
            # recurse forever for a context that doesn't override it -- verify all primitives do.
            return self.reduce_inner(inner.destructure())
        elif isinstance(inner, list):  # happens via unpacking of Then objects
            return [self.reduce_inner(i) for i in inner]
        else:
            # anything else (a Token, None, ...) is already as reduced as it gets
            return inner
    def reduce(self) -> object:
        """ reduce everything captured by the grammar, then let the production combine the pieces """
        # XXX this ends up being a leaf -> root reduction,
        # which generally makes it harder to achieve detailed control when nesting.
        return self.production.reduce(self.reduce_inner(self.context))
 class Production:
    """
    non-generic, likely multi-token productions,
    specified in terms of other Productions and the above primitives
    """
    def grammar(self) -> ParserContext:
        """ the ParserContext that recognizes this production; every subclass has to supply one """
        raise NotImplementedError()
    def context(self) -> ParserContext:
        """ a fresh ProductionContext wrapping this production """
        return ProductionContext(self)
    def reduce(self, inner: object) -> object:
        """
        use to construct the outer types out of already-converted inner types.
        e.g. Number = Then([optional(Minus), Digits, optional(Suffix)])
            gets called with reduce([a, b, c]), where a is the already reduced `optional(Minus)`,
            i.e. `None` or whatever type corresponds to the Minus token.
        the default passes `inner` through unchanged.
        """
        return inner
 class DigitProduction(Production):
    """ exactly one digit token, reduced to its integer value """
    def grammar(self) -> ParserContext:
        digit = WantToken(DIGITS)
        return digit
    def reduce(self, inner: Token) -> int:
        digit_char = inner.c
        return int(digit_char)
 class IntProduction(Production):
    """
    multi-digit integer.
    the grammar is right-recursive: one digit, optionally followed by another IntProduction
    for the remaining digits.
    """
    def grammar(self) -> ParserContext:
        return Then([
            DigitProduction(),
            optional(IntProduction()),
        ])
    def reduce(self, inner: list) -> int:
        # TODO: wrong associativity
        # NOTE(review): `trailing` arrives as the already-reduced value of all remaining digits,
        # yet `leading` is only ever scaled by a single factor of 10. anything longer than two
        # digits therefore looks like it comes out wrong (`123` -> 1*10 + 23 = 33) -- confirm.
        leading, trailing = inner
        if trailing is None:
            return leading
        else:
            return leading*10 + trailing
 class DurationOrIntProduction(Production):
    # a duration shares a complete int as its prefix, and there's no lookahead to
    # tell the two apart up front, so a single production parses both.
    def grammar(self) -> ParserContext:
        number = IntProduction()
        day_suffix = optional(WantToken(Token('d')))
        return Then([number, day_suffix])
    def reduce(self, inner: list) -> 'Literal':
        """ a bare int stays an int; with the `d` suffix it becomes a timedelta of that many days """
        value, suffix = inner
        payload = value if suffix is None else timedelta(value)
        return Literal(payload)
 class Whitespace(Production):
    """ one or more space tokens """
    def grammar(self) -> ParserContext:
        one_space = WantToken(SPACE)
        more_spaces = optional(Whitespace())
        return Then([one_space, more_spaces])
 class ParenthesizedExpr(Production):
    """ an expression wrapped in one pair of parentheses """
    def grammar(self) -> ParserContext:
        opening = WantToken(OPEN_PAREN)
        body = Expr()
        closing = WantToken(CLOSE_PAREN)
        return Then([opening, body, closing])
    def reduce(self, inner: list) -> 'AstItem':
        # the parentheses carry no meaning of their own; only the enclosed expression survives
        _opening, expr, _closing = inner
        return expr
 class IdentifierTail(Production):
    """ the characters after an identifier's first one: letters, digits or underscores """
    def grammar(self) -> ParserContext:
        one_char = WantToken(ALPHA_NUM_UNDER)
        more_chars = optional(IdentifierTail())
        return Then([one_char, more_chars])
 class Identifier(Production):
    """ variable-style identifier, e.g. 'TODAY': a letter or underscore, then any identifier tail """
    def grammar(self) -> ParserContext:
        first_char = WantToken(ALPHA_UNDER)
        remainder = optional(IdentifierTail())
        return Then([first_char, remainder])
    def reduce(self, inner: list) -> 'Literal':
        # the tail arrives as nested [token, rest] pairs ending in None; flatten it into a string
        first, rest = inner
        chars = [first.c]
        while rest is not None:
            token, rest = rest
            chars.append(token.c)
        return Variable(''.join(chars))
 class UnaryExpr(Production):
    """ an expression with no operator at its outermost level, optionally padded by whitespace """
    def grammar(self) -> ParserContext:
        leading_ws = optional(Whitespace())
        primary = Choice([
            DurationOrIntProduction(),
            Identifier(),
            ParenthesizedExpr(),
        ])
        trailing_ws = optional(Whitespace())
        return Then([leading_ws, primary, trailing_ws])
    def reduce(self, inner: list):
        # only the middle element matters; the whitespace on either side is discarded
        _leading, primary, _trailing = inner
        return primary
 class ExprRHS(Production):
    """ an operator, its right-hand operand, and optionally further operator/operand pairs """
    def grammar(self) -> ParserContext:
        operator = Choice([WantToken(ASTERISK), WantToken(PLUS), WantToken(MINUS)])
        # what follows is just another `Expr`, but the fields stay expanded here to control precedence.
        operand = UnaryExpr()
        continuation = Choice([ExprRHS(), Empty()])
        return Then([operator, operand, continuation])
 class Expr(Production):
    """ this is the top-level production """
    def grammar(self) -> ParserContext:
        return Then([
            UnaryExpr(),
            Choice([ExprRHS(), Empty()])
        ])
    def reduce(self, inner: list):
        """
        fold `[lhs, rhs]` into an AST node, where `rhs` is None or an
        `[operator, operand, remaining_rhs]` triple as produced by ExprRHS.
        """
        lhs, rhs = inner
        if rhs is None: return lhs
        # convert the whole right-hand-side of the tree, iteratively.
        oper, rhs, rhs_next = rhs
        if oper == ASTERISK:
            # multiplication has high precedence and we grab the adjacent token ASAP
            lhs = MulOp(lhs, rhs)
            if rhs_next is not None:
                # the product becomes the lhs for whatever follows
                lhs = self.reduce([lhs, rhs_next])
        else:
            # reduce the rhs and *then* apply this operator
            # NOTE(review): this groups `+`/`-` chains to the right, so `a - b + c` reduces to
            # SubOp(a, AddOp(b, c)) rather than the left-to-right (a - b) + c.
            if rhs_next is not None:
                rhs = self.reduce([rhs, rhs_next])
            if oper == PLUS:
                lhs = AddOp(lhs, rhs)
            elif oper == MINUS:
                lhs = SubOp(lhs, rhs)
        return lhs
 ## parsed productions are `reduce`d to more useful `AstItem` items which we use
 ## for the actual evaluation/computation
 class AstItem(metaclass=abc.ABCMeta):
    """
    node of the tree that actually gets evaluated.
    the `decode_*` helpers translate a finished ParserContext tree into AstItems.
    """
    @abc.abstractmethod
    def eval(self, context: dict):
        """ compute this node's value; `context` is the environment dict supplied by the caller """
        pass
    @staticmethod
    def decode_item(p: ParserContext) -> 'AstItem':
        """
        translate one ParserContext (and, recursively, whatever it wraps) into an AstItem.
        raises ValueError for a context with no AST equivalent, e.g. the still-empty
        BaseContext sitting on the rhs of an unfinished `1 +`. previously that fell
        through as None and only surfaced later as an AttributeError during eval.
        """
        if isinstance(p, IntegerContext):
            return Literal(AstItem.decode_integer(p))
        if isinstance(p, DurationContext):
            # the integer in front of the `d` suffix is a number of days
            return Literal(timedelta(AstItem.decode_integer(p.value)))
        if isinstance(p, IdentifierContext):
            return Variable(AstItem.decode_identifier(p))
        if isinstance(p, BaseValueContext):
            # value wrappers only matter while parsing; decode what they wrap
            return AstItem.decode_item(p.value)
        if isinstance(p, BinaryOpContext):
            return AstItem.decode_bin_op(
                p.oper.c,
                AstItem.decode_item(p.lhs),
                AstItem.decode_item(p.rhs)
            )
        if isinstance(p, ParenContext):
            # grouping is already encoded in the tree shape; decode the contents
            return AstItem.decode_item(p.inner)
        raise ValueError(f"cannot decode parser context into an AST item: {p!r}")
    @staticmethod
    def decode_integer(p: IntegerContext) -> int:
        """ join the digit tokens and parse them as a base-10 integer """
        return int(''.join(t.c for t in p.tokens))
    @staticmethod
    def decode_identifier(p: IdentifierContext) -> str:
        """ join the identifier's tokens back into its name """
        return ''.join(t.c for t in p.tokens)
    @staticmethod
    def decode_bin_op(ty: str, lhs: 'AstItem', rhs: 'AstItem') -> 'BinaryOp':
        """
        build the operation node for the operator character `ty` ('+', '-' or '*').
        raises ValueError for any other operator instead of silently returning None.
        """
        if ty == '+':
            return AddOp(lhs, rhs)
        if ty == '-':
            return SubOp(lhs, rhs)
        if ty == '*':
            return MulOp(lhs, rhs)
        raise ValueError(f"unknown binary operator: {ty!r}")
 class Literal(AstItem):
    def __init__(self, v):
        self.v = v
@ -429,32 +312,33 @@ class MulOp(BinaryOp):
    def eval(self, context: dict):
        return self.lhs.eval(context) * self.rhs.eval(context)
 ## toplevel routine. tokenize -> parse -> decode to AST -> evaluate
 def tokenize(stream: str) -> list:
    """ wrap every character of `stream` in a Token, preserving order """
    return list(map(Token, stream))
-def parse(ty: Production, tokens: list) -> AstItem:
+def parse(tokens: list) -> ParserContext:
-    ctx = Then([ty, Empty()])
+    parser = Parser(BaseContext())
    for i, t in enumerate(tokens):
-        result = ctx.feed(t)
+        result = parser.feed(t)
        # print(f"i={i}; t={t}; state: {ctx!r}")
-        assert result == ParseCode.CONTINUE, f"unexpected token '{t}' at {i}; state: {ctx!r}"
+        assert result, f"unexpected token '{t}' at {i}; state: {parser.complete()!r}"
-    # feed a trailing EOF which no production should consume.
+    return parser.complete()
    # this either drives the context to a HALT state, if it's expecting
    # some specific other token, or YIELD if it's happy for the stream to be closed.
    assert ctx.feed(EOF) == ParseCode.YIELD, f"incomplete expression: {ctx!r}"
    return ctx.destructure()[0].reduce()
 def evaluate(expr: str) -> object:
    tok = tokenize(expr)
-    expr = parse(Expr(), tok)
+    parse_tree = parse(tok)
-    print(expr)
+    print(parse_tree)
    ast = AstItem.decode_item(parse_tree)
    print(ast)
    env = dict(
        today=datetime.now()
    )
-    return expr.eval(env)
+    return ast.eval(env)
 if __name__ == '__main__':