sane-date-math: convert to LR parser

2022-12-24 05:08:17 +00:00 · 2022-12-24 05:08:17 +00:00 · 16fa1e0eda
commit 16fa1e0eda
parent 51a96525d9
1 changed files with 212 additions and 328 deletions
--- a/pkgs/sane-scripts/src/sane-date-math
+++ b/pkgs/sane-scripts/src/sane-date-math
@@ -2,8 +2,8 @@

 # i just went overboard playing around with parsers, is all.
 # use this like `./sane-date-math 'today - 5d'`
-# of course, it handles parenthesizes and operator precedence, so you can do sillier things like
-# `./sane-date-math '  today - (3*4+1 - ((0)) ) *7d '`
+# of course, it handles parentheses and operator precedence/associativity, so you can do sillier things like
+# `./sane-date-math '  today - (1+3 *4 - ((0)) ) *7d '`


 import abc
@@ -15,8 +15,6 @@ class Token:
        self.c = c

    def __repr__(self) -> str:
-        if self == EOF:
-            return "<EOF>"
        return f"{self.c!r}"

    def __str__(self) -> str:
@@ -25,7 +23,6 @@ class Token:
    def __eq__(self, other: 'Token') -> bool:
        return self.c == other.c

-EOF = Token('\x05')
 PLUS = Token('+')
 MINUS = Token('-')
 ASTERISK = Token('*')
@@ -40,349 +37,235 @@ ALPHA = ALPHA_LOWER + ALPHA_UPPER
 ALPHA_UNDER = ALPHA + [UNDERSCORE]
 ALPHA_NUM_UNDER = ALPHA_UNDER + DIGITS

+class ParserContext:
+    def feed(self, token: Token) -> 'ParserContext':
+        return None  # can't ingest the token
+
+    def upgrade(self) -> 'ParserContext':
+        return None  # no upgrade path
+
+class Parser:
+    """
+    LR parser.
+    keeps exactly one root item, and for each input token
+    feeds it to the root, possibly "upgrading" the root N times
+    before it's able to be fed.
+    """
+    def __init__(self, root: ParserContext):
+        self.root = root
+
+    def feed(self, token: Token) -> bool:
+        new_root = self.root.feed(token)
+        if new_root is not None:
+            self.root = new_root
+            return True
+        else:
+            # root can't directly accept this item.
+            # "upgrade" it and try again.
+            new_root = self.root.upgrade()
+            if new_root is None: return False
+            self.root = new_root
+            return self.feed(token)
+
+    def complete(self) -> ParserContext:
+        # upgrade the root as far as possible before returning
+        root = None
+        new_root = self.root
+        while new_root is not None:
+            root = new_root
+            new_root = root.upgrade()
+
+        return root
+
+class ReprParserContext(ParserContext):
+    """ helper that gives a good default repr to most contexts """
+    def __init__(self, items: list = None):
+        self.items = items if items is not None else []
+
+    def __repr__(self) -> str:
+        return f'{self.__class__.__name__}({self.items!r})'


-# TODO: should be enum
-class ParseCode:
-    # return if the parser cannot parse the provided token
-    HALT = 0
-    # return if the parser is already "complete" and the token should be yielded to the outer context instead
-    YIELD = 1
-    # return is the parser successfully consumed the provided token and parsing should continue
-    CONTINUE = 2
+class BaseContext(ReprParserContext):
+    """ empty context; initial state of the parser """
+    def feed(self, token: Token) -> ParserContext:
+        if token == SPACE:
+            return self
+        if token == OPEN_PAREN:
+            return ParenContext(BaseContext())
+        if token in DIGITS:
+            return IntegerContext([token])
+        if token in ALPHA_UNDER:
+            return IdentifierContext([token])

-class ParserContext(metaclass=abc.ABCMeta):
-    @abc.abstractmethod
-    def feed(self, token: Token) -> ParseCode:
-        """
-        possibly ingests this token, modifying internal state,
-        and providing instruction to the outer parser layer on
-        how to proceed.
-        """
-        pass
+class IdentifierContext(ReprParserContext):
+    """ context is an identifier like `today` """
+    def __init__(self, tokens: list):
+        super().__init__(tokens)
+        self.tokens = tokens

-    def context(self) -> 'ParserContext':
-        """
-        hack to make type-level "Productions" compatible with instance-level "ParserContext"s.
-        """
+    def feed(self, token: Token) -> ParserContext:
+        if token in ALPHA_NUM_UNDER:
+            return IdentifierContext(self.tokens + [token])
+
+    def upgrade(self) -> ParserContext:
+        return StrongValueContext(self)
+
+class IntegerContext(ReprParserContext):
+    """ context is an integer like `45` """
+    def __init__(self, tokens: list):
+        super().__init__(tokens)
+        self.tokens = tokens
+
+    def feed(self, token: Token) -> ParserContext:
+        if token in DIGITS:
+            return IntegerContext(self.tokens + [token])
+        if token == Token('d'):
+            return DurationContext(self)
+
+    def upgrade(self) -> ParserContext:
+        # can't continue the integer; it becomes a value
+        return StrongValueContext(self)
+
+class DurationContext(ReprParserContext):
+    """ context is a duration like `14d` """
+    def __init__(self, value: IntegerContext):
+        super().__init__([value])
+        self.value = value
+
+    def upgrade(self) -> ParserContext:
+        return StrongValueContext(self)
+
+class BaseValueContext(ReprParserContext):
+    """ abstract base for types that can be used in compound expressions """
+    def __init__(self, value: ParserContext):
+        super().__init__([value])
+        self.value = value
+
+    def feed(self, token: Token) -> ParserContext:
+        if token == SPACE:
            return self

-    def destructure(self) -> object:
+class StrongValueContext(BaseValueContext):
    """
-        destructure the outer layer of this ParserContext to obtain access to whatever state it captured.
-        e.g. Then([A, Choice([B, C])]) destructures first to [A, Choice([B, C])].
-        it's not recursive; the inner layers must be manually destructured.
+    in the context of operators, a strong value is something which prefers
+    to not be grabbed by a lhs value.
+
+    so for example, strong values have the opportunity to initiate a multiply operation before the lhs closes an addition operation that this strong value is a part of
    """
-        return self
+    def feed(self, token: Token) -> ParserContext:
+        if token == ASTERISK:
+            return BinaryOpContext(self, token, BaseContext())
+        return super().feed(token)

-class Then(ParserContext):
-    """
-    primitive combinator: given a sequence of parser constructs, parse the input
-    using the first parser until that parser yields, then parse using the second
-    parser, and so on.
-    """
-    def __init__(self, items: list):
-        self.items = [i.context() for i in items]
+    def upgrade(self) -> ParserContext:
+        return WeakValueContext(self.value)

-    def __repr__(self) -> str:
-        return f"Then({self.items!r})"
+class WeakValueContext(BaseValueContext):
+    def feed(self, token: Token) -> ParserContext:
+        if token == PLUS:
+            return BinaryOpContext(self, token, BaseContext())
+        if token == MINUS:
+            return BinaryOpContext(self, token, BaseContext())

-    def __str__(self) -> str:
-        return str(self.items)
+        return super().feed(token)

-    def feed(self, token: Token) -> ParseCode:
-        # we expect parser contexts to be fused: once they YIELD,
-        # they should yield on all future calls as well
-        for i in self.items:
-            result = i.feed(token)
-            if result != ParseCode.YIELD: return result
-        else:
-            # all items are done parsing; so are we!
-            return ParseCode.YIELD
+class BinaryOpContext(ReprParserContext):
+    """ context for a binary operation. the LHS and operator are parsed, but the rhs may not yet contain a value """
+    def __init__(self, lhs: BaseValueContext, oper: Token, rhs: ParserContext):
+        super().__init__([lhs, oper, rhs])
+        self.lhs = lhs
+        self.oper = oper
+        self.rhs = rhs

-    def destructure(self) -> list:
-        return self.items
+    @property
+    def precedence_class(self) -> type:
+        if self.oper in [PLUS, MINUS]:
+            return WeakValueContext
+        if self.oper == ASTERISK:
+            return StrongValueContext

-class Choice(ParserContext):
-    """
-    primitive combinator: try each parser in order and use the first match.
-    NB: there's no lookahead. whichever parser is able to parse the first token
-    is used for the entire stream.
-    """
-    def __init__(self, choices: list):
-        self.choices = choices
-        self.active = None
+    def feed(self, token: Token) -> ParserContext:
+        new_rhs = self.rhs.feed(token)
+        if new_rhs is not None:
+            return BinaryOpContext(self.lhs, self.oper, new_rhs)

-    def __repr__(self) -> str:
-        return f"Choice({self.choices!r})"
+    def upgrade(self) -> ParserContext:
+        new_rhs = self.rhs.upgrade()
+        if new_rhs is None: return None

-    def __str__(self) -> str:
-        if self.active is not None:
-            return str(self.active)
-        else:
-            return repr(self)
+        # upgrade self once the rhs has reached the precedence required by this operator
+        new_self = BinaryOpContext(self.lhs, self.oper, new_rhs)
+        if isinstance(new_rhs, self.precedence_class):
+            return StrongValueContext(self)  # close the operation

-    def feed(self, token: Token) -> ParseCode:
-        if self.active is not None:
-            return self.active.feed(token)
+        return new_self

-        for choice in self.choices:
-            item = choice.context()
-            result = item.feed(token)
-            if result is not ParseCode.HALT:
-                self.active = item
-                return result
+class ParenContext(ReprParserContext):
+    """ context for a value contained within parentheses """
+    def __init__(self, inner: ParserContext):
+        super().__init__([inner])
+        self.inner = inner

-        return ParseCode.HALT  # no matches
+    def feed(self, token: Token) -> ParserContext:
+        new_inner = self.inner.feed(token)
+        if new_inner is not None:
+            return ParenContext(new_inner)

-    def destructure(self) -> ParserContext:
-        return self.active
+        if token == CLOSE_PAREN and isinstance(self.inner, WeakValueContext):
+            return StrongValueContext(self)

-class WantToken(ParserContext):
-    """
-    match a single token out of a list of valid tokens
-    """
-    def __init__(self, want: list):
-        self.has = None
-        self.want = [want] if isinstance(want, Token) else want
-
-    def __repr__(self) -> str:
-        return f"WantToken({self.want!r})"
-
-    def feed(self, token: Token) -> ParseCode:
-        if self.has is not None: return ParseCode.YIELD
-        if token in self.want:
-            self.has = token
-            return ParseCode.CONTINUE
-        return ParseCode.HALT
-
-    def destructure(self) -> Token:
-        return self.has
-
-class Empty(ParserContext):
-    """
-    used as a terminal to allow for constructs like `optional`
-    """
-    def feed(self, token: Token) -> ParseCode:
-        return ParseCode.YIELD
-
-    def destructure(self) -> None:
-        return None
-
-def optional(context: ParserContext) -> ParserContext:
-    return Choice([context, Empty()])
+    def upgrade(self) -> ParserContext:
+        new_inner = self.inner.upgrade()
+        if new_inner is not None:
+            return ParenContext(new_inner)


-## "Productions" sit on top of these base ParserContexts in order to give names to
-## large token sequences and to "reduce" them into AST types more intelligently.
-
-class ProductionContext(ParserContext):
-    """
-    this adapts from the Production system of specification to the ParserContext system.
-    this is instantiated for high-level productions where we specify a grammar
-    and then parse "all in one go", sealing away incomplete state, and converting
-    the parsed tokens into actually useful abstractions (like signed numbers).
-    """
-    def __init__(self, production: 'Production', context: ParserContext = None):
-        self.production = production
-        self.context = context if context is not None else production.grammar()
-
-    def __repr__(self) -> str:
-        return f"ProductionContext({self.production!r}, {self.context!r})"
-
-    def __str__(self) -> str:
-        return str(self.context)
-
-    def feed(self, token: Token) -> ParseCode:
-        return self.context.feed(token)
-
-    def reduce_inner(self, inner: ParserContext):
-        if isinstance(inner, ProductionContext):
-            return inner.reduce()  # easy
-        elif isinstance(inner, ParserContext):
-            return self.reduce_inner(inner.destructure())
-        elif isinstance(inner, list):  # happens via unpacking of Then objects
-            return [self.reduce_inner(i) for i in inner]
-        else:
-            return inner
-
-    def reduce(self) -> object:
-        # XXX this ends up being a leaf -> root reduction,
-        # which generally makes it harder to achieve detailed control when nesting.
-        return self.production.reduce(self.reduce_inner(self.context))
-
-class Production:
-    """
-    non-generic, likely multi-token productions,
-    specified in terms of other Productions and the above primitives
-    """
-    def grammar(self) -> ParserContext:
-        raise NotImplementedError()
-
-    def context(self) -> ParserContext:
-        return ProductionContext(self)
-
-    def reduce(self, inner: object) -> object:
-        """
-        use to construct the outer types out of already-converted inner types.
-        e.g. Number = Then([optional(Minus), Digits, optional(Suffix)])
-            gets called with reduce([a, b, c]), where a is the already reduced `optional(Minus)`,
-            i.e. `None` or whatever type corresponds to the Minus token.
-        """
-        return inner
-
-class DigitProduction(Production):
-    """ one digit token """
-    def grammar(self) -> ParserContext:
-        return WantToken(DIGITS)
-
-    def reduce(self, inner: Token) -> int:
-        return int(inner.c)
-
-class IntProduction(Production):
-    """ multi-digit integer """
-    def grammar(self) -> ParserContext:
-        return Then([
-            DigitProduction(),
-            optional(IntProduction()),
-        ])
-
-    def reduce(self, inner: list) -> int:
-        # TODO: wrong associativity
-        leading, trailing = inner
-        if trailing is None:
-            return leading
-        else:
-            return leading*10 + trailing
-
-class DurationOrIntProduction(Production):
-    # due to a lack of lookahead, we combine duration and int parsing into one production
-    # because a duration shares a complete int as prefix
-    def grammar(self) -> ParserContext:
-        return Then([
-            IntProduction(),
-            optional(WantToken(Token('d'))),
-        ])
-
-    def reduce(self, inner: list) -> 'Literal':
-        value, suffix = inner
-        if suffix is None:
-            return Literal(value)
-        else:
-            return Literal(timedelta(value))
-
-class Whitespace(Production):
-    def grammar(self) -> ParserContext:
-        return Then([
-            WantToken(SPACE),
-            optional(Whitespace()),
-        ])
-
-class ParenthesizedExpr(Production):
-    def grammar(self) -> ParserContext:
-        return Then([
-            WantToken(OPEN_PAREN),
-            Expr(),
-            WantToken(CLOSE_PAREN),
-        ])
-
-    def reduce(self, inner: list) -> 'AstItem':
-        open, expr, close = inner
-        return expr
-
-class IdentifierTail(Production):
-    def grammar(self) -> ParserContext:
-        return Then([
-            WantToken(ALPHA_NUM_UNDER),
-            optional(IdentifierTail()),
-        ])
-
-
-class Identifier(Production):
-    """ variable-style identifier, e.g. 'TODAY' """
-    def grammar(self) -> ParserContext:
-        return Then([
-            WantToken(ALPHA_UNDER),
-            optional(IdentifierTail()),
-        ])
-
-    def reduce(self, inner: list) -> 'Literal':
-        # fold the tokens into a string
-        first, rest = inner
-        head = first.c
-        while rest is not None:
-            next, rest = rest
-            head += next.c
-        return Variable(head)
-
-class UnaryExpr(Production):
-    """ some expression which does not invoke any operators at the outermost level """
-    def grammar(self) -> ParserContext:
-        return Then([
-            optional(Whitespace()),
-            Choice([
-                DurationOrIntProduction(),
-                Identifier(),
-                ParenthesizedExpr(),
-            ]),
-            optional(Whitespace()),
-        ])
-
-    def reduce(self, inner: list):
-        # drop the whitespace
-        leading, primary, trailing = inner
-        return primary
-
-class ExprRHS(Production):
-    """ right hand side of a binary operation """
-    def grammar(self) -> ParserContext:
-        return Then([
-            Choice([WantToken(ASTERISK), WantToken(PLUS), WantToken(MINUS)]),
-            # remaining, is just another `Expr`, but we need to keep the fields expanded here to control precedence.
-            UnaryExpr(),
-            Choice([ExprRHS(), Empty()]),
-        ])
-
-class Expr(Production):
-    """ this is the top-level production """
-    def grammar(self) -> ParserContext:
-        return Then([
-            UnaryExpr(),
-            Choice([ExprRHS(), Empty()])
-        ])
-
-    def reduce(self, inner: list):
-        lhs, rhs = inner
-        if rhs is None: return lhs
-
-        # convert the whole right-hand-side of the tree, iteratively.
-        oper, rhs, rhs_next = rhs
-        if oper == ASTERISK:
-            # multiplication has high precedence and we grab the adjacent token ASAP
-            lhs = MulOp(lhs, rhs)
-            if rhs_next is not None:
-                lhs = self.reduce([lhs, rhs_next])
-        else:
-            # reduce the rhs and *then* apply this operator
-            if rhs_next is not None:
-                rhs = self.reduce([rhs, rhs_next])
-
-            if oper == PLUS:
-                lhs = AddOp(lhs, rhs)
-            elif oper == MINUS:
-                lhs = SubOp(lhs, rhs)
-
-        return lhs
-
-
-## parsed productions are `reduce`d to more useful `AstItem` items which we use
-## for the actual evaluation/computation
+## AstItems are produced from a ParserContext input
+## ParserContext parse outputs are translated into `AstItem`s before evaluation
+## so that we can operate on a higher-level tree that directly encodes native values like integers

 class AstItem(metaclass=abc.ABCMeta):
    @abc.abstractmethod
    def eval(self, context: dict):
        pass

+    @staticmethod
+    def decode_item(p: ParserContext) -> 'AstItem':
+        if isinstance(p, IntegerContext):
+            return Literal(AstItem.decode_integer(p))
+        if isinstance(p, DurationContext):
+            return Literal(timedelta(AstItem.decode_integer(p.value)))
+        if isinstance(p, IdentifierContext):
+            return Variable(AstItem.decode_identifier(p))
+        if isinstance(p, BaseValueContext):
+            return AstItem.decode_item(p.value)
+        if isinstance(p, BinaryOpContext):
+            return AstItem.decode_bin_op(
+                p.oper.c,
+                AstItem.decode_item(p.lhs),
+                AstItem.decode_item(p.rhs)
+            )
+        if isinstance(p, ParenContext):
+            return AstItem.decode_item(p.inner)
+
+    @staticmethod
+    def decode_integer(p: IntegerContext) -> int:
+        return int(''.join(t.c for t in p.tokens))
+
+    @staticmethod
+    def decode_identifier(p: IdentifierContext) -> str:
+        return ''.join(t.c for t in p.tokens)
+
+    @staticmethod
+    def decode_bin_op(ty: str, lhs: 'AstItem', rhs: 'AstItem') -> 'BinaryOp':
+        if ty == '+':
+            return AddOp(lhs, rhs)
+        if ty == '-':
+            return SubOp(lhs, rhs)
+        if ty == '*':
+            return MulOp(lhs, rhs)
+
 class Literal(AstItem):
    def __init__(self, v):
        self.v = v
@@ -429,32 +312,33 @@ class MulOp(BinaryOp):
    def eval(self, context: dict):
        return self.lhs.eval(context) * self.rhs.eval(context)

+
+## toplevel routine. tokenize -> parse -> decode to AST -> evaluate
+
 def tokenize(stream: str) -> list:
    return [Token(char) for char in stream]

-def parse(ty: Production, tokens: list) -> AstItem:
-    ctx = Then([ty, Empty()])
+def parse(tokens: list) -> ParserContext:
+    parser = Parser(BaseContext())
    for i, t in enumerate(tokens):
-        result = ctx.feed(t)
+        result = parser.feed(t)
        # print(f"i={i}; t={t}; state: {ctx!r}")
-        assert result == ParseCode.CONTINUE, f"unexpected token '{t}' at {i}; state: {ctx!r}"
+        assert result, f"unexpected token '{t}' at {i}; state: {parser.complete()!r}"

-    # feed a trailing EOF which no production should consume.
-    # this either drives the context to a HALT state, if it's expecting
-    # some specific other token, or YIELD if it's happy for the stream to be closed.
-    assert ctx.feed(EOF) == ParseCode.YIELD, f"incomplete expression: {ctx!r}"
-
-    return ctx.destructure()[0].reduce()
+    return parser.complete()


 def evaluate(expr: str) -> object:
    tok = tokenize(expr)
-    expr = parse(Expr(), tok)
-    print(expr)
+    parse_tree = parse(tok)
+    print(parse_tree)
+    ast = AstItem.decode_item(parse_tree)
+    print(ast)
+
    env = dict(
        today=datetime.now()
    )
-    return expr.eval(env)
+    return ast.eval(env)


 if __name__ == '__main__':