sane-date-math: convert to LR parser

This commit is contained in:
colin 2022-12-24 05:08:17 +00:00
parent 51a96525d9
commit 16fa1e0eda

View File

@ -2,8 +2,8 @@
# i just went overboard playing around with parsers, is all.
# use this like `./sane-date-math 'today - 5d'`
# of course, it handles parenthesizes and operator precedence, so you can do sillier things like
# `./sane-date-math ' today - (3*4+1 - ((0)) ) *7d '`
# of course, it handles parentheses and operator precedence/associativity, so you can do sillier things like
# `./sane-date-math ' today - (1+3 *4 - ((0)) ) *7d '`
import abc
@ -15,8 +15,6 @@ class Token:
self.c = c
def __repr__(self) -> str:
if self == EOF:
return "<EOF>"
return f"{self.c!r}"
def __str__(self) -> str:
@ -25,7 +23,6 @@ class Token:
def __eq__(self, other: 'Token') -> bool:
return self.c == other.c
EOF = Token('\x05')
PLUS = Token('+')
MINUS = Token('-')
ASTERISK = Token('*')
@ -40,349 +37,235 @@ ALPHA = ALPHA_LOWER + ALPHA_UPPER
ALPHA_UNDER = ALPHA + [UNDERSCORE]
ALPHA_NUM_UNDER = ALPHA_UNDER + DIGITS
class ParserContext:
    """
    base class for parser states.
    by default a context can neither ingest a token nor be upgraded;
    subclasses override whichever of these they support.
    """
    def feed(self, token: Token) -> 'ParserContext':
        """ the context which results from ingesting `token`, or None if it can't be ingested """
        return None

    def upgrade(self) -> 'ParserContext':
        """ the upgraded form of this context, or None if there is no upgrade path """
        return None
# TODO: should be enum
class ParseCode:
    """
    instruction handed back by a parser's `feed` to the outer parser layer.

    HALT (0):     the parser cannot parse the provided token.
    YIELD (1):    the parser is already "complete"; the token should be
                  yielded to the outer context instead.
    CONTINUE (2): the parser successfully consumed the provided token and
                  parsing should continue.
    """
    HALT, YIELD, CONTINUE = range(3)
class ParserContext(metaclass=abc.ABCMeta):
    """ abstract interface for anything which is fed tokens one at a time """

    @abc.abstractmethod
    def feed(self, token: Token) -> ParseCode:
        """
        offer `token` to this context. the context may ingest it (modifying
        its internal state); the returned code instructs the outer parser
        layer on how to proceed.
        """

    def context(self) -> 'ParserContext':
        """
        a context is already its own context.
        this exists as a hack so that type-level "Productions" and
        instance-level "ParserContext"s can be used interchangeably.
        """
        return self

    def destructure(self) -> object:
        """
        peel off the outer layer of this ParserContext, exposing whatever state it captured.
        e.g. Then([A, Choice([B, C])]) destructures first to [A, Choice([B, C])].
        this is NOT recursive: inner layers must be destructured manually.
        the default is to return the context itself.
        """
        return self
class Then(ParserContext):
class Parser:
"""
primitive combinator: given a sequence of parser constructs, parse the input
using the first parser until that parser yields, then parse using the second
parser, and so on.
LR parser.
keeps exactly one root item, and for each input token
feeds it to the root, possibly "upgrading" the root N times
before it's able to be fed.
"""
def __init__(self, items: list):
self.items = [i.context() for i in items]
def __init__(self, root: ParserContext):
self.root = root
def feed(self, token: Token) -> bool:
new_root = self.root.feed(token)
if new_root is not None:
self.root = new_root
return True
else:
# root can't directly accept this item.
# "upgrade" it and try again.
new_root = self.root.upgrade()
if new_root is None: return False
self.root = new_root
return self.feed(token)
def complete(self) -> ParserContext:
# upgrade the root as far as possible before returning
root = None
new_root = self.root
while new_root is not None:
root = new_root
new_root = root.upgrade()
return root
class ReprParserContext(ParserContext):
""" helper that gives a good default repr to most contexts """
def __init__(self, items: list = None):
self.items = items if items is not None else []
def __repr__(self) -> str:
return f"Then({self.items!r})"
return f'{self.__class__.__name__}({self.items!r})'
def __str__(self) -> str:
return str(self.items)
def feed(self, token: Token) -> ParseCode:
# we expect parser contexts to be fused: once they YIELD,
# they should yield on all future calls as well
for i in self.items:
result = i.feed(token)
if result != ParseCode.YIELD: return result
else:
# all items are done parsing; so are we!
return ParseCode.YIELD
class BaseContext(ReprParserContext):
    """ empty context; initial state of the parser """
    def feed(self, token: Token) -> ParserContext:
        """
        start a new value based on its first token.
        returns None (the token can't be ingested) for anything else.
        """
        successor = None
        if token == SPACE:
            # whitespace before a value is skipped
            successor = self
        elif token == OPEN_PAREN:
            # a parenthesized value starts out empty
            successor = ParenContext(BaseContext())
        elif token in DIGITS:
            successor = IntegerContext([token])
        elif token in ALPHA_UNDER:
            successor = IdentifierContext([token])
        return successor
def destructure(self) -> list:
return self.items
class IdentifierContext(ReprParserContext):
""" context is an identifier like `today` """
def __init__(self, tokens: list):
super().__init__(tokens)
self.tokens = tokens
class Choice(ParserContext):
def feed(self, token: Token) -> ParserContext:
if token in ALPHA_NUM_UNDER:
return IdentifierContext(self.tokens + [token])
def upgrade(self) -> ParserContext:
return StrongValueContext(self)
class IntegerContext(ReprParserContext):
    """ context is an integer like `45` """
    def __init__(self, tokens: list):
        # `tokens` are the digit tokens ingested so far, in input order
        super().__init__(tokens)
        self.tokens = tokens

    def feed(self, token: Token) -> ParserContext:
        """
        a further digit extends the integer; a `d` suffix turns it into a duration.
        returns None for any other token.
        """
        successor = None
        if token in DIGITS:
            successor = IntegerContext([*self.tokens, token])
        elif token == Token('d'):
            successor = DurationContext(self)
        return successor

    def upgrade(self) -> ParserContext:
        """ the integer can't be continued, so it becomes a value """
        return StrongValueContext(self)
class DurationContext(ReprParserContext):
    """ context is a duration like `14d` """
    def __init__(self, value: IntegerContext):
        # `value` is the integer which preceded the `d` suffix
        self.value = value
        super().__init__([value])

    def upgrade(self) -> ParserContext:
        """ a duration accepts no further tokens (no `feed` override); it becomes a value """
        return StrongValueContext(self)
class BaseValueContext(ReprParserContext):
    """ abstract base for types that can be used in compound expressions """
    def __init__(self, value: ParserContext):
        # `value` is the wrapped context
        self.value = value
        super().__init__([value])

    def feed(self, token: Token) -> ParserContext:
        """ whitespace after a value is skipped; any other token is not ingested (None) """
        return self if token == SPACE else None
class StrongValueContext(BaseValueContext):
"""
primitive combinator: try each parser in order and use the first match.
NB: there's no lookahead. whichever parser is able to parse the first token
is used for the entire stream.
in the context of operators, a strong value is something which prefers
to not be grabbed by a lhs value.
so for example, strong values have the opportunity to initiate a multiply operation before the lhs closes an addition operation that this strong value is a part of
"""
def __init__(self, choices: list):
self.choices = choices
self.active = None
def feed(self, token: Token) -> ParserContext:
if token == ASTERISK:
return BinaryOpContext(self, token, BaseContext())
return super().feed(token)
def __repr__(self) -> str:
return f"Choice({self.choices!r})"
def upgrade(self) -> ParserContext:
return WeakValueContext(self.value)
def __str__(self) -> str:
if self.active is not None:
return str(self.active)
else:
return repr(self)
class WeakValueContext(BaseValueContext):
def feed(self, token: Token) -> ParserContext:
if token == PLUS:
return BinaryOpContext(self, token, BaseContext())
if token == MINUS:
return BinaryOpContext(self, token, BaseContext())
def feed(self, token: Token) -> ParseCode:
if self.active is not None:
return self.active.feed(token)
return super().feed(token)
for choice in self.choices:
item = choice.context()
result = item.feed(token)
if result is not ParseCode.HALT:
self.active = item
return result
class BinaryOpContext(ReprParserContext):
""" context for a binary operation. the LHS and operator are parsed, but the rhs may not yet contain a value """
def __init__(self, lhs: BaseValueContext, oper: Token, rhs: ParserContext):
super().__init__([lhs, oper, rhs])
self.lhs = lhs
self.oper = oper
self.rhs = rhs
return ParseCode.HALT # no matches
@property
def precedence_class(self) -> type:
if self.oper in [PLUS, MINUS]:
return WeakValueContext
if self.oper == ASTERISK:
return StrongValueContext
def destructure(self) -> ParserContext:
return self.active
def feed(self, token: Token) -> ParserContext:
new_rhs = self.rhs.feed(token)
if new_rhs is not None:
return BinaryOpContext(self.lhs, self.oper, new_rhs)
class WantToken(ParserContext):
"""
match a single token out of a list of valid tokens
"""
def __init__(self, want: list):
self.has = None
self.want = [want] if isinstance(want, Token) else want
def upgrade(self) -> ParserContext:
new_rhs = self.rhs.upgrade()
if new_rhs is None: return None
def __repr__(self) -> str:
return f"WantToken({self.want!r})"
# upgrade self once the rhs has reached the required precedence compatible with this operator
new_self = BinaryOpContext(self.lhs, self.oper, new_rhs)
if isinstance(new_rhs, self.precedence_class):
return StrongValueContext(self) # close the operation
def feed(self, token: Token) -> ParseCode:
if self.has is not None: return ParseCode.YIELD
if token in self.want:
self.has = token
return ParseCode.CONTINUE
return ParseCode.HALT
return new_self
def destructure(self) -> Token:
return self.has
class ParenContext(ReprParserContext):
""" context for a value contained within parentheses """
def __init__(self, inner: ParserContext):
super().__init__([inner])
self.inner = inner
class Empty(ParserContext):
"""
used as a terminal to allow for constructs like `optional`
"""
def feed(self, token: Token) -> ParseCode:
return ParseCode.YIELD
def feed(self, token: Token) -> ParserContext:
new_inner = self.inner.feed(token)
if new_inner is not None:
return ParenContext(new_inner)
def destructure(self) -> None:
return None
if token == CLOSE_PAREN and isinstance(self.inner, WeakValueContext):
return StrongValueContext(self)
def optional(context: ParserContext) -> ParserContext:
return Choice([context, Empty()])
def upgrade(self) -> ParserContext:
new_inner = self.inner.upgrade()
if new_inner is not None:
return ParenContext(new_inner)
## "Productions" sit on top of these base ParserContexts in order to give names to
## large token sequences and to "reduce" them into AST types more intelligently.
class ProductionContext(ParserContext):
    """
    adapter from the Production system of specification to the ParserContext system.
    instantiated for high-level productions, where a grammar is specified
    and then parsed "all in one go": incomplete state is sealed away, and the
    parsed tokens are converted into actually useful abstractions (like signed numbers).
    """
    def __init__(self, production: 'Production', context: ParserContext = None):
        self.production = production
        # NOTE: this attribute shadows the `context()` method inherited from ParserContext
        if context is None:
            # no explicit context: start from the production's own grammar
            context = production.grammar()
        self.context = context

    def __repr__(self) -> str:
        return f"ProductionContext({self.production!r}, {self.context!r})"

    def __str__(self) -> str:
        return str(self.context)

    def feed(self, token: Token) -> ParseCode:
        """ tokens are parsed entirely by the wrapped context """
        return self.context.feed(token)

    def reduce_inner(self, inner: ParserContext):
        """
        recursively convert whatever `inner` captured into reduced values.
        nested productions reduce themselves; other contexts are destructured
        and converted again; lists are converted element-wise; anything else
        (e.g. a Token or None) is passed through untouched.
        """
        # checked before ParserContext, because a ProductionContext is also a ParserContext
        if isinstance(inner, ProductionContext):
            return inner.reduce()
        if isinstance(inner, ParserContext):
            return self.reduce_inner(inner.destructure())
        if isinstance(inner, list):
            # happens via unpacking of Then objects
            return [self.reduce_inner(element) for element in inner]
        return inner

    def reduce(self) -> object:
        """ reduce the wrapped context, then hand the result to the production's own `reduce` """
        # XXX this ends up being a leaf -> root reduction,
        # which generally makes it harder to achieve detailed control when nesting.
        reduced_inner = self.reduce_inner(self.context)
        return self.production.reduce(reduced_inner)
class Production:
    """
    non-generic, likely multi-token productions,
    specified in terms of other Productions and the primitive ParserContexts
    """
    def grammar(self) -> ParserContext:
        """ the parser construct which recognizes this production. subclasses must override. """
        raise NotImplementedError()

    def context(self) -> ParserContext:
        """ a fresh ProductionContext for parsing one occurrence of this production """
        return ProductionContext(self)

    def reduce(self, inner: object) -> object:
        """
        build the outer type out of already-converted inner types.
        e.g. Number = Then([optional(Minus), Digits, optional(Suffix)])
        gets called with reduce([a, b, c]), where a is the already reduced `optional(Minus)`,
        i.e. `None` or whatever type corresponds to the Minus token.

        the default performs no conversion and returns `inner` as-is.
        """
        return inner
class DigitProduction(Production):
    """ one digit token """
    def grammar(self) -> ParserContext:
        """ matches any single token out of DIGITS """
        return WantToken(DIGITS)

    def reduce(self, inner: Token) -> int:
        """ convert the matched token's character into its integer value """
        digit_char = inner.c
        return int(digit_char)
class IntProduction(Production):
    """ multi-digit integer """

    class _DigitRun(int):
        """
        an int which remembers how many digit tokens it was parsed from.
        the width can't be recovered from the value alone: `05` and `5` are
        the same value but shift a leading digit by different amounts.
        """
        def __new__(cls, value: int, width: int):
            run = super().__new__(cls, value)
            run.width = width
            return run

    def grammar(self) -> ParserContext:
        """ one digit, optionally followed by more digits (right-recursive) """
        return Then([
            DigitProduction(),
            optional(IntProduction()),
        ])

    def reduce(self, inner: list) -> int:
        """
        combine the leading digit with the already-reduced tail of digits.

        `inner` is `[leading, trailing]`: `leading` is the int value of the first
        digit; `trailing` is the reduced remainder of the integer, or None if
        there were no further digits.

        the result is an int (subclass) which additionally carries the number
        of digits consumed, so that an enclosing IntProduction can shift its
        own leading digit by the right power of ten.
        """
        leading, trailing = inner
        if trailing is None:
            return self._DigitRun(leading, 1)

        width = getattr(trailing, 'width', None)
        if width is None:
            # tail was given as a plain int: best effort, assumes no leading zeros
            width = len(str(trailing))
        # the leading digit is more significant than *every* digit in the tail
        return self._DigitRun(leading * 10**width + trailing, width + 1)
class DurationOrIntProduction(Production):
    """
    an integer, optionally suffixed with `d` to make it a duration.

    due to a lack of lookahead, duration and int parsing are combined into
    this one production: a duration shares a complete int as prefix.
    """
    def grammar(self) -> ParserContext:
        number = IntProduction()
        day_suffix = optional(WantToken(Token('d')))
        return Then([number, day_suffix])

    def reduce(self, inner: list) -> 'Literal':
        """ `inner` is `[value, suffix]`; `suffix` is None when there was no trailing `d` """
        value, suffix = inner
        # with the suffix, the integer is passed as timedelta's first argument
        payload = value if suffix is None else timedelta(value)
        return Literal(payload)
class Whitespace(Production):
    """ one or more consecutive SPACE tokens """
    def grammar(self) -> ParserContext:
        # right-recursive: one space, then optionally more whitespace
        one_space = WantToken(SPACE)
        more_space = optional(Whitespace())
        return Then([one_space, more_space])
class ParenthesizedExpr(Production):
    """ a complete expression wrapped in a pair of parentheses """
    def grammar(self) -> ParserContext:
        opening = WantToken(OPEN_PAREN)
        body = Expr()
        closing = WantToken(CLOSE_PAREN)
        return Then([opening, body, closing])

    def reduce(self, inner: list) -> 'AstItem':
        """ `inner` is `[open paren, expr, close paren]`; only the expression is kept """
        _opening, expr, _closing = inner
        return expr
class IdentifierTail(Production):
    """ one or more ALPHA_NUM_UNDER tokens: everything after an identifier's first character """
    def grammar(self) -> ParserContext:
        # right-recursive: one character, then optionally more of the tail.
        # there's no `reduce` override, so this reduces to a nested `[token, rest]` list.
        one_char = WantToken(ALPHA_NUM_UNDER)
        more_chars = optional(IdentifierTail())
        return Then([one_char, more_chars])
class Identifier(Production):
    """ variable-style identifier, e.g. 'TODAY' """
    def grammar(self) -> ParserContext:
        """ a first token out of ALPHA_UNDER, optionally followed by an IdentifierTail """
        return Then([
            WantToken(ALPHA_UNDER),
            optional(IdentifierTail()),
        ])

    def reduce(self, inner: list) -> 'Variable':
        """
        fold the matched tokens into a string and wrap it as a Variable.

        `inner` is `[first, rest]`, where `rest` is either None or a nested
        `[token, rest]` pair as produced by IdentifierTail.
        """
        first, rest = inner
        chars = [first.c]
        # walk the right-nested tail, collecting one character per level
        while rest is not None:
            token, rest = rest
            chars.append(token.c)
        return Variable(''.join(chars))
class UnaryExpr(Production):
    """ some expression which does not invoke any operators at the outermost level """
    def grammar(self) -> ParserContext:
        # the primary value, with optional whitespace on either side
        primary = Choice([
            DurationOrIntProduction(),
            Identifier(),
            ParenthesizedExpr(),
        ])
        return Then([
            optional(Whitespace()),
            primary,
            optional(Whitespace()),
        ])

    def reduce(self, inner: list):
        """ `inner` is `[leading whitespace, primary, trailing whitespace]`; the whitespace is dropped """
        _leading, primary, _trailing = inner
        return primary
class ExprRHS(Production):
    """ right hand side of a binary operation """
    def grammar(self) -> ParserContext:
        operator = Choice([WantToken(ASTERISK), WantToken(PLUS), WantToken(MINUS)])
        # what follows the operator is just another `Expr`, but the fields are
        # kept expanded here in order to control precedence.
        operand = UnaryExpr()
        continuation = optional(ExprRHS())
        return Then([operator, operand, continuation])
class Expr(Production):
    """ this is the top-level production """
    def grammar(self) -> ParserContext:
        """ a unary expression, optionally followed by a chain of `operator operand` pairs """
        return Then([
            UnaryExpr(),
            Choice([ExprRHS(), Empty()])
        ])

    def reduce(self, inner: list):
        """
        fold the operand chain into an AST, honoring precedence and associativity.

        `inner` is `[lhs, rhs]`, where `rhs` is either None or a right-nested
        `[oper, operand, rhs_next]` triple as produced by ExprRHS.

        multiplication binds tighter than addition/subtraction, and operators
        of equal precedence associate to the left: `a - b - c` is `(a - b) - c`.
        """
        first, tail = inner
        total = None    # everything to the left of the most recent `+`/`-`
        pending = None  # the `+`/`-` token which joins `total` to `term`
        term = first    # the product currently being accumulated
        while tail is not None:
            oper, operand, tail = tail
            if oper == ASTERISK:
                # multiplication has high precedence: grab the adjacent operand ASAP
                term = MulOp(term, operand)
            else:
                # the current product is complete; fold it into the running total
                total = self._combine(total, pending, term)
                pending, term = oper, operand
        return self._combine(total, pending, term)

    @staticmethod
    def _combine(total, oper, term):
        """ join `total` and `term` with the additive operator `oper`; None means there is no total yet """
        if oper is None:
            return term
        if oper == PLUS:
            return AddOp(total, term)
        if oper == MINUS:
            return SubOp(total, term)
        raise ValueError(f"unsupported operator: {oper!r}")
## parsed productions are `reduce`d to more useful `AstItem` items which we use
## for the actual evaluation/computation
## AstItems are produced from a ParserContext input
## ParserContext parse outputs are translated into `AstItem`s before evaluation
## so that we can operate on a higher-level tree that directly encodes native values like integers
class AstItem(metaclass=abc.ABCMeta):
    """
    node of the tree which is actually evaluated.
    the `decode_*` helpers translate ParserContext parse output into AstItems.
    """
    @abc.abstractmethod
    def eval(self, context: dict):
        """ compute this item's value; `context` is the dict of variables passed in by the caller """
        pass

    @staticmethod
    def decode_item(p: ParserContext) -> 'AstItem':
        """
        translate a parse tree into the equivalent AstItem tree.

        raises ValueError if `p` (or anything nested within it) is not a
        decodable value, e.g. when the input was empty or ended right after an operator.
        """
        if isinstance(p, IntegerContext):
            return Literal(AstItem.decode_integer(p))
        if isinstance(p, DurationContext):
            # the integer is passed as timedelta's first argument
            return Literal(timedelta(AstItem.decode_integer(p.value)))
        if isinstance(p, IdentifierContext):
            return Variable(AstItem.decode_identifier(p))
        if isinstance(p, BaseValueContext):
            # value wrappers only matter during parsing; look through them
            return AstItem.decode_item(p.value)
        if isinstance(p, BinaryOpContext):
            return AstItem.decode_bin_op(
                p.oper.c,
                AstItem.decode_item(p.lhs),
                AstItem.decode_item(p.rhs)
            )
        if isinstance(p, ParenContext):
            return AstItem.decode_item(p.inner)
        # fail here rather than returning None and crashing later inside `eval`
        raise ValueError(f"cannot decode incomplete or unrecognized parse state: {p!r}")

    @staticmethod
    def decode_integer(p: IntegerContext) -> int:
        """ join the digit tokens and parse them as a base-10 integer """
        return int(''.join(t.c for t in p.tokens))

    @staticmethod
    def decode_identifier(p: IdentifierContext) -> str:
        """ join the identifier's tokens into its name """
        return ''.join(t.c for t in p.tokens)

    @staticmethod
    def decode_bin_op(ty: str, lhs: 'AstItem', rhs: 'AstItem') -> 'BinaryOp':
        """
        build the operation node for the operator character `ty` ('+', '-' or '*').
        raises ValueError for any other operator.
        """
        if ty == '+':
            return AddOp(lhs, rhs)
        if ty == '-':
            return SubOp(lhs, rhs)
        if ty == '*':
            return MulOp(lhs, rhs)
        raise ValueError(f"unsupported binary operator: {ty!r}")
class Literal(AstItem):
def __init__(self, v):
self.v = v
@ -429,32 +312,33 @@ class MulOp(BinaryOp):
def eval(self, context: dict):
return self.lhs.eval(context) * self.rhs.eval(context)
## toplevel routine. tokenize -> parse -> decode to AST -> evaluate
def tokenize(stream: str) -> list:
    """ split the input into one Token per character, whitespace included """
    return list(map(Token, stream))
def parse(ty: Production, tokens: list) -> AstItem:
ctx = Then([ty, Empty()])
def parse(tokens: list) -> ParserContext:
parser = Parser(BaseContext())
for i, t in enumerate(tokens):
result = ctx.feed(t)
result = parser.feed(t)
# print(f"i={i}; t={t}; state: {ctx!r}")
assert result == ParseCode.CONTINUE, f"unexpected token '{t}' at {i}; state: {ctx!r}"
assert result, f"unexpected token '{t}' at {i}; state: {parser.complete()!r}"
# feed a trailing EOF which no production should consume.
# this either drives the context to a HALT state, if it's expecting
# some specific other token, or YIELD if it's happy for the stream to be closed.
assert ctx.feed(EOF) == ParseCode.YIELD, f"incomplete expression: {ctx!r}"
return ctx.destructure()[0].reduce()
return parser.complete()
def evaluate(expr: str) -> object:
tok = tokenize(expr)
expr = parse(Expr(), tok)
print(expr)
parse_tree = parse(tok)
print(parse_tree)
ast = AstItem.decode_item(parse_tree)
print(ast)
env = dict(
today=datetime.now()
)
return expr.eval(env)
return ast.eval(env)
if __name__ == '__main__':