sane-date-math: convert to LR parser

This commit is contained in:
colin 2022-12-24 05:08:17 +00:00
parent 51a96525d9
commit 16fa1e0eda

View File

@ -2,8 +2,8 @@
# i just went overboard playing around with parsers, is all.
# use this like `./sane-date-math 'today - 5d'`
# of course, it handles parenthesizes and operator precedence, so you can do sillier things like
# `./sane-date-math ' today - (3*4+1 - ((0)) ) *7d '`
# of course, it handles parentheses and operator precedence/associativity, so you can do sillier things like
# `./sane-date-math ' today - (1+3 *4 - ((0)) ) *7d '`
import abc
import enum
@ -15,8 +15,6 @@ class Token:
self.c = c
def __repr__(self) -> str:
if self == EOF:
return "<EOF>"
return f"{self.c!r}"
def __str__(self) -> str:
@ -25,7 +23,6 @@ class Token:
def __eq__(self, other: object) -> bool:
    """
    two tokens are equal when they wrap the same character.
    anything that isn't a Token is handed back to python's default comparison
    (NotImplemented) rather than blowing up with AttributeError on `other.c`.
    """
    if not isinstance(other, Token):
        return NotImplemented
    return self.c == other.c
EOF = Token('\x05')
PLUS = Token('+')
MINUS = Token('-')
ASTERISK = Token('*')
@ -40,349 +37,235 @@ ALPHA = ALPHA_LOWER + ALPHA_UPPER
ALPHA_UNDER = ALPHA + [UNDERSCORE]
ALPHA_NUM_UNDER = ALPHA_UNDER + DIGITS
class ParserContext:
    """
    base parser state. subclasses override `feed` and/or `upgrade`;
    the defaults accept no token and offer nothing to upgrade into.
    """
    def feed(self, token: Token) -> 'ParserContext':
        """ returns the successor state, or None when the token can't be ingested """
        return None

    def upgrade(self) -> 'ParserContext':
        """ returns the upgraded state, or None when there is no upgrade path """
        return None
class Parser:
    """
    LR parser.
    holds a single root context. every input token is offered to the root;
    when the root refuses it, the root is "upgraded" and the token is offered
    again, repeating until the token is accepted or no upgrade remains.
    """
    def __init__(self, root: ParserContext):
        self.root = root

    def feed(self, token: Token) -> bool:
        """
        offer `token` to the root, upgrading the root as often as needed.
        returns True once the token is consumed, False if the root runs out
        of upgrades first (any upgrades already applied are kept).
        """
        while True:
            fed = self.root.feed(token)
            if fed is not None:
                self.root = fed
                return True
            # root can't directly accept this item: "upgrade" it and retry
            upgraded = self.root.upgrade()
            if upgraded is None:
                return False
            self.root = upgraded

    def complete(self) -> ParserContext:
        """
        returns the root upgraded as far as it will go.
        `self.root` itself is left untouched.
        """
        current, candidate = None, self.root
        while candidate is not None:
            # step forward: remember the last non-None context seen
            current, candidate = candidate, candidate.upgrade()
        return current
class ReprParserContext(ParserContext):
    """ helper base: stores `items` and reprs as `ClassName([items...])` """
    def __init__(self, items: list = None):
        # a fresh list per instance when the caller passes nothing
        if items is None:
            items = []
        self.items = items

    def __repr__(self) -> str:
        name = self.__class__.__name__
        return f'{name}({self.items!r})'
@enum.unique
class ParseCode(enum.IntEnum):
    """
    instruction a `feed` call hands back to the outer parser layer.
    an IntEnum so that code comparing against the old plain ints (0/1/2)
    keeps working.
    """
    # returned if the parser cannot parse the provided token
    HALT = 0
    # returned if the parser is already "complete" and the token should be yielded to the outer context instead
    YIELD = 1
    # returned if the parser successfully consumed the provided token and parsing should continue
    CONTINUE = 2
class BaseContext(ReprParserContext):
    """ the empty starting state: nothing has been parsed yet """
    def feed(self, token: Token) -> ParserContext:
        """ pick the successor state from the first token; None if it can't start anything """
        if token == SPACE:
            # whitespace before a value changes nothing
            successor = self
        elif token == OPEN_PAREN:
            successor = ParenContext(BaseContext())
        elif token in DIGITS:
            successor = IntegerContext([token])
        elif token in ALPHA_UNDER:
            successor = IdentifierContext([token])
        else:
            successor = None
        return successor
class ParserContext(metaclass=abc.ABCMeta):
@abc.abstractmethod
def feed(self, token: Token) -> ParseCode:
"""
possibly ingests this token, modifying internal state,
and providing instruction to the outer parser layer on
how to proceed.
"""
pass
class IdentifierContext(ReprParserContext):
""" context is an identifier like `today` """
def __init__(self, tokens: list):
super().__init__(tokens)
self.tokens = tokens
def context(self) -> 'ParserContext':
"""
hack to make type-level "Productions" compatible with instance-level "ParserContext"s.
"""
def feed(self, token: Token) -> ParserContext:
if token in ALPHA_NUM_UNDER:
return IdentifierContext(self.tokens + [token])
def upgrade(self) -> ParserContext:
return StrongValueContext(self)
class IntegerContext(ReprParserContext):
    """ state for a run of digit tokens, e.g. `45` """
    def __init__(self, tokens: list):
        super().__init__(tokens)
        self.tokens = tokens

    def feed(self, token: Token) -> ParserContext:
        """ another digit extends the integer; a `d` turns it into a duration; else None """
        if token in DIGITS:
            extended = self.tokens + [token]
            return IntegerContext(extended)
        if token == Token('d'):
            return DurationContext(self)
        return None

    def upgrade(self) -> ParserContext:
        # the integer can't grow any further, so it becomes a value
        return StrongValueContext(self)
class DurationContext(ReprParserContext):
    """ state for a duration such as `14d`; wraps the integer that preceded the `d` """
    def __init__(self, value: IntegerContext):
        self.value = value
        super().__init__([value])

    def upgrade(self) -> ParserContext:
        """ a finished duration becomes a strong value """
        return StrongValueContext(self)
class BaseValueContext(ReprParserContext):
    """ abstract base for states that may take part in compound expressions """
    def __init__(self, value: ParserContext):
        self.value = value
        super().__init__([value])

    def feed(self, token: Token) -> ParserContext:
        """ whitespace after a value is absorbed; nothing else is accepted here """
        return self if token == SPACE else None
def destructure(self) -> object:
class StrongValueContext(BaseValueContext):
"""
destructure the outer layer of this ParserContext to obtain access to whatever state it captured.
e.g. Then([A, Choice([B, C])]) destructures first to [A, Choice([B, C])].
it's not recursive; the inner layers must be manually destructured.
in the context of operators, a strong value is something which prefers
to not be grabbed by a lhs value.
so for example, strong values have the opportunity to initiate a multiply operation before the lhs closes an addition operation that this strong value is a part of
"""
return self
def feed(self, token: Token) -> ParserContext:
if token == ASTERISK:
return BinaryOpContext(self, token, BaseContext())
return super().feed(token)
class Then(ParserContext):
"""
primitive combinator: given a sequence of parser constructs, parse the input
using the first parser until that parser yields, then parse using the second
parser, and so on.
"""
def __init__(self, items: list):
self.items = [i.context() for i in items]
def upgrade(self) -> ParserContext:
return WeakValueContext(self.value)
def __repr__(self) -> str:
return f"Then({self.items!r})"
class WeakValueContext(BaseValueContext):
def feed(self, token: Token) -> ParserContext:
if token == PLUS:
return BinaryOpContext(self, token, BaseContext())
if token == MINUS:
return BinaryOpContext(self, token, BaseContext())
def __str__(self) -> str:
return str(self.items)
return super().feed(token)
def feed(self, token: Token) -> ParseCode:
# we expect parser contexts to be fused: once they YIELD,
# they should yield on all future calls as well
for i in self.items:
result = i.feed(token)
if result != ParseCode.YIELD: return result
else:
# all items are done parsing; so are we!
return ParseCode.YIELD
class BinaryOpContext(ReprParserContext):
""" context for a binary operation. the LHS and operator are parsed, but the rhs may not yet contain a value """
def __init__(self, lhs: BaseValueContext, oper: Token, rhs: ParserContext):
super().__init__([lhs, oper, rhs])
self.lhs = lhs
self.oper = oper
self.rhs = rhs
def destructure(self) -> list:
return self.items
@property
def precedence_class(self) -> type:
if self.oper in [PLUS, MINUS]:
return WeakValueContext
if self.oper == ASTERISK:
return StrongValueContext
class Choice(ParserContext):
"""
primitive combinator: try each parser in order and use the first match.
NB: there's no lookahead. whichever parser is able to parse the first token
is used for the entire stream.
"""
def __init__(self, choices: list):
self.choices = choices
self.active = None
def feed(self, token: Token) -> ParserContext:
new_rhs = self.rhs.feed(token)
if new_rhs is not None:
return BinaryOpContext(self.lhs, self.oper, new_rhs)
def __repr__(self) -> str:
return f"Choice({self.choices!r})"
def upgrade(self) -> ParserContext:
new_rhs = self.rhs.upgrade()
if new_rhs is None: return None
def __str__(self) -> str:
if self.active is not None:
return str(self.active)
else:
return repr(self)
# upgrade self once the rhs has reached the precedence required by this operator
new_self = BinaryOpContext(self.lhs, self.oper, new_rhs)
if isinstance(new_rhs, self.precedence_class):
return StrongValueContext(self) # close the operation
def feed(self, token: Token) -> ParseCode:
if self.active is not None:
return self.active.feed(token)
return new_self
for choice in self.choices:
item = choice.context()
result = item.feed(token)
if result is not ParseCode.HALT:
self.active = item
return result
class ParenContext(ReprParserContext):
""" context for a value contained within parentheses """
def __init__(self, inner: ParserContext):
super().__init__([inner])
self.inner = inner
return ParseCode.HALT # no matches
def feed(self, token: Token) -> ParserContext:
new_inner = self.inner.feed(token)
if new_inner is not None:
return ParenContext(new_inner)
def destructure(self) -> ParserContext:
return self.active
if token == CLOSE_PAREN and isinstance(self.inner, WeakValueContext):
return StrongValueContext(self)
class WantToken(ParserContext):
    """
    matches exactly one token drawn from a set of acceptable tokens.
    `want` may be a list of tokens or a single Token.
    """
    def __init__(self, want: list):
        # normalize a lone Token into a one-element list
        if isinstance(want, Token):
            want = [want]
        self.want = want
        self.has = None

    def __repr__(self) -> str:
        return f"WantToken({self.want!r})"

    def feed(self, token: Token) -> ParseCode:
        """ CONTINUE on the first acceptable token, YIELD once one is held, HALT otherwise """
        if self.has is not None:
            return ParseCode.YIELD
        if token not in self.want:
            return ParseCode.HALT
        self.has = token
        return ParseCode.CONTINUE

    def destructure(self) -> Token:
        """ the token that was matched, or None if nothing matched yet """
        return self.has
class Empty(ParserContext):
    """
    terminal that consumes nothing; it exists so that constructs
    like `optional` have a do-nothing alternative.
    """
    def feed(self, token: Token) -> ParseCode:
        # never takes a token: always hands it to the outer context
        return ParseCode.YIELD

    def destructure(self) -> None:
        """ there is no captured state """
        return None
def optional(context: ParserContext) -> ParserContext:
    """ wrap `context` so that it may also match nothing at all """
    alternatives = [context, Empty()]
    return Choice(alternatives)
def upgrade(self) -> ParserContext:
new_inner = self.inner.upgrade()
if new_inner is not None:
return ParenContext(new_inner)
## "Productions" sit on top of these base ParserContexts in order to give names to
## large token sequences and to "reduce" them into AST types more intelligently.
class ProductionContext(ParserContext):
    """
    adapter from the Production style of specification to the ParserContext system.
    created for high-level productions: the grammar is specified up front, parsed
    "all in one go" with incomplete state sealed away, and the parsed tokens are
    then converted into useful abstractions (like signed numbers).
    """
    def __init__(self, production: 'Production', context: ParserContext = None):
        self.production = production
        # without an explicit context, start from the production's own grammar
        if context is None:
            context = production.grammar()
        self.context = context

    def __repr__(self) -> str:
        return f"ProductionContext({self.production!r}, {self.context!r})"

    def __str__(self) -> str:
        return str(self.context)

    def feed(self, token: Token) -> ParseCode:
        """ tokens are simply forwarded to the wrapped context """
        return self.context.feed(token)

    def reduce_inner(self, inner: ParserContext):
        """
        convert whatever a context captured into reduced values:
        nested productions reduce themselves, plain contexts are destructured
        until something else appears, lists (from unpacking Then objects) are
        converted element-wise, and anything else is passed through untouched.
        """
        # peel plain ParserContext layers; stop at a ProductionContext
        while isinstance(inner, ParserContext) and not isinstance(inner, ProductionContext):
            inner = inner.destructure()
        if isinstance(inner, ProductionContext):
            return inner.reduce()  # easy
        if isinstance(inner, list):
            return list(map(self.reduce_inner, inner))
        return inner

    def reduce(self) -> object:
        # XXX this ends up being a leaf -> root reduction,
        # which generally makes it harder to achieve detailed control when nesting.
        reduced = self.reduce_inner(self.context)
        return self.production.reduce(reduced)
class Production:
    """
    a named, usually multi-token construct, described in terms of
    other Productions and the primitive combinators.
    """
    def grammar(self) -> ParserContext:
        """ the parser construct for this production; subclasses must provide it """
        raise NotImplementedError()

    def context(self) -> ParserContext:
        """ a fresh ProductionContext that parses this production """
        fresh = ProductionContext(self)
        return fresh

    def reduce(self, inner: object) -> object:
        """
        build the outer type out of already-converted inner types.
        e.g. Number = Then([optional(Minus), Digits, optional(Suffix)])
        is called as reduce([a, b, c]) where a is the already reduced
        `optional(Minus)`: `None`, or whatever type corresponds to the Minus token.
        the default passes `inner` through unchanged.
        """
        return inner
class DigitProduction(Production):
    """ exactly one digit token """
    def grammar(self) -> ParserContext:
        return WantToken(DIGITS)

    def reduce(self, inner: Token) -> int:
        """ the matched digit token as an int """
        digit_char = inner.c
        return int(digit_char)
class _DigitRun(int):
    """ an int that remembers how many digit tokens produced it (leading zeros included) """
    def __new__(cls, value: int, width: int):
        self = super().__new__(cls, value)
        self.width = width
        return self


class IntProduction(Production):
    """ multi-digit integer """
    def grammar(self) -> ParserContext:
        # one digit, optionally followed by the rest of the integer
        return Then([
            DigitProduction(),
            optional(IntProduction()),
        ])

    def reduce(self, inner: list) -> int:
        """
        combine the leading digit with the already-reduced remaining digits.

        `inner` is `[leading, trailing]`: `leading` is the int for the first digit,
        `trailing` is the reduced nested IntProduction, or None when no digits follow.
        the leading digit must be shifted by the *number of trailing digits*
        (not by a fixed 10), otherwise "123" would come out as 33 and "105" as 15.
        the result is still an int; it just carries its digit count along.
        """
        leading, trailing = inner
        if trailing is None:
            return _DigitRun(leading, 1)
        # fall back to the printed length if handed a plain int
        width = getattr(trailing, 'width', len(str(trailing)))
        return _DigitRun(leading * 10 ** width + trailing, width + 1)
class DurationOrIntProduction(Production):
    """ an integer, optionally suffixed with `d` to make it a duration """
    # with no lookahead available, durations and ints share this one production:
    # a duration starts with a complete int
    def grammar(self) -> ParserContext:
        number = IntProduction()
        day_suffix = optional(WantToken(Token('d')))
        return Then([number, day_suffix])

    def reduce(self, inner: list) -> 'Literal':
        """ a Literal holding the int itself, or a timedelta of it when the `d` suffix is present """
        value, suffix = inner
        if suffix is not None:
            return Literal(timedelta(value))
        return Literal(value)
class Whitespace(Production):
    """ one or more space tokens """
    def grammar(self) -> ParserContext:
        first_space = WantToken(SPACE)
        more_spaces = optional(Whitespace())
        return Then([first_space, more_spaces])
class ParenthesizedExpr(Production):
    """ an expression wrapped in a pair of parentheses """
    def grammar(self) -> ParserContext:
        opening = WantToken(OPEN_PAREN)
        body = Expr()
        closing = WantToken(CLOSE_PAREN)
        return Then([opening, body, closing])

    def reduce(self, inner: list) -> 'AstItem':
        """ the parentheses carry no meaning once parsed: keep only the inner expression """
        _, expr, _ = inner
        return expr
class IdentifierTail(Production):
    """ one or more letter/digit/underscore tokens """
    def grammar(self) -> ParserContext:
        char = WantToken(ALPHA_NUM_UNDER)
        more_chars = optional(IdentifierTail())
        return Then([char, more_chars])
class Identifier(Production):
    """ variable-style identifier, e.g. 'TODAY' """
    def grammar(self) -> ParserContext:
        # a letter or underscore, then any number of letters/digits/underscores
        return Then([
            WantToken(ALPHA_UNDER),
            optional(IdentifierTail()),
        ])

    def reduce(self, inner: list) -> 'Variable':
        """
        fold the matched tokens into a string and wrap it in a Variable.

        `inner` is `[first, rest]`, where `rest` is a nested `[token, rest]`
        chain ending in None.
        """
        first, rest = inner
        chars = [first.c]
        while rest is not None:
            token, rest = rest
            chars.append(token.c)
        return Variable(''.join(chars))
class UnaryExpr(Production):
    """ an expression with no operator at its outermost level """
    def grammar(self) -> ParserContext:
        leading_space = optional(Whitespace())
        primary = Choice([
            DurationOrIntProduction(),
            Identifier(),
            ParenthesizedExpr(),
        ])
        trailing_space = optional(Whitespace())
        return Then([leading_space, primary, trailing_space])

    def reduce(self, inner: list):
        """ the surrounding whitespace is discarded; only the primary value is kept """
        _, primary, _ = inner
        return primary
class ExprRHS(Production):
    """ the right hand side of a binary operation: operator, operand, and maybe more of the same """
    def grammar(self) -> ParserContext:
        operator = Choice([WantToken(ASTERISK), WantToken(PLUS), WantToken(MINUS)])
        # what follows is really just another `Expr`, but the fields stay expanded here to control precedence.
        operand = UnaryExpr()
        continuation = Choice([ExprRHS(), Empty()])
        return Then([operator, operand, continuation])
class Expr(Production):
    """ this is the top-level production """
    def grammar(self) -> ParserContext:
        # one operand, optionally followed by a chain of (operator, operand) pairs
        return Then([
            UnaryExpr(),
            Choice([ExprRHS(), Empty()])
        ])

    def reduce(self, inner: list):
        """
        build the operator tree for `lhs (oper operand)*`.

        `inner` is `[lhs, rhs]`, where `rhs` is None or a nested
        `[oper, operand, rhs_next]` chain as produced by ExprRHS.
        multiplication binds tighter than addition/subtraction, and operators
        of equal precedence associate to the left, so `a - b - c` is
        `(a - b) - c` rather than `a - (b - c)`.
        """
        lhs, rhs = inner
        # `terms` are the operands of the additive chain; a `*` is folded
        # straight into the most recent term so it binds tighter than +/-
        terms = [lhs]
        additive_opers = []
        while rhs is not None:
            oper, operand, rhs = rhs
            if oper == ASTERISK:
                terms[-1] = MulOp(terms[-1], operand)
            else:
                additive_opers.append(oper)
                terms.append(operand)

        # apply + and - strictly left to right
        result = terms[0]
        for oper, term in zip(additive_opers, terms[1:]):
            if oper == PLUS:
                result = AddOp(result, term)
            elif oper == MINUS:
                result = SubOp(result, term)
        return result
## parsed productions are `reduce`d to more useful `AstItem` items which we use
## for the actual evaluation/computation
## AstItems are produced from a ParserContext input
## ParserContext parse outputs are translated into `AstItem`s before evaluation
## so that we can operate on a higher-level tree that directly encodes native values like integers
class AstItem(metaclass=abc.ABCMeta):
    """ node of the evaluation tree; the `decode_*` helpers build nodes from parser contexts """
    @abc.abstractmethod
    def eval(self, context: dict):
        pass

    @staticmethod
    def decode_item(p: ParserContext) -> 'AstItem':
        """ translate a parser context into its AstItem; unrecognized contexts give None """
        if isinstance(p, IntegerContext):
            number = AstItem.decode_integer(p)
            return Literal(number)
        if isinstance(p, DurationContext):
            amount = AstItem.decode_integer(p.value)
            return Literal(timedelta(amount))
        if isinstance(p, IdentifierContext):
            name = AstItem.decode_identifier(p)
            return Variable(name)
        if isinstance(p, BaseValueContext):
            # value wrappers add nothing: decode what they wrap
            return AstItem.decode_item(p.value)
        if isinstance(p, BinaryOpContext):
            symbol = p.oper.c
            lhs = AstItem.decode_item(p.lhs)
            rhs = AstItem.decode_item(p.rhs)
            return AstItem.decode_bin_op(symbol, lhs, rhs)
        if isinstance(p, ParenContext):
            return AstItem.decode_item(p.inner)
        return None

    @staticmethod
    def decode_integer(p: IntegerContext) -> int:
        """ the context's digit tokens joined and parsed as an int """
        digits = [t.c for t in p.tokens]
        return int(''.join(digits))

    @staticmethod
    def decode_identifier(p: IdentifierContext) -> str:
        """ the context's tokens joined into the identifier's name """
        chars = [t.c for t in p.tokens]
        return ''.join(chars)

    @staticmethod
    def decode_bin_op(ty: str, lhs: 'AstItem', rhs: 'AstItem') -> 'BinaryOp':
        """ the operator node for symbol `ty`; an unknown symbol gives None """
        constructors = {
            '+': AddOp,
            '-': SubOp,
            '*': MulOp,
        }
        make = constructors.get(ty)
        if make is None:
            return None
        return make(lhs, rhs)
class Literal(AstItem):
def __init__(self, v):
self.v = v
@ -429,32 +312,33 @@ class MulOp(BinaryOp):
def eval(self, context: dict):
    """ evaluate both operands in `context`, left first, and multiply the results """
    left = self.lhs.eval(context)
    right = self.rhs.eval(context)
    return left * right
## toplevel routine. tokenize -> parse -> decode to AST -> evaluate
def tokenize(stream: str) -> list:
    """ one Token per character of `stream`, in order """
    return list(map(Token, stream))
def parse(ty: Production, tokens: list) -> AstItem:
ctx = Then([ty, Empty()])
def parse(tokens: list) -> ParserContext:
parser = Parser(BaseContext())
for i, t in enumerate(tokens):
result = ctx.feed(t)
result = parser.feed(t)
# print(f"i={i}; t={t}; state: {ctx!r}")
assert result == ParseCode.CONTINUE, f"unexpected token '{t}' at {i}; state: {ctx!r}"
assert result, f"unexpected token '{t}' at {i}; state: {parser.complete()!r}"
# feed a trailing EOF which no production should consume.
# this either drives the context to a HALT state, if it's expecting
# some specific other token, or YIELD if it's happy for the stream to be closed.
assert ctx.feed(EOF) == ParseCode.YIELD, f"incomplete expression: {ctx!r}"
return ctx.destructure()[0].reduce()
return parser.complete()
def evaluate(expr: str) -> object:
tok = tokenize(expr)
expr = parse(Expr(), tok)
print(expr)
parse_tree = parse(tok)
print(parse_tree)
ast = AstItem.decode_item(parse_tree)
print(ast)
env = dict(
today=datetime.now()
)
return expr.eval(env)
return ast.eval(env)
if __name__ == '__main__':