sane-date-math: convert to LR parser

This commit is contained in:
colin 2022-12-24 05:08:17 +00:00
parent 51a96525d9
commit 16fa1e0eda

View File

@ -2,8 +2,8 @@
# i just went overboard playing around with parsers, is all. # i just went overboard playing around with parsers, is all.
# use this like `./sane-date-math 'today - 5d'` # use this like `./sane-date-math 'today - 5d'`
# of course, it handles parenthesizes and operator precedence, so you can do sillier things like # of course, it handles parentheses and operator precedence/associativity, so you can do sillier things like
# `./sane-date-math ' today - (3*4+1 - ((0)) ) *7d '` # `./sane-date-math ' today - (1+3 *4 - ((0)) ) *7d '`
import abc import abc
@ -15,8 +15,6 @@ class Token:
self.c = c self.c = c
def __repr__(self) -> str: def __repr__(self) -> str:
if self == EOF:
return "<EOF>"
return f"{self.c!r}" return f"{self.c!r}"
def __str__(self) -> str: def __str__(self) -> str:
@ -25,7 +23,6 @@ class Token:
def __eq__(self, other: 'Token') -> bool: def __eq__(self, other: 'Token') -> bool:
return self.c == other.c return self.c == other.c
EOF = Token('\x05')
PLUS = Token('+') PLUS = Token('+')
MINUS = Token('-') MINUS = Token('-')
ASTERISK = Token('*') ASTERISK = Token('*')
@ -40,349 +37,235 @@ ALPHA = ALPHA_LOWER + ALPHA_UPPER
ALPHA_UNDER = ALPHA + [UNDERSCORE] ALPHA_UNDER = ALPHA + [UNDERSCORE]
ALPHA_NUM_UNDER = ALPHA_UNDER + DIGITS ALPHA_NUM_UNDER = ALPHA_UNDER + DIGITS
class ParserContext:
def feed(self, token: Token) -> 'ParserContext':
return None # can't ingest the token
def upgrade(self) -> 'ParserContext':
return None # no upgrade path
# TODO: should be enum class Parser:
class ParseCode:
    """ result codes a `feed` call hands back to the outer parser layer """
    # returned if the parser cannot parse the provided token
    HALT = 0
    # returned if the parser is already "complete" and the token should be yielded to the outer context instead
    YIELD = 1
    # returned if the parser successfully consumed the provided token and parsing should continue
    CONTINUE = 2
class ParserContext(metaclass=abc.ABCMeta):
    """
    abstract interface for anything that can be fed tokens one at a time.
    subclasses must implement `feed`; `context` and `destructure` default to
    returning the object itself.
    """
    @abc.abstractmethod
    def feed(self, token: 'Token') -> 'ParseCode':
        """
        possibly ingests this token, modifying internal state,
        and providing instruction to the outer parser layer on
        how to proceed.
        """
        pass

    def context(self) -> 'ParserContext':
        """
        hack to make type-level "Productions" compatible with instance-level "ParserContext"s.
        """
        return self

    def destructure(self) -> object:
        """
        destructure the outer layer of this ParserContext to obtain access to whatever state it captured.
        e.g. Then([A, Choice([B, C])]) destructures first to [A, Choice([B, C])].
        it's not recursive; the inner layers must be manually destructured.
        """
        return self
class Then(ParserContext):
""" """
primitive combinator: given a sequence of parser constructs, parse the input LR parser.
using the first parser until that parser yields, then parse using the second keeps exactly one root item, and for each input token
parser, and so on. feeds it to the root, possibly "upgrading" the root N times
before it's able to be fed.
""" """
def __init__(self, items: list): def __init__(self, root: ParserContext):
self.items = [i.context() for i in items] self.root = root
def feed(self, token: Token) -> bool:
new_root = self.root.feed(token)
if new_root is not None:
self.root = new_root
return True
else:
# root can't directly accept this item.
# "upgrade" it and try again.
new_root = self.root.upgrade()
if new_root is None: return False
self.root = new_root
return self.feed(token)
def complete(self) -> ParserContext:
# upgrade the root as far as possible before returning
root = None
new_root = self.root
while new_root is not None:
root = new_root
new_root = root.upgrade()
return root
class ReprParserContext(ParserContext):
""" helper that gives a good default repr to most contexts """
def __init__(self, items: list = None):
self.items = items if items is not None else []
def __repr__(self) -> str: def __repr__(self) -> str:
return f"Then({self.items!r})" return f'{self.__class__.__name__}({self.items!r})'
def __str__(self) -> str:
return str(self.items)
def feed(self, token: Token) -> ParseCode: class BaseContext(ReprParserContext):
# we expect parser contexts to be fused: once they YIELD, """ empty context; initial state of the parser """
# they should yield on all future calls as well def feed(self, token: Token) -> ParserContext:
for i in self.items: if token == SPACE:
result = i.feed(token) return self
if result != ParseCode.YIELD: return result if token == OPEN_PAREN:
else: return ParenContext(BaseContext())
# all items are done parsing; so are we! if token in DIGITS:
return ParseCode.YIELD return IntegerContext([token])
if token in ALPHA_UNDER:
return IdentifierContext([token])
def destructure(self) -> list: class IdentifierContext(ReprParserContext):
return self.items """ context is an identifier like `today` """
def __init__(self, tokens: list):
super().__init__(tokens)
self.tokens = tokens
class Choice(ParserContext): def feed(self, token: Token) -> ParserContext:
if token in ALPHA_NUM_UNDER:
return IdentifierContext(self.tokens + [token])
def upgrade(self) -> ParserContext:
return StrongValueContext(self)
class IntegerContext(ReprParserContext):
""" context is an integer like `45` """
def __init__(self, tokens: list):
super().__init__(tokens)
self.tokens = tokens
def feed(self, token: Token) -> ParserContext:
if token in DIGITS:
return IntegerContext(self.tokens + [token])
if token == Token('d'):
return DurationContext(self)
def upgrade(self) -> ParserContext:
# can't continue the integer; it becomes a value
return StrongValueContext(self)
class DurationContext(ReprParserContext):
""" context is a duration like `14d` """
def __init__(self, value: IntegerContext):
super().__init__([value])
self.value = value
def upgrade(self) -> ParserContext:
return StrongValueContext(self)
class BaseValueContext(ReprParserContext):
""" abstract base for types that can be used in compound expressions """
def __init__(self, value: ParserContext):
super().__init__([value])
self.value = value
def feed(self, token: Token) -> ParserContext:
if token == SPACE:
return self
class StrongValueContext(BaseValueContext):
""" """
primitive combinator: try each parser in order and use the first match. in the context of operators, a strong value is something which prefers
NB: there's no lookahead. whichever parser is able to parse the first token to not be grabbed by a lhs value.
is used for the entire stream.
so for example, strong values have the opportunity to initiate a multiply operation before the lhs closes an addition operation that this strong value is a part of
""" """
def __init__(self, choices: list): def feed(self, token: Token) -> ParserContext:
self.choices = choices if token == ASTERISK:
self.active = None return BinaryOpContext(self, token, BaseContext())
return super().feed(token)
def __repr__(self) -> str: def upgrade(self) -> ParserContext:
return f"Choice({self.choices!r})" return WeakValueContext(self.value)
def __str__(self) -> str: class WeakValueContext(BaseValueContext):
if self.active is not None: def feed(self, token: Token) -> ParserContext:
return str(self.active) if token == PLUS:
else: return BinaryOpContext(self, token, BaseContext())
return repr(self) if token == MINUS:
return BinaryOpContext(self, token, BaseContext())
def feed(self, token: Token) -> ParseCode: return super().feed(token)
if self.active is not None:
return self.active.feed(token)
for choice in self.choices: class BinaryOpContext(ReprParserContext):
item = choice.context() """ context for a binary operation. the LHS and operator are parsed, but the rhs may not yet contain a value """
result = item.feed(token) def __init__(self, lhs: BaseValueContext, oper: Token, rhs: ParserContext):
if result is not ParseCode.HALT: super().__init__([lhs, oper, rhs])
self.active = item self.lhs = lhs
return result self.oper = oper
self.rhs = rhs
return ParseCode.HALT # no matches @property
def precedence_class(self) -> type:
if self.oper in [PLUS, MINUS]:
return WeakValueContext
if self.oper == ASTERISK:
return StrongValueContext
def destructure(self) -> ParserContext: def feed(self, token: Token) -> ParserContext:
return self.active new_rhs = self.rhs.feed(token)
if new_rhs is not None:
return BinaryOpContext(self.lhs, self.oper, new_rhs)
class WantToken(ParserContext): def upgrade(self) -> ParserContext:
""" new_rhs = self.rhs.upgrade()
match a single token out of a list of valid tokens if new_rhs is None: return None
"""
def __init__(self, want: list):
self.has = None
self.want = [want] if isinstance(want, Token) else want
def __repr__(self) -> str: # upgrade self once the rhs has reach the required precedence compatible with this operator
return f"WantToken({self.want!r})" new_self = BinaryOpContext(self.lhs, self.oper, new_rhs)
if isinstance(new_rhs, self.precedence_class):
return StrongValueContext(self) # close the operation
def feed(self, token: Token) -> ParseCode: return new_self
if self.has is not None: return ParseCode.YIELD
if token in self.want:
self.has = token
return ParseCode.CONTINUE
return ParseCode.HALT
def destructure(self) -> Token: class ParenContext(ReprParserContext):
return self.has """ context for a value contained within parentheses """
def __init__(self, inner: ParserContext):
super().__init__([inner])
self.inner = inner
class Empty(ParserContext): def feed(self, token: Token) -> ParserContext:
""" new_inner = self.inner.feed(token)
used as a terminal to allow for constructs like `optional` if new_inner is not None:
""" return ParenContext(new_inner)
def feed(self, token: Token) -> ParseCode:
return ParseCode.YIELD
def destructure(self) -> None: if token == CLOSE_PAREN and isinstance(self.inner, WeakValueContext):
return None return StrongValueContext(self)
def optional(context: ParserContext) -> ParserContext: def upgrade(self) -> ParserContext:
return Choice([context, Empty()]) new_inner = self.inner.upgrade()
if new_inner is not None:
return ParenContext(new_inner)
## "Productions" sit on top of these base ParserContexts in order to give names to ## AstItems are produced from a ParserContext input
## large token sequences and to "reduce" them into AST types more intelligently. ## ParserContext parse outputs are translated into `AstItem`s before evaluation
## so that we can operate on a higher-level tree that directly encodes native values like integers
class ProductionContext(ParserContext):
    """
    this adapts from the Production system of specification to the ParserContext system.
    this is instantiated for high-level productions where we specify a grammar
    and then parse "all in one go", sealing away incomplete state, and converting
    the parsed tokens into actually useful abstractions (like signed numbers).
    """
    def __init__(self, production: 'Production', context: ParserContext = None):
        """
        production: the Production whose `reduce` is applied to the parsed result.
        context: the ParserContext that consumes tokens; when omitted, a fresh one
        is built from `production.grammar()`.
        """
        self.production = production
        # NOTE(review): this instance attribute has the same name as the
        # `context()` method declared on the abstract ParserContext, so the
        # method is not reachable on instances of this class -- confirm that
        # no caller invokes `.context()` on a ProductionContext.
        self.context = context if context is not None else production.grammar()

    def __repr__(self) -> str:
        return f"ProductionContext({self.production!r}, {self.context!r})"

    def __str__(self) -> str:
        return str(self.context)

    def feed(self, token: Token) -> ParseCode:
        """ forward the token to the wrapped context and relay its answer unchanged """
        return self.context.feed(token)

    def reduce_inner(self, inner: ParserContext):
        """
        recursively convert captured parser state into reduced values:
        nested ProductionContexts are reduced via their own `reduce`,
        other ParserContexts are destructured and converted again,
        lists are converted element-wise, and anything else is returned as-is.
        """
        if isinstance(inner, ProductionContext):
            return inner.reduce()  # easy
        if isinstance(inner, ParserContext):
            return self.reduce_inner(inner.destructure())
        if isinstance(inner, list):  # happens via unpacking of Then objects
            return [self.reduce_inner(item) for item in inner]
        return inner

    def reduce(self) -> object:
        """ reduce everything captured by the wrapped context, then hand it to the production's `reduce` """
        # XXX this ends up being a leaf -> root reduction,
        # which generally makes it harder to achieve detailed control when nesting.
        return self.production.reduce(self.reduce_inner(self.context))
class Production:
    """
    non-generic, likely multi-token productions,
    specified in terms of other Productions and the above primitives
    """
    def grammar(self) -> 'ParserContext':
        """ subclasses return the parser construct describing this production; the base raises NotImplementedError """
        raise NotImplementedError()

    def context(self) -> 'ParserContext':
        """ wrap this production in a fresh ProductionContext """
        return ProductionContext(self)

    def reduce(self, inner: object) -> object:
        """
        use to construct the outer types out of already-converted inner types.
        e.g. Number = Then([optional(Minus), Digits, optional(Suffix)])
        gets called with reduce([a, b, c]), where a is the already reduced `optional(Minus)`,
        i.e. `None` or whatever type corresponds to the Minus token.

        the default implementation returns `inner` unchanged.
        """
        return inner
class DigitProduction(Production):
    """ one digit token """
    def grammar(self) -> ParserContext:
        return WantToken(DIGITS)

    def reduce(self, inner: Token) -> int:
        """ convert the matched token's character into its integer value """
        return int(inner.c)
class _DigitRun(int):
    """ an int which remembers how many digit tokens it was built from (leading zeros included) """
    def __new__(cls, value: int, width: int):
        run = super().__new__(cls, value)
        run.width = width
        return run


class IntProduction(Production):
    """ multi-digit integer """
    def grammar(self) -> ParserContext:
        # one digit, optionally followed by the rest of the integer
        return Then([
            DigitProduction(),
            optional(IntProduction()),
        ])

    def reduce(self, inner: list) -> int:
        """
        inner is [leading_digit, trailing], where trailing is the already-reduced
        remainder of the integer, or None if this was the last digit.

        the leading digit has to be shifted left by one decimal place per digit
        in `trailing`; shifting by a fixed factor of 10 would reduce "123" to 33.
        the digit count rides along on the returned int so that tails with
        leading zeros (e.g. the "03" of "103") are shifted correctly too.
        """
        leading, trailing = inner
        if trailing is None:
            return _DigitRun(leading, 1)
        width = getattr(trailing, 'width', None)
        if width is None:
            # plain int handed in by some other caller: best effort from its decimal length
            width = len(str(trailing))
        return _DigitRun(leading * 10 ** width + trailing, width + 1)
class DurationOrIntProduction(Production):
    """ an integer, optionally suffixed with `d` to make it a duration in days """
    # due to a lack of lookahead, we combine duration and int parsing into one production
    # because a duration shares a complete int as prefix
    def grammar(self) -> ParserContext:
        return Then([
            IntProduction(),
            optional(WantToken(Token('d'))),
        ])

    def reduce(self, inner: list) -> 'Literal':
        """ inner is [value, suffix]; without a suffix the value is a plain number, otherwise a number of days """
        value, suffix = inner
        if suffix is None:
            return Literal(value)
        return Literal(timedelta(days=value))
class Whitespace(Production):
    """ one or more space tokens """
    def grammar(self) -> ParserContext:
        # a space, optionally followed by more whitespace
        return Then([
            WantToken(SPACE),
            optional(Whitespace()),
        ])
class ParenthesizedExpr(Production):
    """ an `Expr` wrapped in one pair of parentheses """
    def grammar(self) -> ParserContext:
        return Then([
            WantToken(OPEN_PAREN),
            Expr(),
            WantToken(CLOSE_PAREN),
        ])

    def reduce(self, inner: list) -> 'AstItem':
        """ inner is [open_paren, expr, close_paren]; only the expression is kept """
        _open_paren, expr, _close_paren = inner
        return expr
class IdentifierTail(Production):
    """ the characters of an identifier after its first one """
    def grammar(self) -> ParserContext:
        # one letter/digit/underscore, optionally followed by more of the same
        return Then([
            WantToken(ALPHA_NUM_UNDER),
            optional(IdentifierTail()),
        ])
class Identifier(Production):
    """ variable-style identifier, e.g. 'TODAY' """
    def grammar(self) -> ParserContext:
        # first character may not be a digit; the tail may
        return Then([
            WantToken(ALPHA_UNDER),
            optional(IdentifierTail()),
        ])

    def reduce(self, inner: list) -> 'Variable':
        """
        inner is [first_token, rest], where rest is either None or a nested
        [token, rest] pair. the tokens are folded into one string
        which names the returned Variable.
        """
        first, rest = inner
        chars = [first.c]
        while rest is not None:
            token, rest = rest
            chars.append(token.c)
        return Variable(''.join(chars))
class UnaryExpr(Production):
    """ some expression which does not invoke any operators at the outermost level """
    def grammar(self) -> ParserContext:
        return Then([
            optional(Whitespace()),
            Choice([
                DurationOrIntProduction(),
                Identifier(),
                ParenthesizedExpr(),
            ]),
            optional(Whitespace()),
        ])

    def reduce(self, inner: list):
        """ inner is [leading_whitespace, primary, trailing_whitespace]; the whitespace is dropped """
        _leading, primary, _trailing = inner
        return primary
class ExprRHS(Production):
    """ right hand side of a binary operation """
    def grammar(self) -> ParserContext:
        # operator, operand, then optionally another operator/operand pair
        return Then([
            Choice([WantToken(ASTERISK), WantToken(PLUS), WantToken(MINUS)]),
            # remaining, is just another `Expr`, but we need to keep the fields expanded here to control precedence.
            UnaryExpr(),
            Choice([ExprRHS(), Empty()]),
        ])
class Expr(Production):
    """ this is the top-level production """
    def grammar(self) -> ParserContext:
        return Then([
            UnaryExpr(),
            Choice([ExprRHS(), Empty()])
        ])

    def reduce(self, inner: list):
        """
        inner is [lhs, rhs], where rhs is None or a nested [oper, operand, rhs_next]
        chain. the chain is folded into a tree of MulOp/AddOp/SubOp:
        multiplication binds tighter than addition/subtraction, and operators of
        equal precedence associate to the left, so `a - b - c` is `(a - b) - c`.
        """
        lhs, rhs = inner

        # first pass: walk the chain, folding each multiplication into the term
        # to its left right away; additive operators start a new term.
        terms = [lhs]
        additive_opers = []
        while rhs is not None:
            oper, operand, rhs = rhs
            if oper == ASTERISK:
                terms[-1] = MulOp(terms[-1], operand)
            else:
                additive_opers.append(oper)
                terms.append(operand)

        # second pass: combine the terms strictly left to right.
        result = terms[0]
        for oper, term in zip(additive_opers, terms[1:]):
            if oper == PLUS:
                result = AddOp(result, term)
            elif oper == MINUS:
                result = SubOp(result, term)
        return result
## parsed productions are `reduce`d to more useful `AstItem` items which we use
## for the actual evaluation/computation
class AstItem(metaclass=abc.ABCMeta): class AstItem(metaclass=abc.ABCMeta):
@abc.abstractmethod @abc.abstractmethod
def eval(self, context: dict): def eval(self, context: dict):
pass pass
@staticmethod
def decode_item(p: 'ParserContext') -> 'AstItem':
    """
    translate a parse result into the equivalent AstItem, recursing through
    value wrappers, binary operations and parentheses.
    raises ValueError if `p` is a context with no AstItem equivalent,
    rather than returning None and failing later during evaluation.
    """
    if isinstance(p, IntegerContext):
        return Literal(AstItem.decode_integer(p))
    if isinstance(p, DurationContext):
        # the duration wraps the integer which counts its days
        return Literal(timedelta(AstItem.decode_integer(p.value)))
    if isinstance(p, IdentifierContext):
        return Variable(AstItem.decode_identifier(p))
    if isinstance(p, BaseValueContext):
        # value wrappers carry no meaning of their own; unwrap
        return AstItem.decode_item(p.value)
    if isinstance(p, BinaryOpContext):
        return AstItem.decode_bin_op(
            p.oper.c,
            AstItem.decode_item(p.lhs),
            AstItem.decode_item(p.rhs)
        )
    if isinstance(p, ParenContext):
        return AstItem.decode_item(p.inner)
    raise ValueError(f"no AstItem equivalent for parse state {p!r}")
@staticmethod
def decode_integer(p: IntegerContext) -> int:
return int(''.join(t.c for t in p.tokens))
@staticmethod
def decode_identifier(p: IdentifierContext) -> str:
return ''.join(t.c for t in p.tokens)
@staticmethod
def decode_bin_op(ty: str, lhs: 'AstItem', rhs: 'AstItem') -> 'BinaryOp':
    """
    build the operation node for operator character `ty` ('+', '-' or '*')
    over the two already-decoded operands.
    raises ValueError for any other operator instead of returning None.
    """
    if ty == '+':
        return AddOp(lhs, rhs)
    if ty == '-':
        return SubOp(lhs, rhs)
    if ty == '*':
        return MulOp(lhs, rhs)
    raise ValueError(f"unsupported binary operator {ty!r}")
class Literal(AstItem): class Literal(AstItem):
def __init__(self, v): def __init__(self, v):
self.v = v self.v = v
@ -429,32 +312,33 @@ class MulOp(BinaryOp):
def eval(self, context: dict): def eval(self, context: dict):
return self.lhs.eval(context) * self.rhs.eval(context) return self.lhs.eval(context) * self.rhs.eval(context)
## toplevel routine. tokenize -> parse -> decode to AST -> evaluate
def tokenize(stream: str) -> list: def tokenize(stream: str) -> list:
return [Token(char) for char in stream] return [Token(char) for char in stream]
def parse(ty: Production, tokens: list) -> AstItem: def parse(tokens: list) -> ParserContext:
ctx = Then([ty, Empty()]) parser = Parser(BaseContext())
for i, t in enumerate(tokens): for i, t in enumerate(tokens):
result = ctx.feed(t) result = parser.feed(t)
# print(f"i={i}; t={t}; state: {ctx!r}") # print(f"i={i}; t={t}; state: {ctx!r}")
assert result == ParseCode.CONTINUE, f"unexpected token '{t}' at {i}; state: {ctx!r}" assert result, f"unexpected token '{t}' at {i}; state: {parser.complete()!r}"
# feed a trailing EOF which no production should consume. return parser.complete()
# this either drives the context to a HALT state, if it's expecting
# some specific other token, or YIELD if it's happy for the stream to be closed.
assert ctx.feed(EOF) == ParseCode.YIELD, f"incomplete expression: {ctx!r}"
return ctx.destructure()[0].reduce()
def evaluate(expr: str) -> object: def evaluate(expr: str) -> object:
tok = tokenize(expr) tok = tokenize(expr)
expr = parse(Expr(), tok) parse_tree = parse(tok)
print(expr) print(parse_tree)
ast = AstItem.decode_item(parse_tree)
print(ast)
env = dict( env = dict(
today=datetime.now() today=datetime.now()
) )
return expr.eval(env) return ast.eval(env)
if __name__ == '__main__': if __name__ == '__main__':