Cleaning up the lexer a bit.

This commit is contained in:
Benjamin Fry 2015-08-29 13:55:42 -07:00
parent 2715967845
commit 179be5618e
23 changed files with 576 additions and 206 deletions

Cargo.lock generated

@@ -3,15 +3,6 @@ name = "trust-dns"
version = "0.1.1"
dependencies = [
"log 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.41 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "aho-corasick"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"memchr 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -27,26 +18,3 @@ dependencies = [
"libc 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "memchr"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex"
version = "0.1.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"aho-corasick 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex-syntax"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"

Cargo.toml

@@ -28,5 +28,5 @@ license = "Apache-2.0"
[dependencies]
log = "^0.3.1"
regex = "^0.1.41"
# regex = "^0.1.41"
# rustc-serialize = "^0.3.16"

cargo Normal file

src/error/lexer_error.rs

@@ -28,6 +28,8 @@ pub enum LexerError {
UnrecognizedOctet(u32),
ParseIntError(num::ParseIntError),
UnclosedQuotedString,
UnclosedList,
UnrecognizedDollar(String),
EOF,
}
@@ -42,6 +44,8 @@ impl fmt::Display for LexerError {
LexerError::UnrecognizedOctet(o) => write!(f, "Unrecognized octet: {}", o),
LexerError::ParseIntError(ref err) => err.fmt(f),
LexerError::UnclosedQuotedString => write!(f, "Unclosed quoted string"),
LexerError::UnclosedList => write!(f, "Unclosed list, missing ')'"),
LexerError::UnrecognizedDollar(ref s) => write!(f, "Unrecognized dollar content: {}", s),
LexerError::EOF => write!(f, "End of input reached before next read could complete"),
}
}
@@ -58,6 +62,8 @@ impl Error for LexerError {
LexerError::UnrecognizedOctet(..) => "Unrecognized octet",
LexerError::ParseIntError(ref err) => err.description(),
LexerError::UnclosedQuotedString => "Unclosed quoted string",
LexerError::UnclosedList => "Unclosed list",
LexerError::UnrecognizedDollar(..) => "Unrecognized dollar content",
LexerError::EOF => "End of input",
}
}
@@ -77,7 +83,6 @@ impl From<FromUtf8Error> for LexerError {
}
}
impl From<num::ParseIntError> for LexerError {
fn from(err: num::ParseIntError) -> LexerError {
LexerError::ParseIntError(err)

src/error/mod.rs

@@ -18,13 +18,16 @@ mod decode_error;
mod encode_error;
mod client_error;
mod lexer_error;
mod parse_error;
pub use self::decode_error::DecodeError;
pub use self::encode_error::EncodeError;
pub use self::client_error::ClientError;
pub use self::lexer_error::LexerError;
pub use self::parse_error::ParseError;
pub type DecodeResult<T> = Result<T, DecodeError>;
pub type EncodeResult = Result<(), EncodeError>;
pub type ClientResult<T> = Result<T, ClientError>;
pub type LexerResult<T> = Result<T, LexerError>;
pub type ParseResult<T> = Result<T, ParseError>;

src/error/parse_error.rs Normal file

@@ -0,0 +1,68 @@
/*
* Copyright (C) 2015 Benjamin Fry <benjaminfry@me.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
use std::error::Error;
use std::fmt;
use std::num;
use super::DecodeError;
use super::LexerError;
use ::serialize::txt::Token;
#[derive(Debug)]
pub enum ParseError {
LexerError(LexerError),
DecodeError(DecodeError),
UnrecognizedToken(Token),
}
impl fmt::Display for ParseError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match *self {
ParseError::LexerError(ref err) => err.fmt(f),
ParseError::DecodeError(ref err) => err.fmt(f),
ParseError::UnrecognizedToken(ref t) => write!(f, "Unrecognized Token in stream: {:?}", t),
}
}
}
impl Error for ParseError {
fn description(&self) -> &str {
match *self {
ParseError::LexerError(ref err) => err.description(),
ParseError::DecodeError(ref err) => err.description(),
ParseError::UnrecognizedToken(..) => "Unrecognized Token"
}
}
fn cause(&self) -> Option<&Error> {
match *self {
ParseError::LexerError(ref err) => Some(err),
_ => None,
}
}
}
impl From<LexerError> for ParseError {
fn from(err: LexerError) -> ParseError {
ParseError::LexerError(err)
}
}
impl From<DecodeError> for ParseError {
fn from(err: DecodeError) -> ParseError {
ParseError::DecodeError(err)
}
}
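The From impls above are what let try! convert lexer and decoder failures into a ParseError at each call site. A minimal sketch of the pattern (the helper function is hypothetical, not part of this commit, and it assumes Lexer is reachable alongside Token):

use ::error::*;
use ::serialize::txt::*;

fn next_chardata(lexer: &mut Lexer) -> ParseResult<String> {
    // try! returns early on Err, converting LexerError -> ParseError via From
    match try!(lexer.next_token()) {
        Some(Token::CharData(s)) => Ok(s),
        Some(t) => Err(ParseError::UnrecognizedToken(t)),
        None => Err(ParseError::LexerError(LexerError::EOF)),
    }
}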

src/lib.rs

@@ -20,4 +20,4 @@ pub mod error;
pub mod serialize;
#[macro_use] extern crate log;
extern crate regex;
// extern crate regex;

src/rr/mod.rs

@@ -13,10 +13,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
pub mod record_type;
pub mod record_type;
pub mod dns_class;
pub mod resource;
pub mod record_data;
pub mod domain;
pub use self::record_type::RecordType;
pub use self::resource::Record;
pub use self::domain::Name;
pub use self::dns_class::DNSClass;
pub use self::record_data::RData;
mod rdata;

src/rr/rdata/a.rs

@@ -16,6 +16,7 @@
use std::net::Ipv4Addr;
use ::serialize::txt::*;
use ::serialize::binary::*;
use ::error::*;
use ::rr::record_data::RData;
@@ -64,6 +65,10 @@ pub fn emit(encoder: &mut BinEncoder, a: &RData) -> EncodeResult {
}
}
pub fn parse(tokens: Vec<Token>) -> ParseResult<RData> {
unimplemented!()
}
#[cfg(test)]
mod mytests {
use std::net::Ipv4Addr;

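Each of these stubs shares the same shape: the master-file parser hands over the tokens for one record's RDATA. A sketch of how the A-record stub might later be filled in, assuming the address arrives as a single CharData token holding the dotted-quad text (illustrative only, not this commit's implementation):

use std::net::Ipv4Addr;
use std::str::FromStr;
use ::serialize::txt::*;
use ::error::*;
use ::rr::record_data::RData;

pub fn parse(tokens: Vec<Token>) -> ParseResult<RData> {
    let mut tokens = tokens.into_iter();
    match tokens.next() {
        Some(Token::CharData(ip)) => {
            // e.g. "26.3.0.103" in the dotted-quad text form
            match Ipv4Addr::from_str(&ip) {
                Ok(address) => Ok(RData::A { address: address }),
                Err(..) => Err(ParseError::UnrecognizedToken(Token::CharData(ip))),
            }
        }
        Some(t) => Err(ParseError::UnrecognizedToken(t)),
        None => Err(ParseError::LexerError(LexerError::EOF)),
    }
}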
src/rr/rdata/aaaa.rs

@@ -16,6 +16,7 @@
use std::net::Ipv6Addr;
use ::serialize::txt::*;
use ::serialize::binary::*;
use ::error::*;
use ::rr::record_data::RData;
@@ -59,6 +60,10 @@ pub fn emit(encoder: &mut BinEncoder, aaaa: &RData) -> EncodeResult {
}
}
pub fn parse(tokens: Vec<Token>) -> ParseResult<RData> {
unimplemented!()
}
#[cfg(test)]
mod tests {

src/rr/rdata/cname.rs

@@ -6,7 +6,7 @@
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -14,6 +14,7 @@
* limitations under the License.
*/
use ::serialize::txt::*;
use ::serialize::binary::*;
use ::error::*;
use ::rr::record_data::RData;
@@ -49,4 +50,9 @@ pub fn emit(encoder: &mut BinEncoder, cname_data: &RData) -> EncodeResult {
}
}
pub fn parse(tokens: Vec<Token>) -> ParseResult<RData> {
unimplemented!()
}
// #[test] coverage is in the record_data module; the inner name is tested in domain::Name

src/rr/rdata/mx.rs

@@ -14,6 +14,7 @@
* limitations under the License.
*/
use ::serialize::txt::*;
use ::serialize::binary::*;
use ::error::*;
use ::rr::record_data::RData;
@@ -56,4 +57,9 @@ pub fn emit(encoder: &mut BinEncoder, mx: &RData) -> EncodeResult {
}
}
pub fn parse(tokens: Vec<Token>) -> ParseResult<RData> {
unimplemented!()
}
// #[test] coverage is in the record_data module; the inner name is tested in domain::Name

src/rr/rdata/ns.rs

@@ -14,6 +14,7 @@
* limitations under the License.
*/
use ::serialize::txt::*;
use ::serialize::binary::*;
use ::error::*;
use ::rr::record_data::RData;
@@ -55,3 +56,7 @@ pub fn emit(encoder: &mut BinEncoder, ns: &RData) -> EncodeResult {
panic!("wrong type here {:?}", ns);
}
}
pub fn parse(tokens: Vec<Token>) -> ParseResult<RData> {
unimplemented!()
}

src/rr/rdata/null.rs

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
use ::serialize::txt::*;
use ::serialize::binary::*;
use ::error::*;
use ::rr::record_data::RData;
@@ -58,3 +59,7 @@ pub fn emit(encoder: &mut BinEncoder, nil: &RData) -> EncodeResult {
panic!("wrong type here {:?}", nil);
}
}
pub fn parse(tokens: Vec<Token>) -> ParseResult<RData> {
unimplemented!()
}

src/rr/rdata/ptr.rs

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
use ::serialize::txt::*;
use ::serialize::binary::*;
use ::error::*;
use ::rr::record_data::RData;
@@ -48,3 +49,7 @@ pub fn emit(encoder: &mut BinEncoder, ptr: &RData) -> EncodeResult {
panic!("wrong type: {:?}", ptr)
}
}
pub fn parse(tokens: Vec<Token>) -> ParseResult<RData> {
unimplemented!()
}

src/rr/rdata/soa.rs

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
use ::serialize::txt::*;
use ::serialize::binary::*;
use ::error::*;
use ::rr::record_data::RData;
@@ -110,3 +111,7 @@ pub fn emit(encoder: &mut BinEncoder, soa: &RData) -> EncodeResult {
panic!("wrong type here {:?}", soa);
}
}
pub fn parse(tokens: Vec<Token>) -> ParseResult<RData> {
unimplemented!()
}

src/rr/rdata/txt.rs

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
use ::serialize::txt::*;
use ::serialize::binary::*;
use ::error::*;
use ::rr::record_data::RData;
@@ -52,3 +53,7 @@ pub fn emit(encoder: &mut BinEncoder, txt: &RData) -> EncodeResult {
panic!("wrong type here {:?}", txt);
}
}
pub fn parse(tokens: Vec<Token>) -> ParseResult<RData> {
unimplemented!()
}

src/rr/record_data.rs

@@ -18,6 +18,7 @@ use std::convert::From;
use ::error::*;
use ::serialize::binary::*;
use ::serialize::txt::*;
use super::domain::Name;
use super::record_type::RecordType;
use super::rdata;
@@ -327,8 +328,24 @@ pub enum RData {
AAAA { address: Ipv6Addr },
}
impl RData {
pub fn parse(record_type: RecordType, tokens: Vec<Token>) -> ParseResult<Self> {
match record_type {
RecordType::CNAME => rdata::cname::parse(tokens),
RecordType::MX => rdata::mx::parse(tokens),
RecordType::NULL => rdata::null::parse(tokens),
RecordType::NS => rdata::ns::parse(tokens),
RecordType::PTR => rdata::ptr::parse(tokens),
RecordType::SOA => rdata::soa::parse(tokens),
RecordType::TXT => rdata::txt::parse(tokens),
RecordType::A => rdata::a::parse(tokens),
RecordType::AAAA => rdata::aaaa::parse(tokens),
_ => panic!("unsupported RecordType: {:?}", record_type)
}
}
}
impl BinSerializable for RData {
// TODO, maybe move the rd_length into the BinDecoder
fn read(decoder: &mut BinDecoder) -> DecodeResult<Self> {
match try!(decoder.record_type().ok_or(DecodeError::NoRecordDataType)) {
RecordType::CNAME => rdata::cname::read(decoder),
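With this dispatch in place, the master-file parser only needs the RecordType it read earlier on the line plus the collected RDATA tokens; RData::parse routes them to the per-module parser. A hypothetical call site (all of the parse stubs are still unimplemented!(), so this would currently panic):

use ::error::*;
use ::rr::{RData, RecordType};
use ::serialize::txt::Token;

fn parse_a_rdata() -> ParseResult<RData> {
    let tokens = vec![Token::CharData("26.3.0.103".to_string())];
    // selects rdata::a::parse based on the record type
    RData::parse(RecordType::A, tokens)
}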

src/rr/record_type.rs

@@ -20,7 +20,7 @@ use ::error::*;
type FromResult = Result<RecordType, DecodeError>;
#[derive(Debug, PartialEq, PartialOrd, Copy, Clone)]
#[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Copy, Clone)]
#[allow(dead_code)]
pub enum RecordType {
A, // 1 RFC 1035[1] IPv4 Address record

src/serialize/txt/decoder.rs

@@ -1,31 +0,0 @@
/*
* Copyright (C) 2015 Benjamin Fry <benjaminfry@me.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
use ::error::{DecodeError, DecodeResult};
use ::rr::record_type::RecordType;
/// This is non-destructive to the inner buffer, b/c for pointer types we need to perform a reverse
/// seek to lookup names
///
/// A note on serialization, there was a thought to have this implement the rustc-serialization,
/// but given that this is such a small subset of all the serialization which that performs
/// this is a simpler implementation without the cruft, at least for serializing to/from the
/// binary DNS protocols. rustc-serialization will be used for other coms, e.g. json over http
pub struct TxtDecoder {
buffer: Vec<u8>,
index: usize,
record_type: Option<RecordType>,
rdata_length: Option<u16>,
}

src/serialize/txt/master.rs Normal file

@@ -0,0 +1,225 @@
/*
* Copyright (C) 2015 Benjamin Fry <benjaminfry@me.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
use std::collections::HashMap;
use std::io::Read;
use std::str::FromStr;
use std::fs::File;
use ::error::*;
use ::rr::Name;
use ::rr::RecordType;
use ::rr::Record;
use ::rr::DNSClass;
use ::rr::RData;
use super::master_lex::{Lexer, Token};
// 5. MASTER FILES
//
// Master files are text files that contain RRs in text form. Since the
// contents of a zone can be expressed in the form of a list of RRs a
// master file is most often used to define a zone, though it can be used
// to list a cache's contents. Hence, this section first discusses the
// format of RRs in a master file, and then the special considerations when
// a master file is used to create a zone in some name server.
//
// 5.1. Format
//
// The format of these files is a sequence of entries. Entries are
// predominantly line-oriented, though parentheses can be used to continue
// a list of items across a line boundary, and text literals can contain
// CRLF within the text. Any combination of tabs and spaces act as a
// delimiter between the separate items that make up an entry. The end of
// any line in the master file can end with a comment. The comment starts
// with a ";" (semicolon).
//
// The following entries are defined:
//
// <blank>[<comment>]
//
// $ORIGIN <domain-name> [<comment>]
//
// $INCLUDE <file-name> [<domain-name>] [<comment>]
//
// <domain-name><rr> [<comment>]
//
// <blank><rr> [<comment>]
//
// Blank lines, with or without comments, are allowed anywhere in the file.
//
// Two control entries are defined: $ORIGIN and $INCLUDE. $ORIGIN is
// followed by a domain name, and resets the current origin for relative
// domain names to the stated name. $INCLUDE inserts the named file into
// the current file, and may optionally specify a domain name that sets the
// relative domain name origin for the included file. $INCLUDE may also
// have a comment. Note that a $INCLUDE entry never changes the relative
// origin of the parent file, regardless of changes to the relative origin
// made within the included file.
//
// The last two forms represent RRs. If an entry for an RR begins with a
// blank, then the RR is assumed to be owned by the last stated owner. If
// an RR entry begins with a <domain-name>, then the owner name is reset.
//
// <rr> contents take one of the following forms:
//
// [<TTL>] [<class>] <type> <RDATA>
//
// [<class>] [<TTL>] <type> <RDATA>
//
// The RR begins with optional TTL and class fields, followed by a type and
// RDATA field appropriate to the type and class. Class and type use the
// standard mnemonics, TTL is a decimal integer. Omitted class and TTL
// values are default to the last explicitly stated values. Since type and
// class mnemonics are disjoint, the parse is unique. (Note that this
// order is different from the order used in examples and the order used in
// the actual RRs; the given order allows easier parsing and defaulting.)
//
// <domain-name>s make up a large share of the data in the master file.
// The labels in the domain name are expressed as character strings and
// separated by dots. Quoting conventions allow arbitrary characters to be
// stored in domain names. Domain names that end in a dot are called
// absolute, and are taken as complete. Domain names which do not end in a
// dot are called relative; the actual domain name is the concatenation of
// the relative part with an origin specified in a $ORIGIN, $INCLUDE, or as
// an argument to the master file loading routine. A relative name is an
// error when no origin is available.
//
// <character-string> is expressed in one or two ways: as a contiguous set
// of characters without interior spaces, or as a string beginning with a "
// and ending with a ". Inside a " delimited string any character can
// occur, except for a " itself, which must be quoted using \ (back slash).
//
// Because these files are text files several special encodings are
// necessary to allow arbitrary data to be loaded. In particular:
//
// of the root.
//
// @ A free standing @ is used to denote the current origin.
//
// \X where X is any character other than a digit (0-9), is
// used to quote that character so that its special meaning
// does not apply. For example, "\." can be used to place
// a dot character in a label.
//
// \DDD where each D is a digit is the octet corresponding to
// the decimal number described by DDD. The resulting
// octet is assumed to be text and is not checked for
// special meaning.
//
// ( ) Parentheses are used to group data that crosses a line
// boundary. In effect, line terminations are not
// recognized within parentheses.
//
// ; Semicolon is used to start a comment; the remainder of
// the line is ignored.
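//
// As a concrete illustration of the grammar above, a minimal entry
// (adapted from the RFC 1035 example zone; names illustrative only):
//
// $ORIGIN ISI.EDU.
// @   IN  SOA  VENERA  Action\.domains (
//                      20     ; SERIAL
//                      7200   ; REFRESH
//                      600    ; RETRY
//                      3600000; EXPIRE
//                      60 )   ; MINIMUM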
pub struct Parser {
records: HashMap<RecordType, Record>,
origin: Option<Name>,
}
impl Parser {
pub fn new() -> Self {
Parser { records: HashMap::new(), origin: None }
}
pub fn parse(&mut self, mut file: File) -> ParseResult<()> {
let mut buf = String::new();
// TODO: surface the io::Error through ParseError instead of panicking
file.read_to_string(&mut buf).expect("could not read the master file");
let mut lexer = Lexer::new(&buf);
let mut previous_name: Option<Name> = None;
let mut rtype: Option<RecordType> = None;
let mut ttl: Option<i32> = None;
let mut class: Option<DNSClass> = None;
let mut state = State::StartLine;
let mut tokens: Vec<Token> = Vec::new();
while let Some(t) = try!(lexer.next_token()) {
state = match state {
State::StartLine => {
rtype = None;
ttl = None;
class = None;
tokens.clear();
match t {
// the control entries $INCLUDE and $ORIGIN are now their own tokens
Token::Include => unimplemented!(),
Token::Origin => unimplemented!(),
// if CharData, then Name then ttl_class_type
Token::CharData(..) => unimplemented!(),
// if blank, then nothing or ttl_class_type... grr...
Token::Blank => unimplemented!(),
Token::EOL => State::StartLine, // probably a comment
_ => return Err(ParseError::UnrecognizedToken(t)),
}
},
State::Ttl_Class_Type => {
match t {
// one of TTL, Class, or Type; the mnemonics are disjoint, so the parse is unique
Token::CharData(ref data) => {
if let Ok(num) = i32::from_str(data) {
// if it's a number it's a ttl
ttl = Some(num);
State::Ttl_Class_Type
} else if let Ok(c) = DNSClass::from_str(data) {
// if it parses as a DNSClass, then it's the class
class = Some(c);
State::Ttl_Class_Type
} else {
// otherwise it must be the RecordType
rtype = Some(try!(RecordType::from_str(data)));
State::Record
}
}
// could be nothing if started with blank and is a comment, i.e. EOL
Token::EOL => State::StartLine, // next line
_ => return Err(ParseError::UnrecognizedToken(t)),
}
},
State::Record => {
// b/c of ownership rules, perhaps, just collect all the RData components as a list of
// tokens to pass into the processor
match t {
Token::EOL => State::EndRecord,
_ => { tokens.push(t); State::Record },
}
},
State::EndRecord => {
// call out to the parsers for the different record types
// TODO: store the result, e.g. into self.records
let _rdata = try!(RData::parse(rtype.expect("record type not set"), tokens.clone()));
State::StartLine
},
}
}
// get the last one...
Ok(())
}
}
enum State {
StartLine, // start of line, @, $<WORD>, Name, Blank
Ttl_Class_Type, // [<TTL>] [<class>] <type>,
Record,
EndRecord,
}
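To make the state machine concrete: for a line such as "VENERA A 10.1.0.52\n" the lexer yields CharData("VENERA"), CharData("A"), CharData("10.1.0.52"), then EOL, and the parser is intended to step StartLine -> Ttl_Class_Type -> Record -> EndRecord. A minimal sketch of driving the lexer by hand (assuming Lexer is imported as in this module; illustrative only):

use ::error::LexerResult;
use super::master_lex::Lexer;

fn dump_tokens(text: &str) -> LexerResult<()> {
    let mut lexer = Lexer::new(text);
    // next_token yields Ok(None) once the input is exhausted
    while let Some(token) = try!(lexer.next_token()) {
        println!("{:?}", token);
    }
    Ok(())
}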

src/serialize/txt/master_lex.rs

@@ -8,6 +8,7 @@ use ::error::{LexerResult,LexerError};
pub struct Lexer<'a> {
txt: Peekable<Chars<'a>>,
is_first_line: bool,
in_list: bool,
}
impl<'a> Lexer<'a> {
@@ -16,7 +17,7 @@ impl<'a> Lexer<'a> {
}
pub fn with_chars(chars: Chars) -> Lexer {
Lexer { txt: chars.peekable(), is_first_line: true }
Lexer { txt: chars.peekable(), is_first_line: true, in_list: false }
}
pub fn next_token(&mut self) -> LexerResult<Option<Token>> {
@@ -28,80 +29,113 @@ impl<'a> Lexer<'a> {
assert!(i < 4095); // keeps the bounds of the loop defined (nothing lasts forever)
// This is to get around mutability rules such that we can peek at the iter without moving next...
let ch: char = {
//let mut peekable = self.txt.by_ref().peekable();
let next_ch: Option<&char> = self.txt.peek();
if next_ch.is_some() { *next_ch.unwrap() } else { break 'out }
};
let ch: char = if let Some(c) = self.peek() { c } else { break 'out };
// collectors
if let Some(t) = cur_token.get() {
if let State::Comment = t {
let ch = self.txt.next();
if ch.is_none() || ch.unwrap() == '\n' { return Ok(Some(Token::EOL)); } // special case for comments
else { continue 'out } // gobbling rest of line for comment
} else if let State::Quote = t {
match ch {
'"' => { cur_token.set(Some(State::Quoted)) ; break 'out },
'/' => try!(self.escape_seq().and_then(|ch|Ok(self.push(State::Quote, &cur_token, &cur_string, ch)))),
_ => self.push(State::Quote, &cur_token, &cur_string, ch),
}
match t {
State::Comment => {
match ch {
'\n' => cur_token.set(None), // out of the comment
_ => { self.txt.next(); }, // advance the token by default
}
continue 'out; // skipping rest of processing for quoted strings.
continue 'out;
},
State::Quote => {
match ch {
'"' => { cur_token.set(Some(State::Quoted)); self.txt.next() ; break 'out },
'\\' => try!(self.escape_seq().and_then(|ch|Ok(self.push(State::Quote, &cur_token, &cur_string, ch)))),
_ => self.push(State::Quote, &cur_token, &cur_string, ch),
}
continue 'out; // skipping rest of processing for quoted strings.
}
State::Dollar => {
match ch {
'A' ... 'Z' => { self.push(State::Dollar, &cur_token, &cur_string, ch); continue 'out },
_ => { break 'out},
}
}
_ => (),// do nothing
}
}
// general case match for all other states...
match ch {
' '|'\t' => {
if self.is_first_line { self.set_token_if_not(State::Blank, &cur_token); break } // need the first blank on a line
if cur_token.get().is_some() { break } else { self.txt.next(); continue } // gobble all whitespace
},
'a' ... 'z' | 'A' ... 'Z' | '-' => { self.push(State::CharData, &cur_token, &cur_string, ch); },
'0' ... '9' => { self.push(State::Number, &cur_token, &cur_string, ch); },
'\u{E000}' ... '\u{10FFFF}' if ch.is_alphanumeric() => { self.push(State::CharData, &cur_token, &cur_string, ch); },
'a' ... 'z' | 'A' ... 'Z' | '-' | '.' | '0' ... '9' => { self.push(State::CharData, &cur_token, &cur_string, ch); },
'\r' => if cur_token.get().is_some() { break } else { self.txt.next(); continue },
'\n' => { self.set_token_if_not(State::EOL, &cur_token); self.is_first_line = true; break },
'@' => { self.set_token_if_not(State::At, &cur_token); break },
'\n' => {
if self.in_list {
// in a list act like a standard whitespace.
if cur_token.get().is_some() {
break
} else {
self.txt.next(); continue
}
} else {
self.set_token_if_not(State::EOL, &cur_token);
self.is_first_line = true;
break
}
},
'@' => { self.set_token_if_not(State::At, &cur_token); break },
'$' => if self.set_token_if_not(State::Dollar, &cur_token) { continue } else { break },
'(' => { self.set_token_if_not(State::LeftParen, &cur_token); break },
')' => { self.set_token_if_not(State::RightParen, &cur_token); break },
'(' => {
if self.set_token_if_not(State::StartList, &cur_token) {
if self.in_list { return Err(LexerError::IllegalCharacter(ch)) }
else { self.in_list = true; }
}
break
},
')' => {
if self.set_token_if_not(State::EndList, &cur_token) {
if !self.in_list { return Err(LexerError::IllegalCharacter(ch)) }
else { self.in_list = false; }
}
break
},
'"' => if self.set_token_if_not(State::Quote, &cur_token) { continue } else { break },
';' => if self.set_token_if_not(State::Comment, &cur_token) { continue } else { break },
'.' => { self.set_token_if_not(State::Dot, &cur_token) ; break },
'\\' => {
try!(self.escape_seq().and_then(|c|Ok(self.push(State::CharData, &cur_token, &cur_string, c))));
continue;
},
_ => return Err(LexerError::UnrecognizedChar(ch)),
_ if !ch.is_control() && !ch.is_whitespace() => { self.push(State::CharData, &cur_token, &cur_string, ch); },
_ => return Err(LexerError::UnrecognizedChar(ch)),
}
}
// if the token is unset, then we are at end of stream, aka None
if cur_token.get().is_none() { return Ok(None); }
Token::from(cur_token.get().unwrap(), cur_string.into_inner())
match cur_token.get() {
Some(State::Quote) => Err(LexerError::UnclosedQuotedString),
None if self.in_list => Err(LexerError::UnclosedList),
None => Ok(None),
Some(s) => Token::from(s, cur_string.into_inner()),
}
}
fn escape_seq(&mut self) -> LexerResult<char> {
// escaped character, let's decode it.
self.txt.next(); // consume the escape
let ch = {
let ch_opt = self.txt.peek(); // the next character
if ch_opt.is_none() { return Err(LexerError::EOF) }
*ch_opt.unwrap()
};
let ch = try!(self.peek().ok_or(LexerError::EOF));
if !ch.is_control() {
if ch.is_numeric() {
// in this case it's an escaped octal: \DDD
let d1 = self.txt.next(); // gobble
let d2 = self.txt.next(); // gobble
let d3 = try!(self.peek()); // peek b/c the push will advance
if d2.is_none() { return Err(LexerError::EOF) }
let d1 = try!(self.txt.next().ok_or(LexerError::EOF)); // gobble
let d2 = try!(self.txt.next().ok_or(LexerError::EOF)); // gobble
let d3 = try!(self.peek().ok_or(LexerError::EOF)); // peek b/c the push will advance
// let ddd: [u8; 3] = [d1.unwrap() as u8, d2.unwrap() as u8, *d3.unwrap() as u8];
// let ch: char = try!(u32::from_str_radix(&ddd.into(), 8)
let ddd: String = try!(String::from_utf8(vec![d1.unwrap() as u8, d2.unwrap() as u8, d3 as u8]));
let ddd: String = try!(String::from_utf8(vec![d1 as u8, d2 as u8, d3 as u8]));
let ch: char = try!(u32::from_str_radix(&ddd, 8)
.or(Err(LexerError::BadEscapedData(ddd)))
.and_then(|o|char::from_u32(o).ok_or(LexerError::UnrecognizedOctet(o))));
@@ -118,10 +152,8 @@ impl<'a> Lexer<'a> {
}
fn peek(&mut self) -> LexerResult<char> {
let ch_opt = self.txt.peek(); // the next character
if ch_opt.is_none() { return Err(LexerError::EOF) }
Ok(*ch_opt.unwrap())
fn peek(&mut self) -> Option<char> {
self.txt.peek().map(|c|*c)
}
/// Sets the token if it's not set; if successful, it advances the txt iter
@@ -129,7 +161,7 @@ impl<'a> Lexer<'a> {
self.is_first_line = false;
if cur_token.get().is_none() {
cur_token.set(Some(next_state));
self.txt.next();
self.txt.next(); // if we set a new state, it means we can consume the char
true
} else {
false
@@ -155,13 +187,11 @@ impl<'a> Lexer<'a> {
#[derive(Copy, Clone, PartialEq)]
pub enum State {
Blank, // only if the first part of the line
Dot, // .
LeftParen, // (
RightParen, // )
StartList, // (
EndList, // )
CharData, // [a-zA-Z, non-control utf8]+
Comment, // ;.*
At, // @
Number, // [0-9]+
Quote, // ".*"
Quoted, // finish the quoted sequence
Dollar, // $
@@ -171,14 +201,13 @@ pub enum State {
#[derive(PartialEq, Debug)]
pub enum Token {
Blank, // only if the first part of the line
Dot, // .
LeftParen, // (
RightParen, // )
CharData(String), // [a-zA-Z, non-control utf8]+
StartList, // (
EndList, // )
CharData(String), // [a-zA-Z, non-control utf8, ., -, 0-9]+
At, // @
Number(i32), // [0-9]+
Quote(String), // ".*"
Dollar(String), // $
Include, // $INCLUDE
Origin, // $ORIGIN
EOL, // \n or \r\n
}
@@ -186,16 +215,19 @@ impl Token {
pub fn from(state: State, value: Option<String>) -> LexerResult<Option<Token>> {
Ok(Some(match state {
State::Blank => Token::Blank,
State::Dot => Token::Dot,
State::LeftParen => Token::LeftParen,
State::RightParen => Token::RightParen,
State::StartList => Token::StartList,
State::EndList => Token::EndList,
State::CharData => Token::CharData(value.unwrap()),
State::Comment => Token::EOL, // comments can't end a sequence, so must be EOF/EOL
State::At => Token::At,
State::Number => Token::Number(value.unwrap().parse().unwrap()),
State::Quote => return Err(LexerError::UnclosedQuotedString),
State::Quoted => Token::Quote(value.unwrap_or_default()),
State::Dollar => Token::Dollar(value.unwrap_or_default()),
State::Dollar => {
let s = value.unwrap_or_default();
if "INCLUDE".to_string() == s { Token::Include }
else if "ORIGIN".to_string() == s { Token::Origin }
else { return Err(LexerError::UnrecognizedDollar(s)) }
},
State::EOL => Token::EOL,
}))
}
@@ -203,6 +235,7 @@ impl Token {
#[cfg(test)]
mod lex_test {
use ::error::*;
use super::*;
#[test]
@@ -228,30 +261,103 @@
}
// fun with tests!!! lots of options
#[test]
fn lex() {
assert_eq!(Lexer::new(".").next_token().unwrap().unwrap(), Token::Dot);
assert_eq!(Lexer::new(" .").next_token().unwrap().unwrap(), Token::Blank);
assert_eq!(Lexer::new("(").next_token().unwrap().unwrap(), Token::LeftParen);
assert_eq!(Lexer::new(")").next_token().unwrap().unwrap(), Token::RightParen);
assert_eq!(Lexer::new("abc").next_token().unwrap().unwrap(), Token::CharData("abc".to_string()));
assert_eq!(Lexer::new("abc.").next_token().unwrap().unwrap(), Token::CharData("abc".to_string()));
fn escape() {
assert_eq!(Lexer::new("a\\A").next_token().unwrap().unwrap(), Token::CharData("aA".to_string()));
assert_eq!(Lexer::new("a\\$").next_token().unwrap().unwrap(), Token::CharData("a$".to_string()));
assert_eq!(Lexer::new("a\\077").next_token().unwrap().unwrap(), Token::CharData("a?".to_string()));
assert!(Lexer::new("a\\").next_token().is_err());
assert!(Lexer::new("a\\0").next_token().is_err());
assert!(Lexer::new("a\\07").next_token().is_err());
}
#[test]
fn quoted_txt() {
assert_eq!(Lexer::new("\"Quoted\"").next_token().unwrap().unwrap(), Token::Quote("Quoted".to_string()));
assert_eq!(Lexer::new("\";@$\"").next_token().unwrap().unwrap(), Token::Quote(";@$".to_string()));
assert_eq!(Lexer::new("\"some \\A\"").next_token().unwrap().unwrap(), Token::Quote("some A".to_string()));
let mut lexer = Lexer::new("\"multi\nline\ntext\"");
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Quote("multi\nline\ntext".to_string()));
assert_eq!(lexer.next_token().unwrap(), None);
let mut lexer = Lexer::new("\"multi\r\nline\r\ntext\"\r\n");
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Quote("multi\r\nline\r\ntext".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap(), None);
assert!(Lexer::new("\"multi").next_token().is_err());
}
#[test]
fn unicode() {
assert_eq!(Lexer::new("").next_token().unwrap().unwrap(), Token::CharData("".to_string()));
}
// fun with tests!!! lots of options
#[test]
fn lex() {
assert_eq!(Lexer::new(".").next_token().unwrap().unwrap(), Token::CharData(".".to_string()));
assert_eq!(Lexer::new(" .").next_token().unwrap().unwrap(), Token::Blank);
assert_eq!(Lexer::new("abc").next_token().unwrap().unwrap(), Token::CharData("abc".to_string()));
assert_eq!(Lexer::new("abc.").next_token().unwrap().unwrap(), Token::CharData("abc.".to_string()));
assert_eq!(Lexer::new(";abc").next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(Lexer::new(";;@$-\"").next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(Lexer::new("@").next_token().unwrap().unwrap(), Token::At);
assert_eq!(Lexer::new("123").next_token().unwrap().unwrap(), Token::Number(123));
assert_eq!(Lexer::new("\"Quoted\"").next_token().unwrap().unwrap(), Token::Quote("Quoted".to_string()));
assert_eq!(Lexer::new("\";@$\"").next_token().unwrap().unwrap(), Token::Quote(";@$".to_string()));
assert_eq!(Lexer::new("$Bill").next_token().unwrap().unwrap(), Token::Dollar("Bill".to_string()));
assert_eq!(Lexer::new("$$Bill").next_token().unwrap().unwrap(), Token::Dollar("".to_string()));
assert_eq!(Lexer::new("123").next_token().unwrap().unwrap(), Token::CharData("123".to_string()));
assert_eq!(Lexer::new("$INCLUDE").next_token().unwrap().unwrap(), Token::Include);
assert_eq!(Lexer::new("$ORIGIN").next_token().unwrap().unwrap(), Token::Origin);
assert_eq!(Lexer::new("\n").next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(Lexer::new("\r\n").next_token().unwrap().unwrap(), Token::EOL);
}
#[test]
fn list() {
let mut lexer = Lexer::new("(");
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::StartList);
assert!(lexer.next_token().is_err());
assert!(Lexer::new(")").next_token().is_err());
let mut lexer = Lexer::new("()");
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::StartList);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EndList);
assert_eq!(lexer.next_token().unwrap(), None);
let mut lexer = Lexer::new("(abc)");
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::StartList);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("abc".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EndList);
assert_eq!(lexer.next_token().unwrap(), None);
let mut lexer = Lexer::new("(\nabc\n)");
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::StartList);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("abc".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EndList);
assert_eq!(lexer.next_token().unwrap(), None);
let mut lexer = Lexer::new("(\nabc\nabc)");
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::StartList);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("abc".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("abc".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EndList);
assert_eq!(lexer.next_token().unwrap(), None);
let mut lexer = Lexer::new("(\n\"abc\"\n)");
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::StartList);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Quote("abc".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EndList);
assert_eq!(lexer.next_token().unwrap(), None);
let mut lexer = Lexer::new("(\n\"abc\";comment\n)");
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::StartList);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Quote("abc".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EndList);
assert_eq!(lexer.next_token().unwrap(), None);
}
#[test]
fn soa() {
let mut lexer = Lexer::new("@ IN SOA VENERA Action\\.domains (\n\
@@ -272,34 +378,24 @@ A A 26.3.0.103\n\
VENERA A 10.1.0.52\n\
A 128.9.0.32\n\
\n\
$INCLUDE \\<SUBSYS\\>ISI-MAILBOXES.TXT");
$INCLUDE <SUBSYS>ISI-MAILBOXES.TXT");
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::At);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("IN".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("SOA".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("VENERA".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("Action.domains".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::LeftParen);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(20));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(7200));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(600));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(3600000));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(60));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::RightParen);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::StartList);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("20".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("7200".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("600".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("3600000".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("60".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EndList);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("NS".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("A".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Dot);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("ISI".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Dot);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("EDU".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Dot);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("A.ISI.EDU.".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("NS".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("VENERA".to_string()));
@@ -308,49 +404,29 @@ $INCLUDE \\<SUBSYS\\>ISI-MAILBOXES.TXT");
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("VAXA".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("MX".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(10));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("10".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("VENERA".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("MX".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(20));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("20".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("VAXA".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("A".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("A".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(26));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Dot);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(3));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Dot);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(0));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Dot);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(103));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("26.3.0.103".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("VENERA".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("A".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(10));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Dot);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(1));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Dot);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(0));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Dot);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(52));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("10.1.0.52".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("A".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(128));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Dot);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(9));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Dot);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(0));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Dot);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Number(32));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("128.9.0.32".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::EOL);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Dollar("INCLUDE".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("<SUBSYS>ISI-MAILBOXES".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Dot);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("TXT".to_string()));
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::Include);
assert_eq!(lexer.next_token().unwrap().unwrap(), Token::CharData("<SUBSYS>ISI-MAILBOXES.TXT".to_string()));
assert!(lexer.next_token().unwrap().is_none());
}
}
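The escape handling above is easiest to check by hand: in "a\077" the lexer consumes the backslash, sees a digit, reads the three octal digits, and 077 octal = 63 decimal = '?', which is exactly what the escape() test asserts. The core of that decode in isolation (a standalone sketch of the same logic, not the lexer's exact code):

use std::char;

fn decode_octal_escape(ddd: &str) -> Option<char> {
    // "077" -> 0o77 = 63 -> '?'
    u32::from_str_radix(ddd, 8).ok().and_then(char::from_u32)
}

fn main() {
    assert_eq!(decode_octal_escape("077"), Some('?'));
}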

src/serialize/txt/mod.rs

@@ -13,22 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// mod master;
mod master_lex;
mod decoder;
//mod encoder;
pub use self::decoder::TxtDecoder;
//pub use self::encoder::TxtEncoder;
//#[cfg(test)]
//pub mod txt_tests;
//use ::error::*;
// pub trait TxtSerializable {
// fn read(decoder: &mut TxtDecoder) -> DecodeResult<Self>;
// fn emit(&self, encoder: &mut TxtEncoder) -> EncodeResult;
// }
// mod master;
//
// pub use self::master::Parser;
pub use self::master_lex::Token;