proto: handle includes in the parser

This commit is contained in:
Dirkjan Ochtman 2023-08-03 14:02:46 +02:00 committed by Benjamin Fry
parent 4b334aee36
commit 77e06525c1
6 changed files with 229 additions and 253 deletions

View File

@ -335,7 +335,7 @@ mod tests {
// TODO: make Parser return an iterator over all records in a stream.
fn parse_record<D: RecordData>(txt: &str) -> D {
let lex = Lexer::new(txt);
let records = Parser::new(lex, Some(Name::root()))
let records = Parser::new(lex, None, Some(Name::root()))
.parse()
.expect("failed to parse record")
.1;

View File

@ -5,7 +5,12 @@
// http://opensource.org/licenses/MIT>, at your option. This file may not be
// copied, modified, or distributed except according to those terms.
use std::{collections::BTreeMap, str::FromStr};
use std::{
collections::BTreeMap,
fs, mem,
path::{Path, PathBuf},
str::FromStr,
};
use crate::{
rr::{DNSClass, LowerName, Name, RData, Record, RecordSet, RecordType, RrKey},
@ -117,14 +122,20 @@ use crate::{
/// the line is ignored.
/// ```
pub struct Parser<'a> {
lexer: Lexer<'a>,
lexers: Vec<(Lexer<'a>, Option<PathBuf>)>,
origin: Option<Name>,
}
impl<'a> Parser<'a> {
/// Returns a new Zone file parser
pub fn new(lexer: Lexer<'a>, origin: Option<Name>) -> Self {
Self { lexer, origin }
///
/// The `path` argument's parent directory is used to resolve relative `$INCLUDE` paths.
/// Relative `$INCLUDE` paths will yield an error if `path` is `None`.
pub fn new(lexer: Lexer<'a>, path: Option<PathBuf>, origin: Option<Name>) -> Self {
Self {
lexers: vec![(lexer, path)],
origin,
}
}
/// Parse a file from the Lexer
@ -132,146 +143,196 @@ impl<'a> Parser<'a> {
/// # Return
///
/// A pair of the Zone origin name and a map of all Keys to RecordSets
pub fn parse(self) -> ParseResult<(Name, BTreeMap<RrKey, RecordSet>)> {
let Self {
mut lexer,
mut origin,
} = self;
pub fn parse(mut self) -> ParseResult<(Name, BTreeMap<RrKey, RecordSet>)> {
let mut origin = self.origin;
let mut records: BTreeMap<RrKey, RecordSet> = BTreeMap::new();
let mut class: DNSClass = DNSClass::IN;
let mut current_name: Option<Name> = None;
let mut rtype: Option<RecordType> = None;
let mut ttl: Option<u32> = None;
let mut state = State::StartLine;
let mut stack = self.lexers.len();
while let Some(t) = lexer.next_token()? {
state = match state {
State::StartLine => {
// current_name is not reset on the next line b/c it might be needed from the previous
rtype = None;
'outer: while let Some((lexer, path)) = self.lexers.last_mut() {
while let Some(t) = lexer.next_token()? {
state = match state {
State::StartLine => {
// current_name is not reset on the next line b/c it might be needed from the previous
rtype = None;
match t {
// if Dollar, then $INCLUDE or $ORIGIN
Token::Include => {
return Err(ParseError::from(ParseErrorKind::Message("The parser does not support $INCLUDE. Consider inlining file before parsing")))
},
Token::Origin => State::Origin,
Token::Ttl => State::Ttl,
match t {
// if Dollar, then $INCLUDE or $ORIGIN
Token::Include => State::Include(None),
Token::Origin => State::Origin,
Token::Ttl => State::Ttl,
// if CharData, then Name then ttl_class_type
Token::CharData(data) => {
current_name = Some(Name::parse(&data, origin.as_ref())?);
State::TtlClassType
// if CharData, then Name then ttl_class_type
Token::CharData(data) => {
current_name = Some(Name::parse(&data, origin.as_ref())?);
State::TtlClassType
}
// @ is a placeholder for specifying the current origin
Token::At => {
current_name = origin.clone(); // TODO a COW or RC would reduce copies...
State::TtlClassType
}
// if blank, then nothing or ttl_class_type
Token::Blank => State::TtlClassType,
Token::EOL => State::StartLine, // probably a comment
_ => return Err(ParseErrorKind::UnexpectedToken(t).into()),
}
// @ is a placeholder for specifying the current origin
Token::At => {
current_name = origin.clone(); // TODO a COW or RC would reduce copies...
State::TtlClassType
}
// if blank, then nothing or ttl_class_type
Token::Blank => State::TtlClassType,
Token::EOL => State::StartLine, // probably a comment
_ => return Err(ParseErrorKind::UnexpectedToken(t).into()),
}
}
State::Ttl => match t {
Token::CharData(data) => {
ttl = Some(Self::parse_time(&data)?);
State::StartLine
}
_ => return Err(ParseErrorKind::UnexpectedToken(t).into()),
},
State::Origin => {
match t {
State::Ttl => match t {
Token::CharData(data) => {
// TODO an origin was specified, should this be legal? definitely confusing...
origin = Some(Name::parse(&data, None)?);
ttl = Some(Self::parse_time(&data)?);
State::StartLine
}
_ => return Err(ParseErrorKind::UnexpectedToken(t).into()),
},
State::Origin => {
match t {
Token::CharData(data) => {
// TODO an origin was specified, should this be legal? definitely confusing...
origin = Some(Name::parse(&data, None)?);
State::StartLine
}
_ => return Err(ParseErrorKind::UnexpectedToken(t).into()),
}
}
}
State::Include => return Err(ParseError::from(ParseErrorKind::Message(
"The parser does not support $INCLUDE. Consider inlining file before parsing",
))),
State::TtlClassType => {
match t {
// if number, TTL
// Token::Number(ref num) => ttl = Some(*num),
// One of Class or Type (these cannot be overlapping!)
Token::CharData(mut data) => {
// if it's a number it's a ttl
let result: ParseResult<u32> = Self::parse_time(&data);
if result.is_ok() {
ttl = result.ok();
State::TtlClassType // hm, should this go to just ClassType?
} else {
// if can parse DNSClass, then class
data.make_ascii_uppercase();
let result = DNSClass::from_str(&data);
if let Ok(parsed) = result {
class = parsed;
State::TtlClassType
State::Include(include_path) => match (t, include_path) {
(Token::CharData(data), None) => State::Include(Some(data)),
(Token::EOL, Some(include_path)) => {
// RFC1035 (section 5) does not specify how filename for $INCLUDE
// should be resolved into file path. The underlying code implements the
// following:
// * if the path is absolute (relies on Path::is_absolute), it uses normalized path
// * otherwise, it joins the path with parent root of the current file
//
// TODO: Inlining files specified using non-relative path might potentially introduce
// security issue in some cases (e.g. when working with zone files from untrusted sources)
// and should probably be configurable by user.
if stack > MAX_INCLUDE_LEVEL {
return Err(ParseErrorKind::Message(
"Max depth level for nested $INCLUDE is reached",
)
.into());
}
let include = Path::new(&include_path);
let include = match (include.is_absolute(), path) {
(true, _) => include.to_path_buf(),
(false, Some(path)) => path
.parent()
.expect("file has to have parent folder")
.join(include),
(false, None) => {
return Err(ParseErrorKind::Message(
"Relative $INCLUDE is not supported",
)
.into());
}
};
let input = fs::read_to_string(&include)?;
let lexer = Lexer::new(input);
self.lexers.push((lexer, Some(include)));
stack += 1;
state = State::StartLine;
continue 'outer;
}
(Token::CharData(_), Some(_)) => {
return Err(ParseErrorKind::Message(
"Domain name for $INCLUDE is not supported",
)
.into());
}
(t, _) => {
return Err(ParseErrorKind::UnexpectedToken(t).into());
}
},
State::TtlClassType => {
match t {
// if number, TTL
// Token::Number(ref num) => ttl = Some(*num),
// One of Class or Type (these cannot be overlapping!)
Token::CharData(mut data) => {
// if it's a number it's a ttl
let result: ParseResult<u32> = Self::parse_time(&data);
if result.is_ok() {
ttl = result.ok();
State::TtlClassType // hm, should this go to just ClassType?
} else {
// if can parse RecordType, then RecordType
rtype = Some(RecordType::from_str(&data)?);
State::Record(vec![])
// if can parse DNSClass, then class
data.make_ascii_uppercase();
let result = DNSClass::from_str(&data);
if let Ok(parsed) = result {
class = parsed;
State::TtlClassType
} else {
// if can parse RecordType, then RecordType
rtype = Some(RecordType::from_str(&data)?);
State::Record(vec![])
}
}
}
// could be nothing if started with blank and is a comment, i.e. EOL
Token::EOL => {
State::StartLine // next line
}
_ => return Err(ParseErrorKind::UnexpectedToken(t).into()),
}
// could be nothing if started with blank and is a comment, i.e. EOL
Token::EOL => {
State::StartLine // next line
}
_ => return Err(ParseErrorKind::UnexpectedToken(t).into()),
}
}
State::Record(record_parts) => {
// b/c of ownership rules, perhaps, just collect all the RData components as a list of
// tokens to pass into the processor
match t {
Token::EOL => {
Self::flush_record(
record_parts,
&origin,
&current_name,
rtype,
&mut ttl,
class,
&mut records,
)?;
State::StartLine
State::Record(record_parts) => {
// b/c of ownership rules, perhaps, just collect all the RData components as a list of
// tokens to pass into the processor
match t {
Token::EOL => {
Self::flush_record(
record_parts,
&origin,
&current_name,
rtype,
&mut ttl,
class,
&mut records,
)?;
State::StartLine
}
Token::CharData(part) => {
let mut record_parts = record_parts;
record_parts.push(part);
State::Record(record_parts)
}
// TODO: we should not tokenize the list...
Token::List(list) => {
let mut record_parts = record_parts;
record_parts.extend(list);
State::Record(record_parts)
}
_ => return Err(ParseErrorKind::UnexpectedToken(t).into()),
}
Token::CharData(part) => {
let mut record_parts = record_parts;
record_parts.push(part);
State::Record(record_parts)
}
// TODO: we should not tokenize the list...
Token::List(list) => {
let mut record_parts = record_parts;
record_parts.extend(list);
State::Record(record_parts)
}
_ => return Err(ParseErrorKind::UnexpectedToken(t).into()),
}
}
};
}
}
//Extra flush at the end for the case of missing endline
if let State::Record(record_parts) = state {
Self::flush_record(
record_parts,
&origin,
&current_name,
rtype,
&mut ttl,
class,
&mut records,
)?;
// Extra flush at the end for the case of missing endline
if let State::Record(record_parts) = mem::replace(&mut state, State::StartLine) {
Self::flush_record(
record_parts,
&origin,
&current_name,
rtype,
&mut ttl,
class,
&mut records,
)?;
}
stack -= 1;
self.lexers.pop();
}
//
@ -455,10 +516,13 @@ enum State {
TtlClassType, // [<TTL>] [<class>] <type>,
Ttl, // $TTL <time>
Record(Vec<String>),
Include, // $INCLUDE <filename>
Include(Option<String>), // $INCLUDE <filename>
Origin,
}
/// Max traversal depth for nested `$INCLUDE` directives.
///
/// Exceeding this depth aborts parsing with an error; since a cyclic
/// `$INCLUDE` chain keeps growing the lexer stack, this bound also
/// terminates include cycles (with a less specific error message).
const MAX_INCLUDE_LEVEL: usize = 256;
#[cfg(test)]
mod tests {
use super::*;
@ -473,7 +537,7 @@ mod tests {
"#;
let lexer = Lexer::new(zone_data);
let result = Parser::new(lexer, Some(domain)).parse();
let result = Parser::new(lexer, None, Some(domain)).parse();
assert!(
result.is_err()
& result

View File

@ -5,23 +5,26 @@
// http://opensource.org/licenses/MIT>, at your option. This file may not be
// copied, modified, or distributed except according to those terms.
use std::char;
use std::iter::Peekable;
use std::str::Chars;
use std::borrow::Cow;
use std::{char, iter::Peekable};
use crate::serialize::txt::errors::{LexerError, LexerErrorKind, LexerResult};
/// A Lexer for Zone files
pub struct Lexer<'a> {
txt: Peekable<Chars<'a>>,
txt: Peekable<CowChars<'a>>,
state: State,
}
impl<'a> Lexer<'a> {
/// Creates a new lexer with the given data to parse
pub fn new(txt: &str) -> Lexer<'_> {
pub fn new(txt: impl Into<Cow<'a, str>>) -> Lexer<'a> {
Lexer {
txt: txt.chars().peekable(),
txt: CowChars {
data: txt.into(),
offset: 0,
}
.peekable(),
state: State::StartLine,
}
}
@ -327,7 +330,27 @@ impl<'a> Lexer<'a> {
}
fn peek(&mut self) -> Option<char> {
self.txt.peek().cloned()
self.txt.peek().copied()
}
}
/// A `char` iterator over a `Cow<'a, str>`.
///
/// `str::chars()` borrows from the string it iterates, so a `Lexer` holding
/// owned input (e.g. a freshly read `$INCLUDE` file) could not use it
/// directly. `CowChars` instead tracks a byte `offset` into its own `data`,
/// allowing the same lexer type to work over borrowed or owned text.
struct CowChars<'a> {
    data: Cow<'a, str>,
    offset: usize,
}

impl<'a> Iterator for CowChars<'a> {
    type Item = char;

    fn next(&mut self) -> Option<char> {
        // Decode the character at `offset`, then advance by its UTF-8
        // encoded length so `offset` always stays on a char boundary.
        let ch = self.data[self.offset..].chars().next()?;
        self.offset += ch.len_utf8();
        Some(ch)
    }
}

View File

@ -9,8 +9,7 @@
use std::{
collections::BTreeMap,
fs::File,
io::{BufRead, BufReader},
fs,
ops::{Deref, DerefMut},
path::{Path, PathBuf},
};
@ -25,7 +24,7 @@ use crate::{
use crate::{
authority::{Authority, LookupError, LookupOptions, MessageRequest, UpdateResult, ZoneType},
proto::rr::{LowerName, Name, RecordSet, RecordType, RrKey},
proto::serialize::txt::{Lexer, Parser, Token},
proto::serialize::txt::{Lexer, Parser},
server::RequestInfo,
store::{file::FileConfig, in_memory::InMemoryAuthority},
};
@ -36,36 +35,6 @@ use crate::{
/// start of authority for the zone, is a Secondary, or a cached zone.
pub struct FileAuthority(InMemoryAuthority);
/// Max traversal depth for $INCLUDE files
const MAX_INCLUDE_LEVEL: u16 = 256;
/// Inner state of zone file loader, tracks depth of $INCLUDE
/// loads as well as visited previously files, so the loader
/// is able to abort e.g. when cycle is detected
///
/// Note, that tracking max depth level explicitly covers also
/// cycles in $INCLUDEs. The error description in this case would
/// not be very helpful to detect the root cause of the problem
/// though. The way to improve diagnose experience would be to
/// traverse $INCLUDE files in topologically sorted order which
/// requires quite some re-arrangements in the code and in the
/// way loader is currently implemented.
struct FileReaderState {
level: u16,
}
impl FileReaderState {
fn new() -> Self {
Self { level: 0 }
}
fn next_level(&self) -> Self {
Self {
level: self.level + 1,
}
}
}
impl FileAuthority {
/// Creates a new Authority.
///
@ -91,85 +60,6 @@ impl FileAuthority {
InMemoryAuthority::new(origin, records, zone_type, allow_axfr).map(Self)
}
/// Read given file line by line and recursively invokes reader for
/// $INCLUDE directives
///
/// TODO: it looks hacky as far we effectively duplicate parser's functionality
/// (at least partially) and performing lexing twice.
/// Better solution requires us to change lexer to deal
/// with Lines-like iterator instead of String buf (or capability to combine a few
/// lexer instances into a single lexer).
///
/// TODO: $INCLUDE could specify domain name -- to support on-flight swap for Origin
/// value we definitely need to rethink and rework loader/parser/lexer
fn read_file(
zone_path: PathBuf,
buf: &mut String,
state: FileReaderState,
) -> Result<(), String> {
let file = File::open(&zone_path)
.map_err(|e| format!("failed to read {}: {:?}", zone_path.display(), e))?;
let reader = BufReader::new(file);
for line in reader.lines() {
let content = line.map_err(|err| format!("failed to read line: {err:?}"))?;
let mut lexer = Lexer::new(&content);
match (lexer.next_token(), lexer.next_token(), lexer.next_token()) {
(
Ok(Some(Token::Include)),
Ok(Some(Token::CharData(include_path))),
Ok(Some(Token::CharData(_domain))),
) => {
return Err(format!(
"Domain name for $INCLUDE is not supported at {}, trying to include {}",
zone_path.display(),
include_path
));
}
(Ok(Some(Token::Include)), Ok(Some(Token::CharData(include_path))), _) => {
// RFC1035 (section 5) does not specify how filename for $INCLUDE
// should be resolved into file path. The underlying code implements the
// following:
// * if the path is absolute (relies on Path::is_absolute), it uses normalized path
// * otherwise, it joins the path with parent root of the current file
//
// TODO: Inlining files specified using non-relative path might potentially introduce
// security issue in some cases (e.g. when working with zone files from untrusted sources)
// and should probably be configurable by user.
let include_path = Path::new(&include_path);
let include_zone_path = if include_path.is_absolute() {
include_path.to_path_buf()
} else {
let parent_dir =
zone_path.parent().expect("file has to have parent folder");
parent_dir.join(include_path)
};
if state.level >= MAX_INCLUDE_LEVEL {
return Err(format!("Max depth level for nested $INCLUDE is reached at {}, trying to include {}", zone_path.display(), include_zone_path.display()));
}
let mut include_buf = String::new();
info!(
"including file {} into {}",
include_zone_path.display(),
zone_path.display()
);
Self::read_file(include_zone_path, &mut include_buf, state.next_level())?;
buf.push_str(&include_buf);
}
_ => {
buf.push_str(&content);
}
}
buf.push('\n');
}
Ok(())
}
/// Read the Authority for the origin from the specified configuration
pub fn try_from_config(
origin: Name,
@ -183,15 +73,13 @@ impl FileAuthority {
info!("loading zone file: {:?}", zone_path);
let mut buf = String::new();
// TODO: this should really use something to read line by line or some other method to
// keep the usage down. and be a custom lexer...
Self::read_file(zone_path, &mut buf, FileReaderState::new())
let buf = fs::read_to_string(&zone_path)
.map_err(|e| format!("failed to read {}: {:?}", &config.zone_file_path, e))?;
let lexer = Lexer::new(&buf);
let (origin, records) = Parser::new(lexer, Some(origin))
let lexer = Lexer::new(buf);
let (origin, records) = Parser::new(lexer, Some(zone_path), Some(origin))
.parse()
.map_err(|e| format!("failed to parse {}: {:?}", config.zone_file_path, e))?;

View File

@ -44,8 +44,9 @@ impl RecursiveConfig {
let mut roots_str = String::new();
roots.read_to_string(&mut roots_str)?;
let lexer = Lexer::new(&roots_str);
let (_zone, roots_zone) = Parser::new(lexer, Some(Name::root())).parse()?;
let lexer = Lexer::new(roots_str);
let (_zone, roots_zone) =
Parser::new(lexer, Some(path.into_owned()), Some(Name::root())).parse()?;
// TODO: we may want to deny some of the root nameservers, for reasons...
Ok(roots_zone

View File

@ -57,7 +57,7 @@ tech. 3600 in soa ns0.centralnic.net. hostmaster.centralnic.ne
"#,
);
let records = Parser::new(lexer, Some(Name::from_str("isi.edu").unwrap())).parse();
let records = Parser::new(lexer, None, Some(Name::from_str("isi.edu").unwrap())).parse();
if records.is_err() {
panic!("failed to parse: {:?}", records.err())
}
@ -420,7 +420,7 @@ a A 127.0.0.1
",
);
let records = Parser::new(lexer, Some(Name::from_str("isi.edu").unwrap())).parse();
let records = Parser::new(lexer, None, Some(Name::from_str("isi.edu").unwrap())).parse();
if records.is_err() {
panic!("failed to parse: {:?}", records.err())
@ -448,7 +448,7 @@ b A 127.0.0.2
",
);
let records = Parser::new(lexer, Some(Name::from_str("isi.edu").unwrap())).parse();
let records = Parser::new(lexer, None, Some(Name::from_str("isi.edu").unwrap())).parse();
if records.is_err() {
panic!("failed to parse: {:?}", records.err())
@ -475,7 +475,7 @@ a A 127.0.0.1
",
);
let records = Parser::new(lexer, Some(Name::from_str("isi.edu").unwrap())).parse();
let records = Parser::new(lexer, None, Some(Name::from_str("isi.edu").unwrap())).parse();
if records.is_err() {
panic!("failed to parse: {:?}", records.err())
@ -494,7 +494,7 @@ fn test_named_root() {
"###,
);
let records = Parser::new(lexer, Some(Name::root())).parse();
let records = Parser::new(lexer, None, Some(Name::root())).parse();
if records.is_err() {
panic!("failed to parse: {:?}", records.err())