diff options
Diffstat (limited to 'src/tokeniser.rs')
-rw-r--r-- | src/tokeniser.rs | 167 |
1 file changed, 114 insertions, 53 deletions
diff --git a/src/tokeniser.rs b/src/tokeniser.rs index 6ae9055..7ab44f0 100644 --- a/src/tokeniser.rs +++ b/src/tokeniser.rs @@ -5,7 +5,7 @@ use std::path::PathBuf; /// Break a character stream down into individual tokens. pub struct Tokeniser { - /// Characters waiting to be parsed, in reverse order. + /// Characters waiting to be parsed. pub chars: Vec<char>, /// Path of the whole source file. pub source_path: Option<PathBuf>, @@ -13,32 +13,53 @@ pub struct Tokeniser { pub embedded_path: Option<PathBuf>, /// Line where the embedded source file begins. pub embedded_first_line: usize, - /// Position of the next character to be consumed. - pub position: SourcePosition, - /// Position of the most recently consumed character. - pub prev_position: SourcePosition, + /// Mark tracking the next character to parse. + pub mark: TokeniserMark, + /// Mark tracking the most recent character of the current token. + pub prev: TokeniserMark, /// Position of the first character of the current token. - pub start_position: SourcePosition, - /// The source characters consumed for the current token. - pub consumed: String, + pub start: TokeniserMark, + /// Position after the final character of the current token. + pub end: TokeniserMark, + /// Position to begin subtokenisation from. + pub child: TokeniserMark, /// List of characters that start a new token. pub delimiters: Vec<char>, /// List of characters that terminate a token. 
pub terminators: Vec<char>, } - impl Tokeniser { pub fn new<P: Into<PathBuf>>(source_code: &str, path: Option<P>) -> Self { Self { - chars: source_code.chars().rev().collect(), + chars: source_code.chars().collect(), source_path: path.map(|p| p.into()), embedded_path: None, embedded_first_line: 0, - position: SourcePosition::ZERO, - prev_position: SourcePosition::ZERO, - start_position: SourcePosition::ZERO, - consumed: String::new(), + mark: TokeniserMark::ZERO, + prev: TokeniserMark::ZERO, + start: TokeniserMark::ZERO, + end: TokeniserMark::ZERO, + child: TokeniserMark::ZERO, + delimiters: Vec::new(), + terminators: Vec::new(), + } + } + + /// Create a tokeniser from child to end. + pub fn subtokenise(&mut self) -> Self { + let mut start = self.child; + start.i = 0; + Self { + chars: self.get_chars(&self.child, &self.end), + source_path: self.source_path.clone(), + embedded_path: self.embedded_path.clone(), + embedded_first_line: self.embedded_first_line.clone(), + mark: start, + prev: start, + start: start, + end: start, + child: start, delimiters: Vec::new(), terminators: Vec::new(), } @@ -52,37 +73,33 @@ impl Tokeniser { self.terminators.extend_from_slice(terminators); } + pub fn get_chars(&self, start: &TokeniserMark, end: &TokeniserMark) -> Vec<char> { + self.chars[start.i..end.i].iter().map(char::to_owned).collect() + } + /// Return the next character without consuming it. pub fn peek_char(&self) -> Option<char> { - self.chars.last().copied() + self.chars.get(self.mark.i).copied() } /// Consume and return the next character. pub fn eat_char(&mut self) -> Option<char> { - let option = self.chars.pop(); + let option = self.peek_char(); if let Some(c) = option { - self.prev_position = self.position; - self.position.advance(c); - self.consumed.push(c); + self.prev = self.mark; + self.mark.advance(c); + self.mark_end(); } return option; } - /// Remove the next character. 
- pub fn drop_char(&mut self) { - if let Some(c) = self.chars.pop() { - self.prev_position = self.position; - self.position.advance(c); - } - } - - /// Remove whitespace. - pub fn drop_whitespace(&mut self) { + /// Consume whitespace. + pub fn eat_whitespace(&mut self) { while let Some(c) = self.peek_char() { match c.is_whitespace() { - true => self.drop_char(), + true => self.eat_char(), false => break, - } + }; } } @@ -107,9 +124,11 @@ impl Tokeniser { pub fn eat_to_delimiter(&mut self, delim: char) -> Option<String> { let mut token = String::new(); while let Some(c) = self.eat_char() { - self.consumed.push(c); match c == delim { - true => return Some(token), + true => { + self.end = self.prev; + return Some(token); + } false => token.push(c), } } @@ -130,30 +149,38 @@ impl Tokeniser { } /// Mark the next character to be consumed as the start character. - pub fn mark_start_position(&mut self) { - self.start_position = self.position; - self.consumed.clear(); - } - - /// Mark the previously-consumed character as the start character. - pub fn mark_prev_start_position(&mut self) { - self.start_position = self.prev_position; - let c = self.consumed.chars().last(); - self.consumed.clear(); - // Keep the previously pushed character. - if let Some(c) = c { - self.consumed.push(c); - } + pub fn mark_start(&mut self) { + self.start = self.mark; + } + + /// Mark the most recently consumed character as the start character. + pub fn mark_start_prev(&mut self) { + self.start = self.prev; + } + + /// Mark the next character as the character following the end character. + pub fn mark_end(&mut self) { + self.end = self.mark; + } + + /// Mark the most recently consumed character as the character following the end character. + pub fn mark_end_prev(&mut self) { + self.end = self.prev; + } - /// Only call this once per span, it consumes the token string. - pub fn get_source_span(&mut self) -> SourceSpan { + /// Mark the next character to be consumed as the start of the child. 
+ pub fn mark_child(&mut self) { + self.child = self.mark; + } + + /// Return the SourceSpan between the start and end marks. + pub fn get_source(&mut self) -> SourceSpan { let in_merged = SourceLocation { path: self.source_path.to_owned(), - start: self.start_position, - end: self.prev_position, + start: self.start.position, + end: self.end.prev_position, }; - let in_source = if self.start_position.line >= self.embedded_first_line { + let in_source = if self.start.position.line >= self.embedded_first_line { if let Some(embedded_path) = &self.embedded_path { let offset = self.embedded_first_line; Some( @@ -176,7 +203,41 @@ impl Tokeniser { None }; - let string = std::mem::take(&mut self.consumed); + let string = self.get_chars(&self.start, &self.end).iter().collect(); SourceSpan { string, in_merged, in_source } } } + + +#[derive(Clone, Copy)] +pub struct TokeniserMark { + /// Position of the next character to be consumed. + pub position: SourcePosition, + /// Index of the next character to be consumed. + pub i: usize, + /// Position of the most recently consumed character. + pub prev_position: SourcePosition, + pub prev_prev_position: SourcePosition, +} + +impl TokeniserMark { + pub const ZERO: Self = Self { + position: SourcePosition::ZERO, + i: 0, + prev_position: SourcePosition::ZERO, + prev_prev_position: SourcePosition::ZERO, + }; + + /// Advance to the next character. + pub fn advance(&mut self, c: char) { + self.prev_prev_position = self.prev_position; + self.prev_position = self.position; + self.position.advance(c); + self.i += 1; + } + + /// Ignore the most recently consumed character. + pub fn undo(&mut self) { + self.prev_position = self.prev_prev_position; + } +} |