From 8d11be64f6c1747e7c4049105a6dd4ea9ab0d27f Mon Sep 17 00:00:00 2001
From: Ben Bridle
Date: Tue, 4 Feb 2025 08:44:26 +1300
Subject: Implement a generic source code tokeniser

This is a struct that provides various methods for consuming characters
from a character stream and for tracking the provenance of each parsed
token.
---
 src/tokeniser.rs | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100644 src/tokeniser.rs

diff --git a/src/tokeniser.rs b/src/tokeniser.rs
new file mode 100644
index 0000000..eeab6e6
--- /dev/null
+++ b/src/tokeniser.rs
@@ -0,0 +1,167 @@
+use crate::*;
+
+use std::path::PathBuf;
+
+
+pub struct Tokeniser {
+    /// Characters waiting to be parsed, in reverse order.
+    pub chars: Vec<char>,
+    /// Path of the whole source file.
+    pub source_path: Option<PathBuf>,
+    /// Original path of the embedded source file.
+    pub embedded_path: Option<PathBuf>,
+    /// Line where the embedded source file begins.
+    pub embedded_first_line: usize,
+    /// Position of the next character to be consumed.
+    pub position: Position,
+    /// Position of the most recently consumed character.
+    pub prev_position: Position,
+    /// Position of the first character of the current token.
+    pub start_position: Position,
+    /// The source characters consumed for the current token.
+    pub consumed: String,
+    /// List of characters that start a new token.
+    pub delimiters: Vec<char>,
+    /// List of characters that terminate a token.
+    pub terminators: Vec<char>,
+}
+
+
+impl Tokeniser {
+    pub fn new<P: Into<PathBuf>>(source_code: &str, path: Option<P>) -> Self {
+        Self {
+            chars: source_code.chars().rev().collect(),
+            source_path: path.map(|p| p.into()),
+            embedded_path: None,
+            embedded_first_line: 0,
+            position: Position::ZERO,
+            prev_position: Position::ZERO,
+            start_position: Position::ZERO,
+            consumed: String::new(),
+            delimiters: Vec::new(),
+            terminators: Vec::new(),
+        }
+    }
+
+    pub fn add_delimiters(&mut self, delimiters: &[char]) {
+        self.delimiters.extend_from_slice(delimiters);
+    }
+
+    pub fn add_terminators(&mut self, terminators: &[char]) {
+        self.terminators.extend_from_slice(terminators);
+    }
+
+    /// Return the next character without consuming it.
+    pub fn peek_char(&self) -> Option<char> {
+        self.chars.last().copied()
+    }
+
+    /// Consume and return the next character.
+    pub fn eat_char(&mut self) -> Option<char> {
+        let option = self.chars.pop();
+        if let Some(c) = option {
+            self.prev_position = self.position;
+            self.position.advance(c);
+            self.consumed.push(c);
+        }
+        return option;
+    }
+
+    /// Remove the next character.
+    pub fn drop_char(&mut self) {
+        if let Some(c) = self.chars.pop() {
+            self.prev_position = self.position;
+            self.position.advance(c);
+        }
+    }
+
+    /// Remove whitespace.
+    pub fn drop_whitespace(&mut self) {
+        while let Some(c) = self.peek_char() {
+            match c.is_whitespace() {
+                true => self.drop_char(),
+                false => break,
+            }
+        }
+    }
+
+    /// Remove a full token from the queue.
+    pub fn eat_token(&mut self) -> String {
+        let mut token = String::new();
+        while let Some(peek) = self.peek_char() {
+            if peek.is_whitespace() || self.delimiters.contains(&peek) {
+                break;
+            }
+            let c = self.eat_char().unwrap();
+            token.push(c);
+            if self.terminators.contains(&c) {
+                break;
+            }
+        }
+        token
+    }
+
+    /// Consume and return all characters up to and including the delimiter.
+    /// Returns None if end of source is reached before delimiter is found.
+    pub fn eat_to_delimiter(&mut self, delim: char) -> Option<String> {
+        let mut token = String::new();
+        while let Some(c) = self.eat_char() {
+            self.consumed.push(c);
+            match c == delim {
+                true => return Some(token),
+                false => token.push(c),
+            }
+        }
+        return None;
+    }
+
+    /// Returns true if the remainder of the line is whitespace.
+    pub fn end_of_line(&self) -> bool {
+        for c in self.chars.iter().rev() {
+            if *c == '\n' {
+                return true;
+            }
+            if !c.is_whitespace() {
+                return false
+            }
+        }
+        return true;
+    }
+
+    pub fn mark_start_position(&mut self) {
+        self.start_position = self.position;
+    }
+
+    pub fn get_source_span(&mut self) -> SourceSpan {
+        let in_merged = SourceLocation {
+            path: self.source_path.to_owned(),
+            start: self.start_position,
+            end: self.prev_position,
+        };
+        let in_source = if self.start_position.line >= self.embedded_first_line {
+            if let Some(embedded_path) = &self.embedded_path {
+                let offset = self.embedded_first_line;
+                Some(
+                    SourceLocation {
+                        path: Some(embedded_path.to_owned()),
+                        start: Position {
+                            line: in_merged.start.line.saturating_sub(offset),
+                            column: in_merged.start.column,
+                        },
+                        end: Position {
+                            line: in_merged.end.line.saturating_sub(offset),
+                            column: in_merged.end.column,
+                        }
+                    }
+                )
+            } else {
+                None
+            }
+        } else {
+            None
+        };
+
+        let string = std::mem::take(&mut self.consumed);
+        SourceSpan { string, in_merged, in_source }
+    }
+}
--
cgit v1.2.3-70-g09d2
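A minimal usage sketch, not part of the commit above: the tokenise function,
the file name, and the delimiter and terminator sets below are hypothetical,
and the sketch assumes that Position, SourceLocation, and SourceSpan from
crate::* behave as they are used in the patch.

// Hypothetical driver for the Tokeniser, assumed to live in the same crate
// so that Tokeniser, SourceSpan, etc. are in scope via `use crate::*`.
fn tokenise(source_code: &str) -> Vec<(String, SourceSpan)> {
    let mut t = Tokeniser::new(source_code, Some("input.txt"));
    // Example character sets; a real parser would choose its own.
    t.add_delimiters(&['(', ')', '"']);
    t.add_terminators(&[';']);

    let mut tokens = Vec::new();
    loop {
        t.drop_whitespace();       // discard whitespace between tokens
        t.mark_start_position();   // the next character starts a new token
        let token = match t.peek_char() {
            // End of source: nothing left to tokenise.
            None => break,
            // A delimiter forms a one-character token of its own, because
            // eat_token() stops in front of delimiter characters.
            Some(c) if t.delimiters.contains(&c) => t.eat_char().unwrap().to_string(),
            // Anything else is consumed up to whitespace, a delimiter,
            // or a terminator character.
            Some(_) => t.eat_token(),
        };
        // get_source_span() records where the consumed characters came from,
        // both in the merged file and in any embedded source file.
        tokens.push((token, t.get_source_span()));
    }
    tokens
}

A parser with richer syntax could branch on the delimiter it sees: for
example, on an opening '"' it might call eat_to_delimiter('"') to consume
the rest of a string literal up to and including the closing quote.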