author | Ben Bridle <ben@derelict.engineering> | 2025-02-04 08:44:26 +1300
---|---|---
committer | Ben Bridle <ben@derelict.engineering> | 2025-02-04 08:44:26 +1300
commit | 8d11be64f6c1747e7c4049105a6dd4ea9ab0d27f (patch) |
tree | c83eda898f7512d50a6a806c6de0f99b0619274e |
parent | 8ccb5b96f51bcf8e1a1c3e81d1305ff08825f9b1 (diff) |
download | assembler-8d11be64f6c1747e7c4049105a6dd4ea9ab0d27f.zip |
Implement a generic source code tokeniser
This is a struct that provides various methods for consuming characters
from a character stream and for tracking the provenance of each parsed
token.
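
As a rough illustration of how this tokeniser is meant to be driven, the sketch below exercises the API added in this commit. The crate name `assembler`, the sample source string, and the delimiter/terminator choices are assumptions made for the example; `Position`, `SourceLocation`, and `SourceSpan` come from the existing `locators` module, and their fields are assumed to be publicly readable.

```rust
// Hypothetical driver for the new Tokeniser; names flagged as assumptions
// in the surrounding text are not part of this commit.
use assembler::Tokeniser;

fn main() {
    let source = "start: add r1 r2\nhalt";
    // No file path is associated with this anonymous source string.
    let mut tok = Tokeniser::new(source, None::<&str>);
    tok.add_delimiters(&['"']);   // characters that always begin a new token
    tok.add_terminators(&[':']);  // characters that end the current token

    loop {
        tok.drop_whitespace();
        tok.mark_start_position();
        let token = tok.eat_token();
        if token.is_empty() {
            break;
        }
        // Each token is paired with a SourceSpan recording its provenance;
        // `line` and `column` are assumed to be public fields of Position.
        let span = tok.get_source_span();
        println!("{:?} at line {}, column {}",
            token, span.in_merged.start.line, span.in_merged.start.column);
    }
}
```

Because `eat_token` breaks before delimiters and after terminators, `"start:"` comes back as a single token (the colon terminator is kept), while the remaining words are split on whitespace.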
-rw-r--r-- | src/lib.rs | 4
-rw-r--r-- | src/locators/source.rs | 2
-rw-r--r-- | src/tokeniser.rs | 167
3 files changed, 172 insertions, 1 deletion
diff --git a/src/lib.rs b/src/lib.rs
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,5 @@
 mod locators;
-
 pub use locators::*;
+
+mod tokeniser;
+pub use tokeniser::*;
diff --git a/src/locators/source.rs b/src/locators/source.rs
index 9fd1b2b..2cf1ef9 100644
--- a/src/locators/source.rs
+++ b/src/locators/source.rs
@@ -49,6 +49,8 @@ pub struct Position {
 }
 
 impl Position {
+    pub const ZERO: Self = Self { line: 0, column: 0 };
+
     pub fn to_next_char(&mut self) {
         self.column += 1;
     }
diff --git a/src/tokeniser.rs b/src/tokeniser.rs
new file mode 100644
index 0000000..eeab6e6
--- /dev/null
+++ b/src/tokeniser.rs
@@ -0,0 +1,167 @@
+use crate::*;
+
+use std::path::PathBuf;
+
+
+pub struct Tokeniser {
+    /// Characters waiting to be parsed, in reverse order.
+    pub chars: Vec<char>,
+    /// Path of the whole source file.
+    pub source_path: Option<PathBuf>,
+    /// Original path of the embedded source file.
+    pub embedded_path: Option<PathBuf>,
+    /// Line where the embedded source file begins.
+    pub embedded_first_line: usize,
+    /// Position of the next character to be consumed.
+    pub position: Position,
+    /// Position of the most recently consumed character.
+    pub prev_position: Position,
+    /// Position of the first character of the current token.
+    pub start_position: Position,
+    /// The source characters consumed for the current token.
+    pub consumed: String,
+    /// List of characters that start a new token.
+    pub delimiters: Vec<char>,
+    /// List of characters that terminate a token.
+    pub terminators: Vec<char>,
+}
+
+
+impl Tokeniser {
+    pub fn new<P: Into<PathBuf>>(source_code: &str, path: Option<P>) -> Self {
+        Self {
+            chars: source_code.chars().rev().collect(),
+            source_path: path.map(|p| p.into()),
+            embedded_path: None,
+            embedded_first_line: 0,
+            position: Position::ZERO,
+            prev_position: Position::ZERO,
+            start_position: Position::ZERO,
+            consumed: String::new(),
+            delimiters: Vec::new(),
+            terminators: Vec::new(),
+        }
+    }
+
+    pub fn add_delimiters(&mut self, delimiters: &[char]) {
+        self.delimiters.extend_from_slice(delimiters);
+    }
+
+    pub fn add_terminators(&mut self, terminators: &[char]) {
+        self.terminators.extend_from_slice(terminators);
+    }
+
+    /// Return the next character without consuming it.
+    pub fn peek_char(&self) -> Option<char> {
+        self.chars.last().copied()
+    }
+
+    /// Consume and return the next character.
+    pub fn eat_char(&mut self) -> Option<char> {
+        let option = self.chars.pop();
+        if let Some(c) = option {
+            self.prev_position = self.position;
+            self.position.advance(c);
+            self.consumed.push(c);
+        }
+        return option;
+    }
+
+    /// Remove the next character.
+    pub fn drop_char(&mut self) {
+        if let Some(c) = self.chars.pop() {
+            self.prev_position = self.position;
+            self.position.advance(c);
+        }
+    }
+
+    /// Remove whitespace.
+    pub fn drop_whitespace(&mut self) {
+        while let Some(c) = self.peek_char() {
+            match c.is_whitespace() {
+                true => self.drop_char(),
+                false => break,
+            }
+        }
+    }
+
+    /// Remove a full token from the queue.
+    pub fn eat_token(&mut self) -> String {
+        let mut token = String::new();
+        while let Some(peek) = self.peek_char() {
+            if peek.is_whitespace() || self.delimiters.contains(&peek) {
+                break;
+            }
+            let c = self.eat_char().unwrap();
+            token.push(c);
+            if self.terminators.contains(&c) {
+                break;
+            }
+        }
+        token
+    }
+
+    /// Consume and return all characters up to and including the delimiter.
+    /// Returns None if end of source is reached before delimiter is found.
+    pub fn eat_to_delimiter(&mut self, delim: char) -> Option<String> {
+        let mut token = String::new();
+        while let Some(c) = self.eat_char() {
+            self.consumed.push(c);
+            match c == delim {
+                true => return Some(token),
+                false => token.push(c),
+            }
+        }
+        return None;
+    }
+
+    /// Returns true if the remainder of the line is whitespace.
+    pub fn end_of_line(&self) -> bool {
+        for c in self.chars.iter().rev() {
+            if *c == '\n' {
+                return true;
+            }
+            if !c.is_whitespace() {
+                return false
+            }
+        }
+        return true;
+    }
+
+    pub fn mark_start_position(&mut self) {
+        self.start_position = self.position;
+    }
+
+    pub fn get_source_span(&mut self) -> SourceSpan {
+        let in_merged = SourceLocation {
+            path: self.source_path.to_owned(),
+            start: self.start_position,
+            end: self.prev_position,
+        };
+        let in_source = if self.start_position.line >= self.embedded_first_line {
+            if let Some(embedded_path) = &self.embedded_path {
+                let offset = self.embedded_first_line;
+                Some(
+                    SourceLocation {
+                        path: Some(embedded_path.to_owned()),
+                        start: Position {
+                            line: in_merged.start.line.saturating_sub(offset),
+                            column: in_merged.start.column,
+                        },
+                        end: Position {
+                            line: in_merged.end.line.saturating_sub(offset),
+                            column: in_merged.end.column,
+                        }
+                    }
+                )
+            } else {
+                None
+            }
+        } else {
+            None
+        };
+
+        let string = std::mem::take(&mut self.consumed);
+        SourceSpan { string, in_merged, in_source }
+    }
+}
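
A worked example of the provenance mapping performed by `get_source_span`: positions are tracked against the merged file, and when a token lies inside an embedded file its coordinates are recovered by subtracting `embedded_first_line` from the merged line number. The sketch below repeats that arithmetic with a stand-in `Position` type; it illustrates the calculation and is not code from this commit.

```rust
// Stand-in Position type; the crate's own Position is used in the real code.
#[derive(Clone, Copy, Debug, PartialEq)]
struct Position {
    line: usize,
    column: usize,
}

/// Map a position in the merged source file back into the embedded file,
/// given the merged line on which the embedded file begins.
fn to_embedded(merged: Position, embedded_first_line: usize) -> Position {
    Position {
        line: merged.line.saturating_sub(embedded_first_line),
        column: merged.column,
    }
}

fn main() {
    // A token starting on line 12 of the merged file, where the embedded
    // file begins at line 10, sits on line 2 of the embedded file.
    let merged = Position { line: 12, column: 4 };
    assert_eq!(to_embedded(merged, 10), Position { line: 2, column: 4 });
}
```

The `saturating_sub` mirrors the commit's own choice, so a position before the embedded region clamps to line zero instead of underflowing.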