From 8d11be64f6c1747e7c4049105a6dd4ea9ab0d27f Mon Sep 17 00:00:00 2001
From: Ben Bridle <ben@derelict.engineering>
Date: Tue, 4 Feb 2025 08:44:26 +1300
Subject: Implement a generic source code tokeniser

This is a struct that provides various methods for consuming characters
from a character stream and for tracking the provenance of each parsed
token.
---
 src/lib.rs             |   4 +-
 src/locators/source.rs |   2 +
 src/tokeniser.rs       | 167 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 172 insertions(+), 1 deletion(-)
 create mode 100644 src/tokeniser.rs

diff --git a/src/lib.rs b/src/lib.rs
index fba7e5d..2ebe010 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,5 @@
 mod locators;
-
 pub use locators::*;
+
+mod tokeniser;
+pub use tokeniser::*;
diff --git a/src/locators/source.rs b/src/locators/source.rs
index 9fd1b2b..2cf1ef9 100644
--- a/src/locators/source.rs
+++ b/src/locators/source.rs
@@ -49,6 +49,8 @@ pub struct Position {
 }
 
 impl Position {
+    pub const ZERO: Self = Self { line: 0, column: 0 };
+
     pub fn to_next_char(&mut self) {
         self.column += 1;
     }
diff --git a/src/tokeniser.rs b/src/tokeniser.rs
new file mode 100644
index 0000000..eeab6e6
--- /dev/null
+++ b/src/tokeniser.rs
@@ -0,0 +1,167 @@
+use crate::*;
+
+use std::path::PathBuf;
+
+
+pub struct Tokeniser {
+    /// Characters waiting to be parsed, reversed so the next one is last.
+    pub chars: Vec<char>,
+    /// Path of the whole source file, if one was passed to `new`.
+    pub source_path: Option<PathBuf>,
+    /// Original path of the embedded source file, if any.
+    pub embedded_path: Option<PathBuf>,
+    /// Line of the whole source file where the embedded source file begins.
+    pub embedded_first_line: usize,
+    /// Position of the next character to be consumed.
+    pub position: Position,
+    /// Position of the most recently consumed or dropped character.
+    pub prev_position: Position,
+    /// Position of the first character of the current token.
+    pub start_position: Position,
+    /// Characters eaten for the current token; emptied by `get_source_span`.
+    pub consumed: String,
+    /// Characters that start a new token; `eat_token` stops before them.
+    pub delimiters: Vec<char>,
+    /// Characters that terminate a token; `eat_token` stops after them.
+    pub terminators: Vec<char>,
+}
+
+
+impl Tokeniser {
+    /// Create a tokeniser over `source_code`, optionally recording its path.
+    pub fn new<P: Into<PathBuf>>(source_code: &str, path: Option<P>) -> Self {
+        Self {
+            // Reversed so that the next character can be popped off the end.
+            chars: source_code.chars().rev().collect(),
+            source_path: path.map(Into::into),
+            embedded_path: None,
+            embedded_first_line: 0,
+            position: Position::ZERO,
+            prev_position: Position::ZERO,
+            start_position: Position::ZERO,
+            consumed: String::new(),
+            delimiters: Vec::new(),
+            terminators: Vec::new(),
+        }
+    }
+
+    /// Register additional characters that start a new token.
+    pub fn add_delimiters(&mut self, delimiters: &[char]) {
+        self.delimiters.extend_from_slice(delimiters);
+    }
+
+    /// Register additional characters that terminate a token.
+    pub fn add_terminators(&mut self, terminators: &[char]) {
+        self.terminators.extend_from_slice(terminators);
+    }
+
+    /// Return the next character without consuming it.
+    pub fn peek_char(&self) -> Option<char> {
+        self.chars.last().copied()
+    }
+
+    /// Consume and return the next character, recording it in `consumed`.
+    pub fn eat_char(&mut self) -> Option<char> {
+        let c = self.chars.pop()?;
+        self.prev_position = self.position;
+        self.position.advance(c);
+        self.consumed.push(c);
+        Some(c)
+    }
+
+    /// Remove the next character without recording it in `consumed`.
+    pub fn drop_char(&mut self) {
+        if let Some(c) = self.chars.pop() {
+            self.prev_position = self.position;
+            self.position.advance(c);
+        }
+    }
+
+    /// Remove leading whitespace without recording it in `consumed`.
+    pub fn drop_whitespace(&mut self) {
+        while self.peek_char().map_or(false, char::is_whitespace) {
+            self.drop_char();
+        }
+    }
+
+    /// Consume and return the next token.
+    ///
+    /// Stops before whitespace or a delimiter, or just after a terminator.
+    pub fn eat_token(&mut self) -> String {
+        let mut token = String::new();
+        while let Some(peek) = self.peek_char() {
+            if peek.is_whitespace() || self.delimiters.contains(&peek) {
+                break;
+            }
+            // `peek` is exactly the character that `eat_char` consumes here.
+            self.eat_char();
+            token.push(peek);
+            if self.terminators.contains(&peek) {
+                break;
+            }
+        }
+        token
+    }
+
+    /// Consume all characters up to and including the delimiter, returning
+    /// the characters that precede it (the delimiter itself is not returned).
+    /// Returns None if end of source is reached before delimiter is found.
+    pub fn eat_to_delimiter(&mut self, delim: char) -> Option<String> {
+        let mut token = String::new();
+        // `eat_char` already records each character in `consumed`, so it must
+        // not be pushed there a second time.
+        while let Some(c) = self.eat_char() {
+            if c == delim {
+                return Some(token);
+            }
+            token.push(c);
+        }
+        None
+    }
+
+    /// Returns true if the remainder of the line is whitespace.
+    pub fn end_of_line(&self) -> bool {
+        // Only the characters before the next newline are considered.
+        self.chars.iter().rev()
+            .take_while(|c| **c != '\n')
+            .all(|c| c.is_whitespace())
+    }
+
+    /// Mark the next character to be consumed as the start of the token.
+    pub fn mark_start_position(&mut self) {
+        self.start_position = self.position;
+    }
+
+    /// Return the span of the current token and clear the `consumed` buffer.
+    ///
+    /// The span runs from `start_position` to `prev_position`. If the token
+    /// starts on or after `embedded_first_line` and an embedded path is set,
+    /// the location relative to the embedded file is also reported.
+    pub fn get_source_span(&mut self) -> SourceSpan {
+        let in_merged = SourceLocation {
+            path: self.source_path.to_owned(),
+            start: self.start_position,
+            end: self.prev_position,
+        };
+        let offset = self.embedded_first_line;
+        let in_source = match &self.embedded_path {
+            Some(embedded_path) if self.start_position.line >= offset => {
+                Some(SourceLocation {
+                    path: Some(embedded_path.to_owned()),
+                    start: Position {
+                        line: in_merged.start.line.saturating_sub(offset),
+                        column: in_merged.start.column,
+                    },
+                    end: Position {
+                        line: in_merged.end.line.saturating_sub(offset),
+                        column: in_merged.end.column,
+                    },
+                })
+            }
+            _ => None,
+        };
+
+        let string = std::mem::take(&mut self.consumed);
+        SourceSpan { string, in_merged, in_source }
+    }
+}
-- 
cgit v1.2.3-70-g09d2