From 83aa972318fe1a9ada93c25f48fd5697ad996b32 Mon Sep 17 00:00:00 2001
From: Ben Bridle <ben@derelict.engineering>
Date: Thu, 20 Feb 2025 18:05:10 +1300
Subject: Rewrite tokeniser

This commit adds a subtokenise method that creates a child tokeniser
over all characters between the child and end marks.
---
 src/tokeniser.rs | 167 +++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 114 insertions(+), 53 deletions(-)

diff --git a/src/tokeniser.rs b/src/tokeniser.rs
index 6ae9055..7ab44f0 100644
--- a/src/tokeniser.rs
+++ b/src/tokeniser.rs
@@ -5,7 +5,7 @@ use std::path::PathBuf;
 
 /// Break a character stream down into individual tokens.
 pub struct Tokeniser {
-    /// Characters waiting to be parsed, in reverse order.
+    /// Characters waiting to be parsed.
     pub chars: Vec<char>,
     /// Path of the whole source file.
     pub source_path: Option<PathBuf>,
@@ -13,32 +13,53 @@ pub struct Tokeniser {
     pub embedded_path: Option<PathBuf>,
     /// Line where the embedded source file begins.
     pub embedded_first_line: usize,
-    /// Position of the next character to be consumed.
-    pub position: SourcePosition,
-    /// Position of the most recently consumed character.
-    pub prev_position: SourcePosition,
+    /// Mark tracking the next character to parse.
+    pub mark: TokeniserMark,
+    /// Mark tracking the most recent character of the current token.
+    pub prev: TokeniserMark,
     /// Position of the first character of the current token.
-    pub start_position: SourcePosition,
-    /// The source characters consumed for the current token.
-    pub consumed: String,
+    pub start: TokeniserMark,
+    /// Position after the final character of the current token.
+    pub end: TokeniserMark,
+    /// Position to begin subtokenisation from.
+    pub child: TokeniserMark,
     /// List of characters that start a new token.
     pub delimiters: Vec<char>,
     /// List of characters that terminate a token.
     pub terminators: Vec<char>,
 }
 
-
 impl Tokeniser {
     pub fn new<P: Into<PathBuf>>(source_code: &str, path: Option<P>) -> Self {
         Self {
-            chars: source_code.chars().rev().collect(),
+            chars: source_code.chars().collect(),
             source_path: path.map(|p| p.into()),
             embedded_path: None,
             embedded_first_line: 0,
-            position: SourcePosition::ZERO,
-            prev_position: SourcePosition::ZERO,
-            start_position: SourcePosition::ZERO,
-            consumed: String::new(),
+            mark: TokeniserMark::ZERO,
+            prev: TokeniserMark::ZERO,
+            start: TokeniserMark::ZERO,
+            end: TokeniserMark::ZERO,
+            child: TokeniserMark::ZERO,
+            delimiters: Vec::new(),
+            terminators: Vec::new(),
+        }
+    }
+
+    /// Create a child tokeniser over the characters from the child mark to the end mark.
+    pub fn subtokenise(&mut self) -> Self {
+        let mut start = self.child;
+        start.i = 0;
+        Self {
+            chars: self.get_chars(&self.child, &self.end),
+            source_path: self.source_path.clone(),
+            embedded_path: self.embedded_path.clone(),
+            embedded_first_line: self.embedded_first_line.clone(),
+            mark: start,
+            prev: start,
+            start: start,
+            end: start,
+            child: start,
             delimiters: Vec::new(),
             terminators: Vec::new(),
         }
@@ -52,37 +73,33 @@ impl Tokeniser {
         self.terminators.extend_from_slice(terminators);
     }
 
+    pub fn get_chars(&self, start: &TokeniserMark, end: &TokeniserMark) -> Vec<char> {
+        self.chars[start.i..end.i].iter().map(char::to_owned).collect()
+    }
+
     /// Return the next character without consuming it.
     pub fn peek_char(&self) -> Option<char> {
-        self.chars.last().copied()
+        self.chars.get(self.mark.i).copied()
     }
 
     /// Consume and return the next character.
     pub fn eat_char(&mut self) -> Option<char> {
-        let option = self.chars.pop();
+        let option = self.peek_char();
         if let Some(c) = option {
-            self.prev_position = self.position;
-            self.position.advance(c);
-            self.consumed.push(c);
+            self.prev = self.mark;
+            self.mark.advance(c);
+            self.mark_end();
         }
         return option;
     }
 
-    /// Remove the next character.
-    pub fn drop_char(&mut self) {
-        if let Some(c) = self.chars.pop() {
-            self.prev_position = self.position;
-            self.position.advance(c);
-        }
-    }
-
-    /// Remove whitespace.
-    pub fn drop_whitespace(&mut self) {
+    /// Consume whitespace.
+    pub fn eat_whitespace(&mut self) {
         while let Some(c) = self.peek_char() {
             match c.is_whitespace() {
-                true => self.drop_char(),
+                true => self.eat_char(),
                 false => break,
-            }
+            };
         }
     }
 
@@ -107,9 +124,11 @@ impl Tokeniser {
     pub fn eat_to_delimiter(&mut self, delim: char) -> Option<String> {
         let mut token = String::new();
         while let Some(c) = self.eat_char() {
-            self.consumed.push(c);
             match c == delim {
-                true => return Some(token),
+                true => {
+                    self.end = self.prev;
+                    return Some(token);
+                }
                 false => token.push(c),
             }
         }
@@ -130,30 +149,38 @@ impl Tokeniser {
     }
 
     /// Mark the next character to be consumed as the start character.
-    pub fn mark_start_position(&mut self) {
-        self.start_position = self.position;
-        self.consumed.clear();
-    }
-
-    /// Mark the previously-consumed character as the start character.
-    pub fn mark_prev_start_position(&mut self) {
-        self.start_position = self.prev_position;
-        let c = self.consumed.chars().last();
-        self.consumed.clear();
-        // Keep the previously pushed character.
-        if let Some(c) = c {
-            self.consumed.push(c);
-        }
+    pub fn mark_start(&mut self) {
+        self.start = self.mark;
+    }
+
+    /// Mark the most recently consumed character as the start character.
+    pub fn mark_start_prev(&mut self) {
+        self.start = self.prev;
+    }
+
+    /// Mark the next character as the character following the end character.
+    pub fn mark_end(&mut self) {
+        self.end = self.mark;
+    }
+
+    /// Mark the most recently consumed character as the character following the end character.
+    pub fn mark_end_prev(&mut self) {
+        self.end = self.prev;
     }
 
-    /// Only call this once per span, it consumes the token string.
-    pub fn get_source_span(&mut self) -> SourceSpan {
+    /// Mark the next character to be consumed as the start of the child.
+    pub fn mark_child(&mut self) {
+        self.child = self.mark;
+    }
+
+    /// Return the SourceSpan between the start and end marks.
+    pub fn get_source(&mut self) -> SourceSpan {
         let in_merged = SourceLocation {
             path: self.source_path.to_owned(),
-            start: self.start_position,
-            end: self.prev_position,
+            start: self.start.position,
+            end: self.end.prev_position,
         };
-        let in_source = if self.start_position.line >= self.embedded_first_line {
+        let in_source = if self.start.position.line >= self.embedded_first_line {
             if let Some(embedded_path) = &self.embedded_path {
                 let offset = self.embedded_first_line;
                 Some(
@@ -176,7 +203,41 @@ impl Tokeniser {
             None
         };
 
-        let string = std::mem::take(&mut self.consumed);
+        let string = self.get_chars(&self.start, &self.end).iter().collect();
         SourceSpan { string, in_merged, in_source }
     }
 }
+
+
+#[derive(Clone, Copy)]
+pub struct TokeniserMark {
+    /// Position of the next character to be consumed.
+    pub position: SourcePosition,
+    /// Index of the next character to be consumed.
+    pub i: usize,
+    /// Position of the most recently consumed character.
+    pub prev_position: SourcePosition,
+    pub prev_prev_position: SourcePosition,
+}
+
+impl TokeniserMark {
+    pub const ZERO: Self = Self {
+        position: SourcePosition::ZERO,
+        i: 0,
+        prev_position: SourcePosition::ZERO,
+        prev_prev_position: SourcePosition::ZERO,
+    };
+
+    /// Advance to the next character.
+    pub fn advance(&mut self, c: char) {
+        self.prev_prev_position = self.prev_position;
+        self.prev_position = self.position;
+        self.position.advance(c);
+        self.i += 1;
+    }
+
+    /// Ignore the most recently consumed character.
+    pub fn undo(&mut self) {
+        self.prev_position = self.prev_prev_position;
+    }
+}
-- 
cgit v1.2.3-70-g09d2