author     Ben Bridle <ben@derelict.engineering>  2025-02-04 18:18:31 +1300
committer  Ben Bridle <ben@derelict.engineering>  2025-02-04 18:18:40 +1300
commit     f69a8f8c312ded212446082682bcabba8e3a9c9f (patch)
tree       887195a8b90235f6a9c584374567b45fffac15b4 /src/translators/syntactic_parser.rs
parent     cf1af202e01cdcbac437ac96f21c4437bf27bb0d (diff)
download   bedrock-asm-f69a8f8c312ded212446082682bcabba8e3a9c9f.zip
Use source code tokeniser from assembler crate
Work-in-progress commit while functionality is moved over to the assembler crate. This commit doesn't compile.
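
For orientation, the sketch below reconstructs the Tokeniser interface this commit relies on, inferred purely from its call sites in the diff that follows. The real type lives in the assembler crate, so everything here (field types, signatures, the Position and SourceSpan stubs) is an assumption except for the names the diff itself uses.

use std::path::PathBuf;

/// Stub for the crate's existing position type (line/column pair).
#[derive(Clone, Copy, Default)]
pub struct Position {
    pub line: usize,
    pub column: usize,
}

/// Stub for the crate's existing source-location type.
pub struct SourceSpan;

pub struct Tokeniser {
    /// Position where the current token began (set by mark_start_position).
    pub start_position: Position,
    /// Path recorded from a full-line `(: path )` comment, if one was seen.
    pub source_path: Option<PathBuf>,
    /// Line in the merged file on which the embedded source file begins.
    pub embedded_first_line: usize,
    // ...plus the character queue and delimiter/terminator sets.
}

impl Tokeniser {
    pub fn new<P: Into<PathBuf>>(_source_code: &str, _path: Option<P>) -> Self {
        unimplemented!()
    }
    /// Register characters that end the current token and stand alone as tokens.
    pub fn add_delimiters(&mut self, _delimiters: &[char]) { unimplemented!() }
    /// Register characters that end the current token but are kept as its
    /// final character (the old eat_token treated ':' this way).
    pub fn add_terminators(&mut self, _terminators: &[char]) { unimplemented!() }
    pub fn drop_whitespace(&mut self) { unimplemented!() }
    /// Record the current position as the start of the next token.
    pub fn mark_start_position(&mut self) { unimplemented!() }
    pub fn eat_char(&mut self) -> Option<char> { unimplemented!() }
    pub fn eat_token(&mut self) -> String { unimplemented!() }
    /// Consume and return everything before `delim`, consuming the delimiter
    /// too; None if the source ends before it is found.
    pub fn eat_to_delimiter(&mut self, _delim: char) -> Option<String> { unimplemented!() }
    /// True if only whitespace remains before the next newline.
    pub fn end_of_line(&self) -> bool { unimplemented!() }
    /// Build the span for the token read since mark_start_position.
    pub fn get_source_span(&mut self) -> SourceSpan { unimplemented!() }
}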
Diffstat (limited to 'src/translators/syntactic_parser.rs')
-rw-r--r--  src/translators/syntactic_parser.rs  178
1 file changed, 24 insertions(+), 154 deletions(-)
diff --git a/src/translators/syntactic_parser.rs b/src/translators/syntactic_parser.rs
index 7279daf..8f0850b 100644
--- a/src/translators/syntactic_parser.rs
+++ b/src/translators/syntactic_parser.rs
@@ -5,117 +5,18 @@ use std::path::PathBuf;
/// Translate raw source code characters into syntactic tokens.
pub struct SyntacticParser {
- /// Path of file from which the source was read.
- path: Option<PathBuf>,
- /// Path of the original source file.
- source_path: Option<PathBuf>,
- /// Position of the next character to be read.
- position: Position,
- /// Previous value of the position field.
- prev_position: Position,
- /// Line where the embedded source file begins.
- source_line_start: usize,
- /// Characters waiting to be parsed, in reverse order.
- chars: Vec<char>,
- /// The token currently being parsed.
- token_source_string: String,
+ tokeniser: Tokeniser,
/// The name of the most recently parsed label.
label: String,
}
impl SyntacticParser {
- /// Parse source code.
pub fn from_source_code<P: Into<PathBuf>>(source_code: &str, path: Option<P>) -> Self {
- Self {
- path: path.map(|p| p.into()),
- source_path: None,
- position: Position { line: 0, column: 0 },
- prev_position: Position { line: 0, column: 0 },
- source_line_start: 0,
- chars: source_code.chars().rev().collect(),
- token_source_string: String::new(),
- label: String::new(),
- }
- }
-
- /// Return the next character, keeping it on the queue.
- fn peek_char(&self) -> Option<char> {
- self.chars.last().copied()
- }
-
- /// Return the next character, removing it from the queue.
- fn eat_char(&mut self) -> Option<char> {
- let option = self.chars.pop();
- if let Some(c) = option {
- self.prev_position = self.position;
- self.position.advance(c);
- self.token_source_string.push(c);
- }
- return option;
- }
-
- /// Remove the next character from the queue.
- fn drop_char(&mut self) {
- if let Some(c) = self.chars.pop() {
- self.prev_position = self.position;
- self.position.advance(c);
- }
- }
-
- /// Remove leading whitespace.
- fn drop_whitespace(&mut self) {
- while let Some(c) = self.peek_char() {
- match c.is_whitespace() {
- true => self.drop_char(),
- false => break,
- }
- }
- }
-
- /// Remove a full token from the queue.
- fn eat_token(&mut self) -> String {
- const DELIMITERS: [char; 13] =
- ['@', '&', '%', ';', '[', ']', '{', '}', '(', '"', '\'', '#', '~'];
- let mut token = String::new();
- while let Some(peek) = self.peek_char() {
- if peek.is_whitespace() || DELIMITERS.contains(&peek) {
- break;
- }
- let c = self.eat_char().unwrap();
- token.push(c);
- if c == ':' {
- break;
- }
- }
- token
- }
-
- /// Return all characters until the delimiter, removing all returned
- /// characters and the delimiter from the queue. Returns None if end
- /// of source is reached before delimiter is found.
- fn eat_to_delim(&mut self, delim: char) -> Option<String> {
- let mut token = String::new();
- while let Some(c) = self.eat_char() {
- self.token_source_string.push(c);
- match c == delim {
- true => return Some(token),
- false => token.push(c),
- }
- }
- return None;
- }
-
- fn is_line_empty(&self) -> bool {
- for c in self.chars.iter().rev() {
- if *c == '\n' {
- return true;
- }
- if !c.is_whitespace() {
- return false
- }
- }
- return false;
+ let mut tokeniser = Tokeniser::new(source_code, path);
+ tokeniser.add_delimiters(&['@','&','%',';','[',']','{','}','(','"','\'','#','~']);
+ tokeniser.add_terminators(&[':']);
+ Self { tokeniser, label: String::new() }
}
}
@@ -127,35 +28,35 @@ impl Iterator for SyntacticParser {
fn next(&mut self) -> Option<SyntacticToken> {
use SyntacticTokenVariant as SynVar;
use SyntacticParseError as SynErr;
+ let t = &mut self.tokeniser;
- self.drop_whitespace();
- let start = self.position;
+ t.drop_whitespace();
+ t.mark_start_position();
- let variant = match self.eat_char()? {
+ let variant = match t.eat_char()? {
'@' => {
- self.label = self.eat_token();
+ self.label = t.eat_token();
SynVar::LabelDefinition(self.label.clone())
}
'&' => {
- let token = self.eat_token();
- let sublabel = format!("{}/{token}", self.label);
- SynVar::LabelDefinition(sublabel)
+ let token = t.eat_token();
+ SynVar::LabelDefinition(format!("{}/{token}", self.label))
}
- '%' => SynVar::MacroDefinition(self.eat_token()),
+ '%' => SynVar::MacroDefinition(t.eat_token()),
';' => SynVar::MacroDefinitionTerminator,
'[' => SynVar::MarkOpen,
']' => SynVar::MarkClose,
'{' => SynVar::BlockOpen,
'}' => SynVar::BlockClose,
- '(' => match self.eat_to_delim(')') {
+ '(' => match t.eat_to_delimiter(')') {
Some(string) => SynVar::Comment(string),
None => SynVar::Error(SynErr::UnterminatedComment),
}
- '\'' => match self.eat_to_delim('\'') {
+ '\'' => match t.eat_to_delimiter('\'') {
Some(string) => SynVar::String(string.as_bytes().to_vec()),
None => SynVar::Error(SynErr::UnterminatedRawString),
}
- '"' => match self.eat_to_delim('"') {
+ '"' => match t.eat_to_delimiter('"') {
Some(string) => {
let mut bytes = string.as_bytes().to_vec();
bytes.push(0x00);
@@ -164,20 +65,20 @@ impl Iterator for SyntacticParser {
None => SynVar::Error(SynErr::UnterminatedNullString),
}
'#' => {
- let token = self.eat_token();
+ let token = t.eat_token();
match token.parse::<Value>() {
Ok(value) => SynVar::Padding(value),
Err(_) => SynVar::Error(SynErr::InvalidPaddingValue(token)),
}
},
'~' => {
- let token = self.eat_token();
+ let token = t.eat_token();
let symbol = format!("{}/{token}", self.label);
SynVar::Symbol(symbol)
}
':' => SynVar::Symbol(String::from(':')),
c => {
- let token = format!("{c}{}", self.eat_token());
+ let token = format!("{c}{}", t.eat_token());
match token.parse::<Value>() {
Ok(value) => SynVar::Literal(value),
Err(_) => match token.parse::<Instruction>() {
@@ -190,47 +91,16 @@ impl Iterator for SyntacticParser {
// Parse source path comments.
if let SynVar::Comment(comment) = &variant {
- // Check that the comment fills the entire line.
- if start.column == 0 && self.is_line_empty() {
+ // Check if the comment fills the entire line.
+ if t.start_position.column == 0 && t.end_of_line() {
if let Some(path) = comment.strip_prefix(": ") {
- self.source_path = Some(PathBuf::from(path.trim()));
- self.source_line_start = start.line + 1;
+ t.source_path = Some(PathBuf::from(path.trim()));
+ t.embedded_first_line = t.start_position.line + 1;
}
}
}
- // Find location in current merged file.
- let in_merged = SourceLocation {
- path: self.path.to_owned(),
- start,
- end: self.prev_position,
- };
-
- // Find location in original source file.
- let in_source = if start.line >= self.source_line_start {
- match &self.source_path {
- Some(path) => {
- let offset = self.source_line_start;
- Some( SourceLocation {
- path: Some(path.to_owned()),
- start: Position {
- line: in_merged.start.line.saturating_sub(offset),
- column: in_merged.start.column,
- },
- end: Position {
- line: in_merged.end.line.saturating_sub(offset),
- column: in_merged.end.column,
- }
- })
- }
- None => None,
- }
- } else {
- None
- };
-
- let string = std::mem::take(&mut self.token_source_string);
- let source = SourceSpan { string, in_merged, in_source };
+ let source = t.get_source_span();
Some( SyntacticToken { source, variant } )
}
}
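
Usage is unchanged by the refactor: the parser is still built from source text and driven as an iterator of SyntacticTokens. A minimal sketch (the source string and the main.brc path are invented for illustration):

let source = "@start #0010 ( a comment )";
let parser = SyntacticParser::from_source_code(source, Some("main.brc"));
for token in parser {
    match token.variant {
        SyntacticTokenVariant::LabelDefinition(name) => println!("label: {name}"),
        SyntacticTokenVariant::Comment(text) => println!("comment:{text}"),
        _ => {}
    }
}

Note the special comment handling retained in next(): a comment of the form (: path ) that starts at column 0 and fills its own line records the path of an embedded source file, letting get_source_span report each token's location in both the merged file and the original source file.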