From e38f108921c61e1e66d65a368f2a67a763d61e69 Mon Sep 17 00:00:00 2001
From: Ben Bridle <bridle.benjamin@gmail.com>
Date: Sat, 6 May 2023 16:19:15 +1200
Subject: About to refactor parser to be a struct with a method for each stage

---
 .gitignore             |   1 +
 Cargo.lock             |   7 ++
 Cargo.toml             |   8 ++
 src/addressing.rs      |  44 +++++++
 src/error.rs           |   9 ++
 src/lib.rs             | 332 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/main.rs            |  41 ++++++
 src/semantic_token.rs  |  38 ++++++
 src/syntactic_token.rs |  43 +++++++
 src/tokenizer.rs       | 183 +++++++++++++++++++++++++++
 10 files changed, 706 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 src/addressing.rs
 create mode 100644 src/error.rs
 create mode 100644 src/lib.rs
 create mode 100644 src/main.rs
 create mode 100644 src/semantic_token.rs
 create mode 100644 src/syntactic_token.rs
 create mode 100644 src/tokenizer.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..a0ffbd8
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "bedrock_asm"
+version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..1bc614d
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "bedrock_asm"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
diff --git a/src/addressing.rs b/src/addressing.rs
new file mode 100644
index 0000000..dd7638e
--- /dev/null
+++ b/src/addressing.rs
@@ -0,0 +1,44 @@
+#[derive(Clone,Copy)]
+pub struct CharAddress {
+    /// The number of lines that precede this line in the file.
+    pub line:usize,
+    /// The number of characters that precede this character in the line.
+    pub column:usize,
+}
+impl CharAddress {
+    pub fn new(line:usize, column:usize) -> Self {
+        Self { line, column }
+    }
+    pub fn zero() -> Self {
+        Self::new(0,0)
+    }
+}
+
+pub struct SourceLocation {
+    /// The slice of the source file from which this token was parsed.
+    pub source: String,
+    /// The address of the first character of this token.
+    pub start: CharAddress,
+    /// The address of the final character of this token.
+    pub end: CharAddress
+}
+impl SourceLocation {
+    pub fn new(source:String, start:CharAddress, end:CharAddress) -> Self {
+        Self { source, start, end }
+    }
+    pub fn zero() -> Self {
+        Self { source:String::new(), start:CharAddress::zero(), end:CharAddress::zero() }
+    }
+}
+
+pub struct BytecodeLocation {
+    /// The number of bytes that precede this byte sequence in the bytecode.
+    pub start: u16,
+    /// The length of this byte sequence, in bytes.
+    pub length: u16,
+}
+impl BytecodeLocation {
+    pub fn zero() -> Self {
+        Self { start:0, length:0 }
+    }
+}
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..efbfc4f
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,9 @@
+#[derive(Clone, Copy, Debug)]
+pub enum Error {
+    UnresolvedReference,
+    DuplicateDefinition,
+    InvalidHexadecimalLiteral,
+    InvalidTypeInMacroDefinition,
+    OrphanedMacroTerminator,
+    TokenPastEndOfProgram,
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..5d84600
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,332 @@
+mod addressing;
+mod syntactic_token;
+mod semantic_token;
+mod tokenizer;
+mod error;
+
+pub use addressing::*;
+pub use syntactic_token::*;
+pub use semantic_token::*;
+pub use error::*;
+pub use tokenizer::*;
+
+use std::collections::hash_map::{HashMap, Entry};
+use std::mem::take;
+
+// On Unicode support: Work with characters, not bytes. This will eventually be
+// used in Verdant and Doctrine, and it'd be nice to be able to support other languages.
+// The only reason to work with bytes over characters would be for a minor decrease in complexity.
+// Only support the assembly of files of up to 64kB. If assets need to be tacked on the end,
+// it can be done by another program. The VM will only be able to access the first 64kB of a file anyway.
+// Treat \t as a single space character.
+
+// First, turn the program source code into a vector of SyntacticTokens. These
+// each contain a SourceLocation, and the type and value of the token. Every single
+// non-whitespace character of the program needs to be wrapped in a SyntacticToken.
+// The program source code can be accurately reconstructed from this list of
+// SyntacticTokens, and when I write GRID, hovering the mouse over any point
+// in the program listing will identify the exact token under the cursor.
+// For macros, hovering over any character belonging to a macro definition will
+// highlight the entire macro definition, and also the currently-hovered body token
+// if there is one. Clicking the body token will bring up more information.
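+// As a concrete example of the mapping (derived from the tokenizer below),
+// the source fragment "@draw PSH:41" becomes three tokens: a
+// LabelDefinition("draw"), an Instruction(0x50) for "PSH:", and a Byte(0x41),
+// each carrying the start and end character addresses of its source slice.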
+
+// The SyntacticTokens will be collected into a vector, with label and macro
+// definitions being constructed as we go. Label definitions are easy; I only
+// need to note down the names of the labels in order to validate label
+// references in a later step. If a label or macro name has already been
+// defined, tag the token with an error.
+// Macro body tokens are collected into their macro definitions as they are read.
+
+// Step 2 converts SyntacticTokens into SemanticTokens, in preparation for
+// generating bytecode. Label and macro definitions need to contain a list of
+// usize indices pointing to their reference tokens. Macro definitions need to
+// contain their body tokens as SemanticTokens, and label and macro references
+// need to point back to their parent definitions. Can I stream-convert tokens
+// from Syntactic to Semantic? Yes: each SyntacticToken becomes exactly one
+// SemanticToken.
+
+// I want to change the parser to be a struct with a method for each stage,
+// holding its own state between stages.
+
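+// A minimal sketch of that shape (all names below are placeholders rather
+// than a final API):
+#[allow(dead_code)]
+struct Assembler {
+    syntactic_tokens: Vec<SyntacticToken>,
+    semantic_tokens: Vec<SemanticToken>,
+    bytecode: Vec<u8>,
+}
+#[allow(dead_code)]
+impl Assembler {
+    fn tokenize(&mut self, _source_code: &str) { todo!() } // step 1
+    fn resolve_references(&mut self) { todo!() }            // step 2
+    fn generate_bytecode(&mut self) { todo!() }             // steps 3 and 4
+}
+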
+enum SymbolDefinition { Macro(usize), Label(usize) }
+
+pub fn parse(source_code: &str) {
+    use SyntacticTokenType as Syn;
+    use SemanticTokenType as Sem;
+
+    // ============================ STEP 1 ============================
+    // Convert the source code into a sorted vector of syntactic tokens and a
+    // map of symbol definitions.
+    // ================================================================
+    println!("[DEBUG] STEP 1: Parse source code into syntactic tokens");
+    let mut syntactic_tokens: Vec<SyntacticToken> = Vec::new();
+    let mut symbol_definitions: HashMap<String,SymbolDefinition> = HashMap::new();
+    let mut macro_bodies: HashMap<usize, Vec<SyntacticToken>> = HashMap::new();
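+    // While a macro body is being collected, this holds the index of the
+    // macro definition token within `syntactic_tokens`.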
+    let mut macro_definition: Option<usize> = None;
+    let mut macro_definition_body_tokens: Vec<SyntacticToken> = Vec::new();
+
+    for mut token in TokenIterator::from_str(source_code) {
+        if let Some(mdt) = macro_definition {
+            token.use_in_macro_body();
+            let terminate = token.is_macro_terminator();
+            macro_definition_body_tokens.push(token);
+            if terminate {
+                macro_bodies.insert(mdt, take(&mut macro_definition_body_tokens));
+                macro_definition = None;
+            }
+        } else {
+            if let Syn::MacroDefinition(ref name) = token.r#type {
+                macro_definition = Some(syntactic_tokens.len());
+                match symbol_definitions.entry(name.to_string()) {
+                    Entry::Occupied(_) => {token.set_error(Error::DuplicateDefinition);}
+                    Entry::Vacant(v) => {v.insert(SymbolDefinition::Macro(syntactic_tokens.len()));}
+                }
+            } else if let Syn::LabelDefinition(ref name) = token.r#type {
+                match symbol_definitions.entry(name.to_string()) {
+                    Entry::Occupied(_) => {token.set_error(Error::DuplicateDefinition);}
+                    Entry::Vacant(v) => {v.insert(SymbolDefinition::Label(syntactic_tokens.len()));}
+                }
+            } else if token.is_macro_terminator() {
+                token.set_error(Error::OrphanedMacroTerminator);
+            }
+            syntactic_tokens.push(token);
+        }
+    }
+
+
+    // ============================ STEP 2 ============================
+    // Convert syntactic tokens into semantic tokens, resolving label and macro
+    // references in the process.
+    // ================================================================
+    println!("[DEBUG] STEP 2: Resolve label and macro references");
+    let syntactic_token_count = syntactic_tokens.len();
+    let mut semantic_tokens = Vec::new();
+    let mut semantic_macro_bodies: HashMap<usize, Vec<SemanticToken>> = HashMap::new();
+
+    for (i, mut syn_token) in syntactic_tokens.into_iter().enumerate() {
+        let sem_token_type = if let Some(err) = syn_token.error {
+            // Translate over any existing syntax errors
+            Sem::Error(syn_token.r#type, err)
+        } else {
+            match syn_token.r#type {
+                Syn::Reference(ref name) => {
+                    match symbol_definitions.get(name) {
+                        Some(SymbolDefinition::Macro(addr)) => Sem::MacroReference(*addr),
+                        Some(SymbolDefinition::Label(addr)) => Sem::LabelReference(*addr),
+                        None => Sem::Error(syn_token.r#type, Error::UnresolvedReference),
+                    }
+                }
+                Syn::LabelDefinition(name) => {
+                    let label_definition = LabelDefinition {
+                        name, address: 0, references: Vec::new() };
+                    Sem::LabelDefinition(label_definition)
+                }
+                Syn::MacroDefinition(name) => {
+                    let mut sem_body_tokens = Vec::new();
+                    // Iterate over every token in the body of the macro definition,
+                    // converting each one to a semantic token.
+                    for syn_body_token in macro_bodies.remove(&i).unwrap() {
+                        let sem_body_token_type = if let Some(err) = syn_body_token.error {
+                            // Translate over any existing syntax errors
+                            Sem::Error(syn_body_token.r#type, err)
+                        } else {
+                            match syn_body_token.r#type {
+                                Syn::Reference(ref name) => match symbol_definitions.get(name) {
+                                    Some(SymbolDefinition::Macro(addr)) => Sem::MacroReference(*addr),
+                                    Some(SymbolDefinition::Label(addr)) => Sem::LabelReference(*addr),
+                                    None => Sem::Error(syn_body_token.r#type, Error::UnresolvedReference),
+                                },
+
+                                Syn::LabelDefinition(_) => unreachable!(),
+                                Syn::MacroDefinition(_) => unreachable!(),
+                                Syn::MacroTerminator => {
+                                    syn_token.source_location.end =
+                                        syn_body_token.source_location.end;
+                                    Sem::MacroTerminator
+                                },
+
+                                Syn::Pad(v) => Sem::Pad(v),
+                                Syn::Byte(v) => Sem::Byte(v),
+                                Syn::Short(v) => Sem::Short(v),
+                                Syn::Instruction(v) => Sem::Instruction(v),
+
+                                Syn::Comment => Sem::Comment,
+                            }
+                        };
+                        let sem_body_token = SemanticToken {
+                            r#type: sem_body_token_type,
+                            source_location: syn_body_token.source_location,
+                            bytecode_location: BytecodeLocation::zero(),
+                        };
+                        sem_body_tokens.push(sem_body_token);
+                    }
+                    semantic_macro_bodies.insert(i, sem_body_tokens);
+                    let macro_definition = MacroDefinition {
+                        name, body_tokens: Vec::new(), references: Vec::new() };
+                    Sem::MacroDefinition(macro_definition)
+                }
+                Syn::MacroTerminator => unreachable!(),
+
+                Syn::Pad(v) => Sem::Pad(v),
+                Syn::Byte(v) => Sem::Byte(v),
+                Syn::Short(v) => Sem::Short(v),
+                Syn::Instruction(v) => Sem::Instruction(v),
+
+                Syn::Comment => Sem::Comment,
+            }
+        };
+        let sem_token = SemanticToken {
+            r#type: sem_token_type,
+            source_location: syn_token.source_location,
+            bytecode_location: BytecodeLocation::zero(),
+        };
+        semantic_tokens.push(sem_token);
+    }
+    assert_eq!(syntactic_token_count, semantic_tokens.len());
+
+
+    // ============================ STEP 3 ============================
+    // Iterate over each semantic token, generating bytecode.
+    // ================================================================
+    println!("[DEBUG] STEP 3: Generate bytecode");
+    let mut bytecode: Vec<u8> = Vec::new();
+    // Map each label token to a list of bytecode addresses to populate
+    let mut label_reference_addresses: HashMap<usize, Vec<u16>> = HashMap::new();
+    // Map each label or macro definition token to a list of reference token pointers
+    let mut references: HashMap<usize, Vec<usize>> = HashMap::new();
+
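+    // Helper macros for emitting bytecode: addr!() gives the current bytecode
+    // address, push_u8!/push_u16! append values (u16s in big-endian order),
+    // and pad!() extends the bytecode with zeroes.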
+    macro_rules! addr {() => {bytecode.len() as u16};}
+    macro_rules! push_u8 {($v:expr) => {bytecode.push($v); 1};}
+    macro_rules! push_u16 {($v:expr) => {bytecode.extend_from_slice(&$v.to_be_bytes()); 2};}
+    macro_rules! pad {($p:expr) => {bytecode.resize(bytecode.len() + $p as usize, 0); $p as u16};}
+
+    for (i, sem_token) in semantic_tokens.iter_mut().enumerate() {
+        let start_addr = addr!();
+        let byte_length: u16 = match &mut sem_token.r#type {
+            Sem::LabelReference(addr) => {
+                references.entry(*addr).or_default().push(i);
+                label_reference_addresses.entry(*addr).or_default().push(addr!());
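+                // Push a two-byte placeholder; the real address is patched in step 4.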
+                push_u16!(0u16); 2
+            },
+            Sem::MacroReference(addr) => {
+                references.entry(*addr).or_default().push(i);
+                let mut macro_byte_length: u16 = 0;
+                for body_token in semantic_macro_bodies.get(addr).unwrap() {
+                    macro_byte_length += match &body_token.r#type {
+                        Sem::LabelReference(addr) => {
+                            label_reference_addresses.entry(*addr).or_default().push(addr!());
+                            push_u16!(0u16); 2
+                        },
+                        Sem::MacroReference(_) => todo!(),
+
+                        Sem::LabelDefinition(_) => unreachable!(),
+                        Sem::MacroDefinition(_) => unreachable!(),
+
+                        Sem::Pad(p) => { pad!(*p); *p },
+                        Sem::Byte(b) => { push_u8!(*b); 1 },
+                        Sem::Short(s) => { push_u16!(*s); 2 },
+                        Sem::Instruction(b) => { push_u8!(*b); 1 },
+
+                        Sem::MacroTerminator => 0,
+                        Sem::Comment => 0,
+                        Sem::Error(..) => 0,
+                    };
+                }
+                macro_byte_length
+            },
+
+            Sem::LabelDefinition(definition) => { definition.address = addr!(); 0 }, // a label emits no bytes
+            Sem::MacroDefinition(_) => 0,
+
+            Sem::Pad(p) => { pad!(*p); *p },
+            Sem::Byte(b) => { push_u8!(*b); 1 },
+            Sem::Short(s) => { push_u16!(*s); 2 },
+            Sem::Instruction(b) => { push_u8!(*b); 1 },
+
+            Sem::MacroTerminator => unreachable!(),
+            Sem::Comment => 0,
+            Sem::Error(..) => 0,
+        };
+        sem_token.bytecode_location.start = start_addr;
+        sem_token.bytecode_location.length = byte_length;
+    }
+
+
+    // ============================ STEP 4 ============================
+    // Fill in addresses for label references.
+    // ================================================================
+    println!("[DEBUG] STEP 4: Fill in values for label references");
+    for (label_i, slots) in label_reference_addresses.iter() {
+        if let Sem::LabelDefinition(LabelDefinition { address, .. }) = semantic_tokens[*label_i].r#type {
+            let [h,l] = address.to_be_bytes();
+            for slot in slots {
+                bytecode[*slot as usize] = h;
+                bytecode[slot.wrapping_add(1) as usize] = l;
+            }
+        } else {
+            unreachable!()
+        }
+    }
+
+    // ============================ STEP 5 ============================
+    // Move references and macro body tokens into label and macro definitions.
+    // ================================================================
+    println!("[DEBUG] STEP 5: Move information into label and macro definition tokens");
+    for (i, token) in semantic_tokens.iter_mut().enumerate() {
+        if let Sem::MacroDefinition(macro_definition) = &mut token.r#type {
+            macro_definition.body_tokens = semantic_macro_bodies.remove(&i).unwrap();
+            if let Some(macro_references) = references.remove(&i) {
+                macro_definition.references = macro_references;
+            }
+        } else if let Sem::LabelDefinition(label_definition) = &mut token.r#type {
+            if let Some(label_references) = references.remove(&i) {
+                label_definition.references = label_references;
+            }
+        }
+    }
+    assert_eq!(references.len(), 0);
+
+
+    // ============================ STEP 6 ============================
+    // Remove trailing null-bytes from the bytecode.
+    // ================================================================
+    println!("[DEBUG] STEP 6: Trim trailing null bytes");
+    if let Some(final_nonnull_byte) = bytecode.iter().rposition(|b| *b != 0) {
+        let truncated_length = final_nonnull_byte + 1;
+        let removed_byte_count = bytecode.len() - truncated_length;
+        if removed_byte_count > 0 {
+            println!("[INFO] Removed {removed_byte_count} trailing null bytes from assembled bytecode");
+            bytecode.truncate(truncated_length);
+        }
+    }
+
+
+    for token in &semantic_tokens {
+        if let Sem::MacroDefinition(macro_definition) = &token.r#type {
+            for body_token in &macro_definition.body_tokens {
+                if let Sem::Error(_, err) = body_token.r#type {
+                    println!("[ERROR] (in macro '{}') {err:?} at {}:{}..{}:{}",
+                        macro_definition.name,
+                        body_token.source_location.start.line,
+                        body_token.source_location.start.column,
+                        body_token.source_location.end.line,
+                        body_token.source_location.end.column,
+                    )
+                }
+            }
+        } else if let Sem::Error(_, err) = token.r#type {
+            println!("[ERROR {}:{}-{}:{}] {err:?}",
+                token.source_location.start.line,
+                token.source_location.start.column,
+                token.source_location.end.line,
+                token.source_location.end.column,
+            )
+        }
+    }
+
+    println!("");
+    print!("Generated bytecode: [ ");
+    for i in &bytecode {
+        print!("{i:02x} ");
+    }
+    println!("]");
+}
+
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..82bd92d
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,41 @@
+const SOURCE:&'static str =
+// "
+// %SCREEN-SHUNT 00;
+// %SCREEN-DRAW 00;
+
+// @draw_horizontal_line  ( len* clr -- )
+// (1)  PSHr                       ( len*    | clr )
+// (4)  PSH:41 STD:SCREEN-SHUNT    ( len*    | clr )
+//       &loop
+// (2)   SKDr:SCREEN-DRAW          ( len*    | clr )
+// (4)   DEC* JKC*:~loop           ( len*    | clr )
+// (2)  STD:SCREEN-SHUNT
+// (3)  POP POPr JMPr*
+// ";
+
+"
+%RED 1234;
+%GREEN 5678 @test;
+%BLUE 9ABC;
+
+@start
+    RED
+    start
+    GREEN
+    BLUE
+    $4
+    @end
+";
+
+use bedrock_asm::*;
+
+fn main() {
+    println!("------- PROGRAM START -------");
+    for line in SOURCE.lines() {
+        println!("{line}");
+    }
+    println!("-------- PROGRAM END --------");
+    println!();
+
+    parse(SOURCE);
+}
diff --git a/src/semantic_token.rs b/src/semantic_token.rs
new file mode 100644
index 0000000..cac82a9
--- /dev/null
+++ b/src/semantic_token.rs
@@ -0,0 +1,38 @@
+use crate::*;
+
+pub enum SemanticTokenType {
+    LabelReference(usize),
+    MacroReference(usize),
+
+    LabelDefinition(LabelDefinition),
+    MacroDefinition(MacroDefinition),
+
+    Pad(u16),
+    Byte(u8),
+    Short(u16),
+    Instruction(u8),
+
+    MacroTerminator,
+    Comment,
+    Error(SyntacticTokenType, Error),
+}
+
+pub struct SemanticToken {
+    pub r#type: SemanticTokenType,
+    pub source_location: SourceLocation,
+    pub bytecode_location: BytecodeLocation,
+}
+
+pub struct LabelDefinition {
+    pub name: String,
+    pub address: u16,
+    /// Indices of this label's reference tokens within the semantic token list.
+    pub references: Vec<usize>,
+}
+pub struct MacroDefinition {
+    pub name: String,
+    pub body_tokens: Vec<SemanticToken>,
+    /// Indices of this macro's reference tokens within the semantic token list.
+    pub references: Vec<usize>,
+}
+
diff --git a/src/syntactic_token.rs b/src/syntactic_token.rs
new file mode 100644
index 0000000..ee473e2
--- /dev/null
+++ b/src/syntactic_token.rs
@@ -0,0 +1,43 @@
+use crate::*;
+
+pub enum SyntacticTokenType {
+    Reference(String),
+
+    LabelDefinition(String),
+    MacroDefinition(String),
+    MacroTerminator,
+
+    Pad(u16),
+    Byte(u8),
+    Short(u16),
+    Instruction(u8),
+
+    Comment,
+}
+
+pub struct SyntacticToken {
+    pub r#type: SyntacticTokenType,
+    pub source_location: SourceLocation,
+    pub error: Option<Error>,
+}
+
+impl SyntacticToken {
+    /// Call when this token is found inside a macro definition.
+    pub fn use_in_macro_body(&mut self) {
+        match self.r#type {
+            SyntacticTokenType::LabelDefinition(..) |
+            SyntacticTokenType::MacroDefinition(..) => {
+                self.set_error(Error::InvalidTypeInMacroDefinition)
+            }
+            _ => (),
+        };
+    }
+
+    pub fn set_error(&mut self, error: Error) {
+        self.error = Some(error);
+    }
+
+    pub fn is_macro_terminator(&self) -> bool {
+        matches!(self.r#type, SyntacticTokenType::MacroTerminator)
+    }
+}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
new file mode 100644
index 0000000..b68cc14
--- /dev/null
+++ b/src/tokenizer.rs
@@ -0,0 +1,183 @@
+use std::mem::take;
+use crate::*;
+
+pub struct TokenIterator {
+    /// The characters that comprise the program source code.
+    chars: Vec<char>,
+    /// The index of the next character to read.
+    i: usize,
+    /// The address of the next character to read.
+    addr: CharAddress,
+    /// If true, skip over any whitespace characters. If false, stop reading
+    /// when a whitespace character is encountered.
+    skip_whitespace: bool,
+    /// The name of the most recently defined label.
+    label: String,
+
+    /// The address of the first character of the current token.
+    start: CharAddress,
+    /// The address of the final character of the current token.
+    end: CharAddress,
+    /// The entire current token.
+    source: String,
+    /// The first character of the current token.
+    prefix: char,
+    /// The second and remaining characters of the current token.
+    suffix: String,
+}
+
+impl TokenIterator {
+    /// Create an iterator from a string of program source code.
+    pub fn from_str(source_code: &str) -> Self {
+        Self {
+            chars: source_code.chars().collect(),
+            i: 0,
+            addr: CharAddress::zero(),
+            skip_whitespace: true,
+            label: String::new(),
+            start: CharAddress::zero(),
+            end: CharAddress::zero(),
+            source: String::new(),
+            prefix: ' ',
+            suffix: String::new(),
+        }
+    }
+    /// Append a character to the current token.
+    fn push(&mut self, c:char) {
+        self.end = self.addr;
+        self.source.push(c);
+        self.suffix.push(c);
+        self.next(c);
+    }
+    /// Move forward to the next source character.
+    fn next(&mut self, c: char) {
+        self.addr.column += 1;
+        self.i += 1;
+        if c == '\n' {
+            self.addr.column = 0;
+            self.addr.line += 1;
+        }
+    }
+    /// Mark the current character as being the first character of a new token.
+    fn mark_start(&mut self, c:char) {
+        self.start=self.addr;
+        self.end=self.addr;
+        self.prefix=c;
+        self.source.push(c);
+        self.skip_whitespace=false;
+        self.next(c);
+    }
+}
+
+impl Iterator for TokenIterator {
+    type Item = SyntacticToken;
+
+    fn next(&mut self) -> Option<SyntacticToken> {
+        // Initialise values before reading the next token
+        let mut is_comment = false;
+        self.skip_whitespace = true;
+
+        // Iterate over source characters until a full token is read
+        while let Some(c) = self.chars.get(self.i) {
+            let c = *c;
+            // Intercept comments
+            if is_comment {
+                self.push(c);
+                if c == ')' { break; } else { continue; }
+            } else if self.skip_whitespace && c == '(' {
+                is_comment = true;
+                self.mark_start(c);
+                continue;
+            }
+            // Allow a semicolon on the end of a token to be handled as a separate token
+            if self.source.len() > 0 && c == ';' { break; }
+            // Handle the current character
+            match (is_whitespace(c), self.skip_whitespace) {
+                (true, true) => self.next(c),        // c is the expected leading whitespace
+                (false, true) => self.mark_start(c), // c is the first character of the token
+                (false, false) => self.push(c),      // c is a character of the token
+                (true, false) => break,              // c is trailing whitespace
+            }
+            // Allow literal values to be attached to the end of the previous token
+            if self.source.len() > 0 && c == ':' { break; }
+        }
+
+        // If no source characters were grabbed then we have read through the entire source file
+        if self.source.len() == 0 { return None; }
+        // Allow handling macro terminators and symbols of length 1 in the match expression
+        if self.suffix.len() == 0 { self.prefix = '\0'; }
+        // Consume the collected characters to be used in the match expression
+        let full = take(&mut self.source);
+        let suffix = take(&mut self.suffix);
+        let mut error = None;
+        let mut parse_hex_lit = |v| parse_short(v).unwrap_or_else(|| {
+            error = Some(Error::InvalidHexadecimalLiteral);
+            0
+        });
+
+        let r#type = match self.prefix {
+            '(' => { SyntacticTokenType::Comment }
+            '@' => { SyntacticTokenType::LabelDefinition({self.label=suffix.clone(); suffix}) }
+            '&' => { SyntacticTokenType::LabelDefinition(format!("{}/{}", self.label, suffix)) }
+            '$' => { SyntacticTokenType::Pad(parse_hex_lit(&suffix)) }
+            '~' => { SyntacticTokenType::Reference(format!("{}/{}", self.label, suffix)) }
+            '%' => { SyntacticTokenType::MacroDefinition(suffix) }
+            _ => {
+                if ";" == &full                                    { SyntacticTokenType::MacroTerminator }
+                else if let Some(value) = parse_byte_lit(&full)    { SyntacticTokenType::Byte(value) }
+                else if let Some(value) = parse_short_lit(&full)   { SyntacticTokenType::Short(value) }
+                else if let Some(value) = parse_instruction(&full) { SyntacticTokenType::Instruction(value) }
+                else                                               { SyntacticTokenType::Reference(full.clone()) }
+            }
+        };
+        Some(SyntacticToken {
+            r#type,
+            source_location:SourceLocation::new(full,self.start,self.end),
+            error,
+        })
+    }
+}
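+
+// A small sanity check for the tokenizer, as a sketch; token types don't
+// implement PartialEq yet, so these tests only assert token counts.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn counts_tokens() {
+        assert_eq!(TokenIterator::from_str("@start 41 ;").count(), 3);
+        // "PSH:41" splits after the ':' into an instruction and a byte literal.
+        assert_eq!(TokenIterator::from_str("PSH:41").count(), 2);
+        // A comment is a single token, from '(' through ')'.
+        assert_eq!(TokenIterator::from_str("( a comment )").count(), 1);
+    }
+}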
+
+
+fn parse_byte_lit(token: &str) -> Option<u8> {
+    match token.len() { 2 => u8::from_str_radix(token, 16).ok(), _ => None } }
+fn parse_short_lit(token: &str) -> Option<u16> {
+    match token.len() { 4 => u16::from_str_radix(token, 16).ok(), _ => None } }
+fn parse_short(token: &str) -> Option<u16> {
+    match token.len() { 1..=4 => u16::from_str_radix(token, 16).ok(), _ => None } }
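+// Note that '[' and ']' are treated as whitespace (presumably serving only as
+// visual grouping), and '(' also ends the preceding token so that a comment
+// can directly follow a token without intervening whitespace.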
+fn is_whitespace(c: char) -> bool {
+    match c { ' '|'\t'|'\n'|'\r'|'['|']'|'(' =>true, _=>false } }
+fn parse_instruction(token: &str) -> Option<u8> {
+    Some(match token {
+        "BRK"=>0x00,"BRK*"=>0x20,"BRK:"=>0x40,"BRK*:"=>0x60,"BRKr"=>0x80,"BRKr*"=>0xA0,"BRKr:"=>0xC0,"BRKr*:"=>0xE0,
+        "EQU"=>0x01,"EQU*"=>0x21,"EQU:"=>0x41,"EQU*:"=>0x61,"EQUr"=>0x81,"EQUr*"=>0xA1,"EQUr:"=>0xC1,"EQUr*:"=>0xE1,
+        "LTH"=>0x02,"LTH*"=>0x22,"LTH:"=>0x42,"LTH*:"=>0x62,"LTHr"=>0x82,"LTHr*"=>0xA2,"LTHr:"=>0xC2,"LTHr*:"=>0xE2,
+        "GTH"=>0x03,"GTH*"=>0x23,"GTH:"=>0x43,"GTH*:"=>0x63,"GTHr"=>0x83,"GTHr*"=>0xA3,"GTHr:"=>0xC3,"GTHr*:"=>0xE3,
+        "JMP"=>0x04,"JMP*"=>0x24,"JMP:"=>0x44,"JMP*:"=>0x64,"JMPr"=>0x84,"JMPr*"=>0xA4,"JMPr:"=>0xC4,"JMPr*:"=>0xE4,
+        "JSR"=>0x05,"JSR*"=>0x25,"JSR:"=>0x45,"JSR*:"=>0x65,"JSRr"=>0x85,"JSRr*"=>0xA5,"JSRr:"=>0xC5,"JSRr*:"=>0xE5,
+        "JCN"=>0x06,"JCN*"=>0x26,"JCN:"=>0x46,"JCN*:"=>0x66,"JCNr"=>0x86,"JCNr*"=>0xA6,"JCNr:"=>0xC6,"JCNr*:"=>0xE6,
+        "JKC"=>0x07,"JKC*"=>0x27,"JKC:"=>0x47,"JKC*:"=>0x67,"JKCr"=>0x87,"JKCr*"=>0xA7,"JKCr:"=>0xC7,"JKCr*:"=>0xE7,
+        "LDA"=>0x08,"LDA*"=>0x28,"LDA:"=>0x48,"LDA*:"=>0x68,"LDAr"=>0x88,"LDAr*"=>0xA8,"LDAr:"=>0xC8,"LDAr*:"=>0xE8,
+        "LKA"=>0x09,"LKA*"=>0x29,"LKA:"=>0x49,"LKA*:"=>0x69,"LKAr"=>0x89,"LKAr*"=>0xA9,"LKAr:"=>0xC9,"LKAr*:"=>0xE9,
+        "STA"=>0x0A,"STA*"=>0x2A,"STA:"=>0x4A,"STA*:"=>0x6A,"STAr"=>0x8A,"STAr*"=>0xAA,"STAr:"=>0xCA,"STAr*:"=>0xEA,
+        "SKA"=>0x0B,"SKA*"=>0x2B,"SKA:"=>0x4B,"SKA*:"=>0x6B,"SKAr"=>0x8B,"SKAr*"=>0xAB,"SKAr:"=>0xCB,"SKAr*:"=>0xEB,
+        "LDD"=>0x0C,"LDD*"=>0x2C,"LDD:"=>0x4C,"LDD*:"=>0x6C,"LDDr"=>0x8C,"LDDr*"=>0xAC,"LDDr:"=>0xCC,"LDDr*:"=>0xEC,
+        "LKD"=>0x0D,"LKD*"=>0x2D,"LKD:"=>0x4D,"LKD*:"=>0x6D,"LKDr"=>0x8D,"LKDr*"=>0xAD,"LKDr:"=>0xCD,"LKDr*:"=>0xED,
+        "STD"=>0x0E,"STD*"=>0x2E,"STD:"=>0x4E,"STD*:"=>0x6E,"STDr"=>0x8E,"STDr*"=>0xAE,"STDr:"=>0xCE,"STDr*:"=>0xEE,
+        "SKD"=>0x0F,"SKD*"=>0x2F,"SKD:"=>0x4F,"SKD*:"=>0x6F,"SKDr"=>0x8F,"SKDr*"=>0xAF,"SKDr:"=>0xCF,"SKDr*:"=>0xEF,
+        "PSH"=>0x10,"PSH*"=>0x30,"PSH:"=>0x50,"PSH*:"=>0x70,"PSHr"=>0x90,"PSHr*"=>0xB0,"PSHr:"=>0xD0,"PSHr*:"=>0xF0,
+        "POP"=>0x11,"POP*"=>0x31,"POP:"=>0x51,"POP*:"=>0x71,"POPr"=>0x91,"POPr*"=>0xB1,"POPr:"=>0xD1,"POPr*:"=>0xF1,
+        "SHF"=>0x12,"SHF*"=>0x32,"SHF:"=>0x52,"SHF*:"=>0x72,"SHFr"=>0x92,"SHFr*"=>0xB2,"SHFr:"=>0xD2,"SHFr*:"=>0xF2,
+        "SHC"=>0x13,"SHC*"=>0x33,"SHC:"=>0x53,"SHC*:"=>0x73,"SHCr"=>0x93,"SHCr*"=>0xB3,"SHCr:"=>0xD3,"SHCr*:"=>0xF3,
+        "SWP"=>0x14,"SWP*"=>0x34,"SWP:"=>0x54,"SWP*:"=>0x74,"SWPr"=>0x94,"SWPr*"=>0xB4,"SWPr:"=>0xD4,"SWPr*:"=>0xF4,
+        "ROT"=>0x15,"ROT*"=>0x35,"ROT:"=>0x55,"ROT*:"=>0x75,"ROTr"=>0x95,"ROTr*"=>0xB5,"ROTr:"=>0xD5,"ROTr*:"=>0xF5,
+        "DUP"=>0x16,"DUP*"=>0x36,"DUP:"=>0x56,"DUP*:"=>0x76,"DUPr"=>0x96,"DUPr*"=>0xB6,"DUPr:"=>0xD6,"DUPr*:"=>0xF6,
+        "OVR"=>0x17,"OVR*"=>0x37,"OVR:"=>0x57,"OVR*:"=>0x77,"OVRr"=>0x97,"OVRr*"=>0xB7,"OVRr:"=>0xD7,"OVRr*:"=>0xF7,
+        "ADD"=>0x18,"ADD*"=>0x38,"ADD:"=>0x58,"ADD*:"=>0x78,"ADDr"=>0x98,"ADDr*"=>0xB8,"ADDr:"=>0xD8,"ADDr*:"=>0xF8,
+        "SUB"=>0x19,"SUB*"=>0x39,"SUB:"=>0x59,"SUB*:"=>0x79,"SUBr"=>0x99,"SUBr*"=>0xB9,"SUBr:"=>0xD9,"SUBr*:"=>0xF9,
+        "INC"=>0x1A,"INC*"=>0x3A,"INC:"=>0x5A,"INC*:"=>0x7A,"INCr"=>0x9A,"INCr*"=>0xBA,"INCr:"=>0xDA,"INCr*:"=>0xFA,
+        "DEC"=>0x1B,"DEC*"=>0x3B,"DEC:"=>0x5B,"DEC*:"=>0x7B,"DECr"=>0x9B,"DECr*"=>0xBB,"DECr:"=>0xDB,"DECr*:"=>0xFB,
+        "NOT"=>0x1C,"NOT*"=>0x3C,"NOT:"=>0x5C,"NOT*:"=>0x7C,"NOTr"=>0x9C,"NOTr*"=>0xBC,"NOTr:"=>0xDC,"NOTr*:"=>0xFC,
+        "AND"=>0x1D,"AND*"=>0x3D,"AND:"=>0x5D,"AND*:"=>0x7D,"ANDr"=>0x9D,"ANDr*"=>0xBD,"ANDr:"=>0xDD,"ANDr*:"=>0xFD,
+        "IOR"=>0x1E,"IOR*"=>0x3E,"IOR:"=>0x5E,"IOR*:"=>0x7E,"IORr"=>0x9E,"IORr*"=>0xBE,"IORr:"=>0xDE,"IORr*:"=>0xFE,
+        "XOR"=>0x1F,"XOR*"=>0x3F,"XOR:"=>0x5F,"XOR*:"=>0x7F,"XORr"=>0x9F,"XORr*"=>0xBF,"XORr:"=>0xDF,"XORr*:"=>0xFF,
+        _ => return None,
+    })
+}
-- 
cgit v1.2.3-70-g09d2