author | Ben Bridle <bridle.benjamin@gmail.com> | 2023-05-06 16:19:15 +1200 |
---|---|---|
committer | Ben Bridle <bridle.benjamin@gmail.com> | 2023-05-06 16:19:15 +1200 |
commit | e38f108921c61e1e66d65a368f2a67a763d61e69 (patch) | |
tree | 2718330c1e9963a21bc08db3ddc18574b078d004 /src/lib.rs | |
download | bedrock-asm-e38f108921c61e1e66d65a368f2a67a763d61e69.zip | |
About to refactor parser to be a struct with a method for each stage
Diffstat (limited to 'src/lib.rs')
-rw-r--r-- | src/lib.rs | 332 |
1 files changed, 332 insertions, 0 deletions
```diff
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..5d84600
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,332 @@
+mod addressing;
+mod syntactic_token;
+mod semantic_token;
+mod tokenizer;
+mod error;
+
+pub use addressing::*;
+pub use syntactic_token::*;
+pub use semantic_token::*;
+pub use error::*;
+pub use tokenizer::*;
+
+use std::collections::hash_map::{HashMap, Entry};
+use std::mem::take;
+
+// On Unicode support: Work with characters, not bytes. This will eventually be
+// used in Verdant and Doctrine, and it'd be nice to be able to support other languages.
+// The only reason to work with bytes over characters would be for a minor decrease in complexity.
+// Only support the assembly of files of up to 64kB. If assets need to be tacked on the end,
+// it can be done by another program. The VM will only be able to access the first 64kB of a file anyway.
+// Treat \t as a space, have it be a single character.
+
+// First, turn the program source code into a vector of SyntacticTokens. These
+// each contain a SourceLocation, and the type and value of the token. Every single
+// non-whitespace character of the program needs to be wrapped in a SyntacticToken.
+// The program source code can be accurately reconstructed from this list of
+// SyntacticTokens, and when I write GRID, if the mouse is hovering over any point
+// in the program listing, I'll be able to determine the exact token that is being hovered.
+// For macros, hovering over any character belonging to a macro definition will
+// highlight the entire macro definition, and also the currently-hovered body token
+// if there is one. Clicking the body token will bring up more information.
+
+// The SyntacticTokens will be collected into a vector, with label and macro definitions
+// being constructed as we go. Label definitions are easy; I only need to note down the
+// names of the labels in order to validate label references in a later step. If a label
+// name has already been defined, tag the token with an error. If a macro name has already
+// been defined, tag the token with an error.
+// Collect children into macro definitions. This makes sense.
+
+// Step 2 is to generate bytecode, converting SyntacticTokens into SemanticTokens.
+// Label and macro definitions need to contain a list of usizes to references.
+// Macro definitions need to contain the body tokens as SemanticTokens.
+// Label and macro references need to point to their parents.
+// Can I stream-convert tokens from Syntactic to Semantic?
+// Each SynToken gets converted to a SemToken? Yeah.
+
+// I want to change the parser to be a multi-stage struct thing, holding its own state.
+
+enum SymbolDefinition { Macro(usize), Label(usize) }
+
+pub fn parse(source_code: &str) {
+    use SyntacticTokenType as Syn;
+    use SemanticTokenType as Sem;
+
+    // ============================ STEP 1 ============================
+    // Convert the source code into a sorted vector of syntactic tokens and a
+    // map of symbol definitions.
+    // ================================================================
+    println!("[DEBUG] STEP 1: Parse source code into syntactic tokens");
+    let mut syntactic_tokens: Vec<SyntacticToken> = Vec::new();
+    let mut symbol_definitions: HashMap<String,SymbolDefinition> = HashMap::new();
+    let mut macro_bodies: HashMap<usize, Vec<SyntacticToken>> = HashMap::new();
+    let mut macro_definition: Option<usize> = None;
+    let mut macro_definition_body_tokens: Vec<SyntacticToken> = Vec::new();
+
+    for mut token in TokenIterator::from_str(source_code) {
+        if let Some(mdt) = macro_definition {
+            token.use_in_macro_body();
+            let terminate = token.is_macro_terminator();
+            macro_definition_body_tokens.push(token);
+            if terminate {
+                macro_bodies.insert(mdt, take(&mut macro_definition_body_tokens));
+                macro_definition = None;
+            }
+        } else {
+            if let Syn::MacroDefinition(ref name) = token.r#type {
+                macro_definition = Some(syntactic_tokens.len());
+                match symbol_definitions.entry(name.to_string()) {
+                    Entry::Occupied(_) => {token.set_error(Error::DuplicateDefinition);}
+                    Entry::Vacant(v) => {v.insert(SymbolDefinition::Macro(syntactic_tokens.len()));}
+                }
+            } else if let Syn::LabelDefinition(ref name) = token.r#type {
+                match symbol_definitions.entry(name.to_string()) {
+                    Entry::Occupied(_) => {token.set_error(Error::DuplicateDefinition);}
+                    Entry::Vacant(v) => {v.insert(SymbolDefinition::Label(syntactic_tokens.len()));}
+                }
+            } else if token.is_macro_terminator() {
+                token.set_error(Error::OrphanedMacroTerminator);
+            }
+            syntactic_tokens.push(token);
+        }
+    }
+
+
+    // ============================ STEP 2 ============================
+    // Convert syntactic tokens into semantic tokens, resolving label and macro
+    // references in the process.
+    // ================================================================
+    println!("[DEBUG] STEP 2: Resolve label and macro references");
+    let syntactic_token_count = syntactic_tokens.len();
+    let mut semantic_tokens = Vec::new();
+    let mut semantic_macro_bodies: HashMap<usize, Vec<SemanticToken>> = HashMap::new();
+
+    for (i, mut syn_token) in syntactic_tokens.into_iter().enumerate() {
+        let sem_token_type = if let Some(err) = syn_token.error {
+            // Translate over any existing syntax errors
+            Sem::Error(syn_token.r#type, err)
+        } else {
+            match syn_token.r#type {
+                Syn::Reference(ref name) => {
+                    match symbol_definitions.get(name) {
+                        Some(SymbolDefinition::Macro(addr)) => Sem::MacroReference(*addr),
+                        Some(SymbolDefinition::Label(addr)) => Sem::LabelReference(*addr),
+                        None => Sem::Error(syn_token.r#type, Error::UnresolvedReference),
+                    }
+                }
+                Syn::LabelDefinition(name) => {
+                    let label_definition = LabelDefinition {
+                        name, address: 0, references: Vec::new() };
+                    Sem::LabelDefinition(label_definition)
+                }
+                Syn::MacroDefinition(name) => {
+                    let mut sem_body_tokens = Vec::new();
+                    // Iterate over every token in the body of the macro definition,
+                    // converting each one to a semantic token.
+                    for syn_body_token in macro_bodies.remove(&i).unwrap() {
+                        let sem_body_token_type = if let Some(err) = syn_body_token.error {
+                            // Translate over any existing syntax errors
+                            Sem::Error(syn_body_token.r#type, err)
+                        } else {
+                            match syn_body_token.r#type {
+                                Syn::Reference(ref name) => match symbol_definitions.get(name) {
+                                    Some(SymbolDefinition::Macro(addr)) => Sem::MacroReference(*addr),
+                                    Some(SymbolDefinition::Label(addr)) => Sem::LabelReference(*addr),
+                                    None => Sem::Error(syn_body_token.r#type, Error::UnresolvedReference),
+                                },
+
+                                Syn::LabelDefinition(_) => unreachable!(),
+                                Syn::MacroDefinition(_) => unreachable!(),
+                                Syn::MacroTerminator => {
+                                    syn_token.source_location.end =
+                                        syn_body_token.source_location.end;
+                                    Sem::MacroTerminator
+                                },
+
+                                Syn::Pad(v) => Sem::Pad(v),
+                                Syn::Byte(v) => Sem::Byte(v),
+                                Syn::Short(v) => Sem::Short(v),
+                                Syn::Instruction(v) => Sem::Instruction(v),
+
+                                Syn::Comment => Sem::Comment,
+                            }
+                        };
+                        let sem_body_token = SemanticToken {
+                            r#type: sem_body_token_type,
+                            source_location: syn_body_token.source_location,
+                            bytecode_location: BytecodeLocation::zero(),
+                        };
+                        sem_body_tokens.push(sem_body_token);
+                    }
+                    semantic_macro_bodies.insert(i, sem_body_tokens);
+                    let macro_definition = MacroDefinition {
+                        name, body_tokens: Vec::new(), references: Vec::new() };
+                    Sem::MacroDefinition(macro_definition)
+                }
+                Syn::MacroTerminator => unreachable!(),
+
+                Syn::Pad(v) => Sem::Pad(v),
+                Syn::Byte(v) => Sem::Byte(v),
+                Syn::Short(v) => Sem::Short(v),
+                Syn::Instruction(v) => Sem::Instruction(v),
+
+                Syn::Comment => Sem::Comment,
+            }
+        };
+        let sem_token = SemanticToken {
+            r#type: sem_token_type,
+            source_location: syn_token.source_location,
+            bytecode_location: BytecodeLocation::zero(),
+        };
+        semantic_tokens.push(sem_token);
+    }
+    assert_eq!(syntactic_token_count, semantic_tokens.len());
+
+
+    // ============================ STEP 3 ============================
+    // Iterate over each semantic token, generating bytecode.
+    // ================================================================
+    println!("[DEBUG] STEP 3: Generate bytecode");
+    let mut bytecode: Vec<u8> = Vec::new();
+    // Map each label token to a list of bytecode addresses to populate
+    let mut label_reference_addresses: HashMap<usize, Vec<u16>> = HashMap::new();
+    // Map each label or macro definition token to a list of reference token pointers
+    let mut references: HashMap<usize, Vec<usize>> = HashMap::new();
+
+    macro_rules! addr {() => {bytecode.len() as u16};}
+    macro_rules! push_u8 {($v:expr) => {bytecode.push($v); 1};}
+    macro_rules! push_u16 {($v:expr) => {bytecode.extend_from_slice(&$v.to_be_bytes()); 2};}
+    macro_rules! pad {($p:expr) => {bytecode.resize(bytecode.len() + $p as usize, 0); $p as u16};}
+
+    for (i, sem_token) in semantic_tokens.iter_mut().enumerate() {
+        let start_addr = addr!();
+        let byte_length: u16 = match &mut sem_token.r#type {
+            Sem::LabelReference(addr) => {
+                references.entry(*addr).or_default().push(i);
+                label_reference_addresses.entry(*addr).or_default().push(addr!());
+                push_u16!(0u16); 2
+            },
+            Sem::MacroReference(addr) => {
+                references.entry(*addr).or_default().push(i);
+                let mut macro_byte_length: u16 = 0;
+                for body_token in semantic_macro_bodies.get(addr).unwrap() {
+                    macro_byte_length += match &body_token.r#type {
+                        Sem::LabelReference(addr) => {
+                            label_reference_addresses.entry(*addr).or_default().push(addr!());
+                            push_u16!(0u16); 2
+                        },
+                        Sem::MacroReference(_) => todo!(),
+
+                        Sem::LabelDefinition(_) => unreachable!(),
+                        Sem::MacroDefinition(_) => unreachable!(),
+
+                        Sem::Pad(p) => { pad!(*p); *p },
+                        Sem::Byte(b) => { push_u8!(*b); 1 },
+                        Sem::Short(s) => { push_u16!(*s); 2 },
+                        Sem::Instruction(b) => { push_u8!(*b); 1 },
+
+                        Sem::MacroTerminator => 0,
+                        Sem::Comment => 0,
+                        Sem::Error(..) => 0,
+                    };
+                }
+                macro_byte_length
+            },
+
+            Sem::LabelDefinition(definition) => {definition.address=addr!(); 1},
+            Sem::MacroDefinition(_) => 0,
+
+            Sem::Pad(p) => { pad!(*p); *p },
+            Sem::Byte(b) => { push_u8!(*b); 1 },
+            Sem::Short(s) => { push_u16!(*s); 2 },
+            Sem::Instruction(b) => { push_u8!(*b); 1 },
+
+            Sem::MacroTerminator => unreachable!(),
+            Sem::Comment => 0,
+            Sem::Error(..) => 0,
+        };
+        sem_token.bytecode_location.start = start_addr;
+        sem_token.bytecode_location.length = byte_length;
+    }
+
+
+    // ============================ STEP 4 ============================
+    // Fill in addresses for label references.
+    // ================================================================
+    println!("[DEBUG] STEP 4: Fill in values for label references");
+    for (label_i, slots) in label_reference_addresses.iter() {
+        if let Sem::LabelDefinition(LabelDefinition { address, .. }) = semantic_tokens[*label_i].r#type {
+            let [h,l] = address.to_be_bytes();
+            for slot in slots {
+                bytecode[*slot as usize] = h;
+                bytecode[slot.wrapping_add(1) as usize] = l;
+            }
+        } else {
+            unreachable!()
+        }
+    }
+
+    // ============================ STEP 5 ============================
+    // Move references and macro body tokens into label and macro definitions.
+    // ================================================================
+    println!("[DEBUG] STEP 5: Move information into label and macro definition tokens");
+    for (i, token) in semantic_tokens.iter_mut().enumerate() {
+        if let Sem::MacroDefinition(macro_definition) = &mut token.r#type {
+            macro_definition.body_tokens = semantic_macro_bodies.remove(&i).unwrap();
+            if let Some(macro_references) = references.remove(&i) {
+                macro_definition.references = macro_references;
+            }
+        } else if let Sem::LabelDefinition(label_definition) = &mut token.r#type {
+            if let Some(label_references) = references.remove(&i) {
+                label_definition.references = label_references;
+            }
+        }
+    }
+    assert_eq!(references.len(), 0);
+
+
+    // ============================ STEP 6 ============================
+    // Remove trailing null-bytes from the bytecode.
+    // ================================================================
+    println!("[DEBUG] STEP 6: Trim trailing null bytes");
+    if let Some(final_nonnull_byte) = bytecode.iter().rposition(|b| *b != 0) {
+        let truncated_length = final_nonnull_byte + 1;
+        let removed_byte_count = bytecode.len() - truncated_length;
+        if removed_byte_count > 0 {
+            println!("[INFO] Removed {removed_byte_count} trailing null bytes from assembled bytecode");
+            bytecode.truncate(truncated_length);
+        }
+    }
+
+
+    for token in &semantic_tokens {
+        if let Sem::MacroDefinition(macro_definition) = &token.r#type {
+            for body_token in &macro_definition.body_tokens {
+                if let Sem::Error(_, err) = body_token.r#type {
+                    println!("[ERROR] (in macro '{}') {err:?} at {}:{}..{}:{}",
+                        macro_definition.name,
+                        body_token.source_location.start.line,
+                        body_token.source_location.start.column,
+                        body_token.source_location.end.line,
+                        body_token.source_location.end.column,
+                    )
+                }
+            }
+        } else if let Sem::Error(_, err) = token.r#type {
+            println!("[ERROR {}:{}-{}:{}] {err:?}",
+                token.source_location.start.line,
+                token.source_location.start.column,
+                token.source_location.end.line,
+                token.source_location.end.column,
+            )
+        }
+    }
+
+    println!("");
+    print!("Generated bytecode: [ ");
+    for i in &bytecode {
+        print!("{i:02x} ");
+    }
+    println!("]");
+}
```
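A few notes on the staged pipeline in this commit. Step 1 diverts tokens into the body of an open macro definition until a terminator token arrives; everything else goes into the flat token list. A minimal self-contained sketch of that collection loop, using a simplified stand-in `Token` enum rather than the crate's `SyntacticToken` API:

```rust
use std::collections::HashMap;

// Simplified stand-ins for the crate's token types.
enum Token { MacroDefinition(String), MacroTerminator, Byte(u8) }

fn main() {
    let tokens = vec![
        Token::MacroDefinition("double".into()),
        Token::Byte(0x02),
        Token::MacroTerminator,
        Token::Byte(0x07),
    ];

    let mut flat_tokens: Vec<Token> = Vec::new();
    let mut macro_bodies: HashMap<usize, Vec<Token>> = HashMap::new();
    let mut open_macro: Option<usize> = None; // index of the open definition, if any

    for token in tokens {
        if let Some(index) = open_macro {
            // While a macro is open, divert tokens into its body.
            let terminate = matches!(token, Token::MacroTerminator);
            macro_bodies.entry(index).or_default().push(token);
            if terminate { open_macro = None; }
        } else {
            if let Token::MacroDefinition(_) = token {
                open_macro = Some(flat_tokens.len());
            }
            flat_tokens.push(token);
        }
    }

    assert_eq!(flat_tokens.len(), 2);      // the definition token and Byte(0x07)
    assert_eq!(macro_bodies[&0].len(), 2); // Byte(0x02) and the terminator
}
```

As in the commit, the definition token itself stays in the flat list, so later references can point back at its index.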
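Step 1 also rejects duplicate names with the `HashMap` `Entry` API, which checks and claims the slot in a single lookup. A sketch of just that check, with an illustrative `usize` payload in place of the crate's `SymbolDefinition`:

```rust
use std::collections::hash_map::{Entry, HashMap};

fn main() {
    let mut symbol_definitions: HashMap<String, usize> = HashMap::new();
    let names = ["loop", "draw", "loop"]; // "loop" is defined twice

    for (index, name) in names.into_iter().enumerate() {
        match symbol_definitions.entry(name.to_string()) {
            // A later definition finds the slot occupied: tag it as an error.
            Entry::Occupied(_) => println!("duplicate definition of '{name}'"),
            // The first definition of a name claims the slot.
            Entry::Vacant(v) => { v.insert(index); }
        }
    }

    assert_eq!(symbol_definitions["loop"], 0); // the first definition wins
}
```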
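Steps 3 and 4 are classic two-pass assembly: every label reference emits a two-byte big-endian placeholder and records its offset, and once the definitions have final addresses the placeholders are patched in. A reduced sketch of the scheme, with hypothetical byte values rather than real Bedrock encoding:

```rust
fn main() {
    let mut bytecode: Vec<u8> = Vec::new();
    let mut slots: Vec<usize> = Vec::new(); // offsets awaiting a label address

    // A reference assembled before its label is defined: emit a placeholder
    // and remember where it went.
    slots.push(bytecode.len());
    bytecode.extend_from_slice(&0u16.to_be_bytes());

    bytecode.push(0x00); // some unrelated emitted byte

    // The label definition is reached; its address is the current length.
    let label_address = bytecode.len() as u16;

    // Backpatch every recorded slot with the big-endian address.
    let [high, low] = label_address.to_be_bytes();
    for slot in slots {
        bytecode[slot] = high;
        bytecode[slot + 1] = low;
    }

    assert_eq!(bytecode, [0x00, 0x03, 0x00]);
}
```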
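Finally, the commit message says the parser is about to become a struct with a method for each stage. A rough sketch of what that shape might look like; the struct and method names below are illustrative guesses, not the names the refactor actually introduces:

```rust
use std::collections::HashMap;

// Hypothetical staged parser: each stage mutates fields on the struct
// instead of threading a pile of locals through one long function.
struct Parser {
    symbol_definitions: HashMap<String, usize>,
    bytecode: Vec<u8>,
}

impl Parser {
    fn new() -> Self {
        Parser { symbol_definitions: HashMap::new(), bytecode: Vec::new() }
    }
    fn parse_syntactic_tokens(&mut self, _source_code: &str) { /* step 1 */ }
    fn resolve_references(&mut self) { /* step 2 */ }
    fn generate_bytecode(&mut self) { /* steps 3 to 5 */ }
    fn trim_trailing_nulls(&mut self) {
        // Step 6, unchanged: drop everything after the last non-null byte.
        if let Some(last) = self.bytecode.iter().rposition(|b| *b != 0) {
            self.bytecode.truncate(last + 1);
        }
    }
}

fn main() {
    let mut parser = Parser::new();
    parser.parse_syntactic_tokens("");
    parser.resolve_references();
    parser.generate_bytecode();
    parser.trim_trailing_nulls();
    println!("{} symbols, {} bytes", parser.symbol_definitions.len(), parser.bytecode.len());
}
```

Holding the intermediate state as fields would also let each stage be exercised on its own, which the single long `parse` function above makes awkward.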