summaryrefslogtreecommitdiff
path: root/src/lib.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib.rs')
-rw-r--r--src/lib.rs332
1 files changed, 332 insertions, 0 deletions
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..5d84600
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,332 @@
+mod addressing;
+mod syntactic_token;
+mod semantic_token;
+mod tokenizer;
+mod error;
+
+pub use addressing::*;
+pub use syntactic_token::*;
+pub use semantic_token::*;
+pub use error::*;
+pub use tokenizer::*;
+
+use std::collections::hash_map::{HashMap, Entry};
+use std::mem::take;
+
+// On Unicode support: Work with characters, not bytes. This will eventually be
+// used in Verdant and Doctrine, and it'd be nice to be able to support other languages.
+// The only reason to work with bytes over characters would be for a minor decrease in complexity.
+// Only support the assembly of files of up to 64kB. If assets need to be tacked on the end,
+// it can be done by another program. The VM will only be able to access the first 64kB of a file anyway.
+// Treat \t as a space, have it be a single character.
+
+// First, turn the program source code into a vector of SyntacticTokens. These
+// each contain a SourceLocation, and the type and value of the token. Every single
+// non-whitespace character of the program needs to be wrapped in a SyntacticToken.
+// The program source code can be accurately reconstructed from this list of
+// SyntacticTokens, and when I write GRID, if the mouse is hovering over any point
+// in the program listing, I'll be able to determine the exact token that is being hovered.
+// For macros, hovering over any character belonging to a macro definition will
+// highlight the entire macro definition, and also the currently-hovered body token
+// if there is one. Clicking the body token will bring up more information.
+
+// The SyntacticTokens will be collected into a vector, with label and macro definitions
+// being constructed as we go. Label definitions are easy: I only need to note down the
+// names of the labels in order to validate label references in a later step. If a label
+// name has already been defined, tag the token with an error. If a macro name has already
+// been defined, tag the token with an error.
+// Collect children into macro definitions. This makes sense.
+
+// Step 2 is to generate bytecode, converting SyntacticTokens into SemanticTokens.
+// Label and macro definitions need to contain a list of usize indices of their references.
+// Macro definitions need to contain the body tokens as SemanticTokens.
+// Label and macro references need to point to their parents.
+// Can I stream-convert tokens from Syntactic to Semantic?
+// Each SynToken gets converted to a SemToken? Yeah.
+
+// I want to change the parser to be a multi-stage struct thing, holding its own state.
+
+/// What a symbol name resolves to. The payload is the index of the defining
+/// token in the top-level token vector built by `parse`.
+enum SymbolDefinition {
+    /// The name was introduced by a macro definition token.
+    Macro(usize),
+    /// The name was introduced by a label definition token.
+    Label(usize),
+}
+
+/// Assemble `source_code` and print the result to stdout.
+///
+/// The function runs six steps, each announced with a `[DEBUG]` line:
+///
+/// 1. Tokenise the source into syntactic tokens, recording every label and
+///    macro definition by name and collecting macro body tokens separately.
+/// 2. Convert syntactic tokens into semantic tokens, resolving references to
+///    the index of the defining token.
+/// 3. Generate bytecode, expanding macro references inline and leaving a
+///    zeroed two-byte slot for every label reference.
+/// 4. Patch the label reference slots with the final label addresses.
+/// 5. Move the collected reference lists and macro bodies into the definition
+///    tokens.
+/// 6. Trim trailing null bytes from the bytecode.
+///
+/// Finally every token carrying an error is reported as an `[ERROR ...]` line
+/// and the bytecode is printed as hexadecimal. Nothing is returned.
+///
+/// # Panics
+///
+/// Panics via `todo!()` if a macro body that gets expanded contains a macro
+/// reference, because nested macro expansion is not implemented.
+pub fn parse(source_code: &str) {
+    use SyntacticTokenType as Syn;
+    use SemanticTokenType as Sem;
+
+    // ============================ STEP 1 ============================
+    // Convert the source code into a sorted vector of syntactic tokens and a
+    // map of symbol definitions.
+    // ================================================================
+    println!("[DEBUG] STEP 1: Parse source code into syntactic tokens");
+    let mut syntactic_tokens: Vec<SyntacticToken> = Vec::new();
+    let mut symbol_definitions: HashMap<String,SymbolDefinition> = HashMap::new();
+    // Body tokens of each macro, keyed by the index of the macro definition token.
+    let mut macro_bodies: HashMap<usize, Vec<SyntacticToken>> = HashMap::new();
+    // Index of the macro definition token currently being collected, if any.
+    let mut macro_definition: Option<usize> = None;
+    let mut macro_definition_body_tokens: Vec<SyntacticToken> = Vec::new();
+
+    for mut token in TokenIterator::from_str(source_code) {
+        if let Some(mdt) = macro_definition {
+            // Inside a macro definition: every token, including the
+            // terminator, belongs to the body and not to the top-level list.
+            token.use_in_macro_body();
+            let terminate = token.is_macro_terminator();
+            macro_definition_body_tokens.push(token);
+            if terminate {
+                macro_bodies.insert(mdt, take(&mut macro_definition_body_tokens));
+                macro_definition = None;
+            }
+        } else {
+            if let Syn::MacroDefinition(ref name) = token.r#type {
+                // Body collection starts even for a duplicate definition, so
+                // that the duplicate's body is kept out of the top-level list.
+                macro_definition = Some(syntactic_tokens.len());
+                match symbol_definitions.entry(name.to_string()) {
+                    Entry::Occupied(_) => {token.set_error(Error::DuplicateDefinition);}
+                    Entry::Vacant(v) => {v.insert(SymbolDefinition::Macro(syntactic_tokens.len()));}
+                }
+            } else if let Syn::LabelDefinition(ref name) = token.r#type {
+                match symbol_definitions.entry(name.to_string()) {
+                    Entry::Occupied(_) => {token.set_error(Error::DuplicateDefinition);}
+                    Entry::Vacant(v) => {v.insert(SymbolDefinition::Label(syntactic_tokens.len()));}
+                }
+            } else if token.is_macro_terminator() {
+                token.set_error(Error::OrphanedMacroTerminator);
+            }
+            syntactic_tokens.push(token);
+        }
+    }
+
+    // If the source ended in the middle of a macro definition, no terminator was
+    // seen and the body collected so far was never stored. Store it now so that
+    // step 2 can look the body up instead of panicking, and report the problem.
+    if let Some(mdt) = macro_definition {
+        let location = &syntactic_tokens[mdt].source_location;
+        println!("[ERROR {}:{}] Unterminated macro definition",
+            location.start.line,
+            location.start.column,
+        );
+        macro_bodies.insert(mdt, take(&mut macro_definition_body_tokens));
+    }
+
+
+    // ============================ STEP 2 ============================
+    // Convert syntactic tokens into semantic tokens, resolving label and macro
+    // references in the process.
+    // ================================================================
+    println!("[DEBUG] STEP 2: Resolve label and macro references");
+    let syntactic_token_count = syntactic_tokens.len();
+    let mut semantic_tokens = Vec::new();
+    // Converted macro bodies, keyed by the index of the macro definition token.
+    let mut semantic_macro_bodies: HashMap<usize, Vec<SemanticToken>> = HashMap::new();
+
+    for (i, mut syn_token) in syntactic_tokens.into_iter().enumerate() {
+        let sem_token_type = if let Some(err) = syn_token.error {
+            // Translate over any existing syntax errors
+            Sem::Error(syn_token.r#type, err)
+        } else {
+            match syn_token.r#type {
+                Syn::Reference(ref name) => {
+                    match symbol_definitions.get(name) {
+                        Some(SymbolDefinition::Macro(addr)) => Sem::MacroReference(*addr),
+                        Some(SymbolDefinition::Label(addr)) => Sem::LabelReference(*addr),
+                        None => Sem::Error(syn_token.r#type, Error::UnresolvedReference),
+                    }
+                }
+                Syn::LabelDefinition(name) => {
+                    // The address is filled in during step 3, the references in step 5.
+                    let label_definition = LabelDefinition {
+                        name, address: 0, references: Vec::new() };
+                    Sem::LabelDefinition(label_definition)
+                }
+                Syn::MacroDefinition(name) => {
+                    let mut sem_body_tokens = Vec::new();
+                    // Iterate over every token in the body of the macro definition,
+                    // converting each one to a semantic token.
+                    for syn_body_token in macro_bodies.remove(&i).unwrap() {
+                        let sem_body_token_type = if let Some(err) = syn_body_token.error {
+                            // Translate over any existing syntax errors
+                            Sem::Error(syn_body_token.r#type, err)
+                        } else {
+                            match syn_body_token.r#type {
+                                Syn::Reference(ref name) => match symbol_definitions.get(name) {
+                                    Some(SymbolDefinition::Macro(addr)) => Sem::MacroReference(*addr),
+                                    Some(SymbolDefinition::Label(addr)) => Sem::LabelReference(*addr),
+                                    None => Sem::Error(syn_body_token.r#type, Error::UnresolvedReference),
+                                },
+
+                                Syn::LabelDefinition(_) => unreachable!(),
+                                Syn::MacroDefinition(_) => unreachable!(),
+                                Syn::MacroTerminator => {
+                                    // Stretch the definition token so that it spans
+                                    // the whole macro, up to the end of the terminator.
+                                    syn_token.source_location.end =
+                                        syn_body_token.source_location.end;
+                                    Sem::MacroTerminator
+                                },
+
+                                Syn::Pad(v) => Sem::Pad(v),
+                                Syn::Byte(v) => Sem::Byte(v),
+                                Syn::Short(v) => Sem::Short(v),
+                                Syn::Instruction(v) => Sem::Instruction(v),
+
+                                Syn::Comment => Sem::Comment,
+                            }
+                        };
+                        let sem_body_token = SemanticToken {
+                            r#type: sem_body_token_type,
+                            source_location: syn_body_token.source_location,
+                            bytecode_location: BytecodeLocation::zero(),
+                        };
+                        sem_body_tokens.push(sem_body_token);
+                    }
+                    // The body is kept in a side table until step 5 because step 3
+                    // reads macro bodies while mutably iterating over the tokens.
+                    semantic_macro_bodies.insert(i, sem_body_tokens);
+                    let macro_definition = MacroDefinition {
+                        name, body_tokens: Vec::new(), references: Vec::new() };
+                    Sem::MacroDefinition(macro_definition)
+                }
+                // Terminators outside a macro body were tagged with an error in step 1.
+                Syn::MacroTerminator => unreachable!(),
+
+                Syn::Pad(v) => Sem::Pad(v),
+                Syn::Byte(v) => Sem::Byte(v),
+                Syn::Short(v) => Sem::Short(v),
+                Syn::Instruction(v) => Sem::Instruction(v),
+
+                Syn::Comment => Sem::Comment,
+            }
+        };
+        let sem_token = SemanticToken {
+            r#type: sem_token_type,
+            source_location: syn_token.source_location,
+            bytecode_location: BytecodeLocation::zero(),
+        };
+        semantic_tokens.push(sem_token);
+    }
+    assert_eq!(syntactic_token_count, semantic_tokens.len());
+
+
+    // ============================ STEP 3 ============================
+    // Iterate over each semantic token, generating bytecode.
+    // ================================================================
+    println!("[DEBUG] STEP 3: Generate bytecode");
+    let mut bytecode: Vec<u8> = Vec::new();
+    // Map each label token to a list of bytecode addresses to populate
+    let mut label_reference_addresses: HashMap<usize, Vec<u16>> = HashMap::new();
+    // Map each label or macro definition token to a list of reference token pointers
+    let mut references: HashMap<usize, Vec<usize>> = HashMap::new();
+
+    // NOTE(review): `as u16` silently truncates once the bytecode grows past 64kB.
+    macro_rules! addr {() => {bytecode.len() as u16};}
+    macro_rules! push_u8 {($v:expr) => {bytecode.push($v)};}
+    macro_rules! push_u16 {($v:expr) => {bytecode.extend_from_slice(&$v.to_be_bytes())};}
+    macro_rules! pad {($p:expr) => {bytecode.resize(bytecode.len() + $p as usize, 0)};}
+
+    for (i, sem_token) in semantic_tokens.iter_mut().enumerate() {
+        let start_addr = addr!();
+        let byte_length: u16 = match &mut sem_token.r#type {
+            Sem::LabelReference(addr) => {
+                references.entry(*addr).or_default().push(i);
+                // Emit a placeholder now; the real address is written in step 4.
+                label_reference_addresses.entry(*addr).or_default().push(addr!());
+                push_u16!(0u16); 2
+            },
+            Sem::MacroReference(addr) => {
+                references.entry(*addr).or_default().push(i);
+                // Expand the macro by emitting the bytes of each body token in place.
+                let mut macro_byte_length: u16 = 0;
+                for body_token in semantic_macro_bodies.get(addr).unwrap() {
+                    macro_byte_length += match &body_token.r#type {
+                        Sem::LabelReference(addr) => {
+                            label_reference_addresses.entry(*addr).or_default().push(addr!());
+                            push_u16!(0u16); 2
+                        },
+                        Sem::MacroReference(_) => todo!(),
+
+                        Sem::LabelDefinition(_) => unreachable!(),
+                        Sem::MacroDefinition(_) => unreachable!(),
+
+                        Sem::Pad(p) => { pad!(*p); *p },
+                        Sem::Byte(b) => { push_u8!(*b); 1 },
+                        Sem::Short(s) => { push_u16!(*s); 2 },
+                        Sem::Instruction(b) => { push_u8!(*b); 1 },
+
+                        Sem::MacroTerminator => 0,
+                        Sem::Comment => 0,
+                        Sem::Error(..) => 0,
+                    };
+                }
+                macro_byte_length
+            },
+
+            // A label definition only records the current address; it emits no
+            // bytes, so its length is zero like the other non-emitting tokens.
+            Sem::LabelDefinition(definition) => {definition.address=addr!(); 0},
+            Sem::MacroDefinition(_) => 0,
+
+            Sem::Pad(p) => { pad!(*p); *p },
+            Sem::Byte(b) => { push_u8!(*b); 1 },
+            Sem::Short(s) => { push_u16!(*s); 2 },
+            Sem::Instruction(b) => { push_u8!(*b); 1 },
+
+            Sem::MacroTerminator => unreachable!(),
+            Sem::Comment => 0,
+            Sem::Error(..) => 0,
+        };
+        sem_token.bytecode_location.start = start_addr;
+        sem_token.bytecode_location.length = byte_length;
+    }
+
+
+    // ============================ STEP 4 ============================
+    // Fill in addresses for label references.
+    // ================================================================
+    println!("[DEBUG] STEP 4: Fill in values for label references");
+    for (label_i, slots) in label_reference_addresses.iter() {
+        if let Sem::LabelDefinition(LabelDefinition { address, .. }) = semantic_tokens[*label_i].r#type {
+            // Addresses are stored big-endian, matching `push_u16!`.
+            let [h,l] = address.to_be_bytes();
+            for slot in slots {
+                bytecode[*slot as usize] = h;
+                bytecode[slot.wrapping_add(1) as usize] = l;
+            }
+        } else {
+            unreachable!()
+        }
+    }
+
+    // ============================ STEP 5 ============================
+    // Move references and macro body tokens into label and macro definitions.
+    // ================================================================
+    println!("[DEBUG] STEP 5: Move information into label and macro definition tokens");
+    for (i, token) in semantic_tokens.iter_mut().enumerate() {
+        if let Sem::MacroDefinition(macro_definition) = &mut token.r#type {
+            macro_definition.body_tokens = semantic_macro_bodies.remove(&i).unwrap();
+            if let Some(macro_references) = references.remove(&i) {
+                macro_definition.references = macro_references;
+            }
+        } else if let Sem::LabelDefinition(label_definition) = &mut token.r#type {
+            if let Some(label_references) = references.remove(&i) {
+                label_definition.references = label_references;
+            }
+        }
+    }
+    // Every reference list must have been claimed by a definition token.
+    assert_eq!(references.len(), 0);
+
+
+    // ============================ STEP 6 ============================
+    // Remove trailing null-bytes from the bytecode.
+    // ================================================================
+    println!("[DEBUG] STEP 6: Trim trailing null bytes");
+    // When every byte is null there is no non-null byte to keep, so the whole
+    // buffer counts as trailing null bytes and the bytecode becomes empty.
+    let truncated_length = bytecode.iter().rposition(|b| *b != 0).map_or(0, |last| last + 1);
+    let removed_byte_count = bytecode.len() - truncated_length;
+    if removed_byte_count > 0 {
+        println!("[INFO] Removed {removed_byte_count} trailing null bytes from assembled bytecode");
+        bytecode.truncate(truncated_length);
+    }
+
+
+    // Report every error, looking inside macro bodies as well as at the top level.
+    for token in &semantic_tokens {
+        if let Sem::MacroDefinition(macro_definition) = &token.r#type {
+            for body_token in &macro_definition.body_tokens {
+                if let Sem::Error(_, err) = body_token.r#type {
+                    println!("[ERROR] (in macro '{}') {err:?} at {}:{}..{}:{}",
+                        macro_definition.name,
+                        body_token.source_location.start.line,
+                        body_token.source_location.start.column,
+                        body_token.source_location.end.line,
+                        body_token.source_location.end.column,
+                    )
+                }
+            }
+        } else if let Sem::Error(_, err) = token.r#type {
+            println!("[ERROR {}:{}-{}:{}] {err:?}",
+                token.source_location.start.line,
+                token.source_location.start.column,
+                token.source_location.end.line,
+                token.source_location.end.column,
+            )
+        }
+    }
+
+    println!();
+    print!("Generated bytecode: [ ");
+    for i in &bytecode {
+        print!("{i:02x} ");
+    }
+    println!("]");
+}
+