mod addressing;
mod syntactic_token;
mod semantic_token;
mod tokenizer;
mod error;

pub use addressing::*;
pub use syntactic_token::*;
pub use semantic_token::*;
pub use error::*;
pub use tokenizer::*;

use std::collections::hash_map::{HashMap, Entry};
use std::mem::take;

// On Unicode support: work with characters, not bytes. This will eventually be
// used in Verdant and Doctrine, and it'd be nice to be able to support other languages.
// The only reason to work with bytes instead of characters would be a minor decrease in complexity.
// Only support the assembly of files up to 64kB. If assets need to be tacked onto the end,
// that can be done by another program; the VM can only access the first 64kB of a file anyway.
// Treat \t as a single space character.
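//
// A sketch of how the 64kB cap could be checked after assembly (hypothetical,
// not implemented below; 0x10000 bytes = 64kB):
//
//     if bytecode.len() > 0x10000 {
//         eprintln!("[ERROR] Assembled bytecode is {} bytes, exceeding the 64kB limit",
//             bytecode.len());
//     }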

// First, turn the program source code into a vector of SyntacticTokens. Each of
// these contains a SourceLocation and the type and value of the token. Every
// non-whitespace character of the program needs to belong to a SyntacticToken.
// The program source code can be accurately reconstructed from this list of
// SyntacticTokens, and when I write GRID, whenever the mouse hovers over any point
// in the program listing, I'll be able to determine the exact token under the cursor.
// For macros, hovering over any character belonging to a macro definition will
// highlight the entire macro definition, as well as the currently-hovered body token
// if there is one. Clicking the body token will bring up more information.
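//
// A sketch of the hover lookup GRID could perform (a hypothetical helper,
// assuming SourceLocation positions order by line first, then column):
//
//     fn token_under_cursor(tokens: &[SyntacticToken], line: usize, column: usize)
//             -> Option<&SyntacticToken> {
//         tokens.iter().find(|token| {
//             let loc = &token.source_location;
//             (loc.start.line, loc.start.column) <= (line, column)
//                 && (line, column) <= (loc.end.line, loc.end.column)
//         })
//     }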

// The SyntacticTokens will be collected into a vector, with label and macro definitions
// being constructed as we go. Label definitions are easy: I only need to note down the
// names of the labels in order to validate label references in a later step. If a label
// or macro name has already been defined, tag the token with an error. Macro definitions
// also collect their body tokens as children.

// Step 2 converts SyntacticTokens into SemanticTokens, resolving label and macro
// references along the way; bytecode is then generated from the SemanticTokens.
// Label and macro definitions need to contain a list of indices pointing to their references.
// Macro definitions need to contain their body tokens as SemanticTokens.
// Label and macro references need to point back to their parent definitions.
// Can I stream-convert tokens from Syntactic to Semantic? Yes: each
// SyntacticToken converts to exactly one SemanticToken.

// I want to change the parser into a multi-stage struct that holds its own state.
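//
// A rough sketch of what that might look like (all names here are
// hypothetical, not implemented below):
//
//     struct Parser {
//         syntactic_tokens: Vec<SyntacticToken>,
//         symbol_definitions: HashMap<String, SymbolDefinition>,
//         semantic_tokens: Vec<SemanticToken>,
//         bytecode: Vec<u8>,
//     }
//
//     impl Parser {
//         fn tokenize(&mut self, source_code: &str) { /* step 1 */ }
//         fn resolve_references(&mut self) { /* step 2 */ }
//         fn generate_bytecode(&mut self) { /* steps 3 to 6 */ }
//     }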

// Records which kind of symbol a name refers to, and the index of the
// syntactic token that defines it.
enum SymbolDefinition { Macro(usize), Label(usize) }

pub fn parse(source_code: &str) {
    use SyntacticTokenType as Syn;
    use SemanticTokenType as Sem;

    // ============================ STEP 1 ============================
    // Convert the source code into a sorted vector of syntactic tokens and a
    // map of symbol definitions.
    // ================================================================
    println!("[DEBUG] STEP 1: Parse source code into syntactic tokens");
    let mut syntactic_tokens: Vec<SyntacticToken> = Vec::new();
    let mut symbol_definitions: HashMap<String,SymbolDefinition> = HashMap::new();
    let mut macro_bodies: HashMap<usize, Vec<SyntacticToken>> = HashMap::new();
    // Index of the token that opened the current macro definition, if any.
    let mut macro_definition: Option<usize> = None;
    let mut macro_definition_body_tokens: Vec<SyntacticToken> = Vec::new();

    for mut token in TokenIterator::from_str(source_code) {
        if let Some(mdt) = macro_definition {
            token.use_in_macro_body();
            let terminate = token.is_macro_terminator();
            macro_definition_body_tokens.push(token);
            if terminate {
                macro_bodies.insert(mdt, take(&mut macro_definition_body_tokens));
                macro_definition = None;
            }
        } else {
            if let Syn::MacroDefinition(ref name) = token.r#type {
                macro_definition = Some(syntactic_tokens.len());
                match symbol_definitions.entry(name.to_string()) {
                    Entry::Occupied(_) => {token.set_error(Error::DuplicateDefinition);}
                    Entry::Vacant(v) => {v.insert(SymbolDefinition::Macro(syntactic_tokens.len()));}
                }
            } else if let Syn::LabelDefinition(ref name) = token.r#type {
                match symbol_definitions.entry(name.to_string()) {
                    Entry::Occupied(_) => {token.set_error(Error::DuplicateDefinition);}
                    Entry::Vacant(v) => {v.insert(SymbolDefinition::Label(syntactic_tokens.len()));}
                }
            } else if token.is_macro_terminator() {
                token.set_error(Error::OrphanedMacroTerminator);
            }
            syntactic_tokens.push(token);
        }
    }


    // ============================ STEP 2 ============================
    // Convert syntactic tokens into semantic tokens, resolving label and macro
    // references in the process.
    // ================================================================
    println!("[DEBUG] STEP 2: Resolve label and macro references");
    let syntactic_token_count = syntactic_tokens.len();
    let mut semantic_tokens = Vec::new();
    let mut semantic_macro_bodies: HashMap<usize, Vec<SemanticToken>> = HashMap::new();

    for (i, mut syn_token) in syntactic_tokens.into_iter().enumerate() {
        let sem_token_type = if let Some(err) = syn_token.error {
            // Translate over any existing syntax errors
            Sem::Error(syn_token.r#type, err)
        } else {
            match syn_token.r#type {
                Syn::Reference(ref name) => {
                    match symbol_definitions.get(name) {
                        Some(SymbolDefinition::Macro(addr)) => Sem::MacroReference(*addr),
                        Some(SymbolDefinition::Label(addr)) => Sem::LabelReference(*addr),
                        None => Sem::Error(syn_token.r#type, Error::UnresolvedReference),
                    }
                }
                Syn::LabelDefinition(name) => {
                    let label_definition = LabelDefinition {
                        name, address: 0, references: Vec::new() };
                    Sem::LabelDefinition(label_definition)
                }
                Syn::MacroDefinition(name) => {
                    let mut sem_body_tokens = Vec::new();
                    // Iterate over every token in the body of the macro definition,
                    // converting each one to a semantic token.
                    for syn_body_token in macro_bodies.remove(&i).unwrap() {
                        let sem_body_token_type = if let Some(err) = syn_body_token.error {
                            // Translate over any existing syntax errors
                            Sem::Error(syn_body_token.r#type, err)
                        } else {
                            match syn_body_token.r#type {
                                Syn::Reference(ref name) => match symbol_definitions.get(name) {
                                    Some(SymbolDefinition::Macro(addr)) => Sem::MacroReference(*addr),
                                    Some(SymbolDefinition::Label(addr)) => Sem::LabelReference(*addr),
                                    None => Sem::Error(syn_body_token.r#type, Error::UnresolvedReference),
                                },

                                Syn::LabelDefinition(_) => unreachable!(),
                                Syn::MacroDefinition(_) => unreachable!(),
                                Syn::MacroTerminator => {
                                    // Extend the macro definition's source span
                                    // to cover its terminator token.
                                    syn_token.source_location.end =
                                        syn_body_token.source_location.end;
                                    Sem::MacroTerminator
                                },

                                Syn::Pad(v) => Sem::Pad(v),
                                Syn::Byte(v) => Sem::Byte(v),
                                Syn::Short(v) => Sem::Short(v),
                                Syn::Instruction(v) => Sem::Instruction(v),

                                Syn::Comment => Sem::Comment,
                            }
                        };
                        let sem_body_token = SemanticToken {
                            r#type: sem_body_token_type,
                            source_location: syn_body_token.source_location,
                            bytecode_location: BytecodeLocation::zero(),
                        };
                        sem_body_tokens.push(sem_body_token);
                    }
                    semantic_macro_bodies.insert(i, sem_body_tokens);
                    let macro_definition = MacroDefinition {
                        name, body_tokens: Vec::new(), references: Vec::new() };
                    Sem::MacroDefinition(macro_definition)
                }
                Syn::MacroTerminator => unreachable!(),

                Syn::Pad(v) => Sem::Pad(v),
                Syn::Byte(v) => Sem::Byte(v),
                Syn::Short(v) => Sem::Short(v),
                Syn::Instruction(v) => Sem::Instruction(v),

                Syn::Comment => Sem::Comment,
            }
        };
        let sem_token = SemanticToken {
            r#type: sem_token_type,
            source_location: syn_token.source_location,
            bytecode_location: BytecodeLocation::zero(),
        };
        semantic_tokens.push(sem_token);
    }
    assert_eq!(syntactic_token_count, semantic_tokens.len());


    // ============================ STEP 3 ============================
    // Iterate over each semantic token, generating bytecode.
    // ================================================================
    println!("[DEBUG] STEP 3: Generate bytecode");
    let mut bytecode: Vec<u8> = Vec::new();
    // Map each label token to a list of bytecode addresses to populate
    let mut label_reference_addresses: HashMap<usize, Vec<u16>> = HashMap::new();
    // Map each label or macro definition token to a list of reference token pointers
    let mut references: HashMap<usize, Vec<usize>> = HashMap::new();

    // Helper macros for bytecode emission: addr!() evaluates to the address of
    // the next byte to be emitted, and the push/pad macros append to the
    // buffer. Each match arm below states the number of bytes it emitted.
    macro_rules! addr {() => {bytecode.len() as u16};}
    macro_rules! push_u8 {($v:expr) => {bytecode.push($v)};}
    macro_rules! push_u16 {($v:expr) => {bytecode.extend_from_slice(&$v.to_be_bytes())};}
    macro_rules! pad {($p:expr) => {bytecode.resize(bytecode.len() + $p as usize, 0)};}

    for (i, sem_token) in semantic_tokens.iter_mut().enumerate() {
        let start_addr = addr!();
        let byte_length: u16 = match &mut sem_token.r#type {
            Sem::LabelReference(addr) => {
                references.entry(*addr).or_default().push(i);
                label_reference_addresses.entry(*addr).or_default().push(addr!());
                push_u16!(0u16); 2
            },
            Sem::MacroReference(addr) => {
                references.entry(*addr).or_default().push(i);
                let mut macro_byte_length: u16 = 0;
                for body_token in semantic_macro_bodies.get(addr).unwrap() {
                    macro_byte_length += match &body_token.r#type {
                        Sem::LabelReference(addr) => {
                            label_reference_addresses.entry(*addr).or_default().push(addr!());
                            push_u16!(0u16); 2
                        },
                        Sem::MacroReference(_) => todo!(), // macros within macros are not yet supported

                        Sem::LabelDefinition(_) => unreachable!(),
                        Sem::MacroDefinition(_) => unreachable!(),

                        Sem::Pad(p) => { pad!(*p); *p },
                        Sem::Byte(b) => { push_u8!(*b); 1 },
                        Sem::Short(s) => { push_u16!(*s); 2 },
                        Sem::Instruction(b) => { push_u8!(*b); 1 },

                        Sem::MacroTerminator => 0,
                        Sem::Comment => 0,
                        Sem::Error(..) => 0,
                    };
                }
                macro_byte_length
            },

            Sem::LabelDefinition(definition) => { definition.address = addr!(); 0 }, // marks an address, emits no bytes
            Sem::MacroDefinition(_) => 0,

            Sem::Pad(p) => { pad!(*p); *p },
            Sem::Byte(b) => { push_u8!(*b); 1 },
            Sem::Short(s) => { push_u16!(*s); 2 },
            Sem::Instruction(b) => { push_u8!(*b); 1 },

            Sem::MacroTerminator => unreachable!(),
            Sem::Comment => 0,
            Sem::Error(..) => 0,
        };
        sem_token.bytecode_location.start = start_addr;
        sem_token.bytecode_location.length = byte_length;
    }


    // ============================ STEP 4 ============================
    // Fill in addresses for label references.
    // ================================================================
    println!("[DEBUG] STEP 4: Fill in values for label references");
    for (label_i, slots) in label_reference_addresses.iter() {
        if let Sem::LabelDefinition(LabelDefinition { address, .. }) = semantic_tokens[*label_i].r#type {
            let [h,l] = address.to_be_bytes();
            for slot in slots {
                bytecode[*slot as usize] = h;
                bytecode[slot.wrapping_add(1) as usize] = l;
            }
        } else {
            unreachable!()
        }
    }

    // ============================ STEP 5 ============================
    // Move references and macro body tokens into label and macro definitions.
    // ================================================================
    println!("[DEBUG] STEP 5: Move information into label and macro definition tokens");
    for (i, token) in semantic_tokens.iter_mut().enumerate() {
        if let Sem::MacroDefinition(macro_definition) = &mut token.r#type {
            macro_definition.body_tokens = semantic_macro_bodies.remove(&i).unwrap();
            if let Some(macro_references) = references.remove(&i) {
                macro_definition.references = macro_references;
            }
        } else if let Sem::LabelDefinition(label_definition) = &mut token.r#type {
            if let Some(label_references) = references.remove(&i) {
                label_definition.references = label_references;
            }
        }
    }
    assert_eq!(references.len(), 0);


    // ============================ STEP 6 ============================
    // Remove trailing null-bytes from the bytecode.
    // ================================================================
    println!("[DEBUG] STEP 6: Trim trailing null bytes");
    let truncated_length = bytecode.iter().rposition(|b| *b != 0).map_or(0, |i| i + 1);
    let removed_byte_count = bytecode.len() - truncated_length;
    if removed_byte_count > 0 {
        println!("[INFO] Removed {removed_byte_count} trailing null bytes from assembled bytecode");
        bytecode.truncate(truncated_length);
    }


    for token in &semantic_tokens {
        if let Sem::MacroDefinition(macro_definition) = &token.r#type {
            for body_token in &macro_definition.body_tokens {
                if let Sem::Error(_, err) = body_token.r#type {
                    println!("[ERROR] (in macro '{}') {err:?} at {}:{}..{}:{}",
                        macro_definition.name,
                        body_token.source_location.start.line,
                        body_token.source_location.start.column,
                        body_token.source_location.end.line,
                        body_token.source_location.end.column,
                    )
                }
            }
        } else if let Sem::Error(_, err) = token.r#type {
            println!("[ERROR {}:{}-{}:{}] {err:?}",
                token.source_location.start.line,
                token.source_location.start.column,
                token.source_location.end.line,
                token.source_location.end.column,
            )
        }
    }

    println!("");
    print!("Generated bytecode: [ ");
    for i in &bytecode {
        print!("{i:02x} ");
    }
    println!("]");
}
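
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test only: `parse` currently prints its results rather than
    // returning them, so all this asserts is that parsing doesn't panic.
    // Assumes TokenIterator yields no tokens for an empty source string.
    #[test]
    fn parse_empty_source_does_not_panic() {
        parse("");
    }
}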