diff options
author | Ben Bridle <bridle.benjamin@gmail.com> | 2023-05-06 16:19:15 +1200 |
---|---|---|
committer | Ben Bridle <bridle.benjamin@gmail.com> | 2023-05-06 16:19:15 +1200 |
commit | e38f108921c61e1e66d65a368f2a67a763d61e69 (patch) | |
tree | 2718330c1e9963a21bc08db3ddc18574b078d004 /src/tokenizer.rs | |
download | bedrock-asm-e38f108921c61e1e66d65a368f2a67a763d61e69.zip |
About to refactor parser to be a struct with a method for each stage
Diffstat (limited to 'src/tokenizer.rs')
-rw-r--r-- | src/tokenizer.rs | 183 |
1 files changed, 183 insertions, 0 deletions
diff --git a/src/tokenizer.rs b/src/tokenizer.rs new file mode 100644 index 0000000..b68cc14 --- /dev/null +++ b/src/tokenizer.rs @@ -0,0 +1,183 @@ +use std::mem::take; +use crate::*; + +pub struct TokenIterator { + /// The characters that comprise the program souce code. + chars: Vec<char>, + /// The index of the next character to read. + i: usize, + /// The address of the next character to read. + addr: CharAddress, + /// If true, skip over any whitespace characters. If false, stop reading + /// when a whitespace character is encountered. + skip_whitespace: bool, + /// The name of the most recently defined label. + label: String, + + /// The address of the first character of the current token. + start: CharAddress, + /// The address of the final character of the current token. + end: CharAddress, + /// The entire current token. + source: String, + /// The first character of the current token. + prefix: char, + /// The second and remaining characters of the current token. + suffix: String, +} + +impl TokenIterator { + /// Create an iterator from a string of program source code. + pub fn from_str(source_code: &str) -> Self { + Self { + chars: source_code.chars().collect(), + i: 0, + addr: CharAddress::zero(), + skip_whitespace: true, + label: String::new(), + start: CharAddress::zero(), + end: CharAddress::zero(), + source: String::new(), + prefix: ' ', + suffix: String::new(), + } + } + /// Append a character to the current token. + fn push(&mut self, c:char) { + self.end = self.addr; + self.source.push(c); + self.suffix.push(c); + self.next(c); + } + /// Move forward to the next source character. + fn next(&mut self, c: char) { + self.addr.column += 1; + self.i += 1; + if c == '\n' { + self.addr.column = 0; + self.addr.line += 1; + } + } + /// Mark the current character as being the first character of a new token. + fn mark_start(&mut self, c:char) { + self.start=self.addr; + self.end=self.addr; + self.prefix=c; + self.source.push(c); + self.skip_whitespace=false; + self.next(c); + } +} + +impl Iterator for TokenIterator { + type Item = SyntacticToken; + + fn next(&mut self) -> Option<SyntacticToken> { + // Initialise values before reading the next token + let mut is_comment = false; + self.skip_whitespace = true; + + // Iterate over source characters until a full token is read + while let Some(c) = self.chars.get(self.i) { + let c = *c; + // Intercept comments + if is_comment { + self.push(c); if c == ')' { break; } else { continue }; } + else if self.skip_whitespace && c == '(' { + is_comment = true; self.mark_start(c); continue; } + // Allow a semicolon on the end of a token to be handled as a separate token + if self.source.len() > 0 && c == ';' { break; } + // Handle the current character + match (is_whitespace(c), self.skip_whitespace) { + (true, true) => self.next(c), // c is the expected leading whitespace + (false, true) => self.mark_start(c), // c is the first character of the token + (false, false) => self.push(c), // c is a character of the token + (true, false) => break, // c is trailing whitespace + } + // Allow literal values to be attached to the end of the previous token + if self.source.len() > 0 && c == ':' { break; } + } + + // If no source characters were grabbed then we have read through the entire source file + if self.source.len() == 0 { return None; } + // Allow handling macro terminators and symbols of length 1 in the match expression + if self.suffix.len() == 0 { self.prefix = '\0'; } + // Consume the collected characters to be used in the match expression + let full = take(&mut self.source); + let suffix = take(&mut self.suffix); + let mut error = None; + let mut parse_hex_lit = |v| { + parse_short(v).or_else(|| { + error = Some(Error::InvalidHexadecimalLiteral); Some(0) + }).unwrap() + }; + + let r#type = match self.prefix { + '(' => { SyntacticTokenType::Comment } + '@' => { SyntacticTokenType::LabelDefinition({self.label=suffix.clone(); suffix}) } + '&' => { SyntacticTokenType::LabelDefinition(format!("{}/{}", self.label, suffix)) } + '$' => { SyntacticTokenType::Pad(parse_hex_lit(&suffix)) } + '~' => { SyntacticTokenType::Reference(format!("{}/{}", self.label, suffix)) } + '%' => { SyntacticTokenType::MacroDefinition(suffix) } + _ => { + if ";" == &full { SyntacticTokenType::MacroTerminator } + else if let Some(value) = parse_byte_lit(&full) { SyntacticTokenType::Byte(value) } + else if let Some(value) = parse_short_lit(&full) { SyntacticTokenType::Short(value) } + else if let Some(value) = parse_instruction(&full) { SyntacticTokenType::Instruction(value) } + else { SyntacticTokenType::Reference(full.clone()) } + } + }; + Some(SyntacticToken { + r#type, + source_location:SourceLocation::new(full,self.start,self.end), + error, + }) + } +} + + +fn parse_byte_lit(token: &str) -> Option<u8> { + match token.len() { 2 => u8::from_str_radix(token, 16).ok(), _ => None } } +fn parse_short_lit(token: &str) -> Option<u16> { + match token.len() { 4 => u16::from_str_radix(token, 16).ok(), _ => None } } +fn parse_short(token: &str) -> Option<u16> { + match token.len() { 1..=4 => u16::from_str_radix(token, 16).ok(), _ => None } } +fn is_whitespace(c: char) -> bool { + match c { ' '|'\t'|'\n'|'\r'|'['|']'|'(' =>true, _=>false } } +fn parse_instruction(token: &str) -> Option<u8> { + Some(match token { + "BRK"=>0x00,"BRK*"=>0x20,"BRK:"=>0x40,"BRK*:"=>0x60,"BRKr"=>0x80,"BRKr*"=>0xA0,"BRKr:"=>0xC0,"BRKr*:"=>0xE0, + "EQU"=>0x01,"EQU*"=>0x21,"EQU:"=>0x41,"EQU*:"=>0x61,"EQUr"=>0x81,"EQUr*"=>0xA1,"EQUr:"=>0xC1,"EQUr*:"=>0xE1, + "LTH"=>0x02,"LTH*"=>0x22,"LTH:"=>0x42,"LTH*:"=>0x62,"LTHr"=>0x82,"LTHr*"=>0xA2,"LTHr:"=>0xC2,"LTHr*:"=>0xE2, + "GTH"=>0x03,"GTH*"=>0x23,"GTH:"=>0x43,"GTH*:"=>0x63,"GTHr"=>0x83,"GTHr*"=>0xA3,"GTHr:"=>0xC3,"GTHr*:"=>0xE3, + "JMP"=>0x04,"JMP*"=>0x24,"JMP:"=>0x44,"JMP*:"=>0x64,"JMPr"=>0x84,"JMPr*"=>0xA4,"JMPr:"=>0xC4,"JMPr*:"=>0xE4, + "JSR"=>0x05,"JSR*"=>0x25,"JSR:"=>0x45,"JSR*:"=>0x65,"JSRr"=>0x85,"JSRr*"=>0xA5,"JSRr:"=>0xC5,"JSRr*:"=>0xE5, + "JCN"=>0x06,"JCN*"=>0x26,"JCN:"=>0x46,"JCN*:"=>0x66,"JCNr"=>0x86,"JCNr*"=>0xA6,"JCNr:"=>0xC6,"JCNr*:"=>0xE6, + "JKC"=>0x07,"JKC*"=>0x27,"JKC:"=>0x47,"JKC*:"=>0x67,"JKCr"=>0x87,"JKCr*"=>0xA7,"JKCr:"=>0xC7,"JKCr*:"=>0xE7, + "LDA"=>0x08,"LDA*"=>0x28,"LDA:"=>0x48,"LDA*:"=>0x68,"LDAr"=>0x88,"LDAr*"=>0xA8,"LDAr:"=>0xC8,"LDAr*:"=>0xE8, + "LKA"=>0x09,"LKA*"=>0x29,"LKA:"=>0x49,"LKA*:"=>0x69,"LKAr"=>0x89,"LKAr*"=>0xA9,"LKAr:"=>0xC9,"LKAr*:"=>0xE9, + "STA"=>0x0A,"STA*"=>0x2A,"STA:"=>0x4A,"STA*:"=>0x6A,"STAr"=>0x8A,"STAr*"=>0xAA,"STAr:"=>0xCA,"STAr*:"=>0xEA, + "SKA"=>0x0B,"SKA*"=>0x2B,"SKA:"=>0x4B,"SKA*:"=>0x6B,"SKAr"=>0x8B,"SKAr*"=>0xAB,"SKAr:"=>0xCB,"SKAr*:"=>0xEB, + "LDD"=>0x0C,"LDD*"=>0x2C,"LDD:"=>0x4C,"LDD*:"=>0x6C,"LDDr"=>0x8C,"LDDr*"=>0xAC,"LDDr:"=>0xCC,"LDDr*:"=>0xEC, + "LKD"=>0x0D,"LKD*"=>0x2D,"LKD:"=>0x4D,"LKD*:"=>0x6D,"LKDr"=>0x8D,"LKDr*"=>0xAD,"LKDr:"=>0xCD,"LKDr*:"=>0xED, + "STD"=>0x0E,"STD*"=>0x2E,"STD:"=>0x4E,"STD*:"=>0x6E,"STDr"=>0x8E,"STDr*"=>0xAE,"STDr:"=>0xCE,"STDr*:"=>0xEE, + "SKD"=>0x0F,"SKD*"=>0x2F,"SKD:"=>0x4F,"SKD*:"=>0x6F,"SKDr"=>0x8F,"SKDr*"=>0xAF,"SKDr:"=>0xCF,"SKDr*:"=>0xEF, + "PSH"=>0x10,"PSH*"=>0x30,"PSH:"=>0x50,"PSH*:"=>0x70,"PSHr"=>0x90,"PSHr*"=>0xB0,"PSHr:"=>0xD0,"PSHr*:"=>0xF0, + "POP"=>0x11,"POP*"=>0x31,"POP:"=>0x51,"POP*:"=>0x71,"POPr"=>0x91,"POPr*"=>0xB1,"POPr:"=>0xD1,"POPr*:"=>0xF1, + "SHF"=>0x12,"SHF*"=>0x32,"SHF:"=>0x52,"SHF*:"=>0x72,"SHFr"=>0x92,"SHFr*"=>0xB2,"SHFr:"=>0xD2,"SHFr*:"=>0xF2, + "SHC"=>0x13,"SHC*"=>0x33,"SHC:"=>0x53,"SHC*:"=>0x73,"SHCr"=>0x93,"SHCr*"=>0xB3,"SHCr:"=>0xD3,"SHCr*:"=>0xF3, + "SWP"=>0x14,"SWP*"=>0x34,"SWP:"=>0x54,"SWP*:"=>0x74,"SWPr"=>0x94,"SWPr*"=>0xB4,"SWPr:"=>0xD4,"SWPr*:"=>0xF4, + "ROT"=>0x15,"ROT*"=>0x35,"ROT:"=>0x55,"ROT*:"=>0x75,"ROTr"=>0x95,"ROTr*"=>0xB5,"ROTr:"=>0xD5,"ROTr*:"=>0xF5, + "DUP"=>0x16,"DUP*"=>0x36,"DUP:"=>0x56,"DUP*:"=>0x76,"DUPr"=>0x96,"DUPr*"=>0xB6,"DUPr:"=>0xD6,"DUPr*:"=>0xF6, + "OVR"=>0x17,"OVR*"=>0x37,"OVR:"=>0x57,"OVR*:"=>0x77,"OVRr"=>0x97,"OVRr*"=>0xB7,"OVRr:"=>0xD7,"OVRr*:"=>0xF7, + "ADD"=>0x18,"ADD*"=>0x38,"ADD:"=>0x58,"ADD*:"=>0x78,"ADDr"=>0x98,"ADDr*"=>0xB8,"ADDr:"=>0xD8,"ADDr*:"=>0xF8, + "SUB"=>0x19,"SUB*"=>0x39,"SUB:"=>0x59,"SUB*:"=>0x79,"SUBr"=>0x99,"SUBr*"=>0xB9,"SUBr:"=>0xD9,"SUBr*:"=>0xF9, + "INC"=>0x1A,"INC*"=>0x3A,"INC:"=>0x5A,"INC*:"=>0x7A,"INCr"=>0x9A,"INCr*"=>0xBA,"INCr:"=>0xDA,"INCr*:"=>0xFA, + "DEC"=>0x1B,"DEC*"=>0x3B,"DEC:"=>0x5B,"DEC*:"=>0x7B,"DECr"=>0x9B,"DECr*"=>0xBB,"DECr:"=>0xDB,"DECr*:"=>0xFB, + "NOT"=>0x1C,"NOT*"=>0x3C,"NOT:"=>0x5C,"NOT*:"=>0x7C,"NOTr"=>0x9C,"NOTr*"=>0xBC,"NOTr:"=>0xDC,"NOTr*:"=>0xFC, + "AND"=>0x1D,"AND*"=>0x3D,"AND:"=>0x5D,"AND*:"=>0x7D,"ANDr"=>0x9D,"ANDr*"=>0xBD,"ANDr:"=>0xDD,"ANDr*:"=>0xFD, + "IOR"=>0x1E,"IOR*"=>0x3E,"IOR:"=>0x5E,"IOR*:"=>0x7E,"IORr"=>0x9E,"IORr*"=>0xBE,"IORr:"=>0xDE,"IORr*:"=>0xFE, + "XOR"=>0x1F,"XOR*"=>0x3F,"XOR:"=>0x5F,"XOR*:"=>0x7F,"XORr"=>0x9F,"XORr*"=>0xBF,"XORr:"=>0xDF,"XORr*:"=>0xFF, + _ => return None, + }) +} |