summaryrefslogtreecommitdiff
path: root/src/processors/tokeniser.rs
diff options
context:
space:
mode:
authorBen Bridle <ben@derelict.engineering>2025-03-18 11:50:19 +1300
committerBen Bridle <ben@derelict.engineering>2025-03-18 11:50:29 +1300
commitf8a694267d3981b0437c05fc248406116ea9ec06 (patch)
treee7c0426f4278481490e269be7ccb0a710c22ebae /src/processors/tokeniser.rs
parent87cdf5e88fd1d0aaddb91e216c47344effd63ed3 (diff)
downloadassembler-f8a694267d3981b0437c05fc248406116ea9ec06.zip
Large restructure
Files were moved to be better organised, error messages were changed to be more general, and a Compiler type was added to the library.
Diffstat (limited to 'src/processors/tokeniser.rs')
-rw-r--r--src/processors/tokeniser.rs260
1 file changed, 260 insertions, 0 deletions
diff --git a/src/processors/tokeniser.rs b/src/processors/tokeniser.rs
new file mode 100644
index 0000000..0350afe
--- /dev/null
+++ b/src/processors/tokeniser.rs
@@ -0,0 +1,260 @@
+use crate::*;
+
+use std::path::PathBuf;
+
+
+/// Break a character stream down into individual tokens.
+pub struct Tokeniser {
+ /// All characters of the source, in order; characters are only ever
+ /// read by index and are never removed from this buffer.
+ pub chars: Vec<char>,
+ /// Path of the whole source file.
+ pub source_path: Option<PathBuf>,
+ /// Original path of the embedded source file.
+ pub embedded_path: Option<PathBuf>,
+ /// Line of the whole source file where the embedded source file begins.
+ pub embedded_first_line: usize,
+ /// Mark tracking the next character to parse.
+ pub mark: TokeniserMark,
+ /// Mark tracking the most recent character of the current token.
+ pub prev: TokeniserMark,
+ /// Position of the first character of the current token.
+ pub start: TokeniserMark,
+ /// Position after the final character of the current token.
+ pub end: TokeniserMark,
+ /// Position to begin subtokenisation from.
+ pub child: TokeniserMark,
+ /// Characters that start a new token; a delimiter ends the current
+ /// token and is left unconsumed, to be picked up by the next token.
+ pub delimiters: Vec<char>,
+ /// Characters that terminate a token; a terminator is consumed as the
+ /// final character of the token it ends.
+ pub terminators: Vec<char>,
+}
+
+impl Tokeniser {
+    /// Create a tokeniser over the characters of `source_code`.
+    ///
+    /// `path` is the path of the source file, recorded for use in
+    /// source locations; pass `None` for anonymous sources.
+    pub fn new<P: Into<PathBuf>>(source_code: &str, path: Option<P>) -> Self {
+        Self {
+            chars: source_code.chars().collect(),
+            source_path: path.map(|p| p.into()),
+            embedded_path: None,
+            embedded_first_line: 0,
+            mark: TokeniserMark::ZERO,
+            prev: TokeniserMark::ZERO,
+            start: TokeniserMark::ZERO,
+            end: TokeniserMark::ZERO,
+            child: TokeniserMark::ZERO,
+            delimiters: Vec::new(),
+            terminators: Vec::new(),
+        }
+    }
+
+    /// Create a tokeniser over the span from the child mark to the end
+    /// mark, inheriting path information from this tokeniser.
+    pub fn tokenise_child_span(&mut self) -> Self {
+        // The child tokeniser owns only the child-to-end characters, so
+        // its character index restarts at zero while the source position
+        // carries over from the child mark.
+        let mut start = self.child;
+        start.i = 0;
+        Self {
+            chars: self.get_chars(&self.child, &self.end),
+            source_path: self.source_path.clone(),
+            embedded_path: self.embedded_path.clone(),
+            // usize is Copy; the previous `.clone()` here was redundant.
+            embedded_first_line: self.embedded_first_line,
+            mark: start,
+            prev: start,
+            start,
+            end: start,
+            child: start,
+            delimiters: Vec::new(),
+            terminators: Vec::new(),
+        }
+    }
+
+    /// Register additional characters that start a new token.
+    pub fn add_delimiters(&mut self, delimiters: &[char]) {
+        self.delimiters.extend_from_slice(delimiters);
+    }
+
+    /// Register additional characters that terminate a token.
+    pub fn add_terminators(&mut self, terminators: &[char]) {
+        self.terminators.extend_from_slice(terminators);
+    }
+
+    /// Return a copy of the characters between two marks.
+    pub fn get_chars(&self, start: &TokeniserMark, end: &TokeniserMark) -> Vec<char> {
+        // char is Copy, so a slice copy replaces the previous
+        // `.iter().map(char::to_owned).collect()`.
+        self.chars[start.i..end.i].to_vec()
+    }
+
+    /// Return the next character without consuming it.
+    pub fn peek_char(&self) -> Option<char> {
+        self.chars.get(self.mark.i).copied()
+    }
+
+    /// Consume and return the next character.
+    pub fn eat_char(&mut self) -> Option<char> {
+        let option = self.peek_char();
+        if let Some(c) = option {
+            self.prev = self.mark;
+            self.mark.advance(c);
+            self.mark_end();
+        }
+        option
+    }
+
+    /// Consume the next characters only if they exactly match `pattern`.
+    ///
+    /// Returns the matched text, or None (consuming nothing) if the
+    /// upcoming characters do not match.
+    pub fn eat_if(&mut self, pattern: &str) -> Option<String> {
+        // Check that the next characters match the pattern before
+        // consuming anything.
+        for (i, c) in pattern.chars().enumerate() {
+            if self.chars.get(self.mark.i + i) != Some(&c) {
+                return None;
+            }
+        }
+        // Consume the matched characters.
+        self.prev = self.mark;
+        for c in pattern.chars() {
+            self.mark.advance(c);
+            self.mark_end();
+        }
+        Some(pattern.to_string())
+    }
+
+    /// Consume consecutive whitespace characters.
+    pub fn eat_whitespace(&mut self) {
+        while let Some(c) = self.peek_char() {
+            if !c.is_whitespace() {
+                break;
+            }
+            self.eat_char();
+        }
+    }
+
+    /// Remove a full token from the queue.
+    ///
+    /// Consumption stops before a whitespace or delimiter character,
+    /// and stops after consuming a terminator character.
+    pub fn eat_token(&mut self) -> String {
+        let mut token = String::new();
+        while let Some(peek) = self.peek_char() {
+            if peek.is_whitespace() || self.delimiters.contains(&peek) {
+                break;
+            }
+            let c = self.eat_char().unwrap();
+            token.push(c);
+            if self.terminators.contains(&c) {
+                break;
+            }
+        }
+        token
+    }
+
+    /// Return all characters found until the predicate returns true.
+    /// Returns None if end of source is reached before delimiter is found.
+    ///
+    /// The predicate may itself consume characters; the returned text
+    /// covers only characters seen before the final (successful)
+    /// predicate call, and the end mark is moved to the previous mark.
+    pub fn track_until(&mut self, mut predicate: impl FnMut(&mut Self) -> bool) -> Option<String> {
+        let start = self.mark;
+        let mut end = self.mark;
+        while !predicate(self) {
+            // Bail out if the predicate failed at the end of the source.
+            self.peek_char()?;
+            end = self.mark;
+        }
+        self.end = self.prev;
+        Some(self.get_chars(&start, &end).iter().collect())
+    }
+
+    /// Returns true if the remainder of the line is whitespace.
+    pub fn end_of_line(&self) -> bool {
+        for c in &self.chars[self.mark.i..] {
+            if *c == '\n' {
+                return true;
+            }
+            if !c.is_whitespace() {
+                return false;
+            }
+        }
+        // Reaching the end of the source also counts as end-of-line.
+        true
+    }
+
+    /// Mark the next character to be consumed as the start character.
+    pub fn mark_start(&mut self) {
+        self.start = self.mark;
+    }
+
+    /// Mark the most recently consumed character as the start character.
+    pub fn mark_start_prev(&mut self) {
+        self.start = self.prev;
+    }
+
+    /// Mark the next character as the character following the end character.
+    pub fn mark_end(&mut self) {
+        self.end = self.mark;
+    }
+
+    /// Mark the most recently consumed character as the character
+    /// following the end character, excluding it from the current token.
+    pub fn mark_end_prev(&mut self) {
+        self.end = self.prev;
+    }
+
+    /// Mark the next character to be consumed as the start of the child.
+    pub fn mark_child(&mut self) {
+        self.child = self.mark;
+    }
+
+    /// Return the SourceSpan between the start and end marks.
+    pub fn get_source(&mut self) -> SourceSpan {
+        // Location of the span within the whole (merged) source file.
+        let in_merged = SourceLocation {
+            path: self.source_path.to_owned(),
+            start: self.start.position,
+            end: self.end.prev_position,
+        };
+        // Location within the embedded source file, if the span starts
+        // inside one: line numbers are shifted to be relative to the
+        // first line of the embedded file.
+        let in_source = match &self.embedded_path {
+            Some(embedded_path) if self.start.position.line >= self.embedded_first_line => {
+                let offset = self.embedded_first_line;
+                Some(SourceLocation {
+                    path: Some(embedded_path.to_owned()),
+                    start: SourcePosition {
+                        line: in_merged.start.line.saturating_sub(offset),
+                        column: in_merged.start.column,
+                    },
+                    end: SourcePosition {
+                        line: in_merged.end.line.saturating_sub(offset),
+                        column: in_merged.end.column,
+                    },
+                })
+            }
+            _ => None,
+        };
+
+        let string = self.get_chars(&self.start, &self.end).iter().collect();
+        SourceSpan { string, in_merged, in_source, child: None }
+    }
+}
+
+
+#[derive(Clone, Copy)]
+pub struct TokeniserMark {
+ /// Position of the next character to be consumed.
+ pub position: SourcePosition,
+ /// Index of the next character to be consumed.
+ pub i: usize,
+ /// Position of the most recently consumed character.
+ pub prev_position: SourcePosition,
+ /// Position of the second-most-recently consumed character, kept so
+ /// that `undo` can roll `prev_position` back one step.
+ pub prev_prev_position: SourcePosition,
+}
+
+impl TokeniserMark {
+    /// A mark pointing at the very first character of the source.
+    pub const ZERO: Self = Self {
+        position: SourcePosition::ZERO,
+        i: 0,
+        prev_position: SourcePosition::ZERO,
+        prev_prev_position: SourcePosition::ZERO,
+    };
+
+    /// Advance the mark past the character `c`.
+    pub fn advance(&mut self, c: char) {
+        // Shift the position history back one slot, then step the
+        // current position and index forward over `c`.
+        self.prev_prev_position = std::mem::replace(&mut self.prev_position, self.position);
+        self.position.advance(c);
+        self.i += 1;
+    }
+
+    /// Ignore the most recently consumed character, rolling the
+    /// previous-position history back one step.
+    pub fn undo(&mut self) {
+        self.prev_position = self.prev_prev_position;
+    }
+}