use crate::*;
use std::path::PathBuf;

/// Break a character stream down into individual tokens.
pub struct Tokeniser {
    /// Characters waiting to be parsed.
    pub chars: Vec<char>,
    /// Path of the whole source file.
    pub source_path: Option<PathBuf>,
    /// Original path of the embedded source file.
    pub embedded_path: Option<PathBuf>,
    /// Line where the embedded source file begins.
    pub embedded_first_line: usize,
    /// Mark tracking the next character to parse.
    pub mark: TokeniserMark,
    /// Mark tracking the most recent character of the current token.
    pub prev: TokeniserMark,
    /// Position of the first character of the current token.
    pub start: TokeniserMark,
    /// Position after the final character of the current token.
    pub end: TokeniserMark,
    /// Position to begin subtokenisation from.
    pub child: TokeniserMark,
    /// List of characters that start a new token.
    pub delimiters: Vec<char>,
    /// List of characters that terminate a token.
    pub terminators: Vec<char>,
}

impl Tokeniser {
    pub fn new<P: Into<PathBuf>>(source_code: &str, path: Option<P>) -> Self {
        Self {
            chars: source_code.chars().collect(),
            source_path: path.map(|p| p.into()),
            embedded_path: None,
            embedded_first_line: 0,
            mark: TokeniserMark::ZERO,
            prev: TokeniserMark::ZERO,
            start: TokeniserMark::ZERO,
            end: TokeniserMark::ZERO,
            child: TokeniserMark::ZERO,
            delimiters: Vec::new(),
            terminators: Vec::new(),
        }
    }

    /// Create a new tokeniser covering the span from the child mark to the end mark.
    pub fn tokenise_child_span(&mut self) -> Self {
        let mut start = self.child;
        start.i = 0;
        Self {
            chars: self.get_chars(&self.child, &self.end),
            source_path: self.source_path.clone(),
            embedded_path: self.embedded_path.clone(),
            embedded_first_line: self.embedded_first_line,
            mark: start,
            prev: start,
            start,
            end: start,
            child: start,
            delimiters: Vec::new(),
            terminators: Vec::new(),
        }
    }

    /// Add characters that start a new token.
    pub fn add_delimiters(&mut self, delimiters: &[char]) {
        self.delimiters.extend_from_slice(delimiters);
    }

    /// Add characters that terminate a token.
    pub fn add_terminators(&mut self, terminators: &[char]) {
        self.terminators.extend_from_slice(terminators);
    }

    /// Return the characters between two marks.
    pub fn get_chars(&self, start: &TokeniserMark, end: &TokeniserMark) -> Vec<char> {
        self.chars[start.i..end.i].iter().map(char::to_owned).collect()
    }

    /// Return the next character without consuming it.
    pub fn peek_char(&self) -> Option<char> {
        self.chars.get(self.mark.i).copied()
    }

    /// Consume and return the next character.
    pub fn eat_char(&mut self) -> Option<char> {
        let option = self.peek_char();
        if let Some(c) = option {
            self.prev = self.mark;
            self.mark.advance(c);
            self.mark_end();
        }
        return option;
    }

    /// Consume the next characters if they match a pattern.
    pub fn eat_if(&mut self, pattern: &str) -> Option<String> {
        // Check that the next characters match the pattern.
        for (i, c) in pattern.chars().enumerate() {
            if let Some(d) = self.chars.get(self.mark.i + i) {
                if c == *d { continue; }
            }
            return None;
        }
        // Consume the matched characters.
        self.prev = self.mark;
        for c in pattern.chars() {
            self.mark.advance(c);
            self.mark_end();
        }
        return Some(pattern.to_string());
    }

    /// Consume whitespace.
    pub fn eat_whitespace(&mut self) {
        while let Some(c) = self.peek_char() {
            match c.is_whitespace() {
                true => self.eat_char(),
                false => break,
            };
        }
    }

    /// Consume and return the next complete token.
    pub fn eat_token(&mut self) -> String {
        let mut token = String::new();
        while let Some(peek) = self.peek_char() {
            if peek.is_whitespace() || self.delimiters.contains(&peek) {
                break;
            }
            let c = self.eat_char().unwrap();
            token.push(c);
            if self.terminators.contains(&c) {
                break;
            }
        }
        return token;
    }

    /// Return all characters found until the predicate returns true.
    /// Returns None if the end of the source is reached before the predicate matches.
    pub fn track_until(&mut self, mut predicate: impl FnMut(&mut Self) -> bool) -> Option<String> {
        let start = self.mark;
        let mut end = self.mark;
        while !predicate(self) {
            self.peek_char()?;
            end = self.mark;
        }
        self.end = self.prev;
        return Some(self.get_chars(&start, &end).iter().collect());
    }

    /// Returns true if the remainder of the current line is whitespace.
    pub fn end_of_line(&self) -> bool {
        for c in self.chars[self.mark.i..].iter() {
            if *c == '\n' { return true; }
            if !c.is_whitespace() { return false; }
        }
        return true;
    }

    /// Mark the next character to be consumed as the start character.
    pub fn mark_start(&mut self) {
        self.start = self.mark;
    }

    /// Mark the most recently consumed character as the start character.
    pub fn mark_start_prev(&mut self) {
        self.start = self.prev;
    }

    /// Mark the next character as the character following the end character.
    pub fn mark_end(&mut self) {
        self.end = self.mark;
    }

    /// Mark the most recently consumed character as the character following the end character.
    pub fn mark_end_prev(&mut self) {
        self.end = self.prev;
    }

    /// Mark the next character to be consumed as the start of the child.
    pub fn mark_child(&mut self) {
        self.child = self.mark;
    }

    /// Return the SourceSpan between the start and end marks.
    pub fn get_source(&mut self) -> SourceSpan {
        let in_merged = SourceLocation {
            path: self.source_path.to_owned(),
            start: self.start.position,
            end: self.end.prev_position,
        };
        let in_source = if self.start.position.line >= self.embedded_first_line {
            if let Some(embedded_path) = &self.embedded_path {
                let offset = self.embedded_first_line;
                Some(SourceLocation {
                    path: Some(embedded_path.to_owned()),
                    start: SourcePosition {
                        line: in_merged.start.line.saturating_sub(offset),
                        column: in_merged.start.column,
                    },
                    end: SourcePosition {
                        line: in_merged.end.line.saturating_sub(offset),
                        column: in_merged.end.column,
                    },
                })
            } else {
                None
            }
        } else {
            None
        };
        let string = self.get_chars(&self.start, &self.end).iter().collect();
        SourceSpan { string, in_merged, in_source, child: None }
    }
}

#[derive(Clone, Copy)]
pub struct TokeniserMark {
    /// Position of the next character to be consumed.
    pub position: SourcePosition,
    /// Index of the next character to be consumed.
    pub i: usize,
    /// Position of the most recently consumed character.
    pub prev_position: SourcePosition,
    /// Position of the character consumed before the most recent one.
    pub prev_prev_position: SourcePosition,
}

impl TokeniserMark {
    pub const ZERO: Self = Self {
        position: SourcePosition::ZERO,
        i: 0,
        prev_position: SourcePosition::ZERO,
        prev_prev_position: SourcePosition::ZERO,
    };

    /// Advance to the next character.
    pub fn advance(&mut self, c: char) {
        self.prev_prev_position = self.prev_position;
        self.prev_position = self.position;
        self.position.advance(c);
        self.i += 1;
    }

    /// Ignore the most recently consumed character.
    pub fn undo(&mut self) {
        self.prev_position = self.prev_prev_position;
    }
}
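// A minimal usage sketch, not part of the tokeniser itself: it shows how a caller
// might drive `eat_whitespace`/`eat_token` with a terminator registered, and how the
// start mark plus `get_source` recover the consumed text. The input string, module
// name, and expected token boundaries are illustrative assumptions; they rely only
// on the rules implemented above.
#[cfg(test)]
mod usage_sketch {
    use super::*;

    #[test]
    fn splits_on_whitespace_and_terminators() {
        // No source path is associated with this in-memory snippet.
        let mut tok = Tokeniser::new("foo bar;baz", None::<PathBuf>);
        // ';' ends a token but is kept as its final character.
        tok.add_terminators(&[';']);

        tok.eat_whitespace();
        tok.mark_start();
        assert_eq!(tok.eat_token(), "foo");
        // The span between the start and end marks covers the token just eaten.
        assert_eq!(tok.get_source().string, "foo");

        tok.eat_whitespace();
        assert_eq!(tok.eat_token(), "bar;");
        assert_eq!(tok.eat_token(), "baz");
    }
}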