diff options
Diffstat (limited to 'src/tokeniser.rs')
-rw-r--r-- | src/tokeniser.rs | 260 |
1 file changed, 0 insertions, 260 deletions
diff --git a/src/tokeniser.rs b/src/tokeniser.rs deleted file mode 100644 index 0350afe..0000000 --- a/src/tokeniser.rs +++ /dev/null @@ -1,260 +0,0 @@ -use crate::*; - -use std::path::PathBuf; - - -/// Break a character stream down into individual tokens. -pub struct Tokeniser { - /// All characters to be parsed, characters are never removed. - pub chars: Vec<char>, - /// Path of the whole source file. - pub source_path: Option<PathBuf>, - /// Original path of the embedded source file. - pub embedded_path: Option<PathBuf>, - /// Line where the embedded source file begins. - pub embedded_first_line: usize, - /// Mark tracking the next character to parse. - pub mark: TokeniserMark, - /// Mark tracking the most recent character of the current token. - pub prev: TokeniserMark, - /// Position of the first character of the current token. - pub start: TokeniserMark, - /// Position after the final character of the current token. - pub end: TokeniserMark, - /// Position to begin subtokenisation from. - pub child: TokeniserMark, - /// List of characters that start a new token. - pub delimiters: Vec<char>, - /// List of characters that terminate a token. - pub terminators: Vec<char>, -} - -impl Tokeniser { - pub fn new<P: Into<PathBuf>>(source_code: &str, path: Option<P>) -> Self { - Self { - chars: source_code.chars().collect(), - source_path: path.map(|p| p.into()), - embedded_path: None, - embedded_first_line: 0, - mark: TokeniserMark::ZERO, - prev: TokeniserMark::ZERO, - start: TokeniserMark::ZERO, - end: TokeniserMark::ZERO, - child: TokeniserMark::ZERO, - delimiters: Vec::new(), - terminators: Vec::new(), - } - } - - /// Create a tokeniser from child to end. 
- pub fn tokenise_child_span(&mut self) -> Self { - let mut start = self.child; - start.i = 0; - Self { - chars: self.get_chars(&self.child, &self.end), - source_path: self.source_path.clone(), - embedded_path: self.embedded_path.clone(), - embedded_first_line: self.embedded_first_line.clone(), - mark: start, - prev: start, - start: start, - end: start, - child: start, - delimiters: Vec::new(), - terminators: Vec::new(), - } - } - - pub fn add_delimiters(&mut self, delimiters: &[char]) { - self.delimiters.extend_from_slice(delimiters); - } - - pub fn add_terminators(&mut self, terminators: &[char]) { - self.terminators.extend_from_slice(terminators); - } - - pub fn get_chars(&self, start: &TokeniserMark, end: &TokeniserMark) -> Vec<char> { - self.chars[start.i..end.i].iter().map(char::to_owned).collect() - } - - /// Return the next character without consuming it. - pub fn peek_char(&self) -> Option<char> { - self.chars.get(self.mark.i).copied() - } - - /// Consume and return the next character. - pub fn eat_char(&mut self) -> Option<char> { - let option = self.peek_char(); - if let Some(c) = option { - self.prev = self.mark; - self.mark.advance(c); - self.mark_end(); - } - return option; - } - - /// Consume next characters if they match a pattern. - pub fn eat_if(&mut self, pattern: &str) -> Option<String> { - // Check that next characters match the pattern. - for (i, c) in pattern.chars().enumerate() { - if let Some(d) = self.chars.get(self.mark.i + i) { - if c == *d { - continue; - } - } - return None; - } - // Consume the next characters. - self.prev = self.mark; - for c in pattern.chars() { - self.mark.advance(c); - self.mark_end(); - } - return Some(pattern.to_string()); - } - - /// Consume whitespace. - pub fn eat_whitespace(&mut self) { - while let Some(c) = self.peek_char() { - match c.is_whitespace() { - true => self.eat_char(), - false => break, - }; - } - } - - /// Remove a full token from the queue. 
- pub fn eat_token(&mut self) -> String { - let mut token = String::new(); - while let Some(peek) = self.peek_char() { - if peek.is_whitespace() || self.delimiters.contains(&peek) { - break; - } - let c = self.eat_char().unwrap(); - token.push(c); - if self.terminators.contains(&c) { - break; - } - } - return token; - } - - /// Return all characters found until the predicate returns true. - /// Returns None if end of source is reached before delimiter is found. - pub fn track_until(&mut self, mut predicate: impl FnMut(&mut Self) -> bool) -> Option<String> { - let start = self.mark; - let mut end = self.mark; - while !predicate(self) { - self.peek_char()?; - end = self.mark; - } - self.end = self.prev; - return Some(self.get_chars(&start, &end).iter().collect()); - } - - /// Returns true if the remainder of the line is whitespace. - pub fn end_of_line(&self) -> bool { - for c in &self.chars[self.mark.i..] { - if *c == '\n' { - return true; - } - if !c.is_whitespace() { - return false - } - } - return true; - } - - /// Mark the next character to be consumed as the start character. - pub fn mark_start(&mut self) { - self.start = self.mark; - } - - /// Mark the most recently consumed character as the start character. - pub fn mark_start_prev(&mut self) { - self.start = self.prev; - } - - /// Mark the next character as the character following the end character. - pub fn mark_end(&mut self) { - self.end = self.mark; - } - - /// Mark the next character as the character following the end character. - pub fn mark_end_prev(&mut self) { - self.end = self.prev; - } - - /// Mark the next character to be consumed as the start of the child. - pub fn mark_child(&mut self) { - self.child = self.mark; - } - - /// Return the SourceSpan between the start and end marks. 
- pub fn get_source(&mut self) -> SourceSpan { - let in_merged = SourceLocation { - path: self.source_path.to_owned(), - start: self.start.position, - end: self.end.prev_position, - }; - let in_source = if self.start.position.line >= self.embedded_first_line { - if let Some(embedded_path) = &self.embedded_path { - let offset = self.embedded_first_line; - Some( - SourceLocation { - path: Some(embedded_path.to_owned()), - start: SourcePosition { - line: in_merged.start.line.saturating_sub(offset), - column: in_merged.start.column, - }, - end: SourcePosition { - line: in_merged.end.line.saturating_sub(offset), - column: in_merged.end.column, - } - } - ) - } else { - None - } - } else { - None - }; - - let string = self.get_chars(&self.start, &self.end).iter().collect(); - SourceSpan { string, in_merged, in_source, child: None } - } -} - - -#[derive(Clone, Copy)] -pub struct TokeniserMark { - /// Position of the next character to be consumed. - pub position: SourcePosition, - /// Index of the next character to be consumed. - pub i: usize, - /// Position of the most recently consumed character. - pub prev_position: SourcePosition, - pub prev_prev_position: SourcePosition, -} - -impl TokeniserMark { - pub const ZERO: Self = Self { - position: SourcePosition::ZERO, - i: 0, - prev_position: SourcePosition::ZERO, - prev_prev_position: SourcePosition::ZERO, - }; - - /// Advance to the next character. - pub fn advance(&mut self, c: char) { - self.prev_prev_position = self.prev_position; - self.prev_position = self.position; - self.position.advance(c); - self.i += 1; - } - - /// Ignore the most recently consumed character. - pub fn undo(&mut self) { - self.prev_position = self.prev_prev_position; - } -} |