use crate::*;
use std::path::PathBuf;
/// Break a character stream down into individual tokens.
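///
/// A minimal usage sketch (fenced as `ignore` since items from `crate::*`
/// are not in scope for doc tests):
///
/// ```ignore
/// let mut tok = Tokeniser::new("add r0 r1", None::<std::path::PathBuf>);
/// tok.mark_start();
/// assert_eq!(tok.eat_token(), "add");
/// tok.eat_whitespace();
/// tok.mark_start();
/// assert_eq!(tok.eat_token(), "r0");
/// ```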
pub struct Tokeniser {
/// Characters waiting to be parsed.
pub chars: Vec<char>,
/// Path of the whole source file.
pub source_path: Option<PathBuf>,
/// Original path of the embedded source file.
pub embedded_path: Option<PathBuf>,
/// Line where the embedded source file begins.
pub embedded_first_line: usize,
/// Mark tracking the next character to parse.
pub mark: TokeniserMark,
    /// Mark tracking the most recently consumed character.
pub prev: TokeniserMark,
/// Position of the first character of the current token.
pub start: TokeniserMark,
/// Position after the final character of the current token.
pub end: TokeniserMark,
/// Position to begin subtokenisation from.
pub child: TokeniserMark,
    /// Characters that end the current token and are left in the stream.
    pub delimiters: Vec<char>,
    /// Characters that end the current token and are consumed as part of it.
    pub terminators: Vec<char>,
}
impl Tokeniser {
pub fn new<P: Into<PathBuf>>(source_code: &str, path: Option<P>) -> Self {
Self {
chars: source_code.chars().collect(),
source_path: path.map(|p| p.into()),
embedded_path: None,
embedded_first_line: 0,
mark: TokeniserMark::ZERO,
prev: TokeniserMark::ZERO,
start: TokeniserMark::ZERO,
end: TokeniserMark::ZERO,
child: TokeniserMark::ZERO,
delimiters: Vec::new(),
terminators: Vec::new(),
}
}
    /// Create a new tokeniser over the characters from the child mark to the end mark.
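    ///
    /// A hedged sketch of the intended flow (`ignore`d):
    ///
    /// ```ignore
    /// let mut tok = Tokeniser::new("(a b) c", None::<std::path::PathBuf>);
    /// tok.eat_char();                      // consume the '('
    /// tok.mark_child();                    // region of interest starts here
    /// tok.eat_to_delimiter(')').unwrap();  // `end` now excludes the ')'
    /// let inner = tok.subtokenise();       // scans just "a b"
    /// ```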
    pub fn subtokenise(&mut self) -> Self {
        // The child tokeniser gets a fresh character buffer, so its index
        // restarts at zero while the line/column position carries over.
        let mut start = self.child;
        start.i = 0;
        Self {
            chars: self.get_chars(&self.child, &self.end),
            source_path: self.source_path.clone(),
            embedded_path: self.embedded_path.clone(),
            embedded_first_line: self.embedded_first_line,
            mark: start,
            prev: start,
            start,
            end: start,
            child: start,
            delimiters: Vec::new(),
            terminators: Vec::new(),
        }
    }
    /// Register characters that end a token without being consumed.
    pub fn add_delimiters(&mut self, delimiters: &[char]) {
self.delimiters.extend_from_slice(delimiters);
}
    /// Register characters that end a token and are consumed as part of it.
    pub fn add_terminators(&mut self, terminators: &[char]) {
self.terminators.extend_from_slice(terminators);
}
    /// Return a copy of the characters between two marks.
    pub fn get_chars(&self, start: &TokeniserMark, end: &TokeniserMark) -> Vec<char> {
        self.chars[start.i..end.i].to_vec()
}
/// Return the next character without consuming it.
pub fn peek_char(&self) -> Option<char> {
self.chars.get(self.mark.i).copied()
}
/// Consume and return the next character.
pub fn eat_char(&mut self) -> Option<char> {
let option = self.peek_char();
if let Some(c) = option {
self.prev = self.mark;
self.mark.advance(c);
self.mark_end();
}
return option;
}
/// Consume whitespace.
pub fn eat_whitespace(&mut self) {
while let Some(c) = self.peek_char() {
match c.is_whitespace() {
true => self.eat_char(),
false => break,
};
}
}
    /// Consume and return the next token, stopping at whitespace or a
    /// delimiter, or just after a terminator.
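    ///
    /// A short sketch of delimiter and terminator behaviour (`ignore`d):
    ///
    /// ```ignore
    /// let mut tok = Tokeniser::new("foo;bar baz", None::<std::path::PathBuf>);
    /// tok.add_terminators(&[';']);
    /// assert_eq!(tok.eat_token(), "foo;"); // terminator kept on the token
    /// assert_eq!(tok.eat_token(), "bar");  // stops before the whitespace
    /// ```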
pub fn eat_token(&mut self) -> String {
let mut token = String::new();
while let Some(peek) = self.peek_char() {
if peek.is_whitespace() || self.delimiters.contains(&peek) {
break;
}
let c = self.eat_char().unwrap();
token.push(c);
if self.terminators.contains(&c) {
break;
}
}
return token;
}
    /// Consume characters up to and including the delimiter, returning them
    /// with the delimiter excluded.
    /// Returns None if the end of the source is reached before the delimiter
    /// is found.
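    ///
    /// A short sketch (`ignore`d):
    ///
    /// ```ignore
    /// let mut tok = Tokeniser::new("\"hello\" rest", None::<std::path::PathBuf>);
    /// tok.eat_char(); // consume the opening quote
    /// assert_eq!(tok.eat_to_delimiter('"'), Some("hello".to_string()));
    /// assert_eq!(tok.eat_to_delimiter('"'), None); // no second closing quote
    /// ```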
pub fn eat_to_delimiter(&mut self, delim: char) -> Option<String> {
let mut token = String::new();
while let Some(c) = self.eat_char() {
match c == delim {
true => {
                    // Exclude the delimiter from the end mark.
                    self.end = self.prev;
return Some(token);
}
false => token.push(c),
}
}
return None;
}
/// Returns true if the remainder of the line is whitespace.
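    ///
    /// A short sketch (`ignore`d):
    ///
    /// ```ignore
    /// let mut tok = Tokeniser::new("a  \nb", None::<std::path::PathBuf>);
    /// tok.eat_char(); // consume 'a'
    /// assert!(tok.end_of_line()); // only spaces remain before the newline
    /// ```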
pub fn end_of_line(&self) -> bool {
        for c in &self.chars[self.mark.i..] {
            if *c == '\n' {
                return true;
            }
            if !c.is_whitespace() {
                return false;
            }
        }
return true;
}
/// Mark the next character to be consumed as the start character.
pub fn mark_start(&mut self) {
self.start = self.mark;
}
/// Mark the most recently consumed character as the start character.
pub fn mark_start_prev(&mut self) {
self.start = self.prev;
}
/// Mark the next character as the character following the end character.
pub fn mark_end(&mut self) {
self.end = self.mark;
}
    /// Mark the most recently consumed character as the character following the end character.
pub fn mark_end_prev(&mut self) {
self.end = self.prev;
}
/// Mark the next character to be consumed as the start of the child.
pub fn mark_child(&mut self) {
self.child = self.mark;
}
/// Return the SourceSpan between the start and end marks.
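    ///
    /// A minimal sketch (`ignore`d; assumes a plain, non-embedded source):
    ///
    /// ```ignore
    /// let mut tok = Tokeniser::new("  let x", None::<std::path::PathBuf>);
    /// tok.eat_whitespace();
    /// tok.mark_start();
    /// tok.eat_token();
    /// assert_eq!(tok.get_source().string, "let");
    /// ```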
pub fn get_source(&mut self) -> SourceSpan {
let in_merged = SourceLocation {
path: self.source_path.to_owned(),
start: self.start.position,
end: self.end.prev_position,
};
        // Map the span back into the embedded source file when it falls
        // inside the embedded region.
        let in_source = if self.start.position.line >= self.embedded_first_line {
if let Some(embedded_path) = &self.embedded_path {
let offset = self.embedded_first_line;
Some(
SourceLocation {
path: Some(embedded_path.to_owned()),
start: SourcePosition {
line: in_merged.start.line.saturating_sub(offset),
column: in_merged.start.column,
},
end: SourcePosition {
line: in_merged.end.line.saturating_sub(offset),
column: in_merged.end.column,
}
}
)
} else {
None
}
} else {
None
};
let string = self.get_chars(&self.start, &self.end).iter().collect();
SourceSpan { string, in_merged, in_source }
}
}
#[derive(Clone, Copy)]
pub struct TokeniserMark {
/// Position of the next character to be consumed.
pub position: SourcePosition,
/// Index of the next character to be consumed.
pub i: usize,
/// Position of the most recently consumed character.
pub prev_position: SourcePosition,
    /// Position of the character consumed before the most recent one.
    pub prev_prev_position: SourcePosition,
}
impl TokeniserMark {
pub const ZERO: Self = Self {
position: SourcePosition::ZERO,
i: 0,
prev_position: SourcePosition::ZERO,
prev_prev_position: SourcePosition::ZERO,
};
/// Advance to the next character.
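    ///
    /// A small sketch of the bookkeeping (`ignore`d; assumes that
    /// `SourcePosition::advance` starts a new line on `'\n'`):
    ///
    /// ```ignore
    /// let mut mark = TokeniserMark::ZERO;
    /// mark.advance('a');
    /// mark.advance('\n');
    /// assert_eq!(mark.i, 2);
    /// // `position` now sits at the start of the next line, while
    /// // `prev_position` still points at the newline character.
    /// ```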
pub fn advance(&mut self, c: char) {
self.prev_prev_position = self.prev_position;
self.prev_position = self.position;
self.position.advance(c);
self.i += 1;
}
    /// Ignore the most recently consumed character, rolling the previous
    /// position back by one; the index and current position are unchanged.
pub fn undo(&mut self) {
self.prev_position = self.prev_prev_position;
}
}