summaryrefslogtreecommitdiff
path: root/src/tokeniser.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokeniser.rs')
-rw-r--r--src/tokeniser.rs167
1 files changed, 114 insertions, 53 deletions
diff --git a/src/tokeniser.rs b/src/tokeniser.rs
index 6ae9055..7ab44f0 100644
--- a/src/tokeniser.rs
+++ b/src/tokeniser.rs
@@ -5,7 +5,7 @@ use std::path::PathBuf;
/// Break a character stream down into individual tokens.
pub struct Tokeniser {
- /// Characters waiting to be parsed, in reverse order.
+ /// Characters waiting to be parsed.
pub chars: Vec<char>,
/// Path of the whole source file.
pub source_path: Option<PathBuf>,
@@ -13,32 +13,53 @@ pub struct Tokeniser {
pub embedded_path: Option<PathBuf>,
/// Line where the embedded source file begins.
pub embedded_first_line: usize,
- /// Position of the next character to be consumed.
- pub position: SourcePosition,
- /// Position of the most recently consumed character.
- pub prev_position: SourcePosition,
+ /// Mark tracking the next character to parse.
+ pub mark: TokeniserMark,
+ /// Mark tracking the most recent character of the current token.
+ pub prev: TokeniserMark,
/// Position of the first character of the current token.
- pub start_position: SourcePosition,
- /// The source characters consumed for the current token.
- pub consumed: String,
+ pub start: TokeniserMark,
+ /// Position after the final character of the current token.
+ pub end: TokeniserMark,
+ /// Position to begin subtokenisation from.
+ pub child: TokeniserMark,
/// List of characters that start a new token.
pub delimiters: Vec<char>,
/// List of characters that terminate a token.
pub terminators: Vec<char>,
}
-
impl Tokeniser {
pub fn new<P: Into<PathBuf>>(source_code: &str, path: Option<P>) -> Self {
Self {
- chars: source_code.chars().rev().collect(),
+ chars: source_code.chars().collect(),
source_path: path.map(|p| p.into()),
embedded_path: None,
embedded_first_line: 0,
- position: SourcePosition::ZERO,
- prev_position: SourcePosition::ZERO,
- start_position: SourcePosition::ZERO,
- consumed: String::new(),
+ mark: TokeniserMark::ZERO,
+ prev: TokeniserMark::ZERO,
+ start: TokeniserMark::ZERO,
+ end: TokeniserMark::ZERO,
+ child: TokeniserMark::ZERO,
+ delimiters: Vec::new(),
+ terminators: Vec::new(),
+ }
+ }
+
+ /// Create a new tokeniser over the characters between the child mark and the end mark.
+ pub fn subtokenise(&mut self) -> Self {
+ let mut start = self.child;
+ start.i = 0;
+ Self {
+ chars: self.get_chars(&self.child, &self.end),
+ source_path: self.source_path.clone(),
+ embedded_path: self.embedded_path.clone(),
+ embedded_first_line: self.embedded_first_line.clone(),
+ mark: start,
+ prev: start,
+ start: start,
+ end: start,
+ child: start,
delimiters: Vec::new(),
terminators: Vec::new(),
}
@@ -52,37 +73,33 @@ impl Tokeniser {
self.terminators.extend_from_slice(terminators);
}
+ pub fn get_chars(&self, start: &TokeniserMark, end: &TokeniserMark) -> Vec<char> {
+ self.chars[start.i..end.i].iter().map(char::to_owned).collect()
+ }
+
/// Return the next character without consuming it.
pub fn peek_char(&self) -> Option<char> {
- self.chars.last().copied()
+ self.chars.get(self.mark.i).copied()
}
/// Consume and return the next character.
pub fn eat_char(&mut self) -> Option<char> {
- let option = self.chars.pop();
+ let option = self.peek_char();
if let Some(c) = option {
- self.prev_position = self.position;
- self.position.advance(c);
- self.consumed.push(c);
+ self.prev = self.mark;
+ self.mark.advance(c);
+ self.mark_end();
}
return option;
}
- /// Remove the next character.
- pub fn drop_char(&mut self) {
- if let Some(c) = self.chars.pop() {
- self.prev_position = self.position;
- self.position.advance(c);
- }
- }
-
- /// Remove whitespace.
- pub fn drop_whitespace(&mut self) {
+ /// Consume whitespace.
+ pub fn eat_whitespace(&mut self) {
while let Some(c) = self.peek_char() {
match c.is_whitespace() {
- true => self.drop_char(),
+ true => self.eat_char(),
false => break,
- }
+ };
}
}
@@ -107,9 +124,11 @@ impl Tokeniser {
pub fn eat_to_delimiter(&mut self, delim: char) -> Option<String> {
let mut token = String::new();
while let Some(c) = self.eat_char() {
- self.consumed.push(c);
match c == delim {
- true => return Some(token),
+ true => {
+ self.end = self.prev;
+ return Some(token);
+ }
false => token.push(c),
}
}
@@ -130,30 +149,38 @@ impl Tokeniser {
}
/// Mark the next character to be consumed as the start character.
- pub fn mark_start_position(&mut self) {
- self.start_position = self.position;
- self.consumed.clear();
- }
-
- /// Mark the previously-consumed character as the start character.
- pub fn mark_prev_start_position(&mut self) {
- self.start_position = self.prev_position;
- let c = self.consumed.chars().last();
- self.consumed.clear();
- // Keep the previously pushed character.
- if let Some(c) = c {
- self.consumed.push(c);
- }
+ pub fn mark_start(&mut self) {
+ self.start = self.mark;
+ }
+
+ /// Mark the most recently consumed character as the start character.
+ pub fn mark_start_prev(&mut self) {
+ self.start = self.prev;
+ }
+
+ /// Mark the next character as the character following the end character.
+ pub fn mark_end(&mut self) {
+ self.end = self.mark;
+ }
+
+ /// Mark the most recently consumed character as the character following the end character.
+ pub fn mark_end_prev(&mut self) {
+ self.end = self.prev;
}
- /// Only call this once per span, it consumes the token string.
- pub fn get_source_span(&mut self) -> SourceSpan {
+ /// Mark the next character to be consumed as the start of the child.
+ pub fn mark_child(&mut self) {
+ self.child = self.mark;
+ }
+
+ /// Return the SourceSpan between the start and end marks.
+ pub fn get_source(&mut self) -> SourceSpan {
let in_merged = SourceLocation {
path: self.source_path.to_owned(),
- start: self.start_position,
- end: self.prev_position,
+ start: self.start.position,
+ end: self.end.prev_position,
};
- let in_source = if self.start_position.line >= self.embedded_first_line {
+ let in_source = if self.start.position.line >= self.embedded_first_line {
if let Some(embedded_path) = &self.embedded_path {
let offset = self.embedded_first_line;
Some(
@@ -176,7 +203,41 @@ impl Tokeniser {
None
};
- let string = std::mem::take(&mut self.consumed);
+ let string = self.get_chars(&self.start, &self.end).iter().collect();
SourceSpan { string, in_merged, in_source }
}
}
+
+
+#[derive(Clone, Copy)]
+pub struct TokeniserMark {
+ /// Position of the next character to be consumed.
+ pub position: SourcePosition,
+ /// Index of the next character to be consumed.
+ pub i: usize,
+ /// Position of the most recently consumed character.
+ pub prev_position: SourcePosition,
+ pub prev_prev_position: SourcePosition,
+}
+
+impl TokeniserMark {
+ pub const ZERO: Self = Self {
+ position: SourcePosition::ZERO,
+ i: 0,
+ prev_position: SourcePosition::ZERO,
+ prev_prev_position: SourcePosition::ZERO,
+ };
+
+ /// Advance to the next character.
+ pub fn advance(&mut self, c: char) {
+ self.prev_prev_position = self.prev_position;
+ self.prev_position = self.position;
+ self.position.advance(c);
+ self.i += 1;
+ }
+
+ /// Ignore the most recently consumed character.
+ pub fn undo(&mut self) {
+ self.prev_position = self.prev_prev_position;
+ }
+}