summaryrefslogtreecommitdiff
path: root/src/tokeniser.rs
blob: 0061e7a4b9176f00b8977bdfeab7151e911357ed (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
use crate::*;

use std::path::PathBuf;


/// Break a character stream down into individual tokens.
///
/// The tokeniser never removes characters from `chars`; instead a set of
/// marks index into the buffer, tracking the cursor (`mark`), the extent of
/// the current token (`start`/`end`) and the origin of a nested
/// sub-tokenisation (`child`).
pub struct Tokeniser {
    /// Characters waiting to be parsed.
    pub chars: Vec<char>,
    /// Path of the whole source file.
    pub source_path: Option<PathBuf>,
    /// Original path of the embedded source file, if one is embedded.
    pub embedded_path: Option<PathBuf>,
    /// Line of the merged source where the embedded source file begins.
    pub embedded_first_line: usize,
    /// Mark tracking the next character to parse (the cursor).
    pub mark: TokeniserMark,
    /// Mark tracking the most recent character of the current token.
    pub prev: TokeniserMark,
    /// Position of the first character of the current token.
    pub start: TokeniserMark,
    /// Position after the final character of the current token.
    pub end: TokeniserMark,
    /// Position to begin subtokenisation from (see `tokenise_child_span`).
    pub child: TokeniserMark,
    /// List of characters that start a new token.
    pub delimiters: Vec<char>,
    /// List of characters that terminate a token.
    pub terminators: Vec<char>,
}

impl Tokeniser {
    /// Create a tokeniser over the characters of `source_code`.
    ///
    /// `path` is recorded as the path of the whole source file. All marks
    /// start at position zero, and no delimiters or terminators are set.
    pub fn new<P: Into<PathBuf>>(source_code: &str, path: Option<P>) -> Self {
        Self {
            chars: source_code.chars().collect(),
            source_path: path.map(|p| p.into()),
            embedded_path: None,
            embedded_first_line: 0,
            mark: TokeniserMark::ZERO,
            prev: TokeniserMark::ZERO,
            start: TokeniserMark::ZERO,
            end: TokeniserMark::ZERO,
            child: TokeniserMark::ZERO,
            delimiters: Vec::new(),
            terminators: Vec::new(),
        }
    }

    /// Create a tokeniser over the span from the child mark to the end mark.
    ///
    /// The new tokeniser keeps the child mark's source position (so reported
    /// locations remain correct) but resets its character index to zero,
    /// because its character buffer begins at the child mark.
    pub fn tokenise_child_span(&mut self) -> Self {
        let mut start = self.child;
        start.i = 0;
        Self {
            chars: self.get_chars(&self.child, &self.end),
            source_path: self.source_path.clone(),
            embedded_path: self.embedded_path.clone(),
            // usize is Copy; no clone is needed here.
            embedded_first_line: self.embedded_first_line,
            mark: start,
            prev: start,
            start,
            end: start,
            child: start,
            delimiters: Vec::new(),
            terminators: Vec::new(),
        }
    }

    /// Register additional characters that start a new token.
    pub fn add_delimiters(&mut self, delimiters: &[char]) {
        self.delimiters.extend_from_slice(delimiters);
    }

    /// Register additional characters that terminate a token.
    pub fn add_terminators(&mut self, terminators: &[char]) {
        self.terminators.extend_from_slice(terminators);
    }

    /// Return the characters between two marks (start inclusive, end exclusive).
    pub fn get_chars(&self, start: &TokeniserMark, end: &TokeniserMark) -> Vec<char> {
        self.chars[start.i..end.i].to_vec()
    }

    /// Return the next character without consuming it.
    pub fn peek_char(&self) -> Option<char> {
        self.chars.get(self.mark.i).copied()
    }

    /// Consume and return the next character, advancing the cursor and the
    /// end mark.
    pub fn eat_char(&mut self) -> Option<char> {
        let option = self.peek_char();
        if let Some(c) = option {
            self.prev = self.mark;
            self.mark.advance(c);
            self.mark_end();
        }
        option
    }

    /// Consume the next characters only if they exactly match `pattern`.
    ///
    /// Returns the matched text, or None (consuming nothing) when the
    /// upcoming characters differ or the source ends mid-pattern.
    pub fn eat_if(&mut self, pattern: &str) -> Option<String> {
        // Check that the upcoming characters match the whole pattern before
        // consuming anything.
        for (i, c) in pattern.chars().enumerate() {
            if self.chars.get(self.mark.i + i) != Some(&c) {
                return None;
            }
        }
        // Consume the matched characters.
        self.prev = self.mark;
        for c in pattern.chars() {
            self.mark.advance(c);
            self.mark_end();
        }
        Some(pattern.to_string())
    }

    /// Consume characters up to the next non-whitespace character.
    pub fn eat_whitespace(&mut self) {
        while let Some(c) = self.peek_char() {
            if !c.is_whitespace() {
                break;
            }
            self.eat_char();
        }
    }

    /// Consume and return a full token from the queue.
    ///
    /// A token ends before whitespace or a delimiter character, and ends
    /// immediately after a terminator character (the terminator is included
    /// in the returned token).
    pub fn eat_token(&mut self) -> String {
        let mut token = String::new();
        while let Some(peek) = self.peek_char() {
            if peek.is_whitespace() || self.delimiters.contains(&peek) {
                break;
            }
            // Unwrap is safe: peek_char just returned Some for this index.
            let c = self.eat_char().unwrap();
            token.push(c);
            if self.terminators.contains(&c) {
                break;
            }
        }
        token
    }

    /// Return all characters found until the predicate returns true.
    ///
    /// The predicate is expected to consume characters itself (e.g. via
    /// `eat_char`); this method only records the span covered. Returns None
    /// if the end of the source is reached before the predicate succeeds.
    pub fn track_until(&mut self, mut predicate: impl FnMut(&mut Self) -> bool) -> Option<String> {
        let start = self.mark;
        let mut end = self.mark;
        while !predicate(self) {
            // Propagate None once the source is exhausted.
            self.peek_char()?;
            end = self.mark;
        }
        self.end = self.prev;
        Some(self.get_chars(&start, &end).iter().collect())
    }

    /// Returns true if only whitespace remains before the next line break
    /// (or the end of the source).
    pub fn end_of_line(&self) -> bool {
        // Scan forward from the cursor. The previous implementation iterated
        // in reverse from the end of the whole buffer, which reported the
        // state of the file's final line rather than the current one.
        for c in &self.chars[self.mark.i..] {
            if *c == '\n' {
                return true;
            }
            if !c.is_whitespace() {
                return false;
            }
        }
        true
    }

    /// Mark the next character to be consumed as the start character.
    pub fn mark_start(&mut self) {
        self.start = self.mark;
    }

    /// Mark the most recently consumed character as the start character.
    pub fn mark_start_prev(&mut self) {
        self.start = self.prev;
    }

    /// Mark the next character as the character following the end character.
    pub fn mark_end(&mut self) {
        self.end = self.mark;
    }

    /// Mark the most recently consumed character as the character following
    /// the end character.
    pub fn mark_end_prev(&mut self) {
        self.end = self.prev;
    }

    /// Mark the next character to be consumed as the start of the child.
    pub fn mark_child(&mut self) {
        self.child = self.mark;
    }

    /// Return the SourceSpan between the start and end marks.
    ///
    /// The span always carries its location in the merged source file, and
    /// additionally carries a location in the embedded source file when the
    /// span begins at or after the first embedded line.
    pub fn get_source(&mut self) -> SourceSpan {
        let in_merged = SourceLocation {
            path: self.source_path.clone(),
            start: self.start.position,
            end: self.end.prev_position,
        };
        // Translate into the embedded file's coordinates by subtracting the
        // line offset at which that file was embedded.
        let in_source = match &self.embedded_path {
            Some(embedded_path) if self.start.position.line >= self.embedded_first_line => {
                let offset = self.embedded_first_line;
                Some(SourceLocation {
                    path: Some(embedded_path.clone()),
                    start: SourcePosition {
                        line: in_merged.start.line.saturating_sub(offset),
                        column: in_merged.start.column,
                    },
                    end: SourcePosition {
                        line: in_merged.end.line.saturating_sub(offset),
                        column: in_merged.end.column,
                    },
                })
            }
            _ => None,
        };

        let string = self.get_chars(&self.start, &self.end).iter().collect();
        SourceSpan { string, in_merged, in_source, child: None }
    }
}


/// A cursor into the tokeniser's character buffer, pairing a character index
/// with a two-deep history of source positions.
#[derive(Clone, Copy)]
pub struct TokeniserMark {
    /// Position of the next character to be consumed.
    pub position: SourcePosition,
    /// Index of the next character to be consumed.
    pub i: usize,
    /// Position of the most recently consumed character.
    pub prev_position: SourcePosition,
    /// Position from one step earlier still; used by `undo` to restore
    /// `prev_position` when the most recent character is ignored.
    pub prev_prev_position: SourcePosition,
}

impl TokeniserMark {
    /// A mark pointing at the very first character of the source.
    pub const ZERO: Self = Self {
        position: SourcePosition::ZERO,
        i: 0,
        prev_position: SourcePosition::ZERO,
        prev_prev_position: SourcePosition::ZERO,
    };

    /// Advance the mark past the character `c`.
    pub fn advance(&mut self, c: char) {
        // Shift the position history back one step, then move forward.
        self.prev_prev_position = std::mem::replace(&mut self.prev_position, self.position);
        self.position.advance(c);
        self.i += 1;
    }

    /// Ignore the most recently consumed character by restoring the earlier
    /// position from the saved history. Note: only the position history is
    /// rewound; the character index is untouched.
    pub fn undo(&mut self) {
        self.prev_position = self.prev_prev_position;
    }
}