diff options
Diffstat (limited to 'src/processors')
-rw-r--r-- | src/processors/compiler.rs | 134 | ||||
-rw-r--r-- | src/processors/mod.rs | 7 | ||||
-rw-r--r-- | src/processors/resolver.rs | 344 | ||||
-rw-r--r-- | src/processors/tokeniser.rs | 260 |
4 files changed, 745 insertions, 0 deletions
diff --git a/src/processors/compiler.rs b/src/processors/compiler.rs new file mode 100644 index 0000000..626dfd3 --- /dev/null +++ b/src/processors/compiler.rs @@ -0,0 +1,134 @@ +use crate::*; + +use log::{info, warn}; +use vagabond::*; + + +/// Compiles multiple source code files into one. +pub struct Compiler { + pub source_path: Option<PathBuf>, + pub resolver: Resolver, + pub parse_symbols: ParseFn, + pub push_code: PushFn, +} + +impl Compiler { + pub fn new(parse_symbols: ParseFn, push_code: PushFn) -> Self { + let resolver = Resolver::new(); + Self { source_path: None, resolver, parse_symbols, push_code } + } + + pub fn root_from_string<P: AsRef<Path>>(&mut self, source_code: String, path: P) { + let source_unit = SourceUnit::from_string(source_code, &path, self.parse_symbols); + self.source_path = Some(path.as_ref().to_path_buf()); + self.resolver.include_source_unit(source_unit, None); + } + + pub fn root_from_path<P: AsRef<Path>>(&mut self, path: P) -> Result<(), FileError> { + let source_unit = SourceUnit::from_path(&path, None, self.parse_symbols)?; + self.source_path = Some(path.as_ref().to_path_buf()); + self.resolver.include_source_unit(source_unit, None); + return Ok(()); + } + + /// Find library files descending from the parent directory. + pub fn include_libs_from_parent(&mut self, ext: &str) { + if let Some(path) = &self.source_path { + if let Some(parent_path) = path.parent() { + let parent_path = parent_path.to_owned(); + self.include_libs_from_path(&parent_path, ext); + } + } + } + + /// Find library files at or descending from a path. + pub fn include_libs_from_path(&mut self, path: &Path, ext: &str) { + let libraries = gather_from_path(path, Some(ext), self.parse_symbols); + self.resolver.add_library_source_units(libraries); + self.resolver.resolve(); + } + + /// Find library files from a PATH-style environment variable. + pub fn include_libs_from_path_variable(&mut self, name: &str, ext: &str) { + let libraries = gather_from_path_variable(name, Some(ext), self.parse_symbols); + self.resolver.add_library_source_units(libraries); + self.resolver.resolve(); + } + + pub fn error(&self) -> Option<ResolverError> { + self.resolver.error() + } + + pub fn hierarchy(&self) -> SourceHierarchy { + self.resolver.hierarchy() + } + + pub fn symbols(&self) -> SourceSymbols { + self.resolver.symbols() + } + + pub fn unused(&self) -> UnusedSymbols { + self.resolver.unused() + } + + pub fn get_compiled_source(&mut self) -> Result<String, MergeError> { + self.resolver.calculate_hierarchy(); + self.resolver.get_merged_source_code(self.push_code) + } +} + + +/// Gather all source units with a given extension using a PATH-style environment variable. +pub fn gather_from_path_variable(variable: &str, extension: Option<&str>, parse: ParseFn) -> Vec<SourceUnit> { + let mut source_units = Vec::new(); + if let Ok(string) = std::env::var(variable) { + for path in string.split(":").map(PathBuf::from) { + info!("Found path {path:?} in environment variable {variable:?}"); + source_units.extend(gather_from_path(&path, extension, parse)); + } + }; + return source_units; +} + +/// Gather source units with a given extension at or descending from a path. +pub fn gather_from_path(path: &Path, extension: Option<&str>, parse: ParseFn) -> Vec<SourceUnit> { + let mut source_units = Vec::new(); + let check_optional_file = |file: &Option<SourceFile>| -> bool { + match file { + Some(file) => match file.symbols { + Some(_) => { info!("Found source file at {:?}", file.path); true } + None => { warn!("Could not parse source file at {:?}", file.path); false } + } + None => true, + } + }; + let mut gather_source_unit = |path: &Path| { + if let Ok(unit) = SourceUnit::from_path(&path, extension, parse) { + if unit.main.symbols.is_some() { + info!("Found source file at {:?}", unit.main.path); + let head_good = check_optional_file(&unit.head); + let tail_good = check_optional_file(&unit.tail); + if head_good && tail_good { + source_units.push(unit); + } + } else { + warn!("Could not parse source file at {path:?}"); + check_optional_file(&unit.head); + check_optional_file(&unit.tail); + } + } + }; + if let Ok(entry) = Entry::from_path(path) { + if EntryType::File == entry.entry_type { + gather_source_unit(&entry.path) + } else if EntryType::Directory == entry.entry_type { + info!("Traversing directory {path:?} for source files"); + if let Ok(entries) = traverse_directory(entry.path) { + for entry in entries { + gather_source_unit(&entry.path) + } + } + } + }; + return source_units; +} diff --git a/src/processors/mod.rs b/src/processors/mod.rs new file mode 100644 index 0000000..a0ce56f --- /dev/null +++ b/src/processors/mod.rs @@ -0,0 +1,7 @@ +mod compiler; +mod resolver; +mod tokeniser; + +pub use compiler::*; +pub use resolver::*; +pub use tokeniser::*; diff --git a/src/processors/resolver.rs b/src/processors/resolver.rs new file mode 100644 index 0000000..6c1d0f1 --- /dev/null +++ b/src/processors/resolver.rs @@ -0,0 +1,344 @@ +use crate::*; + +use log::{info, error}; + +use std::collections::HashSet; + + +/// Resolve undeclared symbols in a source unit with definitions from other units. +pub struct Resolver { + /// Definitions from all included source units. + pub definitions: Vec<TrackedDefinition>, + /// All resolved references in all included source units. + pub resolved: Vec<TrackedReference>, + /// All unresolved references in all included source units. + pub unresolved: Vec<TrackedSymbol>, + /// All redefined (duplicate) definitions in all included source units. + pub redefinitions: Vec<TrackedRedefinition>, + /// All included source units. + pub source_units: Vec<HeirarchicalSourceUnit>, + /// The 'source_units' indices of the root source units. + pub root_unit_ids: Vec<usize>, + /// Source units that can be included later to resolve symbols. + pub library_source_units: Vec<SourceUnit>, +} + + +impl Resolver { + pub fn new() -> Self { + Self { + definitions: Vec::new(), + resolved: Vec::new(), + unresolved: Vec::new(), + redefinitions: Vec::new(), + source_units: Vec::new(), + root_unit_ids: Vec::new(), + library_source_units: Vec::new(), + } + } + + pub fn include_source_unit(&mut self, mut source_unit: SourceUnit, parent_id: Option<usize>) { + let source_id = self.source_units.len(); + use std::mem::take; + + info!("Including source unit at {:?}", source_unit.main.path); + if let Some(symbols) = &mut source_unit.main.symbols { + self.include_symbols(take(symbols), source_id, SourceRole::Main); + } + if let Some(head) = &mut source_unit.head { + if let Some(symbols) = &mut head.symbols { + self.include_symbols(take(symbols), source_id, SourceRole::Head); + } + } + if let Some(tail) = &mut source_unit.tail { + if let Some(symbols) = &mut tail.symbols { + self.include_symbols(take(symbols), source_id, SourceRole::Tail); + } + } + + match parent_id { + Some(parent_id) => match self.source_units.get_mut(parent_id) { + Some(parent) => { parent.child_ids.insert(source_id); } + None => { error!("Could not find parent (#{parent_id}) of source unit #{source_id}"); } + } + None => self.root_unit_ids.push(source_id), + } + self.source_units.push( + HeirarchicalSourceUnit { + source_unit, + child_ids: HashSet::new(), + parent_ids: HashSet::new(), + } + ); + } + + fn include_symbols(&mut self, symbols: Vec<Symbol>, source_id: usize, source_role: SourceRole) { + for symbol in symbols { + match symbol.role { + SymbolRole::Definition(_) => { + // Check if the symbol has already been defined. + let equal = |d: &TrackedDefinition| { &d.tracked.symbol == &symbol }; + if let Some(original) = self.definitions.iter().position(equal) { + let tracked = TrackedSymbol { symbol, source_id, source_role }; + let definition = original; + self.redefinitions.push(TrackedRedefinition { tracked, definition }); + } else { + // Resolve all unresolved references that match this symbol. + let defines = |r: &mut TrackedSymbol| symbol.defines(&r.symbol); + let definition = self.definitions.len(); + let mut references = Vec::new(); + for tracked in self.unresolved.extract_if(.., defines) { + references.push(self.resolved.len()); + self.resolved.push(TrackedReference { tracked, definition }); + } + let tracked = TrackedSymbol { symbol, source_id, source_role }; + self.definitions.push(TrackedDefinition { tracked, references }); + } + } + SymbolRole::Reference => { + let tracked = TrackedSymbol { symbol, source_id, source_role }; + let defines = |d: &TrackedDefinition| d.tracked.symbol.defines(&tracked.symbol); + match self.definitions.iter().position(defines) { + Some(index) => { + let definition = &mut self.definitions[index]; + definition.references.push(self.resolved.len()); + self.resolved.push(TrackedReference { tracked, definition: index }) + } + None => self.unresolved.push(tracked), + } + } + } + } + } + + /// Add a set of source units that might contain definitions for unresolved symbols. + pub fn add_library_source_units(&mut self, source_units: Vec<SourceUnit>) { + for source_unit in source_units { + // Discard source units that have already been included. + let in_included = self.source_units.iter().any(|s| s.source_unit == source_unit); + let in_library = self.library_source_units.iter().any(|s| *s == source_unit); + if !in_included && !in_library { + self.library_source_units.push(source_unit); + } + } + } + + /// Attempt to resolve unresolved references with library source units. + pub fn resolve(&mut self) { + // Repeatedly test if each library source unit resolves an unresolved + // symbol, breaking the loop when no new resolutions are found. + 'outer: loop { + for (i, source_unit) in self.library_source_units.iter().enumerate() { + if let Some(id) = self.unit_resolved_by_unit(&source_unit) { + let source_unit = self.library_source_units.remove(i); + self.include_source_unit(source_unit, Some(id)); + continue 'outer; + } + } + break; + } + } + + /// Returns true if all references in all included source units have been + /// resolved. + pub fn error(&self) -> Option<ResolverError> { + match self.unresolved.is_empty() && self.redefinitions.is_empty() { + true => None, + false => Some(ResolverError { resolver: self }) + } + } + + /// Return a type that can print all symbol definitions and references. + pub fn symbols(&self) -> SourceSymbols { + SourceSymbols { resolver: self } + } + + /// Return a type that can print all unused symbol definitions. + pub fn unused(&self) -> UnusedSymbols { + UnusedSymbols { resolver: self } + } + + /// Return a type that can print the structure of the source tree. + pub fn hierarchy(&self) -> SourceHierarchy { + SourceHierarchy { resolver: self } + } + + /// Return the ID of a source unit that contains an unresolved reference + /// to a symbol defined by this unit. + fn unit_resolved_by_unit(&self, source_unit: &SourceUnit) -> Option<usize> { + macro_rules! return_resolved_unit { + ($file:expr) => { + if let Some(symbols) = &$file.symbols { + if let Some(id) = self.unit_resolved_by_symbol(&symbols) { + return Some(id); + } + } + }; + } + return_resolved_unit!(&source_unit.main); + if let Some(head) = &source_unit.head { return_resolved_unit!(&head) } + if let Some(tail) = &source_unit.tail { return_resolved_unit!(&tail) } + return None; + } + + /// Returns the ID of a source unit that contains an unresolved reference + /// to a symbol defined by one of these symbols. + fn unit_resolved_by_symbol(&self, symbols: &[Symbol]) -> Option<usize> { + for symbol in symbols { + if let SymbolRole::Definition(_) = symbol.role { + for unresolved in &self.unresolved { + if unresolved.symbol.name == symbol.name { + return Some(unresolved.source_id); + } + } + } + } + return None; + } + + /// Populate the .parent_ids field of every source unit. The parents of + /// each source unit are the units that define a symbol referenced by the + /// unit, where the definition type is MustPrecedeReference. + pub fn calculate_hierarchy(&mut self) { + // Clear the .parent_ids field of every source unit. + for source_unit in &mut self.source_units { + source_unit.parent_ids.clear(); + } + // Populate the .parent_ids field of every source unit. + for reference in &self.resolved { + let predicate = |d: &&TrackedDefinition| d.tracked.symbol.name == reference.tracked.symbol.name; + if let Some(definition) = self.definitions.iter().find(predicate) { + // A unit cannot be its own parent. + let is_self = reference.tracked.source_id == definition.tracked.source_id; + let must_precede = SymbolRole::Definition(DefinitionType::MustPrecedeReference); + if is_self || definition.tracked.symbol.role != must_precede { continue; } + let referencing_unit = &mut self.source_units[reference.tracked.source_id]; + referencing_unit.parent_ids.insert(definition.tracked.source_id); + }; + } + } + + /// Concatenate all included source units into one string. + /// If the source unit dependency graph contains a cycle, the IDs of the + /// source units involved in the cycle will be returned. + pub fn get_merged_source_code(&self, push: PushFn) -> Result<String, MergeError> { + // The ID of each source unit will come after the IDs of all + // parents of that unit. + let head_order = { + let mut included_source_ids = Vec::new(); + let mut remaining_source_ids = (0..self.source_units.len()).collect::<Vec<_>>(); + + 'restart: while !remaining_source_ids.is_empty() { + // Iterate over source units, most-recently-included first. + 'next: for (i, id) in remaining_source_ids.iter().enumerate() { + let unit = &self.source_units[*id]; + for parent_id in &unit.parent_ids { + // Skip this unit if a parent hasn't yet been included. + if !included_source_ids.contains(parent_id) { + continue 'next; + } + } + // Include this unit, then check remaining units from the start. + included_source_ids.push(*id); + remaining_source_ids.remove(i); + continue 'restart; + } + // All remaining source units depend on at least one remaining + // source unit, indicating a dependency cycle. + return Err(MergeError { + resolver: self, + cyclic_unit_ids: remaining_source_ids, + }); + } + included_source_ids + }; + let mut source_code = String::new(); + + // Push head source code in calculated parent-preceding order. + for id in &head_order { + let source_unit = &self.source_units[*id]; + if let Some(head) = &source_unit.source_unit.head { + push(&mut source_code, head); + } + } + // Push main source code in source-added order. + // The root unit will be pushed first. + for source_unit in self.source_units.iter() { + let main = &source_unit.source_unit.main; + push(&mut source_code, &main); + } + // Push tail source code in reverse source-added order. + // The root unit will be pushed last. + for source_unit in self.source_units.iter().rev() { + if let Some(tail) = &source_unit.source_unit.tail { + push(&mut source_code, tail); + } + } + return Ok(source_code); + } +} + + +/// A source unit tracked with pointers to parents and dependents. +pub struct HeirarchicalSourceUnit { + pub source_unit: SourceUnit, + /// Pointers to source units that resolve references this unit. + pub child_ids: HashSet<usize>, + /// Pointers to source units that must be included before this unit. + pub parent_ids: HashSet<usize>, +} + +pub struct TrackedDefinition { + pub tracked: TrackedSymbol, + /// Points to resolved references in `resolved`. + pub references: Vec<usize>, +} + +pub struct TrackedRedefinition { + pub tracked: TrackedSymbol, + /// Points to existing definition in `definitions`. + pub definition: usize, +} + +pub struct TrackedReference { + pub tracked: TrackedSymbol, + /// Points to resolving definition in `definitions`. + pub definition: usize, +} + +pub struct TrackedSymbol { + pub symbol: Symbol, + pub source_id: usize, + pub source_role: SourceRole, +} + +impl TrackedSymbol { + pub fn context<'a>(&'a self, resolver: &'a Resolver) -> Context<'a> { + let source_unit = &resolver.source_units[self.source_id].source_unit; + let source_code = match self.source_role { + SourceRole::Main => source_unit.main.source_code.as_str(), + SourceRole::Head => match &source_unit.head { + Some(head) => head.source_code.as_str(), + None => unreachable!("Failed to find source code of head file"), + } + SourceRole::Tail => match &source_unit.tail { + Some(tail) => tail.source_code.as_str(), + None => unreachable!("Failed to find source code of tail file"), + } + }; + Context { source_code, source: &self.symbol.source } + } +} + +impl PartialEq for TrackedSymbol { + fn eq(&self, other: &TrackedSymbol) -> bool { + self.symbol.name.eq(&other.symbol.name) + } +} + +#[derive(Clone, Copy, Debug)] +pub enum SourceRole { + Main, + Head, + Tail, +} diff --git a/src/processors/tokeniser.rs b/src/processors/tokeniser.rs new file mode 100644 index 0000000..0350afe --- /dev/null +++ b/src/processors/tokeniser.rs @@ -0,0 +1,260 @@ +use crate::*; + +use std::path::PathBuf; + + +/// Break a character stream down into individual tokens. +pub struct Tokeniser { + /// All characters to be parsed, characters are never removed. + pub chars: Vec<char>, + /// Path of the whole source file. + pub source_path: Option<PathBuf>, + /// Original path of the embedded source file. + pub embedded_path: Option<PathBuf>, + /// Line where the embedded source file begins. + pub embedded_first_line: usize, + /// Mark tracking the next character to parse. + pub mark: TokeniserMark, + /// Mark tracking the most recent character of the current token. + pub prev: TokeniserMark, + /// Position of the first character of the current token. + pub start: TokeniserMark, + /// Position after the final character of the current token. + pub end: TokeniserMark, + /// Position to begin subtokenisation from. + pub child: TokeniserMark, + /// List of characters that start a new token. + pub delimiters: Vec<char>, + /// List of characters that terminate a token. + pub terminators: Vec<char>, +} + +impl Tokeniser { + pub fn new<P: Into<PathBuf>>(source_code: &str, path: Option<P>) -> Self { + Self { + chars: source_code.chars().collect(), + source_path: path.map(|p| p.into()), + embedded_path: None, + embedded_first_line: 0, + mark: TokeniserMark::ZERO, + prev: TokeniserMark::ZERO, + start: TokeniserMark::ZERO, + end: TokeniserMark::ZERO, + child: TokeniserMark::ZERO, + delimiters: Vec::new(), + terminators: Vec::new(), + } + } + + /// Create a tokeniser from child to end. + pub fn tokenise_child_span(&mut self) -> Self { + let mut start = self.child; + start.i = 0; + Self { + chars: self.get_chars(&self.child, &self.end), + source_path: self.source_path.clone(), + embedded_path: self.embedded_path.clone(), + embedded_first_line: self.embedded_first_line.clone(), + mark: start, + prev: start, + start: start, + end: start, + child: start, + delimiters: Vec::new(), + terminators: Vec::new(), + } + } + + pub fn add_delimiters(&mut self, delimiters: &[char]) { + self.delimiters.extend_from_slice(delimiters); + } + + pub fn add_terminators(&mut self, terminators: &[char]) { + self.terminators.extend_from_slice(terminators); + } + + pub fn get_chars(&self, start: &TokeniserMark, end: &TokeniserMark) -> Vec<char> { + self.chars[start.i..end.i].iter().map(char::to_owned).collect() + } + + /// Return the next character without consuming it. + pub fn peek_char(&self) -> Option<char> { + self.chars.get(self.mark.i).copied() + } + + /// Consume and return the next character. + pub fn eat_char(&mut self) -> Option<char> { + let option = self.peek_char(); + if let Some(c) = option { + self.prev = self.mark; + self.mark.advance(c); + self.mark_end(); + } + return option; + } + + /// Consume next characters if they match a pattern. + pub fn eat_if(&mut self, pattern: &str) -> Option<String> { + // Check that next characters match the pattern. + for (i, c) in pattern.chars().enumerate() { + if let Some(d) = self.chars.get(self.mark.i + i) { + if c == *d { + continue; + } + } + return None; + } + // Consume the next characters. + self.prev = self.mark; + for c in pattern.chars() { + self.mark.advance(c); + self.mark_end(); + } + return Some(pattern.to_string()); + } + + /// Consume whitespace. + pub fn eat_whitespace(&mut self) { + while let Some(c) = self.peek_char() { + match c.is_whitespace() { + true => self.eat_char(), + false => break, + }; + } + } + + /// Remove a full token from the queue. + pub fn eat_token(&mut self) -> String { + let mut token = String::new(); + while let Some(peek) = self.peek_char() { + if peek.is_whitespace() || self.delimiters.contains(&peek) { + break; + } + let c = self.eat_char().unwrap(); + token.push(c); + if self.terminators.contains(&c) { + break; + } + } + return token; + } + + /// Return all characters found until the predicate returns true. + /// Returns None if end of source is reached before delimiter is found. + pub fn track_until(&mut self, mut predicate: impl FnMut(&mut Self) -> bool) -> Option<String> { + let start = self.mark; + let mut end = self.mark; + while !predicate(self) { + self.peek_char()?; + end = self.mark; + } + self.end = self.prev; + return Some(self.get_chars(&start, &end).iter().collect()); + } + + /// Returns true if the remainder of the line is whitespace. + pub fn end_of_line(&self) -> bool { + for c in &self.chars[self.mark.i..] { + if *c == '\n' { + return true; + } + if !c.is_whitespace() { + return false + } + } + return true; + } + + /// Mark the next character to be consumed as the start character. + pub fn mark_start(&mut self) { + self.start = self.mark; + } + + /// Mark the most recently consumed character as the start character. + pub fn mark_start_prev(&mut self) { + self.start = self.prev; + } + + /// Mark the next character as the character following the end character. + pub fn mark_end(&mut self) { + self.end = self.mark; + } + + /// Mark the next character as the character following the end character. + pub fn mark_end_prev(&mut self) { + self.end = self.prev; + } + + /// Mark the next character to be consumed as the start of the child. + pub fn mark_child(&mut self) { + self.child = self.mark; + } + + /// Return the SourceSpan between the start and end marks. + pub fn get_source(&mut self) -> SourceSpan { + let in_merged = SourceLocation { + path: self.source_path.to_owned(), + start: self.start.position, + end: self.end.prev_position, + }; + let in_source = if self.start.position.line >= self.embedded_first_line { + if let Some(embedded_path) = &self.embedded_path { + let offset = self.embedded_first_line; + Some( + SourceLocation { + path: Some(embedded_path.to_owned()), + start: SourcePosition { + line: in_merged.start.line.saturating_sub(offset), + column: in_merged.start.column, + }, + end: SourcePosition { + line: in_merged.end.line.saturating_sub(offset), + column: in_merged.end.column, + } + } + ) + } else { + None + } + } else { + None + }; + + let string = self.get_chars(&self.start, &self.end).iter().collect(); + SourceSpan { string, in_merged, in_source, child: None } + } +} + + +#[derive(Clone, Copy)] +pub struct TokeniserMark { + /// Position of the next character to be consumed. + pub position: SourcePosition, + /// Index of the next character to be consumed. + pub i: usize, + /// Position of the most recently consumed character. + pub prev_position: SourcePosition, + pub prev_prev_position: SourcePosition, +} + +impl TokeniserMark { + pub const ZERO: Self = Self { + position: SourcePosition::ZERO, + i: 0, + prev_position: SourcePosition::ZERO, + prev_prev_position: SourcePosition::ZERO, + }; + + /// Advance to the next character. + pub fn advance(&mut self, c: char) { + self.prev_prev_position = self.prev_position; + self.prev_position = self.position; + self.position.advance(c); + self.i += 1; + } + + /// Ignore the most recently consumed character. + pub fn undo(&mut self) { + self.prev_position = self.prev_prev_position; + } +} |