diff options
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | Cargo.lock | 68 | ||||
-rw-r--r-- | Cargo.toml | 7 | ||||
-rw-r--r-- | src/highlighter.rs | 126 | ||||
-rw-r--r-- | src/lib.rs | 7 | ||||
-rw-r--r-- | src/span.rs | 11 | ||||
-rw-r--r-- | src/template.rs | 25 |
7 files changed, 245 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..5a86a3a --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,68 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "fancy-regex" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "highlight" +version = "0.1.0" +dependencies = [ + "fancy-regex", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..d26dd5f
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "highlight"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+fancy-regex = "0.14.0"
diff --git a/src/highlighter.rs b/src/highlighter.rs
new file mode 100644
index 0000000..40e618e
--- /dev/null
+++ b/src/highlighter.rs
@@ -0,0 +1,154 @@
+use crate::*;
+
+use std::collections::HashMap;
+
+
+/// An ordered list of templates used to split text into tagged spans.
+pub struct Highlighter {
+    /// Templates in priority order; `highlight` uses the first one that matches.
+    pub templates: Vec<Template>,
+}
+
+impl Highlighter {
+    /// Parses a template set with one `name = pattern` definition per line.
+    ///
+    /// Blank lines and lines starting with `#` are skipped. A name of the form
+    /// `<name>` defines a variable that later patterns can reference; any
+    /// other name becomes the tag of a new template.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a line has no `=`, if a pattern has an unterminated or
+    /// undefined `<name>` reference, or if a pattern is not a valid regex.
+    pub fn from_str(template_set: &str) -> Self {
+        let mut variables = HashMap::new();
+        let mut templates = Vec::new();
+
+        for line in template_set.lines().map(str::trim) {
+            if line.is_empty() || line.starts_with('#') {
+                continue;
+            }
+            let (left, right) = line.split_once('=').expect("Missing '=' character");
+            let name = left.trim().to_string();
+            let pattern = replace_variables(right.trim(), &variables);
+            if name.starts_with('<') && name.ends_with('>') {
+                variables.insert(name, pattern);
+            } else {
+                templates.push(Template::from_str(&pattern, name));
+            }
+        }
+        Self { templates }
+    }
+
+    /// Splits `text` into consecutive spans tagged by the first matching template.
+    ///
+    /// Templates are tried in order at the current position. Text inside a
+    /// capture group gets the corresponding subtag (or the template tag when
+    /// there is none), text that no template matches gets an empty tag, and
+    /// adjacent spans with equal tags are merged.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the regex engine returns an error while matching.
+    pub fn highlight(&self, text: &str) -> Vec<Span> {
+        let mut remaining = text;
+        let mut spans = Vec::new();
+
+        'outer: while !remaining.is_empty() {
+            for template in &self.templates {
+                let Some(captures) = template.expression.captures(remaining).unwrap() else {
+                    continue;
+                };
+                // An empty match would never advance, so treat it as no match.
+                let matched = &captures[0];
+                if matched.is_empty() {
+                    continue;
+                }
+                let mut cursor = 0;
+                // NOTE(review): groups that did not participate are dropped
+                // before numbering, so subtags go to the participating groups
+                // in order -- confirm this is the intended pairing.
+                for (index, group) in captures.iter().skip(1).flatten().enumerate() {
+                    // Clamp to the unconsumed part of the match. A nested group
+                    // starts before `cursor` and a look-ahead group can lie
+                    // past the match; unclamped, those duplicate text or panic.
+                    let start = group.start().clamp(cursor, matched.len());
+                    let end = group.end().clamp(start, matched.len());
+                    push_span(&mut spans, &template.tag, &matched[cursor..start]);
+                    let tag = template.subtags.get(index).unwrap_or(&template.tag);
+                    push_span(&mut spans, tag, &matched[start..end]);
+                    cursor = end;
+                }
+                push_span(&mut spans, &template.tag, &matched[cursor..]);
+                remaining = &remaining[matched.len()..];
+                continue 'outer;
+            }
+            // No template matched here: emit one character as untagged text.
+            if let Some(c) = remaining.chars().next() {
+                let (head, tail) = remaining.split_at(c.len_utf8());
+                push_span(&mut spans, "", head);
+                remaining = tail;
+            }
+        }
+        spans
+    }
+}
+
+
+/// Appends `text` under `tag`, extending the last span when its tag is equal.
+///
+/// Empty text is ignored so it can never split two spans that should merge.
+fn push_span(spans: &mut Vec<Span>, tag: &str, text: &str) {
+    if text.is_empty() {
+        return;
+    }
+    match spans.last_mut() {
+        Some(last) if last.tag == tag => last.text.push_str(text),
+        _ => spans.push(Span { tag: tag.to_string(), text: text.to_string() }),
+    }
+}
+
+
+/// Expands every `<name>` reference in `pattern` to `(?:definition)`.
+///
+/// A `<` is copied through unchanged when it is escaped with a backslash or
+/// directly follows an unescaped `(?`, so look-behind and named-group syntax
+/// is not mistaken for a variable reference.
+///
+/// # Panics
+///
+/// Panics if a reference has no closing `>` or names an undefined variable.
+fn replace_variables(pattern: &str, variables: &HashMap<String, String>) -> String {
+    let mut output = String::with_capacity(pattern.len());
+    let mut chars = pattern.chars();
+    // How much of an unescaped `(?` prefix immediately precedes the cursor.
+    let mut group_prefix = 0;
+    while let Some(c) = chars.next() {
+        match c {
+            '<' if group_prefix < 2 => {
+                let rest = chars.as_str();
+                let end = rest.find('>').expect("Missing '>' character");
+                let name = format!("<{}>", &rest[..end]);
+                let definition = variables
+                    .get(&name)
+                    .unwrap_or_else(|| panic!("Missing definition for {name:?}"));
+                output.push_str("(?:");
+                output.push_str(definition);
+                output.push(')');
+                chars = rest[end + 1..].chars();
+            }
+            // Copy an escape pair verbatim so `\<` never starts a reference.
+            '\\' => {
+                output.push(c);
+                output.extend(chars.next());
+            }
+            _ => output.push(c),
+        }
+        group_prefix = match (c, group_prefix) {
+            ('(', _) => 1,
+            ('?', 1) => 2,
+            _ => 0,
+        };
+    }
+    output
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..7a40c94
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,7 @@
+mod span;
+mod template;
+mod highlighter;
+
+pub use span::*;
+pub use template::*;
+pub use highlighter::*;
diff --git a/src/span.rs b/src/span.rs
new file mode 100644
index 0000000..5282cae
--- /dev/null
+++ b/src/span.rs
@@ -0,0 +1,15 @@
+/// A run of text labelled with a tag.
+pub struct Span {
+    /// Tag or subtag name; empty for text that no template matched.
+    pub tag: String,
+    /// The text covered by this span.
+    pub text: String,
+}
+
+impl std::fmt::Debug for Span {
+    /// Writes the quoted tag right-aligned to 20 columns, then the quoted text.
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let tag = format!("{:?}", self.tag);
+        write!(f, "{tag:>20}: {:?}", self.text)
+    }
+}
diff --git a/src/template.rs b/src/template.rs
new file mode 100644
index 0000000..67eaceb
--- /dev/null
+++ b/src/template.rs
@@ -0,0 +1,37 @@
+use fancy_regex::*;
+
+
+/// A tagged regular expression with optional per-capture-group subtags.
+pub struct Template {
+    /// Tag applied to matched text that is not given a subtag.
+    pub tag: String,
+    /// Subtags listed in parentheses after the tag, in order.
+    pub subtags: Vec<String>,
+    /// The pattern, anchored so that it only matches at the start of the input.
+    pub expression: Regex,
+}
+
+impl Template {
+    /// Compiles `pattern` anchored at the start of the input and parses `tag`.
+    ///
+    /// A tag written as `name(a, b)` is split into the tag `name` and the
+    /// subtags `a` and `b`; any other tag is stored as given with no subtags.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `pattern` is not a valid regular expression.
+    pub fn from_str(pattern: &str, tag: String) -> Self {
+        let pattern = format!("^(?:{pattern})");
+        let expression = Regex::new(&pattern)
+            .unwrap_or_else(|error| panic!("Invalid pattern {pattern:?}: {error}"));
+
+        if let Some((head, tail)) = tag.split_once('(') {
+            if let Some(tail) = tail.strip_suffix(')') {
+                let tag = head.trim().to_string();
+                let subtags = tail.split(',').map(|t| t.trim().to_string()).collect();
+                return Self { tag, subtags, expression };
+            }
+        }
+        Self { tag, subtags: Vec::new(), expression }
+    }
+}