summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 1
-rw-r--r-- Cargo.lock 68
-rw-r--r-- Cargo.toml 7
-rw-r--r-- src/highlighter.rs 126
-rw-r--r-- src/lib.rs 7
-rw-r--r-- src/span.rs 11
-rw-r--r-- src/template.rs 25
7 files changed, 245 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..5a86a3a
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,68 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "aho-corasick"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "bit-set"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
+dependencies = [
+ "bit-vec",
+]
+
+[[package]]
+name = "bit-vec"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
+
+[[package]]
+name = "fancy-regex"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298"
+dependencies = [
+ "bit-set",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "highlight"
+version = "0.1.0"
+dependencies = [
+ "fancy-regex",
+]
+
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
+[[package]]
+name = "regex-automata"
+version = "0.4.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..d26dd5f
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "highlight"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+fancy-regex = "0.14.0"
diff --git a/src/highlighter.rs b/src/highlighter.rs
new file mode 100644
index 0000000..40e618e
--- /dev/null
+++ b/src/highlighter.rs
@@ -0,0 +1,126 @@
+use crate::*;
+
+use std::collections::{HashMap, VecDeque};
+
+
+/// Splits text into tagged spans using an ordered list of templates.
+pub struct Highlighter {
+    /// Templates in definition order; at each position they are tried first to last.
+    pub templates: Vec<Template>,
+}
+
+impl Highlighter {
+    /// Builds a highlighter from a line-based template set.
+    ///
+    /// Blank lines and lines starting with `#` are skipped. Every other line
+    /// is split at its first `=` into a name and a pattern, and variable
+    /// references in the pattern are expanded with the variables defined so
+    /// far. A name that starts with `<` and ends with `>` defines a variable;
+    /// any other name is handed to `Template::from_str` as the template tag.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a line contains no `=`, and whenever `replace_variables` or
+    /// `Template::from_str` panics for the line's pattern.
+    pub fn from_str(template_set: &str) -> Self {
+        let mut variables = HashMap::new();
+        let mut templates = Vec::new();
+
+        for line in template_set.lines().map(str::trim) {
+            if line.is_empty() || line.starts_with('#') {
+                continue;
+            }
+            let (left, right) = line.split_once('=').expect("Missing '=' character");
+            let name = left.trim().to_string();
+            let pattern = replace_variables(right.trim(), &variables);
+            if name.starts_with('<') && name.ends_with('>') {
+                variables.insert(name, pattern);
+            } else {
+                templates.push(Template::from_str(&pattern, name));
+            }
+        }
+        Self { templates }
+    }
+
+    /// Splits `text` into spans, tagging every region matched by a template.
+    ///
+    /// At each position the templates are tried in order and the first one
+    /// that produces a non-empty match wins. Text inside a capture group gets
+    /// the subtag with the same index as the group (falling back to the
+    /// template tag when there is no such subtag); the rest of the match gets
+    /// the template tag. Text that no template matches is emitted with an
+    /// empty tag. Adjacent spans with equal tags are merged before returning.
+    ///
+    /// Subtags are looked up by capture-group number, so a group that did not
+    /// take part in the match does not shift the subtags of later groups.
+    /// Groups that overlap text already emitted (nested groups) or that reach
+    /// past the end of the match (groups inside a lookahead) are ignored, so
+    /// the returned spans always concatenate back to `text`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the regex engine reports an error while matching.
+    pub fn highlight(&self, text: &str) -> Vec<Span> {
+        let mut remaining = text;
+        let mut accumulator = String::new();
+        let mut spans: Vec<Span> = Vec::new();
+
+        'outer: while !remaining.is_empty() {
+            for template in &self.templates {
+                let Some(captures) = template.expression.captures(remaining).unwrap() else {
+                    continue;
+                };
+                // Offsets below are relative to the whole match; this code
+                // relies on the match beginning at the start of `remaining`.
+                let matched = &captures[0];
+                if matched.is_empty() {
+                    continue;
+                }
+
+                // Bank the accumulator as an untagged span.
+                if !accumulator.is_empty() {
+                    spans.push(Span {
+                        tag: String::new(),
+                        text: std::mem::take(&mut accumulator),
+                    });
+                }
+
+                // Walk the capture groups (group 0 is the whole match), keeping
+                // each group's own index so it lines up with `subtags`.
+                let mut cursor = 0;
+                for (index, group) in captures.iter().skip(1).enumerate() {
+                    let Some(group) = group else { continue };
+                    // Ignore groups whose text was already emitted or that lie
+                    // outside the matched region.
+                    if group.start() < cursor || group.end() > matched.len() {
+                        continue;
+                    }
+                    // Text between the previous group and this one.
+                    if group.start() > cursor {
+                        spans.push(Span {
+                            tag: template.tag.clone(),
+                            text: matched[cursor..group.start()].to_string(),
+                        });
+                    }
+                    // Text inside this group.
+                    let tag = template.subtags.get(index).unwrap_or(&template.tag).clone();
+                    spans.push(Span {
+                        tag,
+                        text: matched[group.start()..group.end()].to_string(),
+                    });
+                    cursor = group.end();
+                }
+                // Text after the last group.
+                if cursor < matched.len() {
+                    spans.push(Span {
+                        tag: template.tag.clone(),
+                        text: matched[cursor..].to_string(),
+                    });
+                }
+
+                let consumed = matched.len();
+                remaining = &remaining[consumed..];
+                continue 'outer;
+            }
+
+            // No template matched here: move one character into the accumulator.
+            if let Some(c) = remaining.chars().next() {
+                remaining = &remaining[c.len_utf8()..];
+                accumulator.push(c);
+            }
+        }
+        // Bank whatever untagged text is left.
+        if !accumulator.is_empty() {
+            spans.push(Span { tag: String::new(), text: accumulator });
+        }
+
+        // Merge adjacent spans that have the same tag.
+        let mut merged: Vec<Span> = Vec::with_capacity(spans.len());
+        for span in spans {
+            match merged.last_mut() {
+                Some(last) if last.tag == span.tag => last.text.push_str(&span.text),
+                _ => merged.push(span),
+            }
+        }
+        merged
+    }
+}
+
+
+/// Expands every `<name>` reference in `pattern` using `variables`.
+///
+/// A reference runs from a `<` to the next `>`; the lookup key includes both
+/// brackets. Each reference is replaced by the variable's definition wrapped
+/// in a non-capturing group, `(?:definition)`. All other text is copied
+/// through unchanged. Every `<` is treated as the start of a reference, so
+/// regex syntax that itself contains `<` is also interpreted this way.
+///
+/// # Panics
+///
+/// Panics if a `<` has no closing `>`, or if the referenced name has no
+/// entry in `variables`.
+fn replace_variables(pattern: &str, variables: &HashMap<String, String>) -> String {
+    let mut output = String::with_capacity(pattern.len());
+    let mut rest = pattern;
+
+    while let Some(open) = rest.find('<') {
+        // Copy the literal text in front of the reference.
+        output.push_str(&rest[..open]);
+
+        let reference = &rest[open..];
+        let Some(close) = reference.find('>') else {
+            panic!("Missing '>' character");
+        };
+        // `>` is one byte long, so `..=close` ends exactly after it.
+        let name = &reference[..=close];
+        let definition = variables
+            .get(name)
+            .unwrap_or_else(|| panic!("Missing definition for {name:?}"));
+
+        output.push_str("(?:");
+        output.push_str(definition);
+        output.push(')');
+
+        rest = &reference[close + 1..];
+    }
+
+    output.push_str(rest);
+    output
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..7a40c94
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,7 @@
+mod span;
+mod template;
+mod highlighter;
+
+pub use span::*;
+pub use template::*;
+pub use highlighter::*;
diff --git a/src/span.rs b/src/span.rs
new file mode 100644
index 0000000..5282cae
--- /dev/null
+++ b/src/span.rs
@@ -0,0 +1,11 @@
+/// A run of text together with the tag assigned to it.
+///
+/// An empty `tag` is used for text that is not tagged.
+#[derive(Clone, PartialEq, Eq)]
+pub struct Span {
+    /// Tag name for this run of text.
+    pub tag: String,
+    /// The text covered by this span.
+    pub text: String,
+}
+
+impl std::fmt::Debug for Span {
+    /// Formats the span as `<tag>: <text>`, both in their quoted `Debug`
+    /// form, with the quoted tag right-aligned in a 20-column field.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Render the tag first so the padding applies to the quoted form.
+        let tag = format!("{:?}", self.tag);
+        write!(f, "{tag:>20}: {:?}", self.text)
+    }
+}
diff --git a/src/template.rs b/src/template.rs
new file mode 100644
index 0000000..67eaceb
--- /dev/null
+++ b/src/template.rs
@@ -0,0 +1,25 @@
+use fancy_regex::*;
+
+
+/// A compiled highlighting rule: a tag, optional subtags, and a regex.
+pub struct Template {
+    /// Tag name of the template.
+    pub tag: String,
+    /// Subtag names parsed from the parenthesised list after the tag name;
+    /// empty when the tag has no such list.
+    pub subtags: Vec<String>,
+    /// The pattern compiled with a leading `^` anchor.
+    pub expression: Regex,
+}
+
+impl Template {
+    /// Compiles `pattern` and parses `tag` into a template.
+    ///
+    /// The pattern is wrapped as `^(?:pattern)` before compilation. A tag of
+    /// the form `name(a, b, ...)` — an opening parenthesis somewhere and a
+    /// closing parenthesis as the last character — is split into the trimmed
+    /// name and the trimmed, comma-separated subtags. Any other tag is stored
+    /// unchanged with no subtags.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the wrapped pattern fails to compile.
+    pub fn from_str(pattern: &str, tag: String) -> Self {
+        let anchored = format!("^(?:{pattern})");
+        let expression = Regex::new(&anchored).unwrap();
+
+        // `Some((name, subtags))` only when the tag has the `name(...)` shape.
+        let parsed = tag
+            .split_once('(')
+            .and_then(|(head, tail)| Some((head, tail.strip_suffix(')')?)))
+            .map(|(head, list)| {
+                let names: Vec<String> = list.split(',').map(|t| t.trim().to_string()).collect();
+                (head.trim().to_string(), names)
+            });
+
+        let (tag, subtags) = match parsed {
+            Some(parts) => parts,
+            None => (tag, Vec::new()),
+        };
+        Self { tag, subtags, expression }
+    }
+}