diff options
author | Ben Bridle <bridle.benjamin@gmail.com> | 2022-08-25 21:09:25 +1200 |
---|---|---|
committer | Ben Bridle <bridle.benjamin@gmail.com> | 2022-08-25 21:09:25 +1200 |
commit | 54f5e9fd883e207931baa9c87b6181ca724d6bab (patch) | |
tree | 17111a1da036dbc061ae4062ea0716373e16e23d | |
download | markdown-54f5e9fd883e207931baa9c87b6181ca724d6bab.zip |
Initial commit
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | Cargo.toml | 10 | ||||
-rw-r--r-- | src/block.rs | 37 | ||||
-rw-r--r-- | src/lib.rs | 37 | ||||
-rw-r--r-- | src/main.rs | 41 | ||||
-rw-r--r-- | src/parse.rs | 268 | ||||
-rw-r--r-- | src/parse_heirarchical.rs | 149 | ||||
-rw-r--r-- | src/table.rs | 56 | ||||
-rw-r--r-- | src/text.rs | 34 |
9 files changed, 634 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..96ef6c0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/target
+Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..ec9290c
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "markdown_parser"
+version = "1.0.0"
+authors = ["Ben Bridle <bridle.benjamin@gmail.com>"]
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+
diff --git a/src/block.rs b/src/block.rs
new file mode 100644
index 0000000..2a34fcf
--- /dev/null
+++ b/src/block.rs
@@ -0,0 +1,37 @@
+use crate::{Line, Table};
+
+/// A top-level element of a parsed markdown document.
+pub enum Block {
+    /// A `# ` heading.
+    Heading1(Line),
+    /// A `## ` heading.
+    Heading2(Line),
+    /// A `### ` heading.
+    Heading3(Line),
+    /// A single line of text that is none of the other block types.
+    Paragraph(Line),
+    /// Consecutive `- ` lines, one `Line` per item.
+    List(Vec<Line>),
+    /// Consecutive `>` lines, one `Line` per quoted line.
+    Quote(Vec<Line>),
+    /// A fenced code block: the text after the opening fence, then the lines.
+    Code(String, Vec<String>),
+    /// A table made of consecutive `|` lines.
+    Table(Table),
+}
+
+// Debug prints only the variant name, not the content.
+impl std::fmt::Debug for Block {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        f.write_str(match self {
+            Self::Heading1(_) => "Heading1",
+            Self::Heading2(_) => "Heading2",
+            Self::Heading3(_) => "Heading3",
+            Self::Paragraph(_) => "Paragraph",
+            Self::List(_) => "List",
+            Self::Quote(_) => "Quote",
+            Self::Code(_, _) => "Code",
+            Self::Table(_) => "Table",
+        })
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..c0b8c84
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,37 @@
+//! A small line-oriented markdown parser.
+
+mod block;
+mod parse;
+mod parse_heirarchical;
+mod table;
+mod text;
+
+pub use block::Block;
+pub use parse::parse;
+pub use parse_heirarchical::{
+    parse_heirarchical, Document, LowLevelSection, MidLevelSection, TopLevelSection,
+};
+pub use table::{Alignment, Column, Table};
+pub use text::{Hyperlink, Text};
+
+/// One line of inline text, as a sequence of styled spans.
+pub type Line = Vec<Text>;
+
+/// Renders a parsed line back into markdown source text.
+pub fn line_to_string(line: &[Text]) -> String {
+    let mut output = String::new();
+    for text in line {
+        match text {
+            Text::Normal(content) => output.push_str(content),
+            Text::Bold(content) => output.push_str(&format!("**{}**", content)),
+            Text::Italic(content) => output.push_str(&format!("_{}_", content)),
+            Text::BoldItalic(content) => output.push_str(&format!("**_{}_**", content)),
+            Text::Code(content) => output.push_str(&format!("`{}`", content)),
+            Text::WikiLink(content) => output.push_str(&format!("[[{}]]", content)),
+            Text::Hyperlink(Hyperlink { label, target }) => {
+                output.push_str(&format!("[{}]({})", label, target))
+            }
+        }
+    }
+    output
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..8cc2bfa
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,41 @@
+use markdown_parser::{parse, Block};
+
+/// Parses a small built-in sample document and prints the resulting blocks.
+pub fn main() {
+    let markdown = "
+This _is_ a **test** paragraph.
+[This](http://www.google.com) is a regular full-length link.
+|A|this is **middle** col|CC|
+|-|:-:|---:|
+|A|||
+";
+    for node in parse(markdown) {
+        println!("{:?}", node);
+        match node {
+            Block::Paragraph(spans) => {
+                for span in spans {
+                    println!(" {:?}", span);
+                }
+            }
+            Block::List(items) => {
+                for item in items {
+                    println!("-");
+                    for span in item {
+                        println!(" {:?}", span);
+                    }
+                }
+            }
+            Block::Table(table) => {
+                for column in table.columns {
+                    print!(" {:?}: ", column.alignment);
+                    for span in column.name {
+                        print!("{:?} ", span);
+                    }
+                    println!();
+                }
+                println!();
+            }
+            _ => {}
+        }
+    }
+}
diff --git a/src/parse.rs b/src/parse.rs
new file mode 100644
index 0000000..6e4cdd9
--- /dev/null
+++ b/src/parse.rs
@@ -0,0 +1,268 @@
+use crate::*;
+
+/// Parses markdown source into a flat sequence of blocks.
+///
+/// Leading whitespace is ignored when classifying a line. A line is a heading
+/// (`# `, `## `, `### `), part of a quote (`>`), list (`- `) or table (`|`),
+/// or opens a fenced code block (three backticks). Any other non-empty line
+/// becomes a one-line paragraph. Empty lines are skipped.
+pub fn parse(markdown: &str) -> Vec<Block> {
+    let mut document = Vec::new();
+    // Code is copied from the untrimmed lines so that its indentation survives.
+    let raw_lines: Vec<&str> = markdown.lines().collect();
+    let lines: Vec<&str> = raw_lines.iter().map(|l| l.trim_start()).collect();
+    let mut i = 0;
+
+    // Gather all consecutive lines that begin with a given substring and run a
+    // function over them. The function must be `fn(&[&str])->Result<Block,()>`.
+    // If nothing was gathered or the function fails, `i` is left unchanged so
+    // that the lines can still be parsed as something else.
+    macro_rules! gather {
+        ($prefix:expr, $func:ident) => {{
+            let start = i;
+            let count = lines[start..]
+                .iter()
+                .take_while(|line| line.starts_with($prefix))
+                .count();
+            let result = match count {
+                0 => Err(()),
+                _ => $func(&lines[start..start + count]),
+            };
+            if result.is_ok() {
+                i += count;
+            }
+            result
+        }};
+    }
+
+    loop {
+        let line = match lines.get(i) {
+            Some(line) => *line,
+            None => return document,
+        };
+        if line.is_empty() {
+            i += 1;
+        } else if let Ok(heading) = parse_heading(line) {
+            document.push(heading);
+            i += 1;
+        } else if let Ok(quote) = gather!(">", parse_quote) {
+            document.push(quote);
+        } else if let Ok(list) = gather!("- ", parse_list) {
+            document.push(list);
+        } else if let Ok(table) = gather!("|", parse_table) {
+            document.push(table);
+        } else if let Some(language) = line.strip_prefix("```") {
+            i += 1;
+            let code_lines: Vec<String> = raw_lines[i..]
+                .iter()
+                .take_while(|line| line.trim() != "```")
+                .map(|line| line.to_string())
+                .collect();
+            // Skip the code lines and the closing fence.
+            i += code_lines.len() + 1;
+            document.push(Block::Code(language.to_string(), code_lines));
+        } else {
+            document.push(parse_paragraph(line));
+            i += 1;
+        }
+    }
+}
+
+/// Returns the length of `s` in chars, the unit that `capture` indexes by.
+fn char_len(s: &str) -> usize {
+    s.chars().count()
+}
+
+/// Returns whether `chars` begins with every character of `pattern`.
+fn starts_with_pattern(chars: &[char], pattern: &str) -> bool {
+    let mut chars = chars.iter();
+    pattern.chars().all(|c| chars.next() == Some(&c))
+}
+
+/// Returns the substring from `chars` that is between the `start` and `end`
+/// delimiters. Returns None if `chars` does not start with `start`, or if no
+/// acceptable occurrence of `end` can be found after it. There must not be a
+/// space after the occurrence of `start` or before the occurrence of `end`.
+/// If `start` and `end` consist of just one or more of the same character,
+/// the content must contain at least one other character than that one.
+fn capture(chars: &[char], start: &str, end: &str) -> Option<String> {
+    // The one character that both delimiters are made of, if there is one.
+    let repeated_char = start
+        .chars()
+        .next()
+        .filter(|first| start.chars().chain(end.chars()).all(|c| c == *first));
+    let is_space = |i: usize| chars.get(i) == Some(&' ');
+    if !starts_with_pattern(chars, start) {
+        return None;
+    }
+    // Measured in chars, not bytes, because it is an index into `chars`.
+    let text_start = char_len(start);
+    if is_space(text_start) {
+        return None;
+    }
+    // Try every occurrence of `end` that leaves at least one char of content.
+    for i in (text_start + 1)..chars.len() {
+        if !starts_with_pattern(&chars[i..], end) || is_space(i - 1) {
+            continue;
+        }
+        let text_content: String = chars[text_start..i].iter().collect();
+        match repeated_char {
+            Some(c) if text_content.chars().all(|e| e == c) => continue,
+            _ => return Some(text_content),
+        }
+    }
+    None
+}
+
+/// Splits a line into styled spans: emphasis, inline code, links and plain
+/// text. Text that matches no style is collected into `Text::Normal` spans.
+fn parse_text(line: &str) -> Line {
+    let mut block_content: Line = Vec::new();
+    let chars: Vec<char> = line.chars().collect();
+    let mut normal = String::new();
+    let mut i = 0;
+
+    macro_rules! commit_normal {
+        () => {
+            if !normal.is_empty() {
+                let normal_text = Text::Normal(std::mem::take(&mut normal));
+                block_content.push(normal_text);
+            }
+        };
+    }
+    // Longer delimiters come first so that `**` is not read as two `*`.
+    let patterns: [(&str, &str, fn(String) -> Text); 7] = [
+        ("***", "***", Text::BoldItalic),
+        ("**", "**", Text::Bold),
+        ("*", "*", Text::Italic),
+        ("___", "___", Text::BoldItalic),
+        ("__", "__", Text::Bold),
+        ("_", "_", Text::Italic),
+        ("`", "`", Text::Code),
+    ];
+
+    'outer: while i < chars.len() {
+        let rest = &chars[i..];
+        // Check if a simple, non-Normal text type starts at this character
+        for (start, end, text_type) in patterns.iter() {
+            if let Some(string) = capture(rest, start, end) {
+                // `i` indexes `chars`, so lengths are counted in chars.
+                i += char_len(start) + char_len(&string) + char_len(end);
+                commit_normal!();
+                block_content.push(text_type(string));
+                continue 'outer;
+            }
+        }
+        // Check if a wiki-style hyperlink starts at this character
+        if let Some(content) = capture(rest, "[[", "]]") {
+            i += char_len(&content) + 4;
+            commit_normal!();
+            block_content.push(Text::WikiLink(content));
+            continue 'outer;
+        }
+        // Check if a long-form hyperlink starts at this character
+        if let Some(label) = capture(rest, "[", "]") {
+            let target_start = char_len(&label) + 2;
+            if let Some(target) = capture(&rest[target_start..], "(", ")") {
+                i += target_start + char_len(&target) + 2;
+                commit_normal!();
+                block_content.push(Text::Hyperlink(Hyperlink { label, target }));
+                continue 'outer;
+            }
+        }
+        // No new text type started here, this must just be normal text
+        normal.push(chars[i]);
+        i += 1;
+    }
+    commit_normal!();
+    block_content
+}
+
+/// Parses a `# `, `## ` or `### ` heading. Fails if the line does not start
+/// with one of those prefixes, or if nothing follows the prefix.
+fn parse_heading(line: &str) -> Result<Block, ()> {
+    let (heading_type, content): (fn(Line) -> Block, &str) =
+        if let Some(content) = line.strip_prefix("# ") {
+            (Block::Heading1, content)
+        } else if let Some(content) = line.strip_prefix("## ") {
+            (Block::Heading2, content)
+        } else if let Some(content) = line.strip_prefix("### ") {
+            (Block::Heading3, content)
+        } else {
+            return Err(());
+        };
+    if content.is_empty() {
+        return Err(());
+    }
+    Ok(heading_type(parse_text(content)))
+}
+
+/// Accepts a slice of lines that begin with '>'. The marker and one
+/// optional space after it are removed from each line.
+fn parse_quote(lines: &[&str]) -> Result<Block, ()> {
+    let content = lines
+        .iter()
+        .map(|&line| {
+            let text = line.strip_prefix('>').unwrap_or(line);
+            parse_text(text.strip_prefix(' ').unwrap_or(text))
+        })
+        .collect();
+    Ok(Block::Quote(content))
+}
+
+/// Accepts a slice of lines that begin with "- ".
+fn parse_list(lines: &[&str]) -> Result<Block, ()> {
+    let items = lines
+        .iter()
+        .map(|&line| parse_text(line.strip_prefix("- ").unwrap_or(line)))
+        .collect();
+    Ok(Block::List(items))
+}
+
+/// Wraps the styled spans of a single line in a paragraph block.
+fn parse_paragraph(line: &str) -> Block {
+    Block::Paragraph(parse_text(line))
+}
+
+/// Accepts a slice of lines that begin with '|': a header row, a divider row
+/// and at least one data row. Fails if a row cannot be split, if a divider
+/// cell is invalid, or if a row has a different cell count than the header.
+fn parse_table(lines: &[&str]) -> Result<Block, ()> {
+    if lines.len() < 3 {
+        return Err(());
+    }
+    let names = split_columns(lines[0])?;
+    let dividers = split_columns(lines[1])?;
+    if names.len() != dividers.len() {
+        return Err(());
+    }
+    let mut columns = Vec::with_capacity(names.len());
+    for (name, divider) in names.iter().zip(&dividers) {
+        columns.push(Column {
+            name: parse_text(name),
+            alignment: Alignment::from_str(divider)?,
+        });
+    }
+    let mut rows = Vec::with_capacity(lines.len() - 2);
+    for row in &lines[2..] {
+        let cells: Vec<Line> = split_columns(row)?.iter().map(|s| parse_text(s)).collect();
+        if cells.len() != columns.len() {
+            return Err(());
+        }
+        rows.push(cells);
+    }
+    Ok(Block::Table(Table { columns, rows }))
+}
+
+/// Splits a table row into trimmed cells, using only the text between the
+/// first and the last '|' of the line. Fails if the line has fewer than two.
+fn split_columns(line: &str) -> Result<Vec<String>, ()> {
+    // '|' is one byte long, so `first + 1` is the index just after it.
+    let first = line.find('|').ok_or(())?;
+    let last = line.rfind('|').ok_or(())?;
+    if first == last {
+        return Err(());
+    }
+    let cells = line[first + 1..last].split('|');
+    Ok(cells.map(|cell| cell.trim().to_string()).collect())
+}
diff --git a/src/parse_heirarchical.rs b/src/parse_heirarchical.rs
new file mode 100644
index 0000000..75c2bec
--- /dev/null
+++ b/src/parse_heirarchical.rs
@@ -0,0 +1,149 @@
+use crate::*;
+
+// Generates `get_subsection` for a type with a `sections` field whose
+// elements are of type `$t` and have a `title` field.
+macro_rules! get_subsection {
+    ($t:ident) => {
+        /// Returns the first subsection whose title, rendered back to
+        /// markdown, equals `name`.
+        pub fn get_subsection(&self, name: &str) -> Option<&$t> {
+            self.sections
+                .iter()
+                .find(|section| line_to_string(&section.title) == name)
+        }
+    };
+}
+
+/// A document as a tree of sections nested by heading level.
+#[derive(Default)]
+pub struct Document {
+    /// Blocks that come before the first heading.
+    pub preamble: Vec<Block>,
+    pub sections: Vec<TopLevelSection>,
+}
+impl Document {
+    get_subsection! {TopLevelSection}
+}
+
+/// A section introduced by a level 1 heading.
+#[derive(Default)]
+pub struct TopLevelSection {
+    pub title: Line,
+    /// Blocks between the heading and the first subsection.
+    pub content: Vec<Block>,
+    pub sections: Vec<MidLevelSection>,
+}
+impl TopLevelSection {
+    get_subsection! {MidLevelSection}
+}
+
+/// A section introduced by a level 2 heading.
+#[derive(Default)]
+pub struct MidLevelSection {
+    pub title: Line,
+    /// Blocks between the heading and the first subsection.
+    pub content: Vec<Block>,
+    pub sections: Vec<LowLevelSection>,
+}
+impl MidLevelSection {
+    get_subsection! {LowLevelSection}
+}
+
+/// A section introduced by a level 3 heading.
+#[derive(Default)]
+pub struct LowLevelSection {
+    pub title: Line,
+    pub content: Vec<Block>,
+}
+
+/// Parses markdown into a tree of sections nested by heading level.
+///
+/// Fails if a heading skips a level: a level 2 or 3 heading before the first
+/// level 1 heading, or a level 3 heading directly under a level 1 heading.
+pub fn parse_heirarchical(markdown: &str) -> Result<Document, ()> {
+    macro_rules! push_section {
+        ($from:ident => $to:ident) => {
+            $to.sections.push(std::mem::take(&mut $from))
+        };
+    }
+    let mut document = Document::default();
+    let mut h1_buffer = TopLevelSection::default();
+    let mut h2_buffer = MidLevelSection::default();
+    let mut h3_buffer = LowLevelSection::default();
+    // Level of the section currently being filled; 0 is the preamble.
+    let mut level = 0;
+
+    for block in parse(markdown) {
+        match (level, block) {
+            (0, Block::Heading1(title)) => {
+                h1_buffer.title = title;
+                level = 1;
+            }
+            (0, Block::Heading2(_)) => return Err(()),
+            (0, Block::Heading3(_)) => return Err(()),
+            (0, block) => document.preamble.push(block),
+            (1, Block::Heading1(title)) => {
+                push_section!(h1_buffer => document);
+                h1_buffer.title = title;
+            }
+            (1, Block::Heading2(title)) => {
+                h2_buffer.title = title;
+                level = 2;
+            }
+            (1, Block::Heading3(_)) => return Err(()),
+            (1, block) => h1_buffer.content.push(block),
+            (2, Block::Heading1(title)) => {
+                push_section!(h2_buffer => h1_buffer);
+                push_section!(h1_buffer => document);
+                h1_buffer.title = title;
+                level = 1;
+            }
+            (2, Block::Heading2(title)) => {
+                push_section!(h2_buffer => h1_buffer);
+                h2_buffer.title = title;
+            }
+            (2, Block::Heading3(title)) => {
+                h3_buffer.title = title;
+                level = 3;
+            }
+            (2, block) => h2_buffer.content.push(block),
+            (3, Block::Heading1(title)) => {
+                push_section!(h3_buffer => h2_buffer);
+                push_section!(h2_buffer => h1_buffer);
+                push_section!(h1_buffer => document);
+                h1_buffer.title = title;
+                level = 1;
+            }
+            (3, Block::Heading2(title)) => {
+                push_section!(h3_buffer => h2_buffer);
+                push_section!(h2_buffer => h1_buffer);
+                h2_buffer.title = title;
+                level = 2;
+            }
+            (3, Block::Heading3(title)) => {
+                push_section!(h3_buffer => h2_buffer);
+                h3_buffer.title = title;
+            }
+            (3, block) => h3_buffer.content.push(block),
+            _ => unreachable!(),
+        }
+    }
+
+    // Push all in-progress sections
+    match level {
+        3 => {
+            push_section!(h3_buffer => h2_buffer);
+            push_section!(h2_buffer => h1_buffer);
+            push_section!(h1_buffer => document);
+        }
+        2 => {
+            push_section!(h2_buffer => h1_buffer);
+            push_section!(h1_buffer => document);
+        }
+        1 => {
+            push_section!(h1_buffer => document);
+        }
+        _ => (),
+    }
+    Ok(document)
+}
diff --git a/src/table.rs b/src/table.rs
new file mode 100644
index 0000000..cc01ffc
--- /dev/null
+++ b/src/table.rs
@@ -0,0 +1,56 @@
+use crate::Line;
+
+/// A parsed table: its column definitions followed by its data rows.
+pub struct Table {
+    pub columns: Vec<Column>,
+    /// Each row holds one cell per column.
+    pub rows: Vec<Vec<Line>>,
+}
+
+/// The header cell and the alignment of one table column.
+pub struct Column {
+    pub name: Line,
+    pub alignment: Alignment,
+}
+
+/// The horizontal alignment of a table column.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Alignment {
+    Left,
+    Center,
+    Right,
+}
+impl Alignment {
+    /// Parses a divider cell such as `---`, `:--`, `--:` or `:-:`.
+    ///
+    /// The cell must consist of one or more dashes, optionally with a colon
+    /// as the first and/or last character. A trailing colon alone aligns
+    /// right, colons on both ends center, anything else aligns left.
+    pub fn from_str(s: &str) -> Result<Self, ()> {
+        let (start, s) = match s.strip_prefix(':') {
+            Some(rest) => (true, rest),
+            None => (false, s),
+        };
+        let (end, dashes) = match s.strip_suffix(':') {
+            Some(rest) => (true, rest),
+            None => (false, s),
+        };
+        if dashes.is_empty() || dashes.chars().any(|c| c != '-') {
+            return Err(());
+        }
+        Ok(match (start, end) {
+            (false, false) | (true, false) => Self::Left,
+            (false, true) => Self::Right,
+            (true, true) => Self::Center,
+        })
+    }
+}
+impl std::fmt::Display for Alignment {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        f.write_str(match self {
+            Self::Left => "left",
+            Self::Center => "center",
+            Self::Right => "right",
+        })
+    }
+}
diff --git a/src/text.rs b/src/text.rs
new file mode 100644
index 0000000..e9dbdeb
--- /dev/null
+++ b/src/text.rs
@@ -0,0 +1,34 @@
+/// A span of inline text with a single style.
+pub enum Text {
+    Normal(String),
+    Bold(String),
+    Italic(String),
+    BoldItalic(String),
+    /// Inline code, written between backticks.
+    Code(String),
+    /// A `[[target]]` link.
+    WikiLink(String),
+    /// A `[label](target)` link.
+    Hyperlink(Hyperlink),
+}
+impl std::fmt::Debug for Text {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            Text::Normal(text) => write!(f, "Normal ('{}')", text),
+            Text::Bold(text) => write!(f, "Bold ('{}')", text),
+            Text::Italic(text) => write!(f, "Italic ('{}')", text),
+            Text::BoldItalic(text) => write!(f, "BoldItalic ('{}')", text),
+            Text::Code(text) => write!(f, "Code ('{}')", text),
+            Text::WikiLink(text) => write!(f, "WikiLink ('{}')", text),
+            Text::Hyperlink(Hyperlink { label, target }) => {
+                write!(f, "Hyperlink (label:'{}', target:'{}')", label, target)
+            }
+        }
+    }
+}
+
+/// The label and target of a `[label](target)` link.
+pub struct Hyperlink {
+    pub label: String,
+    pub target: String,
+}