summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBen Bridle <bridle.benjamin@gmail.com>2022-08-25 21:09:25 +1200
committerBen Bridle <bridle.benjamin@gmail.com>2022-08-25 21:09:25 +1200
commit54f5e9fd883e207931baa9c87b6181ca724d6bab (patch)
tree17111a1da036dbc061ae4062ea0716373e16e23d
downloadmarkdown-54f5e9fd883e207931baa9c87b6181ca724d6bab.zip
Initial commit
-rw-r--r--.gitignore2
-rw-r--r--Cargo.toml10
-rw-r--r--src/block.rs26
-rw-r--r--src/lib.rs33
-rw-r--r--src/main.rs37
-rw-r--r--src/parse.rs283
-rw-r--r--src/parse_heirarchical.rs137
-rw-r--r--src/table.rs60
-rw-r--r--src/text.rs30
9 files changed, 618 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..96ef6c0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/target
+Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..ec9290c
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "markdown_parser"
+version = "1.0.0"
+authors = ["Ben Bridle <bridle.benjamin@gmail.com>"]
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+
diff --git a/src/block.rs b/src/block.rs
new file mode 100644
index 0000000..2a34fcf
--- /dev/null
+++ b/src/block.rs
@@ -0,0 +1,26 @@
+use crate::{Line, Table};
+
+/// A top-level element of a parsed markdown document.
+pub enum Block {
+ /// A line that started with `# `.
+ Heading1(Line),
+ /// A line that started with `## `.
+ Heading2(Line),
+ /// A line that started with `### `.
+ Heading3(Line),
+ /// A single line of inline text that matched no other block type.
+ Paragraph(Line),
+ /// Consecutive lines that started with `- `, one entry per line.
+ List(Vec<Line>),
+ /// Consecutive lines that started with `>`, one entry per line.
+ Quote(Vec<Line>),
+ /// A fenced code block: the text that followed the opening triple-backtick
+ /// fence on its line, then the lines found between the two fences.
+ Code(String, Vec<String>),
+ /// A table made of a header row, a divider row and at least one body row.
+ Table(Table),
+}
+// Debug output is only the variant name; the payload is deliberately omitted.
+impl std::fmt::Debug for Block {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let variant_name = match self {
+            Self::Heading1(..) => "Heading1",
+            Self::Heading2(..) => "Heading2",
+            Self::Heading3(..) => "Heading3",
+            Self::Paragraph(..) => "Paragraph",
+            Self::List(..) => "List",
+            Self::Quote(..) => "Quote",
+            Self::Code(..) => "Code",
+            Self::Table(..) => "Table",
+        };
+        f.write_str(variant_name)
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..c0b8c84
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,33 @@
+#![feature(iter_zip)]
+
+mod block;
+mod parse;
+mod parse_heirarchical;
+mod table;
+mod text;
+
+pub use block::Block;
+pub use parse::parse;
+pub use parse_heirarchical::parse_heirarchical;
+pub use table::{Alignment, Column, Table};
+pub use text::{Hyperlink, Text};
+
+pub type Line = Vec<Text>;
+
+/// Renders a line of inline text back into markdown source form.
+///
+/// Each piece of text is wrapped in the delimiters of its kind (`**` for
+/// bold, `_` for italic, `**_` / `_**` for bold italic, a backtick for code,
+/// `[[` / `]]` for wiki links and `[label](target)` for hyperlinks) and the
+/// pieces are concatenated in order.
+pub fn line_to_string(line: &[Text]) -> String {
+    line.iter()
+        .map(|text| match text {
+            Text::Normal(content) => content.clone(),
+            Text::Bold(content) => format!("**{}**", content),
+            Text::Italic(content) => format!("_{}_", content),
+            Text::BoldItalic(content) => format!("**_{}_**", content),
+            Text::Code(content) => format!("`{}`", content),
+            Text::WikiLink(content) => format!("[[{}]]", content),
+            Text::Hyperlink(Hyperlink { label, target }) => {
+                format!("[{}]({})", label, target)
+            }
+        })
+        .collect()
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..8cc2bfa
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,37 @@
+use markdown_parser::parse;
+
+/// Parses a small built-in markdown sample and prints every resulting block,
+/// followed by the contents of paragraphs, lists and table headers.
+pub fn main() {
+    // let markdown = std::fs::read_to_string("/home/ben/markdown_test.md").unwrap();
+    let markdown = "
+This _is_ a **test** paragraph.
+[This](http://www.google.com) is a regular full-length link.
+|A|this is **middle** col|CC|
+|-|:-:|---:|
+|A|||
+";
+    for node in parse(markdown) {
+        println!("{:?}", node);
+        match node {
+            markdown_parser::Block::Paragraph(blocks) => {
+                for block in blocks {
+                    println!(" {:?}", block);
+                }
+            }
+            markdown_parser::Block::List(lines) => {
+                for line in lines {
+                    println!("-");
+                    for block in line {
+                        println!(" {:?}", block);
+                    }
+                }
+            }
+            markdown_parser::Block::Table(table) => {
+                for column in table.columns {
+                    print!(" {:?}: ", column.alignment);
+                    for block in column.name {
+                        print!("{:?} ", block);
+                    }
+                    println!();
+                }
+                println!();
+            }
+            // Other block kinds have nothing further to print
+            _ => {}
+        }
+    }
+}
diff --git a/src/parse.rs b/src/parse.rs
new file mode 100644
index 0000000..6e4cdd9
--- /dev/null
+++ b/src/parse.rs
@@ -0,0 +1,283 @@
+use crate::*;
+
+/// Parses a markdown string into a flat list of blocks.
+///
+/// Blank lines separate blocks and are otherwise ignored. Leading whitespace
+/// is stripped from each line before its block type is decided, but the lines
+/// inside a fenced code block are kept exactly as written so that code
+/// indentation survives. A code block with no closing fence runs to the end
+/// of the input. Any line that matches no other block type becomes a
+/// single-line paragraph.
+pub fn parse(markdown: &str) -> Vec<Block> {
+    let mut document = Vec::new();
+    // The untrimmed lines are kept for the contents of code blocks
+    let raw_lines: Vec<&str> = markdown.lines().collect();
+    let lines: Vec<&str> = raw_lines.iter().map(|l| l.trim_start()).collect();
+    let mut i = 0;
+
+    // Gather all consecutive lines that begin with a given substring and run a
+    // function over them. The function must be `fn(&[&str])->Result<Block,()>`.
+    // The cursor `i` only moves past the gathered lines when the function
+    // succeeds, so that rejected lines can still be parsed as something else.
+    macro_rules! gather {
+        ($prefix:expr, $func:ident) => {{
+            let start = i;
+            let count = lines[start..]
+                .iter()
+                .take_while(|line| line.starts_with($prefix))
+                .count();
+            let result = match count {
+                0 => Err(()),
+                _ => $func(&lines[start..start + count]),
+            };
+            if result.is_ok() {
+                i = start + count;
+            }
+            result
+        }};
+    }
+
+    loop {
+        let line = match lines.get(i) {
+            Some(line) => *line,
+            None => return document,
+        };
+        if line.is_empty() {
+            i += 1;
+        } else if let Ok(heading) = parse_heading(line) {
+            document.push(heading);
+            i += 1;
+        } else if let Ok(quote) = gather!(">", parse_quote) {
+            document.push(quote);
+        } else if let Ok(list) = gather!("- ", parse_list) {
+            document.push(list);
+        } else if let Ok(table) = gather!("|", parse_table) {
+            document.push(table);
+        } else if let Some(language) = line.strip_prefix("```") {
+            let mut code_lines = Vec::new();
+            // Step over the opening fence
+            i += 1;
+            for raw_line in &raw_lines[i..] {
+                if raw_line.trim() == "```" {
+                    break;
+                }
+                code_lines.push(raw_line.to_string());
+                i += 1;
+            }
+            document.push(Block::Code(language.to_string(), code_lines));
+            // Step over the closing fence
+            i += 1;
+        } else {
+            document.push(parse_paragraph(line));
+            i += 1;
+        }
+    }
+}
+
+/// Returns the substring of `chars` that lies between the `start` and `end`
+/// delimiters.
+///
+/// Returns `None` if `chars` does not begin with `start`, or if no acceptable
+/// occurrence of `end` follows it. The content must be at least one character
+/// long, must not begin with a space and must not end with a space. If
+/// `start` and `end` consist only of repetitions of one and the same
+/// character, the content must contain at least one character other than
+/// that one.
+///
+/// All offsets are counted in characters, so delimiters and content may
+/// contain non-ASCII text.
+fn capture(chars: &[char], start: &str, end: &str) -> Option<String> {
+    let start_chars: Vec<char> = start.chars().collect();
+    let end_chars: Vec<char> = end.chars().collect();
+
+    // The single character that both delimiters are built from, if any
+    let repeated_char = match start_chars.first() {
+        Some(&first) if start_chars.iter().chain(&end_chars).all(|&c| c == first) => {
+            Some(first)
+        }
+        _ => None,
+    };
+
+    if !chars.starts_with(&start_chars) {
+        return None;
+    }
+    // Index of the first content character, counted in chars rather than bytes
+    let text_start = start_chars.len();
+    if chars.get(text_start) == Some(&' ') {
+        return None;
+    }
+
+    // The search begins one past `text_start` because content cannot be empty
+    for i in (text_start + 1)..chars.len() {
+        if !chars[i..].starts_with(&end_chars) {
+            continue;
+        }
+        // The content must not end with a space, keep looking for a later end
+        if chars[i - 1] == ' ' {
+            continue;
+        }
+        let text_content: String = chars[text_start..i].iter().collect();
+        match repeated_char {
+            // Content made only of the delimiter character is not content
+            Some(c) if text_content.chars().all(|e| e == c) => continue,
+            _ => return Some(text_content),
+        }
+    }
+    None
+}
+
+/// Splits a line of markdown into a sequence of styled inline text pieces.
+///
+/// At each position the styled patterns are tried in order (bold italic,
+/// bold, italic, code), then wiki links (`[[target]]`), then hyperlinks
+/// (`[label](target)`). Characters that start none of these are collected
+/// into `Text::Normal` runs. The cursor is counted in characters, so lines
+/// containing non-ASCII text are handled correctly.
+fn parse_text(line: &str) -> Line {
+    // Length of a string in chars, the unit in which the cursor `i` is counted
+    fn char_len(s: &str) -> usize {
+        s.chars().count()
+    }
+
+    let mut block_content: Line = Vec::new();
+    let chars: Vec<char> = line.chars().collect();
+    let mut normal = String::new();
+    let mut i = 0;
+
+    // Flush any pending run of normal text into the output
+    macro_rules! commit_normal {
+        () => {
+            if !normal.is_empty() {
+                let normal_text = Text::Normal(std::mem::take(&mut normal));
+                block_content.push(normal_text);
+            }
+        };
+    }
+    // Longer delimiters come first so that `***` is not read as `*` or `**`
+    let patterns: [(&str, &str, fn(String) -> Text); 7] = [
+        ("***", "***", Text::BoldItalic),
+        ("**", "**", Text::Bold),
+        ("*", "*", Text::Italic),
+        ("___", "___", Text::BoldItalic),
+        ("__", "__", Text::Bold),
+        ("_", "_", Text::Italic),
+        ("`", "`", Text::Code),
+    ];
+
+    'outer: loop {
+        // Check if a simple, non-Normal text type starts at this character
+        for (start, end, text_type) in patterns.iter() {
+            if let Some(string) = capture(&chars[i..], start, end) {
+                i += char_len(start) + char_len(&string) + char_len(end);
+                commit_normal!();
+                block_content.push(text_type(string));
+                continue 'outer;
+            }
+        }
+        // Check if a wiki-style hyperlink starts at this character
+        if let Some(content) = capture(&chars[i..], "[[", "]]") {
+            i += char_len(&content) + 4;
+            commit_normal!();
+            block_content.push(Text::WikiLink(content));
+            continue 'outer;
+        }
+
+        // Check if a long-form hyperlink starts at this character
+        if let Some(label) = capture(&chars[i..], "[", "]") {
+            // The target must start directly after the closing `]`
+            let target_start = i + char_len(&label) + 2;
+            if let Some(target) = capture(&chars[target_start..], "(", ")") {
+                i = target_start + char_len(&target) + 2;
+                commit_normal!();
+                block_content.push(Text::Hyperlink(Hyperlink { label, target }));
+                // Restart so the text after the link is checked for patterns
+                continue 'outer;
+            }
+        }
+
+        // No new text type started here, this must just be normal text
+        match chars.get(i) {
+            Some(c) => {
+                normal.push(*c);
+                i += 1;
+            }
+            None => {
+                commit_normal!();
+                break;
+            }
+        }
+    }
+    block_content
+}
+
+/// Parses a line as a level 1, 2 or 3 heading.
+///
+/// Returns `Err(())` if the line does not start with `# `, `## ` or `### `,
+/// or if nothing follows the heading marker.
+fn parse_heading(line: &str) -> Result<Block, ()> {
+    let levels: [(&str, fn(Line) -> Block); 3] = [
+        ("# ", Block::Heading1),
+        ("## ", Block::Heading2),
+        ("### ", Block::Heading3),
+    ];
+    for (marker, constructor) in levels.iter() {
+        // The markers are mutually exclusive, so the first match decides
+        if let Some(content) = line.strip_prefix(*marker) {
+            if content.is_empty() {
+                return Err(());
+            }
+            return Ok(constructor(parse_text(content)));
+        }
+    }
+    Err(())
+}
+
+/// Accepts a slice of lines that begin with '>' and returns a quote block
+/// holding one line of text per input line.
+///
+/// The '>' marker and at most one space following it are removed from each
+/// line, so both `> text` and `>text` yield `text`. A bare `>` yields an
+/// empty line. Returns `Err(())` if a line does not begin with '>'.
+fn parse_quote(lines: &[&str]) -> Result<Block, ()> {
+    let mut content = Vec::with_capacity(lines.len());
+    for line in lines {
+        // Prefix stripping never cuts through a multi-byte character
+        let quoted = line.strip_prefix('>').ok_or(())?;
+        let quoted = quoted.strip_prefix(' ').unwrap_or(quoted);
+        content.push(parse_text(quoted));
+    }
+    Ok(Block::Quote(content))
+}
+
+/// Builds a list block from lines that each begin with the two-character
+/// `- ` marker; the marker is removed and the rest is parsed as inline text.
+fn parse_list(lines: &[&str]) -> Result<Block, ()> {
+    let mut items = Vec::with_capacity(lines.len());
+    for line in lines {
+        items.push(parse_text(&line[2..]));
+    }
+    Ok(Block::List(items))
+}
+
+/// Wraps the inline text parsed from a single line in a paragraph block.
+fn parse_paragraph(line: &str) -> Block {
+    let text = parse_text(line);
+    Block::Paragraph(text)
+}
+
+/// Builds a table block from a header line, a divider line and one or more
+/// body lines.
+///
+/// Returns `Err(())` if there are fewer than three lines, if any line cannot
+/// be split into columns, if a divider cell is not a valid alignment marker,
+/// or if the header, divider and body rows do not all have the same number
+/// of cells.
+fn parse_table(lines: &[&str]) -> Result<Block, ()> {
+    let (header, divider, body) = match lines {
+        [header, divider, body @ ..] if !body.is_empty() => (header, divider, body),
+        _ => return Err(()),
+    };
+    let names = split_columns(header)?;
+    let dividers = split_columns(divider)?;
+    if names.len() != dividers.len() {
+        return Err(());
+    }
+
+    let columns = names
+        .iter()
+        .zip(dividers.iter())
+        .map(|(name, divider)| {
+            let alignment = Alignment::from_str(divider)?;
+            Ok(Column {
+                name: parse_text(name),
+                alignment,
+            })
+        })
+        .collect::<Result<Vec<Column>, ()>>()?;
+
+    let rows = body
+        .iter()
+        .map(|row| {
+            let cells: Vec<Line> = split_columns(row)?.iter().map(|s| parse_text(s)).collect();
+            // Every body row needs exactly one cell per column
+            if cells.len() == columns.len() {
+                Ok(cells)
+            } else {
+                Err(())
+            }
+        })
+        .collect::<Result<Vec<Vec<Line>>, ()>>()?;
+
+    Ok(Block::Table(Table { columns, rows }))
+}
+
+/// Splits a table row into its trimmed cell strings.
+///
+/// Only the text between the first and the last `|` of the line is used;
+/// that text is split on `|` and each piece is trimmed. Returns `Err(())`
+/// if the line contains fewer than two `|` characters.
+fn split_columns(line: &str) -> Result<Vec<String>, ()> {
+    let first_pipe = line.find('|').ok_or(())?;
+    let last_pipe = line.rfind('|').ok_or(())?;
+    // A single pipe cannot enclose any cells
+    if first_pipe == last_pipe {
+        return Err(());
+    }
+    let inner = &line[first_pipe + 1..last_pipe];
+    let cells = inner.split('|').map(|cell| cell.trim().to_string());
+    Ok(cells.collect())
+}
diff --git a/src/parse_heirarchical.rs b/src/parse_heirarchical.rs
new file mode 100644
index 0000000..75c2bec
--- /dev/null
+++ b/src/parse_heirarchical.rs
@@ -0,0 +1,137 @@
+use crate::*;
+
+// Generates a `get_subsection` method for a type that has a `sections` field
+// holding values of type `$t`. The method returns the first section whose
+// title, rendered with `line_to_string`, equals `name`.
+macro_rules! get_subsection {
+    ($t:ident) => {
+        pub fn get_subsection(&self, name: &str) -> Option<&$t> {
+            self.sections
+                .iter()
+                .find(|section| line_to_string(&section.title) == name)
+        }
+    };
+}
+
+/// A markdown document with its blocks grouped into nested sections by
+/// heading level.
+#[derive(Default)]
+pub struct Document {
+ /// Blocks that appear before the first level-1 heading.
+ pub preamble: Vec<Block>,
+ /// One section per level-1 heading, in document order.
+ pub sections: Vec<TopLevelSection>,
+}
+impl Document {
+ // Generates `get_subsection`, which finds a section by its rendered title
+ get_subsection! {TopLevelSection}
+}
+
+/// A section introduced by a level-1 heading.
+#[derive(Default)]
+pub struct TopLevelSection {
+ /// The text of the level-1 heading.
+ pub title: Line,
+ /// Blocks that follow the heading while no level-2 section is open.
+ pub content: Vec<Block>,
+ /// One section per level-2 heading inside this section, in order.
+ pub sections: Vec<MidLevelSection>,
+}
+impl TopLevelSection {
+ // Generates `get_subsection`, which finds a section by its rendered title
+ get_subsection! {MidLevelSection}
+}
+
+/// A section introduced by a level-2 heading.
+#[derive(Default)]
+pub struct MidLevelSection {
+ /// The text of the level-2 heading.
+ pub title: Line,
+ /// Blocks that follow the heading while no level-3 section is open.
+ pub content: Vec<Block>,
+ /// One section per level-3 heading inside this section, in order.
+ pub sections: Vec<LowLevelSection>,
+}
+impl MidLevelSection {
+ // Generates `get_subsection`, which finds a section by its rendered title
+ get_subsection! {LowLevelSection}
+}
+
+/// A section introduced by a level-3 heading; it has no nested sections.
+#[derive(Default)]
+pub struct LowLevelSection {
+ /// The text of the level-3 heading.
+ pub title: Line,
+ /// Blocks that follow the heading until the next heading of any level.
+ pub content: Vec<Block>,
+}
+
+/// Parses `markdown` and arranges the resulting blocks into a `Document`
+/// tree according to heading level.
+///
+/// Blocks before the first level-1 heading become the preamble; every other
+/// block is stored in the most recently opened section. Returns `Err(())`
+/// when a heading appears while its parent level is not open: a level-2
+/// heading with no level-1 section open, or a level-3 heading with no level-2
+/// section open.
+pub fn parse_heirarchical(markdown: &str) -> Result<Document, ()> {
+    let mut document = Document::default();
+    let mut h1_buffer = TopLevelSection::default();
+    let mut h2_buffer = MidLevelSection::default();
+    let mut h3_buffer = LowLevelSection::default();
+    // Depth of the innermost open section, 0 while still in the preamble
+    let mut level = 0;
+
+    // Close every open section at depth `$floor` or deeper, innermost first,
+    // moving each finished section into its parent and resetting its buffer.
+    macro_rules! close_from {
+        ($floor:expr) => {{
+            if level >= 3 {
+                h2_buffer.sections.push(std::mem::take(&mut h3_buffer));
+            }
+            if level >= 2 && $floor <= 2 {
+                h1_buffer.sections.push(std::mem::take(&mut h2_buffer));
+            }
+            if level >= 1 && $floor <= 1 {
+                document.sections.push(std::mem::take(&mut h1_buffer));
+            }
+        }};
+    }
+
+    for block in parse(markdown) {
+        match block {
+            Block::Heading1(title) => {
+                close_from!(1);
+                h1_buffer.title = title;
+                level = 1;
+            }
+            Block::Heading2(title) => {
+                if level < 1 {
+                    return Err(());
+                }
+                close_from!(2);
+                h2_buffer.title = title;
+                level = 2;
+            }
+            Block::Heading3(title) => {
+                if level < 2 {
+                    return Err(());
+                }
+                close_from!(3);
+                h3_buffer.title = title;
+                level = 3;
+            }
+            other => match level {
+                0 => document.preamble.push(other),
+                1 => h1_buffer.content.push(other),
+                2 => h2_buffer.content.push(other),
+                3 => h3_buffer.content.push(other),
+                _ => unreachable!(),
+            },
+        }
+    }
+
+    // Push all in-progress sections
+    close_from!(1);
+    Ok(document)
+}
diff --git a/src/table.rs b/src/table.rs
new file mode 100644
index 0000000..cc01ffc
--- /dev/null
+++ b/src/table.rs
@@ -0,0 +1,60 @@
+use crate::Line;
+
+/// A parsed markdown table.
+pub struct Table {
+ /// The columns of the table, built from the header and divider rows.
+ pub columns: Vec<Column>,
+ /// The body rows; each row holds one line of text per column.
+ pub rows: Vec<Vec<Line>>,
+}
+
+/// A single column of a table.
+pub struct Column {
+ /// The text of the column's header cell.
+ pub name: Line,
+ /// The alignment read from the column's divider cell.
+ pub alignment: Alignment,
+}
+
+/// The horizontal alignment of a table column.
+pub enum Alignment {
+    Left,
+    Center,
+    Right,
+}
+impl Alignment {
+    /// Parses the divider cell of a table column, such as `---`, `:--`,
+    /// `--:` or `:-:`.
+    ///
+    /// The cell must consist of one or more hyphens, optionally preceded by
+    /// a colon and optionally followed by a colon. A trailing colon alone
+    /// means right alignment, colons on both sides mean center alignment and
+    /// anything else means left alignment.
+    ///
+    /// Returns `Err(())` if the cell contains any other character, has a
+    /// colon anywhere but at the ends, or contains no hyphen at all.
+    pub fn from_str(s: &str) -> Result<Self, ()> {
+        let (start, rest) = match s.strip_prefix(':') {
+            Some(rest) => (true, rest),
+            None => (false, s),
+        };
+        let (end, dashes) = match rest.strip_suffix(':') {
+            Some(dashes) => (true, dashes),
+            None => (false, rest),
+        };
+        // What remains between the optional colons must be a run of hyphens
+        if dashes.is_empty() || !dashes.chars().all(|c| c == '-') {
+            return Err(());
+        }
+        Ok(match (start, end) {
+            (false, false) => Self::Left,
+            (true, false) => Self::Left,
+            (false, true) => Self::Right,
+            (true, true) => Self::Center,
+        })
+    }
+}
+// Display writes the alignment as a lowercase word.
+impl std::fmt::Display for Alignment {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let word = match self {
+            Self::Left => "left",
+            Self::Center => "center",
+            Self::Right => "right",
+        };
+        f.write_str(word)
+    }
+}
+// Debug writes the name of the variant.
+impl std::fmt::Debug for Alignment {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let variant_name = match self {
+            Self::Left => "Left",
+            Self::Center => "Center",
+            Self::Right => "Right",
+        };
+        f.write_str(variant_name)
+    }
+}
diff --git a/src/text.rs b/src/text.rs
new file mode 100644
index 0000000..e9dbdeb
--- /dev/null
+++ b/src/text.rs
@@ -0,0 +1,30 @@
+/// A run of inline text within a line, tagged with its style.
+pub enum Text {
+ /// Plain text that carries no styling.
+ Normal(String),
+ /// Text that was wrapped in `**` or `__`.
+ Bold(String),
+ /// Text that was wrapped in `*` or `_`.
+ Italic(String),
+ /// Text that was wrapped in `***` or `___`.
+ BoldItalic(String),
+ /// Text that was wrapped in single backticks.
+ Code(String),
+ /// The content of a `[[...]]` wiki-style link.
+ WikiLink(String),
+ /// A `[label](target)` link.
+ Hyperlink(Hyperlink),
+}
+// Debug writes the variant name followed by the quoted content.
+impl std::fmt::Debug for Text {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            Text::Normal(text) => write!(f, "Normal ('{}')", text),
+            Text::Bold(text) => write!(f, "Bold ('{}')", text),
+            Text::Italic(text) => write!(f, "Italic ('{}')", text),
+            Text::BoldItalic(text) => write!(f, "BoldItalic ('{}')", text),
+            Text::Code(text) => write!(f, "Code ('{}')", text),
+            Text::WikiLink(text) => write!(f, "WikiLink ('{}')", text),
+            Text::Hyperlink(Hyperlink { label, target }) => {
+                write!(f, "Hyperlink (label:'{}', target:'{}')", label, target)
+            }
+        }
+    }
+}
+
+/// A link written as `[label](target)`.
+pub struct Hyperlink {
+ /// The text found between the square brackets.
+ pub label: String,
+ /// The text found between the parentheses.
+ pub target: String,
+}