Initial commit

author: Ben Bridle <bridle.benjamin@gmail.com> 2022-08-25 21:09:25 +1200
committer: Ben Bridle <bridle.benjamin@gmail.com> 2022-08-25 21:09:25 +1200
commit: 54f5e9fd883e207931baa9c87b6181ca724d6bab (patch)
tree: 17111a1da036dbc061ae4062ea0716373e16e23d
download: markdown-54f5e9fd883e207931baa9c87b6181ca724d6bab.zip
9 files changed, 618 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..96ef6c0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/target
+Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..ec9290c
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "markdown_parser"
+version = "1.0.0"
+authors = ["Ben Bridle <bridle.benjamin@gmail.com>"]
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+
diff --git a/src/block.rs b/src/block.rs
new file mode 100644
index 0000000..2a34fcf
--- /dev/null
+++ b/src/block.rs
@@ -0,0 +1,26 @@
+use crate::{Line, Table};
+
+/// A top-level element of a parsed markdown document.
+pub enum Block {
+    /// Heading introduced by `# `.
+    Heading1(Line),
+    /// Heading introduced by `## `.
+    Heading2(Line),
+    /// Heading introduced by `### `.
+    Heading3(Line),
+    /// A single line of ordinary text.
+    Paragraph(Line),
+    /// Consecutive `- ` items, one `Line` per item.
+    List(Vec<Line>),
+    /// Consecutive `>` lines, one `Line` per quoted line.
+    Quote(Vec<Line>),
+    /// Fenced code: the text following the opening fence, then the raw lines.
+    Code(String, Vec<String>),
+    /// A table with a header row, a divider row and data rows.
+    Table(Table),
+}
+impl std::fmt::Debug for Block {
+    /// Writes only the name of the variant; the payload is not printed.
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let variant_name = match self {
+            Self::Heading1(..) => "Heading1",
+            Self::Heading2(..) => "Heading2",
+            Self::Heading3(..) => "Heading3",
+            Self::Paragraph(..) => "Paragraph",
+            Self::List(..) => "List",
+            Self::Quote(..) => "Quote",
+            Self::Code(..) => "Code",
+            Self::Table(..) => "Table",
+        };
+        f.write_str(variant_name)
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..c0b8c84
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,33 @@
+#![feature(iter_zip)]
+
+mod block;
+mod parse;
+mod parse_heirarchical;
+mod table;
+mod text;
+
+pub use block::Block;
+pub use parse::parse;
+pub use parse_heirarchical::parse_heirarchical;
+pub use table::{Alignment, Column, Table};
+pub use text::{Hyperlink, Text};
+
+/// A sequence of inline [`Text`] spans making up one line of content.
+pub type Line = Vec<Text>;
+
+/// Renders a line of inline spans back into markdown source text.
+///
+/// Each span is wrapped in the delimiters for its kind (`**bold**`,
+/// `_italic_`, `**_bold italic_**`, `` `code` ``, `[[wiki link]]` and
+/// `[label](target)`); normal text is emitted unchanged. The pieces are
+/// concatenated in order with nothing inserted between them.
+pub fn line_to_string(line: &[Text]) -> String {
+    line.iter()
+        .map(|span| match span {
+            Text::Normal(content) => content.clone(),
+            Text::Bold(content) => format!("**{}**", content),
+            Text::Italic(content) => format!("_{}_", content),
+            Text::BoldItalic(content) => format!("**_{}_**", content),
+            Text::Code(content) => format!("`{}`", content),
+            Text::WikiLink(content) => format!("[[{}]]", content),
+            Text::Hyperlink(Hyperlink { label, target }) => format!("[{}]({})", label, target),
+        })
+        .collect()
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..8cc2bfa
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,37 @@
+use markdown_parser::parse;
+
+/// Parses a small hard-coded markdown sample and prints the resulting blocks.
+///
+/// Every block is printed through its `Debug` form. Paragraphs, lists and
+/// tables additionally have their inline spans (and, for tables, the column
+/// alignments) printed underneath.
+pub fn main() {
+    // let markdown = std::fs::read_to_string("/home/ben/markdown_test.md").unwrap();
+    let markdown = "
+This _is_ a **test** paragraph.
+[This](http://www.google.com) is a regular full-length link.
+|A|this is **middle** col|CC|
+|-|:-:|---:|
+|A|||
+";
+    for node in parse(markdown) {
+        println!("{:?}", node);
+        match node {
+            markdown_parser::Block::Paragraph(spans) => {
+                for span in spans {
+                    println!("  {:?}", span);
+                }
+            }
+            markdown_parser::Block::List(items) => {
+                for item in items {
+                    println!("-");
+                    for span in item {
+                        println!("  {:?}", span);
+                    }
+                }
+            }
+            markdown_parser::Block::Table(table) => {
+                // Only the header row is printed: one line per column.
+                for column in table.columns {
+                    print!("  {:?}: ", column.alignment);
+                    for span in column.name {
+                        print!("{:?} ", span);
+                    }
+                    println!();
+                }
+                println!();
+            }
+            // Other block kinds have no extra detail printed.
+            _ => {}
+        }
+    }
+}
diff --git a/src/parse.rs b/src/parse.rs
new file mode 100644
index 0000000..6e4cdd9
--- /dev/null
+++ b/src/parse.rs
@@ -0,0 +1,283 @@
+use crate::*;
+
+/// Parses markdown source into a flat sequence of [`Block`]s.
+///
+/// Leading whitespace is stripped from every line before it is classified,
+/// and empty lines are skipped. Each remaining line is tried, in order, as a
+/// heading, a run of quote lines (`>`), a run of list items (`- `), a table
+/// (a run of lines starting with `|`) or a fenced code block; anything else
+/// becomes a single-line paragraph.
+///
+/// A run of `|` lines that does not form a valid table is not consumed: its
+/// lines fall through and are parsed individually (normally as paragraphs).
+/// A code block with no closing fence extends to the end of the input.
+pub fn parse(markdown: &str) -> Vec<Block> {
+    let mut document = Vec::new();
+    let lines: Vec<&str> = markdown.lines().map(|l| l.trim_start()).collect();
+    let mut i = 0;
+
+    // Gather all consecutive lines that begin with a given substring and run a
+    // function over them. The function must be `fn(&[&str])->Result<Block,()>`.
+    // The cursor `i` is advanced past the gathered lines only when the
+    // function succeeds, so a rejected run is left for the later branches
+    // instead of being skipped.
+    macro_rules! gather {
+        ($prefix:expr, $func:ident) => {{
+            let start = i;
+            let count = lines[start..]
+                .iter()
+                .take_while(|l| l.starts_with($prefix))
+                .count();
+            let result = match count {
+                0 => Err(()),
+                _ => $func(&lines[start..start + count]),
+            };
+            if result.is_ok() {
+                i = start + count;
+            }
+            result
+        }};
+    }
+
+    while let Some(line) = lines.get(i) {
+        if line.is_empty() {
+            i += 1;
+        } else if let Ok(heading) = parse_heading(line) {
+            document.push(heading);
+            i += 1;
+        } else if let Ok(quote) = gather!(">", parse_quote) {
+            document.push(quote);
+        } else if let Ok(list) = gather!("- ", parse_list) {
+            document.push(list);
+        } else if let Ok(table) = gather!("|", parse_table) {
+            document.push(table);
+        } else if line.starts_with("```") {
+            // Whatever follows the opening fence is kept as the language tag.
+            let language = line[3..].to_string();
+            i += 1;
+            let body_len = lines[i..]
+                .iter()
+                .take_while(|l| l.trim() != "```")
+                .count();
+            let code_lines = lines[i..i + body_len]
+                .iter()
+                .map(|l| l.to_string())
+                .collect();
+            document.push(Block::Code(language, code_lines));
+            // Step over the body and the closing fence (if there is one).
+            i += body_len + 1;
+        } else {
+            document.push(parse_paragraph(line));
+            i += 1;
+        }
+    }
+    document
+}
+
+/// Returns the text enclosed between the `start` and `end` delimiters at the
+/// very beginning of `chars`.
+///
+/// Returns `None` if `chars` does not begin with `start`, or if no acceptable
+/// occurrence of `end` follows it. The rules for a match are:
+///
+/// * the enclosed text is at least one character long;
+/// * there is no space directly after `start` or directly before `end`;
+/// * if `start` and `end` are made up of one single repeated character, the
+///   enclosed text must contain at least one character other than that one.
+///
+/// The first occurrence of `end` that satisfies these rules is used.
+/// Delimiter lengths are measured in characters (not bytes), because `chars`
+/// is indexed by character.
+fn capture(chars: &[char], start: &str, end: &str) -> Option<String> {
+    /// Whether `chars` begins with every character of `pattern`, in order.
+    fn starts_with_pattern(chars: &[char], pattern: &str) -> bool {
+        let mut actual = chars.iter();
+        pattern.chars().all(|expected| actual.next() == Some(&expected))
+    }
+
+    // `Some(c)` when both delimiters consist solely of the character `c`.
+    let repeated_delimiter = start
+        .chars()
+        .next()
+        .filter(|&first| start.chars().chain(end.chars()).all(|c| c == first));
+
+    if !starts_with_pattern(chars, start) {
+        return None;
+    }
+    let text_start = start.chars().count();
+    if chars.get(text_start) == Some(&' ') {
+        return None;
+    }
+
+    // Candidate positions for `end` begin one past `text_start`, which is
+    // what forces the enclosed text to be non-empty.
+    for end_pos in (text_start + 1)..chars.len() {
+        if !starts_with_pattern(&chars[end_pos..], end) || chars[end_pos - 1] == ' ' {
+            continue;
+        }
+        let content = &chars[text_start..end_pos];
+        let acceptable = match repeated_delimiter {
+            None => true,
+            Some(delimiter) => content.iter().any(|&c| c != delimiter),
+        };
+        if acceptable {
+            return Some(content.iter().collect());
+        }
+    }
+    None
+}
+
+/// Splits one line of markdown into inline [`Text`] spans.
+///
+/// At each position the delimited span kinds are tried in a fixed order
+/// (`***`, `**`, `*`, `___`, `__`, `_`, `` ` ``), then `[[wiki links]]`, then
+/// `[label](target)` hyperlinks. Characters that begin none of these are
+/// accumulated into `Text::Normal` spans. An empty `line` yields an empty
+/// result.
+///
+/// The cursor is advanced in characters rather than bytes, so spans holding
+/// non-ASCII text are stepped over by exactly their own length.
+fn parse_text(line: &str) -> Line {
+    /// Length of `s` in characters, the unit in which `chars` is indexed.
+    fn char_len(s: &str) -> usize {
+        s.chars().count()
+    }
+
+    let mut block_content: Line = Vec::new();
+    let chars: Vec<char> = line.chars().collect();
+    let mut normal = String::new();
+    let mut i = 0;
+
+    // Flushes pending normal text as its own span, so that it lands before
+    // the styled span that is about to be pushed.
+    macro_rules! commit_normal {
+        () => {
+            if !normal.is_empty() {
+                let normal_text = Text::Normal(std::mem::take(&mut normal));
+                block_content.push(normal_text);
+            }
+        };
+    }
+    // Longer delimiters come first so `***` is not mistaken for `**` or `*`.
+    let patterns: [(&str, &str, fn(String) -> Text); 7] = [
+        ("***", "***", Text::BoldItalic),
+        ("**", "**", Text::Bold),
+        ("*", "*", Text::Italic),
+        ("___", "___", Text::BoldItalic),
+        ("__", "__", Text::Bold),
+        ("_", "_", Text::Italic),
+        ("`", "`", Text::Code),
+    ];
+
+    'outer: while i < chars.len() {
+        let rest = &chars[i..];
+
+        // Check if a simple, non-Normal text type starts at this character
+        for (start, end, text_type) in patterns.iter() {
+            if let Some(string) = capture(rest, start, end) {
+                i += char_len(start) + char_len(&string) + char_len(end);
+                commit_normal!();
+                block_content.push(text_type(string));
+                continue 'outer;
+            }
+        }
+
+        // Check if a wiki-style hyperlink starts at this character
+        if let Some(content) = capture(rest, "[[", "]]") {
+            i += char_len(&content) + 4;
+            commit_normal!();
+            block_content.push(Text::WikiLink(content));
+            continue 'outer;
+        }
+
+        // Check if a long-form hyperlink starts at this character
+        if let Some(label) = capture(rest, "[", "]") {
+            // Offset of the character after `[label]`, where `(` must sit.
+            let target_offset = char_len(&label) + 2;
+            if let Some(target) = capture(&rest[target_offset..], "(", ")") {
+                i += target_offset + char_len(&target) + 2;
+                commit_normal!();
+                block_content.push(Text::Hyperlink(Hyperlink { label, target }));
+                // Restart the checks at the character following the link
+                // rather than forcing it into normal text.
+                continue 'outer;
+            }
+        }
+
+        // No new text type started here, this must just be normal text
+        normal.push(chars[i]);
+        i += 1;
+    }
+    commit_normal!();
+    block_content
+}
+
+/// Parses a single line as a level 1 to 3 heading.
+///
+/// The line must begin with `# `, `## ` or `### ` and have text after the
+/// marker. Any other line, including a marker with nothing after it, gives
+/// `Err(())`.
+fn parse_heading(line: &str) -> Result<Block, ()> {
+    // The markers cannot overlap: each one requires a space right after its
+    // own run of `#`, so at most one entry matches.
+    let markers: [(&str, fn(Line) -> Block); 3] = [
+        ("# ", Block::Heading1),
+        ("## ", Block::Heading2),
+        ("### ", Block::Heading3),
+    ];
+    for (marker, heading_type) in markers.iter() {
+        if line.starts_with(*marker) {
+            let content = &line[marker.len()..];
+            if content.is_empty() {
+                return Err(());
+            }
+            return Ok(heading_type(parse_text(content)));
+        }
+    }
+    Err(())
+}
+
+/// Builds a quote block from a slice of lines that each begin with `>`.
+///
+/// The `>` marker and at most one space following it are removed from every
+/// line before the remainder is parsed as inline text. A line holding only
+/// the marker becomes an empty `Line`. The prefix is removed by pattern
+/// rather than by a fixed byte offset, so `>text` keeps its first character
+/// and a multi-byte character directly after `>` cannot cause a panic.
+///
+/// Always returns `Ok`.
+fn parse_quote(lines: &[&str]) -> Result<Block, ()> {
+    let content = lines
+        .iter()
+        .map(|&line| {
+            let text = line.strip_prefix('>').unwrap_or(line);
+            let text = text.strip_prefix(' ').unwrap_or(text);
+            parse_text(text)
+        })
+        .collect();
+    Ok(Block::Quote(content))
+}
+
+/// Builds a list block from a slice of lines that each begin with `- `.
+///
+/// The two-character marker is dropped and the rest of each line is parsed
+/// as inline text. Always returns `Ok`.
+fn parse_list(lines: &[&str]) -> Result<Block, ()> {
+    let mut items = Vec::with_capacity(lines.len());
+    for item in lines {
+        items.push(parse_text(&item[2..]));
+    }
+    Ok(Block::List(items))
+}
+
+/// Wraps the inline spans of a single line in a paragraph block.
+fn parse_paragraph(line: &str) -> Block {
+    let spans = parse_text(line);
+    Block::Paragraph(spans)
+}
+
+/// Builds a table block from a run of lines that each begin with `|`.
+///
+/// The first line supplies the column names, the second the alignment
+/// dividers, and every following line one data row. Returns `Err(())` when
+/// there are fewer than three lines, when any line cannot be split into
+/// cells, when a divider is not a valid alignment, or when the header,
+/// divider and data rows do not all have the same number of cells.
+fn parse_table(lines: &[&str]) -> Result<Block, ()> {
+    let (header, divider_line, body) = match lines {
+        [header, divider_line, body @ ..] if !body.is_empty() => (header, divider_line, body),
+        _ => return Err(()),
+    };
+
+    let names = split_columns(header)?;
+    let dividers = split_columns(divider_line)?;
+    if names.len() != dividers.len() {
+        return Err(());
+    }
+
+    let columns = names
+        .iter()
+        .zip(dividers.iter())
+        .map(|(name, divider)| -> Result<Column, ()> {
+            let alignment = Alignment::from_str(divider)?;
+            Ok(Column {
+                name: parse_text(name),
+                alignment,
+            })
+        })
+        .collect::<Result<Vec<Column>, ()>>()?;
+
+    let width = columns.len();
+    let rows = body
+        .iter()
+        .map(|row| -> Result<Vec<Line>, ()> {
+            let cells: Vec<Line> = split_columns(row)?
+                .iter()
+                .map(|cell| parse_text(cell))
+                .collect();
+            if cells.len() == width {
+                Ok(cells)
+            } else {
+                Err(())
+            }
+        })
+        .collect::<Result<Vec<Vec<Line>>, ()>>()?;
+
+    Ok(Block::Table(Table { columns, rows }))
+}
+
+/// Splits a table row into its trimmed cell strings.
+///
+/// Only the text between the first and the last `|` on the line is used;
+/// anything outside those two pipes is ignored. That inner text is split on
+/// `|` and each piece is trimmed. Returns `Err(())` when the line holds fewer
+/// than two `|` characters.
+fn split_columns(line: &str) -> Result<Vec<String>, ()> {
+    let (first, last) = match (line.find('|'), line.rfind('|')) {
+        // A lone pipe gives no enclosed region to split.
+        (Some(first), Some(last)) if first != last => (first, last),
+        _ => return Err(()),
+    };
+    // `|` is one byte long, so both offsets fall on character boundaries.
+    let inner = &line[first + 1..last];
+    Ok(inner.split('|').map(|cell| cell.trim().to_string()).collect())
+}
diff --git a/src/parse_heirarchical.rs b/src/parse_heirarchical.rs
new file mode 100644
index 0000000..75c2bec
--- /dev/null
+++ b/src/parse_heirarchical.rs
@@ -0,0 +1,137 @@
+use crate::*;
+
+// Generates a `get_subsection` method for a section type whose `sections`
+// field holds children of type `$t`, each of which has a `title` line.
+macro_rules! get_subsection {
+    ($t:ident) => {
+        /// Returns the first direct subsection whose title, rendered back to
+        /// markdown text with `line_to_string`, is exactly `name`.
+        pub fn get_subsection(&self, name: &str) -> Option<&$t> {
+            self.sections
+                .iter()
+                .find(|section| line_to_string(&section.title) == name)
+        }
+    };
+}
+
+/// A markdown document arranged by heading level.
+#[derive(Default)]
+pub struct Document {
+    /// Blocks that appear before the first level-1 heading.
+    pub preamble: Vec<Block>,
+    /// One entry per level-1 heading, in document order.
+    pub sections: Vec<TopLevelSection>,
+}
+impl Document {
+    // Expands to `get_subsection(&self, name) -> Option<&TopLevelSection>`.
+    get_subsection! {TopLevelSection}
+}
+
+/// The part of a document introduced by a level-1 heading.
+#[derive(Default)]
+pub struct TopLevelSection {
+    /// Inline content of the heading itself.
+    pub title: Line,
+    /// Blocks between the heading and its first level-2 heading.
+    pub content: Vec<Block>,
+    /// One entry per level-2 heading inside this section.
+    pub sections: Vec<MidLevelSection>,
+}
+impl TopLevelSection {
+    // Expands to `get_subsection(&self, name) -> Option<&MidLevelSection>`.
+    get_subsection! {MidLevelSection}
+}
+
+/// The part of a document introduced by a level-2 heading.
+#[derive(Default)]
+pub struct MidLevelSection {
+    /// Inline content of the heading itself.
+    pub title: Line,
+    /// Blocks between the heading and its first level-3 heading.
+    pub content: Vec<Block>,
+    /// One entry per level-3 heading inside this section.
+    pub sections: Vec<LowLevelSection>,
+}
+impl MidLevelSection {
+    // Expands to `get_subsection(&self, name) -> Option<&LowLevelSection>`.
+    get_subsection! {LowLevelSection}
+}
+
+/// The part of a document introduced by a level-3 heading.
+#[derive(Default)]
+pub struct LowLevelSection {
+    /// Inline content of the heading itself.
+    pub title: Line,
+    /// Blocks following the heading, up to the next heading.
+    pub content: Vec<Block>,
+}
+
+/// Parses markdown into a [`Document`] whose blocks are nested under their
+/// level-1, level-2 and level-3 headings.
+///
+/// Blocks that come before the first level-1 heading are stored in
+/// `Document::preamble`. Returns `Err(())` when a heading skips a level: a
+/// level-2 or level-3 heading before any level-1 heading, or a level-3
+/// heading directly under a level-1 heading.
+pub fn parse_heirarchical(markdown: &str) -> Result<Document, ()> {
+    // Moves the finished `$from` buffer into `$to.sections`, leaving a
+    // default (empty) buffer behind for the next section of that level.
+    macro_rules! push_section {
+        ($from:ident => $to:ident) => {
+            $to.sections.push(std::mem::take(&mut $from))
+        };
+    }
+    let mut document = Document::default();
+    // One in-progress section per heading level.
+    let mut h1_buffer = TopLevelSection::default();
+    let mut h2_buffer = MidLevelSection::default();
+    let mut h3_buffer = LowLevelSection::default();
+    // Level of the innermost section currently open; 0 means none yet.
+    let mut level = 0;
+
+    let blocks = parse(markdown);
+    for block in blocks {
+        match (level, block) {
+            // Before the first level-1 heading.
+            (0, Block::Heading1(title)) => {
+                h1_buffer.title = title;
+                level = 1;
+            }
+            (0, Block::Heading2(_)) => return Err(()),
+            (0, Block::Heading3(_)) => return Err(()),
+            (0, block) => document.preamble.push(block),
+            // Inside a level-1 section.
+            (1, Block::Heading1(title)) => {
+                push_section!(h1_buffer => document);
+                h1_buffer.title = title;
+            }
+            (1, Block::Heading2(title)) => {
+                h2_buffer.title = title;
+                level = 2;
+            }
+            (1, Block::Heading3(_)) => return Err(()),
+            (1, block) => h1_buffer.content.push(block),
+            // Inside a level-2 section.
+            (2, Block::Heading1(title)) => {
+                push_section!(h2_buffer => h1_buffer);
+                push_section!(h1_buffer => document);
+                h1_buffer.title = title;
+                level = 1;
+            }
+            (2, Block::Heading2(title)) => {
+                push_section!(h2_buffer => h1_buffer);
+                h2_buffer.title = title;
+            }
+            (2, Block::Heading3(title)) => {
+                h3_buffer.title = title;
+                level = 3;
+            }
+            (2, block) => h2_buffer.content.push(block),
+            // Inside a level-3 section: a new heading closes every open
+            // section at its own level or deeper, innermost first.
+            (3, Block::Heading1(title)) => {
+                push_section!(h3_buffer => h2_buffer);
+                push_section!(h2_buffer => h1_buffer);
+                push_section!(h1_buffer => document);
+                h1_buffer.title = title;
+                level = 1;
+            }
+            (3, Block::Heading2(title)) => {
+                push_section!(h3_buffer => h2_buffer);
+                push_section!(h2_buffer => h1_buffer);
+                h2_buffer.title = title;
+                level = 2;
+            }
+            (3, Block::Heading3(title)) => {
+                push_section!(h3_buffer => h2_buffer);
+                h3_buffer.title = title;
+            }
+            (3, block) => h3_buffer.content.push(block),
+            // `level` is only ever assigned 0, 1, 2 or 3 above.
+            _ => unreachable!(),
+        }
+    }
+
+    // Push all in-progress sections
+    match level {
+        3 => {
+            push_section!(h3_buffer => h2_buffer);
+            push_section!(h2_buffer => h1_buffer);
+            push_section!(h1_buffer => document);
+        }
+        2 => {
+            push_section!(h2_buffer => h1_buffer);
+            push_section!(h1_buffer => document);
+        }
+        1 => {
+            push_section!(h1_buffer => document);
+        }
+        _ => (),
+    }
+    Ok(document)
+}
diff --git a/src/table.rs b/src/table.rs
new file mode 100644
index 0000000..cc01ffc
--- /dev/null
+++ b/src/table.rs
@@ -0,0 +1,60 @@
+use crate::Line;
+
+/// A parsed markdown table.
+pub struct Table {
+    /// Header cells together with the alignment of each column.
+    pub columns: Vec<Column>,
+    /// Data rows; each row holds one `Line` per cell.
+    pub rows: Vec<Vec<Line>>,
+}
+
+/// The header of a single table column.
+pub struct Column {
+    /// Inline content of the header cell.
+    pub name: Line,
+    /// Alignment read from the column's divider cell.
+    pub alignment: Alignment,
+}
+
+/// Horizontal alignment of a table column.
+///
+/// The `Debug` form is the variant name (`Left`, `Center`, `Right`); the
+/// `Display` form is the same word in lower case.
+#[derive(Debug)]
+pub enum Alignment {
+    Left,
+    Center,
+    Right,
+}
+impl Alignment {
+    /// Reads the alignment from a table divider cell such as `---`, `:-:` or
+    /// `--:`.
+    ///
+    /// A colon is only allowed as the first or the last character, and every
+    /// other character must be `-`; anything else gives `Err(())`. A trailing
+    /// colon alone means `Right`, colons at both ends mean `Center`, and all
+    /// remaining valid inputs (including the empty string and a lone `:`)
+    /// mean `Left`.
+    pub fn from_str(s: &str) -> Result<Self, ()> {
+        // Peel off an optional leading colon, then an optional trailing one.
+        let (body, start) = match s.strip_prefix(':') {
+            Some(rest) => (rest, true),
+            None => (s, false),
+        };
+        let (body, end) = match body.strip_suffix(':') {
+            Some(rest) => (rest, true),
+            None => (body, false),
+        };
+        // What is left between the colons may consist of dashes only.
+        if body.chars().any(|c| c != '-') {
+            return Err(());
+        }
+        Ok(match (start, end) {
+            (true, true) => Self::Center,
+            (false, true) => Self::Right,
+            (_, false) => Self::Left,
+        })
+    }
+}
+impl std::fmt::Display for Alignment {
+    /// Writes the alignment as a lower-case word.
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let word = match self {
+            Self::Left => "left",
+            Self::Center => "center",
+            Self::Right => "right",
+        };
+        f.write_str(word)
+    }
+}
diff --git a/src/text.rs b/src/text.rs
new file mode 100644
index 0000000..e9dbdeb
--- /dev/null
+++ b/src/text.rs
@@ -0,0 +1,30 @@
+/// A run of inline content within a line.
+pub enum Text {
+    /// Unstyled text.
+    Normal(String),
+    /// Text that was delimited by `**` or `__`.
+    Bold(String),
+    /// Text that was delimited by `*` or `_`.
+    Italic(String),
+    /// Text that was delimited by `***` or `___`.
+    BoldItalic(String),
+    /// Text that was delimited by backticks.
+    Code(String),
+    /// The content of a `[[...]]` link.
+    WikiLink(String),
+    /// A `[label](target)` link.
+    Hyperlink(Hyperlink),
+}
+impl std::fmt::Debug for Text {
+    /// Writes the variant name followed by its content in quotes.
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            Text::Normal(text) => write!(f, "Normal ('{}')", text),
+            Text::Bold(text) => write!(f, "Bold ('{}')", text),
+            Text::Italic(text) => write!(f, "Italic ('{}')", text),
+            Text::BoldItalic(text) => write!(f, "BoldItalic ('{}')", text),
+            Text::Code(text) => write!(f, "Code ('{}')", text),
+            Text::WikiLink(text) => write!(f, "WikiLink ('{}')", text),
+            Text::Hyperlink(Hyperlink { label, target }) => {
+                write!(f, "Hyperlink (label:'{}',  target:'{}')", label, target)
+            }
+        }
+    }
+}
+
+/// The two parts of a `[label](target)` link.
+pub struct Hyperlink {
+    /// Text found between the square brackets.
+    pub label: String,
+    /// Text found between the parentheses.
+    pub target: String,
+}
author	Ben Bridle <bridle.benjamin@gmail.com>	2022-08-25 21:09:25 +1200
committer	Ben Bridle <bridle.benjamin@gmail.com>	2022-08-25 21:09:25 +1200
commit	54f5e9fd883e207931baa9c87b6181ca724d6bab (patch)
tree	17111a1da036dbc061ae4062ea0716373e16e23d
download	markdown-54f5e9fd883e207931baa9c87b6181ca724d6bab.zip