diff options
| author | Ben Bridle <bridle.benjamin@gmail.com> | 2022-08-25 21:09:25 +1200 | 
|---|---|---|
| committer | Ben Bridle <bridle.benjamin@gmail.com> | 2022-08-25 21:09:25 +1200 | 
| commit | 54f5e9fd883e207931baa9c87b6181ca724d6bab (patch) | |
| tree | 17111a1da036dbc061ae4062ea0716373e16e23d | |
| download | markdown-54f5e9fd883e207931baa9c87b6181ca724d6bab.zip | |
Initial commit
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | Cargo.toml | 10 | ||||
| -rw-r--r-- | src/block.rs | 26 | ||||
| -rw-r--r-- | src/lib.rs | 33 | ||||
| -rw-r--r-- | src/main.rs | 37 | ||||
| -rw-r--r-- | src/parse.rs | 283 | ||||
| -rw-r--r-- | src/parse_heirarchical.rs | 137 | ||||
| -rw-r--r-- | src/table.rs | 60 | ||||
| -rw-r--r-- | src/text.rs | 30 | 
9 files changed, 618 insertions, 0 deletions
| diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..96ef6c0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..ec9290c --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "markdown_parser" +version = "1.0.0" +authors = ["Ben Bridle <bridle.benjamin@gmail.com>"] +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] + diff --git a/src/block.rs b/src/block.rs new file mode 100644 index 0000000..2a34fcf --- /dev/null +++ b/src/block.rs @@ -0,0 +1,26 @@ +use crate::{Line, Table}; + +pub enum Block { +    Heading1(Line), +    Heading2(Line), +    Heading3(Line), +    Paragraph(Line), +    List(Vec<Line>), +    Quote(Vec<Line>), +    Code(String, Vec<String>), +    Table(Table), +} +impl std::fmt::Debug for Block { +    fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> { +        f.write_str(match self { +            Self::Heading1(_) => "Heading1", +            Self::Heading2(_) => "Heading2", +            Self::Heading3(_) => "Heading3", +            Self::Paragraph(_) => "Paragraph", +            Self::List(_) => "List", +            Self::Quote(_) => "Quote", +            Self::Code(_, _) => "Code", +            Self::Table(_) => "Table", +        }) +    } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..c0b8c84 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,33 @@ +#![feature(iter_zip)] + +mod block; +mod parse; +mod parse_heirarchical; +mod table; +mod text; + +pub use block::Block; +pub use parse::parse; +pub use parse_heirarchical::parse_heirarchical; +pub use table::{Alignment, Column, Table}; +pub use text::{Hyperlink, Text}; + +pub type Line = Vec<Text>; + +pub fn line_to_string(line: &[Text]) -> String { +    let mut output = String::new(); +    for text in line { +        match text { +            
Text::Normal(content) => output.push_str(&content), +            Text::Bold(content) => output.push_str(&format!("**{}**", content)), +            Text::Italic(content) => output.push_str(&format!("_{}_", content)), +            Text::BoldItalic(content) => output.push_str(&format!("**_{}_**", content)), +            Text::Code(content) => output.push_str(&format!("`{}`", content)), +            Text::WikiLink(content) => output.push_str(&format!("[[{}]]", content)), +            Text::Hyperlink(Hyperlink { label, target }) => { +                output.push_str(&format!("[{}]({})", label, target)) +            } +        } +    } +    return output; +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..8cc2bfa --- /dev/null +++ b/src/main.rs @@ -0,0 +1,37 @@ +use markdown_parser::parse; + +pub fn main() { +    // let markdown = std::fs::read_to_string("/home/ben/markdown_test.md").unwrap(); +    let markdown = " +This _is_ a **test** paragraph. +[This](http://www.google.com) is a regular full-length link. 
+|A|this is **middle** col|CC| +|-|:-:|---:| +|A||| +"; +    let document = parse(&markdown); +    for node in document { +        println!("{:?}", node); +        if let markdown_parser::Block::Paragraph(blocks) = node { +            for block in blocks { +                println!("  {:?}", block); +            } +        } else if let markdown_parser::Block::List(lines) = node { +            for line in lines { +                println!("-"); +                for block in line { +                    println!("  {:?}", block); +                } +            } +        } else if let markdown_parser::Block::Table(table) = node { +            for column in table.columns { +                print!("  {:?}: ", column.alignment); +                for block in column.name { +                    print!("{:?} ", block); +                } +                println!(); +            } +            println!(); +        } +    } +} diff --git a/src/parse.rs b/src/parse.rs new file mode 100644 index 0000000..6e4cdd9 --- /dev/null +++ b/src/parse.rs @@ -0,0 +1,283 @@ +use crate::*; + +pub fn parse(markdown: &str) -> Vec<Block> { +    let mut document = Vec::new(); +    let lines: Vec<&str> = markdown.lines().map(|l| l.trim_start()).collect(); +    let mut i = 0; + +    // Gather all consecutive lines that begin with a given substring and run a +    // function over them. The function must be `fn(&[&str])->Result<Block,()>`. +    macro_rules! gather { +        ($prefix:expr, $func:ident) => {{ +            let start = i; +            for line in &lines[i..] 
{ +                if line.starts_with($prefix) { +                    i += 1; +                    continue; +                } +                break; +            } +            let gathered_lines = &lines[start..i]; +            match gathered_lines.is_empty() { +                false => $func(gathered_lines), +                true => Err(()), +            } +        }}; +    } + +    loop { +        let line = match lines.get(i) { +            Some(line) => line, +            None => return document, +        }; +        if line.is_empty() { +            i += 1; +            continue; +        } else if let Ok(heading) = parse_heading(line) { +            document.push(heading); +            i += 1; +        } else if let Ok(quote) = gather!(">", parse_quote) { +            document.push(quote); +        } else if let Ok(list) = gather!("- ", parse_list) { +            document.push(list); +        } else if let Ok(table) = gather!("|", parse_table) { +            document.push(table); +        } else if line.starts_with("```") { +            let language = line[3..].to_string(); +            let mut code_lines = Vec::new(); +            i += 1; +            for line in &lines[i..] { +                match line.trim() == "```" { +                    true => break, +                    false => { +                        code_lines.push(line.to_string()); +                        i += 1 +                    } +                } +            } +            document.push(Block::Code(language, code_lines)); +            i += 1; +        } else { +            document.push(parse_paragraph(line)); +            i += 1; +        }; +    } +} + +/// Returns the substring from `chars` that is between the `start` and `end` +/// delimiters. Returns None if `chars` does not start with `start`, or if an +/// occurance of `start` and `end` cannot be found within `chars`. There must +/// not be a space after the occurance of `start` or before the occurance of +/// `end`. 
If `start` and `end` consist of just one or more of the same +/// character, the content must contain at least one other character than +/// that one. +fn capture(chars: &[char], start: &str, end: &str) -> Option<String> { +    // Determine if `pattern` contains only a single unique character +    let single_char_in_pattern = match start.chars().next() { +        Some(first_char) => { +            let start_and_end = start.chars().chain(end.chars()); +            start_and_end.fold(Some(first_char), |accum, elem| match accum { +                Some(c) if c == elem => accum, +                _ => None, +            }) +        } +        None => None, +    }; +    let is_space = |i: usize| chars.get(i) == Some(&' '); +    fn starts_with_pattern(chars: &[char], pattern: &str) -> bool { +        let mut i = 0; +        for ref c in pattern.chars() { +            match chars.get(i) { +                Some(v) if v == c => i += 1, +                _ => return false, +            } +        } +        true +    } +    if !starts_with_pattern(chars, start) { +        return None; +    } +    let text_start = start.len(); +    if is_space(text_start) { +        return None; +    }; +    let mut i = text_start; +    loop { +        i += 1; +        if chars.get(i).is_none() { +            return None; +        } +        if starts_with_pattern(&chars[i..], end) { +            if is_space(i - 1) { +                continue; +            } +            let text_content: String = chars[text_start..i].iter().collect(); +            match single_char_in_pattern { +                None => return Some(text_content), +                Some(c) => { +                    if text_content.chars().any(|e| e != c) { +                        return Some(text_content); +                    } +                } +            }; +        } +    } +} + +fn parse_text(line: &str) -> Line { +    let mut block_content: Line = Vec::new(); +    let chars: Vec<char> = line.chars().collect(); +    let 
mut normal = String::new(); +    let mut i = 0; + +    macro_rules! commit_normal { +        () => { +            if !normal.is_empty() { +                let normal_text = Text::Normal(std::mem::take(&mut normal)); +                block_content.push(normal_text); +            } +        }; +    } +    let patterns: [(&str, &str, fn(String) -> Text); 7] = [ +        ("***", "***", Text::BoldItalic), +        ("**", "**", Text::Bold), +        ("*", "*", Text::Italic), +        ("___", "___", Text::BoldItalic), +        ("__", "__", Text::Bold), +        ("_", "_", Text::Italic), +        ("`", "`", Text::Code), +    ]; + +    'outer: loop { +        // Check if a simple, non-Normal text type starts at this character +        for (start, end, text_type) in patterns.iter() { +            if let Some(string) = capture(&chars[i..], start, end) { +                i += string.len() + start.len() + end.len(); +                commit_normal!(); +                block_content.push(text_type(string)); +                continue 'outer; +            } +        } +        // Check if a wiki-style hyperlink starts at this character +        if let Some(content) = capture(&chars[i..], "[[", "]]") { +            i += content.len() + 4; +            commit_normal!(); +            block_content.push(Text::WikiLink(content)); +            continue 'outer; +        } + +        // Check if a long-form hyperlink starts at this character +        if let Some(label) = capture(&chars[i..], "[", "]") { +            let target_len = label.len() + 2; +            if let Some(target) = capture(&chars[i + target_len..], "(", ")") { +                i += target_len + target.len() + 2; +                commit_normal!(); +                block_content.push(Text::Hyperlink(Hyperlink { label, target })) +            } +        } + +        // No new text type started here, this must just be normal text +        match chars.get(i) { +            Some(c) => { +                normal.push(*c); +      
          i += 1; +            } +            None => { +                commit_normal!(); +                break; +            } +        } +    } +    return block_content; +} + +fn parse_heading(line: &str) -> Result<Block, ()> { +    let (heading_type, content): (fn(Line) -> Block, &str) = if line.starts_with("# ") { +        (Block::Heading1, &line[2..]) +    } else if line.starts_with("## ") { +        (Block::Heading2, &line[3..]) +    } else if line.starts_with("### ") { +        (Block::Heading3, &line[4..]) +    } else { +        return Err(()); +    }; +    if content.is_empty() { +        return Err(()); +    }; +    Ok(heading_type(parse_text(content))) +} + +/// Accepts a slice of lines that begin with '>' +fn parse_quote(lines: &[&str]) -> Result<Block, ()> { +    let mut content = Vec::new(); +    for line in lines { +        content.push(if *line == ">" { +            Vec::new() +        } else { +            parse_text(&line[2..]) +        }); +    } +    Ok(Block::Quote(content)) +} + +fn parse_list(lines: &[&str]) -> Result<Block, ()> { +    Ok(Block::List( +        lines.iter().map(|l| parse_text(&l[2..])).collect(), +    )) +} + +fn parse_paragraph(line: &str) -> Block { +    Block::Paragraph(parse_text(line)) +} + +fn parse_table(lines: &[&str]) -> Result<Block, ()> { +    if lines.len() < 3 { +        return Err(()); +    } +    let names = split_columns(lines[0])?; +    let dividers = split_columns(lines[1])?; +    if names.len() != dividers.len() { +        return Err(()); +    } +    let mut columns = Vec::new(); +    for (name, divider) in std::iter::zip(names.iter(), dividers.iter()) { +        let alignment = Alignment::from_str(divider)?; +        columns.push(Column { +            name: parse_text(name), +            alignment, +        }) +    } +    let mut rows = Vec::new(); +    for row in &lines[2..] 
{ +        let split_row: Vec<Line> = split_columns(row)?.iter().map(|s| parse_text(s)).collect(); +        if split_row.len() != columns.len() { +            return Err(()); +        } +        rows.push(split_row); +    } +    Ok(Block::Table(Table { columns, rows })) +} + +fn split_columns(line: &str) -> Result<Vec<String>, ()> { +    // Find the index after the first |, and before the last | +    let mut start = None; +    let mut end = None; +    for (i, c) in line.chars().enumerate() { +        if c == '|' { +            if start.is_none() { +                start = Some(i + 1); +            } else { +                end = Some(i); +            } +        } +    } +    match (start, end) { +        (Some(s), Some(e)) => { +            let chars: Vec<char> = line.chars().collect(); +            let string: String = chars[s..e].iter().collect(); +            let split = string.split('|'); +            Ok(split.map(|s| s.trim().to_string()).collect()) +        } +        _ => Err(()), +    } +} diff --git a/src/parse_heirarchical.rs b/src/parse_heirarchical.rs new file mode 100644 index 0000000..75c2bec --- /dev/null +++ b/src/parse_heirarchical.rs @@ -0,0 +1,137 @@ +use crate::*; + +macro_rules! get_subsection { +    ($t:ident) => { +        pub fn get_subsection(&self, name: &str) -> Option<&$t> { +            for section in &self.sections { +                if line_to_string(&section.title) == name { +                    return Some(section); +                } +            } +            return None; +        } +    }; +} + +#[derive(Default)] +pub struct Document { +    pub preamble: Vec<Block>, +    pub sections: Vec<TopLevelSection>, +} +impl Document { +    get_subsection! {TopLevelSection} +} + +#[derive(Default)] +pub struct TopLevelSection { +    pub title: Line, +    pub content: Vec<Block>, +    pub sections: Vec<MidLevelSection>, +} +impl TopLevelSection { +    get_subsection! 
{MidLevelSection} +} + +#[derive(Default)] +pub struct MidLevelSection { +    pub title: Line, +    pub content: Vec<Block>, +    pub sections: Vec<LowLevelSection>, +} +impl MidLevelSection { +    get_subsection! {LowLevelSection} +} + +#[derive(Default)] +pub struct LowLevelSection { +    pub title: Line, +    pub content: Vec<Block>, +} + +pub fn parse_heirarchical(markdown: &str) -> Result<Document, ()> { +    macro_rules! push_section { +        ($from:ident => $to:ident) => { +            $to.sections.push(std::mem::take(&mut $from)) +        }; +    } +    let mut document = Document::default(); +    let mut h1_buffer = TopLevelSection::default(); +    let mut h2_buffer = MidLevelSection::default(); +    let mut h3_buffer = LowLevelSection::default(); +    let mut level = 0; + +    let blocks = parse(markdown); +    for block in blocks { +        match (level, block) { +            (0, Block::Heading1(title)) => { +                h1_buffer.title = title; +                level = 1; +            } +            (0, Block::Heading2(_)) => return Err(()), +            (0, Block::Heading3(_)) => return Err(()), +            (0, block) => document.preamble.push(block), +            (1, Block::Heading1(title)) => { +                push_section!(h1_buffer => document); +                h1_buffer.title = title; +            } +            (1, Block::Heading2(title)) => { +                h2_buffer.title = title; +                level = 2; +            } +            (1, Block::Heading3(_)) => return Err(()), +            (1, block) => h1_buffer.content.push(block), +            (2, Block::Heading1(title)) => { +                push_section!(h2_buffer => h1_buffer); +                push_section!(h1_buffer => document); +                h1_buffer.title = title; +                level = 1; +            } +            (2, Block::Heading2(title)) => { +                push_section!(h2_buffer => h1_buffer); +                h2_buffer.title = title; +            } +     
       (2, Block::Heading3(title)) => { +                h3_buffer.title = title; +                level = 3; +            } +            (2, block) => h2_buffer.content.push(block), +            (3, Block::Heading1(title)) => { +                push_section!(h3_buffer => h2_buffer); +                push_section!(h2_buffer => h1_buffer); +                push_section!(h1_buffer => document); +                h1_buffer.title = title; +                level = 1; +            } +            (3, Block::Heading2(title)) => { +                push_section!(h3_buffer => h2_buffer); +                push_section!(h2_buffer => h1_buffer); +                h2_buffer.title = title; +                level = 2; +            } +            (3, Block::Heading3(title)) => { +                push_section!(h3_buffer => h2_buffer); +                h3_buffer.title = title; +            } +            (3, block) => h3_buffer.content.push(block), +            _ => unreachable!(), +        } +    } + +    // Push all in-progress sections +    match level { +        3 => { +            push_section!(h3_buffer => h2_buffer); +            push_section!(h2_buffer => h1_buffer); +            push_section!(h1_buffer => document); +        } +        2 => { +            push_section!(h2_buffer => h1_buffer); +            push_section!(h1_buffer => document); +        } +        1 => { +            push_section!(h1_buffer => document); +        } +        _ => (), +    } +    Ok(document) +} diff --git a/src/table.rs b/src/table.rs new file mode 100644 index 0000000..cc01ffc --- /dev/null +++ b/src/table.rs @@ -0,0 +1,60 @@ +use crate::Line; + +pub struct Table { +    pub columns: Vec<Column>, +    pub rows: Vec<Vec<Line>>, +} + +pub struct Column { +    pub name: Line, +    pub alignment: Alignment, +} + +pub enum Alignment { +    Left, +    Center, +    Right, +} +impl Alignment { +    pub fn from_str(s: &str) -> Result<Self, ()> { +        let mut start = false; +        let mut end = 
false; +        for (i, c) in s.chars().enumerate() { +            if c == ':' { +                if i == 0 { +                    start = true; +                } else if i == s.len() - 1 { +                    end = true; +                } else { +                    return Err(()); +                } +            } else if c != '-' { +                return Err(()); +            } +        } +        Ok(match (start, end) { +            (false, false) => Self::Left, +            (true, false) => Self::Left, +            (false, true) => Self::Right, +            (true, true) => Self::Center, +        }) +    } +} +impl std::fmt::Display for Alignment { +    fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> { +        f.write_str(match self { +            Self::Left => "left", +            Self::Center => "center", +            Self::Right => "right", +        }) +    } +} +impl std::fmt::Debug for Alignment { +    fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> { +        f.write_str(match self { +            Self::Left => "Left", +            Self::Center => "Center", +            Self::Right => "Right", +        }) +    } +} diff --git a/src/text.rs b/src/text.rs new file mode 100644 index 0000000..e9dbdeb --- /dev/null +++ b/src/text.rs @@ -0,0 +1,30 @@ +pub enum Text { +    Normal(String), +    Bold(String), +    Italic(String), +    BoldItalic(String), +    Code(String), +    WikiLink(String), +    Hyperlink(Hyperlink), +} +impl std::fmt::Debug for Text { +    fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> { +        let string = match self { +            Text::Normal(text) => format!("Normal ('{}')", text), +            Text::Bold(text) => format!("Bold ('{}')", text), +            Text::Italic(text) => format!("Italic ('{}')", text), +            Text::BoldItalic(text) => format!("BoldItalic ('{}')", text), +            Text::Code(text) => format!("Code ('{}')", text), +         
   Text::WikiLink(text) => format!("WikiLink ('{}')", text), +            Text::Hyperlink(Hyperlink { label, target }) => { +                format!("Hyperlink (label:'{}',  target:'{}')", label, target) +            } +        }; +        f.write_str(&string) +    } +} + +pub struct Hyperlink { +    pub label: String, +    pub target: String, +} | 
