// AsciiDocParser.java
package pro.verron.officestamper.asciidoc;
import org.asciidoctor.Asciidoctor;
import org.asciidoctor.Options;
import org.asciidoctor.ast.*;
import org.asciidoctor.ast.Block;
import org.asciidoctor.ast.Cell;
import org.asciidoctor.ast.Row;
import org.asciidoctor.ast.Table;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.stream.Stream;
import static pro.verron.officestamper.asciidoc.AsciiDocModel.*;
/// Parser based on AsciidoctorJ producing an [AsciiDocModel].
///
/// Supported subset mapped into our model:
/// - Headings: sections whose level is within 1..6
/// - Paragraphs: blocks whose content model is `simple`
/// - Tables: header, body and footer rows, emitted in that order
/// - Inline emphasis: `*bold*` and `_italic_` via a lightweight inline parser
/// - Tabs: the literal token `|TAB|`
///
/// NOTE(review): the document title (a leading "= Title") is not emitted by this parser; only the child blocks of
/// the loaded document are traversed.
public final class AsciiDocParser {

    /// Literal token that is turned into a [Tab] inline.
    private static final String TAB_TOKEN = "|TAB|";

    private AsciiDocParser() {
        // utility class, not instantiable
    }

    /// Parses the given AsciiDoc string into a model using AsciidoctorJ AST traversal.
    ///
    /// Notes:
    /// - A `null` or blank input yields a model built from an empty block list.
    /// - Only the child blocks of the loaded document are traversed: the document header/title is not emitted as a
    ///   heading, and section levels are used exactly as reported by AsciidoctorJ (no offset is applied).
    /// - A new Asciidoctor engine is created for every call and closed before returning.
    ///
    /// @param asciidoc source text, may be `null`
    ///
    /// @return parsed model
    public static AsciiDocModel parse(String asciidoc) {
        var blocks = new ArrayList<AsciiDocModel.Block>();
        if (asciidoc == null || asciidoc.isBlank()) {
            return AsciiDocModel.of(blocks);
        }
        try (Asciidoctor engine = Asciidoctor.Factory.create()) {
            Options options = Options.builder()
                                     .sourcemap(true)
                                     .build();
            Document doc = engine.load(asciidoc, options);
            for (StructuralNode child : doc.getBlocks()) {
                traverse(child, blocks);
            }
        }
        return AsciiDocModel.of(blocks);
    }

    /// Walks the given node depth-first and appends the model blocks it yields to `out`.
    ///
    /// @param node AST node to visit
    /// @param out  accumulator receiving the produced blocks, in document order
    private static void traverse(StructuralNode node, List<AsciiDocModel.Block> out) {
        switch (node) {
            case Section section -> {
                // Sections outside 1..6 produce no heading, but their content is still visited.
                int level = section.getLevel();
                if (level >= 1 && level <= 6) {
                    // NOTE(review): getTitle() may return the title with substitutions already applied rather than
                    // the raw source - confirm against the AsciidoctorJ documentation.
                    out.add(new Heading(level, parseInlines(section.getTitle())));
                }
                for (StructuralNode child : section.getBlocks()) {
                    traverse(child, out);
                }
            }
            case Table table -> {
                // A table without any row is skipped entirely; its children are not traversed.
                List<AsciiDocModel.Row> rows = extractTableRows(table);
                if (!rows.isEmpty()) {
                    out.add(new AsciiDocModel.Table(rows));
                }
            }
            case PhraseNode phraseNode -> out.add(new Paragraph(parseInlines(phraseNode.getText())));
            case Block block when "simple".equals(block.getContentModel()) ->
                    out.add(new Paragraph(parseInlines(String.join("\n", block.getLines()))));
            default -> {
                // Recurse into other container nodes to keep paragraphs found within.
                List<StructuralNode> children = node.getBlocks();
                if (children != null) {
                    for (StructuralNode child : children) {
                        traverse(child, out);
                    }
                }
            }
        }
    }

    /// Parses inline markup with a stack of open frames.
    ///
    /// Recognised tokens:
    /// - `\*`, `\_` and `\\` append the escaped character literally; any other backslash is kept as is.
    /// - `*` and `_` close the innermost frame when it has the same type, otherwise they open a nested frame.
    /// - `|TAB|` emits a [Tab].
    ///
    /// Frames still open at the end of the input are demoted to plain text in their parent: the opening marker
    /// followed by the `text()` of each of their children.
    ///
    /// @param text raw inline text, may be `null`
    ///
    /// @return the parsed inlines; empty when `text` is `null` or empty
    private static List<Inline> parseInlines(String text) {
        var root = new Frame(FrameType.ROOT);
        if (text == null || text.isEmpty()) {
            return root.children;
        }
        var stack = new ArrayList<Frame>();
        stack.add(root);

        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            Frame top = stack.getLast();

            if (c == '\\') {
                boolean escapes = i + 1 < text.length() && isEscapable(text.charAt(i + 1));
                if (escapes) {
                    // Drop the backslash, keep the escaped character and skip it.
                    top.text.append(text.charAt(i + 1));
                    i++;
                }
                else {
                    // Lone backslash
                    top.text.append(c);
                }
                continue;
            }

            if (c == '*' || c == '_') {
                FrameType type = (c == '*') ? FrameType.BOLD : FrameType.ITALIC;
                if (top.type == type) {
                    // Close the current frame and attach it to its parent.
                    top.flushTextToChildren();
                    Inline node = (type == FrameType.BOLD) ? new Bold(top.children) : new Italic(top.children);
                    stack.removeLast();
                    stack.getLast().children.add(node);
                }
                else {
                    // Any other frame type (root included) accepts a nested frame.
                    stack.add(new Frame(type));
                }
                continue;
            }

            if (c == '|' && text.startsWith(TAB_TOKEN, i)) {
                // Flush pending text first so the tab lands after it.
                top.flushTextToChildren();
                top.children.add(new Tab());
                i += TAB_TOKEN.length() - 1;
                continue;
            }

            // Regular char
            top.text.append(c);
        }

        // Unwind: any unclosed frame becomes its literal marker + content as plain text in its parent.
        while (stack.size() > 1) {
            Frame unfinished = stack.removeLast();
            unfinished.flushTextToChildren();
            var literal = new StringBuilder();
            literal.append(unfinished.type == FrameType.BOLD ? '*' : '_');
            for (Inline child : unfinished.children) {
                literal.append(child.text());
            }
            stack.getLast().text.append(literal);
        }

        root.flushTextToChildren();
        return root.children;
    }

    /// Tells whether a backslash followed by `c` forms an escape sequence.
    private static boolean isEscapable(char c) {
        return c == '*' || c == '_' || c == '\\';
    }

    /// Converts all rows of the given table, header rows first, then body rows, then footer rows.
    ///
    /// @param table AsciidoctorJ table node
    ///
    /// @return the converted rows as an unmodifiable list
    private static List<AsciiDocModel.Row> extractTableRows(Table table) {
        return Stream.of(table.getHeader(), table.getBody(), table.getFooter())
                     .flatMap(Collection::stream)
                     .map(AsciiDocParser::convertRow)
                     .toList();
    }

    /// Converts one AsciidoctorJ row into a model row, cell by cell.
    private static AsciiDocModel.Row convertRow(Row row) {
        return new AsciiDocModel.Row(row.getCells()
                                        .stream()
                                        .map(AsciiDocParser::convertCell)
                                        .toList());
    }

    /// Converts one AsciidoctorJ cell into a model cell.
    ///
    /// The raw cell source is parsed rather than `getText()`: the latter returns the text after the converter has
    /// applied its substitutions, which would hide the `*`/`_` markers from the inline parser. Paragraphs are
    /// handled the same way, from their raw lines.
    private static AsciiDocModel.Cell convertCell(Cell cell) {
        return new AsciiDocModel.Cell(parseInlines(cell.getSource()));
    }

    /// Kind of frame tracked by the inline parser.
    private enum FrameType {
        ROOT,
        BOLD,
        ITALIC
    }

    /// Open inline container: the inlines collected so far plus the text not yet turned into a [Text] node.
    private static final class Frame {
        final FrameType type;
        final List<Inline> children = new ArrayList<>();
        final StringBuilder text = new StringBuilder();

        Frame(FrameType type) {
            this.type = type;
        }

        /// Moves the pending text, if any, into a new [Text] child and clears the buffer.
        void flushTextToChildren() {
            if (!text.isEmpty()) {
                children.add(new Text(text.toString()));
                text.setLength(0);
            }
        }
    }
}