AsciiDocParser.java
package pro.verron.officestamper.asciidoc;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import static pro.verron.officestamper.asciidoc.AsciiDocModel.*;
public final class AsciiDocParser
implements Function<String, AsciiDocModel> {
public static AsciiDocModel parse(String asciidoc) {
return new AsciiDocParser().apply(asciidoc);
}
public AsciiDocModel apply(String asciidoc) {
var blocks = new ArrayList<Block>();
if (asciidoc == null || asciidoc.isBlank()) {
return AsciiDocModel.of(blocks);
}
String[] lines = asciidoc.split("\r?\n");
StringBuilder currentParagraph = new StringBuilder();
boolean inTable = false;
boolean inBlockquote = false;
boolean inCodeBlock = false;
String currentLanguage = "";
List<Row> currentTableRows = new ArrayList<>();
StringBuilder currentBlockContent = new StringBuilder();
for (String line : lines) {
String trimmed = line.trim();
if (trimmed.equals("____")) {
if (inBlockquote) {
blocks.add(new Blockquote(parseInlines(currentBlockContent.toString()
.trim())));
currentBlockContent.setLength(0);
inBlockquote = false;
}
else {
if (!currentParagraph.isEmpty()) {
blocks.add(new Paragraph(parseInlines(currentParagraph.toString()
.trim())));
currentParagraph.setLength(0);
}
inBlockquote = true;
}
continue;
}
if (inBlockquote) {
if (!currentBlockContent.isEmpty()) {
currentBlockContent.append(" ");
}
currentBlockContent.append(trimmed);
continue;
}
if (trimmed.startsWith("[source")) {
int commaIndex = trimmed.indexOf(',');
if (commaIndex != -1) {
int bracketIndex = trimmed.indexOf(']');
currentLanguage = trimmed.substring(commaIndex + 1, bracketIndex)
.trim();
}
continue;
}
if (trimmed.equals("----")) {
if (inCodeBlock) {
blocks.add(new CodeBlock(currentLanguage,
currentBlockContent.toString()
.trim()));
currentBlockContent.setLength(0);
currentLanguage = "";
inCodeBlock = false;
}
else {
if (!currentParagraph.isEmpty()) {
blocks.add(new Paragraph(parseInlines(currentParagraph.toString()
.trim())));
currentParagraph.setLength(0);
}
inCodeBlock = true;
}
continue;
}
if (inCodeBlock) {
if (!currentBlockContent.isEmpty()) {
currentBlockContent.append("\n");
}
currentBlockContent.append(line); // Preserve indentation in code blocks
continue;
}
if (trimmed.startsWith("image::")) {
if (!currentParagraph.isEmpty()) {
blocks.add(new Paragraph(parseInlines(currentParagraph.toString()
.trim())));
currentParagraph.setLength(0);
}
int endUrl = trimmed.indexOf('[', 7);
if (endUrl != -1) {
int endText = trimmed.indexOf(']', endUrl);
if (endText != -1) {
String url = trimmed.substring(7, endUrl);
String altText = trimmed.substring(endUrl + 1, endText);
blocks.add(new ImageBlock(url, altText));
continue;
}
}
}
if (trimmed.equals("|===")) {
if (inTable) {
blocks.add(new Table(currentTableRows));
currentTableRows = new ArrayList<>();
inTable = false;
}
else {
if (!currentParagraph.isEmpty()) {
blocks.add(new Paragraph(parseInlines(currentParagraph.toString()
.trim())));
currentParagraph.setLength(0);
}
inTable = true;
}
continue;
}
if (inTable) {
if (trimmed.startsWith("|")) {
String[] cellTexts = trimmed.substring(1)
.split("\\|");
List<Cell> cells = new ArrayList<>();
for (String cellText : cellTexts) {
cells.add(Cell.ofInlines(parseInlines(cellText.trim())));
}
currentTableRows.add(new Row(cells));
}
continue;
}
if (trimmed.isBlank()) {
if (!currentParagraph.isEmpty()) {
blocks.add(new Paragraph(parseInlines(currentParagraph.toString()
.trim())));
currentParagraph.setLength(0);
}
continue;
}
// Check for Headings
if (trimmed.startsWith("=")) {
int level = 0;
while (level < trimmed.length() && trimmed.charAt(level) == '=') {
level++;
}
if (level > 0 && level <= 6 && level < trimmed.length()
&& Character.isWhitespace(trimmed.charAt(level))) {
if (!currentParagraph.isEmpty()) {
blocks.add(new Paragraph(parseInlines(currentParagraph.toString()
.trim())));
currentParagraph.setLength(0);
}
String title = trimmed.substring(level)
.trim();
blocks.add(new Heading(level, parseInlines(title)));
continue;
}
}
// Check for Unordered List Item
if (trimmed.startsWith("* ")) {
if (!currentParagraph.isEmpty()) {
blocks.add(new Paragraph(parseInlines(currentParagraph.toString()
.trim())));
currentParagraph.setLength(0);
}
String itemText = trimmed.substring(2)
.trim();
var item = new ListItem(parseInlines(itemText));
if (!blocks.isEmpty() && blocks.getLast() instanceof UnorderedList(List<ListItem> items1)) {
List<ListItem> items = new ArrayList<>(items1);
items.add(item);
blocks.set(blocks.size() - 1, new UnorderedList(items));
}
else {
blocks.add(new UnorderedList(List.of(item)));
}
continue;
}
// Check for Ordered List Item
if (trimmed.startsWith(". ")) {
if (!currentParagraph.isEmpty()) {
blocks.add(new Paragraph(parseInlines(currentParagraph.toString()
.trim())));
currentParagraph.setLength(0);
}
String itemText = trimmed.substring(2)
.trim();
var item = new ListItem(parseInlines(itemText));
if (!blocks.isEmpty() && blocks.getLast() instanceof OrderedList(List<ListItem> items1)) {
List<ListItem> items = new ArrayList<>(items1);
items.add(item);
blocks.set(blocks.size() - 1, new OrderedList(items));
}
else {
blocks.add(new OrderedList(List.of(item)));
}
continue;
}
// Otherwise, it's a paragraph part
if (!currentParagraph.isEmpty()) {
currentParagraph.append("\n");
}
currentParagraph.append(trimmed);
}
if (!currentParagraph.isEmpty()) {
blocks.add(new Paragraph(parseInlines(currentParagraph.toString()
.trim())));
}
return AsciiDocModel.of(blocks);
}
private static List<Inline> parseInlines(String text) {
// Stack-based inline parser with simple tokens for '*', '_', text, and escapes.
// Non-overlapping nesting is allowed; crossing markers are treated as plain text.
var root = new Frame(FrameType.ROOT);
var stack = new ArrayList<Frame>();
stack.add(root);
if (text == null || text.isEmpty()) {
return root.children;
}
for (int i = 0; i < text.length(); i++) {
char c = text.charAt(i);
// Escapes for '*', '_', and '\\'
if (c == '\\') {
if (i + 1 < text.length()) {
char next = text.charAt(i + 1);
if (next == '*' || next == '_' || next == '\\') {
stack.getLast().text.append(next);
i++;
continue;
}
}
// Lone backslash
stack.getLast().text.append(c);
continue;
}
if (c == '*' || c == '_') {
FrameType type = (c == '*') ? FrameType.BOLD : FrameType.ITALIC;
Frame top = stack.getLast();
if (top.type == type) {
// Close current frame
top.flushTextToChildren();
Inline node = (type == FrameType.BOLD) ? new Bold(top.children) : new Italic(top.children);
stack.removeLast();
Frame parent = stack.getLast();
parent.children.add(node);
}
else if (top.type == FrameType.BOLD || top.type == FrameType.ITALIC || top.type == FrameType.ROOT) {
// Open new frame
top.flushTextToChildren();
Frame f = new Frame(type);
stack.add(f);
}
else {
// Should not happen
stack.getLast().text.append(c);
}
continue;
}
// Detect literal |TAB| token -> emit a Tab inline
if (c == '|' && i + 4 < text.length() && text.charAt(i + 1) == 'T' && text.charAt(i + 2) == 'A'
&& text.charAt(i + 3) == 'B' && text.charAt(i + 4) == '|') {
// Flush any pending text
stack.getLast()
.flushTextToChildren();
stack.getLast().children.add(new Tab());
i += 4;
continue;
}
// Simple Link detection: https://example.com[Text]
if (c == 'h' && text.startsWith("http", i)) {
int endUrl = text.indexOf('[', i);
if (endUrl != -1) {
int endText = text.indexOf(']', endUrl);
if (endText != -1) {
stack.getLast()
.flushTextToChildren();
String url = text.substring(i, endUrl);
String linkText = text.substring(endUrl + 1, endText);
stack.getLast().children.add(new Link(url, linkText));
i = endText;
continue;
}
}
}
// Simple Image detection: image:url[AltText]
if (c == 'i' && text.startsWith("image:", i)) {
int endUrl = text.indexOf('[', i + 6);
if (endUrl != -1) {
int endText = text.indexOf(']', endUrl);
if (endText != -1) {
stack.getLast()
.flushTextToChildren();
String url = text.substring(i + 6, endUrl);
String title = text.substring(endUrl + 1, endText);
stack.getLast().children.add(new InlineImage(url, Map.of("title", title)));
i = endText;
continue;
}
}
}
// Regular char
stack.getLast().text.append(c);
}
// Unwind: any unclosed frames become literal markers + content as plain text in parent
while (stack.size() > 1) {
Frame unfinished = stack.removeLast();
char marker = unfinished.type == FrameType.BOLD ? '*' : '_';
unfinished.flushTextToChildren();
// Build literal: marker + children as text + (no closing marker since it is missing)
StringBuilder literal = new StringBuilder();
literal.append(marker);
for (Inline in : unfinished.children) {
literal.append(in.text());
}
stack.getLast().text.append(literal);
}
// Flush remainder text on root
root.flushTextToChildren();
return root.children;
}
private enum FrameType {
ROOT,
BOLD,
ITALIC
}
private static final class Frame {
final FrameType type;
final List<Inline> children = new ArrayList<>();
final StringBuilder text = new StringBuilder();
Frame(FrameType type) {this.type = type;}
void flushTextToChildren() {
if (!text.isEmpty()) {
children.add(new Text(text.toString()));
text.setLength(0);
}
}
}
}