PlaceholderHooker.java
package pro.verron.officestamper.api;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.utils.TraversalUtilVisitor;
import org.docx4j.wml.P;
import pro.verron.officestamper.utils.wml.WmlUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import static pro.verron.officestamper.utils.wml.WmlUtils.asString;
import static pro.verron.officestamper.utils.wml.WmlUtils.insertSmartTag;
/// The [PlaceholderHooker] class is a pre-processor that prepares inline placeholders in a `WordprocessingML`
/// document. It searches for placeholders that match a given pattern and wraps them with a specified XML element to
/// ensure proper processing by the OfficeStamper engine.
///
/// This pre-processor is typically used to identify and mark inline expressions within paragraphs, making them
/// recognizable for subsequent processing steps.
public class PlaceholderHooker
implements PreProcessor {
private final Pattern pattern;
private final String element;
/// Constructs a new [PlaceholderHooker] instance with the specified regular expression and XML element
/// name.
///
/// @param regex the regular expression pattern used to identify inline placeholders in the document. This
/// pattern should contain at least two capturing groups where the second group represents the actual
/// placeholder content.
/// @param element the name of the XML element to wrap around identified placeholders. This element will be
/// used to mark the placeholders for further processing.
public PlaceholderHooker(String regex, String element) {
this(Pattern.compile(regex, Pattern.DOTALL), element);
}
/// Constructs a new [PlaceholderHooker] instance with the specified pattern and XML element name.
///
/// @param pattern the compiled regular expression pattern used to identify inline placeholders in the
/// document. This pattern should contain at least two capturing groups where the second group represents
/// the actual placeholder content.
/// @param element the name of the XML element to wrap around identified placeholders. This element will be
/// used to mark the placeholders for further processing.
public PlaceholderHooker(Pattern pattern, String element) {
this.pattern = pattern;
this.element = element;
}
@Override
public void process(WordprocessingMLPackage document) {
var visitor = new ParagraphCollector(pattern);
WmlUtils.visitDocument(document, visitor);
for (var paragraph : visitor.paragraphs()) {
var string = asString(paragraph);
var matcher = pattern.matcher(string);
// Iterates matches; replaces placeholder with a smart tag
while (matcher.find()) {
var start = matcher.start(1);
var end = matcher.end(1);
var placeholder = matcher.group(2);
var newContent = insertSmartTag(element, paragraph, placeholder, start, end);
var content = paragraph.getContent();
content.clear();
content.addAll(newContent);
string = asString(paragraph);
matcher = pattern.matcher(string);
}
}
}
/// A [TraversalUtilVisitor] implementation that collects paragraphs matching a given pattern.
///
/// This class is used to traverse a document and collect all paragraph elements ([P]) that match a specified
/// regular expression pattern. The collected paragraphs can be retrieved using the [#paragraphs()] method.
public static class ParagraphCollector
extends TraversalUtilVisitor<P> {
private final Pattern pattern;
private final List<P> results = new ArrayList<>();
/// Constructs a new [ParagraphCollector] with the specified pattern.
///
/// @param pattern the regular expression pattern to match against paragraphs
public ParagraphCollector(Pattern pattern) {
this.pattern = pattern;
}
@Override
public void apply(P element) {
var matcher = pattern.asPredicate();
var string = asString(element);
if (matcher.test(string)) {
results.add(element);
}
}
/// Returns the list of collected paragraphs that matched the pattern.
///
/// @return an unmodifiable list of paragraphs matching the specified pattern
public List<P> paragraphs() {
return results;
}
}
}