DocxImageExtractor.java

package pro.verron.officestamper.core;

import jakarta.xml.bind.JAXBElement;
import org.docx4j.dml.picture.Pic;
import org.docx4j.dml.wordprocessingDrawing.Inline;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.wml.Drawing;
import org.docx4j.wml.R;
import pro.verron.officestamper.api.OfficeStamperException;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

/// Extracts images from a docx document.
///
/// The extractor resolves the picture embedded in a run, follows its relationship id to the
/// corresponding part of the main document, and reads the raw bytes of that part from the
/// package's source part store.
///
/// @author Joseph Verron
/// @version ${version}
/// @since 1.4.7
public class DocxImageExtractor {

    private final WordprocessingMLPackage wordprocessingMLPackage;

    /// Creates a new image extractor for the given docx document.
    ///
    /// @param wordprocessingMLPackage the docx document to extract images from.
    public DocxImageExtractor(WordprocessingMLPackage wordprocessingMLPackage) {
        this.wordprocessingMLPackage = wordprocessingMLPackage;
    }

    /// Extract an image bytes from an embedded image run.
    ///
    /// @param run run containing the embedded drawing.
    ///
    /// @return the bytes of the image part referenced by the run's drawing.
    ///
    /// @throws OfficeStamperException if the run holds no inline drawing, or if the image part
    ///                                cannot be sized, opened or read.
    byte[] getRunDrawingData(R run) {
        String imageRelId = getPic(run).getBlipFill()
                                       .getBlip()
                                       .getEmbed();
        String imageRelPartName = getImageRelPartName(imageRelId);
        long size = getImageSize(imageRelPartName);
        InputStream stream = getImageStream(imageRelPartName);
        // streamToByteArray takes ownership of the stream and closes it.
        return streamToByteArray(size, stream);
    }

    /// Finds the picture of the first drawing found in the content of the run.
    ///
    /// @param run run to search for a drawing.
    ///
    /// @return the picture of the first drawing of the run.
    ///
    /// @throws OfficeStamperException if the run contains no drawing.
    private static Pic getPic(R run) {
        var content = run.getContent();
        for (Object runContent : content) {
            if (runContent instanceof JAXBElement<?> runElement && runElement.getValue() instanceof Drawing drawing) {
                return getPic(drawing);
            }
        }
        throw new OfficeStamperException("Run drawing not found !");
    }

    /// Resolves a relationship id of the main document part to the name of the related part,
    /// without its leading character.
    ///
    /// @param imageRelId relationship id of the image, as found in the picture's blip.
    ///
    /// @return the part name with its first character removed.
    private String getImageRelPartName(String imageRelId) {
        // TODO: find a better way to find image rel part name in source part store
        return wordprocessingMLPackage.getMainDocumentPart()
                                      .getRelationshipsPart()
                                      .getPart(imageRelId)
                                      .getPartName()
                                      .getName()
                                      // drop the first character of the part name
                                      // (presumably the leading '/' — confirm against the part store keys)
                                      .substring(1);
    }

    /// Asks the source part store for the size of a part.
    ///
    /// @param imageRelPartName name of the part in the source part store.
    ///
    /// @return the size reported by the source part store.
    ///
    /// @throws OfficeStamperException wrapping any [Docx4JException] raised by the store.
    private long getImageSize(String imageRelPartName) {
        try {
            return wordprocessingMLPackage.getSourcePartStore()
                                          .getPartSize(imageRelPartName);
        } catch (Docx4JException e) {
            throw new OfficeStamperException(e);
        }
    }

    /// Opens a stream on a part of the source part store.
    ///
    /// @param imageRelPartName name of the part in the source part store.
    ///
    /// @return a non-null stream on the part content; the caller is responsible for closing it.
    ///
    /// @throws OfficeStamperException if the store yields no stream, or wrapping any
    ///                                [Docx4JException] raised by the store.
    private InputStream getImageStream(String imageRelPartName) {
        InputStream stream;
        try {
            stream = wordprocessingMLPackage.getSourcePartStore()
                                            .loadPart(imageRelPartName);
        } catch (Docx4JException e) {
            throw new OfficeStamperException(e);
        }
        // Defensive: fail with a domain exception rather than a later NullPointerException
        // should the store hand back no stream for this part.
        if (stream == null) throw new OfficeStamperException("Image part not found: " + imageRelPartName);
        return stream;
    }

    /// Converts an InputStream to a byte array, then closes the stream.
    ///
    /// At most `size` bytes are read. Reading continues until `size` bytes have been obtained or
    /// the end of the stream is reached, so a stream that delivers its content in several chunks
    /// is not truncated, and an empty stream yields an empty array.
    ///
    /// @param size expected size of the byte array.
    /// @param is   input stream to read data from; closed by this method in every case.
    ///
    /// @return the data from the input stream.
    ///
    /// @throws OfficeStamperException if `size` is negative or exceeds [Integer#MAX_VALUE], or if
    ///                                reading or closing the stream fails.
    private static byte[] streamToByteArray(long size, InputStream is) {
        // The guards live inside the try block so the stream is closed even when they fail.
        try (InputStream in = is) {
            if (size > Integer.MAX_VALUE) throw new OfficeStamperException("Image size exceeds maximum allowed (2GB)");
            if (size < 0) throw new OfficeStamperException("Image size is invalid: " + size);

            // A single read(byte[]) call may return fewer bytes than requested, or -1 at end of
            // stream; readNBytes loops until the requested count or end of stream is reached.
            return in.readNBytes((int) size);
        } catch (IOException e) {
            throw new OfficeStamperException(e);
        }
    }

    /// Extracts the picture of a drawing, which must start with an inline element.
    ///
    /// @param drawing drawing to extract the picture from.
    ///
    /// @return the picture held by the graphic data of the first inline element.
    ///
    /// @throws OfficeStamperException if the drawing is empty, or if its first element is not an
    ///                                [Inline] (anchored drawings are not supported).
    private static Pic getPic(Drawing drawing) {
        var anchorOrInline = drawing.getAnchorOrInline();
        if (anchorOrInline.isEmpty()) throw new OfficeStamperException("Anchor or Inline is empty !");
        if (!(anchorOrInline.getFirst() instanceof Inline inline))
            throw new OfficeStamperException("Don't know how to process anchor !");
        return inline.getGraphic()
                     .getGraphicData()
                     .getPic();
    }

    /// Extract the width of the image from an embedded image run.
    ///
    /// The width is the `cx` extent of the picture's transform, narrowed to an `int`.
    ///
    /// @param run run containing the embedded drawing.
    ///
    /// @return the `cx` extent of the picture found in the run.
    ///
    /// @throws OfficeStamperException if the run holds no inline drawing.
    public Integer getRunDrawingMaxWidth(R run) {
        return (int) getPic(run).getSpPr()
                                .getXfrm()
                                .getExt()
                                .getCx();
    }
}