Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable outputting the replacement value on PDFs #179

Merged
merged 3 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions docs/docs/filter_policies/pdf.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# PDF Redaction Configuration

PDF redaction can be configured through the `config.pdf` path of a policy.

The available options are:

| Key | Type | Default | Description |
|--------------------------|-----------|-------------|-----------------------------------------------------------------------------------------------------------------------------------|
| `redactionColor` | `string` | `black` | This is the color of the redaction boxes that are drawn over the PII. Available options are `white`, `black`, `red`, and `yellow` |
| `showReplacement` | `boolean` | `false` | If `true`, the replacement value produced by the filter's strategy is drawn on the redaction box in the PDF |
| `replacementFont` | `string` | `helvetica` | The font to use for the replacement output. Available options are `helvetica`, `times`, and `courier` |
| `replacementMaxFontSize` | `float` | `12` | The maximum font size for the replacement text. Best efforts will be made to fit the replacement text within the redaction box |
| `replacementFontColor` | `string` | `white` | The font color for the replacement. Available options match the `redactionColor` options |

### An Example PDF Configuration Policy

The following is an example policy setting the PDF redaction options.

```json
{
"name": "example-pdf-policy",
"identifiers": {
"emailAddress": {
"emailAddressFilterStrategies": [
{
"strategy": "REDACT",
"redactionFormat": "{{{REDACTED-%t}}}"
}
]
}
},
"config": {
"pdf": {
"redactionColor": "red",
"showReplacement": true,
"replacementFontColor": "yellow"
}
}
}
```
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,11 @@
import ai.philterd.phileas.model.policy.filters.strategies.rules.VinFilterStrategy;
import ai.philterd.phileas.model.policy.filters.strategies.rules.ZipCodeFilterStrategy;
import org.apache.commons.io.FileUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.File;
import java.io.IOException;
Expand All @@ -72,6 +77,7 @@
import java.util.Set;

public class EndToEndTestsHelper {
private static final Logger LOGGER = LogManager.getLogger(EndToEndTestsHelper.class);


public static Policy getPolicyWithSentiment(String policyName) throws IOException {
Expand Down Expand Up @@ -454,4 +460,16 @@ public static Policy getPolicyJustPhoneNumber(String policyName) {

}

/**
 * Checks whether the text extracted from a PDF contains the given string.
 *
 * <p>A warning is logged when the extracted text is blank, because a
 * {@code false} result is then not evidence of anything.
 *
 * @param doc the PDF document as raw bytes
 * @param needle the text to search for
 * @return {@code true} if the extracted text contains {@code needle}
 * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
 */
public static boolean documentContainsText(byte[] doc, String needle) throws IOException {
    try (PDDocument loaded = Loader.loadPDF(doc)) {
        final String extracted = new PDFTextStripper().getText(loaded);
        final boolean blank = extracted.trim().isEmpty();

        if (blank) {
            LOGGER.warn("documentContainsText called on a PDF with no text streams");
        }

        return extracted.contains(needle);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import java.util.List;
import java.util.Properties;

import static ai.philterd.test.phileas.services.EndToEndTestsHelper.documentContainsText;
import static ai.philterd.test.phileas.services.EndToEndTestsHelper.getPdfPolicy;
import static ai.philterd.test.phileas.services.EndToEndTestsHelper.getPolicy;

Expand Down Expand Up @@ -106,6 +107,8 @@ public void pdf1() throws Exception {
final byte[] document = IOUtils.toByteArray(is);
is.close();

Assertions.assertTrue(documentContainsText(document, "Wendy"));

final Path temp = Files.createTempDirectory("philter");

final File file1 = Paths.get(temp.toFile().getAbsolutePath(), "pdf.json").toFile();
Expand All @@ -131,8 +134,10 @@ public void pdf1() throws Exception {
LOGGER.info("Spans: {}", response.getExplanation().appliedSpans().size());
showSpans(response.getExplanation().appliedSpans());

// TODO: How to assert? MD5 gives a different value each time.

// TODO: This only asserts that the value is absent from the PDF's extracted text.
// It could still be visible inside an embedded image, so we would need to OCR
// the output for this assertion to be truly valuable.
Assertions.assertFalse(documentContainsText(response.getDocument(), "Wendy"));
}

@Test
Expand All @@ -142,6 +147,8 @@ public void pdf2() throws Exception {
final byte[] document = IOUtils.toByteArray(is);
is.close();

Assertions.assertTrue(documentContainsText(document, "90210"));

final Path temp = Files.createTempDirectory("philter");

final File file1 = Paths.get(temp.toFile().getAbsolutePath(), "pdf.json").toFile();
Expand Down Expand Up @@ -170,7 +177,10 @@ public void pdf2() throws Exception {
// output:
// characterStart: 35; characterEnd: 40; filterType: zip-code; context: context; documentId: documentid; confidence: 0.9; text: 90210; replacement: {{{REDACTED-zip-code}}}; salt: ; ignored: false; classification: null;

// TODO: How to assert? MD5 gives a different value each time.
// TODO: This only asserts that the value is absent from the PDF's extracted text.
// It could still be visible inside an embedded image, so we would need to OCR
// the output for this assertion to be truly valuable.
Assertions.assertFalse(documentContainsText(response.getDocument(), "90210"));

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,55 @@

public class Pdf {

@SerializedName("enabled")
@SerializedName("redactionColor")
@Expose
private String redactionColor = "black";

@SerializedName("showReplacement")
@Expose
private boolean showReplacement = false;

@SerializedName("replacementFont")
@Expose
private String replacementFont = "helvetica";

@SerializedName("replacementMaxFontSize")
@Expose
private float replacementMaxFontSize = 12;

@SerializedName("replacementFontColor")
@Expose
private String replacementFontColor;

public String getRedactionColor() {
return redactionColor;
}

public void setRedactionColor(String redactionColor) {
this.redactionColor = redactionColor;
public void setRedactionColor(String replacementColor) {
this.redactionColor = replacementColor;
}

public String getReplacementFont() {
return replacementFont;
}

public void setReplacementFont(String replacementFont) {
this.replacementFont = replacementFont;
}

public float getReplacementMaxFontSize() {
return replacementMaxFontSize;
}

public String getReplacementFontColor() {
return replacementFontColor;
}

public boolean getShowReplacement() {
return showReplacement;
}

public void setShowReplacement(boolean showReplacement) {
this.showReplacement = showReplacement;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
Expand All @@ -44,8 +47,19 @@
import javax.imageio.ImageWriteParam;
import javax.imageio.ImageWriter;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.*;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

Expand All @@ -64,13 +78,24 @@ public class PdfRedacter extends PDFTextStripper implements Redacter {
private final List<BoundingBox> boundingBoxes;

private static final Map<String, PDColor> COLORS = new LinkedHashMap<>();
private static final Map<String, PDFont> FONTS = new LinkedHashMap<>();

static {
    // DeviceRGB components are expressed in the range 0.0 to 1.0, so every
    // entry uses that scale (the previous 0-255 values for white and red
    // were out of range and inconsistent with the yellow entry).
    COLORS.put("white", new PDColor(new float[]{1, 1, 1}, PDDeviceRGB.INSTANCE));
    COLORS.put("black", new PDColor(new float[]{0, 0, 0}, PDDeviceRGB.INSTANCE));
    COLORS.put("red", new PDColor(new float[]{1, 0, 0}, PDDeviceRGB.INSTANCE));
    COLORS.put("yellow", new PDColor(new float[]{1, 1, 100 / 255F}, PDDeviceRGB.INSTANCE));

    // Fonts selectable by name for the replacement text.
    FONTS.put("helvetica", new PDType1Font(Standard14Fonts.FontName.HELVETICA));
    FONTS.put("times", new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN));
    FONTS.put("courier", new PDType1Font(Standard14Fonts.FontName.COURIER));
}

private final boolean showReplacement;
private final float replacementFontSize;
private final PDFont replacementFont;
private final PDColor replacementFontColor;

public PdfRedacter(Policy policy,
Set<Span> spans, PdfRedactionOptions pdfRedactionOptions,
List<BoundingBox> boundingBoxes) throws IOException {
Expand All @@ -79,6 +104,10 @@ public PdfRedacter(Policy policy,
this.spans = spans;
this.pdfRedactionOptions = pdfRedactionOptions;
this.boundingBoxes = boundingBoxes;
this.showReplacement = policy.getConfig().getPdf().getShowReplacement();
this.replacementFont = FONTS.getOrDefault(policy.getConfig().getPdf().getReplacementFont(), FONTS.get("helvetica"));
this.replacementFontSize = policy.getConfig().getPdf().getReplacementMaxFontSize();
this.replacementFontColor = COLORS.getOrDefault(policy.getConfig().getPdf().getReplacementFontColor(), COLORS.get("white"));

}

Expand Down Expand Up @@ -209,27 +238,63 @@ protected void endDocument(PDDocument doc) throws IOException {
for(int pageNumber : rectangles.keySet()) {

final PDPage page = document.getPage(pageNumber);
final PDPageContentStream contentStream = new PDPageContentStream(doc, page, PDPageContentStream.AppendMode.APPEND, true, true);
final PDPageContentStream rectContentStream = new PDPageContentStream(doc, page, PDPageContentStream.AppendMode.APPEND, true, true);
final PDPageContentStream textContentStream = new PDPageContentStream(doc, page, PDPageContentStream.AppendMode.APPEND, true, true);

for(final RedactedRectangle rectangle : rectangles.get(pageNumber)) {

contentStream.addRect(
rectContentStream.addRect(
rectangle.getPdRectangle().getLowerLeftX(),
rectangle.getPdRectangle().getLowerLeftY() - 3,
rectangle.getPdRectangle().getWidth(),
rectangle.getPdRectangle().getHeight() + buffer);

if(showReplacement) {
addReplacementTextToRect(rectangle, textContentStream);
}
}

// Get the color based on the filter.
final PDColor pdColor = COLORS.getOrDefault(policy.getConfig().getPdf().getRedactionColor(), COLORS.get("black"));
contentStream.setNonStrokingColor(pdColor);
contentStream.setRenderingMode(RenderingMode.FILL);
contentStream.fill();
contentStream.close();
rectContentStream.setNonStrokingColor(pdColor);
rectContentStream.setRenderingMode(RenderingMode.FILL);
rectContentStream.fill();
rectContentStream.close();

textContentStream.close();

}

}

/**
 * Writes a span's replacement text, horizontally centered, onto its redaction rectangle.
 *
 * <p>The text starts at the configured maximum font size and is shrunk one
 * point at a time until it fits the rectangle's width. The size is never
 * reduced below one point, so a very narrow rectangle cannot produce a zero
 * or negative font size. Nothing is drawn when the span has no replacement.
 *
 * @param rectangle the redacted rectangle, with the span whose replacement is drawn
 * @param textContentStream the content stream the text operators are written to
 * @throws IOException if the text cannot be measured or written to the stream
 */
public void addReplacementTextToRect(RedactedRectangle rectangle, PDPageContentStream textContentStream) throws IOException {
    final var replacementText = rectangle.getSpan().getReplacement();
    if (replacementText == null || replacementText.isEmpty()) {
        // Nothing to draw; this also avoids a NullPointerException when measuring the text.
        return;
    }

    final var rectangleWidth = rectangle.getPdRectangle().getWidth();
    final var rectangleHeight = rectangle.getPdRectangle().getHeight();

    // Width of the text at a font size of 1 (the font reports widths in 1/1000 units).
    // It does not depend on the font size, so it is measured once outside the loop.
    final float unitTextWidth = replacementFont.getStringWidth(replacementText) / 1000.0f;

    // Shrink until the text fits, but never below the minimum size.
    final float minFontSize = 1.0f;
    float boxFontSize = replacementFontSize;
    while (unitTextWidth * boxFontSize > rectangleWidth && boxFontSize > minFontSize) {
        boxFontSize = Math.max(boxFontSize - 1, minFontSize);
    }
    final float textWidth = unitTextWidth * boxFontSize;

    // Y position is actually based on the font's "baseline", so we use the descent
    // (how far the font goes under the baseline) for the height calculation
    final var textDescent = (replacementFont.getFontDescriptor().getDescent() / 1000.0f) * boxFontSize;

    final var textXLocation = (rectangle.getPdRectangle().getLowerLeftX() +
            ((rectangleWidth / 2.0f) - (textWidth / 2.0f)));

    final var textYLocation = (rectangle.getPdRectangle().getLowerLeftY() +
            ((rectangleHeight / 2.0f) + (textDescent / 2.0f)));

    textContentStream.beginText();
    textContentStream.setNonStrokingColor(replacementFontColor);
    textContentStream.setFont(replacementFont, boxFontSize);
    textContentStream.newLineAtOffset(textXLocation, textYLocation);
    textContentStream.showText(replacementText);
    textContentStream.endText();
}

@Override
Expand Down
Loading
Loading