Skip to content

Commit

Permalink
Merge pull request #319 from marklogic/feature/17220-max-chunk-size-e…
Browse files Browse the repository at this point in the history
…rror-fix

Added validation and nice errors for max sizes
  • Loading branch information
rjrudin authored Oct 10, 2024
2 parents 79d560b + d376ac8 commit 23edb1e
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 7 deletions.
10 changes: 8 additions & 2 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -66,18 +66,24 @@ dependencies {
exclude group: "com.fasterxml.jackson.core"
}

shadowDependencies "dev.langchain4j:langchain4j:0.35.0"

// Need this so that an OkHttpClientConfigurator can be created.
shadowDependencies 'com.squareup.okhttp3:okhttp:4.12.0'

// Supports reading and writing RDF data.
shadowDependencies ("org.apache.jena:jena-arq:4.10.0") {
exclude group: "com.fasterxml.jackson.core"
exclude group: "com.fasterxml.jackson.dataformat"
}

// Supports splitting documents.
shadowDependencies "dev.langchain4j:langchain4j:0.35.0"

// Needed for constructing XML documents and for splitting them.
shadowDependencies "org.jdom:jdom2:2.0.6.1"

// Needed for splitting XML documents via XPath.
shadowDependencies "jaxen:jaxen:2.0.0"

testImplementation ('com.marklogic:ml-app-deployer:5.0.0') {
exclude group: "com.fasterxml.jackson.core"
exclude group: "com.fasterxml.jackson.dataformat"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,19 @@
public abstract class DocumentSplitterFactory {

public static DocumentSplitter makeDocumentSplitter(ContextSupport context) {
if (context.hasOption(Options.WRITE_SPLITTER_REGEX)) {
return makeRegexSplitter(context);
try {
if (context.hasOption(Options.WRITE_SPLITTER_REGEX)) {
return makeRegexSplitter(context);
}
return makeDefaultSplitter(context);
} catch (IllegalArgumentException ex) {
String message = ex.getMessage();
if (message != null) {
message = massageLangchain4jError(context, message);
}
// Not including the underlying error so that the langchain4j details aren't exposed to the user.
throw new ConnectorException(String.format("Unable to create splitter for documents; cause: %s", message));
}
return makeDefaultSplitter(context);
}

private static DocumentSplitter makeDefaultSplitter(ContextSupport context) {
Expand Down Expand Up @@ -49,6 +58,22 @@ private static int getMaxOverlapSize(ContextSupport context) {
return (int) context.getNumericOption(Options.WRITE_SPLITTER_MAX_OVERLAP_SIZE, 0, 0);
}

/**
* langchain4j does a nice job with validating inputs, but we don't want langchain4j-specific argument names to
* appear in our error messages.
*/
private static String massageLangchain4jError(ContextSupport context, String message) {
if (message.contains("maxChunkSize")) {
String optionName = context.getOptionNameForMessage(Options.WRITE_SPLITTER_MAX_CHUNK_SIZE);
message = message.replace("maxChunkSize", optionName);
}
if (message.contains("maxOverlapSize")) {
String optionName = context.getOptionNameForMessage(Options.WRITE_SPLITTER_MAX_OVERLAP_SIZE);
message = message.replace("maxOverlapSize", optionName);
}
return message;
}

private DocumentSplitterFactory() {
}
}
2 changes: 2 additions & 0 deletions src/main/resources/marklogic-spark-messages.properties
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,5 @@ spark.marklogic.write.threadCountPerPartition=
spark.marklogic.write.transformParams=
spark.marklogic.write.uriTemplate=
spark.marklogic.write.xmlRootName=
spark.marklogic.write.splitter.maxChunkSize=
spark.marklogic.write.splitter.maxOverlapSize=
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,6 @@ void undeclaredNamespace() {
.write().format(CONNECTOR_IDENTIFIER)
.option(Options.CLIENT_URI, makeClientUri())
.option(Options.WRITE_SPLITTER_XML_PATH, "/ex:root/ex:text/text()")
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
.option(Options.WRITE_URI_TEMPLATE, "/namespace-test.xml")
.mode(SaveMode.Append);

ConnectorException ex = assertThrowsConnectorException(() -> writer.save());
Expand All @@ -78,6 +76,47 @@ void undeclaredNamespace() {
);
}

@Test
void overlapSizeGreaterThanChunkSize() {
DataFrameWriter writer = readDocument("/marklogic-docs/java-client-intro.xml")
.write().format(CONNECTOR_IDENTIFIER)
.option(Options.CLIENT_URI, makeClientUri())
.option(Options.WRITE_SPLITTER_XML_PATH, "/root/text/text()")
.option(Options.WRITE_SPLITTER_MAX_CHUNK_SIZE, 200)
.option(Options.WRITE_SPLITTER_MAX_OVERLAP_SIZE, 300)
.mode(SaveMode.Append);

ConnectorException ex = assertThrowsConnectorException(() -> writer.save());
assertEquals("Unable to create splitter for documents; cause: spark.marklogic.write.splitter.maxOverlapSize " +
"must be between 0 and 200, but is: 300", ex.getMessage().trim());
}

@Test
void chunkSizeBelowZero() {
DataFrameWriter writer = readDocument("/marklogic-docs/java-client-intro.xml")
.write().format(CONNECTOR_IDENTIFIER)
.option(Options.CLIENT_URI, makeClientUri())
.option(Options.WRITE_SPLITTER_XML_PATH, "/root/text/text()")
.option(Options.WRITE_SPLITTER_MAX_CHUNK_SIZE, -1)
.mode(SaveMode.Append);

ConnectorException ex = assertThrowsConnectorException(() -> writer.save());
assertEquals("The value of 'spark.marklogic.write.splitter.maxChunkSize' must be 0 or greater.", ex.getMessage());
}

@Test
void overlapSizeBelowZero() {
DataFrameWriter writer = readDocument("/marklogic-docs/java-client-intro.xml")
.write().format(CONNECTOR_IDENTIFIER)
.option(Options.CLIENT_URI, makeClientUri())
.option(Options.WRITE_SPLITTER_XML_PATH, "/root/text/text()")
.option(Options.WRITE_SPLITTER_MAX_OVERLAP_SIZE, -1)
.mode(SaveMode.Append);

ConnectorException ex = assertThrowsConnectorException(() -> writer.save());
assertEquals("The value of 'spark.marklogic.write.splitter.maxOverlapSize' must be 0 or greater.", ex.getMessage());
}

@Test
void regexSplitter() {
readDocument("/marklogic-docs/java-client-intro.xml")
Expand Down

0 comments on commit 23edb1e

Please sign in to comment.