SOLR-16420: {!mlt_content} (apache#1045)

* SOLR-16420: introducing {!mlt_content} accepting external content.
cowpaths · Nov 25, 2022 · 1eef75e · 1eef75e
1 parent a8719ba
commit 1eef75e
Show file tree

Hide file tree

Showing 12 changed files with 867 additions and 320 deletions.
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
@@ -71,6 +71,8 @@ Improvements
 * SOLR-11028: A v2 equivalent of the `/admin/collections?action= REPLACE` command is now available at
   `POST /api/cluster/nodes/nodeName/replace`. (Joshua Ouma via Jason Gerlowski)
 
+* SOLR-16420: Introducing `{!mlt_content}foo bar` to cover existing `/mlt` handler functionality for SolrCloud. (Mikhail Khludnev)
+
 Optimizations
 ---------------------
 
@@ -102,6 +104,8 @@ Bug Fixes
 
 * SOLR-16528: Jaegertracer module must include okhttp3 dependency (janhoy)
 
+* SOLR-16420: The default `mindf` of `{!mlt}` in cloud mode was changed from 0 to 5 to comply with the Reference Guide (Mikhail Khludnev)
+
 Build
 ---------------------
 * Upgrade forbiddenapis to 3.4 (Uwe Schindler)

diff --git a/solr/core/src/java/org/apache/solr/search/QParserPlugin.java b/solr/core/src/java/org/apache/solr/search/QParserPlugin.java
@@ -28,6 +28,7 @@
 import org.apache.solr.search.join.FiltersQParserPlugin;
 import org.apache.solr.search.join.GraphQParserPlugin;
 import org.apache.solr.search.join.HashRangeQParserPlugin;
+import org.apache.solr.search.mlt.MLTContentQParserPlugin;
 import org.apache.solr.search.mlt.MLTQParserPlugin;
 import org.apache.solr.search.neural.KnnQParserPlugin;
 import org.apache.solr.util.plugin.NamedListInitializedPlugin;
@@ -73,6 +74,7 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin, SolrI
     map.put(ReRankQParserPlugin.NAME, new ReRankQParserPlugin());
     map.put(ExportQParserPlugin.NAME, new ExportQParserPlugin());
     map.put(MLTQParserPlugin.NAME, new MLTQParserPlugin());
+    map.put(MLTContentQParserPlugin.NAME, new MLTContentQParserPlugin());
     map.put(HashQParserPlugin.NAME, new HashQParserPlugin());
     map.put(GraphQParserPlugin.NAME, new GraphQParserPlugin());
     map.put(XmlQParserPlugin.NAME, new XmlQParserPlugin());

diff --git a/solr/core/src/java/org/apache/solr/search/mlt/AbstractMLTQParser.java b/solr/core/src/java/org/apache/solr/search/mlt/AbstractMLTQParser.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.mlt;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Supplier;
+import java.util.regex.Pattern;
+import org.apache.lucene.queries.mlt.MoreLikeThis;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.StringUtils;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.search.QParser;
+import org.apache.solr.search.QueryUtils;
+import org.apache.solr.util.SolrPluginUtils;
+
+abstract class AbstractMLTQParser extends QParser { // shared MoreLikeThis query construction
+  // Splits qf values on ',' or ' '. Pattern is thread safe -- TODO? share this with 'fl' param
+  private static final Pattern splitList = Pattern.compile(",| ");
+
+  /** Retrieves the names of indexed, stored, non-numeric fields from the schema. */
+  protected String[] getFieldsFromSchema() {
+    Map<String, SchemaField> fieldDefinitions = req.getSearcher().getSchema().getFields();
+    ArrayList<String> fields = new ArrayList<>();
+    for (Map.Entry<String, SchemaField> entry : fieldDefinitions.entrySet()) {
+      if (entry.getValue().indexed() && entry.getValue().stored())
+        if (entry.getValue().getType().getNumberType() == null) fields.add(entry.getKey());
+    }
+    return fields.toArray(new String[0]);
+  }
+
+  /**
+   * Constructor for the QParser
+   *
+   * @param qstr The part of the query string specific to this parser
+   * @param localParams The set of parameters that are specific to this QParser. See
+   *     https://solr.apache.org/guide/solr/latest/query-guide/local-params.html
+   * @param params The rest of the {@link SolrParams}
+   * @param req The original {@link SolrQueryRequest}.
+   */
+  AbstractMLTQParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
+    super(qstr, localParams, params, req);
+  }
+
+  /** Requires boostedMLTQuery (MUST) and excludes docIdQuery (MUST_NOT), i.e. the current document */
+  public BooleanQuery exclude(BooleanQuery boostedMLTQuery, Query docIdQuery) {
+    BooleanQuery.Builder realMLTQuery = new BooleanQuery.Builder();
+    realMLTQuery.add(boostedMLTQuery, BooleanClause.Occur.MUST);
+    realMLTQuery.add(docIdQuery, BooleanClause.Occur.MUST_NOT);
+    return realMLTQuery.build();
+  }
+
+  @FunctionalInterface
+  protected interface MLTInvoker { // callback that turns a configured MoreLikeThis into a Query
+    Query invoke(MoreLikeThis mlt) throws IOException;
+  }
+
+  protected BooleanQuery parseMLTQuery( // variant that also excludes matches of docIdQuery
+      Supplier<String[]> fieldsFallback, MLTInvoker invoker, Query docIdQuery) throws IOException {
+    return exclude(parseMLTQuery(fieldsFallback, invoker), docIdQuery);
+  }
+
+  protected BooleanQuery parseMLTQuery(Supplier<String[]> fieldsFallback, MLTInvoker invoker)
+      throws IOException {
+    Map<String, Float> boostFields = new HashMap<>(); // field name -> boost, filled from qf only
+    MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader());
+
+    mlt.setMinTermFreq(localParams.getInt("mintf", MoreLikeThis.DEFAULT_MIN_TERM_FREQ));
+    // TODO: the default mindf used to be 0 in cloud mode and 5 in standalone mode
+    mlt.setMinDocFreq(localParams.getInt("mindf", MoreLikeThis.DEFAULT_MIN_DOC_FREQ));
+    mlt.setMinWordLen(localParams.getInt("minwl", MoreLikeThis.DEFAULT_MIN_WORD_LENGTH));
+    mlt.setMaxWordLen(localParams.getInt("maxwl", MoreLikeThis.DEFAULT_MAX_WORD_LENGTH));
+    mlt.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThis.DEFAULT_MAX_QUERY_TERMS));
+    mlt.setMaxNumTokensParsed(
+        localParams.getInt("maxntp", MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED));
+    mlt.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThis.DEFAULT_MAX_DOC_FREQ));
+
+    final boolean boost = localParams.getBool("boost", MoreLikeThis.DEFAULT_BOOST);
+    mlt.setBoost(boost);
+
+    mlt.setAnalyzer(req.getSchema().getIndexAnalyzer());
+
+    final String[] fieldNames;
+    String[] qf = localParams.getParams("qf");
+    if (qf != null) {
+      ArrayList<String> fields = new ArrayList<>();
+      for (String fieldName : qf) {
+        if (!StringUtils.isEmpty(fieldName)) {
+          String[] strings = splitList.split(fieldName);
+          for (String string : strings) {
+            if (!StringUtils.isEmpty(string)) {
+              fields.add(string);
+            }
+          }
+        }
+      }
+      // Parse field names and their boosts out of the split qf entries
+      boostFields.putAll(SolrPluginUtils.parseFieldBoosts(fields.toArray(new String[0])));
+      fieldNames = boostFields.keySet().toArray(new String[0]);
+    } else {
+      fieldNames = fieldsFallback.get(); // no qf given: let the caller choose the fields
+    }
+    if (fieldNames.length < 1) {
+      throw new SolrException(
+          SolrException.ErrorCode.BAD_REQUEST,
+          "MoreLikeThis requires at least one similarity field: qf");
+    }
+    mlt.setFieldNames(fieldNames);
+    final BooleanQuery rawMLTQuery = (BooleanQuery) invoker.invoke(mlt); // NOTE(review): unchecked
+
+    if (boost && boostFields.size() > 0) { // combine qf field boosts with the boosts MLT assigned
+      BooleanQuery.Builder newQ = new BooleanQuery.Builder();
+      newQ.setMinimumNumberShouldMatch(rawMLTQuery.getMinimumNumberShouldMatch());
+
+      for (BooleanClause clause : rawMLTQuery) {
+        Query q = clause.getQuery();
+        float originalBoost = 1f;
+        if (q instanceof BoostQuery) { // unwrap so the two boosts can be multiplied together
+          BoostQuery bq = (BoostQuery) q;
+          q = bq.getQuery();
+          originalBoost = bq.getBoost();
+        }
+        Float fieldBoost = boostFields.get(((TermQuery) q).getTerm().field()); // NOTE(review): cast
+        q =
+            ((fieldBoost != null)
+                ? new BoostQuery(q, fieldBoost * originalBoost)
+                : clause.getQuery()); // no qf boost for this field: keep the clause as it was
+        newQ.add(q, clause.getOccur());
+      }
+      return QueryUtils.build(newQ, this);
+    }
+    return rawMLTQuery;
+  }
+}
diff --git a/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java b/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java
@@ -23,36 +23,22 @@
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.Map;
-import java.util.regex.Pattern;
 import org.apache.lucene.index.IndexableField;
-import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.mlt.MoreLikeThis;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrException;
-import org.apache.solr.common.StringUtils;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.core.SolrCore;
-import org.apache.solr.legacy.LegacyNumericUtils;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.request.SolrQueryRequestBase;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.schema.SchemaField;
-import org.apache.solr.search.QParser;
 import org.apache.solr.search.QueryParsing;
-import org.apache.solr.search.QueryUtils;
-import org.apache.solr.util.SolrPluginUtils;
 
-public class CloudMLTQParser extends QParser {
-  // Pattern is thread safe -- TODO? share this with general 'fl' param
-  private static final Pattern splitList = Pattern.compile(",| ");
+public class CloudMLTQParser extends SimpleMLTQParser {
 
   public CloudMLTQParser(
       String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
@@ -69,65 +55,18 @@ public Query parse() {
           SolrException.ErrorCode.BAD_REQUEST,
           "Error completing MLT request. Could not fetch " + "document with id [" + id + "]");
     }
-
-    String[] qf = localParams.getParams("qf");
-    Map<String, Float> boostFields = new HashMap<>();
-    MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader());
-
-    mlt.setMinTermFreq(localParams.getInt("mintf", MoreLikeThis.DEFAULT_MIN_TERM_FREQ));
-    mlt.setMinDocFreq(localParams.getInt("mindf", 0));
-    mlt.setMinWordLen(localParams.getInt("minwl", MoreLikeThis.DEFAULT_MIN_WORD_LENGTH));
-    mlt.setMaxWordLen(localParams.getInt("maxwl", MoreLikeThis.DEFAULT_MAX_WORD_LENGTH));
-    mlt.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThis.DEFAULT_MAX_QUERY_TERMS));
-    mlt.setMaxNumTokensParsed(
-        localParams.getInt("maxntp", MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED));
-    mlt.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThis.DEFAULT_MAX_DOC_FREQ));
-
-    Boolean boost = localParams.getBool("boost", MoreLikeThis.DEFAULT_BOOST);
-    mlt.setBoost(boost);
-
-    mlt.setAnalyzer(req.getSchema().getIndexAnalyzer());
-
-    Map<String, Collection<Object>> filteredDocument = new HashMap<>();
-    String[] fieldNames;
-
-    if (qf != null) {
-      ArrayList<String> fields = new ArrayList<>();
-      for (String fieldName : qf) {
-        if (!StringUtils.isEmpty(fieldName)) {
-          String[] strings = splitList.split(fieldName);
-          for (String string : strings) {
-            if (!StringUtils.isEmpty(string)) {
-              fields.add(string);
-            }
-          }
-        }
-      }
-      // Parse field names and boosts from the fields
-      boostFields = SolrPluginUtils.parseFieldBoosts(fields.toArray(new String[0]));
-      fieldNames = boostFields.keySet().toArray(new String[0]);
-    } else {
-      ArrayList<String> fields = new ArrayList<>();
-      for (String field : doc.getFieldNames()) {
-        // Only use fields that are stored and have an explicit analyzer.
-        // This makes sense as the query uses tf/idf/.. for query construction.
-        // We might want to relook and change this in the future though.
-        SchemaField f = req.getSchema().getFieldOrNull(field);
-        if (f != null && f.stored() && f.getType().isExplicitAnalyzer()) {
-          fields.add(field);
-        }
-      }
-      fieldNames = fields.toArray(new String[0]);
+    try {
+      final Query docIdQuery = createIdQuery(req.getSchema().getUniqueKeyField().getName(), id);
+      return parseMLTQuery(() -> getFieldsFromDoc(doc), (mlt) -> likeDoc(mlt, doc), docIdQuery);
+    } catch (IOException e) {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Bad Request", e);
     }
+  }
 
-    if (fieldNames.length < 1) {
-      throw new SolrException(
-          SolrException.ErrorCode.BAD_REQUEST,
-          "MoreLikeThis requires at least one similarity field: qf");
-    }
+  protected Query likeDoc(MoreLikeThis moreLikeThis, SolrDocument doc) throws IOException {
+    Map<String, Collection<Object>> filteredDocument = new HashMap<>();
 
-    mlt.setFieldNames(fieldNames);
-    for (String field : fieldNames) {
+    for (String field : moreLikeThis.getFieldNames()) {
       Collection<Object> fieldValues = doc.getFieldValues(field);
       if (fieldValues != null) {
         Collection<Object> values = new ArrayList<>();
@@ -141,45 +80,21 @@ public Query parse() {
         filteredDocument.put(field, values);
       }
     }
+    return moreLikeThis.like(filteredDocument);
+  }
 
-    try {
-      Query rawMLTQuery = mlt.like(filteredDocument);
-      BooleanQuery boostedMLTQuery = (BooleanQuery) rawMLTQuery;
-
-      if (boost && boostFields.size() > 0) {
-        BooleanQuery.Builder newQ = new BooleanQuery.Builder();
-        newQ.setMinimumNumberShouldMatch(boostedMLTQuery.getMinimumNumberShouldMatch());
-
-        for (BooleanClause clause : boostedMLTQuery) {
-          Query q = clause.getQuery();
-          float originalBoost = 1f;
-          if (q instanceof BoostQuery) {
-            BoostQuery bq = (BoostQuery) q;
-            q = bq.getQuery();
-            originalBoost = bq.getBoost();
-          }
-          Float fieldBoost = boostFields.get(((TermQuery) q).getTerm().field());
-          q =
-              ((fieldBoost != null)
-                  ? new BoostQuery(q, fieldBoost * originalBoost)
-                  : clause.getQuery());
-          newQ.add(q, clause.getOccur());
-        }
-
-        boostedMLTQuery = QueryUtils.build(newQ, this);
+  protected String[] getFieldsFromDoc(SolrDocument doc) {
+    ArrayList<String> fields = new ArrayList<>();
+    for (String field : doc.getFieldNames()) {
+      // Only use fields that are stored and have an explicit analyzer.
+      // This makes sense as the query uses tf/idf/.. for query construction.
+      // We might want to relook and change this in the future though.
+      SchemaField f = req.getSchema().getFieldOrNull(field);
+      if (f != null && f.stored() && f.getType().isExplicitAnalyzer()) {
+        fields.add(field);
       }
-
-      // exclude current document from results
-      BooleanQuery.Builder realMLTQuery = new BooleanQuery.Builder();
-      realMLTQuery.add(boostedMLTQuery, BooleanClause.Occur.MUST);
-      realMLTQuery.add(
-          createIdQuery(req.getSchema().getUniqueKeyField().getName(), id),
-          BooleanClause.Occur.MUST_NOT);
-
-      return realMLTQuery.build();
-    } catch (IOException e) {
-      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Bad Request", e);
     }
+    return fields.toArray(new String[0]);
   }
 
   private SolrDocument getDocument(String id) {
@@ -195,18 +110,4 @@ private SolrDocument getDocument(String id) {
 
     return (SolrDocument) response.get("doc");
   }
-
-  private Query createIdQuery(String defaultField, String uniqueValue) {
-    return new TermQuery(
-        req.getSchema().getField(defaultField).getType().getNumberType() != null
-            ? createNumericTerm(defaultField, uniqueValue)
-            : new Term(defaultField, uniqueValue));
-  }
-
-  private Term createNumericTerm(String field, String uniqueValue) {
-    BytesRefBuilder bytesRefBuilder = new BytesRefBuilder();
-    bytesRefBuilder.grow(LegacyNumericUtils.BUF_SIZE_INT);
-    LegacyNumericUtils.intToPrefixCoded(Integer.parseInt(uniqueValue), 0, bytesRefBuilder);
-    return new Term(field, bytesRefBuilder.toBytesRef());
-  }
 }