From 1eef75edf2f10189f69014fdb6faaa6fff63788c Mon Sep 17 00:00:00 2001 From: Mikhail Khludnev Date: Fri, 25 Nov 2022 15:36:14 +0300 Subject: [PATCH] SOLR-16420: {!mlt_content} (#1045) * SOLR-16420: introducing {!mlt_content} accepting external content. --- solr/CHANGES.txt | 4 + .../org/apache/solr/search/QParserPlugin.java | 2 + .../solr/search/mlt/AbstractMLTQParser.java | 157 +++++++++ .../solr/search/mlt/CloudMLTQParser.java | 143 ++------- .../search/mlt/MLTContentQParserPlugin.java | 68 ++++ .../solr/search/mlt/SimpleMLTQParser.java | 108 +------ .../apache/solr/search/QueryEqualityTest.java | 14 + .../mlt/CloudMLTQContentParserTest.java | 297 ++++++++++++++++++ .../solr/search/mlt/CloudMLTQParserTest.java | 41 ++- .../mlt/SimpleMLTContentQParserTest.java | 192 +++++++++++ .../solr/search/mlt/SimpleMLTQParserTest.java | 154 ++++----- .../query-guide/pages/morelikethis.adoc | 7 +- 12 files changed, 867 insertions(+), 320 deletions(-) create mode 100644 solr/core/src/java/org/apache/solr/search/mlt/AbstractMLTQParser.java create mode 100644 solr/core/src/java/org/apache/solr/search/mlt/MLTContentQParserPlugin.java create mode 100644 solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQContentParserTest.java create mode 100644 solr/core/src/test/org/apache/solr/search/mlt/SimpleMLTContentQParserTest.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index e90b6502ddf..2be67187f01 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -71,6 +71,8 @@ Improvements * SOLR-11028: A v2 equivalent of the `/admin/collections?action= REPLACE` command is now available at `POST /api/cluster/nodes/nodeName/replace`. (Joshua Ouma via Jason Gerlowski) +* SOLR-16420: Introducing `{!mlt_content}foo bar` to cover existing `/mlt` handler functionality for SolrCloud. 
(Mikhail Khludnev) + Optimizations --------------------- @@ -102,6 +104,8 @@ Bug Fixes * SOLR-16528: Jaegertracer module must include okhttp3 dependency (janhoy) +* SOLR-16420: Default for cloud mode was fixed to `{!mlt mindf=5}` to comply with Reference Guide (Mikhail Khludnev) + Build --------------------- * Upgrade forbiddenapis to 3.4 (Uwe Schindler) diff --git a/solr/core/src/java/org/apache/solr/search/QParserPlugin.java b/solr/core/src/java/org/apache/solr/search/QParserPlugin.java index 84f41c99ee3..99cb9efb0c7 100644 --- a/solr/core/src/java/org/apache/solr/search/QParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/QParserPlugin.java @@ -28,6 +28,7 @@ import org.apache.solr.search.join.FiltersQParserPlugin; import org.apache.solr.search.join.GraphQParserPlugin; import org.apache.solr.search.join.HashRangeQParserPlugin; +import org.apache.solr.search.mlt.MLTContentQParserPlugin; import org.apache.solr.search.mlt.MLTQParserPlugin; import org.apache.solr.search.neural.KnnQParserPlugin; import org.apache.solr.util.plugin.NamedListInitializedPlugin; @@ -73,6 +74,7 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin, SolrI map.put(ReRankQParserPlugin.NAME, new ReRankQParserPlugin()); map.put(ExportQParserPlugin.NAME, new ExportQParserPlugin()); map.put(MLTQParserPlugin.NAME, new MLTQParserPlugin()); + map.put(MLTContentQParserPlugin.NAME, new MLTContentQParserPlugin()); map.put(HashQParserPlugin.NAME, new HashQParserPlugin()); map.put(GraphQParserPlugin.NAME, new GraphQParserPlugin()); map.put(XmlQParserPlugin.NAME, new XmlQParserPlugin()); diff --git a/solr/core/src/java/org/apache/solr/search/mlt/AbstractMLTQParser.java b/solr/core/src/java/org/apache/solr/search/mlt/AbstractMLTQParser.java new file mode 100644 index 00000000000..1cd25c44dd4 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/mlt/AbstractMLTQParser.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more 
+ * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.search.mlt; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import java.util.function.Supplier; +import java.util.regex.Pattern; +import org.apache.lucene.queries.mlt.MoreLikeThis; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.StringUtils; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.search.QParser; +import org.apache.solr.search.QueryUtils; +import org.apache.solr.util.SolrPluginUtils; + +abstract class AbstractMLTQParser extends QParser { + // Pattern is thread safe -- TODO? 
share this with general 'fl' param + private static final Pattern splitList = Pattern.compile(",| "); + + /** Retrieves text and string fields from the schema */ + protected String[] getFieldsFromSchema() { + Map fieldDefinitions = req.getSearcher().getSchema().getFields(); + ArrayList fields = new ArrayList<>(); + for (Map.Entry entry : fieldDefinitions.entrySet()) { + if (entry.getValue().indexed() && entry.getValue().stored()) + if (entry.getValue().getType().getNumberType() == null) fields.add(entry.getKey()); + } + return fields.toArray(new String[0]); + } + + /** + * Constructor for the QParser + * + * @param qstr The part of the query string specific to this parser + * @param localParams The set of parameters that are specific to this QParser. See + * https://solr.apache.org/guide/solr/latest/query-guide/local-params.html + * @param params The rest of the {@link SolrParams} + * @param req The original {@link SolrQueryRequest}. + */ + AbstractMLTQParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) { + super(qstr, localParams, params, req); + } + + /** exclude current document from results */ + public BooleanQuery exclude(BooleanQuery boostedMLTQuery, Query docIdQuery) { + BooleanQuery.Builder realMLTQuery = new BooleanQuery.Builder(); + realMLTQuery.add(boostedMLTQuery, BooleanClause.Occur.MUST); + realMLTQuery.add(docIdQuery, BooleanClause.Occur.MUST_NOT); + return realMLTQuery.build(); + } + + @FunctionalInterface + protected interface MLTInvoker { + Query invoke(MoreLikeThis mlt) throws IOException; + } + + protected BooleanQuery parseMLTQuery( + Supplier fieldsFallback, MLTInvoker invoker, Query docIdQuery) throws IOException { + return exclude(parseMLTQuery(fieldsFallback, invoker), docIdQuery); + } + + protected BooleanQuery parseMLTQuery(Supplier fieldsFallback, MLTInvoker invoker) + throws IOException { + Map boostFields = new HashMap<>(); + MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader()); + + 
mlt.setMinTermFreq(localParams.getInt("mintf", MoreLikeThis.DEFAULT_MIN_TERM_FREQ)); + // TODO def mindf was 0 for cloud, 5 for standalone + mlt.setMinDocFreq(localParams.getInt("mindf", MoreLikeThis.DEFAULT_MIN_DOC_FREQ)); + mlt.setMinWordLen(localParams.getInt("minwl", MoreLikeThis.DEFAULT_MIN_WORD_LENGTH)); + mlt.setMaxWordLen(localParams.getInt("maxwl", MoreLikeThis.DEFAULT_MAX_WORD_LENGTH)); + mlt.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThis.DEFAULT_MAX_QUERY_TERMS)); + mlt.setMaxNumTokensParsed( + localParams.getInt("maxntp", MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED)); + mlt.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThis.DEFAULT_MAX_DOC_FREQ)); + + final boolean boost = localParams.getBool("boost", MoreLikeThis.DEFAULT_BOOST); + mlt.setBoost(boost); + + mlt.setAnalyzer(req.getSchema().getIndexAnalyzer()); + + final String[] fieldNames; + String[] qf = localParams.getParams("qf"); + if (qf != null) { + ArrayList fields = new ArrayList<>(); + for (String fieldName : qf) { + if (!StringUtils.isEmpty(fieldName)) { + String[] strings = splitList.split(fieldName); + for (String string : strings) { + if (!StringUtils.isEmpty(string)) { + fields.add(string); + } + } + } + } + // Parse field names and boosts from the fields + boostFields.putAll(SolrPluginUtils.parseFieldBoosts(fields.toArray(new String[0]))); + fieldNames = boostFields.keySet().toArray(new String[0]); + } else { + fieldNames = fieldsFallback.get(); + } + if (fieldNames.length < 1) { + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, + "MoreLikeThis requires at least one similarity field: qf"); + } + mlt.setFieldNames(fieldNames); + final BooleanQuery rawMLTQuery = (BooleanQuery) invoker.invoke(mlt); + + if (boost && boostFields.size() > 0) { + BooleanQuery.Builder newQ = new BooleanQuery.Builder(); + newQ.setMinimumNumberShouldMatch(rawMLTQuery.getMinimumNumberShouldMatch()); + + for (BooleanClause clause : rawMLTQuery) { + Query q = clause.getQuery(); + float 
originalBoost = 1f; + if (q instanceof BoostQuery) { + BoostQuery bq = (BoostQuery) q; + q = bq.getQuery(); + originalBoost = bq.getBoost(); + } + Float fieldBoost = boostFields.get(((TermQuery) q).getTerm().field()); + q = + ((fieldBoost != null) + ? new BoostQuery(q, fieldBoost * originalBoost) + : clause.getQuery()); + newQ.add(q, clause.getOccur()); + } + return QueryUtils.build(newQ, this); + } + return rawMLTQuery; + } +} diff --git a/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java b/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java index d1316564c7d..90ff5949138 100644 --- a/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java +++ b/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java @@ -23,36 +23,22 @@ import java.util.Collection; import java.util.HashMap; import java.util.Map; -import java.util.regex.Pattern; import org.apache.lucene.index.IndexableField; -import org.apache.lucene.index.Term; import org.apache.lucene.queries.mlt.MoreLikeThis; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.util.BytesRefBuilder; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrException; -import org.apache.solr.common.StringUtils; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrCore; -import org.apache.solr.legacy.LegacyNumericUtils; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.request.SolrQueryRequestBase; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.schema.SchemaField; -import org.apache.solr.search.QParser; import org.apache.solr.search.QueryParsing; -import 
org.apache.solr.search.QueryUtils; -import org.apache.solr.util.SolrPluginUtils; -public class CloudMLTQParser extends QParser { - // Pattern is thread safe -- TODO? share this with general 'fl' param - private static final Pattern splitList = Pattern.compile(",| "); +public class CloudMLTQParser extends SimpleMLTQParser { public CloudMLTQParser( String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) { @@ -69,65 +55,18 @@ public Query parse() { SolrException.ErrorCode.BAD_REQUEST, "Error completing MLT request. Could not fetch " + "document with id [" + id + "]"); } - - String[] qf = localParams.getParams("qf"); - Map boostFields = new HashMap<>(); - MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader()); - - mlt.setMinTermFreq(localParams.getInt("mintf", MoreLikeThis.DEFAULT_MIN_TERM_FREQ)); - mlt.setMinDocFreq(localParams.getInt("mindf", 0)); - mlt.setMinWordLen(localParams.getInt("minwl", MoreLikeThis.DEFAULT_MIN_WORD_LENGTH)); - mlt.setMaxWordLen(localParams.getInt("maxwl", MoreLikeThis.DEFAULT_MAX_WORD_LENGTH)); - mlt.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThis.DEFAULT_MAX_QUERY_TERMS)); - mlt.setMaxNumTokensParsed( - localParams.getInt("maxntp", MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED)); - mlt.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThis.DEFAULT_MAX_DOC_FREQ)); - - Boolean boost = localParams.getBool("boost", MoreLikeThis.DEFAULT_BOOST); - mlt.setBoost(boost); - - mlt.setAnalyzer(req.getSchema().getIndexAnalyzer()); - - Map> filteredDocument = new HashMap<>(); - String[] fieldNames; - - if (qf != null) { - ArrayList fields = new ArrayList<>(); - for (String fieldName : qf) { - if (!StringUtils.isEmpty(fieldName)) { - String[] strings = splitList.split(fieldName); - for (String string : strings) { - if (!StringUtils.isEmpty(string)) { - fields.add(string); - } - } - } - } - // Parse field names and boosts from the fields - boostFields = SolrPluginUtils.parseFieldBoosts(fields.toArray(new 
String[0])); - fieldNames = boostFields.keySet().toArray(new String[0]); - } else { - ArrayList fields = new ArrayList<>(); - for (String field : doc.getFieldNames()) { - // Only use fields that are stored and have an explicit analyzer. - // This makes sense as the query uses tf/idf/.. for query construction. - // We might want to relook and change this in the future though. - SchemaField f = req.getSchema().getFieldOrNull(field); - if (f != null && f.stored() && f.getType().isExplicitAnalyzer()) { - fields.add(field); - } - } - fieldNames = fields.toArray(new String[0]); + try { + final Query docIdQuery = createIdQuery(req.getSchema().getUniqueKeyField().getName(), id); + return parseMLTQuery(() -> getFieldsFromDoc(doc), (mlt) -> likeDoc(mlt, doc), docIdQuery); + } catch (IOException e) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Bad Request", e); } + } - if (fieldNames.length < 1) { - throw new SolrException( - SolrException.ErrorCode.BAD_REQUEST, - "MoreLikeThis requires at least one similarity field: qf"); - } + protected Query likeDoc(MoreLikeThis moreLikeThis, SolrDocument doc) throws IOException { + Map> filteredDocument = new HashMap<>(); - mlt.setFieldNames(fieldNames); - for (String field : fieldNames) { + for (String field : moreLikeThis.getFieldNames()) { Collection fieldValues = doc.getFieldValues(field); if (fieldValues != null) { Collection values = new ArrayList<>(); @@ -141,45 +80,21 @@ public Query parse() { filteredDocument.put(field, values); } } + return moreLikeThis.like(filteredDocument); + } - try { - Query rawMLTQuery = mlt.like(filteredDocument); - BooleanQuery boostedMLTQuery = (BooleanQuery) rawMLTQuery; - - if (boost && boostFields.size() > 0) { - BooleanQuery.Builder newQ = new BooleanQuery.Builder(); - newQ.setMinimumNumberShouldMatch(boostedMLTQuery.getMinimumNumberShouldMatch()); - - for (BooleanClause clause : boostedMLTQuery) { - Query q = clause.getQuery(); - float originalBoost = 1f; - if (q instanceof 
BoostQuery) { - BoostQuery bq = (BoostQuery) q; - q = bq.getQuery(); - originalBoost = bq.getBoost(); - } - Float fieldBoost = boostFields.get(((TermQuery) q).getTerm().field()); - q = - ((fieldBoost != null) - ? new BoostQuery(q, fieldBoost * originalBoost) - : clause.getQuery()); - newQ.add(q, clause.getOccur()); - } - - boostedMLTQuery = QueryUtils.build(newQ, this); + protected String[] getFieldsFromDoc(SolrDocument doc) { + ArrayList fields = new ArrayList<>(); + for (String field : doc.getFieldNames()) { + // Only use fields that are stored and have an explicit analyzer. + // This makes sense as the query uses tf/idf/.. for query construction. + // We might want to relook and change this in the future though. + SchemaField f = req.getSchema().getFieldOrNull(field); + if (f != null && f.stored() && f.getType().isExplicitAnalyzer()) { + fields.add(field); } - - // exclude current document from results - BooleanQuery.Builder realMLTQuery = new BooleanQuery.Builder(); - realMLTQuery.add(boostedMLTQuery, BooleanClause.Occur.MUST); - realMLTQuery.add( - createIdQuery(req.getSchema().getUniqueKeyField().getName(), id), - BooleanClause.Occur.MUST_NOT); - - return realMLTQuery.build(); - } catch (IOException e) { - throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Bad Request", e); } + return fields.toArray(new String[0]); } private SolrDocument getDocument(String id) { @@ -195,18 +110,4 @@ private SolrDocument getDocument(String id) { return (SolrDocument) response.get("doc"); } - - private Query createIdQuery(String defaultField, String uniqueValue) { - return new TermQuery( - req.getSchema().getField(defaultField).getType().getNumberType() != null - ? 
createNumericTerm(defaultField, uniqueValue) - : new Term(defaultField, uniqueValue)); - } - - private Term createNumericTerm(String field, String uniqueValue) { - BytesRefBuilder bytesRefBuilder = new BytesRefBuilder(); - bytesRefBuilder.grow(LegacyNumericUtils.BUF_SIZE_INT); - LegacyNumericUtils.intToPrefixCoded(Integer.parseInt(uniqueValue), 0, bytesRefBuilder); - return new Term(field, bytesRefBuilder.toBytesRef()); - } } diff --git a/solr/core/src/java/org/apache/solr/search/mlt/MLTContentQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/mlt/MLTContentQParserPlugin.java new file mode 100644 index 00000000000..5c79f13d421 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/mlt/MLTContentQParserPlugin.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.search.mlt; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import org.apache.lucene.queries.mlt.MoreLikeThis; +import org.apache.lucene.search.Query; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.search.QParser; +import org.apache.solr.search.QParserPlugin; +import org.apache.solr.search.QueryParsing; +import org.apache.solr.search.SyntaxError; + +public class MLTContentQParserPlugin extends QParserPlugin { + public static final String NAME = "mlt_content"; + + @Override + public QParser createParser( + String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) { + return new AbstractMLTQParser(qstr, localParams, params, req) { + @Override + public Query parse() throws SyntaxError { + String content = localParams.get(QueryParsing.V); + try { + return parseMLTQuery(this::getFieldsFromSchema, (mlt) -> likeContent(mlt, content)); + } catch (IOException e) { + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, "Error completing MLT request" + e.getMessage()); + } + } + }; + } + + protected Query likeContent(MoreLikeThis moreLikeThis, String content) throws IOException { + final String[] fieldNames = moreLikeThis.getFieldNames(); + if (fieldNames.length == 1) { + return moreLikeThis.like(fieldNames[0], new StringReader(content)); + } else { + Collection streamValue = Collections.singleton(content); + Map> multifieldDoc = new HashMap<>(fieldNames.length); + for (String field : fieldNames) { + multifieldDoc.put(field, streamValue); + } + return moreLikeThis.like(multifieldDoc); + } + } +} diff --git a/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java b/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java index 
c9aba360dce..083211f144a 100644 --- a/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java +++ b/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java @@ -17,35 +17,20 @@ package org.apache.solr.search.mlt; import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.regex.Pattern; import org.apache.lucene.index.Term; -import org.apache.lucene.queries.mlt.MoreLikeThis; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.util.BytesRefBuilder; import org.apache.solr.common.SolrException; -import org.apache.solr.common.StringUtils; import org.apache.solr.common.params.SolrParams; import org.apache.solr.legacy.LegacyNumericUtils; import org.apache.solr.request.SolrQueryRequest; -import org.apache.solr.schema.SchemaField; -import org.apache.solr.search.QParser; import org.apache.solr.search.QueryParsing; -import org.apache.solr.search.QueryUtils; import org.apache.solr.search.SolrIndexSearcher; -import org.apache.solr.util.SolrPluginUtils; -public class SimpleMLTQParser extends QParser { - // Pattern is thread safe -- TODO? 
share this with general 'fl' param - private static final Pattern splitList = Pattern.compile(",| "); +public class SimpleMLTQParser extends AbstractMLTQParser { public SimpleMLTQParser( String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) { @@ -55,13 +40,10 @@ public SimpleMLTQParser( @Override public Query parse() { - String defaultField = req.getSchema().getUniqueKeyField().getName(); String uniqueValue = localParams.get(QueryParsing.V); - String[] qf = localParams.getParams("qf"); SolrIndexSearcher searcher = req.getSearcher(); - Query docIdQuery = createIdQuery(defaultField, uniqueValue); - Map boostFields = new HashMap<>(); + Query docIdQuery = createIdQuery(req.getSchema().getUniqueKeyField().getName(), uniqueValue); try { TopDocs td = searcher.search(docIdQuery, 2); @@ -73,93 +55,17 @@ public Query parse() { + uniqueValue + "]"); ScoreDoc[] scoreDocs = td.scoreDocs; - MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader()); - - mlt.setMinTermFreq(localParams.getInt("mintf", MoreLikeThis.DEFAULT_MIN_TERM_FREQ)); - mlt.setMinDocFreq(localParams.getInt("mindf", MoreLikeThis.DEFAULT_MIN_DOC_FREQ)); - mlt.setMinWordLen(localParams.getInt("minwl", MoreLikeThis.DEFAULT_MIN_WORD_LENGTH)); - mlt.setMaxWordLen(localParams.getInt("maxwl", MoreLikeThis.DEFAULT_MAX_WORD_LENGTH)); - mlt.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThis.DEFAULT_MAX_QUERY_TERMS)); - mlt.setMaxNumTokensParsed( - localParams.getInt("maxntp", MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED)); - mlt.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThis.DEFAULT_MAX_DOC_FREQ)); - Boolean boost = localParams.getBool("boost", false); - mlt.setBoost(boost); - - String[] fieldNames; - - if (qf != null) { - ArrayList fields = new ArrayList<>(); - for (String fieldName : qf) { - if (!StringUtils.isEmpty(fieldName)) { - String[] strings = splitList.split(fieldName); - for (String string : strings) { - if (!StringUtils.isEmpty(string)) { - fields.add(string); 
- } - } - } - } - // Parse field names and boosts from the fields - boostFields = SolrPluginUtils.parseFieldBoosts(fields.toArray(new String[0])); - fieldNames = boostFields.keySet().toArray(new String[0]); - } else { - Map fieldDefinitions = req.getSearcher().getSchema().getFields(); - ArrayList fields = new ArrayList<>(); - for (Map.Entry entry : fieldDefinitions.entrySet()) { - if (entry.getValue().indexed() && entry.getValue().stored()) - if (entry.getValue().getType().getNumberType() == null) fields.add(entry.getKey()); - } - fieldNames = fields.toArray(new String[0]); - } - if (fieldNames.length < 1) { - throw new SolrException( - SolrException.ErrorCode.BAD_REQUEST, - "MoreLikeThis requires at least one similarity field: qf"); - } - - mlt.setFieldNames(fieldNames); - mlt.setAnalyzer(req.getSchema().getIndexAnalyzer()); - - Query rawMLTQuery = mlt.like(scoreDocs[0].doc); - BooleanQuery boostedMLTQuery = (BooleanQuery) rawMLTQuery; - - if (boost && boostFields.size() > 0) { - BooleanQuery.Builder newQ = new BooleanQuery.Builder(); - newQ.setMinimumNumberShouldMatch(boostedMLTQuery.getMinimumNumberShouldMatch()); - - for (BooleanClause clause : boostedMLTQuery) { - Query q = clause.getQuery(); - float originalBoost = 1f; - if (q instanceof BoostQuery) { - BoostQuery bq = (BoostQuery) q; - q = bq.getQuery(); - originalBoost = bq.getBoost(); - } - Float fieldBoost = boostFields.get(((TermQuery) q).getTerm().field()); - q = - ((fieldBoost != null) - ? 
new BoostQuery(q, fieldBoost * originalBoost) - : clause.getQuery()); - newQ.add(q, clause.getOccur()); - } - - boostedMLTQuery = QueryUtils.build(newQ, this); - } - - // exclude current document from results - BooleanQuery.Builder realMLTQuery = new BooleanQuery.Builder(); - realMLTQuery.add(boostedMLTQuery, BooleanClause.Occur.MUST); - realMLTQuery.add(docIdQuery, BooleanClause.Occur.MUST_NOT); - - return realMLTQuery.build(); + return parseMLTQuery( + this::getFieldsFromSchema, + moreLikeThis -> moreLikeThis.like(scoreDocs[0].doc), + docIdQuery); } catch (IOException e) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Error completing MLT request" + e.getMessage()); } } - private Query createIdQuery(String defaultField, String uniqueValue) { + protected Query createIdQuery(String defaultField, String uniqueValue) { return new TermQuery( req.getSchema().getField(defaultField).getType().getNumberType() != null ? createNumericTerm(defaultField, uniqueValue) diff --git a/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java b/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java index 9aafec5a013..c38d5f45094 100644 --- a/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java +++ b/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java @@ -1294,6 +1294,20 @@ public void testQueryMLT() throws Exception { } } + public void testQueryMLTContent() throws Exception { + assertU(adoc("id", "1", "lowerfilt", "sample data")); + assertU(commit()); + try { + assertQueryEquals( + "mlt", + "{!mlt_content qf=lowerfilt}sample data", + "{!mlt_content qf=lowerfilt v='sample data'}"); + } finally { + delQ("*:*"); + assertU(commit()); + } + } + public void testQueryKNN() throws Exception { SolrInputDocument doc = new SolrInputDocument(); doc.addField("id", "0"); diff --git a/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQContentParserTest.java 
b/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQContentParserTest.java new file mode 100644 index 00000000000..ec8dcae907e --- /dev/null +++ b/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQContentParserTest.java @@ -0,0 +1,297 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.search.mlt; + +import java.util.ArrayList; +import java.util.Arrays; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.cloud.SolrCloudTestCase; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class CloudMLTQContentParserTest extends SolrCloudTestCase { + + private final String seventeenth = + "The quote red fox jumped moon over the lazy brown dogs moon." + + " Of course moon. 
Foxes and moon come back to the foxes and moon"; + + @Before + public void setupCluster() throws Exception { + configureCluster(2).addConfig("conf", configset("cloud-dynamic")).configure(); + + CloudMLTQParserTest.indexDocs(); + } + + @After + public void cleanCluster() throws Exception { + if (null != cluster) { + cluster.shutdown(); + } + } + + public static final String COLLECTION = "mlt-collection"; + + @Test + public void testMLTQParser() throws Exception { + + QueryResponse queryResponse = + cluster + .getSolrClient() + .query( + COLLECTION, + new SolrQuery( + "q", "{!mlt_content qf=lowerfilt_u mindf=0}" + seventeenth, "fq", "-id:17") + .setShowDebugInfo(true)); + SolrDocumentList solrDocuments = queryResponse.getResults(); + int[] expectedIds = new int[] {7, 9, 13, 14, 15, 16, 20, 22, 24, 32}; + int[] actualIds = new int[10]; + int i = 0; + for (SolrDocument solrDocument : solrDocuments) { + actualIds[i++] = Integer.parseInt(String.valueOf(solrDocument.getFieldValue("id"))); + } + + Arrays.sort(actualIds); + Arrays.sort(expectedIds); + assertArrayEquals(expectedIds, actualIds); + } + + @Test + public void testBoost() throws Exception { + + QueryResponse queryResponse = + cluster + .getSolrClient() + .query( + COLLECTION, + new SolrQuery( + "q", + "{!mlt_content qf=lowerfilt_u boost=true mindf=0}" + seventeenth, + "fq", + "-id:17")); + SolrDocumentList solrDocuments = queryResponse.getResults(); + int[] expectedIds = new int[] {7, 9, 13, 14, 15, 16, 20, 22, 24, 32}; + int[] actualIds = new int[solrDocuments.size()]; + int i = 0; + for (SolrDocument solrDocument : solrDocuments) { + actualIds[i++] = Integer.parseInt(String.valueOf(solrDocument.getFieldValue("id"))); + } + + Arrays.sort(actualIds); + Arrays.sort(expectedIds); + assertArrayEquals(expectedIds, actualIds); + + String thirtineenth = "The quote red fox jumped over the lazy brown dogs." 
+ "red green yellow"; + queryResponse = + cluster + .getSolrClient() + .query( + COLLECTION, + new SolrQuery( + "q", + "{!mlt_content qf=lowerfilt_u^10,lowerfilt1_u^1000 boost=false mintf=0 mindf=0}" + + thirtineenth, + "fq", + "-id:30")); + solrDocuments = queryResponse.getResults(); + expectedIds = new int[] {31, 18, 23, 13, 14, 20, 22, 32, 19, 21}; + actualIds = new int[solrDocuments.size()]; + i = 0; + for (SolrDocument solrDocument : solrDocuments) { + actualIds[i++] = Integer.parseInt(String.valueOf(solrDocument.getFieldValue("id"))); + } + + Arrays.sort(actualIds); + Arrays.sort(expectedIds); + assertArrayEquals(expectedIds, actualIds); + + queryResponse = + cluster + .getSolrClient() + .query( + COLLECTION, + new SolrQuery( + "q", + "{!mlt_content qf=lowerfilt_u^10,lowerfilt1_u^1000 boost=true mintf=0 mindf=0}" + + thirtineenth, + "fq", + "-id:30")); + solrDocuments = queryResponse.getResults(); + expectedIds = new int[] {29, 31, 32, 18, 23, 13, 14, 20, 22, 19}; + actualIds = new int[solrDocuments.size()]; + i = 0; + for (SolrDocument solrDocument : solrDocuments) { + actualIds[i++] = Integer.parseInt(String.valueOf(solrDocument.getFieldValue("id"))); + } + + Arrays.sort(actualIds); + Arrays.sort(expectedIds); + assertArrayEquals(expectedIds, actualIds); + } + + @Test + @SuppressWarnings({"unchecked"}) + public void testMinDF() throws Exception { + + QueryResponse queryResponse = + cluster + .getSolrClient() + .query( + COLLECTION, + new SolrQuery( + "q", + "{!mlt_content qf=lowerfilt_u mindf=0 mintf=1}" + "bmw usa", + "fq", + "-id:3") + .setShowDebugInfo(true)); + SolrDocumentList solrDocuments = queryResponse.getResults(); + int[] expectedIds = new int[] {29, 27, 26, 28}; + int[] actualIds = new int[solrDocuments.size()]; + int i = 0; + for (SolrDocument solrDocument : solrDocuments) { + actualIds[i++] = Integer.parseInt(String.valueOf(solrDocument.getFieldValue("id"))); + } + + Arrays.sort(actualIds); + Arrays.sort(expectedIds); + 
assertArrayEquals(expectedIds, actualIds); + + String[] expectedQueryStrings = + new String[] {"lowerfilt_u:bmw lowerfilt_u:usa", "lowerfilt_u:usa lowerfilt_u:bmw"}; + + String[] actualParsedQueries; + if (queryResponse.getDebugMap().get("parsedquery") instanceof String) { + String parsedQueryString = (String) queryResponse.getDebugMap().get("parsedquery"); + assertTrue( + parsedQueryString.equals(expectedQueryStrings[0]) + || parsedQueryString.equals(expectedQueryStrings[1])); + } else { + actualParsedQueries = + ((ArrayList) queryResponse.getDebugMap().get("parsedquery")) + .toArray(new String[0]); + Arrays.sort(actualParsedQueries); + assertArrayEquals(expectedQueryStrings, actualParsedQueries); + } + } + + @Test + public void testMultipleFields() throws Exception { + + QueryResponse queryResponse = + cluster + .getSolrClient() + .query( + COLLECTION, + new SolrQuery( + "q", + "{!mlt_content qf=lowerfilt_u,lowerfilt1_u mindf=0 mintf=1}" + "bmw usa 328i", + "fq", + "-id:26")); + SolrDocumentList solrDocuments = queryResponse.getResults(); + int[] expectedIds = new int[] {3, 29, 27, 28}; + int[] actualIds = new int[solrDocuments.size()]; + int i = 0; + for (SolrDocument solrDocument : solrDocuments) { + actualIds[i++] = Integer.parseInt(String.valueOf(solrDocument.getFieldValue("id"))); + } + + Arrays.sort(actualIds); + Arrays.sort(expectedIds); + assertArrayEquals(expectedIds, actualIds); + } + + @Test + public void testHighDFValue() throws Exception { + + // Test out a high value of df and make sure nothing matches. 
+ QueryResponse queryResponse = + cluster + .getSolrClient() + .query( + COLLECTION, + new SolrQuery("q", "{!mlt_content qf=lowerfilt_u mindf=20 mintf=1}" + "bmw usa")); + SolrDocumentList solrDocuments = queryResponse.getResults(); + assertEquals( + "Expected to match 0 documents with a mindf of 20 but found more", solrDocuments.size(), 0); + } + + @Test + public void testHighWLValue() throws Exception { + + // Test out a high value of wl and make sure nothing matches. + QueryResponse queryResponse = + cluster + .getSolrClient() + .query( + COLLECTION, + new SolrQuery("q", "{!mlt_content qf=lowerfilt_u minwl=4 mintf=1}" + "bmw usa")); + SolrDocumentList solrDocuments = queryResponse.getResults(); + assertEquals( + "Expected to match 0 documents with a minwl of 4 but found more", solrDocuments.size(), 0); + } + + @Test + public void testLowMinWLValue() throws Exception { + + // Test out a low enough value of minwl and make sure we get the expected matches. + QueryResponse queryResponse = + cluster + .getSolrClient() + .query( + COLLECTION, + new SolrQuery( + "q", + "{!mlt_content qf=lowerfilt_u minwl=3 mintf=1 mindf=0}" + "bmw usa", + "fq", + "-id:3")); + SolrDocumentList solrDocuments = queryResponse.getResults(); + assertEquals( + "Expected to match 4 documents with a minwl of 3 but found more", 4, solrDocuments.size()); + } + + // There's a problem with this feature: {!mlt} picks only fields from the doc, + // but here I have to specify the field explicitly; otherwise it picks up all fields from the + // schema and fails on analyzing the UUID field. + @Test + public void testUnstoredAndUnanalyzedFieldsAreIgnored() throws Exception { + QueryResponse queryResponse = + cluster + .getSolrClient() + .query( + COLLECTION, + new SolrQuery( + "q", + "{!mlt_content qf=lowerfilt_u}" + + "The quote red fox jumped over the lazy brown dogs.", + "fq", + "-id:20")); + SolrDocumentList solrDocuments = queryResponse.getResults(); + int[] actualIds = new int[solrDocuments.size()]; + int[]
expectedIds = new int[] {13, 14, 15, 16, 22, 24, 32, 18, 19, 21}; + int i = 0; + for (SolrDocument solrDocument : solrDocuments) { + actualIds[i++] = Integer.parseInt(String.valueOf(solrDocument.getFieldValue("id"))); + } + + Arrays.sort(actualIds); + Arrays.sort(expectedIds); + assertArrayEquals(expectedIds, actualIds); + } +} diff --git a/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQParserTest.java b/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQParserTest.java index 91975464372..5912465f883 100644 --- a/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQParserTest.java +++ b/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQParserTest.java @@ -37,6 +37,11 @@ public class CloudMLTQParserTest extends SolrCloudTestCase { public void setupCluster() throws Exception { configureCluster(2).addConfig("conf", configset("cloud-dynamic")).configure(); + indexDocs(); + } + + static void indexDocs() + throws org.apache.solr.client.solrj.SolrServerException, java.io.IOException { final CloudSolrClient client = cluster.getSolrClient(); CollectionAdminRequest.createCollection(COLLECTION, "conf", 2, 1).process(client); @@ -132,7 +137,9 @@ public void testMLTQParser() throws Exception { QueryResponse queryResponse = cluster .getSolrClient() - .query(COLLECTION, new SolrQuery("{!mlt qf=lowerfilt_u}17").setShowDebugInfo(true)); + .query( + COLLECTION, + new SolrQuery("{!mlt qf=lowerfilt_u mindf=0}17").setShowDebugInfo(true)); SolrDocumentList solrDocuments = queryResponse.getResults(); int[] expectedIds = new int[] {7, 9, 13, 14, 15, 16, 20, 22, 24, 32}; int[] actualIds = new int[10]; @@ -152,7 +159,7 @@ public void testBoost() throws Exception { QueryResponse queryResponse = cluster .getSolrClient() - .query(COLLECTION, new SolrQuery("{!mlt qf=lowerfilt_u boost=true}17")); + .query(COLLECTION, new SolrQuery("{!mlt qf=lowerfilt_u boost=true mindf=0}17")); SolrDocumentList solrDocuments = queryResponse.getResults(); int[] expectedIds = new int[] {7, 9, 
13, 14, 15, 16, 20, 22, 24, 32}; int[] actualIds = new int[solrDocuments.size()]; @@ -182,7 +189,6 @@ public void testBoost() throws Exception { Arrays.sort(actualIds); Arrays.sort(expectedIds); - System.out.println("DEBUG ACTUAL IDS 1: " + Arrays.toString(actualIds)); assertArrayEquals(expectedIds, actualIds); queryResponse = @@ -202,9 +208,7 @@ public void testBoost() throws Exception { Arrays.sort(actualIds); Arrays.sort(expectedIds); - System.out.println("DEBUG ACTUAL IDS 2: " + Arrays.toString(actualIds)); - assertArrayEquals( - Arrays.toString(expectedIds) + " " + Arrays.toString(actualIds), expectedIds, actualIds); + assertArrayEquals(expectedIds, actualIds); } @Test @@ -227,8 +231,7 @@ public void testMinDF() throws Exception { Arrays.sort(actualIds); Arrays.sort(expectedIds); - assertArrayEquals( - Arrays.toString(expectedIds) + " " + Arrays.toString(actualIds), expectedIds, actualIds); + assertArrayEquals(expectedIds, actualIds); String[] expectedQueryStrings = new String[] { @@ -268,8 +271,7 @@ public void testMultipleFields() throws Exception { Arrays.sort(actualIds); Arrays.sort(expectedIds); - assertArrayEquals( - Arrays.toString(expectedIds) + " " + Arrays.toString(actualIds), expectedIds, actualIds); + assertArrayEquals(expectedIds, actualIds); } @Test @@ -305,7 +307,7 @@ public void testLowMinWLValue() throws Exception { QueryResponse queryResponse = cluster .getSolrClient() - .query(COLLECTION, new SolrQuery("{!mlt qf=lowerfilt_u minwl=3 mintf=1}3")); + .query(COLLECTION, new SolrQuery("{!mlt qf=lowerfilt_u minwl=3 mintf=1 mindf=0}3")); SolrDocumentList solrDocuments = queryResponse.getResults(); assertEquals( "Expected to match 4 documents with a minwl of 3 but found more", 4, solrDocuments.size()); @@ -322,25 +324,20 @@ public void testUnstoredAndUnanalyzedFieldsAreIgnored() throws Exception { int[] actualIds = new int[solrDocuments.size()]; int[] expectedIds = new int[] {13, 14, 15, 16, 22, 24, 32, 18, 19, 21}; int i = 0; - StringBuilder sb = 
new StringBuilder(); for (SolrDocument solrDocument : solrDocuments) { actualIds[i++] = Integer.parseInt(String.valueOf(solrDocument.getFieldValue("id"))); - sb.append(actualIds[i - 1]).append(", "); } - Arrays.sort(actualIds); Arrays.sort(expectedIds); assertArrayEquals(expectedIds, actualIds); } public void testInvalidSourceDocument() { - SolrException e = - expectThrows( - SolrException.class, - () -> { - cluster - .getSolrClient() - .query(COLLECTION, new SolrQuery("{!mlt qf=lowerfilt_u}999999")); - }); + expectThrows( + SolrException.class, + () -> + cluster + .getSolrClient() + .query(COLLECTION, new SolrQuery("{!mlt qf=lowerfilt_u}999999"))); } } diff --git a/solr/core/src/test/org/apache/solr/search/mlt/SimpleMLTContentQParserTest.java b/solr/core/src/test/org/apache/solr/search/mlt/SimpleMLTContentQParserTest.java new file mode 100644 index 00000000000..689ee1f38bf --- /dev/null +++ b/solr/core/src/test/org/apache/solr/search/mlt/SimpleMLTContentQParserTest.java @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.search.mlt; + +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.SolrParams; +import org.junit.BeforeClass; +import org.junit.Test; + +public class SimpleMLTContentQParserTest extends SolrTestCaseJ4 { + + @BeforeClass + public static void moreLikeThisBeforeClass() throws Exception { + initCore("solrconfig.xml", "schema.xml"); + } + + @Test + public void doTest() { + SimpleMLTQParserTest.setupDocsForMLT(); + + // for score tiebreaker, use doc ID order + final SolrParams sortParams = params("sort", "score desc, id asc"); + + final String seventeenth = + "The quote red fox jumped moon over the lazy " + + "brown dogs moon. Of course moon. Foxes and moon come back to the foxes and moon"; + assertQ( + req( + sortParams, + CommonParams.Q, + "{!mlt_content qf=lowerfilt mintf=0 mindf=0}" + seventeenth), + "//result/doc[1]/str[@name='id'][.='17']"); + assertQ( + req( + sortParams, + CommonParams.Q, + "{!mlt_content qf=lowerfilt}" + seventeenth, + "fq", + "-id:17"), + "//result/doc[1]/str[@name='id'][.='13']", + "//result/doc[2]/str[@name='id'][.='14']", + "//result/doc[3]/str[@name='id'][.='15']", + "//result/doc[4]/str[@name='id'][.='16']", + "//result/doc[5]/str[@name='id'][.='18']", + "//result/doc[6]/str[@name='id'][.='19']", + "//result/doc[7]/str[@name='id'][.='20']", + "//result/doc[8]/str[@name='id'][.='21']", + "//result/doc[9]/str[@name='id'][.='22']", + "//result/doc[10]/str[@name='id'][.='23']"); + + assertQ( + req( + sortParams, + CommonParams.Q, + "{!mlt_content qf=lowerfilt boost=true}" + seventeenth, + "fq", + "-id:17"), + "//result/doc[1]/str[@name='id'][.='13']", + "//result/doc[2]/str[@name='id'][.='14']", + "//result/doc[3]/str[@name='id'][.='15']", + "//result/doc[4]/str[@name='id'][.='16']", + "//result/doc[5]/str[@name='id'][.='18']", + "//result/doc[6]/str[@name='id'][.='19']", + "//result/doc[7]/str[@name='id'][.='20']", + 
"//result/doc[8]/str[@name='id'][.='21']", + "//result/doc[9]/str[@name='id'][.='22']", + "//result/doc[10]/str[@name='id'][.='23']"); + final String thirteenth = "The quote red fox jumped over the lazy brown dogs."; + final String thirteenth1 = "red green yellow"; + assertQ( + req( + sortParams, + CommonParams.Q, + " " /*space matterz*/ + + "{!mlt_content qf=lowerfilt boost=false mintf=0 mindf=0 v=$lowerfilt30} " + + "{!mlt_content qf=lowerfilt1^1000 boost=false mintf=0 mindf=0 v=$lowerfilt130}", + "lowerfilt30", + thirteenth, + "lowerfilt130", + thirteenth1, + "fl", + "lowerfilt,lowerfilt1,id,score", + "explainOther", + "id:31", + "indent", + "on", + "debugQuery", + "on"), + "//result/doc[1]/str[@name='id'][.='30']", + "//result/doc[2]/str[@name='id'][.='31']"); + assertQ( + req( + sortParams, + CommonParams.Q, + " " /*space matterz*/ + + "{!mlt_content qf=lowerfilt,lowerfilt1^1000 boost=false mintf=0 mindf=0 v=$lowerfilt30} ", + "lowerfilt30", + thirteenth + " " + thirteenth1), + "//result/doc[1]/str[@name='id'][.='30']", + "//result/doc[2]/str[@name='id'][.='31']"); + assertQ( + req( + sortParams, + CommonParams.Q, + "{!mlt_content qf=lowerfilt,lowerfilt1^1000 boost=false mintf=0 mindf=0}" + + thirteenth + + " " + + thirteenth1, + "fq", + "-id:30"), + "//result/doc[1]/str[@name='id'][.='31']", + "//result/doc[2]/str[@name='id'][.='13']", + "//result/doc[3]/str[@name='id'][.='14']", + "//result/doc[4]/str[@name='id'][.='18']", + "//result/doc[5]/str[@name='id'][.='20']", + "//result/doc[6]/str[@name='id'][.='22']", + "//result/doc[7]/str[@name='id'][.='23']", + "//result/doc[8]/str[@name='id'][.='32']", + "//result/doc[9]/str[@name='id'][.='15']", + "//result/doc[10]/str[@name='id'][.='16']"); + + assertQ( + req( + sortParams, + CommonParams.Q, + "{!mlt_content qf=lowerfilt,lowerfilt1^1000 boost=true mintf=0 mindf=0}" + + thirteenth + + " " + + thirteenth1, + "fq", + "-id:30"), + "//result/doc[1]/str[@name='id'][.='29']", + 
"//result/doc[2]/str[@name='id'][.='31']", + "//result/doc[3]/str[@name='id'][.='32']", + "//result/doc[4]/str[@name='id'][.='13']", + "//result/doc[5]/str[@name='id'][.='14']", + "//result/doc[6]/str[@name='id'][.='18']", + "//result/doc[7]/str[@name='id'][.='20']", + "//result/doc[8]/str[@name='id'][.='22']", + "//result/doc[9]/str[@name='id'][.='23']", + "//result/doc[10]/str[@name='id'][.='15']"); + + String s26th = "bmw usa 328i"; + assertQ( + req( + sortParams, + CommonParams.Q, + "{!mlt_content qf=lowerfilt mindf=0 mintf=1}" + s26th, + "fq", + "-id:26"), + "//result/doc[1]/str[@name='id'][.='29']", + "//result/doc[2]/str[@name='id'][.='27']", + "//result/doc[3]/str[@name='id'][.='28']"); + + assertQ( + req(CommonParams.Q, "{!mlt_content qf=lowerfilt mindf=10 mintf=1}" + s26th, "fq", "-id:26"), + "//result[@numFound='0']"); + + assertQ( + req( + CommonParams.Q, + "{!mlt_content qf=lowerfilt minwl=3 mintf=1 mindf=1}" + s26th, + "fq", + "-id:26"), + "//result[@numFound='3']"); + + assertQ( + req( + CommonParams.Q, + "{!mlt_content qf=lowerfilt minwl=4 mintf=1 mindf=1}" + s26th, + "fq", + "-id:26", + CommonParams.DEBUG, + "true"), + "//result[@numFound='0']"); + } +} diff --git a/solr/core/src/test/org/apache/solr/search/mlt/SimpleMLTQParserTest.java b/solr/core/src/test/org/apache/solr/search/mlt/SimpleMLTQParserTest.java index b8f9552a68b..15d2f297c41 100644 --- a/solr/core/src/test/org/apache/solr/search/mlt/SimpleMLTQParserTest.java +++ b/solr/core/src/test/org/apache/solr/search/mlt/SimpleMLTQParserTest.java @@ -32,81 +32,7 @@ public static void moreLikeThisBeforeClass() throws Exception { @Test public void doTest() { - String id = "id"; - String FIELD1 = "lowerfilt"; - String FIELD2 = "lowerfilt1"; - delQ("*:*"); - assertU(adoc(id, "1", FIELD1, "toyota")); - assertU(adoc(id, "2", FIELD1, "chevrolet")); - assertU(adoc(id, "3", FIELD1, "suzuki")); - assertU(adoc(id, "4", FIELD1, "ford")); - assertU(adoc(id, "5", FIELD1, "ferrari")); - assertU(adoc(id, "6", 
FIELD1, "jaguar")); - assertU( - adoc( - id, - "7", - FIELD1, - "mclaren moon or the moon and moon moon shine " - + "and the moon but moon was good foxes too")); - assertU(adoc(id, "8", FIELD1, "sonata")); - assertU( - adoc( - id, - "9", - FIELD1, - "The quick red fox jumped over the lazy big " + "and large brown dogs.")); - assertU(adoc(id, "10", FIELD1, "blue")); - assertU(adoc(id, "12", FIELD1, "glue")); - assertU(adoc(id, "13", FIELD1, "The quote red fox jumped over the lazy brown dogs.")); - assertU(adoc(id, "14", FIELD1, "The quote red fox jumped over the lazy brown dogs.")); - assertU(adoc(id, "15", FIELD1, "The fat red fox jumped over the lazy brown dogs.")); - assertU(adoc(id, "16", FIELD1, "The slim red fox jumped over the lazy brown dogs.")); - assertU( - adoc( - id, - "17", - FIELD1, - "The quote red fox jumped moon over the lazy " - + "brown dogs moon. Of course moon. Foxes and moon come back to the foxes and moon")); - assertU(adoc(id, "18", FIELD1, "The quote red fox jumped over the lazy brown dogs.")); - assertU(adoc(id, "19", FIELD1, "The hose red fox jumped over the lazy brown dogs.")); - assertU(adoc(id, "20", FIELD1, "The quote red fox jumped over the lazy brown dogs.")); - assertU(adoc(id, "21", FIELD1, "The court red fox jumped over the lazy brown dogs.")); - assertU(adoc(id, "22", FIELD1, "The quote red fox jumped over the lazy brown dogs.")); - assertU(adoc(id, "23", FIELD1, "The quote red fox jumped over the lazy brown dogs.")); - assertU(adoc(id, "24", FIELD1, "The file red fox jumped over the lazy brown dogs.")); - assertU(adoc(id, "25", FIELD1, "rod fix")); - assertU(adoc(id, "26", FIELD1, "bmw usa 328i")); - assertU(adoc(id, "27", FIELD1, "bmw usa 535i")); - assertU(adoc(id, "28", FIELD1, "bmw 750Li")); - assertU(adoc(id, "29", FIELD1, "bmw usa", FIELD2, "red green blue")); - assertU( - adoc( - id, - "30", - FIELD1, - "The quote red fox jumped over the lazy brown dogs.", - FIELD2, - "red green yellow")); - assertU( - adoc( - id, - "31", 
- FIELD1, - "The fat red fox jumped over the lazy brown dogs.", - FIELD2, - "green blue yellow")); - assertU( - adoc( - id, - "32", - FIELD1, - "The slim red fox jumped over the lazy brown dogs.", - FIELD2, - "yellow white black")); - - assertU(commit()); + setupDocsForMLT(); // for score tiebreaker, use doc ID order final SolrParams sortParams = params("sort", "score desc, id asc"); @@ -188,4 +114,82 @@ public void doTest() { CommonParams.DEBUG, "true"), "//result[@numFound='0']"); } + + public static void setupDocsForMLT() { + String id = "id"; + String FIELD1 = "lowerfilt"; + String FIELD2 = "lowerfilt1"; + delQ("*:*"); + assertU(adoc(id, "1", FIELD1, "toyota")); + assertU(adoc(id, "2", FIELD1, "chevrolet")); + assertU(adoc(id, "3", FIELD1, "suzuki")); + assertU(adoc(id, "4", FIELD1, "ford")); + assertU(adoc(id, "5", FIELD1, "ferrari")); + assertU(adoc(id, "6", FIELD1, "jaguar")); + assertU( + adoc( + id, + "7", + FIELD1, + "mclaren moon or the moon and moon moon shine " + + "and the moon but moon was good foxes too")); + assertU(adoc(id, "8", FIELD1, "sonata")); + assertU( + adoc( + id, + "9", + FIELD1, + "The quick red fox jumped over the lazy big " + "and large brown dogs.")); + assertU(adoc(id, "10", FIELD1, "blue")); + assertU(adoc(id, "12", FIELD1, "glue")); + assertU(adoc(id, "13", FIELD1, "The quote red fox jumped over the lazy brown dogs.")); + assertU(adoc(id, "14", FIELD1, "The quote red fox jumped over the lazy brown dogs.")); + assertU(adoc(id, "15", FIELD1, "The fat red fox jumped over the lazy brown dogs.")); + assertU(adoc(id, "16", FIELD1, "The slim red fox jumped over the lazy brown dogs.")); + assertU( + adoc( + id, + "17", + FIELD1, + "The quote red fox jumped moon over the lazy " + + "brown dogs moon. Of course moon. 
Foxes and moon come back to the foxes and moon")); + assertU(adoc(id, "18", FIELD1, "The quote red fox jumped over the lazy brown dogs.")); + assertU(adoc(id, "19", FIELD1, "The hose red fox jumped over the lazy brown dogs.")); + assertU(adoc(id, "20", FIELD1, "The quote red fox jumped over the lazy brown dogs.")); + assertU(adoc(id, "21", FIELD1, "The court red fox jumped over the lazy brown dogs.")); + assertU(adoc(id, "22", FIELD1, "The quote red fox jumped over the lazy brown dogs.")); + assertU(adoc(id, "23", FIELD1, "The quote red fox jumped over the lazy brown dogs.")); + assertU(adoc(id, "24", FIELD1, "The file red fox jumped over the lazy brown dogs.")); + assertU(adoc(id, "25", FIELD1, "rod fix")); + assertU(adoc(id, "26", FIELD1, "bmw usa 328i")); + assertU(adoc(id, "27", FIELD1, "bmw usa 535i")); + assertU(adoc(id, "28", FIELD1, "bmw 750Li")); + assertU(adoc(id, "29", FIELD1, "bmw usa", FIELD2, "red green blue")); + assertU( + adoc( + id, + "30", + FIELD1, + "The quote red fox jumped over the lazy brown dogs.", + FIELD2, + "red green yellow")); + assertU( + adoc( + id, + "31", + FIELD1, + "The fat red fox jumped over the lazy brown dogs.", + FIELD2, + "green blue yellow")); + assertU( + adoc( + id, + "32", + FIELD1, + "The slim red fox jumped over the lazy brown dogs.", + FIELD2, + "yellow white black")); + + assertU(commit()); + } } diff --git a/solr/solr-ref-guide/modules/query-guide/pages/morelikethis.adoc b/solr/solr-ref-guide/modules/query-guide/pages/morelikethis.adoc index 273ba202c52..775b699d97a 100644 --- a/solr/solr-ref-guide/modules/query-guide/pages/morelikethis.adoc +++ b/solr/solr-ref-guide/modules/query-guide/pages/morelikethis.adoc @@ -330,7 +330,7 @@ http://localhost:8983/solr/mlt?stream.body=electronics%20memory&mlt.fl=manu,cat& This query would pass the terms "electronics memory" to the request handler instead of using a document already in the index. 
-The response in this case would look similar to the response above that used a document already in the index. +The response in this case would look similar to the response above that used a document already in the index. Note: this does not work in SolrCloud mode; see "Query Parser for External Content" below for an alternative. === MoreLikeThis Search Component @@ -644,3 +644,8 @@ The query parser response includes only the similar documents sorted by score: "_version_":1693062911095734272}] }} ---- + +=== Query Parser for External Content + +Use `{!mlt_content}lorem ipsum` or `{!mlt_content q='lorem ipsum'}` to find documents similar to external content that is not in the index. This works in SolrCloud mode and covers what the `/mlt` handler does with content streams. Parameters and response are the same as above. The parser queries the fields passed via the `qf` parameter with the given content. When `qf` is omitted, it queries all fields in the schema, which usually fails on numeric and other specially formatted field types. If you need to query different fields with different content, combine several `{!mlt_content qf=fieldA}lorem ipsum` clauses with a `{!bool}` query or similar. +