Skip to content

Commit

Permalink
SOLR-16420: {!mlt_content} (apache#1045)
Browse files Browse the repository at this point in the history
* SOLR-16420: introducing {!mlt_content} accepting external content.
  • Loading branch information
mkhludnev authored Nov 25, 2022
1 parent a8719ba commit 1eef75e
Show file tree
Hide file tree
Showing 12 changed files with 867 additions and 320 deletions.
4 changes: 4 additions & 0 deletions solr/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ Improvements
* SOLR-11028: A v2 equivalent of the `/admin/collections?action=REPLACE` command is now available at
`POST /api/cluster/nodes/nodeName/replace`. (Joshua Ouma via Jason Gerlowski)

* SOLR-16420: Introducing `{!mlt_content}foo bar` to cover existing `/mlt` handler functionality for SolrCloud. (Mikhail Khludnev)

Optimizations
---------------------

Expand Down Expand Up @@ -102,6 +104,8 @@ Bug Fixes

* SOLR-16528: Jaegertracer module must include okhttp3 dependency (janhoy)

* SOLR-16420: The default `mindf` of `{!mlt}` in cloud mode was changed from 0 to 5 (i.e. `{!mlt mindf=5}`) to comply with the Reference Guide. (Mikhail Khludnev)

Build
---------------------
* Upgrade forbiddenapis to 3.4 (Uwe Schindler)
Expand Down
2 changes: 2 additions & 0 deletions solr/core/src/java/org/apache/solr/search/QParserPlugin.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.apache.solr.search.join.FiltersQParserPlugin;
import org.apache.solr.search.join.GraphQParserPlugin;
import org.apache.solr.search.join.HashRangeQParserPlugin;
import org.apache.solr.search.mlt.MLTContentQParserPlugin;
import org.apache.solr.search.mlt.MLTQParserPlugin;
import org.apache.solr.search.neural.KnnQParserPlugin;
import org.apache.solr.util.plugin.NamedListInitializedPlugin;
Expand Down Expand Up @@ -73,6 +74,7 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin, SolrI
map.put(ReRankQParserPlugin.NAME, new ReRankQParserPlugin());
map.put(ExportQParserPlugin.NAME, new ExportQParserPlugin());
map.put(MLTQParserPlugin.NAME, new MLTQParserPlugin());
map.put(MLTContentQParserPlugin.NAME, new MLTContentQParserPlugin());
map.put(HashQParserPlugin.NAME, new HashQParserPlugin());
map.put(GraphQParserPlugin.NAME, new GraphQParserPlugin());
map.put(XmlQParserPlugin.NAME, new XmlQParserPlugin());
Expand Down
157 changes: 157 additions & 0 deletions solr/core/src/java/org/apache/solr/search/mlt/AbstractMLTQParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.mlt;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.function.Supplier;
import java.util.regex.Pattern;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.StringUtils;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QueryUtils;
import org.apache.solr.util.SolrPluginUtils;

abstract class AbstractMLTQParser extends QParser {
  // Pattern is thread safe -- TODO? share this with general 'fl' param
  private static final Pattern splitList = Pattern.compile(",| ");

  /**
   * Collects the names of the schema fields that are both indexed and stored and whose field type
   * reports no number type.
   *
   * @return the matching field names, in the iteration order of the schema's field map
   */
  protected String[] getFieldsFromSchema() {
    return req.getSearcher().getSchema().getFields().entrySet().stream()
        .filter(e -> e.getValue().indexed() && e.getValue().stored())
        .filter(e -> e.getValue().getType().getNumberType() == null)
        .map(Map.Entry::getKey)
        .toArray(String[]::new);
  }

  /**
   * Constructor for the QParser
   *
   * @param qstr The part of the query string specific to this parser
   * @param localParams The set of parameters that are specific to this QParser. See
   *     https://solr.apache.org/guide/solr/latest/query-guide/local-params.html
   * @param params The rest of the {@link SolrParams}
   * @param req The original {@link SolrQueryRequest}.
   */
  AbstractMLTQParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
    super(qstr, localParams, params, req);
  }

  /**
   * Wraps the given MLT query so that documents matching {@code docIdQuery} are excluded.
   *
   * @param boostedMLTQuery the query added as a {@code MUST} clause
   * @param docIdQuery the query added as a {@code MUST_NOT} clause
   * @return a new two-clause boolean query
   */
  public BooleanQuery exclude(BooleanQuery boostedMLTQuery, Query docIdQuery) {
    return new BooleanQuery.Builder()
        .add(boostedMLTQuery, BooleanClause.Occur.MUST)
        .add(docIdQuery, BooleanClause.Occur.MUST_NOT)
        .build();
  }

  /** Callback that turns a fully configured {@link MoreLikeThis} into a query. */
  @FunctionalInterface
  protected interface MLTInvoker {
    Query invoke(MoreLikeThis mlt) throws IOException;
  }

  /**
   * Same as {@link #parseMLTQuery(Supplier, MLTInvoker)}, with the result passed through {@link
   * #exclude(BooleanQuery, Query)} together with {@code docIdQuery}.
   *
   * @param fieldsFallback supplies the similarity fields when no {@code qf} local param is given
   * @param invoker produces the raw query from the configured {@link MoreLikeThis}
   * @param docIdQuery query identifying the document(s) to leave out of the results
   * @throws IOException if thrown by {@code invoker}
   */
  protected BooleanQuery parseMLTQuery(
      Supplier<String[]> fieldsFallback, MLTInvoker invoker, Query docIdQuery) throws IOException {
    final BooleanQuery mltQuery = parseMLTQuery(fieldsFallback, invoker);
    return exclude(mltQuery, docIdQuery);
  }

  /**
   * Configures a {@link MoreLikeThis} from the local params ({@code mintf}, {@code mindf}, {@code
   * minwl}, {@code maxwl}, {@code maxqt}, {@code maxntp}, {@code maxdf}, {@code boost}, {@code
   * qf}), hands it to {@code invoker} and, when boosting is enabled and {@code qf} yielded field
   * boosts, re-applies those per-field boosts to the generated clauses.
   *
   * @param fieldsFallback supplies the similarity fields when no {@code qf} local param is given
   * @param invoker produces the raw query from the configured {@link MoreLikeThis}; the result is
   *     cast to {@link BooleanQuery}
   * @return the raw query, or a rebuilt copy carrying the {@code qf} field boosts
   * @throws SolrException with {@code BAD_REQUEST} when no similarity field could be determined
   * @throws IOException if thrown by {@code invoker}
   */
  protected BooleanQuery parseMLTQuery(Supplier<String[]> fieldsFallback, MLTInvoker invoker)
      throws IOException {
    final Map<String, Float> boostFields = new HashMap<>();
    final MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader());

    mlt.setMinTermFreq(localParams.getInt("mintf", MoreLikeThis.DEFAULT_MIN_TERM_FREQ));
    // TODO def mindf was 0 for cloud, 5 for standalone
    mlt.setMinDocFreq(localParams.getInt("mindf", MoreLikeThis.DEFAULT_MIN_DOC_FREQ));
    mlt.setMinWordLen(localParams.getInt("minwl", MoreLikeThis.DEFAULT_MIN_WORD_LENGTH));
    mlt.setMaxWordLen(localParams.getInt("maxwl", MoreLikeThis.DEFAULT_MAX_WORD_LENGTH));
    mlt.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThis.DEFAULT_MAX_QUERY_TERMS));
    mlt.setMaxNumTokensParsed(
        localParams.getInt("maxntp", MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED));
    mlt.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThis.DEFAULT_MAX_DOC_FREQ));

    final boolean boost = localParams.getBool("boost", MoreLikeThis.DEFAULT_BOOST);
    mlt.setBoost(boost);

    mlt.setAnalyzer(req.getSchema().getIndexAnalyzer());

    final String[] qf = localParams.getParams("qf");
    final String[] fieldNames;
    if (qf == null) {
      fieldNames = fieldsFallback.get();
    } else {
      // Parse field names and boosts from the fields
      boostFields.putAll(SolrPluginUtils.parseFieldBoosts(splitFieldSpecs(qf)));
      fieldNames = boostFields.keySet().toArray(new String[0]);
    }
    if (fieldNames.length < 1) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "MoreLikeThis requires at least one similarity field: qf");
    }
    mlt.setFieldNames(fieldNames);

    final BooleanQuery rawMLTQuery = (BooleanQuery) invoker.invoke(mlt);
    if (!boost || boostFields.isEmpty()) {
      return rawMLTQuery;
    }

    final BooleanQuery.Builder rebuilt = new BooleanQuery.Builder();
    rebuilt.setMinimumNumberShouldMatch(rawMLTQuery.getMinimumNumberShouldMatch());
    for (BooleanClause clause : rawMLTQuery) {
      final Query original = clause.getQuery();
      final Query unwrapped;
      final float innerBoost;
      if (original instanceof BoostQuery) {
        final BoostQuery boosted = (BoostQuery) original;
        unwrapped = boosted.getQuery();
        innerBoost = boosted.getBoost();
      } else {
        unwrapped = original;
        innerBoost = 1f;
      }

      // Clauses on fields without a qf boost are carried over untouched.
      final Float fieldBoost = boostFields.get(((TermQuery) unwrapped).getTerm().field());
      final Query replacement;
      if (fieldBoost == null) {
        replacement = original;
      } else {
        replacement = new BoostQuery(unwrapped, fieldBoost * innerBoost);
      }
      rebuilt.add(replacement, clause.getOccur());
    }
    return QueryUtils.build(rebuilt, this);
  }

  /** Splits every non-empty {@code qf} value on commas and spaces, dropping empty pieces. */
  private static String[] splitFieldSpecs(String[] qf) {
    final ArrayList<String> specs = new ArrayList<>();
    for (String rawValue : qf) {
      if (StringUtils.isEmpty(rawValue)) {
        continue;
      }
      for (String piece : splitList.split(rawValue)) {
        if (!StringUtils.isEmpty(piece)) {
          specs.add(piece);
        }
      }
    }
    return specs.toArray(new String[0]);
  }
}
143 changes: 22 additions & 121 deletions solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,36 +23,22 @@
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.StringUtils;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.legacy.LegacyNumericUtils;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QueryParsing;
import org.apache.solr.search.QueryUtils;
import org.apache.solr.util.SolrPluginUtils;

public class CloudMLTQParser extends QParser {
// Pattern is thread safe -- TODO? share this with general 'fl' param
private static final Pattern splitList = Pattern.compile(",| ");
public class CloudMLTQParser extends SimpleMLTQParser {

public CloudMLTQParser(
String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
Expand All @@ -69,65 +55,18 @@ public Query parse() {
SolrException.ErrorCode.BAD_REQUEST,
"Error completing MLT request. Could not fetch " + "document with id [" + id + "]");
}

String[] qf = localParams.getParams("qf");
Map<String, Float> boostFields = new HashMap<>();
MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader());

mlt.setMinTermFreq(localParams.getInt("mintf", MoreLikeThis.DEFAULT_MIN_TERM_FREQ));
mlt.setMinDocFreq(localParams.getInt("mindf", 0));
mlt.setMinWordLen(localParams.getInt("minwl", MoreLikeThis.DEFAULT_MIN_WORD_LENGTH));
mlt.setMaxWordLen(localParams.getInt("maxwl", MoreLikeThis.DEFAULT_MAX_WORD_LENGTH));
mlt.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThis.DEFAULT_MAX_QUERY_TERMS));
mlt.setMaxNumTokensParsed(
localParams.getInt("maxntp", MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED));
mlt.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThis.DEFAULT_MAX_DOC_FREQ));

Boolean boost = localParams.getBool("boost", MoreLikeThis.DEFAULT_BOOST);
mlt.setBoost(boost);

mlt.setAnalyzer(req.getSchema().getIndexAnalyzer());

Map<String, Collection<Object>> filteredDocument = new HashMap<>();
String[] fieldNames;

if (qf != null) {
ArrayList<String> fields = new ArrayList<>();
for (String fieldName : qf) {
if (!StringUtils.isEmpty(fieldName)) {
String[] strings = splitList.split(fieldName);
for (String string : strings) {
if (!StringUtils.isEmpty(string)) {
fields.add(string);
}
}
}
}
// Parse field names and boosts from the fields
boostFields = SolrPluginUtils.parseFieldBoosts(fields.toArray(new String[0]));
fieldNames = boostFields.keySet().toArray(new String[0]);
} else {
ArrayList<String> fields = new ArrayList<>();
for (String field : doc.getFieldNames()) {
// Only use fields that are stored and have an explicit analyzer.
// This makes sense as the query uses tf/idf/.. for query construction.
// We might want to relook and change this in the future though.
SchemaField f = req.getSchema().getFieldOrNull(field);
if (f != null && f.stored() && f.getType().isExplicitAnalyzer()) {
fields.add(field);
}
}
fieldNames = fields.toArray(new String[0]);
try {
final Query docIdQuery = createIdQuery(req.getSchema().getUniqueKeyField().getName(), id);
return parseMLTQuery(() -> getFieldsFromDoc(doc), (mlt) -> likeDoc(mlt, doc), docIdQuery);
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Bad Request", e);
}
}

if (fieldNames.length < 1) {
throw new SolrException(
SolrException.ErrorCode.BAD_REQUEST,
"MoreLikeThis requires at least one similarity field: qf");
}
protected Query likeDoc(MoreLikeThis moreLikeThis, SolrDocument doc) throws IOException {
Map<String, Collection<Object>> filteredDocument = new HashMap<>();

mlt.setFieldNames(fieldNames);
for (String field : fieldNames) {
for (String field : moreLikeThis.getFieldNames()) {
Collection<Object> fieldValues = doc.getFieldValues(field);
if (fieldValues != null) {
Collection<Object> values = new ArrayList<>();
Expand All @@ -141,45 +80,21 @@ public Query parse() {
filteredDocument.put(field, values);
}
}
return moreLikeThis.like(filteredDocument);
}

try {
Query rawMLTQuery = mlt.like(filteredDocument);
BooleanQuery boostedMLTQuery = (BooleanQuery) rawMLTQuery;

if (boost && boostFields.size() > 0) {
BooleanQuery.Builder newQ = new BooleanQuery.Builder();
newQ.setMinimumNumberShouldMatch(boostedMLTQuery.getMinimumNumberShouldMatch());

for (BooleanClause clause : boostedMLTQuery) {
Query q = clause.getQuery();
float originalBoost = 1f;
if (q instanceof BoostQuery) {
BoostQuery bq = (BoostQuery) q;
q = bq.getQuery();
originalBoost = bq.getBoost();
}
Float fieldBoost = boostFields.get(((TermQuery) q).getTerm().field());
q =
((fieldBoost != null)
? new BoostQuery(q, fieldBoost * originalBoost)
: clause.getQuery());
newQ.add(q, clause.getOccur());
}

boostedMLTQuery = QueryUtils.build(newQ, this);
protected String[] getFieldsFromDoc(SolrDocument doc) {
ArrayList<String> fields = new ArrayList<>();
for (String field : doc.getFieldNames()) {
// Only use fields that are stored and have an explicit analyzer.
// This makes sense as the query uses tf/idf/.. for query construction.
// We might want to relook and change this in the future though.
SchemaField f = req.getSchema().getFieldOrNull(field);
if (f != null && f.stored() && f.getType().isExplicitAnalyzer()) {
fields.add(field);
}

// exclude current document from results
BooleanQuery.Builder realMLTQuery = new BooleanQuery.Builder();
realMLTQuery.add(boostedMLTQuery, BooleanClause.Occur.MUST);
realMLTQuery.add(
createIdQuery(req.getSchema().getUniqueKeyField().getName(), id),
BooleanClause.Occur.MUST_NOT);

return realMLTQuery.build();
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Bad Request", e);
}
return fields.toArray(new String[0]);
}

private SolrDocument getDocument(String id) {
Expand All @@ -195,18 +110,4 @@ private SolrDocument getDocument(String id) {

return (SolrDocument) response.get("doc");
}

private Query createIdQuery(String defaultField, String uniqueValue) {
return new TermQuery(
req.getSchema().getField(defaultField).getType().getNumberType() != null
? createNumericTerm(defaultField, uniqueValue)
: new Term(defaultField, uniqueValue));
}

private Term createNumericTerm(String field, String uniqueValue) {
BytesRefBuilder bytesRefBuilder = new BytesRefBuilder();
bytesRefBuilder.grow(LegacyNumericUtils.BUF_SIZE_INT);
LegacyNumericUtils.intToPrefixCoded(Integer.parseInt(uniqueValue), 0, bytesRefBuilder);
return new Term(field, bytesRefBuilder.toBytesRef());
}
}
Loading

0 comments on commit 1eef75e

Please sign in to comment.