Skip to content

Commit

Permalink
Fuzzy Matching Functions - Jaro Winkler Similarity and Levenshtein Distance (#2839)
Browse files Browse the repository at this point in the history
  • Loading branch information
abhishoya-gs authored May 30, 2024
1 parent b2634f5 commit 7e535d6
Show file tree
Hide file tree
Showing 21 changed files with 167 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
</dependency>
<!-- COMMONS CODEC -->

<!-- TEST -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.eclipse.collections.api.factory.Lists;
import org.eclipse.collections.api.list.MutableList;
import org.finos.legend.engine.plan.dependencies.domain.date.DayOfWeek;
Expand Down Expand Up @@ -1776,4 +1778,14 @@ public static double coTangent(double input)
{
return 1.0 / Math.tan(input);
}

/**
 * Computes the Jaro-Winkler similarity of two strings by delegating to
 * commons-text's {@code JaroWinklerSimilarity}.
 *
 * @param str1 the first string to compare
 * @param str2 the second string to compare
 * @return the similarity score reported by {@code JaroWinklerSimilarity#apply} for the pair
 */
public static double jaroWinklerSimilarity(String str1, String str2)
{
    final JaroWinklerSimilarity scorer = new JaroWinklerSimilarity();
    final Double score = scorer.apply(str1, str2);
    // Auto-unboxed to the primitive return type, exactly as a direct return of apply(...) would be.
    return score;
}

/**
 * Computes the Levenshtein (edit) distance between two strings by delegating to
 * commons-text's {@code LevenshteinDistance}.
 *
 * @param str1 the first string to compare
 * @param str2 the second string to compare
 * @return the edit distance reported by {@code LevenshteinDistance#apply}, widened to {@code long}
 */
public static long levenshteinDistance(String str1, String str2)
{
    // Reuse the library's shared, threshold-free default instance instead of allocating
    // a new (stateless) LevenshteinDistance on every invocation.
    final int distance = LevenshteinDistance.getDefaultInstance().apply(str1, str2);
    return distance;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1362,6 +1362,8 @@ private void registerStrings()
m(h("meta::pure::functions::string::encodeUrl_String_1__String_1__String_1_", true, ps -> res("String", "one"), ps -> ps.size() == 2))));
register(m(m(h("meta::pure::functions::string::decodeUrl_String_1__String_1_", false, ps -> res("String", "one"), ps -> ps.size() == 1)),
m(h("meta::pure::functions::string::decodeUrl_String_1__String_1__String_1_", true, ps -> res("String", "one"), ps -> ps.size() == 2))));
register("meta::pure::functions::string::jaroWinklerSimilarity_String_1__String_1__Float_1_", true, ps -> res("Float", "one"));
register("meta::pure::functions::string::levenshteinDistance_String_1__String_1__Integer_1_", true, ps -> res("Integer", "one"));
}

private void registerTrigo()
Expand Down Expand Up @@ -2513,6 +2515,8 @@ private Map<String, Dispatch> buildDispatch()
map.put("meta::pure::functions::string::lpad_String_1__Integer_1__String_1__String_1_", (List<ValueSpecification> ps) -> ps.size() == 3 && isOne(ps.get(0)._multiplicity()) && ("Nil".equals(ps.get(0)._genericType()._rawType()._name()) || "String".equals(ps.get(0)._genericType()._rawType()._name())) && isOne(ps.get(1)._multiplicity()) && ("Nil".equals(ps.get(1)._genericType()._rawType()._name()) || "Integer".equals(ps.get(1)._genericType()._rawType()._name())) && isOne(ps.get(2)._multiplicity()) && ("Nil".equals(ps.get(2)._genericType()._rawType()._name()) || "String".equals(ps.get(2)._genericType()._rawType()._name())));
map.put("meta::pure::functions::string::rpad_String_1__Integer_1__String_1_", (List<ValueSpecification> ps) -> ps.size() == 2 && isOne(ps.get(0)._multiplicity()) && ("Nil".equals(ps.get(0)._genericType()._rawType()._name()) || "String".equals(ps.get(0)._genericType()._rawType()._name())) && isOne(ps.get(1)._multiplicity()) && ("Nil".equals(ps.get(1)._genericType()._rawType()._name()) || "Integer".equals(ps.get(1)._genericType()._rawType()._name())));
map.put("meta::pure::functions::string::rpad_String_1__Integer_1__String_1__String_1_", (List<ValueSpecification> ps) -> ps.size() == 3 && isOne(ps.get(0)._multiplicity()) && ("Nil".equals(ps.get(0)._genericType()._rawType()._name()) || "String".equals(ps.get(0)._genericType()._rawType()._name())) && isOne(ps.get(1)._multiplicity()) && ("Nil".equals(ps.get(1)._genericType()._rawType()._name()) || "Integer".equals(ps.get(1)._genericType()._rawType()._name())) && isOne(ps.get(2)._multiplicity()) && ("Nil".equals(ps.get(2)._genericType()._rawType()._name()) || "String".equals(ps.get(2)._genericType()._rawType()._name())));
map.put("meta::pure::functions::string::jaroWinklerSimilarity_String_1__String_1__Float_1_", (List<ValueSpecification> ps) -> ps.size() == 2 && isOne(ps.get(0)._multiplicity()) && ("Nil".equals(ps.get(0)._genericType()._rawType()._name()) || "String".equals(ps.get(0)._genericType()._rawType()._name())) && isOne(ps.get(1)._multiplicity()) && ("Nil".equals(ps.get(1)._genericType()._rawType()._name()) || "String".equals(ps.get(1)._genericType()._rawType()._name())));
map.put("meta::pure::functions::string::levenshteinDistance_String_1__String_1__Integer_1_", (List<ValueSpecification> ps) -> ps.size() == 2 && isOne(ps.get(0)._multiplicity()) && ("Nil".equals(ps.get(0)._genericType()._rawType()._name()) || "String".equals(ps.get(0)._genericType()._rawType()._name())) && isOne(ps.get(1)._multiplicity()) && ("Nil".equals(ps.get(1)._genericType()._rawType()._name()) || "String".equals(ps.get(1)._genericType()._rawType()._name())));
map.put("meta::pure::graphFetch::calculateSourceTree_RootGraphFetchTree_1__Mapping_1__Extension_MANY__RootGraphFetchTree_1_", (List<ValueSpecification> ps) -> ps.size() == 3 && isOne(ps.get(0)._multiplicity()) && Sets.immutable.with("Nil", "RootGraphFetchTree", "ExtendedRootGraphFetchTree", "RoutedRootGraphFetchTree", "SerializeTopRootGraphFetchTree").contains(ps.get(0)._genericType()._rawType()._name()) && isOne(ps.get(1)._multiplicity()) && ("Nil".equals(ps.get(1)._genericType()._rawType()._name()) || "Mapping".equals(ps.get(1)._genericType()._rawType()._name())) && ("Nil".equals(ps.get(2)._genericType()._rawType()._name()) || "Extension".equals(ps.get(2)._genericType()._rawType()._name())));
map.put("meta::pure::graphFetch::execution::graphFetchChecked_T_MANY__RootGraphFetchTree_1__Checked_MANY_", (List<ValueSpecification> ps) -> ps.size() == 2 && isOne(ps.get(1)._multiplicity()) && Sets.immutable.with("Nil", "RootGraphFetchTree", "ExtendedRootGraphFetchTree", "RoutedRootGraphFetchTree", "SerializeTopRootGraphFetchTree").contains(ps.get(1)._genericType()._rawType()._name()));
map.put("meta::pure::graphFetch::execution::graphFetch_T_MANY__RootGraphFetchTree_1__Integer_1__T_MANY_", (List<ValueSpecification> ps) -> ps.size() == 3 && isOne(ps.get(1)._multiplicity()) && Sets.immutable.with("Nil", "RootGraphFetchTree", "ExtendedRootGraphFetchTree", "RoutedRootGraphFetchTree", "SerializeTopRootGraphFetchTree").contains(ps.get(1)._genericType()._rawType()._name()) && isOne(ps.get(2)._multiplicity()) && ("Nil".equals(ps.get(2)._genericType()._rawType()._name()) || "Integer".equals(ps.get(2)._genericType()._rawType()._name())));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,9 @@ Class meta::legend::test::handlers::model::TestString
toUpper(){$this.string->toUpper()}:String[1];
trim(){$this.string->trim()}:String[1];
hashString(){$this.string->meta::pure::functions::hash::hash(meta::pure::functions::hash::HashType.MD5)}:String[1];

jaroWinklerSimilarity(){$this.string->meta::pure::functions::string::jaroWinklerSimilarity($this.string)}:Float[1];
levenshteinDistance(){$this.string->meta::pure::functions::string::levenshteinDistance($this.string)}:Integer[1];
}

Class meta::legend::test::handlers::model::TestDate
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,8 @@ function meta::pure::router::routing::shouldStopFunctions(extensions:meta::pure:
left_String_1__Integer_1__String_1_,
right_String_1__Integer_1__String_1_,
typeName_Any_1__String_1_,
levenshteinDistance_String_1__String_1__Integer_1_,
jaroWinklerSimilarity_String_1__String_1__Float_1_,
meta::pure::tds::extensions::firstNotNull_T_MANY__T_$0_1$_,
meta::pure::functions::date::calendar::annualized_Date_1__String_1__Date_1__Number_$0_1$__Number_$0_1$_,
meta::pure::functions::date::calendar::cme_Date_1__String_1__Date_1__Number_$0_1$__Number_$0_1$_,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,9 @@ function meta::pure::executionPlan::platformBinding::legendJava::library::string
fc2(decodeUrl_String_1__String_1__String_1_, {ctx,str,charset | $library->j_invoke('decodeUrl', [$str, $charset], javaString())}),
fc2(encodeUrl_String_1__String_1__String_1_, {ctx,str,charset | $library->j_invoke('encodeUrl', [$str, $charset], javaString())}),

fc2(hash_String_1__HashType_1__String_1_, {ctx,text,hashType | $library->j_invoke('hash', [$text, $hashType], javaString())})

fc2(hash_String_1__HashType_1__String_1_, {ctx,text,hashType | $library->j_invoke('hash', [$text, $hashType], javaString())}),
fc2(jaroWinklerSimilarity_String_1__String_1__Float_1_, {ctx,str1,str2 | $library->j_invoke('jaroWinklerSimilarity',[$str1, $str2], javaDouble())}),
fc2(levenshteinDistance_String_1__String_1__Integer_1_, {ctx,str1,str2 | $library->j_invoke('levenshteinDistance',[$str1, $str2], javaLong())})
]);

$conventions->registerLibrary($lib);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ function meta::relational::functions::sqlQueryToString::duckDB::convertDateToSql
| if ($date->hasSubsecond(),
| let d= format('%t{[' + $timeZone + ']yyyy-MM-dd HH:mm:ss.SSSSSS}', $date);
format('TIMESTAMP \'%s\'',$d);,
| let d= format('%t{[' + $timeZone + ']yyyy-MM-dd HH:mm:ss}', $date);
| let d= format('%t{[' + $timeZone + ']yyyy-MM-dd HH:mm:ss}', $date);
format('TIMESTAMP_S \'%s\'',$d);
),
| let d =format('%t{[' + $timeZone + ']yyyy-MM-dd}', $date);
Expand All @@ -74,12 +74,12 @@ function meta::relational::functions::sqlQueryToString::duckDB::convertDateToSql
function <<access.private>> meta::relational::functions::sqlQueryToString::duckDB::getDynaFunctionToSqlForDuckDB(): DynaFunctionToSql[*]
{
let allStates = allGenerationStates();

[
dynaFnToSql('adjust', $allStates, ^ToSql(format='date_add(%s)', transform={p:String[3] | $p->at(0) + ',' + constructIntervalFunction($p->at(2), $p->at(1)) })),
dynaFnToSql('booland', $allStates, ^ToSql(format='every(%s)')),
dynaFnToSql('boolor', $allStates, ^ToSql(format='any(%s)')),
dynaFnToSql('castBoolean', $allStates, ^ToSql(format='cast(%s as boolean)')),
dynaFnToSql('boolor', $allStates, ^ToSql(format='any(%s)')),
dynaFnToSql('castBoolean', $allStates, ^ToSql(format='cast(%s as boolean)')),
dynaFnToSql('chr', $allStates, ^ToSql(format='char(%s)')),
dynaFnToSql('concat', $allStates, ^ToSql(format='concat%s', transform={p:String[*]|$p->joinStrings('(', ', ', ')')})),
// dynaFnToSql('convertDate', $allStates, ^ToSql(format='%s', transform={p:String[*] | $p->convertToDateH2()})),
Expand Down Expand Up @@ -145,7 +145,9 @@ function <<access.private>> meta::relational::functions::sqlQueryToString::duckD
dynaFnToSql('toString', $allStates, ^ToSql(format='cast(%s as varchar)')),
// dynaFnToSql('toTimestamp', $allStates, ^ToSql(format='%s', transform={p:String[2] | $p->transformToTimestampH2()})),
dynaFnToSql('weekOfYear', $allStates, ^ToSql(format='week(%s)')),
dynaFnToSql('year', $allStates, ^ToSql(format='year(%s)'))
dynaFnToSql('year', $allStates, ^ToSql(format='year(%s)')),
dynaFnToSql('jaroWinklerSimilarity', $allStates, ^ToSql(format='jaro_winkler_similarity(%s, %s)')),
dynaFnToSql('levenshteinDistance', $allStates, ^ToSql(format='levenshtein(%s, %s)'))
];
}

Expand All @@ -158,8 +160,8 @@ function <<access.private>> meta::relational::functions::sqlQueryToString::duckD
if($dayOfWeek->at(1)=='\'Sunday\'',
|'dayofweek('+$dayOfWeek->at(0)+')+1',
|'isodow('+$dayOfWeek->at(0)+')' // (Monday = 1, Sunday = 7).
);
);
);
);
}

function <<access.private>> meta::relational::functions::sqlQueryToString::duckDB::processPaddingParams(p:String[*]):String[*]
Expand All @@ -177,7 +179,7 @@ function <<access.private>> meta::relational::functions::sqlQueryToString::duckD
function meta::relational::functions::sqlQueryToString::duckDB::constructIntervalFunction(unit:String[1], i:String[1]):String[1]
{
let unitWithoutQuotes = $unit->removeQuotesIfExist();

let interval_func= [
pair(DurationUnit.YEARS->toString(), 'to_years'),
pair(DurationUnit.MONTHS->toString(), 'to_months'),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,11 @@ function <<access.private>> meta::relational::functions::sqlQueryToString::snowf
dynaFnToSql('hour', $allStates, ^ToSql(format='date_part(\'hour\', %s)')),
dynaFnToSql('indexOf', $allStates, ^ToSql(format='CHARINDEX(%s)', transform={p:String[2] | $p->at(1) + ', ' + $p->at(0)})),
dynaFnToSql('isAlphaNumeric', $allStates, ^ToSql(format=regexpPattern('%s'), transform={p:String[1]|$p->transformAlphaNumericParamsDefault()})),
dynaFnToSql('jaroWinklerSimilarity', $allStates, ^ToSql(format='(jarowinkler_similarity(%s, %s)/100)')),
dynaFnToSql('joinStrings', $allStates, ^ToSql(format='listagg(%s, %s)')),
dynaFnToSql('log10', $allStates, ^ToSql(format='log(10, %s)')),
dynaFnToSql('length', $allStates, ^ToSql(format='length(%s)')),
dynaFnToSql('levenshteinDistance', $allStates, ^ToSql(format='editdistance(%s, %s)')),
dynaFnToSql('matches', $allStates, ^ToSql(format=regexpPattern('%s'), transform={p:String[2]|$p->transformRegexpParams()})),
dynaFnToSql('minute', $allStates, ^ToSql(format='minute(%s)')),
dynaFnToSql('month', $allStates, ^ToSql(format='MONTH(%s)')),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -222,3 +222,23 @@ function <<test.Test>> meta::relational::tests::sqlToString::snowflake::simpleGr
let snowflakeSql = toSQLString($fn, meta::relational::tests::simpleRelationalMapping, meta::relational::runtime::DatabaseType.Snowflake, meta::relational::extension::relationalExtensions());
assertEquals('select "productTable_d#5_d#2_m2".NAME as "90.01", count(*) as "cnt" from tradeTable as "root" left outer join productSchema.productTable as "productTable_d#5_d#2_m2" on ("root".prodId = "productTable_d#5_d#2_m2".ID) group by "90.01"', $snowflakeSql);
}

// Checks the Snowflake SQL generated for jaroWinklerSimilarity applied to a projected property:
// the call is expected to render as jarowinkler_similarity(...) divided by 100.
function <<test.Test>> meta::relational::tests::sqlToString::snowflake::testJaroWinklerSimilarity():Boolean[1]
{
   let expectedSql = 'select (jarowinkler_similarity("root".FIRSTNAME, \'John\')/100) as "similarity" from personTable as "root"';

   let generatedSql = toSQLString(|Person.all()->project(p|$p.firstName->jaroWinklerSimilarity('John'), 'similarity'),
                                  simpleRelationalMapping,
                                  DatabaseType.Snowflake,
                                  meta::relational::extension::relationalExtensions());

   assertEquals($expectedSql, $generatedSql);
}

// Checks the Snowflake SQL generated for levenshteinDistance applied to a projected property:
// the call is expected to render as editdistance(...).
// The projected column is aliased 'distance' because the value is an edit distance,
// not a similarity score.
function <<test.Test>> meta::relational::tests::sqlToString::snowflake::testLevenshteinDistance():Boolean[1]
{
   let sql = toSQLString(
      |Person.all()
         ->project(p|$p.firstName->levenshteinDistance('John'), 'distance')
      , simpleRelationalMapping, DatabaseType.Snowflake, meta::relational::extension::relationalExtensions());

   assertEquals('select editdistance("root".FIRSTNAME, \'John\') as "distance" from personTable as "root"', $sql);
}
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,13 @@
</dependency>
<!-- COMMONS-LANG -->

<!-- COMMONS-TEXT (Fuzzy Search) -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
</dependency>
<!-- COMMONS-TEXT -->

<!-- COMMONS CODEC (Base64)-->
<dependency>
<groupId>commons-codec</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.finos.legend.engine.shared.core.ObjectMapperFactory;
import org.h2.tools.SimpleResultSet;
import org.h2.value.Value;
Expand Down Expand Up @@ -309,4 +311,24 @@ else if (value instanceof Integer || value instanceof Long)
throw new RuntimeException(e);
}
}

/**
 * Computes the Levenshtein (edit) distance between two H2 string values.
 *
 * @param string1 the first value to compare
 * @param string2 the second value to compare
 * @return {@code ValueNull.INSTANCE} when either argument is {@code ValueNull.INSTANCE};
 *         otherwise a {@code ValueInteger} holding the distance computed by commons-text's
 *         {@code LevenshteinDistance} over the arguments' string forms
 */
public static Value legend_h2_extension_edit_distance(Value string1, Value string2)
{
    // A null in either argument propagates as null.
    if (string1 == ValueNull.INSTANCE || string2 == ValueNull.INSTANCE)
    {
        return ValueNull.INSTANCE;
    }

    // Reuse the library's shared, threshold-free default instance instead of allocating
    // a new (stateless) LevenshteinDistance for every evaluated row.
    final int distance = LevenshteinDistance.getDefaultInstance().apply(string1.getString(), string2.getString());
    return ValueInteger.get(distance);
}

/**
 * Computes the Jaro-Winkler similarity of two H2 string values.
 *
 * @param string1 the first value to compare
 * @param string2 the second value to compare
 * @return {@code ValueNull.INSTANCE} when either argument is {@code ValueNull.INSTANCE};
 *         otherwise a {@code ValueDouble} holding the score computed by commons-text's
 *         {@code JaroWinklerSimilarity} over the arguments' string forms
 */
public static Value legend_h2_extension_jaro_winkler_similarity(Value string1, Value string2)
{
    final boolean bothPresent = string1 != ValueNull.INSTANCE && string2 != ValueNull.INSTANCE;
    if (!bothPresent)
    {
        // A null in either argument propagates as null.
        return ValueNull.INSTANCE;
    }

    final JaroWinklerSimilarity scorer = new JaroWinklerSimilarity();
    final double score = scorer.apply(string1.getString(), string2.getString());
    return ValueDouble.get(score);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,9 @@ private static List<String> getLegendH2ExtensionSQLs()
"CREATE ALIAS IF NOT EXISTS legend_h2_extension_base64_encode FOR \"org.finos.legend.engine.plan.execution.stores.relational.LegendH2Extensions.legend_h2_extension_base64_encode\";",
"CREATE ALIAS IF NOT EXISTS legend_h2_extension_reverse_string FOR \"org.finos.legend.engine.plan.execution.stores.relational.LegendH2Extensions.legend_h2_extension_reverse_string\";",
"CREATE ALIAS IF NOT EXISTS legend_h2_extension_flatten_array FOR \"org.finos.legend.engine.plan.execution.stores.relational.LegendH2Extensions.legend_h2_extension_flatten_array\";",
"CREATE ALIAS IF NOT EXISTS legend_h2_extension_split_part FOR \"org.finos.legend.engine.plan.execution.stores.relational.LegendH2Extensions.legend_h2_extension_split_part\";"
"CREATE ALIAS IF NOT EXISTS legend_h2_extension_split_part FOR \"org.finos.legend.engine.plan.execution.stores.relational.LegendH2Extensions.legend_h2_extension_split_part\";",
"CREATE ALIAS IF NOT EXISTS legend_h2_extension_edit_distance FOR \"org.finos.legend.engine.plan.execution.stores.relational.LegendH2Extensions.legend_h2_extension_edit_distance\";",
"CREATE ALIAS IF NOT EXISTS legend_h2_extension_jaro_winkler_similarity FOR \"org.finos.legend.engine.plan.execution.stores.relational.LegendH2Extensions.legend_h2_extension_jaro_winkler_similarity\";"
);
}

Expand All @@ -123,7 +125,9 @@ private static List<String> getLegendH2_1_4_200_ExtensionSQLs()
"CREATE ALIAS IF NOT EXISTS legend_h2_extension_hash_md5 FOR \"org.finos.legend.engine.plan.execution.stores.relational.LegendH2Extensions_1_4_200.legend_h2_extension_hash_md5\";",
"CREATE ALIAS IF NOT EXISTS legend_h2_extension_hash_sha1 FOR \"org.finos.legend.engine.plan.execution.stores.relational.LegendH2Extensions_1_4_200.legend_h2_extension_hash_sha1\";",
"CREATE ALIAS IF NOT EXISTS legend_h2_extension_flatten_array FOR \"org.finos.legend.engine.plan.execution.stores.relational.LegendH2Extensions_1_4_200.legend_h2_extension_flatten_array\";",
"CREATE ALIAS IF NOT EXISTS legend_h2_extension_split_part FOR \"org.finos.legend.engine.plan.execution.stores.relational.LegendH2Extensions_1_4_200.legend_h2_extension_split_part\";"
"CREATE ALIAS IF NOT EXISTS legend_h2_extension_split_part FOR \"org.finos.legend.engine.plan.execution.stores.relational.LegendH2Extensions_1_4_200.legend_h2_extension_split_part\";",
"CREATE ALIAS IF NOT EXISTS legend_h2_extension_edit_distance FOR \"org.finos.legend.engine.plan.execution.stores.relational.LegendH2Extensions_1_4_200.legend_h2_extension_edit_distance\";",
"CREATE ALIAS IF NOT EXISTS legend_h2_extension_jaro_winkler_similarity FOR \"org.finos.legend.engine.plan.execution.stores.relational.LegendH2Extensions_1_4_200.legend_h2_extension_jaro_winkler_similarity\";"
);
}
}
Loading

0 comments on commit 7e535d6

Please sign in to comment.