Skip to content

Commit

Permalink
adding use_log_scale and log_scale_similarity_threshold
Browse files Browse the repository at this point in the history
  • Loading branch information
seperman committed May 17, 2024
1 parent a739a50 commit ff6ff87
Show file tree
Hide file tree
Showing 8 changed files with 138 additions and 34 deletions.
20 changes: 12 additions & 8 deletions deepdiff/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
np, get_truncate_datetime, dict_, CannotCompare, ENUM_INCLUDE_KEYS,
PydanticBaseModel, Opcode, SetOrdered)
from deepdiff.serialization import SerializationMixin
from deepdiff.distance import DistanceMixin
from deepdiff.distance import DistanceMixin, logarithmic_similarity
from deepdiff.model import (
RemapDict, ResultDict, TextResult, TreeResult, DiffLevel,
DictRelationship, AttributeRelationship, REPORT_KEYS,
Expand Down Expand Up @@ -157,7 +157,9 @@ def __init__(self,
progress_logger: Callable=logger.info,
report_repetition: bool=False,
significant_digits: Optional[int]=None,
threshold_to_diff_deeper: float = 0,
use_log_scale: bool=False,
log_scale_similarity_threshold: int=0.1,
threshold_to_diff_deeper: float = 0.33,
truncate_datetime: Optional[str]=None,
use_enum_value: bool=False,
verbose_level: int=1,
Expand All @@ -178,7 +180,7 @@ def __init__(self,
"cutoff_distance_for_pairs, cutoff_intersection_for_pairs, log_frequency_in_sec, cache_size, "
"cache_tuning_sample_size, get_deep_distance, group_by, group_by_sort_key, cache_purge_level, "
"math_epsilon, iterable_compare_func, use_enum_value, _original_type, threshold_to_diff_deeper, "
"ignore_order_func, custom_operators, encodings, ignore_encoding_errors, "
"ignore_order_func, custom_operators, encodings, ignore_encoding_errors, use_log_scale, log_scale_similarity_threshold "
"_parameters and _shared_parameters.") % ', '.join(kwargs.keys()))

if _parameters:
Expand All @@ -196,6 +198,8 @@ def __init__(self,
if strings == ignore_type_in_groups or strings in ignore_type_in_groups:
ignore_string_type_changes = True
self.use_enum_value = use_enum_value
self.log_scale_similarity_threshold = log_scale_similarity_threshold
self.use_log_scale = use_log_scale
self.threshold_to_diff_deeper = threshold_to_diff_deeper
self.ignore_string_type_changes = ignore_string_type_changes
self.ignore_type_in_groups = self.get_ignore_types_in_groups(
Expand Down Expand Up @@ -583,9 +587,8 @@ def _diff_dict(
t_keys_union = t2_keys | t1_keys
t_keys_added = t2_keys - t_keys_intersect
t_keys_removed = t1_keys - t_keys_intersect

if self.threshold_to_diff_deeper:
if len(t_keys_union) and len(t_keys_intersect) / len(t_keys_union) < self.threshold_to_diff_deeper:
if len(t_keys_union) > 1 and len(t_keys_intersect) / len(t_keys_union) < self.threshold_to_diff_deeper:
self._report_result('values_changed', level, local_tree=local_tree)
return

Expand Down Expand Up @@ -1145,7 +1148,6 @@ def defaultdict_orderedset():
pairs = dict_()

pre_calced_distances = None

if hashes_added and hashes_removed and np and len(hashes_added) > 1 and len(hashes_removed) > 1:
# pre-calculates distances ONLY for 1D arrays whether an _original_type
# was explicitly passed or a homogeneous array is detected.
Expand Down Expand Up @@ -1233,7 +1235,6 @@ def _diff_iterable_with_deephash(self, level, parents_ids, _original_type=None,
else:
t1_hashtable = {k: v for k, v in full_t1_hashtable.items() if k in hashes_removed}
t2_hashtable = {k: v for k, v in full_t2_hashtable.items() if k in hashes_added}

if self._stats[PASSES_COUNT] < self.max_passes and get_pairs:
self._stats[PASSES_COUNT] += 1
pairs = self._get_most_in_common_pairs_in_iterables(
Expand Down Expand Up @@ -1403,7 +1404,10 @@ def _diff_numbers(self, level, local_tree=None, report_type_change=True):
else:
t1_type = t2_type = ''

if self.math_epsilon is not None:
if self.use_log_scale:
if not logarithmic_similarity(level.t1, level.t2, threshold=self.log_scale_similarity_threshold):
self._report_result('values_changed', level, local_tree=local_tree)
elif self.math_epsilon is not None:
if not is_close(level.t1, level.t2, abs_tol=self.math_epsilon):
self._report_result('values_changed', level, local_tree=local_tree)
elif self.significant_digits is None:
Expand Down
74 changes: 63 additions & 11 deletions deepdiff/distance.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import numpy as np
import math
import datetime
from deepdiff.deephash import DeepHash
from deepdiff.helper import (
Expand Down Expand Up @@ -31,7 +33,7 @@ def _get_rough_distance(self):
"""

_distance = get_numeric_types_distance(
self.t1, self.t2, max_=self.cutoff_distance_for_pairs)
self.t1, self.t2, max_=self.cutoff_distance_for_pairs, use_log_scale=self.use_log_scale, log_scale_similarity_threshold=self.log_scale_similarity_threshold)

if _distance is not not_found:
return _distance
Expand Down Expand Up @@ -122,7 +124,10 @@ def _precalculate_numpy_arrays_distance(

distances = _get_numpy_array_distance(
pairs_transposed[0], pairs_transposed[1],
max_=self.cutoff_distance_for_pairs)
max_=self.cutoff_distance_for_pairs,
use_log_scale=self.use_log_scale,
log_scale_similarity_threshold=self.log_scale_similarity_threshold,
)

i = 0
for added_hash in hashes_added:
Expand Down Expand Up @@ -186,14 +191,19 @@ def _get_item_length(item, parents_ids=frozenset([])):
return length


def _get_numbers_distance(num1, num2, max_=1):
def _get_numbers_distance(num1, num2, max_=1, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Get the distance of 2 numbers. The output is a number between 0 and max_.
    When max_ is returned it means the 2 numbers are really far apart, and 0 means they are equal.

    When use_log_scale is True the logarithmic distance of the 2 numbers is used instead:
    0 is returned if that distance is below log_scale_similarity_threshold, otherwise the
    logarithmic distance itself is returned (it is not clipped to max_).
    """
    if num1 == num2:
        return 0
    if use_log_scale:
        distance = logarithmic_distance(num1, num2)
        # Compare against the threshold value. Comparing against the
        # logarithmic_distance function object raises TypeError.
        if distance < log_scale_similarity_threshold:
            return 0
        return distance

Check warning on line 206 in deepdiff/distance.py

View check run for this annotation

Codecov / codecov/patch

deepdiff/distance.py#L203-L206

Added lines #L203 - L206 were not covered by tests
if not isinstance(num1, float):
num1 = float(num1)
if not isinstance(num2, float):
Expand All @@ -218,8 +228,42 @@ def _numpy_div(a, b, replace_inf_with=1):
result[a == b] = 0
return result

# To deal with numbers close to zero
MATH_LOG_OFFSET = 1e-10

def numpy_apply_log_keep_sign(array, offset=MATH_LOG_OFFSET):
# Calculate the absolute value and add the offset
abs_plus_offset = np.abs(array) + offset

Check warning on line 236 in deepdiff/distance.py

View check run for this annotation

Codecov / codecov/patch

deepdiff/distance.py#L236

Added line #L236 was not covered by tests

# Calculate the logarithm
log_values = np.log(abs_plus_offset)

Check warning on line 239 in deepdiff/distance.py

View check run for this annotation

Codecov / codecov/patch

deepdiff/distance.py#L239

Added line #L239 was not covered by tests

# Apply the original signs to the log values
signed_log_values = np.copysign(log_values, array)

Check warning on line 242 in deepdiff/distance.py

View check run for this annotation

Codecov / codecov/patch

deepdiff/distance.py#L242

Added line #L242 was not covered by tests

return signed_log_values

Check warning on line 244 in deepdiff/distance.py

View check run for this annotation

Codecov / codecov/patch

deepdiff/distance.py#L244

Added line #L244 was not covered by tests


def _get_numpy_array_distance(num1, num2, max_=1):
def logarithmic_similarity(a: numbers, b: numbers, threshold: float=0.1):
    """
    Tell whether 2 numbers are similar on a logarithmic scale, i.e. whether
    their logarithmic distance is strictly below the threshold.

    A threshold of 0.1 translates to about 10.5% difference.
    A threshold of 0.5 translates to about 65% difference.
    A threshold of 0.05 translates to about 5.1% difference.
    """
    distance = logarithmic_distance(a, b)
    is_similar = distance < threshold
    return is_similar


def logarithmic_distance(a: numbers, b: numbers):
    """
    Get the distance of 2 numbers on a signed logarithmic scale.

    Each number x is mapped to sign(x) * (log(|x| + MATH_LOG_OFFSET) - log(MATH_LOG_OFFSET))
    and the absolute difference of the 2 mapped values is returned.
    The result is 0 for equal numbers and grows with the ratio between them.
    """
    a = float(a)
    b = float(b)
    # log(|x| + offset) is negative for magnitudes below 1 and copysign() throws
    # that sign away, which folds small numbers onto their reciprocals
    # (0.5 vs 2, 1 vs -1 and 0 vs 1e10 would all end up with a distance of ~0).
    # Subtracting log(offset) makes the magnitude non-negative before the sign
    # is applied, so the mapping is monotonic. For 2 numbers of the same sign
    # the shift cancels out in the subtraction below.
    log_floor = math.log(MATH_LOG_OFFSET)
    log_a = math.copysign(math.log(abs(a) + MATH_LOG_OFFSET) - log_floor, a)
    log_b = math.copysign(math.log(abs(b) + MATH_LOG_OFFSET) - log_floor, b)

    return abs(log_a - log_b)


def _get_numpy_array_distance(num1, num2, max_=1, use_log_scale=False, log_scale_similarity_threshold=0.1):
"""
Get the distance of 2 numbers. The output is a number between 0 to the max.
The reason is the
Expand All @@ -229,24 +273,32 @@ def _get_numpy_array_distance(num1, num2, max_=1):
# getting the pairs of items during the ignore_order=True
# calculations, we need to make the divisor of comparison very big
# so that any 2 numbers can be chosen as pairs.
if use_log_scale:
num1 = numpy_apply_log_keep_sign(num1)
num2 = numpy_apply_log_keep_sign(num2)

Check warning on line 278 in deepdiff/distance.py

View check run for this annotation

Codecov / codecov/patch

deepdiff/distance.py#L277-L278

Added lines #L277 - L278 were not covered by tests

divisor = (num1 + num2) / max_
result = _numpy_div((num1 - num2), divisor, replace_inf_with=max_)
return np.clip(np.absolute(result), 0, max_)

distance_array = np.clip(np.absolute(result), 0, max_)
if use_log_scale:
distance_array[distance_array < log_scale_similarity_threshold] = 0

Check warning on line 285 in deepdiff/distance.py

View check run for this annotation

Codecov / codecov/patch

deepdiff/distance.py#L285

Added line #L285 was not covered by tests
return distance_array


def _get_datetime_distance(date1, date2, max_):
def _get_datetime_distance(date1, date2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Get the distance of 2 datetimes by comparing their timestamp() values.

    use_log_scale and log_scale_similarity_threshold are accepted so this function can be
    called with the same arguments as the other distance functions, but they are not
    forwarded: the timestamps are always compared without the log scale.
    They default to the same values as in _get_numbers_distance so that
    3-argument calls keep working.
    """
    return _get_numbers_distance(date1.timestamp(), date2.timestamp(), max_)


def _get_date_distance(date1, date2, max_):
def _get_date_distance(date1, date2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Get the distance of 2 dates by comparing their toordinal() values.

    use_log_scale and log_scale_similarity_threshold are accepted so this function can be
    called with the same arguments as the other distance functions, but they are not
    forwarded: the ordinals are always compared without the log scale.
    They default to the same values as in _get_numbers_distance so that
    3-argument calls keep working.
    """
    return _get_numbers_distance(date1.toordinal(), date2.toordinal(), max_)


def _get_timedelta_distance(timedelta1, timedelta2, max_):
def _get_timedelta_distance(timedelta1, timedelta2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Get the distance of 2 timedeltas by comparing their total_seconds() values.

    use_log_scale and log_scale_similarity_threshold are accepted so this function can be
    called with the same arguments as the other distance functions, but they are not
    forwarded: the seconds are always compared without the log scale.
    They default to the same values as in _get_numbers_distance so that
    3-argument calls keep working.
    """
    return _get_numbers_distance(timedelta1.total_seconds(), timedelta2.total_seconds(), max_)


def _get_time_distance(time1, time2, max_):
def _get_time_distance(time1, time2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Get the distance of 2 times by comparing the values time_to_seconds() returns for them.

    use_log_scale and log_scale_similarity_threshold are accepted so this function can be
    called with the same arguments as the other distance functions, but they are not
    forwarded: the seconds are always compared without the log scale.
    They default to the same values as in _get_numbers_distance so that
    3-argument calls keep working.
    """
    return _get_numbers_distance(time_to_seconds(time1), time_to_seconds(time2), max_)


Expand All @@ -259,8 +311,8 @@ def _get_time_distance(time1, time2, max_):
]


def get_numeric_types_distance(num1, num2, max_):
def get_numeric_types_distance(num1, num2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
for type_, func in TYPES_TO_DIST_FUNC:
if isinstance(num1, type_) and isinstance(num2, type_):
return func(num1, num2, max_)
return func(num1, num2, max_, use_log_scale, log_scale_similarity_threshold)
return not_found
1 change: 1 addition & 0 deletions deepdiff/helper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sys
import re
import os
import math
import datetime
import uuid
import logging
Expand Down
3 changes: 2 additions & 1 deletion tests/test_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ def test_cache_deeply_nested_b(self, nested_b_t1, nested_b_t2, nested_b_result):
'MAX PASS LIMIT REACHED': False,
'MAX DIFF LIMIT REACHED': False
}
assert expected_stats == stats
stats_diff = DeepDiff(expected_stats, stats, use_log_scale=True, log_scale_similarity_threshold=0.15)
assert not stats_diff
assert nested_b_result == diff

diff_of_diff = DeepDiff(nested_b_result, diff.to_dict(), ignore_order=False)
Expand Down
6 changes: 3 additions & 3 deletions tests/test_delta.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ def test_delta_dict_items_added_retain_order(self):
}
}

diff = DeepDiff(t1, t2)
diff = DeepDiff(t1, t2, threshold_to_diff_deeper=0)
delta_dict = diff._to_delta_dict()
assert expected_delta_dict == delta_dict
delta = Delta(diff, bidirectional=False, raise_errors=True)
Expand Down Expand Up @@ -828,9 +828,9 @@ def compare_func(item1, item2, level=None):
'delta_case14b_threshold_to_diff_deeper': {
't1': picklalbe_obj_without_item,
't2': PicklableClass(11),
'deepdiff_kwargs': {'threshold_to_diff_deeper': 0.33},
'deepdiff_kwargs': {'threshold_to_diff_deeper': 0.5},
'to_delta_kwargs': {},
'expected_delta_dict': {'values_changed': {'root': {'new_value': PicklableClass(11)}}}
'expected_delta_dict': {'attribute_added': {'root.item': 11}}
},
'delta_case15_diffing_simple_numbers': {
't1': 1,
Expand Down
38 changes: 34 additions & 4 deletions tests/test_diff_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def test_value_change(self):
def test_item_added_and_removed(self):
t1 = {1: 1, 2: 2, 3: [3], 4: 4}
t2 = {1: 1, 2: 4, 3: [3, 4], 5: 5, 6: 6}
ddiff = DeepDiff(t1, t2)
ddiff = DeepDiff(t1, t2, threshold_to_diff_deeper=0)
result = {
'dictionary_item_added': ["root[5]", "root[6]"],
'dictionary_item_removed': ["root[4]"],
Expand Down Expand Up @@ -1023,7 +1023,7 @@ def test_dictionary_with_string_keys1(self):
t1 = {"veggie": "carrots"}
t2 = {"meat": "carrots"}

diff = DeepDiff(t1, t2)
diff = DeepDiff(t1, t2, threshold_to_diff_deeper=0)
assert {'dictionary_item_added': ["root['meat']"],
'dictionary_item_removed': ["root['veggie']"]} == diff

Expand All @@ -1037,9 +1037,12 @@ def test_dictionary_with_string_keys_threshold_to_diff_deeper(self):
def test_dictionary_with_numeric_keys(self):
t1 = {Decimal('10.01'): "carrots"}
t2 = {10.01: "carrots"}
diff = DeepDiff(t1, t2)
diff = DeepDiff(t1, t2, threshold_to_diff_deeper=0)
assert {'dictionary_item_added': ["root[10.01]"], 'dictionary_item_removed': ["root[Decimal('10.01')]"]} == diff

diff2 = DeepDiff(t1, t2)
assert {'values_changed': {'root': {'new_value': {10.01: 'carrots'}, 'old_value': {Decimal('10.01'): 'carrots'}}}} == diff2

def test_loop(self):
class LoopTest:
def __init__(self, a):
Expand Down Expand Up @@ -1331,6 +1334,33 @@ def test_decimal_digits(self, t1, t2, significant_digits, expected_result):
ddiff = DeepDiff(t1, t2, ignore_numeric_type_changes=True, ignore_string_type_changes=True, significant_digits=significant_digits)
assert expected_result == ddiff

@pytest.mark.parametrize('test_num, t1, t2, log_scale_similarity_threshold, expected', [
    (
        1,
        {'foo': 110, 'bar': 306},  # t1
        {'foo': 140, 'bar': 298},  # t2
        0.01,  # threshold
        {'values_changed': {"root['foo']": {'new_value': 140, 'old_value': 110}, "root['bar']": {'new_value': 298, 'old_value': 306}}},  # expected
    ),
    (
        2,
        {'foo': 110, 'bar': 306},  # t1
        {'foo': 140, 'bar': 298},  # t2
        0.1,  # threshold
        {'values_changed': {"root['foo']": {'new_value': 140, 'old_value': 110}}},  # expected
    ),
    (
        # Each case needs its own number, otherwise the failure message
        # below can not tell the cases apart.
        3,
        {'foo': 110, 'bar': 306},  # t1
        {'foo': 140, 'bar': 298},  # t2
        0.3,  # threshold
        {},  # expected
    ),
])
def test_log_scale(self, test_num, t1, t2, log_scale_similarity_threshold, expected):
    """
    The same 2 dictionaries are diffed with use_log_scale=True and a growing
    log_scale_similarity_threshold: first both values are reported as changed,
    then only 'foo' and finally nothing.
    """
    diff = DeepDiff(t1, t2, use_log_scale=True, log_scale_similarity_threshold=log_scale_similarity_threshold)
    assert expected == diff, f"test_log_scale #{test_num} failed."

def test_ignore_type_in_groups(self):
t1 = [1, 2, 3]
t2 = [1.0, 2.0, 3.0]
Expand All @@ -1348,7 +1378,7 @@ def test_ignore_type_in_groups3(self):
t1 = {Decimal('10.01'): "carrots"}
t2 = {10.01: "carrots"}

diff1 = DeepDiff(t1, t2)
diff1 = DeepDiff(t1, t2, threshold_to_diff_deeper=0)

diff2 = DeepDiff(t1, t2, ignore_numeric_type_changes=True)

Expand Down
Loading

0 comments on commit ff6ff87

Please sign in to comment.