Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow generators and iterators #194

Merged
merged 26 commits into from
Dec 18, 2020
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
10c5c2b
Allow generators and iterators in evaluate
dkarrasch Dec 5, 2020
d3dd6e4
fix test
dkarrasch Dec 5, 2020
27699ff
fix one type-thing
dkarrasch Dec 5, 2020
e127fb4
include result_type proposal, add hamming tests
dkarrasch Dec 5, 2020
948291d
include renyi_divergence, haversine, bregman
dkarrasch Dec 6, 2020
243b7b0
include bhattacharyya / hellinger
dkarrasch Dec 6, 2020
8101bb3
Update test/test_dists.jl
dkarrasch Dec 8, 2020
8f44a30
include some review comments
dkarrasch Dec 8, 2020
c055d4d
relax parameter types
dkarrasch Dec 8, 2020
0227942
clean up UnionMetric evaluate
dkarrasch Dec 8, 2020
9b34ed9
include iterator-based pair- and colwise
dkarrasch Dec 12, 2020
85cdb1b
simplify/optimize pairwise
dkarrasch Dec 12, 2020
46a91c2
include generic result_type tests
dkarrasch Dec 13, 2020
8fb5108
Revert "clean up UnionMetric evaluate"
dkarrasch Dec 13, 2020
5d04ff0
minor UnionMetric edits
dkarrasch Dec 14, 2020
27994eb
include comments from code review
dkarrasch Dec 14, 2020
6e4c09b
add colwise & pairwise docstrings
dkarrasch Dec 14, 2020
5b096d4
Apply suggestions from code review
dkarrasch Dec 15, 2020
518fd3d
simplify _eltype, add a note to colwise docstring
dkarrasch Dec 15, 2020
18f17af
fix typo
dkarrasch Dec 15, 2020
8640d36
transpose -> permutedims
dkarrasch Dec 15, 2020
5be0415
handle CartesianIndex
dkarrasch Dec 15, 2020
4333bda
increase code coverage
dkarrasch Dec 15, 2020
8219ad0
fix docstrings
dkarrasch Dec 16, 2020
5402ed8
Revert "handle CartesianIndex"
dkarrasch Dec 16, 2020
1350d47
rm redundant tests
dkarrasch Dec 17, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions src/Distances.jl
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,15 @@ export
rmsd,
nrmsd

# Compatibility shim for `require_one_based_indexing`: import it from `Base` on
# Julia 1.2 and newer, otherwise define an equivalent on top of `Base.has_offset_axes`.
if VERSION >= v"1.2-"
    import Base: require_one_based_indexing
else
    import Base: has_offset_axes
    # Return `true` when none of the given arrays has offset axes; throw otherwise.
    function require_one_based_indexing(A...)
        has_offset_axes(A...) &&
            throw(ArgumentError("offset arrays are not supported but got an array with index other than 1"))
        return true
    end
end

include("common.jl")
include("generic.jl")
include("metrics.jl")
Expand Down
47 changes: 31 additions & 16 deletions src/bhattacharyya.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,39 +9,54 @@ struct HellingerDist <: Metric end

# Bhattacharyya coefficient

function bhattacharyya_coeff(a::AbstractVector{T}, b::AbstractVector{T}) where {T <: Number}
if length(a) != length(b)
throw(DimensionMismatch("first array has length $(length(a)) which does not match the length of the second, $(length(b))."))
function bhattacharyya_coeff(a, b)
dkarrasch marked this conversation as resolved.
Show resolved Hide resolved
n = length(a)
if n != length(b)
throw(DimensionMismatch("first argument has length $n which does not match the length of the second, $(length(b))."))
end
sqab, asum, bsum = _bhattacharyya_coeff(a, b)
# We must normalize since we cannot assume that the vectors are normalized to probability vectors.
return sqab / sqrt(asum * bsum)
end

n = length(a)
@inline function _bhattacharyya_coeff(a, b)
Ta = _eltype(a)
Tb = _eltype(b)
T = typeof(sqrt(zero(promote_type(Ta, Tb))))
sqab = zero(T)
# We must normalize since we cannot assume that the vectors are normalized to probability vectors.
asum = zero(T)
bsum = zero(T)
asum = zero(Ta)
bsum = zero(Tb)

for (ai, bi) in zip(a, b)
sqab += sqrt(ai * bi)
asum += ai
bsum += bi
end
return sqab, asum, bsum
end
@inline function _bhattacharyya_coeff(a::AbstractVector{Ta}, b::AbstractVector{Tb}) where {Ta<:Number,Tb<:Number}
T = typeof(sqrt(oneunit(Ta)*oneunit(Tb)))
sqab = zero(T)
asum = zero(Ta)
bsum = zero(Tb)

@simd for i = 1:n
@simd for i in eachindex(a, b)
@inbounds ai = a[i]
@inbounds bi = b[i]
sqab += sqrt(ai * bi)
asum += ai
bsum += bi
end

sqab / sqrt(asum * bsum)
return sqab, asum, bsum
end

bhattacharyya_coeff(a::T, b::T) where {T <: Number} = throw("Bhattacharyya coefficient cannot be calculated for scalars")

# Faster pair- and column-wise versions TBD...


# Bhattacharyya distance
(::BhattacharyyaDist)(a::AbstractVector{T}, b::AbstractVector{T}) where {T <: Number} = -log(bhattacharyya_coeff(a, b))
(::BhattacharyyaDist)(a::T, b::T) where {T <: Number} = throw("Bhattacharyya distance cannot be calculated for scalars")
(::BhattacharyyaDist)(a, b) = -log(bhattacharyya_coeff(a, b))
bhattacharyya(a, b) = BhattacharyyaDist()(a, b)

# Hellinger distance
(::HellingerDist)(a::AbstractVector{T}, b::AbstractVector{T}) where {T <: Number} = sqrt(1 - bhattacharyya_coeff(a, b))
(::HellingerDist)(a::T, b::T) where {T <: Number} = throw("Hellinger distance cannot be calculated for scalars")
(::HellingerDist)(a, b) = sqrt(1 - bhattacharyya_coeff(a, b))
hellinger(a, b) = HellingerDist()(a, b)
16 changes: 8 additions & 8 deletions src/bregman.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,26 +22,26 @@ end
Bregman(F, ∇) = Bregman(F, ∇, LinearAlgebra.dot)

# Evaluation function
function (dist::Bregman)(p::AbstractVector, q::AbstractVector)
function (dist::Bregman)(p, q)
# Create cache vals.
FP_val = dist.F(p);
FQ_val = dist.F(q);
DQ_val = dist.∇(q);
p_size = size(p);
FP_val = dist.F(p)
FQ_val = dist.F(q)
DQ_val = dist.∇(q)
p_size = length(p)
# Check F codomain.
if !(isa(FP_val, Real) && isa(FQ_val, Real))
throw(ArgumentError("F Codomain Error: F doesn't map the vectors to real numbers"))
end
# Check vector size.
if !(p_size == size(q))
if p_size != length(q)
throw(DimensionMismatch("The vector p ($(size(p))) and q ($(size(q))) are different sizes."))
end
# Check gradient size.
if !(size(DQ_val) == p_size)
if length(DQ_val) != p_size
throw(DimensionMismatch("The gradient result is not the same size as p and q"))
end
# Return the Bregman divergence.
return FP_val - FQ_val - dist.inner(DQ_val, p-q);
return FP_val - FQ_val - dist.inner(DQ_val, p .- q)
end

# Convenience function.
Expand Down
6 changes: 6 additions & 0 deletions src/common.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ function get_common_ncols(a::AbstractMatrix, b::AbstractMatrix)
return na
end

# Return the shared length of `a` and `b`; throw a `DimensionMismatch` when
# `length(a)` and `length(b)` differ.
function get_common_length(a, b)
    len = length(a)
    if length(b) != len
        throw(DimensionMismatch("The lengths of a and b must match."))
    end
    return len
end

function get_pairwise_dims(r::AbstractMatrix, a::AbstractMatrix, b::AbstractMatrix)
ma, na = size(a)
mb, nb = size(b)
Expand Down
169 changes: 149 additions & 20 deletions src/generic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,49 +26,116 @@ evaluate(dist::PreMetric, a, b) = dist(a, b)
# Generic functions

"""
result_type(dist::PreMetric, Ta::Type, Tb::Type) -> T
result_type(dist::PreMetric, a::AbstractArray, b::AbstractArray) -> T
result_type(dist, Ta::Type, Tb::Type) -> T
result_type(dist, a, b) -> T

Infer the result type of metric `dist` with input type `Ta` and `Tb`, or input
data `a` and `b`.
Infer the result type of metric `dist` with input types `Ta` and `Tb`, or element types
of iterators `a` and `b`.
"""
result_type(::PreMetric, ::Type, ::Type) = Float64 # fallback
result_type(dist::PreMetric, a::AbstractArray, b::AbstractArray) = result_type(dist, eltype(a), eltype(b))
result_type(dist, a, b) = result_type(dist, _eltype(a), _eltype(b))
result_type(f, a::Type, b::Type) = typeof(f(oneunit(a), oneunit(b))) # don't require `PreMetric` subtyping


# Innermost element type of a value: dispatch on the `Base.IteratorEltype` trait of `a`.
function _eltype(a)
    return __eltype(Base.IteratorEltype(a), a)
end

# Innermost element type of a type: apply `eltype` repeatedly until it returns its
# own argument (a fixed point), and return that type.
function _eltype(::Type{T}) where {T}
    E = eltype(T)
    return E === T ? T : _eltype(E)
end

# The iterator declares an element type: recurse into it.
function __eltype(::Base.HasEltype, a)
    return _eltype(eltype(a))
end

# No declared element type: use the type of the first element instead.
# Note that this calls `first(a)`.
function __eltype(::Base.EltypeUnknown, a)
    return _eltype(typeof(first(a)))
end

# Generic column-wise evaluation

"""
colwise!(r::AbstractArray, metric::PreMetric, a, b)

Compute distances between corresponding elements of the iterable collections
`a` and `b` according to distance `metric`, and store the result in `r`.

`a` and `b` must have the same number of elements, `r` must be a vector of length
`length(a) == length(b)`.
"""
function colwise!(r::AbstractArray, metric::PreMetric, a, b)
    require_one_based_indexing(r)
    len = length(a)
    # Both inputs and the output must agree on the number of elements.
    if length(b) != len
        throw(DimensionMismatch("iterators have different lengths"))
    end
    if length(r) != len
        throw(DimensionMismatch("Incorrect size of r."))
    end
    # Walk `a` and `b` in lockstep; `idx` counts from 1 and addresses `r`.
    @inbounds for (idx, (x, y)) in enumerate(zip(a, b))
        r[idx] = metric(x, y)
    end
    return r
end

function colwise!(r::AbstractArray, metric::PreMetric, a::AbstractVector, b::AbstractMatrix)
require_one_based_indexing(r)
n = size(b, 2)
length(r) == n || throw(DimensionMismatch("Incorrect size of r."))
@inbounds for j = 1:n
r[j] = metric(a, view(b, :, j))
@inbounds for (rj, bj) in enumerate(axes(b, 2))
r[rj] = metric(a, view(b, :, bj))
end
r
end

function colwise!(r::AbstractArray, metric::PreMetric, a::AbstractMatrix, b::AbstractVector)
require_one_based_indexing(r)
n = size(a, 2)
length(r) == n || throw(DimensionMismatch("Incorrect size of r."))
@inbounds for j = 1:n
r[j] = metric(view(a, :, j), b)
@inbounds for (rj, aj) in enumerate(axes(a, 2))
r[rj] = metric(view(a, :, aj), b)
end
r
end

"""
colwise!(r::AbstractArray, metric::PreMetric,
a::AbstractMatrix, b::AbstractMatrix)
dkarrasch marked this conversation as resolved.
Show resolved Hide resolved

Compute distances between corresponding columns of `a` and `b` according
to distance `metric`, and store the result in `r`. Exactly one of `a` or `b`
can be a vector, in which case the distances between that vector and all columns
of the other matrix are computed.

`a` and `b` must have the same number of columns if neither of the two is a
vector. `r` must be a vector of length `max(size(a, 2), size(b, 2))`.
"""
function colwise!(r::AbstractArray, metric::PreMetric, a::AbstractMatrix, b::AbstractMatrix)
require_one_based_indexing(r, a, b)
n = get_common_ncols(a, b)
length(r) == n || throw(DimensionMismatch("Incorrect size of r."))
@inbounds for j = 1:n
@inbounds for j in 1:n
r[j] = metric(view(a, :, j), view(b, :, j))
end
r
end

function colwise!(r::AbstractArray, metric::SemiMetric, a::AbstractMatrix, b::AbstractVector)
colwise!(r, metric, b, a)
"""
colwise(metric::PreMetric, a, b)

Compute distances between corresponding elements of the iterable collections
`a` and `b` according to distance `metric`.

`a` and `b` must have the same number of elements (`length(a) == length(b)`).
"""
function colwise(metric::PreMetric, a, b)
    # Validate the lengths first, then allocate an output of the inferred result type.
    len = get_common_length(a, b)
    T = result_type(metric, a, b)
    out = Vector{T}(undef, len)
    return colwise!(out, metric, a, b)
end

"""
colwise(metric::PreMetric, a::AbstractMatrix, b::AbstractMatrix)
colwise(metric::PreMetric, a::AbstractVector, b::AbstractMatrix)
colwise(metric::PreMetric, a::AbstractMatrix, b::AbstractVector)

Compute distances between corresponding columns of `a` and `b` according to
distance `metric`. Exactly one of `a` or `b` can be a vector, in which case the
distances between that vector and all columns of the other matrix are computed.

!!! note
If both `a` and `b` are vectors, the generic, iterator-based method of
`colwise` applies.

`a` and `b` must have the same number of columns if neither of the two is a
vector.
"""
function colwise(metric::PreMetric, a::AbstractMatrix, b::AbstractMatrix)
n = get_common_ncols(a, b)
r = Vector{result_type(metric, a, b)}(undef, n)
Expand All @@ -90,8 +157,20 @@ end

# Generic pairwise evaluation

# Fill `r[i, j]` with `metric(ai, bj)`, where `ai` is the `i`-th element of `a` and
# `bj` the `j`-th element of `b`. `r` must have size `(length(a), length(b))`.
function _pairwise!(r::AbstractMatrix, metric::PreMetric, a, b=a)
    require_one_based_indexing(r)
    expected = (length(a), length(b))
    size(r) == expected || throw(DimensionMismatch("Incorrect size of r."))
    # Outer loop over `b` (columns of `r`), inner loop over `a` (rows of `r`).
    @inbounds for (col, y) in enumerate(b)
        for (row, x) in enumerate(a)
            r[row, col] = metric(x, y)
        end
    end
    return r
end

function _pairwise!(r::AbstractMatrix, metric::PreMetric,
a::AbstractMatrix, b::AbstractMatrix=a)
require_one_based_indexing(r, a, b)
na = size(a, 2)
nb = size(b, 2)
size(r) == (na, nb) || throw(DimensionMismatch("Incorrect size of r."))
Expand All @@ -104,18 +183,36 @@ function _pairwise!(r::AbstractMatrix, metric::PreMetric,
r
end

# Symmetric pairwise evaluation over the elements of a single collection `a`:
# `metric` is only called for entries below the diagonal; the diagonal is set to 0
# and entries above the diagonal are copied from their mirrored position.
function _pairwise!(r::AbstractMatrix, metric::SemiMetric, a)
    require_one_based_indexing(r)
    len = length(a)
    size(r) == (len, len) || throw(DimensionMismatch("Incorrect size of r."))
    indexed = enumerate(a)
    # `Iterators.product` advances its first iterator fastest, so `row` is the
    # inner index and `col` the outer one.
    @inbounds for ((row, x), (col, y)) in Iterators.product(indexed, indexed)
        if row > col
            r[row, col] = metric(x, y)
        elseif row == col
            r[row, col] = 0
        else
            # Mirror of an entry that was written while processing column `row`.
            r[row, col] = r[col, row]
        end
    end
    return r
end

function _pairwise!(r::AbstractMatrix, metric::SemiMetric, a::AbstractMatrix)
require_one_based_indexing(r)
n = size(a, 2)
size(r) == (n, n) || throw(DimensionMismatch("Incorrect size of r."))
@inbounds for j = 1:n
for i = 1:(j - 1)
r[i, j] = r[j, i] # leveraging the symmetry of SemiMetric
end
r[j, j] = 0
aj = view(a, :, j)
for i = (j + 1):n
r[i, j] = metric(view(a, :, i), aj)
end
r[j, j] = 0
for i = 1:(j - 1)
r[i, j] = r[j, i] # leveraging the symmetry of SemiMetric
end
end
r
end
Expand All @@ -140,7 +237,7 @@ in `a` and `b` according to distance `metric`, and store the result in `r`.
If a single matrix `a` is provided, compute distances between its rows or columns.

`a` and `b` must have the same numbers of columns if `dims=1`, or of rows if `dims=2`.
`r` must be a square matrix with size `size(a, dims) == size(b, dims)`.
`r` must be a matrix with size `size(a, dims) × size(b, dims)`.
"""
function pairwise!(r::AbstractMatrix, metric::PreMetric,
a::AbstractMatrix, b::AbstractMatrix;
Expand All @@ -161,7 +258,7 @@ function pairwise!(r::AbstractMatrix, metric::PreMetric,
size(r) == (na, nb) ||
throw(DimensionMismatch("Incorrect size of r (got $(size(r)), expected $((na, nb)))."))
if dims == 1
_pairwise!(r, metric, transpose(a), transpose(b))
_pairwise!(r, metric, permutedims(a), permutedims(b))
else
_pairwise!(r, metric, a, b)
end
Expand All @@ -179,12 +276,24 @@ function pairwise!(r::AbstractMatrix, metric::PreMetric, a::AbstractMatrix;
size(r) == (n, n) ||
throw(DimensionMismatch("Incorrect size of r (got $(size(r)), expected $((n, n)))."))
if dims == 1
_pairwise!(r, metric, transpose(a))
_pairwise!(r, metric, permutedims(a))
else
_pairwise!(r, metric, a)
end
end

"""
pairwise!(r::AbstractMatrix, metric::PreMetric, a, b=a)

Compute distances between each element of collection `a` and each element of
collection `b` according to distance `metric`, and store the result in `r`.
If a single iterable `a` is provided, compute distances between its elements.

`r` must be a matrix with size `length(a) × length(b)`.
"""
function pairwise!(r::AbstractMatrix, metric::PreMetric, a, b)
    # Two-collection form: forward to the internal kernel.
    return _pairwise!(r, metric, a, b)
end

function pairwise!(r::AbstractMatrix, metric::PreMetric, a)
    # Single-collection form: forwarded with one collection argument so that
    # dispatch on `metric` can select a specialized `_pairwise!` method.
    return _pairwise!(r, metric, a)
end

"""
pairwise(metric::PreMetric, a::AbstractMatrix, b::AbstractMatrix=a; dims)

Expand Down Expand Up @@ -212,3 +321,23 @@ function pairwise(metric::PreMetric, a::AbstractMatrix;
r = Matrix{result_type(metric, a, a)}(undef, n, n)
pairwise!(r, metric, a, dims=dims)
end

"""
pairwise(metric::PreMetric, a, b=a)

Compute distances between each element of collection `a` and each element of
collection `b` according to distance `metric`. If a single iterable `a` is
provided, compute distances between its elements.
"""
function pairwise(metric::PreMetric, a, b)
    # Allocate a `length(a) × length(b)` matrix of the inferred result type and fill it.
    nrows = length(a)
    ncols = length(b)
    T = result_type(metric, a, b)
    out = Matrix{T}(undef, nrows, ncols)
    return _pairwise!(out, metric, a, b)
end

function pairwise(metric::PreMetric, a)
    # Allocate a square `length(a) × length(a)` matrix of the inferred result type
    # and fill it with the distances between the elements of `a`.
    len = length(a)
    T = result_type(metric, a, a)
    out = Matrix{T}(undef, len, len)
    return _pairwise!(out, metric, a)
end
Loading