Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wrong implementation of transpositions score in Jaro-Winkler distance #9

Open
illabo opened this issue Apr 6, 2021 · 0 comments
Open

Comments

@illabo
Copy link

illabo commented Apr 6, 2021

Thanks for your convenient extension. It may be really useful in many cases. But it gives the wrong output for Jaro-Winkler score.

The problem is in Jaro part: you can't count transpositions correctly running only one loop. There are missing transpositions in a case when strings are offset, has different length etc. I was banging my head in denial of the second for loop too. And there is no way to avoid it, presumably.

You may want to check this implementation: https://www.rosettacode.org/wiki/Jaro-Winkler_distance#Swift, however it looks like there is a problem in common prefix count.
Also you may want to look into these implementations, prefix similarity is treated correct there: https://www.geeksforgeeks.org/jaro-and-jaro-winkler-similarity/, but there aren't Swift example.

Comparison of my take on Jaro-Winkler score vs yours vs Rosetta Code's
import Foundation

extension String {
    func similarity(to: String, with algorithm: StringSimilarityAlgorithm = .jaroWinkler) -> Double {
        stringSimilarityScore(self, to: to, with: algorithm)
    }
}

// Reimplementation
func stringSimilarityScore(_ from: String, to: String, with algo: StringSimilarityAlgorithm = .jaroWinkler) -> Double {
    let fromLen = from.count
    let toLen = to.count

    guard fromLen != 0, toLen != 0 else {
        if fromLen == toLen { return 1 }
        return 0
    }
    let matchDistance: Int = {
        let d = (max(fromLen, toLen)/2) - 1
        if d == 0 { return 1 }
        return d
    }()
    var matches = 0
    var fromMatches = Array<Bool>(repeating: false, count: fromLen)
    var toMatches = Array<Bool>(repeating: false, count: toLen)
    from.enumerated().forEach{ ee in
        let seekStartIdx = max(0, ee.offset - matchDistance)
        let seekEndIdx = min(ee.offset + matchDistance, toLen - 1)
        guard seekStartIdx <= seekEndIdx else {
            return
        }
        for si in seekStartIdx...seekEndIdx {
            if toMatches[si] { continue }
            let ti = to.index(to.startIndex, offsetBy: si)
            let tc = to[ti]
            if tc == ee.element {
                matches += 1
                fromMatches[ee.offset] = true
                toMatches[si] = true
                break
            }
        }
    }

    var transpositions = 0
    var seenToIdx = 0
    for i in 0..<fromLen {
        if fromMatches[i] == false {
            continue
        }
        if seenToIdx >= toLen {
            break
        }
        while toMatches[seenToIdx] == false {
            seenToIdx += 1
        }
        let fromStrIdx = from.index(from.startIndex, offsetBy: i)
        let fromChar = from[fromStrIdx]
        let toStrIdx = to.index(to.startIndex, offsetBy: seenToIdx)
        let toChar = to[toStrIdx]
        if fromChar != toChar {
            transpositions += 1
        }
        seenToIdx += 1
    }

    let jaro: Double = {
        if matches == 0 { return 0 }
        let matchesDouble = Double(matches)
        let mDivFrom = matchesDouble / Double(fromLen)
        let mDivTo = matchesDouble / Double(toLen)
        let halfTransp = Double(transpositions) / 2
        return (mDivFrom + mDivTo + (matchesDouble - halfTransp) / matchesDouble) / 3
    }()

    switch algo {
    case .jaro:
        return jaro
    case .jaroWinkler:
        let commonPrefix = min(4, from.commonPrefix(with: to).count)
        return (jaro + Double(commonPrefix) * 0.1 * (1.0 - jaro))
    }
}

enum StringSimilarityAlgorithm {
    case jaro, jaroWinkler
}

// from https://github.com/autozimu/StringMetric.swift
extension String {
    /// Get Jaro-Winkler distance.
    ///
    /// (Score is normalized such that 0 equates to no similarity and 1 is an exact match).
    ///
    /// Reference <https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance>
    /// - Parameter target: The target `String`.
    /// - Returns: The Jaro-Winkler distance between the receiver and `target`.
    public func distanceJaroWinkler(between target: String) -> Double {
        var stringOne = self
        var stringTwo = target
        if stringOne.count > stringTwo.count {
            stringTwo = self
            stringOne = target
        }

        let stringOneCount = stringOne.count
        let stringTwoCount = stringTwo.count

        if stringOneCount == 0 && stringTwoCount == 0 {
            return 1.0
        }

        let matchingDistance = stringTwoCount / 2
        var matchingCharactersCount: Double = 0
        var transpositionsCount: Double = 0
        var previousPosition = -1

        // Count matching characters and transpositions.
        for (i, stringOneChar) in stringOne.enumerated() {
            for (j, stringTwoChar) in stringTwo.enumerated() {
                if max(0, i - matchingDistance)..<min(stringTwoCount, i + matchingDistance) ~= j {
                    if stringOneChar == stringTwoChar {
                        matchingCharactersCount += 1
                        if previousPosition != -1 && j < previousPosition {
                            transpositionsCount += 1
                        }
                        previousPosition = j
                        break
                    }
                }
            }
        }

        if matchingCharactersCount == 0.0 {
            return 0.0
        }

        // Count common prefix (up to a maximum of 4 characters)
        let commonPrefixCount = min(max(Double(self.commonPrefix(with: target).count), 0), 4)

        let jaroSimilarity = (matchingCharactersCount / Double(stringOneCount) + matchingCharactersCount / Double(stringTwoCount) + (matchingCharactersCount - transpositionsCount) / matchingCharactersCount) / 3

        // Default is 0.1, should never exceed 0.25 (otherwise similarity score could exceed 1.0)
        let commonPrefixScalingFactor = 0.1

        return jaroSimilarity + commonPrefixCount * commonPrefixScalingFactor * (1 - jaroSimilarity)
    }

}

// From rosettacode.org
func jaroWinklerDistance(string1: String, string2: String) -> Double {
    var st1 = Array(string1)
    var st2 = Array(string2)
    var len1 = st1.count
    var len2 = st2.count
    if len1 < len2 {
        swap(&st1, &st2)
        swap(&len1, &len2)
    }
    if len2 == 0 {
        return len1 == 0 ? 0.0 : 1.0
    }
    let delta = max(1, len1 / 2) - 1
    var flag = Array(repeating: false, count: len2)
    var ch1Match: [Character] = []
    ch1Match.reserveCapacity(len1)
    for idx1 in 0..<len1 {
        let ch1 = st1[idx1]
        for idx2 in 0..<len2 {
            let ch2 = st2[idx2]
            if idx2 <= idx1 + delta && idx2 + delta >= idx1 && ch1 == ch2 && !flag[idx2] {
                flag[idx2] = true
                ch1Match.append(ch1)
                break
            }
        }
    }
    let matches = ch1Match.count
    if matches == 0 {
        return 1.0
    }
    var transpositions = 0
    var idx1 = 0
    for idx2 in 0..<len2 {
        if flag[idx2] {
            if st2[idx2] != ch1Match[idx1] {
                transpositions += 1
            }
            idx1 += 1
        }
    }
    let m = Double(matches)
    let jaro =
        (m / Double(len1) + m / Double(len2) + (m - Double(transpositions) / 2.0) / m) / 3.0
    var commonPrefix = 0
    for i in 0..<min(4, len2) {
        if st1[i] == st2[i] {
            commonPrefix += 1
        }
    }
    return 1.0 - (jaro + Double(commonPrefix) * 0.1 * (1.0 - jaro))
}

extension String {
    func rosettaJaroWinkler(to: String) -> Double {
        return 1 - jaroWinklerDistance(string1: self, string2: to)
    }
}

print("illabo")

print("transpositions".similarity(to: "sun transitions"))
print("transpositions".similarity(to: "trance positions"))
print("transpositions".similarity(to: "post transitions"))
print("transpositions".similarity(to: "disposition"))
print("transpositions".similarity(to: "transpositiosn"))
print("transpositions".similarity(to: "tranpsositions"))
print("kitten".similarity(to: "sitting"))
print("君子和而不同".similarity(to: "小人同而不和"))
print("accomodate".similarity(to: "accommodate"))
print("accomodate".similarity(to: "accommodated"))
print("accomodate".similarity(to: "accommodates"))
print("accomodate".similarity(to: "accommodating"))
print("accomodate".similarity(to: "accommodation"))
print("CRATE".similarity(to: "TRACE"))
print("TRATE".similarity(to: "TRACE"))
print("DwAyNE".similarity(to: "DuANE"))

print("autozimu")

print("transpositions".distanceJaroWinkler(between: "sun transitions"))
print("transpositions".distanceJaroWinkler(between: "trance positions"))
print("transpositions".distanceJaroWinkler(between: "post transitions"))
print("transpositions".distanceJaroWinkler(between: "disposition"))
print("transpositions".distanceJaroWinkler(between: "transpositiosn"))
print("transpositions".distanceJaroWinkler(between: "tranpsositions"))
print("kitten".distanceJaroWinkler(between: "sitting"))
print("君子和而不同".distanceJaroWinkler(between: "小人同而不和"))
print("accomodate".distanceJaroWinkler(between: "accommodate"))
print("accomodate".distanceJaroWinkler(between: "accommodated"))
print("accomodate".distanceJaroWinkler(between: "accommodates"))
print("accomodate".distanceJaroWinkler(between: "accommodating"))
print("accomodate".distanceJaroWinkler(between: "accommodation"))
print("CRATE".distanceJaroWinkler(between: "TRACE"))
print("TRATE".distanceJaroWinkler(between: "TRACE"))
print("DwAyNE".distanceJaroWinkler(between: "DuANE"))

print("rosettacode.org")

print("transpositions".rosettaJaroWinkler(to: "sun transitions"))
print("transpositions".rosettaJaroWinkler(to: "trance positions"))
print("transpositions".rosettaJaroWinkler(to: "post transitions"))
print("transpositions".rosettaJaroWinkler(to: "disposition"))
print("transpositions".rosettaJaroWinkler(to: "transpositiosn"))
print("transpositions".rosettaJaroWinkler(to: "tranpsositions"))
print("kitten".rosettaJaroWinkler(to: "sitting"))
print("君子和而不同".rosettaJaroWinkler(to: "小人同而不和"))
print("accomodate".rosettaJaroWinkler(to: "accommodate"))
print("accomodate".rosettaJaroWinkler(to: "accommodated"))
print("accomodate".rosettaJaroWinkler(to: "accommodates"))
print("accomodate".rosettaJaroWinkler(to: "accommodating"))
print("accomodate".rosettaJaroWinkler(to: "accommodation"))
print("CRATE".rosettaJaroWinkler(to: "TRACE"))
print("TRATE".rosettaJaroWinkler(to: "TRACE"))
print("DwAyNE".rosettaJaroWinkler(to: "DuANE"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant