From fdd2daec89dfb7d718b400f398bd37a5b861a7b6 Mon Sep 17 00:00:00 2001 From: Sky Blaise <69448615+SkyBlaise99@users.noreply.github.com> Date: Sun, 3 Mar 2024 13:01:13 +0800 Subject: [PATCH] [#944] Improve performance (#2108) Currently, the authorship credit analysis process exceeds an hour in duration, which is significantly longer than the mere 5 minutes required when the feature is deactivated. Let's speed up the performance by implementing a caching mechanism and refining the dynamic programming algorithm utilized for computing the Levenshtein distance. --- .../analyzer/AuthorshipAnalyzer.java | 87 +++++++++++++++---- src/main/java/reposense/util/StringsUtil.java | 67 ++++++++++---- .../java/reposense/util/StringsUtilTest.java | 30 +++++-- 3 files changed, 144 insertions(+), 40 deletions(-) diff --git a/src/main/java/reposense/authorship/analyzer/AuthorshipAnalyzer.java b/src/main/java/reposense/authorship/analyzer/AuthorshipAnalyzer.java index 498ccd0162..bcc90b395e 100644 --- a/src/main/java/reposense/authorship/analyzer/AuthorshipAnalyzer.java +++ b/src/main/java/reposense/authorship/analyzer/AuthorshipAnalyzer.java @@ -2,6 +2,8 @@ import java.nio.file.Paths; import java.time.ZonedDateTime; +import java.util.ArrayList; +import java.util.concurrent.ConcurrentHashMap; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -41,6 +43,10 @@ public class AuthorshipAnalyzer { private static final String ADDED_LINE_SYMBOL = "+"; private static final String DELETED_LINE_SYMBOL = "-"; + private static final ConcurrentHashMap GIT_LOG_CACHE = new ConcurrentHashMap<>(); + private static final ConcurrentHashMap>> GIT_DIFF_CACHE = + new ConcurrentHashMap<>(); + /** * Analyzes the authorship of {@code lineContent} in {@code filePath} based on {@code originalityThreshold}. * Returns {@code true} if {@code currentAuthor} should be assigned full credit, {@code false} otherwise. @@ -85,32 +91,73 @@ public static boolean analyzeAuthorship(RepoConfiguration config, String filePat */ private static CandidateLine getDeletedLineWithLowestOriginality(RepoConfiguration config, String filePath, String lineContent, String commitHash) { - String gitLogResults = GitLog.getParentCommits(config.getRepoRoot(), commitHash); - String[] parentCommits = gitLogResults.split(" "); - CandidateLine lowestOriginalityLine = null; + String gitLogCacheKey = config.getRepoRoot() + commitHash; + String[] parentCommits; + if (GIT_LOG_CACHE.containsKey(gitLogCacheKey)) { + parentCommits = GIT_LOG_CACHE.get(gitLogCacheKey); + } else { + String gitLogResults = GitLog.getParentCommits(config.getRepoRoot(), commitHash); + parentCommits = gitLogResults.split(" "); + GIT_LOG_CACHE.put(gitLogCacheKey, parentCommits); + } + for (String parentCommit : parentCommits) { - // Generate diff between commit and parent commit - String gitDiffResult = GitDiff.diffCommits(config.getRepoRoot(), parentCommit, commitHash); - String[] fileDiffResultList = gitDiffResult.split(DIFF_FILE_CHUNK_SEPARATOR); + String gitDiffCacheKey = config.getRepoRoot() + parentCommit + commitHash; + ArrayList fileDiffResultList; + ArrayList preImageFilePathList; + ArrayList postImageFilePathList; + + if (GIT_DIFF_CACHE.containsKey(gitDiffCacheKey)) { + ArrayList> cacheResults = GIT_DIFF_CACHE.get(gitDiffCacheKey); + fileDiffResultList = cacheResults.get(0); + preImageFilePathList = cacheResults.get(1); + postImageFilePathList = cacheResults.get(2); + } else { + fileDiffResultList = new ArrayList<>(); + preImageFilePathList = new ArrayList<>(); + postImageFilePathList = new ArrayList<>(); + + // Generate diff between commit and parent commit + String gitDiffResult = GitDiff.diffCommits(config.getRepoRoot(), parentCommit, commitHash); + String[] fileDiffResults = gitDiffResult.split(DIFF_FILE_CHUNK_SEPARATOR); + + for (String fileDiffResult : fileDiffResults) { + Matcher filePathMatcher = FILE_CHANGED_PATTERN.matcher(fileDiffResult); + if (!filePathMatcher.find()) { + continue; + } - for (String fileDiffResult : fileDiffResultList) { - Matcher filePathMatcher = FILE_CHANGED_PATTERN.matcher(fileDiffResult); - if (!filePathMatcher.find()) { - continue; + // If file was added in the commit + String preImageFilePath = filePathMatcher.group(PRE_IMAGE_FILE_PATH_GROUP_NAME); + if (preImageFilePath.equals(FILE_ADDED_SYMBOL)) { + continue; + } + + String postImageFilePath = filePathMatcher.group(POST_IMAGE_FILE_PATH_GROUP_NAME); + + fileDiffResultList.add(fileDiffResult); + preImageFilePathList.add(preImageFilePath); + postImageFilePathList.add(postImageFilePath); } - String preImageFilePath = filePathMatcher.group(PRE_IMAGE_FILE_PATH_GROUP_NAME); - String postImageFilePath = filePathMatcher.group(POST_IMAGE_FILE_PATH_GROUP_NAME); + ArrayList> cacheResults = new ArrayList<>(); + cacheResults.add(fileDiffResultList); + cacheResults.add(preImageFilePathList); + cacheResults.add(postImageFilePathList); + + GIT_DIFF_CACHE.put(gitDiffCacheKey, cacheResults); + } - // If file was added in the commit or file name does not match - if (preImageFilePath.equals(FILE_ADDED_SYMBOL) || !postImageFilePath.equals(filePath)) { + for (int i = 0; i < fileDiffResultList.size(); i++) { + // If file name does not match + if (!postImageFilePathList.get(i).equals(filePath)) { continue; } CandidateLine candidateLine = getDeletedLineWithLowestOriginalityInDiff( - fileDiffResult, lineContent, parentCommit, preImageFilePath); + fileDiffResultList.get(i), lineContent, parentCommit, preImageFilePathList.get(i)); if (candidateLine == null) { continue; } @@ -152,7 +199,11 @@ private static CandidateLine getDeletedLineWithLowestOriginalityInDiff(String fi if (lineChanged.startsWith(DELETED_LINE_SYMBOL)) { String deletedLineContent = lineChanged.substring(DELETED_LINE_SYMBOL.length()); - double originalityScore = computeOriginalityScore(lineContent, deletedLineContent); + double lowestOriginalityScore = lowestOriginalityLine == null + ? Integer.MAX_VALUE + : lowestOriginalityLine.getOriginalityScore(); + double originalityScore = computeOriginalityScore(lineContent, deletedLineContent, + lowestOriginalityScore); if (lowestOriginalityLine == null || originalityScore < lowestOriginalityLine.getOriginalityScore()) { @@ -192,8 +243,8 @@ private static int getPreImageStartingLineNumber(String linesChangedHeader) { /** * Calculates the originality score of {@code s} with {@code baseString}. */ - private static double computeOriginalityScore(String s, String baseString) { - double levenshteinDistance = StringsUtil.getLevenshteinDistance(s, baseString); + private static double computeOriginalityScore(String s, String baseString, double limit) { + double levenshteinDistance = StringsUtil.getLevenshteinDistance(s, baseString, limit * baseString.length()); return levenshteinDistance / baseString.length(); } diff --git a/src/main/java/reposense/util/StringsUtil.java b/src/main/java/reposense/util/StringsUtil.java index 02787e8c38..51534d96ad 100644 --- a/src/main/java/reposense/util/StringsUtil.java +++ b/src/main/java/reposense/util/StringsUtil.java @@ -94,35 +94,72 @@ public static boolean isNumeric(String string) { /** * Calculates the Levenshtein Distance between two strings using Dynamic Programming. + * Insertion, deletion, and substitution are all of cost 1. + * This version improves the space complexity down to O(min(s, t)) + *

+ * The dp will stop if the {@code limit} is reached, this means that if the final distance is 7 and the limit is set + * to 3, the algorithm ends early once it reaches 3. This is possible as we are using this method to find the string + * with the lowest Levenshtein distance. + *

+ * Returns {@code Integer.MAX_VALUE} if limit is reached, else returns the computed Levenshtein distance. */ - public static int getLevenshteinDistance(String s, String t) { - // dp[i][j] stores the distance between s.substring(0, i) and t.substring(0, j) -> distance(s[:i], t[:j]) - int[][] dp = new int[s.length() + 1][t.length() + 1]; + public static int getLevenshteinDistance(String s, String t, double limit) { + // Early termination if either string is empty, lev dist is just the length of the other string. + if (s.isEmpty()) { + return t.length(); + } + + if (t.isEmpty()) { + return s.length(); + } - // Distance between a string and an empty string is the length of the string - for (int i = 0; i <= s.length(); i++) { - dp[i][0] = i; + // The final lev dist is at least k where k = difference in length = number of insert/delete. + if (Math.abs(s.length() - t.length()) >= limit) { + return Integer.MAX_VALUE; } + if (s.length() < t.length()) { + // Swap s and t to ensure s is always the longer string + String temp = s; + s = t; + t = temp; + } + + int[] dp = new int[t.length() + 1]; for (int i = 0; i <= t.length(); i++) { - dp[0][i] = i; + dp[i] = i; } for (int i = 1; i <= s.length(); i++) { + // Store the value of the previous row's column + int prev = dp[0]; + dp[0] = i; + + // If for this row, all the values are at least k, then the final lev dist computed will also be at least k. + // hasLower will check for values smaller than the limit, and terminate early if limit is reached. + boolean hasLower = false; + for (int j = 1; j <= t.length(); j++) { - // If s[i-1] and t[j-1] are equal, distance(s[:i], t[:j]) equals to distance(s[:i-1], t[:j-1]) + int temp = dp[j]; + if (s.charAt(i - 1) == t.charAt(j - 1)) { - dp[i][j] = dp[i - 1][j - 1]; + dp[j] = prev; } else { - // distance(s[:i], t[:j]) is the minimum of: - // 1) distance(s[:i-1], t[:j]) + 1 -> add s[i] - // 2) distance(s[:i], t[:j-1]) + 1 -> add t[j] - // 3) distance(s[:i-1], t[:j-1]) + 1 -> substitute s[i] with t[j] - dp[i][j] = Math.min(dp[i - 1][j], Math.min(dp[i][j - 1], dp[i - 1][j - 1])) + 1; + dp[j] = Math.min(prev, Math.min(dp[j - 1], dp[j])) + 1; + } + + prev = temp; + + if (dp[j] < limit) { + hasLower = true; } } + + if (!hasLower) { + return Integer.MAX_VALUE; + } } - return dp[s.length()][t.length()]; + return dp[t.length()]; } } diff --git a/src/test/java/reposense/util/StringsUtilTest.java b/src/test/java/reposense/util/StringsUtilTest.java index 38defe0b0e..1c1fc93ac3 100644 --- a/src/test/java/reposense/util/StringsUtilTest.java +++ b/src/test/java/reposense/util/StringsUtilTest.java @@ -91,36 +91,52 @@ public void addQuotesForFilePath_specialBashCharacters_success() { @Test public void getLevenshteinDistance_success() { - Assertions.assertEquals(4, StringsUtil.getLevenshteinDistance("potato", "tomatoes")); + Assertions.assertEquals(4, StringsUtil.getLevenshteinDistance("potato", "tomatoes", Integer.MAX_VALUE)); } @Test public void getLevenshteinDistance_insertion_success() { - Assertions.assertEquals(2, StringsUtil.getLevenshteinDistance("abcd", "abcdef")); + Assertions.assertEquals(2, StringsUtil.getLevenshteinDistance("abcd", "abcdef", Integer.MAX_VALUE)); } @Test public void getLevenshteinDistance_deletion_success() { - Assertions.assertEquals(3, StringsUtil.getLevenshteinDistance("abcde", "ab")); + Assertions.assertEquals(3, StringsUtil.getLevenshteinDistance("abcde", "ab", Integer.MAX_VALUE)); } @Test public void getLevenshteinDistance_substitution_success() { - Assertions.assertEquals(4, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg")); + Assertions.assertEquals(4, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg", Integer.MAX_VALUE)); } @Test public void getLevenshteinDistance_identicalStrings_success() { - Assertions.assertEquals(0, StringsUtil.getLevenshteinDistance("abcdefg", "abcdefg")); + Assertions.assertEquals(0, StringsUtil.getLevenshteinDistance("abcdefg", "abcdefg", Integer.MAX_VALUE)); } @Test public void getLevenshteinDistance_emptyStrings_success() { - Assertions.assertEquals(0, StringsUtil.getLevenshteinDistance("", "")); + Assertions.assertEquals(0, StringsUtil.getLevenshteinDistance("", "", Integer.MAX_VALUE)); } @Test public void getLevenshteinDistance_emptyString_success() { - Assertions.assertEquals(6, StringsUtil.getLevenshteinDistance("abcdef", "")); + Assertions.assertEquals(6, StringsUtil.getLevenshteinDistance("abcdef", "", Integer.MAX_VALUE)); + } + + @Test + public void getLevenshteinDistance_belowLimit_success() { + Assertions.assertEquals(4, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg", 4.0001)); + Assertions.assertEquals(4, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg", 123.456)); + Assertions.assertEquals(4, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg", Integer.MAX_VALUE)); + } + + @Test + public void getLevenshteinDistance_exceedLimit_success() { + Assertions.assertEquals(Integer.MAX_VALUE, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg", 4.000)); + Assertions.assertEquals(Integer.MAX_VALUE, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg", 3.99999)); + Assertions.assertEquals(Integer.MAX_VALUE, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg", 0.89014)); + + Assertions.assertEquals(Integer.MAX_VALUE, StringsUtil.getLevenshteinDistance("a", "1234567", 2.0)); } }