Skip to content

Commit

Permalink
[#944] Improve performance (#2108)
Browse files Browse the repository at this point in the history
Currently, the authorship credit analysis process exceeds an hour in
duration, which is significantly longer than the mere 5 minutes required
when the feature is deactivated.

Let's speed up the performance by implementing a caching mechanism and
refining the dynamic programming algorithm utilized for computing the
Levenshtein distance.
  • Loading branch information
SkyBlaise99 authored Mar 3, 2024
1 parent 79209a3 commit fdd2dae
Show file tree
Hide file tree
Showing 3 changed files with 144 additions and 40 deletions.
87 changes: 69 additions & 18 deletions src/main/java/reposense/authorship/analyzer/AuthorshipAnalyzer.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import java.nio.file.Paths;
import java.time.ZonedDateTime;
import java.util.ArrayList;
import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -41,6 +43,10 @@ public class AuthorshipAnalyzer {
private static final String ADDED_LINE_SYMBOL = "+";
private static final String DELETED_LINE_SYMBOL = "-";

private static final ConcurrentHashMap<String, String[]> GIT_LOG_CACHE = new ConcurrentHashMap<>();
private static final ConcurrentHashMap<String, ArrayList<ArrayList<String>>> GIT_DIFF_CACHE =
new ConcurrentHashMap<>();

/**
* Analyzes the authorship of {@code lineContent} in {@code filePath} based on {@code originalityThreshold}.
* Returns {@code true} if {@code currentAuthor} should be assigned full credit, {@code false} otherwise.
Expand Down Expand Up @@ -85,32 +91,73 @@ public static boolean analyzeAuthorship(RepoConfiguration config, String filePat
*/
private static CandidateLine getDeletedLineWithLowestOriginality(RepoConfiguration config, String filePath,
String lineContent, String commitHash) {
String gitLogResults = GitLog.getParentCommits(config.getRepoRoot(), commitHash);
String[] parentCommits = gitLogResults.split(" ");

CandidateLine lowestOriginalityLine = null;

String gitLogCacheKey = config.getRepoRoot() + commitHash;
String[] parentCommits;
if (GIT_LOG_CACHE.containsKey(gitLogCacheKey)) {
parentCommits = GIT_LOG_CACHE.get(gitLogCacheKey);
} else {
String gitLogResults = GitLog.getParentCommits(config.getRepoRoot(), commitHash);
parentCommits = gitLogResults.split(" ");
GIT_LOG_CACHE.put(gitLogCacheKey, parentCommits);
}

for (String parentCommit : parentCommits) {
// Generate diff between commit and parent commit
String gitDiffResult = GitDiff.diffCommits(config.getRepoRoot(), parentCommit, commitHash);
String[] fileDiffResultList = gitDiffResult.split(DIFF_FILE_CHUNK_SEPARATOR);
String gitDiffCacheKey = config.getRepoRoot() + parentCommit + commitHash;
ArrayList<String> fileDiffResultList;
ArrayList<String> preImageFilePathList;
ArrayList<String> postImageFilePathList;

if (GIT_DIFF_CACHE.containsKey(gitDiffCacheKey)) {
ArrayList<ArrayList<String>> cacheResults = GIT_DIFF_CACHE.get(gitDiffCacheKey);
fileDiffResultList = cacheResults.get(0);
preImageFilePathList = cacheResults.get(1);
postImageFilePathList = cacheResults.get(2);
} else {
fileDiffResultList = new ArrayList<>();
preImageFilePathList = new ArrayList<>();
postImageFilePathList = new ArrayList<>();

// Generate diff between commit and parent commit
String gitDiffResult = GitDiff.diffCommits(config.getRepoRoot(), parentCommit, commitHash);
String[] fileDiffResults = gitDiffResult.split(DIFF_FILE_CHUNK_SEPARATOR);

for (String fileDiffResult : fileDiffResults) {
Matcher filePathMatcher = FILE_CHANGED_PATTERN.matcher(fileDiffResult);
if (!filePathMatcher.find()) {
continue;
}

for (String fileDiffResult : fileDiffResultList) {
Matcher filePathMatcher = FILE_CHANGED_PATTERN.matcher(fileDiffResult);
if (!filePathMatcher.find()) {
continue;
// If file was added in the commit
String preImageFilePath = filePathMatcher.group(PRE_IMAGE_FILE_PATH_GROUP_NAME);
if (preImageFilePath.equals(FILE_ADDED_SYMBOL)) {
continue;
}

String postImageFilePath = filePathMatcher.group(POST_IMAGE_FILE_PATH_GROUP_NAME);

fileDiffResultList.add(fileDiffResult);
preImageFilePathList.add(preImageFilePath);
postImageFilePathList.add(postImageFilePath);
}

String preImageFilePath = filePathMatcher.group(PRE_IMAGE_FILE_PATH_GROUP_NAME);
String postImageFilePath = filePathMatcher.group(POST_IMAGE_FILE_PATH_GROUP_NAME);
ArrayList<ArrayList<String>> cacheResults = new ArrayList<>();
cacheResults.add(fileDiffResultList);
cacheResults.add(preImageFilePathList);
cacheResults.add(postImageFilePathList);

GIT_DIFF_CACHE.put(gitDiffCacheKey, cacheResults);
}

// If file was added in the commit or file name does not match
if (preImageFilePath.equals(FILE_ADDED_SYMBOL) || !postImageFilePath.equals(filePath)) {
for (int i = 0; i < fileDiffResultList.size(); i++) {
// If file name does not match
if (!postImageFilePathList.get(i).equals(filePath)) {
continue;
}

CandidateLine candidateLine = getDeletedLineWithLowestOriginalityInDiff(
fileDiffResult, lineContent, parentCommit, preImageFilePath);
fileDiffResultList.get(i), lineContent, parentCommit, preImageFilePathList.get(i));
if (candidateLine == null) {
continue;
}
Expand Down Expand Up @@ -152,7 +199,11 @@ private static CandidateLine getDeletedLineWithLowestOriginalityInDiff(String fi

if (lineChanged.startsWith(DELETED_LINE_SYMBOL)) {
String deletedLineContent = lineChanged.substring(DELETED_LINE_SYMBOL.length());
double originalityScore = computeOriginalityScore(lineContent, deletedLineContent);
double lowestOriginalityScore = lowestOriginalityLine == null
? Integer.MAX_VALUE
: lowestOriginalityLine.getOriginalityScore();
double originalityScore = computeOriginalityScore(lineContent, deletedLineContent,
lowestOriginalityScore);

if (lowestOriginalityLine == null
|| originalityScore < lowestOriginalityLine.getOriginalityScore()) {
Expand Down Expand Up @@ -192,8 +243,8 @@ private static int getPreImageStartingLineNumber(String linesChangedHeader) {
/**
* Calculates the originality score of {@code s} with {@code baseString}.
*/
private static double computeOriginalityScore(String s, String baseString) {
double levenshteinDistance = StringsUtil.getLevenshteinDistance(s, baseString);
private static double computeOriginalityScore(String s, String baseString, double limit) {
double levenshteinDistance = StringsUtil.getLevenshteinDistance(s, baseString, limit * baseString.length());
return levenshteinDistance / baseString.length();
}

Expand Down
67 changes: 52 additions & 15 deletions src/main/java/reposense/util/StringsUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,35 +94,72 @@ public static boolean isNumeric(String string) {

/**
* Calculates the Levenshtein Distance between two strings using Dynamic Programming.
* Insertion, deletion, and substitution are all of cost 1.
* This version improves the space complexity down to O(min(s, t))
* <p></p>
* The dp will stop if the {@code limit} is reached, this means that if the final distance is 7 and the limit is set
* to 3, the algorithm ends early once it reaches 3. This is possible as we are using this method to find the string
* with the lowest Levenshtein distance.
* <p></p>
* Returns {@code Integer.MAX_VALUE} if limit is reached, else returns the computed Levenshtein distance.
*/
public static int getLevenshteinDistance(String s, String t) {
// dp[i][j] stores the distance between s.substring(0, i) and t.substring(0, j) -> distance(s[:i], t[:j])
int[][] dp = new int[s.length() + 1][t.length() + 1];
public static int getLevenshteinDistance(String s, String t, double limit) {
// Early termination if either string is empty, lev dist is just the length of the other string.
if (s.isEmpty()) {
return t.length();
}

if (t.isEmpty()) {
return s.length();
}

// Distance between a string and an empty string is the length of the string
for (int i = 0; i <= s.length(); i++) {
dp[i][0] = i;
// The final lev dist is at least k where k = difference in length = number of insert/delete.
if (Math.abs(s.length() - t.length()) >= limit) {
return Integer.MAX_VALUE;
}

if (s.length() < t.length()) {
// Swap s and t to ensure s is always the longer string
String temp = s;
s = t;
t = temp;
}

int[] dp = new int[t.length() + 1];
for (int i = 0; i <= t.length(); i++) {
dp[0][i] = i;
dp[i] = i;
}

for (int i = 1; i <= s.length(); i++) {
// Store the value of the previous row's column
int prev = dp[0];
dp[0] = i;

// If for this row, all the values are at least k, then the final lev dist computed will also be at least k.
// hasLower will check for values smaller than the limit, and terminate early if limit is reached.
boolean hasLower = false;

for (int j = 1; j <= t.length(); j++) {
// If s[i-1] and t[j-1] are equal, distance(s[:i], t[:j]) equals to distance(s[:i-1], t[:j-1])
int temp = dp[j];

if (s.charAt(i - 1) == t.charAt(j - 1)) {
dp[i][j] = dp[i - 1][j - 1];
dp[j] = prev;
} else {
// distance(s[:i], t[:j]) is the minimum of:
// 1) distance(s[:i-1], t[:j]) + 1 -> add s[i]
// 2) distance(s[:i], t[:j-1]) + 1 -> add t[j]
// 3) distance(s[:i-1], t[:j-1]) + 1 -> substitute s[i] with t[j]
dp[i][j] = Math.min(dp[i - 1][j], Math.min(dp[i][j - 1], dp[i - 1][j - 1])) + 1;
dp[j] = Math.min(prev, Math.min(dp[j - 1], dp[j])) + 1;
}

prev = temp;

if (dp[j] < limit) {
hasLower = true;
}
}

if (!hasLower) {
return Integer.MAX_VALUE;
}
}

return dp[s.length()][t.length()];
return dp[t.length()];
}
}
30 changes: 23 additions & 7 deletions src/test/java/reposense/util/StringsUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -91,36 +91,52 @@ public void addQuotesForFilePath_specialBashCharacters_success() {

@Test
public void getLevenshteinDistance_success() {
Assertions.assertEquals(4, StringsUtil.getLevenshteinDistance("potato", "tomatoes"));
Assertions.assertEquals(4, StringsUtil.getLevenshteinDistance("potato", "tomatoes", Integer.MAX_VALUE));
}

@Test
public void getLevenshteinDistance_insertion_success() {
Assertions.assertEquals(2, StringsUtil.getLevenshteinDistance("abcd", "abcdef"));
Assertions.assertEquals(2, StringsUtil.getLevenshteinDistance("abcd", "abcdef", Integer.MAX_VALUE));
}

@Test
public void getLevenshteinDistance_deletion_success() {
Assertions.assertEquals(3, StringsUtil.getLevenshteinDistance("abcde", "ab"));
Assertions.assertEquals(3, StringsUtil.getLevenshteinDistance("abcde", "ab", Integer.MAX_VALUE));
}

@Test
public void getLevenshteinDistance_substitution_success() {
Assertions.assertEquals(4, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg"));
Assertions.assertEquals(4, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg", Integer.MAX_VALUE));
}

@Test
public void getLevenshteinDistance_identicalStrings_success() {
Assertions.assertEquals(0, StringsUtil.getLevenshteinDistance("abcdefg", "abcdefg"));
Assertions.assertEquals(0, StringsUtil.getLevenshteinDistance("abcdefg", "abcdefg", Integer.MAX_VALUE));
}

@Test
public void getLevenshteinDistance_emptyStrings_success() {
Assertions.assertEquals(0, StringsUtil.getLevenshteinDistance("", ""));
Assertions.assertEquals(0, StringsUtil.getLevenshteinDistance("", "", Integer.MAX_VALUE));
}

@Test
public void getLevenshteinDistance_emptyString_success() {
Assertions.assertEquals(6, StringsUtil.getLevenshteinDistance("abcdef", ""));
Assertions.assertEquals(6, StringsUtil.getLevenshteinDistance("abcdef", "", Integer.MAX_VALUE));
}

@Test
public void getLevenshteinDistance_belowLimit_success() {
Assertions.assertEquals(4, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg", 4.0001));
Assertions.assertEquals(4, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg", 123.456));
Assertions.assertEquals(4, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg", Integer.MAX_VALUE));
}

@Test
public void getLevenshteinDistance_exceedLimit_success() {
Assertions.assertEquals(Integer.MAX_VALUE, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg", 4.000));
Assertions.assertEquals(Integer.MAX_VALUE, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg", 3.99999));
Assertions.assertEquals(Integer.MAX_VALUE, StringsUtil.getLevenshteinDistance("xxxxefg", "abcdefg", 0.89014));

Assertions.assertEquals(Integer.MAX_VALUE, StringsUtil.getLevenshteinDistance("a", "1234567", 2.0));
}
}

0 comments on commit fdd2dae

Please sign in to comment.