Skip to content

Commit

Permalink
[#944] Add originality threshold flag (#2122)
Browse files Browse the repository at this point in the history
The existing setup employs a static originality threshold of 0.51.
However, this threshold is tailored for languages such as Java or Markdown,
and might not be suitable for other programming languages. Additionally,
it doesn't offer flexibility for users who may want a stricter threshold
but are willing to endure longer processing times, or those who prefer a
more lenient threshold but prioritize faster analysis speeds.

Let's enable users to input their preferred originality threshold.
  • Loading branch information
SkyBlaise99 authored Mar 3, 2024
1 parent e8cb72a commit 79209a3
Show file tree
Hide file tree
Showing 10 changed files with 130 additions and 33 deletions.
2 changes: 1 addition & 1 deletion src/main/java/reposense/RepoSense.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ public static void main(String[] args) {
cliArguments.isSinceDateProvided(), cliArguments.isUntilDateProvided(),
cliArguments.getNumCloningThreads(), cliArguments.getNumAnalysisThreads(),
TimeUtil::getElapsedTime, cliArguments.getZoneId(), cliArguments.isFreshClonePerformed(),
cliArguments.isAuthorshipAnalyzed());
cliArguments.isAuthorshipAnalyzed(), cliArguments.getOriginalityThreshold());

FileUtil.zipFoldersAndFiles(reportFoldersAndFiles, cliArguments.getOutputFilePath().toAbsolutePath(),
".json");
Expand Down
9 changes: 6 additions & 3 deletions src/main/java/reposense/authorship/AuthorshipReporter.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@ public class AuthorshipReporter {

/**
* Generates and returns the authorship summary for each repo in {@code config}.
* Further analyzes the authorship of each line in the commit if {@code shouldAnalyzeAuthorship} is true.
* Further analyzes the authorship of each line in the commit if {@code shouldAnalyzeAuthorship} is true, based on
* {@code originalityThreshold}.
*/
public AuthorshipSummary generateAuthorshipSummary(RepoConfiguration config, boolean shouldAnalyzeAuthorship) {
public AuthorshipSummary generateAuthorshipSummary(RepoConfiguration config, boolean shouldAnalyzeAuthorship,
double originalityThreshold) {
List<FileInfo> textFileInfos = fileInfoExtractor.extractTextFileInfos(config);

int numFiles = textFileInfos.size();
Expand All @@ -45,7 +47,8 @@ public AuthorshipSummary generateAuthorshipSummary(RepoConfiguration config, boo
}

List<FileResult> fileResults = textFileInfos.stream()
.map(fileInfo -> fileInfoAnalyzer.analyzeTextFile(config, fileInfo, shouldAnalyzeAuthorship))
.map(fileInfo -> fileInfoAnalyzer.analyzeTextFile(config, fileInfo, shouldAnalyzeAuthorship,
originalityThreshold))
.filter(Objects::nonNull)
.collect(Collectors.toList());

Expand Down
19 changes: 12 additions & 7 deletions src/main/java/reposense/authorship/FileInfoAnalyzer.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package reposense.authorship;

import static reposense.parser.ArgsParser.DEFAULT_ORIGINALITY_THRESHOLD;

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
Expand Down Expand Up @@ -45,11 +47,13 @@ public class FileInfoAnalyzer {
/**
* Analyzes the lines of the file, given in the {@code fileInfo}, that has changed in the time period provided
* by {@code config}.
* Further analyzes the authorship of each line in the commit if {@code shouldAnalyzeAuthorship} is true.
* Further analyzes the authorship of each line in the commit if {@code shouldAnalyzeAuthorship} is true, based on
* {@code originalityThreshold}.
* Returns null if the file is missing from the local system, or none of the
* {@link Author} specified in {@code config} contributed to the file in {@code fileInfo}.
*/
public FileResult analyzeTextFile(RepoConfiguration config, FileInfo fileInfo, boolean shouldAnalyzeAuthorship) {
public FileResult analyzeTextFile(RepoConfiguration config, FileInfo fileInfo, boolean shouldAnalyzeAuthorship,
double originalityThreshold) {
String relativePath = fileInfo.getPath();

if (Files.notExists(Paths.get(config.getRepoRoot(), relativePath))) {
Expand All @@ -61,7 +65,7 @@ public FileResult analyzeTextFile(RepoConfiguration config, FileInfo fileInfo, b
return null;
}

aggregateBlameAuthorModifiedAndDateInfo(config, fileInfo, shouldAnalyzeAuthorship);
aggregateBlameAuthorModifiedAndDateInfo(config, fileInfo, shouldAnalyzeAuthorship, originalityThreshold);
fileInfo.setFileType(config.getFileType(fileInfo.getPath()));

AnnotatorAnalyzer.aggregateAnnotationAuthorInfo(fileInfo, config.getAuthorConfig(), shouldAnalyzeAuthorship);
Expand All @@ -83,7 +87,7 @@ public FileResult analyzeTextFile(RepoConfiguration config, FileInfo fileInfo, b
* {@link Author} specified in {@code config} contributed to the file in {@code fileInfo}.
*/
public FileResult analyzeTextFile(RepoConfiguration config, FileInfo fileInfo) {
return analyzeTextFile(config, fileInfo, false);
return analyzeTextFile(config, fileInfo, false, DEFAULT_ORIGINALITY_THRESHOLD);
}

/**
Expand Down Expand Up @@ -153,10 +157,11 @@ private FileResult generateBinaryFileResult(RepoConfiguration config, FileInfo f
* The {@code config} is used to obtain the root directory for running git blame as well as other parameters used
* in determining which author to assign to each line and whether to set the last modified date for a
* {@code lineInfo}.
* Further analyzes the authorship of each line in the commit if {@code shouldAnalyzeAuthorship} is true.
* Further analyzes the authorship of each line in the commit if {@code shouldAnalyzeAuthorship} is true, based on
* {@code originalityThreshold}.
*/
private void aggregateBlameAuthorModifiedAndDateInfo(RepoConfiguration config, FileInfo fileInfo,
boolean shouldAnalyzeAuthorship) {
boolean shouldAnalyzeAuthorship, double originalityThreshold) {
String blameResults;

if (!config.isFindingPreviousAuthorsPerformed()) {
Expand Down Expand Up @@ -199,7 +204,7 @@ private void aggregateBlameAuthorModifiedAndDateInfo(RepoConfiguration config, F
if (shouldAnalyzeAuthorship && !author.equals(Author.UNKNOWN_AUTHOR)) {
String lineContent = fileInfo.getLine(lineCount / 5 + 1).getContent();
boolean isFullCredit = AuthorshipAnalyzer.analyzeAuthorship(config, fileInfo.getPath(), lineContent,
commitHash, author);
commitHash, author, originalityThreshold);
fileInfo.setIsFullCredit(lineCount / 5, isFullCredit);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@
*/
public class AuthorshipAnalyzer {
private static final Logger logger = LogsManager.getLogger(AuthorshipAnalyzer.class);

private static final double ORIGINALITY_THRESHOLD = 0.51;

private static final String DIFF_FILE_CHUNK_SEPARATOR = "\ndiff --git a/.*\n";
private static final Pattern FILE_CHANGED_PATTERN =
Pattern.compile("\n(-){3} a?/(?<preImageFilePath>.*)\n(\\+){3} b?/(?<postImageFilePath>.*)\n");
Expand All @@ -45,11 +42,11 @@ public class AuthorshipAnalyzer {
private static final String DELETED_LINE_SYMBOL = "-";

/**
* Analyzes the authorship of {@code lineContent} in {@code filePath}.
* Analyzes the authorship of {@code lineContent} in {@code filePath} based on {@code originalityThreshold}.
* Returns {@code true} if {@code currentAuthor} should be assigned full credit, {@code false} otherwise.
*/
public static boolean analyzeAuthorship(RepoConfiguration config, String filePath, String lineContent,
String commitHash, Author currentAuthor) {
String commitHash, Author currentAuthor, double originalityThreshold) {
// Empty lines are ignored and given full credit
if (lineContent.isEmpty()) {
return true;
Expand All @@ -58,7 +55,7 @@ public static boolean analyzeAuthorship(RepoConfiguration config, String filePat
CandidateLine deletedLine = getDeletedLineWithLowestOriginality(config, filePath, lineContent, commitHash);

// Give full credit if there are no deleted lines found or deleted line is more than originality threshold
if (deletedLine == null || deletedLine.getOriginalityScore() > ORIGINALITY_THRESHOLD) {
if (deletedLine == null || deletedLine.getOriginalityScore() > originalityThreshold) {
return true;
}

Expand All @@ -80,7 +77,7 @@ public static boolean analyzeAuthorship(RepoConfiguration config, String filePat

// Check the previous version as currentAuthor is the same as author of the previous version
return analyzeAuthorship(config, deletedLine.getFilePath(), deletedLine.getLineContent(),
deletedLineInfo.getCommitHash(), deletedLineInfo.getAuthor());
deletedLineInfo.getCommitHash(), deletedLineInfo.getAuthor(), originalityThreshold);
}

/**
Expand Down
20 changes: 19 additions & 1 deletion src/main/java/reposense/model/CliArguments.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ public class CliArguments {
private final ZoneId zoneId;
private final boolean isFindingPreviousAuthorsPerformed;
private final boolean isAuthorshipAnalyzed;
private final double originalityThreshold;

private boolean isTestMode = ArgsParser.DEFAULT_IS_TEST_MODE;
private boolean isFreshClonePerformed = ArgsParser.DEFAULT_SHOULD_FRESH_CLONE;
Expand Down Expand Up @@ -81,6 +82,7 @@ private CliArguments(Builder builder) {
this.reportConfigFilePath = builder.reportConfigFilePath;
this.reportConfiguration = builder.reportConfiguration;
this.isAuthorshipAnalyzed = builder.isAuthorshipAnalyzed;
this.originalityThreshold = builder.originalityThreshold;
}

public ZoneId getZoneId() {
Expand Down Expand Up @@ -195,6 +197,10 @@ public boolean isAuthorshipAnalyzed() {
return isAuthorshipAnalyzed;
}

/**
* Returns the originality threshold held by this {@code CliArguments}.
*/
public double getOriginalityThreshold() {
return originalityThreshold;
}

@Override
public boolean equals(Object other) {
// short circuit if same object
Expand Down Expand Up @@ -233,7 +239,8 @@ public boolean equals(Object other) {
&& Objects.equals(this.authorConfigFilePath, otherCliArguments.authorConfigFilePath)
&& Objects.equals(this.groupConfigFilePath, otherCliArguments.groupConfigFilePath)
&& Objects.equals(this.reportConfigFilePath, otherCliArguments.reportConfigFilePath)
&& this.isAuthorshipAnalyzed == otherCliArguments.isAuthorshipAnalyzed;
&& this.isAuthorshipAnalyzed == otherCliArguments.isAuthorshipAnalyzed
&& Objects.equals(this.originalityThreshold, otherCliArguments.originalityThreshold);
}

/**
Expand Down Expand Up @@ -268,6 +275,7 @@ public static final class Builder {
private Path reportConfigFilePath;
private ReportConfiguration reportConfiguration;
private boolean isAuthorshipAnalyzed;
private double originalityThreshold;

public Builder() {
}
Expand Down Expand Up @@ -520,6 +528,16 @@ public Builder isAuthorshipAnalyzed(boolean isAuthorshipAnalyzed) {
return this;
}

/**
* Adds the {@code originalityThreshold} to CliArguments.
*
* @param originalityThreshold the originality threshold.
* @return this {@code Builder}, so that further builder calls can be chained.
*/
public Builder originalityThreshold(double originalityThreshold) {
this.originalityThreshold = originalityThreshold;
return this;
}

/**
* Builds CliArguments.
*
Expand Down
13 changes: 12 additions & 1 deletion src/main/java/reposense/parser/ArgsParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ public class ArgsParser {
public static final int DEFAULT_NUM_ANALYSIS_THREADS = Runtime.getRuntime().availableProcessors();
public static final boolean DEFAULT_IS_TEST_MODE = false;
public static final boolean DEFAULT_SHOULD_FRESH_CLONE = false;
public static final double DEFAULT_ORIGINALITY_THRESHOLD = 0.51;

public static final String[] HELP_FLAGS = new String[] {"--help", "-h"};
public static final String[] CONFIG_FLAGS = new String[] {"--config", "-c"};
Expand All @@ -63,6 +64,7 @@ public class ArgsParser {
public static final String[] TEST_MODE_FLAG = new String[] {"--test-mode"};
public static final String[] FRESH_CLONING_FLAG = new String[] {"--fresh-cloning"};
public static final String[] ANALYZE_AUTHORSHIP_FLAGS = new String[] {"--analyze-authorship", "-A"};
public static final String[] ORIGINALITY_THRESHOLD_FLAGS = new String[] {"--originality-threshold", "-ot"};

private static final Logger logger = LogsManager.getLogger(ArgsParser.class);

Expand Down Expand Up @@ -201,6 +203,13 @@ private static ArgumentParser getArgumentParser() {
.action(Arguments.storeTrue())
.help("A flag to perform analysis of code authorship.");

parser.addArgument(ORIGINALITY_THRESHOLD_FLAGS)
.dest(ORIGINALITY_THRESHOLD_FLAGS[0])
.metavar("(0.0 ~ 1.0)")
.type(new OriginalityThresholdArgumentType())
.setDefault(DEFAULT_ORIGINALITY_THRESHOLD)
.help("The originality threshold for analysis of code authorship.");

// Mutex flags - these will always be the last parameters in help message.
mutexParser.addArgument(CONFIG_FLAGS)
.dest(CONFIG_FLAGS[0])
Expand Down Expand Up @@ -280,6 +289,7 @@ public static CliArguments parse(String[] args) throws HelpScreenException, Pars
boolean shouldFindPreviousAuthors = results.get(FIND_PREVIOUS_AUTHORS_FLAGS[0]);
boolean isTestMode = results.get(TEST_MODE_FLAG[0]);
boolean isAuthorshipAnalyzed = results.get(ANALYZE_AUTHORSHIP_FLAGS[0]);
double originalityThreshold = results.get(ORIGINALITY_THRESHOLD_FLAGS[0]);
int numCloningThreads = results.get(CLONING_THREADS_FLAG[0]);
int numAnalysisThreads = results.get(ANALYSIS_THREADS_FLAG[0]);

Expand All @@ -299,7 +309,8 @@ public static CliArguments parse(String[] args) throws HelpScreenException, Pars
.numCloningThreads(numCloningThreads)
.numAnalysisThreads(numAnalysisThreads)
.isTestMode(isTestMode)
.isAuthorshipAnalyzed(isAuthorshipAnalyzed);
.isAuthorshipAnalyzed(isAuthorshipAnalyzed)
.originalityThreshold(originalityThreshold);

LogsManager.setLogFolderLocation(outputFolderPath);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package reposense.parser;

import net.sourceforge.argparse4j.inf.Argument;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.ArgumentParserException;
import net.sourceforge.argparse4j.inf.ArgumentType;

/**
 * Verifies and parses a string-formatted double, between 0.0 and 1.0 (inclusive), to a {@link Double} object.
 */
public class OriginalityThresholdArgumentType implements ArgumentType<Double> {
    private static final String PARSE_EXCEPTION_MESSAGE_THRESHOLD_OUT_OF_BOUND =
            "Invalid threshold. It must be a number between 0.0 and 1.0.";

    /**
     * Converts {@code value} into an originality threshold.
     *
     * @param parser the parser that is converting the argument; attached to any exception thrown.
     * @param arg the argument being converted (unused).
     * @param value the raw command line string to convert.
     * @return the parsed threshold, guaranteed to lie within 0.0 and 1.0 inclusive.
     * @throws ArgumentParserException if {@code value} is not a parsable number, or the parsed number
     *     falls outside the range of 0.0 to 1.0.
     */
    @Override
    public Double convert(ArgumentParser parser, Argument arg, String value) throws ArgumentParserException {
        double threshold;

        try {
            threshold = Double.parseDouble(value);
        } catch (NumberFormatException nfe) {
            // Report malformed input through the declared checked exception instead of letting an
            // unchecked NumberFormatException escape from the argument parser.
            throw new ArgumentParserException(PARSE_EXCEPTION_MESSAGE_THRESHOLD_OUT_OF_BOUND, nfe, parser);
        }

        // Double.compare orders NaN above every other value, so NaN is rejected by the upper bound check.
        if (Double.compare(threshold, 0.0) < 0 || Double.compare(threshold, 1.0) > 0) {
            throw new ArgumentParserException(PARSE_EXCEPTION_MESSAGE_THRESHOLD_OUT_OF_BOUND, parser);
        }

        return threshold;
    }
}
Loading

0 comments on commit 79209a3

Please sign in to comment.