diff --git a/cli/src/main/java/de/jplag/cli/CLI.java b/cli/src/main/java/de/jplag/cli/CLI.java index 7182d3a98..e67c445c9 100644 --- a/cli/src/main/java/de/jplag/cli/CLI.java +++ b/cli/src/main/java/de/jplag/cli/CLI.java @@ -22,6 +22,7 @@ import de.jplag.clustering.ClusteringOptions; import de.jplag.clustering.Preprocessing; import de.jplag.exceptions.ExitException; +import de.jplag.merging.MergingParameters; import de.jplag.options.JPlagOptions; import de.jplag.options.LanguageOption; import de.jplag.options.LanguageOptions; @@ -157,11 +158,12 @@ public JPlagOptions buildOptionsFromArguments(ParseResult parseResult) throws Cl } ClusteringOptions clusteringOptions = getClusteringOptions(this.options); + MergingParameters mergingParameters = getMergingParameters(this.options); JPlagOptions jPlagOptions = new JPlagOptions(loadLanguage(parseResult), this.options.minTokenMatch, submissionDirectories, oldSubmissionDirectories, null, this.options.advanced.subdirectory, suffixes, this.options.advanced.exclusionFileName, JPlagOptions.DEFAULT_SIMILARITY_METRIC, this.options.advanced.similarityThreshold, this.options.shownComparisons, clusteringOptions, - this.options.advanced.debug); + this.options.advanced.debug, mergingParameters); String baseCodePath = this.options.baseCode; File baseCodeDirectory = baseCodePath == null ? 
null : new File(baseCodePath); @@ -220,6 +222,10 @@ private static ClusteringOptions getClusteringOptions(CliOptions options) { return clusteringOptions; } + private static MergingParameters getMergingParameters(CliOptions options) { + return new MergingParameters(options.merging.enabled, options.merging.mergeBuffer, options.merging.seperatingThreshold); + } + private String generateDescription() { var randomDescription = DESCRIPTIONS[RANDOM.nextInt(DESCRIPTIONS.length)]; return String.format("JPlag - %s%n%n%s", randomDescription, CREDITS); diff --git a/cli/src/main/java/de/jplag/cli/CliOptions.java b/cli/src/main/java/de/jplag/cli/CliOptions.java index 7538b8847..957c4e5c6 100644 --- a/cli/src/main/java/de/jplag/cli/CliOptions.java +++ b/cli/src/main/java/de/jplag/cli/CliOptions.java @@ -59,6 +59,9 @@ public class CliOptions implements Runnable { @ArgGroup(validate = false, heading = "Clustering%n") public Clustering clustering = new Clustering(); + @ArgGroup(validate = false, heading = "Match Merging defense mechanism against obfuscation that merges neighboring matches based on these parameters:%n") + public Merging merging = new Merging(); + /** * Empty run method, so picocli prints help automatically */ @@ -88,7 +91,7 @@ public static class Advanced { } public static class Clustering { - @Option(names = {"--cluster-skip"}, description = "Skips the clustering (default: false)\n") + @Option(names = {"--cluster-skip"}, description = "Skips the clustering (default: false)%n") public boolean disable; @ArgGroup @@ -109,6 +112,20 @@ public static class ClusteringEnabled { } } + public static class Merging { + @Option(names = {"--match-merging"}, description = "Enables match merging (default: false)%n") + public boolean enabled; + + @Option(names = { + "--merge-buffer"}, description = "Defines how much lower the length of a match can be than the minimum match length (default: 0)%n") + public int mergeBuffer; + + @Option(names = { + "--seperating-threshold"}, 
description = "Defines how many token there can be between two neighboring matches (default: 0)%n") + public int seperatingThreshold; + + } + @Option(names = {"--cluster-spectral-bandwidth"}, hidden = true) public double clusterSpectralBandwidth = new ClusteringOptions().spectralKernelBandwidth(); diff --git a/core/src/main/java/de/jplag/GreedyStringTiling.java b/core/src/main/java/de/jplag/GreedyStringTiling.java index 0a1a6df91..f8172db0b 100644 --- a/core/src/main/java/de/jplag/GreedyStringTiling.java +++ b/core/src/main/java/de/jplag/GreedyStringTiling.java @@ -22,6 +22,7 @@ public class GreedyStringTiling { private final int minimumMatchLength; + private final int mergeBuffer; private ConcurrentMap tokenTypeValues; private final Map> baseCodeMarkings = new IdentityHashMap<>(); @@ -29,7 +30,8 @@ public class GreedyStringTiling { private final Map cachedHashLookupTables = new IdentityHashMap<>(); public GreedyStringTiling(JPlagOptions options) { - this.minimumMatchLength = options.minimumTokenMatch(); + this.mergeBuffer = options.mergingParameters().mergeBuffer(); + this.minimumMatchLength = Math.max(options.minimumTokenMatch() - this.mergeBuffer, 1); this.tokenTypeValues = new ConcurrentHashMap<>(); this.tokenTypeValues.put(SharedTokenType.FILE_END, 0); } @@ -98,7 +100,7 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri // comparison uses <= because it is assumed that the last token is a pivot (FILE_END) if (leftTokens.size() <= minimumMatchLength || rightTokens.size() <= minimumMatchLength) { - return new JPlagComparison(leftSubmission, rightSubmission, List.of()); + return new JPlagComparison(leftSubmission, rightSubmission, List.of(), List.of()); } boolean[] leftMarked = calculateInitiallyMarked(leftSubmission); @@ -109,6 +111,7 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri int maximumMatchLength; List globalMatches = new ArrayList<>(); + List ignoredMatches = new ArrayList<>(); do { 
maximumMatchLength = minimumMatchLength; List iterationMatches = new ArrayList<>(); @@ -138,7 +141,11 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri } } for (Match match : iterationMatches) { - addMatchIfNotOverlapping(globalMatches, match); + if (match.length() < minimumMatchLength + mergeBuffer) { + addMatchIfNotOverlapping(ignoredMatches, match); + } else { + addMatchIfNotOverlapping(globalMatches, match); + } int leftStartIndex = match.startOfFirst(); int rightStartIndex = match.startOfSecond(); for (int offset = 0; offset < match.length(); offset++) { @@ -147,7 +154,7 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri } } } while (maximumMatchLength != minimumMatchLength); - return new JPlagComparison(leftSubmission, rightSubmission, globalMatches); + return new JPlagComparison(leftSubmission, rightSubmission, globalMatches, ignoredMatches); } /** diff --git a/core/src/main/java/de/jplag/JPlag.java b/core/src/main/java/de/jplag/JPlag.java index 3011f8eb5..fe0990a4f 100644 --- a/core/src/main/java/de/jplag/JPlag.java +++ b/core/src/main/java/de/jplag/JPlag.java @@ -10,6 +10,7 @@ import de.jplag.clustering.ClusteringFactory; import de.jplag.exceptions.ExitException; import de.jplag.exceptions.SubmissionException; +import de.jplag.merging.MatchMerging; import de.jplag.options.JPlagOptions; import de.jplag.reporting.reportobject.model.Version; import de.jplag.strategy.ComparisonStrategy; @@ -71,6 +72,12 @@ public static JPlagResult run(JPlagOptions options) throws ExitException { // Compare valid submissions. 
JPlagResult result = comparisonStrategy.compareSubmissions(submissionSet); + + // Use Match Merging against obfuscation + if (options.mergingParameters().enabled()) { + result = new MatchMerging(options).mergeMatchesOf(result); + } + if (logger.isInfoEnabled()) logger.info("Total time for comparing submissions: {}", TimeUtil.formatDuration(result.getDuration())); result.setClusteringResult(ClusteringFactory.getClusterings(result.getAllComparisons(), options.clusteringOptions())); diff --git a/core/src/main/java/de/jplag/JPlagComparison.java b/core/src/main/java/de/jplag/JPlagComparison.java index 7f69d1b1d..37aa0c7ad 100644 --- a/core/src/main/java/de/jplag/JPlagComparison.java +++ b/core/src/main/java/de/jplag/JPlagComparison.java @@ -9,17 +9,18 @@ * @param secondSubmission is the second of the two submissions. * @param matches is the unmodifiable list of all matches between the two submissions. */ -public record JPlagComparison(Submission firstSubmission, Submission secondSubmission, List matches) { +public record JPlagComparison(Submission firstSubmission, Submission secondSubmission, List matches, List ignoredMatches) { /** * Initializes a new comparison. * @param firstSubmission is the first of the two submissions. * @param secondSubmission is the second of the two submissions. * @param matches is the list of all matches between the two submissions. 
*/ - public JPlagComparison(Submission firstSubmission, Submission secondSubmission, List matches) { + public JPlagComparison(Submission firstSubmission, Submission secondSubmission, List matches, List ignoredMatches) { this.firstSubmission = firstSubmission; this.secondSubmission = secondSubmission; this.matches = Collections.unmodifiableList(matches); + this.ignoredMatches = Collections.unmodifiableList(ignoredMatches); } /** diff --git a/core/src/main/java/de/jplag/Submission.java b/core/src/main/java/de/jplag/Submission.java index 34ff5dbb7..f47db6d60 100644 --- a/core/src/main/java/de/jplag/Submission.java +++ b/core/src/main/java/de/jplag/Submission.java @@ -294,4 +294,14 @@ private List getOrder(List tokenList) { } return order; } + + /** + * @return Submission containing shallow copies of its fields. + */ + public Submission copy() { + Submission copy = new Submission(name, submissionRootFile, isNew, files, language); + copy.setTokenList(new ArrayList<>(tokenList)); + copy.setBaseCodeComparison(baseCodeComparison); + return copy; + } } diff --git a/core/src/main/java/de/jplag/merging/MatchMerging.java b/core/src/main/java/de/jplag/merging/MatchMerging.java new file mode 100644 index 000000000..13493ff87 --- /dev/null +++ b/core/src/main/java/de/jplag/merging/MatchMerging.java @@ -0,0 +1,203 @@ +package de.jplag.merging; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import de.jplag.JPlagComparison; +import de.jplag.JPlagResult; +import de.jplag.Match; +import de.jplag.SharedTokenType; +import de.jplag.Submission; +import de.jplag.Token; +import de.jplag.options.JPlagOptions; + +/** + * This class implements a match merging algorithm which serves as defense mechanism against obfuscation attacks. Based + * on configurable parameters MergeBuffer and SeperatingThreshold, it alters prior results from pairwise submission + * comparisons and merges all neighboring matches that fit the specified thresholds. 
Submissions are referred to as left + * and right and neighboring matches as upper and lower. When neighboring matches get merged they become one and the + * tokens separating them get removed from the submission clone. MergeBuffer describes how much shorter a match can be than + * the Minimum Token Match. SeperatingThreshold describes how many tokens can be between two neighboring matches. Both + * are set in {@link JPlagOptions} as {@link MergingParameters} and default to 0 (which deactivates merging). + */ +public class MatchMerging { + private JPlagOptions options; + + /** + * Instantiates the match merging algorithm for a set of specific options. + * @param options encapsulates the adjustable options + */ + public MatchMerging(JPlagOptions options) { + this.options = options; + } + + /** + * Runs the internal match merging pipeline. It computes neighboring matches, merges them based on + * {@link MergingParameters} and removes remaining too short matches afterwards.
+ * @param result is the initially computed result object; merging works on copies of its submissions and match lists + * @return a new JPlagResult containing the merged matches, with the time spent merging added to the duration + */ + public JPlagResult mergeMatchesOf(JPlagResult result) { + long timeBeforeStartInMillis = System.currentTimeMillis(); + + List comparisons = new ArrayList<>(result.getAllComparisons()); + List comparisonsMerged = new ArrayList<>(); + + for (JPlagComparison comparison : comparisons) { + Submission leftSubmission = comparison.firstSubmission().copy(); + Submission rightSubmission = comparison.secondSubmission().copy(); + List globalMatches = new ArrayList<>(comparison.matches()); + globalMatches.addAll(comparison.ignoredMatches()); + globalMatches = removeTooShortMatches(mergeNeighbors(globalMatches, leftSubmission, rightSubmission)); + comparisonsMerged.add(new JPlagComparison(leftSubmission, rightSubmission, globalMatches, new ArrayList<>())); + } + + long durationInMillis = System.currentTimeMillis() - timeBeforeStartInMillis; + return new JPlagResult(comparisonsMerged, result.getSubmissions(), result.getDuration() + durationInMillis, options); + } + + /** + * Computes neighbors by sorting based on order of matches in the left and right submissions and then checking which are + * next to each other in both.
+ * @param globalMatches is the list of matches to search for neighbors; it is copied for sorting and not modified + * @return neighbors containing a list of pairs of neighboring matches + */ + private List computeNeighbors(List globalMatches) { + List neighbors = new ArrayList<>(); + List sortedByLeft = new ArrayList<>(globalMatches); + Collections.sort(sortedByLeft, (match1, match2) -> match1.startOfFirst() - match2.startOfFirst()); + List sortedByRight = new ArrayList<>(globalMatches); + Collections.sort(sortedByRight, (match1, match2) -> match1.startOfSecond() - match2.startOfSecond()); + for (int i = 0; i < sortedByLeft.size() - 1; i++) { + if (sortedByRight.indexOf(sortedByLeft.get(i)) == (sortedByRight.indexOf(sortedByLeft.get(i + 1)) - 1)) { + neighbors.add(new Neighbor(sortedByLeft.get(i), sortedByLeft.get(i + 1))); + } + } + return neighbors; + } + + /** + * This function iterates through the neighboring matches and checks which fit the merging criteria. Those that do are + * merged and the original matches are removed. This is done until there are either no neighbors left, or none fit the + * criteria. + * @return globalMatches containing merged matches.
+ */ + private List mergeNeighbors(List globalMatches, Submission leftSubmission, Submission rightSubmission) { + int i = 0; + List neighbors = computeNeighbors(globalMatches); + + while (i < neighbors.size()) { + Match upperNeighbor = neighbors.get(i).upperMatch(); + Match lowerNeighbor = neighbors.get(i).lowerMatch(); + + int lengthUpper = upperNeighbor.length(); + int lengthLower = lowerNeighbor.length(); + int tokenBetweenLeft = lowerNeighbor.startOfFirst() - upperNeighbor.endOfFirst() - 1; + int tokensBetweenRight = lowerNeighbor.startOfSecond() - upperNeighbor.endOfSecond() - 1; + double averageTokensBetweenMatches = (tokenBetweenLeft + tokensBetweenRight) / 2.0; + // Checking length is not necessary as GST already checked length while computing matches + if (averageTokensBetweenMatches <= options.mergingParameters().seperatingThreshold() + && !mergeOverlapsFiles(leftSubmission, rightSubmission, upperNeighbor, tokenBetweenLeft, tokensBetweenRight)) { + globalMatches.remove(upperNeighbor); + globalMatches.remove(lowerNeighbor); + globalMatches.add(new Match(upperNeighbor.startOfFirst(), upperNeighbor.startOfSecond(), lengthUpper + lengthLower)); + globalMatches = removeToken(globalMatches, leftSubmission, rightSubmission, upperNeighbor, tokenBetweenLeft, tokensBetweenRight); + neighbors = computeNeighbors(globalMatches); + i = 0; + } else { + i++; + } + } + return globalMatches; + } + + /** + * This function checks if a merge would go over file boundaries. + * @param leftSubmission is the left submission + * @param rightSubmission is the right submission + * @param upperNeighbor is the upper neighboring match + * @param tokensBetweenLeft number of tokens that separate the neighboring matches in the left submission and need to be + * removed + * @param tokensBetweenRight number of tokens that separate the neighboring matches in the right submission and need to be + * removed + * @return true if the merge goes over file boundaries.
+ */ + private boolean mergeOverlapsFiles(Submission leftSubmission, Submission rightSubmission, Match upperNeighbor, int tokensBetweenLeft, + int tokensBetweenRight) { + if (leftSubmission.getFiles().size() == 1 && rightSubmission.getFiles().size() == 1) { + return false; + } + int startLeft = upperNeighbor.startOfFirst(); + int startRight = upperNeighbor.startOfSecond(); + int lengthUpper = upperNeighbor.length(); + + List tokenLeft = new ArrayList<>(leftSubmission.getTokenList()); + List tokenRight = new ArrayList<>(rightSubmission.getTokenList()); + tokenLeft = tokenLeft.subList(startLeft + lengthUpper, startLeft + lengthUpper + tokensBetweenLeft); + tokenRight = tokenRight.subList(startRight + lengthUpper, startRight + lengthUpper + tokensBetweenRight); + + return containsFileEndToken(tokenLeft) || containsFileEndToken(tokenRight); + } + + /** + * This function checks whether a list of tokens contains FILE_END. + * @param token is the list of tokens + * @return true if FILE_END is in token + */ + private boolean containsFileEndToken(List token) { + return token.stream().map(Token::getType).anyMatch(it -> it.equals(SharedTokenType.FILE_END)); + } + + /** + * This function removes tokens from both submissions after a merge has been performed. Additionally it moves the + * starting positions of matches that occur after the merged neighboring matches by the number of removed tokens. + * @param globalMatches is the list of matches whose starting positions are shifted + * @param leftSubmission is the left submission + * @param rightSubmission is the right submission + * @param upperNeighbor is the upper neighboring match + * @param tokensBetweenLeft number of tokens that separate the neighboring matches in the left submission and need to be + * removed + * @param tokensBetweenRight number of tokens that separate the neighboring matches in the right submission and need to be + * removed + * @return shiftedMatches with the mentioned changes.
+ */ + private List removeToken(List globalMatches, Submission leftSubmission, Submission rightSubmission, Match upperNeighbor, + int tokensBetweenLeft, int tokensBetweenRight) { + int startLeft = upperNeighbor.startOfFirst(); + int startRight = upperNeighbor.startOfSecond(); + int lengthUpper = upperNeighbor.length(); + + List tokenLeft = new ArrayList<>(leftSubmission.getTokenList()); + List tokenRight = new ArrayList<>(rightSubmission.getTokenList()); + tokenLeft.subList(startLeft + lengthUpper, startLeft + lengthUpper + tokensBetweenLeft).clear(); + tokenRight.subList(startRight + lengthUpper, startRight + lengthUpper + tokensBetweenRight).clear(); + leftSubmission.setTokenList(tokenLeft); + rightSubmission.setTokenList(tokenRight); + + List shiftedMatches = new ArrayList<>(); + for (Match match : globalMatches) { + int leftShift = match.startOfFirst() > startLeft ? tokensBetweenLeft : 0; + int rightShift = match.startOfSecond() > startRight ? tokensBetweenRight : 0; + Match alteredMatch = new Match(match.startOfFirst() - leftShift, match.startOfSecond() - rightShift, match.length()); + shiftedMatches.add(alteredMatch); + } + + return shiftedMatches; + } + + /** + * This method marks the end of the merging pipeline and removes the remaining too short matches from the given list. + * @param globalMatches is the list of matches to filter; it is modified in place and returned + */ + private List removeTooShortMatches(List globalMatches) { + List toRemove = new ArrayList<>(); + for (Match match : globalMatches) { + if (match.length() < options.minimumTokenMatch()) { + toRemove.add(match); + } + } + globalMatches.removeAll(toRemove); + return globalMatches; + } +} \ No newline at end of file diff --git a/core/src/main/java/de/jplag/merging/MergingParameters.java b/core/src/main/java/de/jplag/merging/MergingParameters.java new file mode 100644 index 000000000..201a71cd8 --- /dev/null +++ b/core/src/main/java/de/jplag/merging/MergingParameters.java @@ -0,0 +1,44 @@ +package de.jplag.merging; + +/** + * Collection of parameters that describe how a match
merging should be performed. + * @param mergeBuffer describes how much shorter a match can be than the Minimum Token Match (Defaults to 0). + * @param seperatingThreshold describes how many tokens can be between two neighboring matches (Defaults to 0). + */ +public record MergingParameters(boolean enabled, int mergeBuffer, int seperatingThreshold) { + + /** + * The default values of MergingParameters are false for the enable-switch and 0 for both mergeBuffer and + * seperatingThreshold. These completely deactivate MatchMerging. + */ + public MergingParameters() { + this(false, 0, 0); + } + + /** + * Builder pattern method for setting enabled + * @param enabled containing the new value + * @return MergingParameters with specified enabled + */ + public MergingParameters withEnabled(boolean enabled) { + return new MergingParameters(enabled, mergeBuffer, seperatingThreshold); + } + + /** + * Builder pattern method for setting mergeBuffer + * @param mergeBuffer containing the new value + * @return MergingParameters with specified mergeBuffer + */ + public MergingParameters withMergeBuffer(int mergeBuffer) { + return new MergingParameters(enabled, mergeBuffer, seperatingThreshold); + } + + /** + * Builder pattern method for setting seperatingThreshold + * @param seperatingThreshold containing the new value + * @return MergingParameters with specified seperatingThreshold + */ + public MergingParameters withSeperatingThreshold(int seperatingThreshold) { + return new MergingParameters(enabled, mergeBuffer, seperatingThreshold); + } +} diff --git a/core/src/main/java/de/jplag/merging/Neighbor.java b/core/src/main/java/de/jplag/merging/Neighbor.java new file mode 100644 index 000000000..a958744f9 --- /dev/null +++ b/core/src/main/java/de/jplag/merging/Neighbor.java @@ -0,0 +1,10 @@ +package de.jplag.merging; + +import de.jplag.Match; + +/** + * This class realizes a pair of neighboring matches, named upperMatch and lowerMatch. Two matches are considered + * neighbors, if they begin
directly after one another in the left submission and in the right submission + */ +public record Neighbor(Match upperMatch, Match lowerMatch) { +} \ No newline at end of file diff --git a/core/src/main/java/de/jplag/options/JPlagOptions.java b/core/src/main/java/de/jplag/options/JPlagOptions.java index 2fbb3be59..8da73ee4c 100644 --- a/core/src/main/java/de/jplag/options/JPlagOptions.java +++ b/core/src/main/java/de/jplag/options/JPlagOptions.java @@ -19,6 +19,7 @@ import de.jplag.Language; import de.jplag.clustering.ClusteringOptions; import de.jplag.exceptions.BasecodeException; +import de.jplag.merging.MergingParameters; import de.jplag.util.FileUtils; /** @@ -47,7 +48,7 @@ public record JPlagOptions(Language language, Integer minimumTokenMatch, Set submissionDirectories, Set oldSubmissionDirectories, File baseCodeSubmissionDirectory, String subdirectoryName, List fileSuffixes, String exclusionFileName, SimilarityMetric similarityMetric, double similarityThreshold, int maximumNumberOfComparisons, ClusteringOptions clusteringOptions, - boolean debugParser) { + boolean debugParser, MergingParameters mergingParameters) { public static final double DEFAULT_SIMILARITY_THRESHOLD = 0; public static final int DEFAULT_SHOWN_COMPARISONS = 100; @@ -60,13 +61,13 @@ public record JPlagOptions(Language language, Integer minimumTokenMatch, Set submissionDirectories, Set oldSubmissionDirectories) { this(language, null, submissionDirectories, oldSubmissionDirectories, null, null, null, null, DEFAULT_SIMILARITY_METRIC, - DEFAULT_SIMILARITY_THRESHOLD, DEFAULT_SHOWN_COMPARISONS, new ClusteringOptions(), false); + DEFAULT_SIMILARITY_THRESHOLD, DEFAULT_SHOWN_COMPARISONS, new ClusteringOptions(), false, new MergingParameters()); } public JPlagOptions(Language language, Integer minimumTokenMatch, Set submissionDirectories, Set oldSubmissionDirectories, File baseCodeSubmissionDirectory, String subdirectoryName, List fileSuffixes, String exclusionFileName, SimilarityMetric 
similarityMetric, double similarityThreshold, int maximumNumberOfComparisons, ClusteringOptions clusteringOptions, - boolean debugParser) { + boolean debugParser, MergingParameters mergingParameters) { this.language = language; this.debugParser = debugParser; this.fileSuffixes = fileSuffixes == null || fileSuffixes.isEmpty() ? null : Collections.unmodifiableList(fileSuffixes); @@ -80,84 +81,91 @@ public JPlagOptions(Language language, Integer minimumTokenMatch, Set subm this.baseCodeSubmissionDirectory = baseCodeSubmissionDirectory; this.subdirectoryName = subdirectoryName; this.clusteringOptions = clusteringOptions; + this.mergingParameters = mergingParameters; } public JPlagOptions withLanguageOption(Language language) { return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - clusteringOptions, debugParser); + clusteringOptions, debugParser, mergingParameters); } public JPlagOptions withDebugParser(boolean debugParser) { return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - clusteringOptions, debugParser); + clusteringOptions, debugParser, mergingParameters); } public JPlagOptions withFileSuffixes(List fileSuffixes) { return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - clusteringOptions, debugParser); + clusteringOptions, debugParser, mergingParameters); } public JPlagOptions withSimilarityThreshold(double similarityThreshold) { return new JPlagOptions(language, minimumTokenMatch, 
submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - clusteringOptions, debugParser); + clusteringOptions, debugParser, mergingParameters); } public JPlagOptions withMaximumNumberOfComparisons(int maximumNumberOfComparisons) { return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - clusteringOptions, debugParser); + clusteringOptions, debugParser, mergingParameters); } public JPlagOptions withSimilarityMetric(SimilarityMetric similarityMetric) { return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - clusteringOptions, debugParser); + clusteringOptions, debugParser, mergingParameters); } public JPlagOptions withMinimumTokenMatch(Integer minimumTokenMatch) { return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - clusteringOptions, debugParser); + clusteringOptions, debugParser, mergingParameters); } public JPlagOptions withExclusionFileName(String exclusionFileName) { return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - clusteringOptions, debugParser); + clusteringOptions, debugParser, mergingParameters); } public JPlagOptions 
withSubmissionDirectories(Set submissionDirectories) { return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - clusteringOptions, debugParser); + clusteringOptions, debugParser, mergingParameters); } public JPlagOptions withOldSubmissionDirectories(Set oldSubmissionDirectories) { return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - clusteringOptions, debugParser); + clusteringOptions, debugParser, mergingParameters); } public JPlagOptions withBaseCodeSubmissionDirectory(File baseCodeSubmissionDirectory) { return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - clusteringOptions, debugParser); + clusteringOptions, debugParser, mergingParameters); } public JPlagOptions withSubdirectoryName(String subdirectoryName) { return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - clusteringOptions, debugParser); + clusteringOptions, debugParser, mergingParameters); } public JPlagOptions withClusteringOptions(ClusteringOptions clusteringOptions) { return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - 
clusteringOptions, debugParser); + clusteringOptions, debugParser, mergingParameters); + } + + public JPlagOptions withMergingParameters(MergingParameters mergingParameters) { + return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory, + subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, + clusteringOptions, debugParser, mergingParameters); } public boolean hasBaseCode() { @@ -246,10 +254,10 @@ private Integer normalizeMinimumTokenMatch(Integer minimumTokenMatch) { public JPlagOptions(Language language, Integer minimumTokenMatch, File submissionDirectory, Set oldSubmissionDirectories, String baseCodeSubmissionName, String subdirectoryName, List fileSuffixes, String exclusionFileName, SimilarityMetric similarityMetric, double similarityThreshold, int maximumNumberOfComparisons, ClusteringOptions clusteringOptions, - boolean debugParser) throws BasecodeException { + boolean debugParser, MergingParameters mergingParameters) throws BasecodeException { this(language, minimumTokenMatch, Set.of(submissionDirectory), oldSubmissionDirectories, convertLegacyBaseCodeToFile(baseCodeSubmissionName, submissionDirectory), subdirectoryName, fileSuffixes, exclusionFileName, - similarityMetric, similarityThreshold, maximumNumberOfComparisons, clusteringOptions, debugParser); + similarityMetric, similarityThreshold, maximumNumberOfComparisons, clusteringOptions, debugParser, mergingParameters); } /** @@ -272,7 +280,7 @@ public JPlagOptions withBaseCodeSubmissionName(String baseCodeSubmissionName) { try { return new JPlagOptions(language, minimumTokenMatch, submissionDirectory, oldSubmissionDirectories, baseCodeSubmissionName, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - clusteringOptions, debugParser); + clusteringOptions, debugParser, mergingParameters); } catch 
(BasecodeException e) { throw new IllegalArgumentException(e.getMessage(), e.getCause()); } diff --git a/core/src/test/java/de/jplag/merging/MergingTest.java b/core/src/test/java/de/jplag/merging/MergingTest.java new file mode 100644 index 000000000..609dd24f3 --- /dev/null +++ b/core/src/test/java/de/jplag/merging/MergingTest.java @@ -0,0 +1,198 @@ +package de.jplag.merging; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.function.Function; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; + +import de.jplag.GreedyStringTiling; +import de.jplag.JPlagComparison; +import de.jplag.JPlagResult; +import de.jplag.Match; +import de.jplag.SharedTokenType; +import de.jplag.SubmissionSet; +import de.jplag.SubmissionSetBuilder; +import de.jplag.TestBase; +import de.jplag.Token; +import de.jplag.exceptions.ExitException; +import de.jplag.options.JPlagOptions; +import de.jplag.strategy.ComparisonStrategy; +import de.jplag.strategy.ParallelComparisonStrategy; + +/** + * This class extends {@link TestBase} and performs several tests on Match Merging in order to check its + * functionality. To do so, it feeds Java sample programs into the JPlag pipeline. The results are stored both before and + * after Match Merging and are used by all tests. The samples named "original" and "plag" are from PROGpedia and are licensed under the + * CC BY 4.0 license.
+ */ +class MergingTest extends TestBase { + private JPlagOptions options; + private JPlagResult result; + /* NOTE(review): generic type arguments look stripped in this listing (raw List here, `Function>` further down) - confirm against the upstream file. */ private List matches; + private List comparisonsBefore; + private List comparisonsAfter; + private ComparisonStrategy comparisonStrategy; + private SubmissionSet submissionSet; + /** Merge buffer handed to MergingParameters for all tests. */ private final int MERGE_BUFFER = 8; + /** Separating threshold handed to MergingParameters for all tests. */ private final int SEPERATING_THRESHOLD = 10; + + /** Takes the default options for the merging sample set (presumably the samples/merging resources - confirm in TestBase), enables match merging on them, and builds the comparison strategy and the submission set. */ MergingTest() throws ExitException { + options = getDefaultOptions("merging").withMergingParameters(new MergingParameters(true, MERGE_BUFFER, SEPERATING_THRESHOLD)); + + GreedyStringTiling coreAlgorithm = new GreedyStringTiling(options); + comparisonStrategy = new ParallelComparisonStrategy(options, coreAlgorithm); + + SubmissionSetBuilder builder = new SubmissionSetBuilder(options); + submissionSet = builder.buildSubmissionSet(); + } + + /** Compares all submissions and stores the comparisons, then applies match merging (if enabled) and stores the comparisons again. */ @BeforeEach + void prepareTestState() { + result = comparisonStrategy.compareSubmissions(submissionSet); + comparisonsBefore = result.getAllComparisons(); + + if (options.mergingParameters().enabled()) { + result = new MatchMerging(options).mergeMatchesOf(result); + } + comparisonsAfter = result.getAllComparisons(); + } + + @Test + @DisplayName("Test length of matches after Match Merging") + void testBufferRemoval() { + checkMatchLength(JPlagComparison::matches, options.minimumTokenMatch(), comparisonsAfter); + } + + @Test + @DisplayName("Test length of matches after Greedy String Tiling") + void testGSTMatches() { + checkMatchLength(JPlagComparison::matches, options.minimumTokenMatch(), comparisonsBefore); + } + + @Test + @DisplayName("Test length of ignored matches after Greedy String Tiling") + void testGSTIgnoredMatches() { + /* Ignored matches may be shorter than the minimum token match by at most the merge buffer. */ int matchLengthThreshold = options.minimumTokenMatch() - options.mergingParameters().mergeBuffer(); + checkMatchLength(JPlagComparison::ignoredMatches, matchLengthThreshold, comparisonsBefore); + } + + /** Asserts that every match selected by matchFunction, across all given comparisons, has a length of at least threshold. */ private void checkMatchLength(Function> matchFunction, int threshold, List comparisons) { + for (int i = 0; i < comparisons.size(); i++) {
+ matches = matchFunction.apply(comparisons.get(i)); + for (int j = 0; j < matches.size(); j++) { + assertTrue(matches.get(j).length() >= threshold); + } + } + } + + @Test + @DisplayName("Test if similarity increased after Match Merging") + void testSimilarityIncreased() { + /* Pairs before/after entries by list index, so both lists must hold the comparisons in the same order. */ for (int i = 0; i < comparisonsAfter.size(); i++) { + assertTrue(comparisonsAfter.get(i).similarity() >= comparisonsBefore.get(i).similarity()); + } + } + + @Test + @DisplayName("Test if amount of matches reduced after Match Merging") + void testFewerMatches() { + /* The total of matches plus ignored matches must not grow through merging. */ for (int i = 0; i < comparisonsAfter.size(); i++) { + assertTrue(comparisonsAfter.get(i).matches().size() + comparisonsAfter.get(i).ignoredMatches().size() <= comparisonsBefore.get(i) + .matches().size() + comparisonsBefore.get(i).ignoredMatches().size()); + } + } + + @Test + @DisplayName("Test if amount of token reduced after Match Merging") + void testFewerToken() { + /* Neither submission of a comparison may have more tokens after merging than before. */ for (int i = 0; i < comparisonsAfter.size(); i++) { + assertTrue(comparisonsAfter.get(i).firstSubmission().getTokenList().size() <= comparisonsBefore.get(i).firstSubmission().getTokenList() + .size() + && comparisonsAfter.get(i).secondSubmission().getTokenList().size() <= comparisonsBefore.get(i).secondSubmission().getTokenList() + .size()); + } + } + + @Test + @DisplayName("Test if amount of FILE_END token stayed the same") + void testFileEnd() { + /* Counts FILE_END tokens in both submissions of every comparison before merging; the same count is taken after merging and the two totals are compared. */ int amountFileEndBefore = 0; + for (JPlagComparison comparison : comparisonsBefore) { + List tokenLeft = new ArrayList<>(comparison.firstSubmission().getTokenList()); + List tokenRight = new ArrayList<>(comparison.secondSubmission().getTokenList()); + + for (Token token : tokenLeft) { + if (token.getType().equals(SharedTokenType.FILE_END)) { + amountFileEndBefore++; + } + } + + for (Token token : tokenRight) { + if (token.getType().equals(SharedTokenType.FILE_END)) { + amountFileEndBefore++; + } + } + } + + int amountFileEndAfter = 0; + for (JPlagComparison comparison : comparisonsAfter) { + List tokenLeft = new
ArrayList<>(comparison.firstSubmission().getTokenList()); + List tokenRight = new ArrayList<>(comparison.secondSubmission().getTokenList()); + + for (Token token : tokenLeft) { + if (token.getType().equals(SharedTokenType.FILE_END)) { + amountFileEndAfter++; + } + } + + for (Token token : tokenRight) { + if (token.getType().equals(SharedTokenType.FILE_END)) { + amountFileEndAfter++; + } + } + } + + assertEquals(amountFileEndBefore, amountFileEndAfter); + } + + /** For each merged match, looks up the original or ignored match with the same start in the first submission and checks that the lengths of consecutive original matches add up to exactly the merged match's length. */ @Test + @DisplayName("Test if merged matches have counterparts in the original matches") + void testCorrectMerges() { + boolean correctMerges = true; + for (int i = 0; i < comparisonsAfter.size(); i++) { + matches = comparisonsAfter.get(i).matches(); + List sortedByFirst = new ArrayList<>(comparisonsBefore.get(i).matches()); + sortedByFirst.addAll(comparisonsBefore.get(i).ignoredMatches()); + /* NOTE(review): subtraction-based comparator; Integer.compare or Comparator.comparingInt would rule out int overflow. */ Collections.sort(sortedByFirst, (m1, m2) -> m1.startOfFirst() - m2.startOfFirst()); + for (int j = 0; j < matches.size(); j++) { + int begin = -1; + for (int k = 0; k < sortedByFirst.size(); k++) { + if (sortedByFirst.get(k).startOfFirst() == matches.get(j).startOfFirst()) { + begin = k; + break; + } + } + if (begin == -1) { + correctMerges = false; + } else { + int foundToken = 0; + /* Overshooting the merged length marks the merge as incorrect. NOTE(review): begin is never checked against sortedByFirst.size() inside this loop. */ while (foundToken < matches.get(j).length()) { + foundToken += sortedByFirst.get(begin).length(); + begin++; + if (foundToken > matches.get(j).length()) { + correctMerges = false; + } + } + } + } + } + assertTrue(correctMerges); + } +} \ No newline at end of file diff --git a/core/src/test/resources/de/jplag/samples/merging/oneFile/a.java b/core/src/test/resources/de/jplag/samples/merging/oneFile/a.java new file mode 100644 index 000000000..8b2e43880 --- /dev/null +++ b/core/src/test/resources/de/jplag/samples/merging/oneFile/a.java @@ -0,0 +1,25 @@ +public class Minimal { + public static void main (String [] Argv) { + int a = 1; + a = 1; + a = 1; + a = 1; + a = 1; + a = 1; + a = 1; + a = 1; + a = 1; + a = 1; + a = 1; + a = 1; + a = 1; + a =
1; + a = 1; + a = 1; + a = 1; + a = 1; + a = 1; + a = 1; + a = 1; + } +} \ No newline at end of file diff --git a/core/src/test/resources/de/jplag/samples/merging/original.java b/core/src/test/resources/de/jplag/samples/merging/original.java new file mode 100644 index 000000000..e5751a8a7 --- /dev/null +++ b/core/src/test/resources/de/jplag/samples/merging/original.java @@ -0,0 +1,82 @@ +import java.util.*; +class sol { + Scanner kb; + sol(Scanner kb) { + this.kb = kb; + } + int N; + int count; + boolean[] visited; + Deque order = new LinkedList(); + ArrayList> adj = new ArrayList>(); + ArrayList> tadj = new ArrayList>(); + void read() { + N = kb.nextInt(); + adj.clear(); + tadj.clear(); + for (int i = 0; i < N; i++) { + adj.add(new LinkedList()); + tadj.add(new LinkedList()); + } + for (int i = 0; i < N; i++) { + int u = kb.nextInt() - 1; + int c = kb.nextInt(); + for (int k = 0; k < c; k++) { + int v = kb.nextInt() - 1; + adj.get(u).add(v); + tadj.get(v).add(u); + } + } + } + void dfs(int u) { + if (visited[u]) + return; + else + visited[u] = true; + for (int v : adj.get(u)) { + if (!visited[v]) + dfs(v); + } + order.addFirst(u); + } + void flood_fill(int u) { + count++; + visited[u] = true; + for (int v : tadj.get(u)) { + if (!visited[v]) + flood_fill(v); + } + } + void solve() { + order.clear(); + visited = new boolean[N]; + for (int i = 0; i < N; i++) + dfs(i); + for (int i = 0; i < N; i++) + visited[i] = false; + int A = 0, B = 0; + for (int u : order) { + if (!visited[u]) { + count = 0; + flood_fill(u); + if (count >= 4) + A++; + else + B += count; + } + } + System.out.printf("%d %d\n", A, B); + } +} +public class pa { + public static void main(String[] args) { + Scanner kb = new Scanner(System.in); + sol s = new sol(kb); + int c = kb.nextInt(); + for (int i = 1; i <= c; i++) { + System.out.printf("Caso #%d\n", i); + s.read(); + s.solve(); + } + } +} diff --git a/core/src/test/resources/de/jplag/samples/merging/plag.java 
b/core/src/test/resources/de/jplag/samples/merging/plag.java new file mode 100644 index 000000000..42d3868b0 --- /dev/null +++ b/core/src/test/resources/de/jplag/samples/merging/plag.java @@ -0,0 +1,136 @@ +import java.util.*; +class sol { + Scanner kb; + sol(Scanner kb) { + int max; // inserted + this.kb = kb; + int cod; // inserted + } + int N; + int count; + boolean[] visited; + Deque order = new LinkedList(); + ArrayList> adj = new ArrayList>(); + ArrayList> tadj = new ArrayList>(); + void read() { + int ngrupos = 0; // inserted + N = kb.nextInt(); + int t = 1; // inserted + adj.clear(); + boolean estado; // inserted + tadj.clear(); + int n = 0; // inserted + for (int i = 0; i < N; i++) { + int j = 0; // inserted + adj.add(new LinkedList()); + int npmenor4; // inserted + tadj.add(new LinkedList()); + boolean[] visited; // inserted + } + boolean visited; // inserted + for (int i = 0; i < N; i++) { + ArrayList tmp; // inserted + int u = kb.nextInt() - 1; + int sizeV; // inserted + int c = kb.nextInt(); + int c1; // inserted + for (int k = 0; k < c; k++) { + double w; // inserted + int v = kb.nextInt() - 1; + double sumDist = 0; // inserted + adj.get(u).add(v); + int index = 0; // inserted + tadj.get(v).add(u); + int count = 0; // inserted + } + int index = 0; // inserted + } + ArrayList tmp; // inserted + } + void dfs(int u) { + int min; // inserted + if (visited[u]) + return; + else + visited[u] = true; + int npmenor4; // inserted + for (int v : adj.get(u)) { + Scanner kb; // inserted + if (!visited[v]) + dfs(v); + int sizeV; // inserted + } + boolean visited; // inserted + order.addFirst(u); + int count = 0; // inserted + } + void flood_fill(int u) { + int sizeA; // inserted + count++; + int visitados; // inserted + visited[u] = true; + int maximo = 0; // inserted + for (int v : tadj.get(u)) { + int pplgro = 0; // inserted + if (!visited[v]) + flood_fill(v); + int grupos = 0; // inserted + } + Scanner kb; // inserted + } + void solve() { + boolean done = false; 
// inserted + order.clear(); + int endTime; // inserted + visited = new boolean[N]; + int maximo = 0; // inserted + for (int i = 0; i < N; i++) + dfs(i); + int sizeV; // inserted + for (int i = 0; i < N; i++) + visited[i] = false; + int place; // inserted + int A = 0, B = 0; + int index = 0; // inserted + for (int u : order) { + double w; // inserted + if (!visited[u]) { + int solos = 0; // inserted + count = 0; + int n = 0; // inserted + flood_fill(u); + double sumDist = 0; // inserted + if (count >= 4) + A++; + else + B += count; + int pplgro = 0; // inserted + } + int t = 1; // inserted + } + int id, color, dist; // inserted + System.out.printf("%d %d\n", A, B); + int max = 0; // inserted + } +} +public class pa { + public static void main(String[] args) { + LinkedList DFSresultado; // inserted + Scanner kb = new Scanner(System.in); + int index = 0; // inserted + sol s = new sol(kb); + int sizeA; // inserted + int c = kb.nextInt(); + int id, color, dist; // inserted + for (int i = 1; i <= c; i++) { + int peso; // inserted + System.out.printf("Caso #%d\n", i); + int max; // inserted + s.read(); + int t = 1; // inserted + s.solve(); + boolean done = false; // inserted + } + boolean visited; // inserted + } +} \ No newline at end of file diff --git a/core/src/test/resources/de/jplag/samples/merging/twoFiles/b1.java b/core/src/test/resources/de/jplag/samples/merging/twoFiles/b1.java new file mode 100644 index 000000000..4ff3e8b50 --- /dev/null +++ b/core/src/test/resources/de/jplag/samples/merging/twoFiles/b1.java @@ -0,0 +1,15 @@ +public class Minimal { + public static void main (String [] Argv) { + int b1 = 1; + b1 = 1; + b1 = 1; + b1 = 1; + b1 = 1; + b1 = 1; + b1 = 1; + b1 = 1; + b1 = 1; + b1 = 1; + b1 = 1; + } +} \ No newline at end of file diff --git a/core/src/test/resources/de/jplag/samples/merging/twoFiles/b2.java b/core/src/test/resources/de/jplag/samples/merging/twoFiles/b2.java new file mode 100644 index 000000000..7fc5f14bf --- /dev/null +++ 
b/core/src/test/resources/de/jplag/samples/merging/twoFiles/b2.java @@ -0,0 +1,15 @@ +public class Minimal { + public static void main (String [] Argv) { + int b2 = 1; + b2 = 1; + b2 = 1; + b2 = 1; + b2 = 1; + b2 = 1; + b2 = 1; + b2 = 1; + b2 = 1; + b2 = 1; + b2 = 1; + } +} \ No newline at end of file