-
Notifications
You must be signed in to change notification settings - Fork 337
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Source Code sanitizer to obfuscate to anonimize the source code c…
…ontained in reports
- Loading branch information
1 parent
cfc2315
commit a152f62
Showing
15 changed files
with
638 additions
and
88 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
100 changes: 100 additions & 0 deletions
100
mtags-shared/src/main/scala/scala/meta/internal/metals/SourceCodeSanitizer.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
package scala.meta.internal.metals | ||
|
||
import java.util.regex.Pattern | ||
|
||
/**
 * Sanitizer ensuring that no original source code can leak through the reports.
 *
 * The input is first treated as markdown text containing one or more fenced code snippets.
 * If a snippet contains parsable code, all the original names are erased and replaced with
 * synthetic symbols of the same length.
 * If the code is not parsable, or the transformed code would not be parsable after the
 * transformation, the snippet is replaced with a failure reason tag.
 * If no code snippets are found, the whole input is treated as raw source code.
 *
 * @param parser parser/transformer used to parse a snippet, sanitize the symbols of its tree
 *               and render the sanitized tree back to source text
 */
class SourceCodeSanitizer[ParserCtx, ParserAST](
    parser: SourceCodeTransformer[ParserCtx, ParserAST]
) extends ReportSanitizer {

  /**
   * Sanitizes the given report text.
   *
   * Markdown code snippets are sanitized one by one; when the text contains no (well-formed)
   * snippets the whole text is sanitized as Scala source. On failure the failure reason tag
   * is returned in place of the source.
   */
  override def sanitize(text: String): String = {
    anonimizeMarkdownSnippets(text)
      .getOrElse(tryAnonimize(text, languageHint = Some("scala")).merge)
  }

  // The completion marker needs to be escaped before parsing the sources and restored afterwards
  private final val CompletionMarker = "@@"
  private final val CompletionMarkerReplecement = "__METALS_COMPLETION_MARKER__"

  // Fenced snippet: group(1) is the optional language tag, group(2) the snippet content
  private final val MarkdownCodeSnippet = java.util.regex.Pattern
    .compile(
      raw"^`{3}(\w+\s*)?\n([\s\S]*?)`{3}",
      Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
    )
  private final val StackTraceLine =
    raw"(?:\s*(?:at\s*))?(\S+)\((?:(?:\S+\.(?:scala|java)\:\d+)|(?:Native Method))\)".r

  private type FailureReason = String

  /**
   * Tries to anonymize a single piece of source code.
   *
   * @param source       the code to sanitize; `null`, empty or blank input yields `Left("no-source")`
   * @param languageHint language tag of the snippet; a missing hint is treated as Scala
   * @return `Right` with the sanitized code (or the unchanged input when it contains a stack
   *         trace line), `Left` with a failure reason tag otherwise
   */
  private def tryAnonimize(
      source: String,
      languageHint: Option[String]
  ): Either[FailureReason, String] = {
    Option(source)
      .map(_.trim())
      .filter(_.nonEmpty)
      // Literal replacement: the marker is plain text, not a regular expression
      .map(_.replace(CompletionMarker, CompletionMarkerReplecement))
      .fold[Either[String, String]](Left("no-source")) { escaped =>
        if (StackTraceLine.findFirstIn(escaped).isDefined)
          // Input containing a stack trace line is passed through unchanged
          Right(escaped)
        else if (languageHint.forall(_.toLowerCase() == "scala")) {
          parser
            .parse(escaped)
            .toRight("<unparsable>")
            .flatMap { case (ctx, tree) =>
              parser.transformer
                .sanitizeSymbols(tree)
                .toRight("<ast-transformation-failed>")
                .flatMap { parsed =>
                  val sourceString = parser.toSourceString(parsed, ctx)
                  // Only accept the transformed code if it can be parsed again
                  val isReparsable = parser.parse(sourceString, ctx).isDefined
                  if (isReparsable) Right(sourceString)
                  else Left("<invalid-transformation-not-reparsable>")
                }
            }
        } else
          Left("<unknown-source-redacted-out>")
      }
      .map(_.replace(CompletionMarkerReplecement, CompletionMarker))
  }

  /**
   * Sanitizes every fenced markdown code snippet found in `source`, leaving the text
   * between the snippets unchanged.
   *
   * @return `None` when the input has no snippet markers, an odd number of them, or no
   *         snippet matching the expected shape; otherwise the text with each snippet
   *         replaced by its sanitized form or a failure reason tag
   */
  private def anonimizeMarkdownSnippets(source: String): Option[String] = {
    // Check if we have an even number of markdown snippet markers, if not discard the whole input
    val snipetMarkers = source.linesIterator.count(_.startsWith("```"))
    if (snipetMarkers == 0 || snipetMarkers % 2 != 0) None
    else {
      val matcher = MarkdownCodeSnippet.matcher(source)
      val sourceResult = new java.lang.StringBuffer(source.size)
      while (matcher.find()) {
        val matchResult = matcher.toMatchResult()
        val language = Option(matchResult.group(1)).map(_.trim())
        val sanitizedOrFailureReason: String = tryAnonimize(
          languageHint = language,
          source = matchResult.group(2)
        ).merge
        // Built by concatenation on purpose: calling `stripMargin` on an interpolated
        // template would also strip leading `|` margins from lines of the sanitized code.
        val updatedSnippet =
          "```" + language.getOrElse("") + "\n" + sanitizedOrFailureReason + "\n```\n"

        // `appendReplacement` interprets both `$` and `\` in the replacement text, so the
        // whole snippet has to be quoted to be inserted literally.
        matcher.appendReplacement(
          sourceResult,
          java.util.regex.Matcher.quoteReplacement(updatedSnippet)
        )
      }
      if (sourceResult.length() == 0) None // no snippets were found
      else
        Some {
          matcher.appendTail(sourceResult)
          sourceResult.toString()
        }
    }
  }
}
Oops, something went wrong.