Skip to content

Commit

Permalink
Add Source Code sanitizer to obfuscate to anonimize the source code c…
Browse files Browse the repository at this point in the history
…ontained in reports
  • Loading branch information
WojciechMazur committed Nov 30, 2023
1 parent cfc2315 commit a152f62
Show file tree
Hide file tree
Showing 15 changed files with 638 additions and 88 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ import scala.meta.internal.metals.Messages.IncompatibleBloopVersion
import scala.meta.internal.metals.MetalsEnrichments._
import scala.meta.internal.metals.StdReportContext
import scala.meta.internal.metals.MirroredReportContext
import scala.meta.internal.metals.RemoteTelemetryReportContext
import scala.meta.internal.metals.RemoteReportContext
import scala.meta.internal.metals.ammonite.Ammonite
import scala.meta.internal.metals.callHierarchy.CallHierarchyProvider
import scala.meta.internal.metals.clients.language.ConfiguredLanguageClient
Expand Down Expand Up @@ -191,13 +191,16 @@ class MetalsLspService(
},
ReportLevel.fromString(MetalsServerConfig.default.loglevel),
)
private val remoteTelemetryReports = new RemoteTelemetryReportContext(
serverEndpoint = serverInputs.initialServerConfig.telemetryServer,
workspace = Some(folder.toNIO),
private val remoteTelemetryReports = new RemoteReportContext(
serverEndpoint = RemoteReportContext.DefaultEndpoint,
getReporterContext = makeTelemetryContext,
sanitizers = new RemoteReportContext.Sanitizers(
workspace = Some(folder.toNIO),
sourceCodeTransformer = Some(ScalametaSourceCodeTransformer),
),
logger = {
val logger = logging.MetalsLogger.default
RemoteTelemetryReportContext.LoggerAccess(
RemoteReportContext.LoggerAccess(
info = logger.info(_),
warning = logger.warn(_),
error = logger.error(_),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,30 @@ import scala.util.Random
import java.nio.file.Path
import java.io.InputStreamReader
import scala.util.Try
import RemoteTelemetryReportContext.LoggerAccess
object RemoteTelemetryReportContext {
import RemoteReportContext.LoggerAccess
import java.util.Optional
object RemoteReportContext {
/**
 * Resolves the telemetry server endpoint: the value of the
 * `metals.telemetry-server` system property when it is set,
 * otherwise [[DefaultEndpoint]].
 */
def discoverTelemetryServer =
  sys.props.get("metals.telemetry-server").getOrElse(DefaultEndpoint)
final val DefaultEndpoint =
"https://scala3.westeurope.cloudapp.azure.com/telemetry"
final val DefaultEndpoint = "http://localhost:8081"
// "https://scala3.westeurope.cloudapp.azure.com/telemetry"

/**
 * Bundle of the sanitizers applied to reports before they are sent remotely.
 *
 * @param workspaceSanitizer sanitizer built from the (optional) workspace path
 * @param sourceCodeSanitizer optional sanitizer for source code contained in reports
 */
case class Sanitizers(
    workspaceSanitizer: WorkspaceSanitizer,
    sourceCodeSanitizer: Option[SourceCodeSanitizer[_, _]]
) {

  /** Convenience constructor wrapping the raw inputs into their sanitizers. */
  def this(
      workspace: Option[Path],
      sourceCodeTransformer: Option[SourceCodeTransformer[_, _]]
  ) =
    this(
      new WorkspaceSanitizer(workspace),
      sourceCodeTransformer.map(transformer =>
        new SourceCodeSanitizer(transformer)
      )
    )

  /** True when a source code sanitizer is configured. */
  def canSanitizeSources = sourceCodeSanitizer.nonEmpty

  /** All configured sanitizers: the workspace one first, then the source code one (if any). */
  val all = Seq(workspaceSanitizer) ++ sourceCodeSanitizer
}

// Proxy for the different logging mechanisms: java.util.logging in the PresentationCompiler and scribe in Metals
case class LoggerAccess(
Expand All @@ -41,30 +59,33 @@ object RemoteTelemetryReportContext {
* @param telemetryServerEndpoint
* @param getReporterContext Constructor of the reporter context metadata containing information about the user/server configuration of components
*/
class RemoteTelemetryReportContext(
class RemoteReportContext(
serverEndpoint: String,
workspace: Option[Path],
getReporterContext: () => telemetry.ReporterContext,
sanitizers: RemoteReportContext.Sanitizers,
logger: LoggerAccess
) extends ReportContext {

// Don't send reports with fragile user data - sources etc
override lazy val unsanitized: Reporter = reporter("unsanitized")
override lazy val incognito: Reporter = reporter("sanitized")
override lazy val incognito: Reporter = reporter("incognito")
override lazy val bloop: Reporter = reporter("bloop")

private def reporter(name: String) = new TelemetryReporter(
name = name,
serverEndpoint = serverEndpoint,
workspace = workspace,
getReporterContext = getReporterContext,
logger = logger
)
private def reporter(name: String) =
new RemoteReporter(
name = name,
serverEndpoint = serverEndpoint,
getReporterContext = getReporterContext,
sanitizers = sanitizers,
logger = logger
)
}

private class TelemetryReporter(
private class RemoteReporter(
override val name: String,
serverEndpoint: String,
workspace: Option[Path],
getReporterContext: () => telemetry.ReporterContext,
sanitizers: RemoteReportContext.Sanitizers,
logger: LoggerAccess
) extends Reporter {

Expand All @@ -73,9 +94,6 @@ private class TelemetryReporter(
Nil
override def deleteAll(): Unit = ()

private val sanitizer: ReportSanitizer = new WorkspaceReportSanitizer(
workspace
)
private lazy val environmentInfo: telemetry.Environment =
new telemetry.Environment(
/* java = */ new telemetry.JavaInfo(
Expand All @@ -94,32 +112,41 @@ private class TelemetryReporter(
logger = logger
)

/**
 * Runs the message through every configured sanitizer.
 * `foldRight` applies them starting from the last element of `sanitizers.all`.
 */
override def sanitize(message: String): String =
  sanitizers.all.foldRight(message) { (sanitizer, text) =>
    sanitizer.apply(text)
  }

/**
 * Builds the `telemetry.ReportEvent` sent to the remote server for the given report.
 *
 * The report text is only included when a source code sanitizer is configured
 * (`sanitizers.canSanitizeSources`), and then only after passing through `sanitize`;
 * otherwise the text is left empty.
 *
 * @param report the local report to convert
 * @return the event built from the report, the reporter name, the reporter context and the environment info
 */
private def createSanitizedReport(report: Report) = new telemetry.ReportEvent(
/* name = */ report.name,
// Without a source code sanitizer the text is omitted entirely rather than sent as-is.
/* text = */ if (sanitizers.canSanitizeSources)
Optional.of(sanitize(report.text))
else Optional.empty(),
/* id = */ report.id.toJava,
// `sanitize` is handed to `fromThrowable` as a function.
// NOTE(review): presumably applied to the throwable's messages - confirm in ReportedError.
/* error = */ report.error
.map(telemetry.ReportedError.fromThrowable(_, sanitize(_)))
.toJava,
/* reporterName = */ name,
// Wrap the context in the union variant matching its runtime type.
// NOTE(review): there is no default case; any other context type would raise a MatchError - confirm the hierarchy is closed.
/* reporterContext = */ getReporterContext() match {
case ctx: telemetry.MetalsLspContext =>
telemetry.ReporterContextUnion.metalsLSP(ctx)
case ctx: telemetry.ScalaPresentationCompilerContext =>
telemetry.ReporterContextUnion.scalaPresentationCompiler(ctx)
case ctx: telemetry.UnknownProducerContext =>
telemetry.ReporterContextUnion.unknown(ctx)
},
/* env = */ environmentInfo
)

override def create(
unsanitizedReport: => Report,
ifVerbose: Boolean
): Option[Path] = {
val report = sanitizer(unsanitizedReport)
client
.sendReportEvent(
new telemetry.ReportEvent(
/* name = */ report.name,
/* text = */ report.text,
/* shortSummary = */ report.shortSummary,
/* id = */ report.id.toJava,
/* error = */ report.error
.map(telemetry.ReportedError.fromThrowable(_, sanitizer.apply(_)))
.toJava,
/* reporterName = */ name,
/* reporterContext = */ getReporterContext() match {
case ctx: telemetry.MetalsLspContext =>
telemetry.ReporterContextUnion.metalsLSP(ctx)
case ctx: telemetry.ScalaPresentationCompilerContext =>
telemetry.ReporterContextUnion.scalaPresentationCompiler(ctx)
case ctx: telemetry.UnknownProducerContext =>
telemetry.ReporterContextUnion.unknown(ctx)
},
/* env = */ environmentInfo
)
val event = createSanitizedReport(unsanitizedReport)
if (event.getText().isPresent() || event.getError().isPresent())
client.sendReportEvent(event)
else
logger.info(
"Skiped reporting remotely unmeaningful report, no context or error, reportId=" +
unsanitizedReport.id.getOrElse("null")
)
None
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ class StdReporter(
level: ReportLevel,
override val name: String
) extends Reporter {
private val sanitizer: ReportSanitizer = new WorkspaceReportSanitizer(
private val sanitizer: ReportSanitizer = new WorkspaceSanitizer(
Some(workspace)
)
val maybeReportsDir: Path =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ trait ReportSanitizer {
def sanitize(text: String): String
}

class WorkspaceReportSanitizer(workspace: Option[Path])
extends ReportSanitizer {
class WorkspaceSanitizer(workspace: Option[Path]) extends ReportSanitizer {
private lazy val userHome = Option(System.getProperty("user.home"))

override def sanitize(text: String): String = {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package scala.meta.internal.metals

import java.util.regex.Pattern

/**
 * Sanitizer ensuring that no original source code can leak through the reports.
 *
 * The input is first treated as markdown containing one or more fenced code snippets.
 * If a snippet contains parsable code, all the original names are erased and replaced
 * with synthetic symbols of the same length.
 * If the code is not parsable, or the transformed code would not be parsable after the
 * transformation, the snippet is replaced with a failure reason tag.
 * If no code snippets are found, the whole input is treated as raw source code.
 *
 * @param parser parser/transformer used to parse, anonymize and re-render the sources
 */
class SourceCodeSanitizer[ParserCtx, ParserAST](
    parser: SourceCodeTransformer[ParserCtx, ParserAST]
) extends ReportSanitizer {

  /**
   * Sanitizes the given text.
   *
   * @param text markdown with fenced code snippets, or raw source code
   * @return the text with every snippet anonymized (or replaced by a failure tag);
   *         when no snippets are found, the anonymized raw source or a failure tag
   */
  override def sanitize(text: String): String = {
    anonimizeMarkdownSnippets(text)
      .getOrElse(tryAnonimize(text, languageHint = Some("scala")).merge)
  }

  // The completion marker needs to be escaped before parsing the sources, and restored afterwards
  private final val CompletionMarker = "@@"
  private final val CompletionMarkerReplecement = "__METALS_COMPLETION_MARKER__"

  // Fenced snippet: group 1 = optional language tag, group 2 = snippet body
  private final val MarkdownCodeSnippet = java.util.regex.Pattern
    .compile(
      raw"^`{3}(\w+\s*)?\n([\s\S]*?)`{3}",
      Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
    )
  private final val StackTraceLine =
    raw"(?:\s*(?:at\s*))?(\S+)\((?:(?:\S+\.(?:scala|java)\:\d+)|(?:Native Method))\)".r

  private type FailureReason = String

  /**
   * Tries to anonymize a single piece of source code.
   *
   * @param source the raw source; null, empty or blank input yields `Left("no-source")`
   * @param languageHint language of the source; a missing hint is treated as Scala
   * @return `Right` with the (trimmed) sanitized source, or `Left` with a failure reason tag
   */
  private def tryAnonimize(
      source: String,
      languageHint: Option[String]
  ): Either[FailureReason, String] = {
    Option(source)
      .map(_.trim())
      .filter(_.nonEmpty)
      // Literal replacement: the marker is plain text, no regex is needed
      .map(_.replace(CompletionMarker, CompletionMarkerReplecement))
      .fold[Either[FailureReason, String]](Left("no-source")) { source =>
        // Input containing something that looks like a stack trace line is passed through as-is
        if (StackTraceLine.findFirstIn(source).isDefined)
          Right(source)
        else if (languageHint.forall(_.toLowerCase() == "scala")) {
          parser
            .parse(source)
            .toRight("<unparsable>")
            .flatMap { case (ctx, tree) =>
              parser.transformer
                .sanitizeSymbols(tree)
                .toRight("<ast-transformation-failed>")
                .flatMap { parsed =>
                  val sourceString = parser.toSourceString(parsed, ctx)
                  // Only accept the transformation if its output can be parsed again
                  val isReparsable = parser.parse(sourceString, ctx).isDefined
                  if (isReparsable) Right(sourceString)
                  else Left("<invalid-transformation-not-reparsable>")
                }
            }
        } else
          Left("<unknown-source-redacted-out>")
      }
      .map(_.replace(CompletionMarkerReplecement, CompletionMarker))
  }

  /**
   * Anonymizes every fenced markdown snippet found in the input.
   *
   * @param source markdown text; may be null
   * @return `None` when the input is null, has no snippet markers, has an odd number
   *         of markers, or no snippet could be matched; otherwise the input with every
   *         snippet replaced by its sanitized form or a failure reason tag
   */
  private def anonimizeMarkdownSnippets(source: String): Option[String] = {
    // Check that we have an even number of markdown snippet markers; if not, discard the whole input.
    // A null input counts as having no markers instead of failing with a NullPointerException.
    val snipetMarkers =
      Option(source).fold(0)(_.linesIterator.count(_.startsWith("```")))
    if (snipetMarkers == 0 || snipetMarkers % 2 != 0) None
    else {
      val matcher = MarkdownCodeSnippet.matcher(source)
      val sourceResult = new java.lang.StringBuffer(source.size)
      while (matcher.find()) {
        val matchResult = matcher.toMatchResult()
        val language = Option(matchResult.group(1)).map(_.trim())
        val sanitizedOrFailureReason: String = tryAnonimize(
          languageHint = language,
          source = matchResult.group(2)
        ).merge
        val languageTag = language.getOrElse("")
        // Built without stripMargin: it would also strip leading `|` from lines of the sanitized code
        val updatedSnippet =
          s"```$languageTag\n$sanitizedOrFailureReason\n```\n"

        // quoteReplacement escapes both `$` and `\`, which appendReplacement treats as special
        matcher.appendReplacement(
          sourceResult,
          java.util.regex.Matcher.quoteReplacement(updatedSnippet)
        )
      }
      if (sourceResult.length() == 0) None // no snippets found
      else
        Some {
          matcher.appendTail(sourceResult)
          sourceResult.toString()
        }
    }
  }
}
Loading

0 comments on commit a152f62

Please sign in to comment.