Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SARIF schema test #105

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ traitlets==5.2.2.post1
typing-extensions==4.2.0
xxhash==3.0.0
zstandard==0.17.0
requests
jsonschema
4 changes: 3 additions & 1 deletion sapp/sarif.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@

class SARIF:
version: str = "2.1.0"
schema: str = "https://raw.githubusercontent.com/oasis-tcs/sarif-spec/master/Schemata/sarif-schema-2.1.0.json" # noqa
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the old schema is actually had a broken regex and it was fixed in the codeQL schema see this commit github/codeql-action@9824588

schema: str = "https://raw.githubusercontent.com/github/codeql-action/main/src/sarif-schema-2.1.0.json" # noqa

def __init__(
self, tool: str, session: Session, filtered_issues: Set[IssueQueryResult]
Expand Down Expand Up @@ -140,6 +140,8 @@ def trace_to_sarif(
) + trace.create_trace_tuples(precondition_navigation)
codeflows: List[SARIFCodeflowLocationObject] = []
nesting_level = 0
if len(trace_tuples) == 0:
return []
Comment on lines +143 to +144
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to conform with the schema definition

for trace_tuple in trace_tuples:
location = self._sarif_codeflow_location_from_trace_tuple(
trace_tuple.trace_frame, nesting_level, output_features
Expand Down
229 changes: 229 additions & 0 deletions sapp/ui/tests/sarif_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe

import json
import sys
from datetime import datetime
from io import StringIO
from typing import List
from unittest import TestCase
from unittest.mock import patch

import jsonschema
import requests

from sapp.sarif import SARIF
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Below we use relative imports and here we use an absolute import, is this intentional? If not then let's try to be consisent.


from sqlalchemy.orm import Session

from ...db import DB, DBType
from ...models import (
create as create_models,
DBID,
IssueStatus,
SharedText,
SourceLocation,
TraceFrame,
TraceKind,
)
from ...pipeline.pysa_taint_parser import Parser
from ...tests.fake_object_generator import FakeObjectGenerator
from ..interactive import Interactive, IssueQueryResult, TraceFrameQueryResult


class SarifTest(TestCase):
def setUp(self) -> None:
self.db = DB(DBType.MEMORY)
create_models(self.db)
self.interactive = Interactive(
database=self.db, repository_directory="", parser_class=Parser
)
self.stdout = StringIO()
self.stderr = StringIO()
sys.stdout = self.stdout # redirect output
sys.stderr = self.stderr # redirect output
self.fakes = FakeObjectGenerator()

def tearDown(self) -> None:
sys.stdout = sys.__stdout__ # reset redirect
sys.stderr = sys.__stderr__ # reset redirect

def _frame_to_query_result(
self, session: Session, trace_frame: TraceFrame
) -> TraceFrameQueryResult:
caller = (
session.query(SharedText.contents)
.filter(SharedText.id == trace_frame.caller_id)
.scalar()
)
callee = (
session.query(SharedText.contents)
.filter(SharedText.id == trace_frame.callee_id)
.scalar()
)
filename = (
session.query(SharedText.contents)
.filter(SharedText.id == trace_frame.filename_id)
.scalar()
)
return TraceFrameQueryResult(
id=trace_frame.id,
caller=caller,
caller_port=trace_frame.caller_port,
callee=callee,
callee_port=trace_frame.callee_port,
caller_id=trace_frame.caller_id,
callee_id=trace_frame.callee_id,
callee_location=trace_frame.callee_location,
# pyre-fixme[6]: For 9th param expected `Optional[TraceKind]` but got `str`.
kind=trace_frame.kind,
filename=filename,
)

def testSarifSchemaCheckWithIssuesNoTraces(self) -> None:
run = self.fakes.run()
issues = [
IssueQueryResult(
issue_id=DBID(1),
issue_instance_id=DBID(1),
filename="SelfServicePortalController.java",
location=SourceLocation(1, 2, 3),
code=6,
callable="Response SelfServicePortalController.getSelfServiceLogs(Cookie, AuditLogRequest)",
message="root",
min_trace_length_to_sources=0,
min_trace_length_to_sinks=2,
features=frozenset(),
is_new_issue=False,
source_names=frozenset(["source1"]),
source_kinds=frozenset(["source_kind1"]),
sink_names=frozenset(["sink1"]),
sink_kinds=frozenset(["sink_kind1", "sink_kind2"]),
status=IssueStatus.UNCATEGORIZED,
detected_time=datetime.today(),
similar_issues=set(),
run_id=DBID(1),
)
]
self.fakes.save_all(self.db)
with self.db.make_session() as session:
session.add(run)
session.commit()
sarif = SARIF("mariana-trench", session, set(issues))
output = sarif.to_json()
output = json.loads(output)
Comment on lines +118 to +119
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sarif.to_json() does a json.dumps internally, so this is doing a back-and-forth between the python representation and a string. It would be nice to avoid this, but I guess that's not a big deal for a test..

try:
response = requests.get(SARIF.schema)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a big fan of tests pulling random stuff from the internet. Maybe we could just push that file in the repository?

response.raise_for_status()
schema = response.json()
jsonschema.Draft202012Validator(schema).validate(output)
except Exception as e:
print(f"Error downloading schema: {e}")
raise e

def testSarifSchemaCheckWithIssuesWithTraces(self) -> None:
run = self.fakes.run()
issues = [
IssueQueryResult(
issue_id=DBID(1),
issue_instance_id=DBID(1),
filename="SelfServicePortalController.java",
location=SourceLocation(1, 2, 3),
code=6,
callable="Response SelfServicePortalController.getSelfServiceLogs(Cookie, AuditLogRequest)",
message="root",
min_trace_length_to_sources=0,
min_trace_length_to_sinks=2,
features=frozenset(),
is_new_issue=False,
source_names=frozenset(["source1"]),
source_kinds=frozenset(["source_kind1"]),
sink_names=frozenset(["sink1"]),
sink_kinds=frozenset(["sink_kind1", "sink_kind2"]),
status=IssueStatus.UNCATEGORIZED,
detected_time=datetime.today(),
similar_issues=set(),
run_id=DBID(1),
),
IssueQueryResult(
issue_id=DBID(2),
issue_instance_id=DBID(2),
filename="SelfServicePortalController.java",
location=SourceLocation(1, 2, 3),
code=4,
callable="Response SelfServicePortalController.getSelfServiceLogs(Cookie, AuditLogRequest)",
message="root",
min_trace_length_to_sources=0,
min_trace_length_to_sinks=2,
features=frozenset(),
is_new_issue=False,
source_names=frozenset(["source1"]),
source_kinds=frozenset(["source_kind1"]),
sink_names=frozenset(["sink1"]),
sink_kinds=frozenset(["sink_kind1", "sink_kind2"]),
status=IssueStatus.UNCATEGORIZED,
detected_time=datetime.today(),
similar_issues=set(),
run_id=DBID(1),
),
]
source_frames: List[TraceFrame] = [
self.fakes.postcondition(
caller="PostCondCallerA",
callee="PostCondCalleeA",
filename="file1.java",
),
self.fakes.postcondition(
caller="PostCondCallerB",
callee="PostCondCalleeB",
filename="file2.java",
),
]
sink_frames: List[TraceFrame] = [
self.fakes.precondition(
caller="PreCondCallerA", callee="PreCondCalleeA", filename="file3.java"
),
self.fakes.precondition(
caller="PreCondCallerB", callee="PreCondCalleeB", filename="file4.java"
),
]
self.fakes.save_all(self.db)
with self.db.make_session() as session:
session.add(run)
session.commit()
source_frames_query_results: List[TraceFrameQueryResult] = [
self._frame_to_query_result(session, source_frames[0]),
self._frame_to_query_result(session, source_frames[1]),
]
sink_frames_query_results: List[TraceFrameQueryResult] = [
self._frame_to_query_result(session, sink_frames[0]),
self._frame_to_query_result(session, sink_frames[1]),
]
with patch("sapp.ui.trace.initial_frames") as mock_trace:
input_return_map = {
(session, issues[0].issue_instance_id, TraceKind.POSTCONDITION): [
source_frames_query_results[0],
source_frames_query_results[1],
],
(session, issues[0].issue_instance_id, TraceKind.PRECONDITION): [
sink_frames_query_results[0],
sink_frames_query_results[1],
],
}
Comment on lines +208 to +217
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this necessary? From my understanding, since we use self.fakes.precondition/postcondition, those should be in the database and initial_frames should find those?

mock_trace.side_effect = lambda *args: input_return_map.get(args)
sarif = SARIF("mariana-trench", session, set(issues))
output = sarif.to_json()
output = json.loads(output)
try:
response = requests.get(SARIF.schema)
response.raise_for_status()
schema = response.json()
jsonschema.Draft202012Validator(schema).validate(output)
except Exception as e:
print(f"Error downloading schema: {e}")
raise e
Comment on lines +219 to +229
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe move that in a function, since it's used twice.

1 change: 1 addition & 0 deletions sapp/ui/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@ def navigate_trace_frames(
callee_port=trace_frame.callee_port,
caller="",
caller_port="",
filename="",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to conform with the schema definition

),
0,
)
Expand Down
Loading