From f9b02bc775b634f19a592ac7818e56f5d17d673f Mon Sep 17 00:00:00 2001 From: Andre Kuhlenschmidt Date: Thu, 26 Sep 2024 18:56:54 -0700 Subject: [PATCH 1/4] chore: fix uses of deprecated function utcnow (semgrep/semgrep-proprietary#2339) Error in [logs](https://github.com/semgrep/semgrep-proprietary/actions/runs/11061470331/job/30734143194#step:4:129) Test plan: CI synced from Pro 6ec5e0f7832e85e8c2f6a6a178de494fee102895 --- cli/src/semgrep/app/scans.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cli/src/semgrep/app/scans.py b/cli/src/semgrep/app/scans.py index 06865a6d68ac..a45352f77890 100644 --- a/cli/src/semgrep/app/scans.py +++ b/cli/src/semgrep/app/scans.py @@ -484,15 +484,16 @@ def report_findings( # minutes to wait for completion. Eventually, this wait may # be configurable as we see larger scans and increased backend # load. - try_until = datetime.utcnow() + timedelta(minutes=30) - slow_down_after = datetime.utcnow() + timedelta(minutes=2) + now = datetime.now().replace(tzinfo=None) + try_until = now + timedelta(minutes=30) + slow_down_after = now + timedelta(minutes=2) while True: # old: was also logging {json.dumps(complete.to_json(), indent=4)} # alt: save it in ~/.semgrep/logs/complete.json? logger.debug(f"Sending /complete") - if datetime.utcnow() > try_until: + if datetime.now().replace(tzinfo=None) > try_until: # let the backend know we won't be trying again complete.final_attempt = True @@ -522,4 +523,4 @@ def report_findings( ) progress_bar.advance(complete_task) - sleep(5 if datetime.utcnow() < slow_down_after else 30) + sleep(5 if datetime.now().replace(tzinfo=None) < slow_down_after else 30) From 4e5af14eaade7bccd84031f79d9c6a36e786aa19 Mon Sep 17 00:00:00 2001 From: Emma Jin Date: Fri, 27 Sep 2024 13:28:57 -0400 Subject: [PATCH 2/4] chore: log the trace id in debug mode (semgrep/semgrep-proprietary#2337) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For non-SMS scans, it can be quite hard to find the trace. This PR logs the trace id in debug log. This only solves the problem when `--debug` is passed as well as `--trace`, so for the future we should output it as part of the results json or the error on a crash. However, it's better than the previous state. Test plan: In an arbitrary folder, run ``` (python-virtualenv) ➜ misc semgrep --config p/default . --trace --pro --debug ┌──── ○○○ ────┐ │ Semgrep CLI │ └─────────────┘ semgrep version 1.90.0 ... [00.06][DEBUG](default): !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! [00.06][INFO]: Executed as: /Users/emma/workspace/semgrep-proprietary/OSS/cli/src/semgrep/bin/semgrep-core-proprietary -json -rules /var/folders/4h/r6m5kls56r98069rz7w7f2400000gp/T/tmp379zh00m.json -j 1 -targets /var/folders/4h/r6m5kls56r98069rz7w7f2400000gp/T/tmp6sce110s -timeout 5 -timeout_threshold 3 -max_memory 0 -fast -trace -deep_inter_file -timeout_for_interfile_analysis 0 . -debug [00.06][INFO]: Version: 1.90.0 [00.06][INFO]: Tracing is enabled for this scan. The trace id is <53914f7e3a7612f09483a98c320a8f98>. ``` Also run it without `--pro`. synced from Pro 7a7dba2f40afef0123d56915efca502917536f7d --- libs/tracing/unix/Tracing.ml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/libs/tracing/unix/Tracing.ml b/libs/tracing/unix/Tracing.ml index 27d9073fc39a..bb3873b2db64 100644 --- a/libs/tracing/unix/Tracing.ml +++ b/libs/tracing/unix/Tracing.ml @@ -201,6 +201,18 @@ let trace_data_only ?(level = Info) ~__FUNCTION__ ~__FILE__ ~__LINE__ name with_span ~level ~__FUNCTION__ ~__FILE__ ~__LINE__ name (fun sp -> f () |> add_yojson_to_span sp) +let log_trace_message () = + match Otel.Scope.get_ambient_scope () with + | None -> + (* nosemgrep: no-logs-in-library *) + Logs.info (fun m -> + m "Tracing is enabled for this scan. There was no trace id recorded.") + | Some scope -> + let id = Otel.Trace_id.to_hex scope.trace_id in + (* nosemgrep: no-logs-in-library *) + Logs.info (fun m -> + m "Tracing is enabled for this scan. The trace id is <%s>." id) + (*****************************************************************************) (* Entry points for setting up tracing *) (*****************************************************************************) @@ -241,7 +253,9 @@ let with_tracing fname trace_endpoint data f = Opentelemetry_client_ocurl.with_setup ~config () @@ fun () -> with_top_level_span ?parent_span_id ?parent_trace_id ~__FILE__ ~__LINE__ ~data fname - @@ fun sp -> f sp + @@ fun sp -> + log_trace_message (); + f sp (* Alt: using cohttp_lwt (we probably want to do this when we switch to Eio w/ *) (* their compatibility layer) From 2090ca45ad1ef528e46fa9f4d589e527c5fb82d1 Mon Sep 17 00:00:00 2001 From: Andre Kuhlenschmidt Date: Mon, 30 Sep 2024 13:54:54 -0700 Subject: [PATCH 3/4] fix: deprecation errors with Uuidm for ocaml 5.2 (semgrep/semgrep-proprietary#2344) Fixing [logs](https://github.com/semgrep/semgrep-proprietary/actions/runs/11078022721/job/30784508699). Looks like this doesn't work for our 4.x ocaml environment. Good luck guys! synced from Pro 2ea8d06dfac60d2c0fd582450f6ba62bad0beb54 --- Makefile | 13 ++++++++++++- dune-project | 3 ++- libs/git_wrapper/Git_wrapper.ml | 3 ++- semgrep.opam | 6 +++--- src/osemgrep/cli_ci/Ci_subcommand.ml | 2 +- src/osemgrep/configuring/Semgrep_settings.ml | 3 ++- src/osemgrep/core/Metrics_.ml | 3 ++- src/osemgrep/language_server/Test_LS_e2e.ml | 4 +++- src/osemgrep/language_server/Unit_LS.ml | 2 +- src/osemgrep/language_server/server/Lsp_.ml | 8 ++++++-- src/osemgrep/networking/Semgrep_login.ml | 3 ++- src/osemgrep/reporting/Gitlab_output.ml | 2 +- 12 files changed, 37 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index d120162afbdc..1f0e47b82114 100644 --- a/Makefile +++ b/Makefile @@ -237,7 +237,18 @@ core-test-e2e: # path, then recent versions of opam crash with a 'git ls-files fatal error' # about some 'libs/ocaml-tree-sitter-core/../../.git/...' not being a git # repo. -REQUIRED_DEPS = ./ ./libs/ocaml-tree-sitter-core/tree-sitter.opam ./dev/required.opam +# +# EXTRA_OPAM_DEPS allows us to add more opam files when building semgrep +# as part of a larger project (e.g. semgrep-proprietary). Using a single +# 'opam install' command to install all the dependencies allows us to detect +# version constraints incompatibilities. +# +REQUIRED_DEPS = \ + ./ \ + ./libs/ocaml-tree-sitter-core/tree-sitter.opam \ + ./dev/required.opam \ + $(EXTRA_OPAM_DEPS) + OPTIONAL_DEPS = $(REQUIRED_DEPS) ./dev/optional.opam # This target is portable; it only assumes you have 'gcc', 'opam' and diff --git a/dune-project b/dune-project index 3e03f5c0ff3b..b64d46efb9c4 100644 --- a/dune-project +++ b/dune-project @@ -77,6 +77,7 @@ the other programming languages supported by atdgen." (lib_parsing (>= 1.5.5)) (profiling (>= 1.5.5)) (atdgen (>= 2.8.0)) + (uuidm (>= 0.9.9)) ) ) @@ -515,7 +516,7 @@ For more information see https://semgrep.dev (conf-libcurl (= 1)) ; force older version of conf-libcurl to make windows work ; web stuff uri - uuidm + (uuidm (>= 0.9.9)) ; cohttp >= 6.0.0 requires opam 2.1.0 which used to not available in Windows ; TODO: now that opam 2.2.0 is out and support windows, upgrade to 6.0.0 (cohttp (= 5.3.0)) diff --git a/libs/git_wrapper/Git_wrapper.ml b/libs/git_wrapper/Git_wrapper.ml index 151864b20436..97e0f9400407 100644 --- a/libs/git_wrapper/Git_wrapper.ml +++ b/libs/git_wrapper/Git_wrapper.ml @@ -501,7 +501,8 @@ let run_with_worktree (caps : < Cap.chdir ; Cap.tmp >) ~commit ?branch f = | None -> raise (Error "") in let rand_dir () = - let uuid = Uuidm.v `V4 in + let rand = Stdlib.Random.State.make_self_init () in + let uuid = Uuidm.v4_gen rand () in let dir_name = "semgrep_git_worktree_" ^ Uuidm.to_string uuid in let dir = CapTmp.get_temp_dir_name caps#tmp / dir_name in UUnix.mkdir !!dir 0o777; diff --git a/semgrep.opam b/semgrep.opam index c0497637d6d8..9d107e9a8a04 100644 --- a/semgrep.opam +++ b/semgrep.opam @@ -1,6 +1,6 @@ # This file is generated by dune, edit dune-project instead opam-version: "2.0" -version: "1.79.0" +version: "1.90.0" synopsis: "Like grep but for code: fast and syntax-aware semantic code pattern for many languages" description: """ @@ -12,7 +12,7 @@ For more information see https://semgrep.dev """ maintainer: ["Yoann Padioleau "] authors: ["Yoann Padioleau "] -license: "LGPL-2.1" +license: "LGPL-2.1-only" homepage: "https://semgrep.dev" bug-reports: "https://github.com/semgrep/semgrep/issues" depends: [ @@ -65,7 +65,7 @@ depends: [ "ambient-context-lwt" "conf-libcurl" {= "1"} "uri" - "uuidm" + "uuidm" {>= "0.9.9"} "cohttp" {= "5.3.0"} "cohttp-lwt-unix" "cohttp-lwt-jsoo" diff --git a/src/osemgrep/cli_ci/Ci_subcommand.ml b/src/osemgrep/cli_ci/Ci_subcommand.ml index b01b2b2e005d..e457c3092307 100644 --- a/src/osemgrep/cli_ci/Ci_subcommand.ml +++ b/src/osemgrep/cli_ci/Ci_subcommand.ml @@ -201,7 +201,7 @@ let scan_config_and_rules_from_deployment ~dry_run let scan_metadata : OutJ.scan_metadata = { cli_version = Version.version; - unique_id = Uuidm.v `V4; + unique_id = Uuidm.v4_gen (Stdlib.Random.State.make_self_init ()) (); (* TODO: should look at conf.secrets, conf.sca, conf.code, etc. *) requested_products = []; dry_run = false; diff --git a/src/osemgrep/configuring/Semgrep_settings.ml b/src/osemgrep/configuring/Semgrep_settings.ml index 348cfd02627b..90928fbfb6ec 100644 --- a/src/osemgrep/configuring/Semgrep_settings.ml +++ b/src/osemgrep/configuring/Semgrep_settings.ml @@ -20,10 +20,11 @@ type t = { } let default = + let rand = Stdlib.Random.State.make_self_init () in { has_shown_metrics_notification = None; api_token = None; - anonymous_user_id = Uuidm.v `V4; + anonymous_user_id = Uuidm.v4_gen rand (); } (*****************************************************************************) diff --git a/src/osemgrep/core/Metrics_.ml b/src/osemgrep/core/Metrics_.ml index d3bff434a6e2..3f731977b453 100644 --- a/src/osemgrep/core/Metrics_.ml +++ b/src/osemgrep/core/Metrics_.ml @@ -144,8 +144,9 @@ type t = { let now () : Timedesc.Timestamp.t = Timedesc.Timestamp.now () let default_payload = + let rand = Stdlib.Random.State.make_self_init () in { - Semgrep_metrics_t.event_id = Uuidm.v `V4; + Semgrep_metrics_t.event_id = Uuidm.v4_gen rand (); anonymous_user_id = ""; started_at = now (); sent_at = now (); diff --git a/src/osemgrep/language_server/Test_LS_e2e.ml b/src/osemgrep/language_server/Test_LS_e2e.ml index b1f1926a9502..bb62542621f1 100644 --- a/src/osemgrep/language_server/Test_LS_e2e.ml +++ b/src/osemgrep/language_server/Test_LS_e2e.ml @@ -194,7 +194,9 @@ let send_map (type a) (info : server_info) packet (f : Packet.t -> a) : (*****************************************************************************) let send_request info request = - let id = Uuidm.v `V4 |> Uuidm.to_string in + let id = + Uuidm.v4_gen (Stdlib.Random.State.make_self_init ()) () |> Uuidm.to_string + in let packet = Packet.Request (CR.to_jsonrpc_request request (`String id)) in send_map info packet diff --git a/src/osemgrep/language_server/Unit_LS.ml b/src/osemgrep/language_server/Unit_LS.ml index 6140852ed981..9adf9083cf3f 100644 --- a/src/osemgrep/language_server/Unit_LS.ml +++ b/src/osemgrep/language_server/Unit_LS.ml @@ -103,7 +103,7 @@ let mock_run_results (files : string list) : Core_runner.result = let mock_workspace ?(git = false) () : Fpath.t = let rand_dir () = - let uuid = Uuidm.v `V4 in + let uuid = Uuidm.v4_gen (Stdlib.Random.State.make_self_init ()) () in let dir_name = "test_workspace_" ^ Uuidm.to_string uuid in let dir = Filename.concat (Filename.get_temp_dir_name ()) dir_name in Unix.mkdir dir 0o777; diff --git a/src/osemgrep/language_server/server/Lsp_.ml b/src/osemgrep/language_server/server/Lsp_.ml index 15d505933268..344f10a40b61 100644 --- a/src/osemgrep/language_server/server/Lsp_.ml +++ b/src/osemgrep/language_server/server/Lsp_.ml @@ -82,7 +82,9 @@ let respond (type r) (id : Id.t) (request : r CR.t) (response : r) = (** Send a request to the client *) let request request = - let id = Uuidm.v `V4 |> Uuidm.to_string in + let id = + Uuidm.v4_gen (Stdlib.Random.State.make_self_init ()) () |> Uuidm.to_string + in let request = SR.to_jsonrpc_request request (`String id) in Logs.debug (fun m -> m "Sending request %s" @@ -117,7 +119,9 @@ let notify_show_message ~kind s = (** Show a little progress circle while doing thing. Returns a token needed to end progress*) let create_progress title message = - let id = Uuidm.v `V4 |> Uuidm.to_string in + let id = + Uuidm.v4_gen (Stdlib.Random.State.make_self_init ()) () |> Uuidm.to_string + in Logs.debug (fun m -> m "Creating progress token %s, (%s: %s)" id title message); let token = ProgressToken.t_of_yojson (`String id) in diff --git a/src/osemgrep/networking/Semgrep_login.ml b/src/osemgrep/networking/Semgrep_login.ml index 3675983d348f..4d98a4c8f673 100644 --- a/src/osemgrep/networking/Semgrep_login.ml +++ b/src/osemgrep/networking/Semgrep_login.ml @@ -27,7 +27,8 @@ type login_session = shared_secret * Uri.t let support_url = "https://semgrep.dev/docs/support/" let make_login_url () = - let session_id = Uuidm.v `V4 in + let rand = Stdlib.Random.State.make_self_init () in + let session_id = Uuidm.v4_gen rand () in ( session_id, Uri.( add_query_params' diff --git a/src/osemgrep/reporting/Gitlab_output.ml b/src/osemgrep/reporting/Gitlab_output.ml index 189085988013..5f65cdaa8ff4 100644 --- a/src/osemgrep/reporting/Gitlab_output.ml +++ b/src/osemgrep/reporting/Gitlab_output.ml @@ -94,7 +94,7 @@ let format_cli_match (cli_match : OutT.cli_match) = let id = (* TODO the ?index argument needs to be provided (for ci_unique_key duplicates) *) Semgrep_hashing_functions.ci_unique_key cli_match - |> Uuidm.of_bytes |> Option.get |> Uuidm.to_string + |> Uuidm.of_binary_string |> Option.get |> Uuidm.to_string in let r = [ From 5b19333bf49eec02181b921c13e60a1c7d347a9a Mon Sep 17 00:00:00 2001 From: Iago Abal Date: Tue, 1 Oct 2024 19:22:41 +0200 Subject: [PATCH 4/4] refactor: Prepare tainting code for handling HOFs (semgrep/semgrep-proprietary#2347) test plan: make test synced from Pro 8158ec5766fac26ca5c450e3ecac452ce56801c8 --- src/tainting/Dataflow_tainting.ml | 103 ++++--- src/tainting/Shape_and_sig.ml | 45 +-- src/tainting/Sig_inst.ml | 476 +++++++++++++++++++----------- src/tainting/Sig_inst.mli | 28 +- src/tainting/Taint.mli | 1 + src/tainting/Taint_shape.ml | 26 +- 6 files changed, 422 insertions(+), 257 deletions(-) diff --git a/src/tainting/Dataflow_tainting.ml b/src/tainting/Dataflow_tainting.ml index 614e5b152fad..e5d3307ca0b2 100644 --- a/src/tainting/Dataflow_tainting.ml +++ b/src/tainting/Dataflow_tainting.ml @@ -27,7 +27,7 @@ module T = Taint module Lval_env = Taint_lval_env module Taints = T.Taint_set module TM = Taint_smatch -open Shape_and_sig.Shape +module S = Shape_and_sig.Shape module Shape = Taint_shape module Effect = Shape_and_sig.Effect module Signature = Shape_and_sig.Signature @@ -262,7 +262,7 @@ let taints_of_matches env ~incoming sources = (data_taints, lval_env) let report_effects env effects = - if effects <> [] then + if not (List_.null effects) then env.config.handle_effects env.fun_name effects env.lval_env let unify_mvars_sets env mvars1 mvars2 = @@ -365,8 +365,7 @@ let partition_sources_by_side_effect sources_matches = * (there is just no point in doing so). *) let get_control_taints_to_return env = Lval_env.get_control_taints env.lval_env - |> Taints.elements - |> List.filter (fun ({ orig; _ } : T.taint) -> + |> Taints.filter (fun ({ orig; _ } : T.taint) -> match orig with | T.Src _ -> true | Var _ @@ -449,7 +448,7 @@ let propagate_taint_to_label replace_labels label (taint : T.taint) = { taint with orig = new_orig } (*****************************************************************************) -(* Reporting effects *) +(* Effects and signatures *) (*****************************************************************************) (* Potentially produces an effect from incoming taints + call traces to a sink. @@ -567,11 +566,10 @@ let effects_of_tainted_return env taints shape return_tok : Effect.t list = let control_taints = get_control_taints_to_return env in if Shape.taints_and_shape_are_relevant taints shape - || not (List_.null control_taints) + || not (Taints.is_empty control_taints) then let data_taints = - taints |> Taints.elements - |> List_.map (fun t -> { t with T.tokens = List.rev t.T.tokens }) + taints |> Taints.map (fun t -> { t with T.tokens = List.rev t.T.tokens }) in [ Effect.ToReturn @@ -579,6 +577,15 @@ let effects_of_tainted_return env taints shape return_tok : Effect.t list = ] else [] +let lookup_signature env fun_exp = + match (!hook_function_taint_signature, fun_exp) with + | Some hook, { e = Fetch _f; eorig = SameAs eorig } -> hook env.config eorig + | __else__ -> None + +(*****************************************************************************) +(* Miscellaneous *) +(*****************************************************************************) + let check_orig_if_sink env ?filter_sinks orig taints shape = (* NOTE(gather-all-taints): * A sink is something opaque to us, e.g. consider sink(["ok", "tainted"]), @@ -598,10 +605,6 @@ let check_orig_if_sink env ?filter_sinks orig taints shape = let effects = effects_of_tainted_sinks env taints sinks in report_effects env effects -(*****************************************************************************) -(* Miscellaneous large functions *) -(*****************************************************************************) - let fix_poly_taint_with_field env lval xtaint = let type_of_il_offset il_offset = match il_offset.IL.o with @@ -878,7 +881,7 @@ let find_lval_taint_sources env incoming_taints lval = (taints_to_return, lval_env) let rec check_tainted_lval env (lval : IL.lval) : - Taints.t * shape * [ `Sub of Taints.t * shape ] * Lval_env.t = + Taints.t * S.shape * [ `Sub of Taints.t * S.shape ] * Lval_env.t = let new_taints, lval_in_env, lval_shape, sub, lval_env = check_tainted_lval_aux env lval in @@ -976,8 +979,8 @@ and propagate_taint_via_java_getters_and_setters_without_definition env e args and check_tainted_lval_aux env (lval : IL.lval) : Taints.t * Xtaint.t_or_sanitized - * shape - * [ `Sub of Taints.t * shape ] + * S.shape + * [ `Sub of Taints.t * S.shape ] * Lval_env.t = (* Recursively checks an l-value bottom-up. * @@ -1040,13 +1043,13 @@ and check_tainted_lval_aux env (lval : IL.lval) : match sub_in_env with | `Sanitized -> (* See NOTE [lval/sanitized] *) - (`Sanitized, Bot) + (`Sanitized, S.Bot) | (`Clean | `None | `Tainted _) as sub_xtaint -> let xtaint', shape = (* THINK: Should we just use 'Sig.find_in_shape' directly here ? We have the 'sub_shape' available. *) match Lval_env.find_lval lval_env lval with - | None -> (`None, Bot) + | None -> (`None, S.Bot) | Some (Cell (xtaint', shape)) -> (xtaint', shape) in let xtaint' = @@ -1147,7 +1150,7 @@ and check_tainted_lval_offset env offset = (* Test whether an expression is tainted, and if it is also a sink, * report the finding too (by side effect). *) -and check_tainted_expr env exp : Taints.t * shape * Lval_env.t = +and check_tainted_expr env exp : Taints.t * S.shape * Lval_env.t = let check env = check_tainted_expr env in let check_subexpr exp = match exp.e with @@ -1155,20 +1158,20 @@ and check_tainted_expr env exp : Taints.t * shape * Lval_env.t = (* TODO: 'Fetch' is handled specially, this case should not never be taken. *) | Literal _ | FixmeExp (_, _, None) -> - (Taints.empty, Bot, env.lval_env) + (Taints.empty, S.Bot, env.lval_env) | FixmeExp (_, _, Some e) -> let taints, shape, lval_env = check env e in let taints = taints |> Taints.union (Shape.gather_all_taints_in_shape shape) in - (taints, Bot, lval_env) + (taints, S.Bot, lval_env) | Composite ((CTuple | CArray | CList), (_, es, _)) -> let taints_and_shapes, lval_env = map_check_expr env check es in let obj = Shape.tuple_like_obj taints_and_shapes in (Taints.empty, Obj obj, lval_env) | Composite ((CSet | Constructor _ | Regexp), (_, es, _)) -> let taints, lval_env = union_map_taints_and_vars env check es in - (taints, Bot, lval_env) + (taints, S.Bot, lval_env) | Operator ((op, _), es) -> let args_taints, all_args_taints, lval_env = check_function_call_arguments env es @@ -1237,7 +1240,7 @@ and check_tainted_expr env exp : Taints.t * shape * Lval_env.t = | G.RSA -> all_args_taints in - (op_taints, Bot, lval_env) + (op_taints, S.Bot, lval_env) | RecordOrDict fields -> (* TODO: Construct a proper record/dict shape here. *) let fields_exprs = @@ -1251,7 +1254,7 @@ and check_tainted_expr env exp : Taints.t * shape * Lval_env.t = let taints, lval_env = union_map_taints_and_vars env check fields_exprs in - (taints, Bot, lval_env) + (taints, S.Bot, lval_env) | Cast (_, e) -> check env e in match exp_is_sanitized env exp with @@ -1315,7 +1318,7 @@ and check_function_call_arguments env args = let all_args_taints = List.fold_left Taints.union Taints.empty rev_taints in (args_taints, all_args_taints, lval_env) -let check_tainted_var env (var : IL.name) : Taints.t * shape * Lval_env.t = +let check_tainted_var env (var : IL.name) : Taints.t * S.shape * Lval_env.t = let taints, shape, _sub, lval_env = check_tainted_lval env (LV.lval_of_var var) in @@ -1329,11 +1332,10 @@ let check_tainted_var env (var : IL.name) : Taints.t * shape * Lval_env.t = input into the function body, from the calling context? *) let check_function_call env fun_exp args - (args_taints : (Taints.t * shape) argument list) : - (Taints.t * shape * Lval_env.t) option = - match (!hook_function_taint_signature, fun_exp) with - | Some hook, { e = Fetch _f; eorig = SameAs eorig } -> - let* fparams, fun_sig = hook env.config eorig in + (args_taints : (Taints.t * S.shape) argument list) : + (Taints.t * S.shape * Lval_env.t) option = + match lookup_signature env fun_exp with + | Some (fparams, fun_sig) -> Log.debug (fun m -> m ~tags:sigs_tag "Call to %s : %s" (Display_IL.string_of_exp fun_exp) @@ -1344,28 +1346,39 @@ let check_function_call env fun_exp args in let* call_effects = Sig_inst.instantiate_function_signature env.lval_env ~check_lval fparams - fun_sig fun_exp eorig args args_taints + fun_sig ~callee:fun_exp ~args:(Some args) args_taints in Some (call_effects |> List.fold_left - (fun (taints_acc, shape_acc, lval_env) effect -> - match effect with - | `ToSink (incoming_taints, sink) -> + (fun (taints_acc, shape_acc, lval_env) + (call_effect : Sig_inst.call_effect) -> + match call_effect with + | ToSink + { + taints_with_precondition = incoming_taints, _requires; + sink; + _; + } -> effects_of_tainted_sink env incoming_taints sink |> report_effects env; (taints_acc, shape_acc, lval_env) - | `ToReturn (taints, shape, control_taints, _return_tok) -> + | ToReturn + { + data_taints = taints; + data_shape = shape; + control_taints; + return_tok = _; + } -> ( Taints.union taints taints_acc, Shape.unify_shape shape shape_acc, Lval_env.add_control_taints lval_env control_taints ) - | `ToLval (taints, lval) -> + | ToLval (taints, lval) -> (taints_acc, shape_acc, lval_env |> Lval_env.add lval taints)) (Taints.empty, Bot, env.lval_env)) - | None, _ - | Some _, _ -> + | None -> Log.debug (fun m -> - m ~tags:sigs_tag "Call to %s : NO SIGNATURE" + m ~tags:sigs_tag "Call to %s : NO SIGNATURE !!!!" (Display_IL.string_of_exp fun_exp)); None @@ -1387,7 +1400,7 @@ let check_function_call_callee env e = (* Test whether an instruction is tainted, and if it is also a sink, * report the effect too (by side effect). *) -let check_tainted_instr env instr : Taints.t * shape * Lval_env.t = +let check_tainted_instr env instr : Taints.t * S.shape * Lval_env.t = let check_expr env = check_tainted_expr env in let check_instr = function | Assign (_, e) -> @@ -1405,7 +1418,7 @@ let check_tainted_instr env instr : Taints.t * shape * Lval_env.t = all_args_taints |> Taints.union (gather_all_taints_in_args_taints args_taints) in - let e_obj, e_taints, _e_shape, lval_env = + let e_obj, e_taints, _e_shape_TODO, lval_env = check_function_call_callee { env with lval_env } e in (* NOTE(sink_has_focus): @@ -1426,11 +1439,10 @@ let check_tainted_instr env instr : Taints.t * shape * Lval_env.t = | Some (call_taints, shape, lval_env) -> (* THINK: For debugging, we could print a diff of the previous and new lval_env *) Log.debug (fun m -> - m ~tags:sigs_tag - "Instantiating taint signature of %s: returns %s & %s" + m ~tags:sigs_tag "- Instantiating %s: returns %s & %s" (Display_IL.string_of_exp e) (T.show_taints call_taints) - (show_shape shape)); + (S.show_shape shape)); (call_taints, shape, lval_env) | None -> ( let call_taints = @@ -1558,7 +1570,7 @@ let check_tainted_instr env instr : Taints.t * shape * Lval_env.t = (* Test whether a `return' is tainted, and if it is also a sink, * report the effect too (by side effect). *) -let check_tainted_return env tok e : Taints.t * shape * Lval_env.t = +let check_tainted_return env tok e : Taints.t * S.shape * Lval_env.t = let sinks = any_is_best_sink env (G.Tk tok) @ orig_is_best_sink env e.eorig |> List.filter (TM.is_best_match env.best_matches) @@ -1604,8 +1616,7 @@ let effects_from_arg_updates_at_exit enter_env exit_env : Effect.t list = let new_taints = Taints.diff exit_taints enter_taints in (* TODO: Also report if taints are _cleaned_. *) if not (Taints.is_empty new_taints) then - Some - (Effect.ToLval (new_taints |> Taints.elements, lval)) + Some (Effect.ToLval (new_taints, lval)) else None))) |> Seq.concat |> List.of_seq diff --git a/src/tainting/Shape_and_sig.ml b/src/tainting/Shape_and_sig.ml index cfe0caf19a82..55a930ac6d5f 100644 --- a/src/tainting/Shape_and_sig.ml +++ b/src/tainting/Shape_and_sig.ml @@ -33,7 +33,7 @@ end) * associated with its fields and indexes. * * Taint shapes are a bit like types. Right now this is mainly to support - * field- and index-sensitivity, but hapes also provide a good foundation to + * field- and index-sensitivity, but shapes also provide a good foundation to * later add alias analysis. This is somewhat inspired by * * "Polymorphic type, region and effect inference" @@ -249,10 +249,10 @@ and Effect : sig } type taints_to_return = { - data_taints : Taint.taint list; + data_taints : Taint.taints; (** The taints of the data being returned (typical data propagated via data flow). *) data_shape : Shape.shape; (** The shape of the data being returned. *) - control_taints : Taint.taint list; + control_taints : Taint.taints; (** The taints propagated via the control flow (cf., `control: true` sources) * used for reachability queries. *) return_tok : AST_generic.tok; @@ -299,7 +299,7 @@ and Effect : sig * * ToReturn(["taint"], Bot, ...) *) - | ToLval of Taint.taint list * Taint.lval + | ToLval of Taint.taints * Taint.lval (** Taints reach an l-value in the scope of the function/method. * * For example: @@ -319,7 +319,13 @@ and Effect : sig val compare : t -> t -> int val show : t -> string + + (* Mainly for debugging *) + val show_taints_to_sink : taints_to_sink -> string + val show_taints_to_return : taints_to_return -> string end = struct + module Taints = Taint.Taint_set + type sink = { pm : Pattern_match.t; rule_sink : R.taint_sink } type taint_to_sink_item = { taint : T.taint; sink_trace : unit T.call_trace } @@ -336,16 +342,16 @@ end = struct } type taints_to_return = { - data_taints : Taint.taint list; + data_taints : Taint.taints; data_shape : Shape.shape; - control_taints : Taint.taint list; + control_taints : Taint.taints; return_tok : AST_generic.tok; } type t = | ToSink of taints_to_sink | ToReturn of taints_to_return - | ToLval of T.taint list * T.lval (* TODO: CleanArg ? *) + | ToLval of T.taints * T.lval (* TODO: CleanArg ? *) (*************************************) (* Comparison *) @@ -395,10 +401,10 @@ end = struct control_taints = control_taints2; return_tok = _; } = - match List.compare T.compare_taint data_taints1 data_taints2 with + match Taints.compare data_taints1 data_taints2 with | 0 -> ( match Shape.compare_shape data_shape1 data_shape2 with - | 0 -> List.compare T.compare_taint control_taints1 control_taints2 + | 0 -> Taints.compare control_taints1 control_taints2 | other -> other) | other -> other @@ -407,7 +413,7 @@ end = struct | ToSink tts1, ToSink tts2 -> compare_taints_to_sink tts1 tts2 | ToReturn ttr1, ToReturn ttr2 -> compare_taints_to_return ttr1 ttr2 | ToLval (ts1, lv1), ToLval (ts2, lv2) -> ( - match List.compare T.compare_taint ts1 ts2 with + match Taints.compare ts1 ts2 with | 0 -> T.compare_lval lv1 lv2 | other -> other) | ToSink _, (ToReturn _ | ToLval _) -> -1 @@ -445,17 +451,18 @@ end = struct let show_taints_to_sink { taints_with_precondition = taints, _; sink; _ } = Common.spf "%s ~~~> %s" (show_taints_and_traces taints) (show_sink sink) + let show_taints_to_return + { data_taints; data_shape; control_taints; return_tok = _ } = + Printf.sprintf "return (%s & %s & CTRL:%s)" + (T.show_taints data_taints) + (Shape.show_shape data_shape) + (T.show_taints control_taints) + let show = function - | ToSink x -> show_taints_to_sink x - | ToReturn { data_taints; data_shape; control_taints; return_tok = _ } -> - Printf.sprintf "return (%s & %s & CTRL:%s)" - (Common2.string_of_list T.show_taint data_taints) - (Shape.show_shape data_shape) - (Common2.string_of_list T.show_taint control_taints) + | ToSink tts -> show_taints_to_sink tts + | ToReturn ttr -> show_taints_to_return ttr | ToLval (taints, lval) -> - Printf.sprintf "%s ----> %s" - (Common2.string_of_list T.show_taint taints) - (T.show_lval lval) + Printf.sprintf "%s ----> %s" (T.show_taints taints) (T.show_lval lval) end (** A (polymorphic) taint signature: simply a set of results for a function. diff --git a/src/tainting/Sig_inst.ml b/src/tainting/Sig_inst.ml index 02d7da160968..2b91174396b7 100644 --- a/src/tainting/Sig_inst.ml +++ b/src/tainting/Sig_inst.ml @@ -28,11 +28,65 @@ module Lval_env = Taint_lval_env let sigs_tag = Log_tainting.sigs_tag let bad_tag = Log_tainting.bad_tag +(*****************************************************************************) +(* Call effets *) +(*****************************************************************************) + +type call_effect = + | ToSink of Effect.taints_to_sink + | ToReturn of Effect.taints_to_return + | ToLval of Taint.taints * IL.lval + +type call_effects = call_effect list + +let show_call_effect = function + | ToSink tts -> Effect.show_taints_to_sink tts + | ToReturn ttr -> Effect.show_taints_to_return ttr + | ToLval (taints, lval) -> + Printf.sprintf "%s ----> %s" (T.show_taints taints) + (Display_IL.string_of_lval lval) + +let show_call_effects call_effects = + call_effects |> List_.map show_call_effect |> String.concat "; " + +(*****************************************************************************) +(* Instantiation "config" *) +(*****************************************************************************) + +type inst_var = { + inst_lval : T.lval -> (Taints.t * shape) option; + (** How to instantiate a 'Taint.lval', aka "data taint variable". *) + inst_ctrl : unit -> Taints.t; + (** How to instantiate a 'Taint.Control', aka "control taint variable". *) +} + +(* TODO: Right now this is only for source traces, not for sink traces... + * In fact, we should probably not have two traces but just one, but more + * general. *) +type inst_trace = { + add_call_to_trace_for_src : + Tok.t list -> + Rule.taint_source T.call_trace -> + Rule.taint_source T.call_trace option; + (** For sources we extend the call trace. *) + fix_token_trace_for_var : var_tokens:Tok.t list -> Tok.t list -> Tok.t list; + (** For variables we should too, but due to limitations in our call-trace + * representation, we just record the path as tainted tokens. *) +} + +(*****************************************************************************) +(* Helpers *) +(*****************************************************************************) + let ( let+ ) x f = match x with | None -> [] | Some x -> f x +(*****************************************************************************) +(* Instantiating traces *) +(*****************************************************************************) + (* Try to get an idnetifier from a callee/function expression, to be used in * a taint trace. *) let get_ident_of_callee callee = @@ -47,75 +101,158 @@ let get_ident_of_callee callee = | __else__ -> None) | __else__ -> None -(* TODO: Move to 'Taint' module ? *) -let subst_in_precondition ~inst_lval ~inst_ctrl taint = +let add_call_to_trace_if_callee_has_eorig ~callee tainted_tokens call_trace = + (* E.g. (ToReturn) the call to 'bar' in: + * + * 1 def bar(): + * 2 x = taint + * 3 return x + * 4 + * 5 def foo(): + * 6 y = bar() + * 7 sink(y) + * + * would result in this call trace for the source: + * + * Call('bar' @l.6, ["x" @l.2], "taint" @l.2) + * + * E.g. (ToLval) the call to 'bar' in: + * + * 1 s = set([]) + * 2 + * 3 def bar(): + * 4 global s + * 5 s.add(taint) + * 6 + * 7 def foo(): + * 8 global s + * 9 bar() + * 10 sink(s) + * + * would result in this call trace for the source: + * + * Call('bar' @l.6, ["s" @l.5], "taint" @l.5) + *) + match callee with + | { IL.e = _; eorig = SameAs orig_callee } -> + Some (T.Call (orig_callee, tainted_tokens, call_trace)) + | __else__ -> + (* TODO: Have a better fallback in case we can't get an eorig from 'callee', + * maybe for that we need to change `Taint.Call` to accept a token. *) + None + +let add_call_to_token_trace ~callee ~var_tokens caller_tokens = + (* E.g. (ToReturn) the call to 'bar' in: + * + * 1 def bar(x): + * 2 y = x + * 3 return y + * 4 + * 5 def foo(): + * 6 t = bar(taint) + * 7 ... + * + * would result in this list of tokens (note that is reversed): + * + * ["t" @l.6; "y" @l.2; "x" @l.1; "bar" @l.6] + * + * This is a hack we use because taint traces aren't general enough, + * this should be represented with a call trace. + *) + let call_tokens = + (match get_ident_of_callee callee with + | None -> [] + | Some ident -> [ snd ident ]) + @ List.rev var_tokens + in + List.rev_append call_tokens caller_tokens + +let add_lval_update_to_token_trace ~callee:_TODO lval_tok ~var_tokens + caller_tokens = + (* E.g. (ToLval) the call to 'bar' in: + * + * 1 s = set([]) + * 2 + * 3 def bar(x): + * 4 global s + * 5 s.add(x) + * 6 + * 7 def foo(): + * 8 global s + * 9 t = taint + * 10 bar(t) + * 11 sink(s) + * + * would result in this list of tokens (note that is reversed): + * + * ["s" @l.5; "s" @l.5; "x" @l.3; "s" @l.5; "bar" @l.10; "t" @l.9] + * + * This is a hack we use because taint traces aren't general enough, + * this should be represented with a call trace. + *) + let call_tokens = + (* TODO: Use `get_ident_of_callee callee` to add the callee to the trace. *) + lval_tok :: List.rev var_tokens + in + List.rev_append call_tokens caller_tokens + +(*****************************************************************************) +(* Instatiation *) +(*****************************************************************************) + +let subst_in_precondition inst_var taint = let subst taints = taints |> List.concat_map (fun t -> match t.T.orig with | Src _ -> [ t ] | Var lval -> ( - match inst_lval lval with + match inst_var.inst_lval lval with | None -> [] - | Some (var_taints, _var_shape) -> var_taints |> Taints.elements) + | Some (call_taints, _call_shape) -> + call_taints |> Taints.elements) | Shape_var lval -> ( - match inst_lval lval with + match inst_var.inst_lval lval with | None -> [] - | Some (_var_taints, var_shape) -> - Shape.gather_all_taints_in_shape var_shape |> Taints.elements - ) - | Control -> inst_ctrl () |> Taints.elements) + | Some (_call_taints, call_shape) -> + (* Taint shape-variable, stands for the taints reachable + * through the shape of the 'lval', it's like a delayed + * call to 'Shape.gather_all_taints_in_shape'. *) + Shape.gather_all_taints_in_shape call_shape + |> Taints.elements) + | Control -> inst_var.inst_ctrl () |> Taints.elements) in T.map_preconditions subst taint -let instantiate_taint_var ~inst_lval ~inst_ctrl taint = +let instantiate_taint_var inst_var taint = match taint.T.orig with | Src _ -> None - | Var lval -> inst_lval lval + | Var lval -> inst_var.inst_lval lval | Shape_var lval -> (* This is just a delayed 'gather_all_taints_in_shape'. *) let* taints = - inst_lval lval + inst_var.inst_lval lval |> Option.map (fun (_taints, shape) -> Shape.gather_all_taints_in_shape shape) in Some (taints, Bot) | Control -> (* 'Control' is pretty much like a taint variable so we handle all together. *) - Some (inst_ctrl (), Bot) + Some (inst_var.inst_ctrl (), Bot) -let instantiate_taint ~callee ~inst_lval ~inst_ctrl taint = - let inst_taint_var taint = - instantiate_taint_var ~inst_lval ~inst_ctrl taint - in +let instantiate_taint inst_var inst_trace taint = + let inst_taint_var taint = instantiate_taint_var inst_var taint in match taint.T.orig with | Src src -> ( let taint = - (* Update taint trace. - * - * E.g. the call to 'bar' in: - * - * 1 def bar(): - * 2 x = taint - * 3 return x - * 4 - * 5 def foo(): - * 6 bar() - * 7 ... - * - * would result in this call trace: - * - * Call('bar' @l.6, ["x" @l.2], "taint" @l.2) - *) - match callee with - | { IL.e = _; eorig = SameAs orig_callee } -> - let call_trace = - T.Call (orig_callee, taint.tokens, src.call_trace) - in + match + inst_trace.add_call_to_trace_for_src taint.tokens src.call_trace + with + | Some call_trace -> { T.orig = Src { src with call_trace }; tokens = [] } - | __else__ -> taint + | None -> taint in - match subst_in_precondition ~inst_lval ~inst_ctrl taint with + match subst_in_precondition inst_var taint with | None -> (* substitution made preconditon false, so no taint here! *) Taints.empty @@ -126,49 +263,25 @@ let instantiate_taint ~callee ~inst_lval ~inst_ctrl taint = | Control -> ( match inst_taint_var taint with | None -> Taints.empty - | Some (var_taints, _var_shape) -> - (* Update taint trace. - * - * E.g. the call to 'bar' in: - * - * 1 def bar(x): - * 2 y = x - * 3 return y - * 4 - * 5 def foo(): - * 6 t = bar(taint) - * 7 ... - * - * would result in this list of tokens (note that is reversed): - * - * ["t" @l.6; "y" @l.2; "x" @l.1; "bar" @l.6] - * - * This is a hack we use because taint traces aren't general enough, - * this should be represented with a call trace. - *) - let extra_tokens = - (match get_ident_of_callee callee with - | None -> [] - | Some ident -> [ snd ident ]) - @ List.rev taint.tokens - in - var_taints + | Some (call_taints, _Bot_shape) -> + call_taints |> Taints.map (fun taint' -> { taint' with - tokens = List.rev_append extra_tokens taint'.tokens; + tokens = + inst_trace.fix_token_trace_for_var ~var_tokens:taint.tokens + taint'.tokens; })) -let instantiate_taints ~callee ~inst_lval ~inst_ctrl taints = - taints |> Taints.elements - |> List.fold_left +let instantiate_taints inst_var inst_trace taints = + taints |> Taints.to_seq + |> Seq.fold_left (fun acc taint -> - acc - |> Taints.union (instantiate_taint ~callee ~inst_lval ~inst_ctrl taint)) + acc |> Taints.union (instantiate_taint inst_var inst_trace taint)) Taints.empty -let instantiate_shape ~callee ~inst_lval ~inst_ctrl shape = - let inst_taints = instantiate_taints ~callee ~inst_lval ~inst_ctrl in +let instantiate_shape inst_var inst_trace shape = + let inst_taints = instantiate_taints inst_var inst_trace in let rec inst_shape = function | Bot -> Bot | Obj obj -> @@ -181,7 +294,7 @@ let instantiate_shape ~callee ~inst_lval ~inst_ctrl shape = in if Fields.is_empty obj then Bot else Obj obj | Arg arg -> ( - match inst_lval (T.lval_of_arg arg) with + match inst_var.inst_lval (T.lval_of_arg arg) with | Some (_taints, shape) -> shape | None -> Log.warn (fun m -> @@ -501,18 +614,30 @@ let taints_of_sig_lval lval_env ~check_lval fparams fun_exp args_exps (args_taints : (Taints.t * shape) IL.argument list) (sig_lval : T.lval) = match taints_of_lval lval_env fparams fun_exp args_taints sig_lval with | Some (taints, shape) -> Some (taints, shape) - | None -> - (* We want to know what's the taint carried by 'arg_exp.x1. ... .xN'. - * TODO: We should not need this when we cover everything with shapes, - * see 'lval_of_sig_lval'. - *) - let* lval, _obj = lval_of_sig_lval fun_exp fparams args_exps sig_lval in - let lval_taints, shape = check_lval lval in - let lval_taints = - lval_taints - |> fix_lval_taints_if_global_or_a_field_of_this_class fun_exp sig_lval - in - Some (lval_taints, shape) + | None -> ( + match args_exps with + | None -> + Log.warn (fun m -> + m + "Cannot find the taint&shape of %s because we lack the actual \ + arguments" + (T.show_lval sig_lval)); + None + | Some args_exps -> + (* We want to know what's the taint carried by 'arg_exp.x1. ... .xN'. + * TODO: We should not need this when we cover everything with shapes, + * see 'lval_of_sig_lval'. + *) + let* lval, _obj = + lval_of_sig_lval fun_exp fparams args_exps sig_lval + in + let lval_taints, shape = check_lval lval in + let lval_taints = + lval_taints + |> fix_lval_taints_if_global_or_a_field_of_this_class fun_exp + sig_lval + in + Some (lval_taints, shape)) (* This function is consuming the taint signature of a function to determine a few things: @@ -521,26 +646,27 @@ let taints_of_sig_lval lval_env ~check_lval fparams fun_exp args_exps 2) Are there any effects that occur within the function due to taints being input into the function body, from the calling context? *) -let instantiate_function_signature lval_env ~check_lval fparams fun_sig fun_exp - fun_eorig args (args_taints : (Taints.t * shape) IL.argument list) : _ = - (* This function simply produces the corresponding taints to the - given argument, within the body of the function. - *) - (* Our first pass will be to substitute the args for taints. - We can't do this indiscriminately at the beginning, because - we might need to use some of the information of the pre-substitution - taints and the post-substitution taints, for instance the tokens. - - So we will isolate this as a specific step to be applied as necessary. - *) +let instantiate_function_signature lval_env ~check_lval fparams + (fun_sig : Signature.t) ~callee ~(args : _ option) + (args_taints : (Taints.t * shape) IL.argument list) : call_effects option = let lval_to_taints lval = + (* This function simply produces the corresponding taints to the + given argument, within the body of the function. + *) + (* Our first pass will be to substitute the args for taints. + We can't do this indiscriminately at the beginning, because + we might need to use some of the information of the pre-substitution + taints and the post-substitution taints, for instance the tokens. + + So we will isolate this as a specific step to be applied as necessary. + *) let opt_taints_shape = - taints_of_sig_lval lval_env ~check_lval fparams fun_exp args args_taints + taints_of_sig_lval lval_env ~check_lval fparams callee args args_taints lval in Log.debug (fun m -> - m ~tags:sigs_tag "Instantiating taint signature of %s: %s -> %s" - (Display_IL.string_of_exp fun_exp) + m ~tags:sigs_tag "- Instantiating %s: %s -> %s" + (Display_IL.string_of_exp callee) (T.show_lval lval) (match opt_taints_shape with | None -> "nothing :/" @@ -548,34 +674,24 @@ let instantiate_function_signature lval_env ~check_lval fparams fun_sig fun_exp spf "%s & %s" (T.show_taints taints) (show_shape shape))); opt_taints_shape in + (* Instantiation helpers *) let taints_in_ctrl () = Lval_env.get_control_taints lval_env in - let inst_taint_var taint = - instantiate_taint_var ~inst_lval:lval_to_taints ~inst_ctrl:taints_in_ctrl - taint + let inst_var = { inst_lval = lval_to_taints; inst_ctrl = taints_in_ctrl } in + let inst_taint_var taint = instantiate_taint_var inst_var taint in + let subst_in_precondition = subst_in_precondition inst_var in + let inst_trace = + { + add_call_to_trace_for_src = add_call_to_trace_if_callee_has_eorig ~callee; + fix_token_trace_for_var = add_call_to_token_trace ~callee; + } in - let subst_in_precondition = - subst_in_precondition ~inst_lval:lval_to_taints ~inst_ctrl:taints_in_ctrl - in - let process_sig : Effect.t -> _ list = function + let inst_taints taints = instantiate_taints inst_var inst_trace taints in + let inst_shape shape = instantiate_shape inst_var inst_trace shape in + (* Instatiate effects *) + let inst_effect : Effect.t -> call_effect list = function | Effect.ToReturn { data_taints; data_shape; control_taints; return_tok } -> - let inst_taints taints = - taints - |> List.fold_left - (fun return_taints (t : T.taint) -> - let taints' = - (* TODO: Use 'Taint_inst.instantiate_taint' also for 'ToSink' and - 'ToLval' cases below. *) - instantiate_taint ~callee:fun_exp ~inst_lval:lval_to_taints - ~inst_ctrl:taints_in_ctrl t - in - return_taints |> Taints.union taints') - Taints.empty - in let data_taints = inst_taints data_taints in - let data_shape = - instantiate_shape ~callee:fun_exp ~inst_lval:lval_to_taints - ~inst_ctrl:taints_in_ctrl data_shape - in + let data_shape = inst_shape data_shape in let control_taints = (* No need to instantiate 'control_taints' because control taint variables * do not propagate through function calls... BUT instantiation also fixes @@ -585,12 +701,15 @@ let instantiate_function_signature lval_env ~check_lval fparams fun_sig fun_exp if Shape.taints_and_shape_are_relevant data_taints data_shape || not (Taints.is_empty control_taints) - then [ `ToReturn (data_taints, data_shape, control_taints, return_tok) ] + then + [ ToReturn { data_taints; data_shape; control_taints; return_tok } ] else [] - | Effect.ToSink { taints_with_precondition = taints, _requires; sink; _ } -> - let incoming_taints = + | Effect.ToSink + { taints_with_precondition = taints, requires; sink; merged_env } -> + let taints = taints |> List.concat_map (fun { Effect.taint; sink_trace } -> + (* TODO: Use 'instantiate_taint' here too (note differences wrt the call trace). *) match taint.T.orig with | T.Src _ -> (* Here, we do not modify the call trace or the taint. @@ -628,20 +747,30 @@ let instantiate_function_signature lval_env ~check_lval fparams fun_sig fun_exp | Shape_var _ | Control -> let sink_trace = - T.Call (fun_eorig, taint.tokens, sink_trace) + add_call_to_trace_if_callee_has_eorig ~callee + taint.tokens sink_trace + ||| sink_trace in - let+ var_taints, var_shape = inst_taint_var taint in + let+ call_taints, call_shape = inst_taint_var taint in (* See NOTE(gather-all-taints) *) - let var_taints = - var_taints + let call_taints = + call_taints |> Taints.union - (Shape.gather_all_taints_in_shape var_shape) + (Shape.gather_all_taints_in_shape call_shape) in - Taints.elements var_taints + Taints.elements call_taints |> List_.map (fun x -> { Effect.taint = x; sink_trace })) in - if List_.null incoming_taints then [] - else [ `ToSink (incoming_taints, sink) ] + if List_.null taints then [] + else + [ + ToSink + { + taints_with_precondition = (taints, requires); + sink; + merged_env; + }; + ] | Effect.ToLval (taints, dst_sig_lval) -> (* Taints 'taints' go into an argument of the call, by side-effect. * Right now this is mainly used to track taint going into specific @@ -649,42 +778,39 @@ let instantiate_function_signature lval_env ~check_lval fparams fun_sig fun_exp let+ dst_lval, tainted_tok = (* 'dst_lval' is the actual argument/l-value that corresponds * to the formal argument 'dst_sig_lval'. *) - lval_of_sig_lval fun_exp fparams args dst_sig_lval + match args with + | None -> + Log.warn (fun m -> + m + "Cannot instantiate '%s' because we lack the actual \ + arguments" + (T.show_lval dst_sig_lval)); + None + | Some args -> lval_of_sig_lval callee fparams args dst_sig_lval in - taints - |> List.concat_map (fun t -> - let dst_taints = - match t.T.orig with - | Src src -> ( - let call_trace = - T.Call (fun_eorig, t.tokens, src.call_trace) - in - let t = - { Taint.orig = Src { src with call_trace }; tokens = [] } - in - match t |> subst_in_precondition with - | None -> Taints.empty - | Some t -> Taints.singleton t) - | Var _ - | Shape_var _ -> ( - (* Taint is flowing from one argument to another argument - * (or possibly the callee object). Given the formal poly - * taint 'src_lval', we compute the actual taint in the - * context of this function call. *) - match inst_taint_var t with - | None -> Taints.empty - | Some (res, _TODOshape) -> - res - |> Taints.map (fun taint -> - let tokens = - t.tokens @ (tainted_tok :: taint.T.tokens) - in - { taint with tokens })) - | Control -> - (* control taints do not propagate to arguments *) - Taints.empty - in - if Taints.is_empty dst_taints then [] - else [ `ToLval (dst_taints, dst_lval) ]) + let taints = + taints + |> instantiate_taints + { + inst_lval = lval_to_taints; + (* Note that control taints do not propagate to l-values. *) + inst_ctrl = (fun _ -> Taints.empty); + } + { + add_call_to_trace_for_src = + add_call_to_trace_if_callee_has_eorig ~callee; + fix_token_trace_for_var = + add_lval_update_to_token_trace ~callee tainted_tok; + } + in + + if Taints.is_empty taints then [] else [ ToLval (taints, dst_lval) ] + in + let call_effects = + fun_sig |> Signature.elements |> List.concat_map inst_effect in - Some (fun_sig |> Signature.elements |> List.concat_map process_sig) + Log.debug (fun m -> + m ~tags:sigs_tag "Instantiated call to %s: %s" + (Display_IL.string_of_exp callee) + (show_call_effects call_effects)); + Some call_effects diff --git a/src/tainting/Sig_inst.mli b/src/tainting/Sig_inst.mli index de80dbf7a53a..d456673fd8d9 100644 --- a/src/tainting/Sig_inst.mli +++ b/src/tainting/Sig_inst.mli @@ -1,22 +1,28 @@ (** Instantiation of taint signatures *) +(** Like 'Shape_and_sig.Effect.t' but instantiated for a specific call site. + * In particular, there is no 'ToSinkInCall' effect, and 'ToLval' effects + * refer to specific 'IL.lval's rather than to 'Taint.lval's. *) +type call_effect = + | ToSink of Shape_and_sig.Effect.taints_to_sink + | ToReturn of Shape_and_sig.Effect.taints_to_return + | ToLval of Taint.taints * IL.lval + +type call_effects = call_effect list + val instantiate_function_signature : Taint_lval_env.t -> check_lval:(IL.lval -> Taint.Taint_set.t * Shape_and_sig.Shape.shape) -> + (* TODO: 'check_lval' is just a way to avoid a recursive dependency with + * 'Dataflow_tainting'. We should not need this when all field-sensitive + * taint tracking happens through shapes. *) AST_generic.parameters -> Shape_and_sig.Signature.t -> - IL.exp -> - AST_generic.expr -> - IL.exp IL.argument list -> + callee:IL.exp -> + args:IL.exp IL.argument list option (** actual arguments *) -> (Taint.Taint_set.t * Shape_and_sig.Shape.shape) IL.argument list -> - [ `ToSink of - Shape_and_sig.Effect.taint_to_sink_item list * Shape_and_sig.Effect.sink - | `ToReturn of - Taint.Taint_set.t * Shape_and_sig.Shape.shape * Taint.Taint_set.t * Tok.t - | `ToLval of Taint.Taint_set.t * IL.lval ] - list - option + call_effects option (** Instantiation is meant to replace the taint and shape variables in the * signature of a callee function, with the taints and shapes of the parameters - * at the call site. + * at the call site. It also constructs the call trace. *) diff --git a/src/tainting/Taint.mli b/src/tainting/Taint.mli index 52630059ea4f..64d1fafc4fcc 100644 --- a/src/tainting/Taint.mli +++ b/src/tainting/Taint.mli @@ -196,6 +196,7 @@ module Taint_set : sig val map : (taint -> taint) -> t -> t val iter : (taint -> unit) -> t -> unit val fold : (taint -> 'a -> 'a) -> t -> 'a -> 'a + val filter : (taint -> bool) -> t -> t val of_list : taint list -> t val to_seq : t -> taint Seq.t val elements : t -> taint list diff --git a/src/tainting/Taint_shape.ml b/src/tainting/Taint_shape.ml index 81af34b7a1ec..3c19cec66213 100644 --- a/src/tainting/Taint_shape.ml +++ b/src/tainting/Taint_shape.ml @@ -21,6 +21,7 @@ module T = Taint module Taints = T.Taint_set open Shape_and_sig.Shape module Fields = Shape_and_sig.Fields +module Signature = Shape_and_sig.Signature (*********************************************************) (* Helpers *) @@ -46,6 +47,11 @@ let internal_UNSAFE_find_offset_in_obj o obj = (T.show_offset o)); (Oany, obj)) +let debug_offset offset = + match offset with + | [] -> "" + | _ :: _ -> offset |> List_.map T.show_offset |> String.concat "" + (*********************************************************) (* Misc *) (*********************************************************) @@ -94,11 +100,9 @@ and unify_shape shape1 shape2 = match (shape1, shape2) with | Bot, shape | shape, Bot -> + (* 'Bot' acts like a do-not-care. *) shape | Obj obj1, Obj obj2 -> Obj (unify_obj obj1 obj2) - | Arg _, (Obj _ as obj) - | (Obj _ as obj), Arg _ -> - obj | Arg arg1, Arg arg2 -> if T.equal_arg arg1 arg2 then shape1 else ( @@ -117,6 +121,10 @@ and unify_shape shape1 shape2 = m "Trying to unify two different arg shapes: %s ~ %s" (T.show_arg arg1) (T.show_arg arg2)); shape1) + (* 'Arg' acts like a shape variable. *) + | Arg _, (Obj _ as obj) + | (Obj _ as obj), Arg _ -> + obj and unify_obj obj1 obj2 = (* THINK: Apply taint_MAX_OBJ_FIELDS limit ? *) @@ -162,12 +170,16 @@ let rec find_in_cell offset cell = | [] -> Some cell | _ :: _ -> find_in_shape offset shape -and find_in_shape offset = function +and find_in_shape offset shape = + match shape with (* offset <> [] *) | Bot -> None | Obj obj -> find_in_obj offset obj | Arg _ -> (* TODO: Here we should "refine" the arg shape, it should be an Obj shape. *) + Log.warn (fun m -> + m "Could not find offset %s in polymorphic shape %s" + (debug_offset offset) (show_shape shape)); None and find_in_obj (offset : T.offset list) obj = @@ -222,7 +234,8 @@ let rec update_offset_in_cell ~f offset cell = | `Tainted _, (Bot | Obj _ | Arg _) -> Some (Cell (xtaint, shape)) -and update_offset_in_shape ~f offset = function +and update_offset_in_shape ~f offset shape = + match shape with | Bot | Arg _ -> let shape = Obj Fields.empty in @@ -317,7 +330,8 @@ let rec clean_cell (offset : T.offset list) cell = let shape = clean_shape offset shape in Cell (xtaint, shape) -and clean_shape offset = function +and clean_shape offset shape = + match shape with | Bot | Arg _ -> let shape = Obj Fields.empty in