diff --git a/.gitignore b/.gitignore
index f889bfb8c..8d79fa9b0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,23 @@
+# General site
 **/merge.log
-.kaybee
 */cpu.prof
 *.pyc
 *git-cache*
+*.log
+*.log.*
+**/cov_html
+.coverage
+similarities.csv
+
+# Virtual environment
+**/.venv/
+
+# VSCode Settings
+**/.vscode/
+
+# Regarding KB
+.kaybee
 kaybee/internal/repository/profile001.pdf
 kaybee/internal/repository/repository.test
 kaybee/internal/tasks/.kaybee
@@ -12,10 +26,9 @@ kaybee/internal/tasks/profile001.pdf
 kaybee/internal/tasks/tasks.test
 kaybee/internal/repository/cpu.prof
 kaybee/kaybee.code-workspace
-.vscode/launch.json
-.vscode/task.code-snippets
 kaybee/coverage.out
 kaybee/kaybee
+kaybee/internal/reconcile/debug.test
 kaybee/internal/.kaybee/**/*
 kaybee/dist/**
 kaybee/kaybeeconf.yaml
@@ -25,9 +38,9 @@ kaybee/steady.sh
 kaybee/kaybeeconf-custom.yaml
 kaybee/kaybee-new-statements
 kaybee/pkged.go
-*.log
-*.log.*
 kaybeeconf.yaml
+
+# Regarding Prospector
 prospector/.env
 prospector/workspace.code-workspace
 prospector/disabled_tests/skip_test-commits.db
@@ -35,40 +48,19 @@ prospector/disabled_tests/skip_test-vulnerabilities.db
 prospector/tracer_dataset_final_2
 prospector/results
 prospector/*.py
-prospector/.vscode/launch.json
-prospector/.vscode/settings.json
 prospector/install_fastext.sh
-prospector/nvd.ipynb
-prospector/data/nvd.pkl
-prospector/data/nvd.csv
-prospector/data_sources/reports
-.vscode/settings.json
 prospector/cov_html/*
-prospector/client/cli/cov_html/*
 prospector/config.yaml
-prospector/client/web/node-app/node_modules
 prospector/.coverage.*
 prospector/.coverage
-**/cov_html
 prospector/cov_html
-.coverage
-prospector/.venv
 prospector/prospector.code-workspace
 prospector/requests-cache.sqlite
 prospector/prospector-report.html
 prospector/test_report.html
 prospector/test_report.json
 prospector/.idea/*
-similarities.csv
 prospector/*.html
 prospector/*.json
-requests-cache.sqlite
-prospector/output.png
-prospector/output.pstats
-prospector/kaybee-new-statements
-prospector/run.sh
-prospector/cve_data
 prospector/evaluation
-.DS_Store
-kaybee/internal/reconcile/debug.test
-prospector/client/web/node-app/build
+.DS_Store
\ No newline at end of file
diff --git a/prospector/cli/main.py b/prospector/cli/main.py
index 633754cdb..41b5661c8 100644
--- a/prospector/cli/main.py
+++ b/prospector/cli/main.py
@@ -107,7 +107,13 @@ def main(argv):  # noqa: C901
         "enabled_rules": config.enabled_rules,
     }

-    results, advisory_record = prospector(**params)
+    try:
+        results, advisory_record = prospector(**params)
+    except Exception as e:
+        ConsoleWriter.print(
+            f"Prospector could not complete successfully for {config.vuln_id}: {e}\n"
+        )
+        return

     if config.preprocess_only:
         return
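The try/except above pairs with the changes to core/prospector.py below: failure paths that used to return None now raise, and main() turns any exception into a console message for the CVE being processed. A minimal sketch of the resulting contract, with simplified names (ConsoleWriter replaced by print; the candidate limit and helper are illustrative, not the real code):

    # Sketch: prospector() raises on failure, main() reports and exits cleanly.
    def fetch_candidates(vuln_id: str):
        # Stand-in for the real candidate retrieval from the git repository.
        return ["commit"] * 5000

    def prospector(vuln_id: str, limit_candidates: int = 2000):
        candidates = fetch_candidates(vuln_id)
        if len(candidates) > limit_candidates:
            # Previously a (None, len(candidates)) return; now the caller decides.
            raise Exception(f"Candidate limit exceeded: {len(candidates)}.")
        return candidates, None

    def main(vuln_id: str = "CVE-2021-44228"):
        try:
            results, advisory_record = prospector(vuln_id)
        except Exception as e:
            print(f"Prospector could not complete successfully for {vuln_id}: {e}")
            return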
diff --git a/prospector/core/prospector.py b/prospector/core/prospector.py
index 589fb516d..dfab128b5 100644
--- a/prospector/core/prospector.py
+++ b/prospector/core/prospector.py
@@ -113,7 +113,9 @@ def prospector(  # noqa: C901
         )
         sys.exit(1)

-    fixing_commit = advisory_record.get_fixing_commit()
+    commits_in_advisory_references = (
+        advisory_record.get_commits_in_advisory_references()
+    )
     # print(advisory_record.references)
     # obtain a repository object
     repository = Git(repository_url, git_cache)
@@ -131,10 +133,12 @@

     candidates: Dict[str, RawCommit] = dict()

-    if len(fixing_commit) > 0:
-        candidates = get_commits_no_tags(repository, fixing_commit)
+    if len(commits_in_advisory_references) > 0:
+        candidates = get_commits_no_tags(
+            repository, commits_in_advisory_references
+        )
         if len(candidates) > 0 and any(
-            [c for c in candidates if c in fixing_commit]
+            [c for c in candidates if c in commits_in_advisory_references]
         ):
             console.print("Fixing commit found in the advisory references\n")
             advisory_record.has_fixing_commit = True
@@ -170,10 +174,8 @@ def prospector(  # noqa: C901
                 f"Number of candidates exceeds {limit_candidates}, aborting."
             )

-            ConsoleWriter.print(
-                f"Candidates limitlimit exceeded: {len(candidates)}."
-            )
-            return None, len(candidates)
+            ConsoleWriter.print(f"Candidate limit exceeded: {len(candidates)}.")
+            raise Exception(f"Candidate limit exceeded: {len(candidates)}.")

     with ExecutionTimer(
         core_statistics.sub_collection("commit preprocessing")
@@ -228,7 +230,7 @@ def prospector(  # noqa: C901
                 elapsed_time = time.time() - start_time
                 if elapsed_time > 1800:
                     logger.error("Processing timeout")
-                    return None, len(candidates)
+                    raise Exception("Processing timeout")
         else:
             writer.print("\nAll commits found in the backend")

@@ -244,15 +246,26 @@ def prospector(  # noqa: C901
         ):
             save_or_update_processed_commits(backend_address, payload)
         else:
-            logger.warning("Preprocessed commits are not being sent to backend")
+            logger.warning(
+                "Preprocessed commits are not being sent to backend (after phase 1)"
+            )

     ranked_candidates = evaluate_commits(
-        preprocessed_commits, advisory_record, backend_address, enabled_rules
+        preprocessed_commits,
+        advisory_record,
+        use_backend,
+        backend_address,
+        enabled_rules,
     )

     # Save outcome of security relevance to DB (Phase 2 Rule)
     payload = [c.to_dict() for c in ranked_candidates[:NUM_COMMITS_PHASE_2]]
-    save_or_update_processed_commits(backend_address, payload)
+    if len(payload) > 0 and use_backend != USE_BACKEND_NEVER:
+        save_or_update_processed_commits(backend_address, payload)
+    else:
+        logger.warning(
+            "Preprocessed commits are not being sent to backend (after phase 2)"
+        )

     # ConsoleWriter.print("Commit ranking and aggregation...")
     ranked_candidates = remove_twins(ranked_candidates)
@@ -296,6 +309,7 @@ def filter(commits: Dict[str, RawCommit]) -> Dict[str, RawCommit]:
 def evaluate_commits(
     commits: List[Commit],
     advisory: AdvisoryRecord,
+    use_backend: str,
     backend_address: str,
     enabled_rules: List[str],
 ) -> List[Commit]:
@@ -316,8 +330,15 @@ def evaluate_commits(
     """
     with ExecutionTimer(core_statistics.sub_collection("candidates analysis")):
         with ConsoleWriter("Candidate analysis") as _:
+            # Pass True to the rules module if the backend is being used, False
+            # otherwise (needed to decide whether to update the database)
+            use_backend = use_backend != USE_BACKEND_NEVER
             ranked_commits = apply_rules(
-                commits, advisory, backend_address, enabled_rules=enabled_rules
+                commits,
+                advisory,
+                use_backend,
+                backend_address,
+                enabled_rules=enabled_rules,
             )

     return ranked_commits
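Note how use_backend is threaded through: prospector() hands the raw config value (a string) to evaluate_commits(), which collapses it into a boolean before calling apply_rules(), so the phase-2 rules only ever see a yes/no answer. A condensed sketch of that hand-off; the value of USE_BACKEND_NEVER is an assumption based on the constant's name, and the rule bodies are stubbed:

    # USE_BACKEND_NEVER is a module constant in the real code; "never" is assumed.
    USE_BACKEND_NEVER = "never"

    def evaluate_commits(commits, advisory, use_backend: str,
                         backend_address: str, enabled_rules=[]):
        # Collapse the string setting into the boolean the rules module needs.
        backend_enabled = use_backend != USE_BACKEND_NEVER
        return apply_rules(commits, advisory, backend_enabled,
                           backend_address, enabled_rules=enabled_rules)

    def apply_rules(commits, advisory, use_backend: bool,
                    backend_address: str, enabled_rules=[]):
        # Phase-2 rules receive use_backend and skip DB reads/writes when False.
        return commits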
"versionEndIncluding", item.get("versionStartIncluding") + ) for item in data["configurations"][0]["nodes"][0]["cpeMatch"] ], # TODO: can return to tuples "fixed": [ @@ -178,14 +182,27 @@ def parse_advisory(self, data): self.versions["affected"] = [ v for v in self.versions["affected"] if v is not None ] - self.versions["fixed"] = [v for v in self.versions["fixed"] if v is not None] + self.versions["fixed"] = [ + v for v in self.versions["fixed"] if v is not None + ] + + def get_commits_in_advisory_references(self) -> List[str]: + """Processes the advisory's references to extract commit IDs if + present. Only keeps the five most important ones. - def get_fixing_commit(self) -> List[str]: + Returns: + A list of references to a commit. + """ self.references = dict( - sorted(self.references.items(), key=lambda item: item[1], reverse=True) + sorted( + self.references.items(), key=lambda item: item[1], reverse=True + ) ) limit = 0 - while len([r for r in self.references.keys() if r.startswith("commit::")]) > 5: + while ( + len([r for r in self.references.keys() if r.startswith("commit::")]) + > 5 + ): self.references = { k: v for k, v in self.references.items() @@ -193,7 +210,12 @@ def get_fixing_commit(self) -> List[str]: } limit += 1 - return [ref.split("::")[1] for ref in self.references if "commit::" in ref] + return [ + ref.split("::")[1] + for ref in self.references + if "commit::" in ref + and ref.split("::")[1] not in ["master", "main"] + ] def search_references_debian(self) -> List[str]: url = "https://security-tracker.debian.org/tracker/" @@ -221,7 +243,9 @@ def search_references_redhat(self) -> List[str]: return [] - def extract_hashes(self, reference: str, filter: bool = False) -> str | None: + def extract_hashes( + self, reference: str, filter: bool = False + ) -> str | None: if bool(re.search(r"a=commit;", reference)): return "commit::" + re.search(r";h=(\w{6,40})", reference).group(1) @@ -258,12 +282,15 @@ def parse_advisory_2(self, details, metadata): for field, key in timestamp_fields.items(): timestamp = metadata.get(key) setattr( - self, field, int(isoparse(timestamp).timestamp()) if timestamp else None + self, + field, + int(isoparse(timestamp).timestamp()) if timestamp else None, ) if not self.description: self.description = details["descriptions"][0]["value"] self.references = defaultdict( - int, {self.extract_hashes(r["url"]): 2 for r in details["references"]} + int, + {self.extract_hashes(r["url"]): 2 for r in details["references"]}, ) @@ -290,7 +317,9 @@ def get_from_nvd(cve_id: str): headers = {"apiKey": NVD_API_KEY} if NVD_API_KEY else None params = {"cveId": cve_id} - response = requests.get(NVD_REST_ENDPOINT, headers=headers, params=params) + response = requests.get( + NVD_REST_ENDPOINT, headers=headers, params=params + ) if response.status_code != 200: return None @@ -314,7 +343,9 @@ def is_url_allowed(url: str) -> bool: return False -def get_from_local(vuln_id: str, nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOINT): +def get_from_local( + vuln_id: str, nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOINT +): try: response = requests.get(nvd_rest_endpoint + vuln_id) if response.status_code != 200: diff --git a/prospector/docker/worker/Dockerfile b/prospector/docker/worker/Dockerfile index e5421220c..fd3b9863f 100644 --- a/prospector/docker/worker/Dockerfile +++ b/prospector/docker/worker/Dockerfile @@ -73,4 +73,11 @@ VOLUME [ "/data_sources/reports" ] RUN chmod +x /usr/local/bin/start_rq_worker.sh #CMD tail -f /dev/null + +# Create directory for gitcache and run git 
diff --git a/prospector/docker/worker/Dockerfile b/prospector/docker/worker/Dockerfile
index e5421220c..fd3b9863f 100644
--- a/prospector/docker/worker/Dockerfile
+++ b/prospector/docker/worker/Dockerfile
@@ -73,4 +73,11 @@ VOLUME [ "/data_sources/reports" ]
 RUN chmod +x /usr/local/bin/start_rq_worker.sh

 #CMD tail -f /dev/null
+
+# Create directory for gitcache and run git config command to avoid 'dubious ownership' error
+RUN mkdir -p /tmp/gitcache && \
+    cd /tmp/gitcache && \
+    git config --global --add safe.directory '*'
+
+
 ENTRYPOINT ["/usr/local/bin/start_rq_worker.sh"]
diff --git a/prospector/git/git.py b/prospector/git/git.py
index ef317fc37..adf6eb853 100644
--- a/prospector/git/git.py
+++ b/prospector/git/git.py
@@ -311,6 +311,8 @@ def parse_git_output(self, raw: List[str]) -> Dict[str, RawCommit]:
         return commits

     def find_commits_for_twin_lookups(self, commit_id):
+        """Finds all relevant commits around the given commit within a time
+        window of 10 days. Search is narrowed if too many commits are found."""
         # Using both author date and commit date we should cover all cases.
         try:
             commit_timestamp_a = self.get_timestamp(commit_id, "a")
@@ -329,8 +331,8 @@ def find_commits_for_twin_lookups(self, commit_id):

             return dict()

-        except Exception:
-            logger.error("Git command failed, cannot get commits", exc_info=True)
+        except Exception as e:
+            logger.error(f"Git command failed, cannot get commits: {e}")
             return dict()
diff --git a/prospector/llm/llm_service.py b/prospector/llm/llm_service.py
index 86dbaffbc..c88e81895 100644
--- a/prospector/llm/llm_service.py
+++ b/prospector/llm/llm_service.py
@@ -116,17 +116,9 @@ def classify_commit(
         except Exception as e:
             raise RuntimeError(f"Prompt-model chain could not be invoked: {e}")

-        if is_relevant in [
-            "True",
-            "ANSWER:True",
-            "```ANSWER:True```",
-        ]:
+        if "True" in is_relevant:
             return True
-        elif is_relevant in [
-            "False",
-            "ANSWER:False",
-            "```ANSWER:False```",
-        ]:
+        elif "False" in is_relevant:
             return False
         else:
             raise RuntimeError(
diff --git a/prospector/rules/rules.py b/prospector/rules/rules.py
index 204e309b9..eb1c6a78d 100644
--- a/prospector/rules/rules.py
+++ b/prospector/rules/rules.py
@@ -57,6 +57,7 @@ def get_id(self):
 def apply_rules(
     candidates: List[Commit],
     advisory_record: AdvisoryRecord,
+    use_backend: bool,
     backend_address: str,
     enabled_rules: List[str] = [],
 ) -> List[Commit]:
@@ -95,7 +96,7 @@ def apply_rules(

     for candidate in candidates[:NUM_COMMITS_PHASE_2]:
         for rule in phase_2_rules:
-            if rule.apply(candidate, backend_address):
+            if rule.apply(candidate, use_backend, backend_address):
                 counter.increment("matches")
                 candidate.add_match(rule.as_dict())
         candidate.compute_relevance()
@@ -433,6 +434,7 @@ class CommitIsSecurityRelevant(Rule):
     def apply(
         self,
         candidate: Commit,
+        use_backend: bool,
         backend_address: str,
     ) -> bool:

@@ -441,18 +443,19 @@ def apply(
         ):
             # Check if this commit is already in the database
             try:
-                r = requests.get(
-                    f"{backend_address}/commits/{candidate.repository}",
-                    params={"commit_id": candidate.commit_id},
-                    timeout=10,
-                )
-                r.raise_for_status()
-                commit_data = r.json()[0]
-
-                is_security_relevant = commit_data.get("security_relevant")
-                if is_security_relevant is not None:
-                    candidate.security_relevant = is_security_relevant
-                    return is_security_relevant
+                if use_backend:
+                    r = requests.get(
+                        f"{backend_address}/commits/{candidate.repository}",
+                        params={"commit_id": candidate.commit_id},
+                        timeout=10,
+                    )
+                    r.raise_for_status()
+                    commit_data = r.json()[0]
+
+                    is_security_relevant = commit_data.get("security_relevant")
+                    if is_security_relevant is not None:
+                        candidate.security_relevant = is_security_relevant
+                        return is_security_relevant

                 candidate.security_relevant = LLMService().classify_commit(
                     candidate.diff, candidate.repository, candidate.message
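With the new flag, CommitIsSecurityRelevant only consults the backend cache when use_backend is True, and otherwise falls straight through to the LLM classification. Stripped of the surrounding rule machinery, the decision looks roughly like this (endpoint and parameters as in the diff; LLMService stubbed out, error handling elided):

    import requests

    def classify_with_llm(candidate) -> bool:
        # Stand-in for LLMService().classify_commit(diff, repository, message).
        return False

    def is_security_relevant(candidate, use_backend: bool, backend_address: str) -> bool:
        if use_backend:
            # Try the cached verdict first; fall through to the LLM on a miss.
            r = requests.get(
                f"{backend_address}/commits/{candidate.repository}",
                params={"commit_id": candidate.commit_id},
                timeout=10,
            )
            r.raise_for_status()
            cached = r.json()[0].get("security_relevant")
            if cached is not None:
                candidate.security_relevant = cached
                return cached
        candidate.security_relevant = classify_with_llm(candidate)
        return candidate.security_relevant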
diff --git a/prospector/util/http.py b/prospector/util/http.py
index 443100686..feac446c0 100644
--- a/prospector/util/http.py
+++ b/prospector/util/http.py
@@ -106,17 +106,13 @@ def get_from_xml(id: str):
     try:
         params = {"field": {"description", "summary", "comments"}}

-        # response = requests.get(
-        #     f"https://issues.apache.org/jira/si/jira.issueviews:issue-xml/{id}/{id}.xml",
-        #     params=params,
-        # )
-        # xml_data = BeautifulSoup(response.text, features="html.parser")
         xml_data = fetch_url(
             f"https://issues.apache.org/jira/si/jira.issueviews:issue-xml/{id}/{id}.xml",
             params=params,
+            extract_text=False,
         )
         item = xml_data.find("item")
-        if item is None:
+        if item is None or item == -1:
             return ""

         relevant_data = [
             itm.text for itm in item.findAll(["description", "summary", "comments"])
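The extra item == -1 guard matters because fetch_url presumably returns either parsed soup or plain text depending on extract_text: BeautifulSoup's find() yields None on a miss, while str.find() yields -1. A small demonstration of why the combined check covers both return types (sample XML is illustrative; requires beautifulsoup4):

    from bs4 import BeautifulSoup

    payload = "<rss><channel></channel></rss>"  # no <item> element

    for xml_data in (payload, BeautifulSoup(payload, features="html.parser")):
        item = xml_data.find("item")
        if item is None or item == -1:
            print(f"{type(xml_data).__name__}: no item found")
    # str: no item found
    # BeautifulSoup: no item found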