Merge pull request #250 from SAP/develop

upgrade to v4.9
SAP · Aug 8, 2022 · 8aaac26 · 8aaac26
2 parents 36851b1 + becbe65
commit 8aaac26
Show file tree

Hide file tree

Showing 12 changed files with 501 additions and 14 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -77,5 +77,6 @@ jobs:
  POSTGRES_DB: credential_digger_tests
  DBHOST: localhost
  DBPORT: 5432
+ GIT_TOKEN: ${{ secrets.GIT_TOKEN }}
  run: |
  pytest tests/functional_tests
diff --git a/.pep8speaks.yml b/.pep8speaks.yml
@@ -6,6 +6,7 @@ scanner:
 
 flake8: # Same as scanner.linter value
  ignore: # Errors and warnings to ignore
+ - E704 # multiple statements on one line (def)
  - W292 # no newline at the end of file (introduces W391)
  - W503 # line break before binary operator
  - W504 # line break after binary operator

diff --git a/credentialdigger/cli/cli.py b/credentialdigger/cli/cli.py
@@ -6,7 +6,7 @@
 from dotenv import load_dotenv
 
 from . import (add_rules, get_discoveries, hook, scan, scan_path,
- scan_snapshot, scan_user, scan_wiki)
+ scan_pr, scan_snapshot, scan_user, scan_wiki)
 
 logger = logging.getLogger(__name__)
 
@@ -100,6 +100,12 @@ def main(sys_argv):
  parents=[parser_dotenv, parser_sqlite, parser_scan_base])
  scan_snapshot.configure_parser(parser_scan_snapshot)
 
+ # scan_pr subparser configuration
+ parser_scan_pr = subparsers.add_parser(
+ 'scan_pr', help='Scan a pull request',
+ parents=[parser_dotenv, parser_sqlite, parser_scan_base])
+ scan_pr.configure_parser(parser_scan_pr)
+
  # get_discoveries subparser configuration
  parser_get_discoveries = subparsers.add_parser(
  'get_discoveries', help='Get discoveries of a scanened repository',
@@ -129,6 +135,7 @@ def main(sys_argv):
  scan_user.run,
  scan_wiki.run,
  scan_path.run,
+ scan_pr.run,
  scan_snapshot.run
  ]:
  # Connect to db only when running commands that need it

diff --git a/credentialdigger/cli/scan_pr.py b/credentialdigger/cli/scan_pr.py
@@ -0,0 +1,122 @@
+"""
+The 'scan_pr' module can be used to scan a pull request on the fly from the
+terminal. It supports both the Sqlite and Postgres clients.
+
+NOTE: Postgres is used by default. Please make sure that the environment
+variables are exported and that the rules have already been added to the
+database.
+
+
+usage: credentialdigger scan_pr [-h] [--dotenv DOTENV] [--sqlite SQLITE]
+ --pr PR
+ [--api_endpoint API_ENDPOINT]
+ [--category CATEGORY]
+ [--models MODELS [MODELS ...]]
+ [--force] [--debug]
+ [--git_token GIT_TOKEN]
+ [--similarity]
+ repo_url
+
+positional arguments:
+ repo_url The location of a git repository
+
+optional arguments:
+ -h, --help show this help message and exit
+ --pr PR The number of the pull request to scan (required)
+ --dotenv DOTENV The path to the .env file which will be used in all
+ commands. If not specified, the one in the current
+ directory will be used (if present).
+ --sqlite SQLITE If specified, scan the repo using the sqlite client
+ passing as argument the path of the db. Otherwise, use
+ postgres (must be up and running)
+ --api_endpoint API_ENDPOINT
+ API endpoint of the git server (default is the public
+ github, i.e., `https://api.github.com`)
+ --category CATEGORY If specified, scan the PR using all the rules of
+ this category, otherwise use all the rules in the db
+ --models MODELS [MODELS ...]
+ A list of models for the ML false positives detection.
+ Cannot accept empty lists.
+ --force Wipe previous scan results, in case this repository
+ has already been scanned previously
+ --debug Flag used to decide whether to visualize the
+ progressbars during the scan (e.g., during the
+ insertion of the detections in the db)
+ --git_token GIT_TOKEN
+ Git personal access token to authenticate to the git
+ server
+ --similarity Build and use the similarity model to compute
+ embeddings and allow for automatic update of similar
+ snippets
+
+"""
+import logging
+import sys
+
+logger = logging.getLogger(__name__)
+
+
+def configure_parser(parser):
+    """
+    Configure arguments for command line parser.
+
+    Registers `run` as the handler of the subcommand and adds the arguments
+    that are specific to the scan of a pull request.
+
+    Parameters
+    ----------
+    parser: `credentialdigger.cli.customParser`
+        Command line parser
+    """
+    parser.set_defaults(func=run)
+    # This subcommand defines no `--local` flag, so the help text must not
+    # mention it
+    parser.add_argument(
+        'repo_url', type=str,
+        help='The url of the git repository the pull request belongs to')
+    parser.add_argument(
+        '--pr', type=int, required=True,
+        help='The number of the pull request to scan')
+    parser.add_argument(
+        '--api_endpoint', type=str, default='https://api.github.com',
+        help='API endpoint of the git server')
+    # Multi-line help texts rely on implicit string concatenation: a
+    # backslash continuation inside the literal would embed the indentation
+    # of the source code in the message
+    parser.add_argument(
+        '--force', action='store_true',
+        help='Force a complete re-scan of the repository, in case it has '
+             'already been scanned previously')
+    parser.add_argument(
+        '--similarity', action='store_true',
+        help='Build and use the similarity model to compute embeddings '
+             'and allow for automatic update of similar snippets')
+    parser.add_argument(
+        '--git_token', default=None, type=str,
+        help='Git personal access token to authenticate to the git server')
+
+
+def run(client, args):
+    """
+    Scan a pull request in a git repository.
+
+    Parameters
+    ----------
+    client: `credentialdigger.Client`
+        Instance of the client on which to save results
+    args: `argparse.Namespace`
+        Arguments from command line parser.
+
+    Returns
+    -------
+        While this function returns nothing of use to the scanner itself, it
+        gives an exit status (integer) that is equal to the number of
+        discoveries, capped at 255. If it exits with a value that is equal to
+        0, then it means that the scan detected no leaks in this pull request.
+    """
+    logger.info(f'Scan pull request number {args.pr}')
+    discoveries = client.scan_pull_request(
+        repo_url=args.repo_url,
+        pr_number=args.pr,
+        category=args.category,
+        models=args.models,
+        force=args.force,
+        debug=args.debug,
+        similarity=args.similarity,
+        git_token=args.git_token,
+        api_endpoint=args.api_endpoint)
+
+    # Exit statuses are truncated to 8 bits on POSIX systems: without the cap
+    # a scan with 256 discoveries would exit with 0 and look like a clean one
+    sys.exit(min(len(discoveries), 255))
diff --git a/credentialdigger/client.py b/credentialdigger/client.py
@@ -14,6 +14,7 @@
 from .models.model_manager import ModelManager
 from .scanners.file_scanner import FileScanner
 from .scanners.git_file_scanner import GitFileScanner
+from .scanners.git_pr_scanner import GitPRScanner
 from .scanners.git_scanner import GitScanner
 from .snippet_similarity import (build_embedding_model, compute_similarity,
  compute_snippet_embedding)
@@ -511,7 +512,7 @@ def get_discoveries(self, query, repo_url, file_name=None):
  """
  cursor = self.db.cursor()
  all_discoveries = []
- params = (repo_url,) if file_name is None else (
+ params = (repo_url,) if not file_name else (
  repo_url, file_name)
  cursor.execute(query, params)
  result = cursor.fetchone()
@@ -787,6 +788,9 @@ def scan(self, repo_url, category=None, models=None, force=False,
  repo_url = repo_url[:-1]
  if repo_url.endswith('.git'):
  repo_url = repo_url[:-4]
+ # NB: removesuffix not supported with py<3.9
+ # repo_url = repo_url.removesuffix('/')
+ # repo_url = repo_url.removesuffix('.git')
 
  rules = self._get_scan_rules(category)
  scanner = GitScanner(rules)
@@ -841,7 +845,7 @@ def scan_snapshot(self, repo_url, branch_or_commit, category=None,
  The id of the discoveries detected by the scanner (excluded the
  ones classified as false positives).
  """
- if self.get_repo(repo_url) != {}:
+ if self.get_repo(repo_url):
  logger.info(f'The repository \"{repo_url}\" has already been '
  'scanned.')
  if force:
@@ -896,7 +900,7 @@ def scan_path(self, scan_path, category=None, models=None, force=False,
  """
  scan_path = os.path.abspath(scan_path)
 
- if self.get_repo(scan_path) != {} and force is False:
+ if self.get_repo(scan_path) and not force:
  raise ValueError(f'The directory \"{scan_path}\" has already been '
  'scanned. Please use \"force\" to rescan it.')
 
@@ -908,6 +912,71 @@ def scan_path(self, scan_path, category=None, models=None, force=False,
  debug=debug, similarity=similarity, max_depth=max_depth,
  ignore_list=ignore_list)
 
+    def scan_pull_request(self, repo_url, pr_number,
+                          api_endpoint='https://api.github.com',
+                          category=None, models=None, force=False, debug=False,
+                          similarity=False, git_token=None):
+        """ Launch the scan of a pull request.
+
+        Only the commits part of the pull request get scanned.
+
+        Parameters
+        ----------
+        repo_url: str
+            The url of the repo to scan
+        pr_number: int
+            The number of pull request
+        api_endpoint: str, default `https://api.github.com`
+            API endpoint of the git server (default is github.com)
+        category: str, optional
+            If specified, scan the repo using all the rules of this category,
+            otherwise use all the rules in the db
+        models: list, optional
+            A list of models for the ML false positives detection
+        force: bool, default `False`
+            Force a complete re-scan of the repository, in case it has already
+            been scanned previously
+        debug: bool, default `False`
+            Flag used to decide whether to visualize the progressbars during
+            the scan (e.g., during the insertion of the detections in the db)
+        similarity: bool, default `False`
+            Decide whether to build the embedding model and to compute and add
+            embeddings, to allow for updating of similar discoveries
+        git_token: str, optional
+            Git personal access token to authenticate to the git server
+
+        Returns
+        -------
+        list
+            The id of the discoveries detected by the scanner (excluded the
+            ones classified as false positives). The list is empty when the
+            repository has already been scanned and `force` is not set.
+        """
+        # Trim the tail of the repo's url by removing '/' and '.git'
+        if repo_url.endswith('/'):
+            repo_url = repo_url[:-1]
+        if repo_url.endswith('.git'):
+            repo_url = repo_url[:-4]
+
+        if self.get_repo(repo_url):
+            logger.info(f'The repository \"{repo_url}\" has already been '
+                        'scanned.')
+            if not force:
+                logger.info('Impossible to scan this pull request. Consider '
+                            'relaunching the scan with force=True')
+                return []
+            logger.info(f'The pull request {pr_number} will be scanned '
+                        '(and the old discoveries will be deleted) due to '
+                        'force=True')
+
+        rules = self._get_scan_rules(category)
+        scanner = GitPRScanner(rules)
+
+        # The endpoint must reach the scanner, otherwise a pull request
+        # hosted on a custom git server is looked up on the default endpoint.
+        # NOTE(review): assumes `_scan` hands the extra keyword arguments
+        # over to the scanner, as it is expected to do for `pr_number` and
+        # `git_token` - confirm against `_scan` and `GitPRScanner.scan`
+        return self._scan(
+            repo_url=repo_url, scanner=scanner, models=models, force=force,
+            debug=debug, similarity=similarity,
+            pr_number=pr_number, git_token=git_token,
+            api_endpoint=api_endpoint)
+
  def scan_user(self, username, category=None, models=None, debug=False,
  forks=False, similarity=False, git_token=None,
  api_endpoint='https://api.github.com'):
@@ -1063,7 +1132,7 @@ def _scan(self, repo_url, scanner, models=None, force=False, debug=False,
  if debug:
  logger.setLevel(level=logging.DEBUG)
 
- if models is None:
+ if not models:
  logger.debug('Don\'t use ML models')
  models = []
 

diff --git a/credentialdigger/client_postgres.py b/credentialdigger/client_postgres.py
@@ -549,9 +549,11 @@ def update_discoveries(self, discoveries_ids, new_state):
  return super().update_discoveries(
  discoveries_ids=discoveries_ids,
  new_state=new_state,
- query='UPDATE discoveries SET state=%s WHERE id IN %s RETURNING true')
+ query='UPDATE discoveries SET state=%s WHERE id IN %s \
+ RETURNING true')
 
- def update_discovery_group(self, new_state, repo_url, file_name, snippet=None):
+ def update_discovery_group(self, new_state, repo_url, file_name,
+ snippet=None):
  """ Change the state of a group of discoveries.
 
  A group of discoveries is identified by the url of their repository,

diff --git a/credentialdigger/client_sqlite.py b/credentialdigger/client_sqlite.py
@@ -58,7 +58,7 @@ def __init__(self, path):
  );
 
  PRAGMA foreign_keys=ON;
- """)
+ """) # noqa: E501
  cursor.close()
  self.db.commit()
 
@@ -205,9 +205,9 @@ def add_embedding(self, discovery_id, repo_url, embedding=None):
  query = 'INSERT INTO embeddings (id, snippet, embedding, repo_url) \
  VALUES (?, ?, ?, ?);'
  super().add_embedding(query,
-  discovery_id,
-  repo_url,
-  embedding)
+ discovery_id,
+ repo_url,
+ embedding)
 
  def add_embeddings(self, repo_url):
  """ Bulk add embeddings.
@@ -261,7 +261,8 @@ def add_rule(self, regex, category, description=''):
  regex=regex,
  category=category,
  description=description,
- query='INSERT INTO rules (regex, category, description) VALUES (?, ?, ?)'
+ query='INSERT INTO rules (regex, category, description) \
+ VALUES (?, ?, ?)'
  )
 
  def delete_rule(self, ruleid):
@@ -588,7 +589,8 @@ def update_discoveries(self, discoveries_ids, new_state):
  query='UPDATE discoveries SET state=? WHERE id IN('
  f'VALUES {", ".join(["?"]*len(discoveries_ids))})')
 
- def update_discovery_group(self, new_state, repo_url, file_name, snippet=None):
+ def update_discovery_group(self, new_state, repo_url, file_name,
+ snippet=None):
  """ Change the state of a group of discoveries.
 
  A group of discoveries is identified by the url of their repository,