Skip to content

Commit

Permalink
Merge pull request #250 from SAP/develop
Browse files Browse the repository at this point in the history
upgrade to v4.9
  • Loading branch information
marcorosa authored Aug 8, 2022
2 parents 36851b1 + becbe65 commit 8aaac26
Show file tree
Hide file tree
Showing 12 changed files with 501 additions and 14 deletions.
1 change: 1 addition & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,5 +77,6 @@ jobs:
POSTGRES_DB: credential_digger_tests
DBHOST: localhost
DBPORT: 5432
GIT_TOKEN: ${{ secrets.GIT_TOKEN }}
run: |
pytest tests/functional_tests
1 change: 1 addition & 0 deletions .pep8speaks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ scanner:

flake8: # Same as scanner.linter value
ignore: # Errors and warnings to ignore
- E704 # multiple statements on one line (def)
- W292 # no newline at the end of file (introduces W391)
- W503 # line break before binary operator
- W504 # line break after binary operator
Expand Down
9 changes: 8 additions & 1 deletion credentialdigger/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from dotenv import load_dotenv

from . import (add_rules, get_discoveries, hook, scan, scan_path,
scan_snapshot, scan_user, scan_wiki)
scan_pr, scan_snapshot, scan_user, scan_wiki)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -100,6 +100,12 @@ def main(sys_argv):
parents=[parser_dotenv, parser_sqlite, parser_scan_base])
scan_snapshot.configure_parser(parser_scan_snapshot)

# scan_pr subparser configuration
parser_scan_pr = subparsers.add_parser(
'scan_pr', help='Scan a pull request',
parents=[parser_dotenv, parser_sqlite, parser_scan_base])
scan_pr.configure_parser(parser_scan_pr)

# get_discoveries subparser configuration
parser_get_discoveries = subparsers.add_parser(
'get_discoveries', help='Get discoveries of a scanened repository',
Expand Down Expand Up @@ -129,6 +135,7 @@ def main(sys_argv):
scan_user.run,
scan_wiki.run,
scan_path.run,
scan_pr.run,
scan_snapshot.run
]:
# Connect to db only when running commands that need it
Expand Down
122 changes: 122 additions & 0 deletions credentialdigger/cli/scan_pr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""
The 'scan_pr' module can be used to scan a pull request of a git repository
on the fly from the terminal. It supports both the Sqlite and Postgres clients.
NOTE: Postgres is used by default. Please make sure that the environment
variables are exported and that the rules have already been added to the
database.
usage: credentialdigger scan_pr [-h] [--dotenv DOTENV] [--sqlite SQLITE]
--pr PULL_REQUEST_NUMBER
[--api_endpoint API_ENDPOINT]
[--category CATEGORY]
[--models MODELS [MODELS ...]]
[--force] [--debug]
[--git_token GIT_TOKEN]
[--similarity]
repo_url
positional arguments:
repo_url The location of a git repository
required arguments:
--pr PULL_REQUEST_NUMBER The number of the pull request to scan
optional arguments:
-h, --help show this help message and exit
--dotenv DOTENV The path to the .env file which will be used in all
commands. If not specified, the one in the current
directory will be used (if present).
--sqlite SQLITE If specified, scan the repo using the sqlite client
passing as argument the path of the db. Otherwise, use
postgres (must be up and running)
--api_endpoint API_ENDPOINT
API endpoint of the git server (default is the public
github, i.e., `https://api.github.com`)
--category CATEGORY If specified, scan the PR using all the rules of
this category, otherwise use all the rules in the db
--models MODELS [MODELS ...]
A list of models for the ML false positives detection.
Cannot accept empty lists.
--force Wipe previous scan results, in case this repository
has already been scanned previously
--debug Flag used to decide whether to visualize the
progressbars during the scan (e.g., during the
insertion of the detections in the db)
--git_token GIT_TOKEN
Git personal access token to authenticate to the git
server
--similarity Build and use the similarity model to compute
embeddings and allow for automatic update of similar
snippets
"""
import logging
import sys

# Module-level logger, named after the module it is defined in
logger = logging.getLogger(__name__)


def configure_parser(parser):
    """
    Configure arguments for command line parser.

    Register ``run`` as the handler of the subcommand (through the ``func``
    default) and declare the arguments that are specific to the scan of a
    pull request.

    Parameters
    ----------
    parser: `credentialdigger.cli.customParser`
        Command line parser
    """
    parser.set_defaults(func=run)
    # NB: this subcommand defines no `--local` option, so the repository is
    # described simply as a location (as in the module docstring)
    parser.add_argument(
        'repo_url', type=str,
        help='The location of a git repository')
    parser.add_argument(
        '--pr', type=int, required=True,
        help='The number of the pull request to scan')
    parser.add_argument(
        '--api_endpoint', type=str, default='https://api.github.com',
        help='API endpoint of the git server')
    # Help messages are split with implicit string concatenation, so that the
    # indentation of the source code does not end up in the text
    parser.add_argument(
        '--force', action='store_true',
        help='Force a complete re-scan of the repository, in case it has '
             'already been scanned previously')
    parser.add_argument(
        '--similarity', action='store_true',
        help='Build and use the similarity model to compute embeddings '
             'and allow for automatic update of similar snippets')
    parser.add_argument(
        '--git_token', default=None, type=str,
        help='Git personal access token to authenticate to the git server')


def run(client, args):
    """
    Scan a pull request in a git repository.

    Parameters
    ----------
    client: `credentialdigger.Client`
        Instance of the client on which to save results
    args: `argparse.Namespace`
        Arguments from command line parser.

    Returns
    -------
        While this function returns nothing of use to the scanner itself, it
        gives an exit status (integer) that is equal to the number of
        discoveries, capped at 255. If it exits with a value that is equal to
        0, then it means that the scan detected no leaks in this pull request.
    """
    logger.info(f'Scan pull request number {args.pr}')
    discoveries = client.scan_pull_request(
        repo_url=args.repo_url,
        pr_number=args.pr,
        category=args.category,
        models=args.models,
        force=args.force,
        debug=args.debug,
        similarity=args.similarity,
        git_token=args.git_token,
        api_endpoint=args.api_endpoint)

    # Only the lowest 8 bits of an exit status are reported on POSIX systems:
    # without a cap, 256 discoveries would be reported as 0 (i.e., no leaks)
    max_exit_status = 255
    sys.exit(min(len(discoveries), max_exit_status))
77 changes: 73 additions & 4 deletions credentialdigger/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .models.model_manager import ModelManager
from .scanners.file_scanner import FileScanner
from .scanners.git_file_scanner import GitFileScanner
from .scanners.git_pr_scanner import GitPRScanner
from .scanners.git_scanner import GitScanner
from .snippet_similarity import (build_embedding_model, compute_similarity,
compute_snippet_embedding)
Expand Down Expand Up @@ -511,7 +512,7 @@ def get_discoveries(self, query, repo_url, file_name=None):
"""
cursor = self.db.cursor()
all_discoveries = []
params = (repo_url,) if file_name is None else (
params = (repo_url,) if not file_name else (
repo_url, file_name)
cursor.execute(query, params)
result = cursor.fetchone()
Expand Down Expand Up @@ -787,6 +788,9 @@ def scan(self, repo_url, category=None, models=None, force=False,
repo_url = repo_url[:-1]
if repo_url.endswith('.git'):
repo_url = repo_url[:-4]
# NB: removesuffix not supported with py<3.9
# repo_url = repo_url.removesuffix('/')
# repo_url = repo_url.removesuffix('.git')

rules = self._get_scan_rules(category)
scanner = GitScanner(rules)
Expand Down Expand Up @@ -841,7 +845,7 @@ def scan_snapshot(self, repo_url, branch_or_commit, category=None,
The id of the discoveries detected by the scanner (excluded the
ones classified as false positives).
"""
if self.get_repo(repo_url) != {}:
if self.get_repo(repo_url):
logger.info(f'The repository \"{repo_url}\" has already been '
'scanned.')
if force:
Expand Down Expand Up @@ -896,7 +900,7 @@ def scan_path(self, scan_path, category=None, models=None, force=False,
"""
scan_path = os.path.abspath(scan_path)

if self.get_repo(scan_path) != {} and force is False:
if self.get_repo(scan_path) and not force:
raise ValueError(f'The directory \"{scan_path}\" has already been '
'scanned. Please use \"force\" to rescan it.')

Expand All @@ -908,6 +912,71 @@ def scan_path(self, scan_path, category=None, models=None, force=False,
debug=debug, similarity=similarity, max_depth=max_depth,
ignore_list=ignore_list)

def scan_pull_request(self, repo_url, pr_number,
                      api_endpoint='https://api.github.com',
                      category=None, models=None, force=False, debug=False,
                      similarity=False, git_token=None):
    """ Launch the scan of a pull request.

    Only the commits part of the pull request get scanned.

    Parameters
    ----------
    repo_url: str
        The url of the repo to scan
    pr_number: int
        The number of pull request
    api_endpoint: str, default `https://api.github.com`
        API endpoint of the git server (default is github.com)
    category: str, optional
        If specified, scan the repo using all the rules of this category,
        otherwise use all the rules in the db
    models: list, optional
        A list of models for the ML false positives detection
    force: bool, default `False`
        Force a complete re-scan of the repository, in case it has already
        been scanned previously
    debug: bool, default `False`
        Flag used to decide whether to visualize the progressbars during
        the scan (e.g., during the insertion of the detections in the db)
    similarity: bool, default `False`
        Decide whether to build the embedding model and to compute and add
        embeddings, to allow for updating of similar discoveries
    git_token: str, optional
        Git personal access token to authenticate to the git server

    Returns
    -------
    list
        The id of the discoveries detected by the scanner (excluded the
        ones classified as false positives). The list is empty if the
        repository has already been scanned and `force` is not set.
    """
    # Trim the tail of the repo's url by removing '/' and '.git' (in this
    # order, at most once each)
    for suffix in ('/', '.git'):
        if repo_url.endswith(suffix):
            repo_url = repo_url[:-len(suffix)]

    if self.get_repo(repo_url):
        logger.info(f'The repository "{repo_url}" has already been '
                    'scanned.')
        if not force:
            logger.info('Impossible to scan this pull request. Consider '
                        'relaunching the scan with force=True')
            return []
        logger.info(f'The pull request {pr_number} will be scanned '
                    '(and the old discoveries will be deleted) due to '
                    'force=True')

    rules = self._get_scan_rules(category)
    scanner = GitPRScanner(rules)

    # Forward the api endpoint together with the other scanner-specific
    # arguments: otherwise the value chosen by the caller is silently dropped
    # NOTE(review): assumes `_scan` hands the extra keyword arguments over to
    # `GitPRScanner.scan` and that the latter accepts `api_endpoint` - confirm
    return self._scan(
        repo_url=repo_url, scanner=scanner, models=models, force=force,
        debug=debug, similarity=similarity,
        pr_number=pr_number, api_endpoint=api_endpoint,
        git_token=git_token)

def scan_user(self, username, category=None, models=None, debug=False,
forks=False, similarity=False, git_token=None,
api_endpoint='https://api.github.com'):
Expand Down Expand Up @@ -1063,7 +1132,7 @@ def _scan(self, repo_url, scanner, models=None, force=False, debug=False,
if debug:
logger.setLevel(level=logging.DEBUG)

if models is None:
if not models:
logger.debug('Don\'t use ML models')
models = []

Expand Down
6 changes: 4 additions & 2 deletions credentialdigger/client_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,9 +549,11 @@ def update_discoveries(self, discoveries_ids, new_state):
return super().update_discoveries(
discoveries_ids=discoveries_ids,
new_state=new_state,
query='UPDATE discoveries SET state=%s WHERE id IN %s RETURNING true')
query='UPDATE discoveries SET state=%s WHERE id IN %s \
RETURNING true')

def update_discovery_group(self, new_state, repo_url, file_name, snippet=None):
def update_discovery_group(self, new_state, repo_url, file_name,
snippet=None):
""" Change the state of a group of discoveries.
A group of discoveries is identified by the url of their repository,
Expand Down
14 changes: 8 additions & 6 deletions credentialdigger/client_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def __init__(self, path):
);
PRAGMA foreign_keys=ON;
""")
""") # noqa: E501
cursor.close()
self.db.commit()

Expand Down Expand Up @@ -205,9 +205,9 @@ def add_embedding(self, discovery_id, repo_url, embedding=None):
query = 'INSERT INTO embeddings (id, snippet, embedding, repo_url) \
VALUES (?, ?, ?, ?);'
super().add_embedding(query,
discovery_id,
repo_url,
embedding)
discovery_id,
repo_url,
embedding)

def add_embeddings(self, repo_url):
""" Bulk add embeddings.
Expand Down Expand Up @@ -261,7 +261,8 @@ def add_rule(self, regex, category, description=''):
regex=regex,
category=category,
description=description,
query='INSERT INTO rules (regex, category, description) VALUES (?, ?, ?)'
query='INSERT INTO rules (regex, category, description) \
VALUES (?, ?, ?)'
)

def delete_rule(self, ruleid):
Expand Down Expand Up @@ -588,7 +589,8 @@ def update_discoveries(self, discoveries_ids, new_state):
query='UPDATE discoveries SET state=? WHERE id IN('
f'VALUES {", ".join(["?"]*len(discoveries_ids))})')

def update_discovery_group(self, new_state, repo_url, file_name, snippet=None):
def update_discovery_group(self, new_state, repo_url, file_name,
snippet=None):
""" Change the state of a group of discoveries.
A group of discoveries is identified by the url of their repository,
Expand Down
Loading

0 comments on commit 8aaac26

Please sign in to comment.