Skip to content

Commit

Permalink
Merge pull request #228 from SAP/develop
Browse files Browse the repository at this point in the history
upgrade to v4.7.0
  • Loading branch information
marcorosa authored Apr 11, 2022
2 parents 5ae8c29 + f12ec6d commit 501062b
Show file tree
Hide file tree
Showing 13 changed files with 71 additions and 66 deletions.
12 changes: 1 addition & 11 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:

strategy:
matrix:
python-version: [3.6, 3.7, 3.8, 3.9]
python-version: ['3.7', '3.8', '3.9', '3.10']

services:
postgres:
Expand Down Expand Up @@ -62,14 +62,6 @@ jobs:
run: |
python setup.py install --user
- name: Download and link credentialdigger's models
env:
path_model: https://github.com/SAP/credential-digger/releases/download/PM-v1.0.1/path_model-1.0.1.tar.gz
snippet_model: https://github.com/SAP/credential-digger/releases/download/SM-v1.0.0/snippet_model-1.0.0.tar.gz
run: |
python -m credentialdigger download path_model
python -m credentialdigger download snippet_model
- name: Run unit tests
run: |
pytest tests/unit_tests
Expand All @@ -85,7 +77,5 @@ jobs:
POSTGRES_DB: credential_digger_tests
DBHOST: localhost
DBPORT: 5432
path_model: https://github.com/SAP/credential-digger/releases/download/PM-v1.0.1/path_model-1.0.1.tar.gz
snippet_model: https://github.com/SAP/credential-digger/releases/download/SM-v1.0.0/snippet_model-1.0.0.tar.gz
run: |
pytest tests/functional_tests
21 changes: 14 additions & 7 deletions credentialdigger/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from datetime import datetime, timezone

import yaml
from git import GitCommandError
from github import Github
from rich.progress import Progress

Expand Down Expand Up @@ -982,11 +983,16 @@ def scan_user(self, username, category=None, models=None, debug=False,
# Get repo clone url without .git at the end
repo_url = repo.clone_url[:-4]
logger.info(f'{i}/{repos_num}) Scanning {repo.url}')
missing_ids[repo_url] = self._scan(repo_url, scanner,
models=models,
debug=debug,
similarity=similarity,
git_token=git_token)
try:
missing_ids[repo_url] = self._scan(repo_url, scanner,
models=models,
debug=debug,
similarity=similarity,
git_token=git_token)
except GitCommandError:
logger.warning(f'{i}/{repos_num} Ignore {repo_url} '
'(it can not be cloned)')

return missing_ids

def scan_wiki(self, repo_url, category=None, models=None, debug=False,
Expand Down Expand Up @@ -1366,7 +1372,8 @@ def update_similar_snippets(self,
# Compute similarity of target_embedding and embedding
similarity = compute_similarity(target_embedding,
embedding)
if similarity > threshold:
self.update_discovery(d['id'], state)
# Increase counter if similar and the update is successful
if (similarity > threshold and
self.update_discovery(d['id'], state)):
n_updated_snippets += 1
return n_updated_snippets
8 changes: 4 additions & 4 deletions credentialdigger/client_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ def update_repo(self, url, last_scan):
bool
`True` if the update is successful, `False` otherwise
"""
super().update_repo(
return super().update_repo(
url=url, last_scan=last_scan,
query='UPDATE repos SET last_scan=%s WHERE url=%s RETURNING true'
)
Expand All @@ -528,7 +528,7 @@ def update_discovery(self, discovery_id, new_state):
bool
`True` if the update is successful, `False` otherwise
"""
super().update_discovery(
return super().update_discovery(
discovery_id=discovery_id,
new_state=new_state,
query='UPDATE discoveries SET state=%s WHERE id=%s RETURNING true')
Expand All @@ -548,7 +548,7 @@ def update_discoveries(self, discoveries_ids, new_state):
bool
`True` if the update is successful, `False` otherwise
"""
super().update_discoveries(
return super().update_discoveries(
discoveries_ids=discoveries_ids,
new_state=new_state,
query='UPDATE discoveries SET state=%s WHERE id IN %s RETURNING true')
Expand Down Expand Up @@ -581,6 +581,6 @@ def update_discovery_group(self, new_state, repo_url, file_name, snippet=None):
if snippet is not None:
query += ' and snippet=%s'
query += ' RETURNING true'
super().update_discovery_group(
return super().update_discovery_group(
new_state=new_state, repo_url=repo_url, file_name=file_name,
snippet=snippet, query=query)
10 changes: 5 additions & 5 deletions credentialdigger/client_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def query_check(self, query, *args):
try:
cursor.execute(query, args)
self.db.commit()
return cursor.rowcount < 1
return cursor.rowcount >= 1
except (TypeError, IndexError):
""" A TypeError is raised if any of the required arguments is
missing. """
Expand Down Expand Up @@ -544,7 +544,7 @@ def update_repo(self, url, last_scan):
bool
`True` if the update is successful, `False` otherwise
"""
super().update_repo(
return super().update_repo(
url=url, last_scan=last_scan,
query='UPDATE repos SET last_scan=? WHERE url=?'
)
Expand All @@ -564,7 +564,7 @@ def update_discovery(self, discovery_id, new_state):
bool
`True` if the update is successful, `False` otherwise
"""
super().update_discovery(
return super().update_discovery(
new_state=new_state, discovery_id=discovery_id,
query='UPDATE discoveries SET state=? WHERE id=?'
)
Expand All @@ -584,7 +584,7 @@ def update_discoveries(self, discoveries_ids, new_state):
bool
`True` if the update is successful, `False` otherwise
"""
super().update_discoveries(
return super().update_discoveries(
discoveries_ids=discoveries_ids,
new_state=new_state,
query='UPDATE discoveries SET state=? WHERE id IN('
Expand Down Expand Up @@ -617,6 +617,6 @@ def update_discovery_group(self, new_state, repo_url, file_name, snippet=None):
query += ' and file_name=?'
if snippet is not None:
query += ' and snippet=?'
super().update_discovery_group(
return super().update_discovery_group(
new_state=new_state, repo_url=repo_url, file_name=file_name,
snippet=snippet, query=query)
32 changes: 16 additions & 16 deletions credentialdigger/models/password_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,18 @@ def analyze_batch(self, discoveries):
# We have to classify only the "new" discoveries
new_discoveries = [d for d in discoveries if d['state'] == 'new']
no_new_discoveries = [d for d in discoveries if d['state'] != 'new']
# Create a dataset with all the preprocessed (new) snippets
data = self._pre_process([d['snippet'] for d in new_discoveries])
# data = self._preprocess_batch_data(snippets)
# Compute a prediction for each snippet
outputs = self.model.predict(data)
logits = outputs['logits']
predictions = tf.argmax(logits, 1)
# Check predictions and set FP discoveries accordingly
for d, p in zip(new_discoveries, predictions):
if p == 0:
d['state'] = 'false_positive'
# Process new_discoveries if not empty
if new_discoveries:
# Create a dataset with all the preprocessed (new) snippets
data = self._pre_process([d['snippet'] for d in new_discoveries])
# Compute a prediction for each snippet
outputs = self.model.predict(data)
logits = outputs['logits']
predictions = tf.argmax(logits, 1)
# Check predictions and set FP discoveries accordingly
for d, p in zip(new_discoveries, predictions):
if p == 0:
d['state'] = 'false_positive'
return new_discoveries + no_new_discoveries

def analyze(self, discovery):
Expand All @@ -69,11 +70,9 @@ def analyze(self, discovery):
Returns
-------
discoveries: list of dict
The discoveries, with states updated according to
the model's predictions
n_false_positives: int
The number of false positives detected by the model
bool
True if the snippet is safe (i.e., there is no leak).
False otherwise
"""
# Preprocess the snippet
data = self._pre_process([discovery['snippet']])
Expand All @@ -84,6 +83,7 @@ def analyze(self, discovery):
# The model classified this snippet as a false positive
# (i.e., spam)
return True
return False

def _pre_process(self, snippet):
""" Compute encodings of snippets and format them to a standard
Expand Down
3 changes: 2 additions & 1 deletion credentialdigger/models/path_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def analyze(self, discovery):
Returns
-------
bool
True if the discovery is classified as false positive (i.e., spam)
True if the discovery is classified as false positive (i.e., spam),
False otherwise
"""
return bool(self.fp_keywords.search(discovery['file_name'].lower()))

Expand Down
4 changes: 3 additions & 1 deletion credentialdigger/scanners/file_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def scan_file(self, project_root, relative_path, **kwargs):
for row in file_to_scan:
rh = ResultHandler()
self.stream.scan(
row if sys.version_info < (3, 9) else row.encode(
row if sys.version_info < (3, 8) else row.encode(
'utf-8'),
match_event_handler=rh.handle_results,
context=[row.strip(), relative_path, commit_id,
Expand All @@ -169,6 +169,8 @@ def scan_file(self, project_root, relative_path, **kwargs):
except UnicodeDecodeError:
# Don't scan binary files
pass
except FileNotFoundError:
logger.warning(f'Ignore {relative_path} (file not found)')
return discoveries

def _prune(self, rel_dir_root, dirs, files, max_depth=-1, ignore_list=[]):
Expand Down
3 changes: 2 additions & 1 deletion credentialdigger/scanners/git_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def get_git_repo(self, repo_url, local_repo):
GitRepo.clone_from(repo_url, project_path)
repo = GitRepo(project_path)
except GitCommandError as e:
logger.warning('Repo can not be cloned')
shutil.rmtree(project_path)
raise e

Expand Down Expand Up @@ -405,7 +406,7 @@ def _regex_check(self, printable_diff, filename, commit_hash):

rh = ResultHandler()
self.stream.scan(
row if sys.version_info < (3, 9) else row.encode('utf-8'),
row if sys.version_info < (3, 8) else row.encode('utf-8'),
match_event_handler=rh.handle_results,
context=[row, filename, commit_hash, line_number])
if rh.result:
Expand Down
18 changes: 9 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
Flask
flask_jwt_extended
GitPython
hyperscan==0.2.0; python_version >= "3.9"
hyperscan==0.1.5; python_version < "3.9"
hyperscan==0.2.0; python_version >= "3.8"
hyperscan==0.1.5; python_version < "3.8"
numpy
pandas
psycopg2-binary
PyGithub
python-dotenv
pyyaml
rich
rich~=12.2
srsly>=2.4.0
tensorflow==2.6.2; python_version >= "3.8"
tensorflow==2.4.*; python_version < "3.8"
tensorflow-estimator==2.6.0; python_version >= "3.8"
tensorflow-estimator==2.4.*; python_version < "3.8"
tensorflow-text==2.6.0; python_version >= "3.8"
tensorflow-text==2.4.*; python_version < "3.8"
tensorflow==2.8.0; python_version >= "3.8"
tensorflow~=2.4; python_version < "3.8"
tensorflow-estimator==2.8.0; python_version >= "3.8"
tensorflow-estimator~=2.4; python_version < "3.8"
tensorflow-text==2.8.1; python_version >= "3.8"
tensorflow-text~=2.4; python_version < "3.8"
tf-models-official
transformers
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def requirements():

setuptools.setup(
name='credentialdigger',
version='4.6.1',
version='4.7.0',
author='SAP SE',
maintainer='Marco Rosa, Slim Trabelsi',
maintainer_email='[email protected], [email protected]',
Expand Down
4 changes: 2 additions & 2 deletions tests/functional_tests/test_rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,6 @@ rules:
category: token
description: MailChimp API key

- regex: sshpass|password|pwd|passwd|pass
- regex: sshpass|password|pwd|passwd|pass[\W]
category: password
description: password keywords
description: password keywords
10 changes: 6 additions & 4 deletions tests/functional_tests/test_scans_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from git import Repo as GitRepo
from psycopg2 import connect

TOTAL_PW_DISCOVERIES = 11


class TestScansPostgres(unittest.TestCase):
dotenv = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".env")
Expand Down Expand Up @@ -35,7 +37,7 @@ def test_scan_github(self):
cli.main(["", "scan", "--category", "password",
"--dotenv", self.dotenv,
"--force", self.repo_url])
self.assertEqual(cm.exception.code, 9)
self.assertEqual(cm.exception.code, TOTAL_PW_DISCOVERIES)

def test_scan_local(self):
repo_path = tempfile.mkdtemp()
Expand All @@ -46,9 +48,9 @@ def test_scan_local(self):
"--models", "PathModel", "PasswordModel",
"--category", "password",
"--force", "--local", repo_path])
# When using the models, we expect to be left with less than 9
# discoveries to manually review
self.assertTrue(cm.exception.code < 9)
# When using the models, we expect to be left with less than
# TOTAL_PW_DISCOVERIES discoveries to manually review
self.assertTrue(cm.exception.code < TOTAL_PW_DISCOVERIES)

shutil.rmtree(repo_path)

Expand Down
10 changes: 6 additions & 4 deletions tests/functional_tests/test_scans_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from credentialdigger.client_sqlite import SqliteClient
from git import Repo as GitRepo

TOTAL_PW_DISCOVERIES = 11


class TestScansSqlite(unittest.TestCase):
repo_url = 'https://github.com/SAP/credential-digger-tests'
Expand All @@ -27,7 +29,7 @@ def test_scan_github(self):
cli.main(["", "scan", "--sqlite", self.db_path,
"--category", "password",
"--force", self.repo_url])
self.assertEqual(cm.exception.code, 9)
self.assertEqual(cm.exception.code, TOTAL_PW_DISCOVERIES)

def test_scan_local(self):
repo_path = os.path.join(self.tmp_path, "tmp_repo")
Expand All @@ -38,9 +40,9 @@ def test_scan_local(self):
"--models", "PathModel", "PasswordModel",
"--category", "password",
"--force", "--local", repo_path])
# When using the models, we expect to be left with less than 9
# discoveries to manually review
self.assertTrue(cm.exception.code < 9)
# When using the models, we expect to be left with less than
# TOTAL_PW_DISCOVERIES discoveries to manually review
self.assertTrue(cm.exception.code < TOTAL_PW_DISCOVERIES)

def test_scan_wiki(self):
with self.assertRaises(SystemExit) as cm:
Expand Down

0 comments on commit 501062b

Please sign in to comment.