diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml new file mode 100644 index 0000000000..d657a63b2d --- /dev/null +++ b/.github/workflows/docker-image.yml @@ -0,0 +1,18 @@ +name: Docker Image CI + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Build the Docker image + run: docker build . --file Dockerfile --tag my-image-name:$(date +%s) diff --git a/Makefile b/Makefile index 22364ac160..d9f199a867 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ default: @ echo "Installation Commands:" @ echo " install Installs Augur's full stack for production" + @ echo " wizard Install Augur and launch the graphical setup wizard" @ echo " clean Removes potentially troublesome compiled files" @ echo " rebuild Removes build/compiled files & binaries and reinstalls the project" @ echo @@ -34,6 +35,9 @@ default: install: @ ./scripts/install/install.sh dev +wizard: + @ ./scripts/install/install.sh graphical + install-spdx: @ ./scripts/install/install-spdx.sh diff --git a/README.md b/README.md index c0c99157cb..e5dde81d04 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.71.0 +# Augur NEW Release v0.76.0 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.71.0 +Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.0 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. 
diff --git a/augur/api/view/server/Environment.py b/augur/api/view/server/Environment.py index 409a5975e5..76b8207ca5 100644 --- a/augur/api/view/server/Environment.py +++ b/augur/api/view/server/Environment.py @@ -49,4 +49,4 @@ def __str__(self)-> str: return str(os.environ) def __iter__(self): - return (item for item in os.environ.items) \ No newline at end of file + return (item for item in os.environ.items()) \ No newline at end of file diff --git a/augur/api/view/server/__init__.py b/augur/api/view/server/__init__.py index 2a54a556f7..e919a597a8 100644 --- a/augur/api/view/server/__init__.py +++ b/augur/api/view/server/__init__.py @@ -1,3 +1,2 @@ -from .Environment import Environment -from .ServerThread import ServerThread from .LoginException import LoginException +from .Environment import Environment \ No newline at end of file diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 2a2deadd1d..ab8810a5a0 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -166,21 +166,21 @@ def determine_worker_processes(ratio,maximum): sleep_time += 6 #60% of estimate, Maximum value of 45 : Reduced because it can be lower - core_num_processes = determine_worker_processes(.15, 10) + core_num_processes = determine_worker_processes(.40, 50) logger.info(f"Starting core worker processes with concurrency={core_num_processes}") core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h" process_list.append(subprocess.Popen(core_worker.split(" "))) sleep_time += 6 #20% of estimate, Maximum value of 25 - secondary_num_processes = determine_worker_processes(.70, 60) + secondary_num_processes = determine_worker_processes(.39, 50) logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" "))) sleep_time += 6 #15% of estimate, Maximum value of 20 - facade_num_processes = determine_worker_processes(.15, 20) + facade_num_processes = determine_worker_processes(.17, 20) logger.info(f"Starting facade worker processes with concurrency={facade_num_processes}") facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_num_processes} -n facade:{uuid.uuid4().hex}@%h -Q facade" diff --git a/augur/application/cli/collection.py b/augur/application/cli/collection.py index c13e648322..3cb08ef1cd 100644 --- a/augur/application/cli/collection.py +++ b/augur/application/cli/collection.py @@ -125,21 +125,21 @@ def determine_worker_processes(ratio,maximum): sleep_time += 6 #60% of estimate, Maximum value of 45: Reduced because not needed - core_num_processes = determine_worker_processes(.15, 10) + core_num_processes = determine_worker_processes(.40, 50) logger.info(f"Starting core worker processes with concurrency={core_num_processes}") core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h" process_list.append(subprocess.Popen(core_worker.split(" "))) sleep_time += 6 #20% of estimate, Maximum value of 25 - secondary_num_processes = determine_worker_processes(.70, 60) + secondary_num_processes = determine_worker_processes(.39, 50) logger.info(f"Starting secondary worker 
processes with concurrency={secondary_num_processes}") secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" "))) sleep_time += 6 #15% of estimate, Maximum value of 20 - facade_num_processes = determine_worker_processes(.15, 20) + facade_num_processes = determine_worker_processes(.17, 20) logger.info(f"Starting facade worker processes with concurrency={facade_num_processes}") facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_num_processes} -n facade:{uuid.uuid4().hex}@%h -Q facade" diff --git a/augur/application/cli/config.py b/augur/application/cli/config.py index e5beae92eb..e9786e1ef8 100644 --- a/augur/application/cli/config.py +++ b/augur/application/cli/config.py @@ -7,6 +7,7 @@ import json import logging +from augur.application.db.models import Config from augur.application.db.session import DatabaseSession from augur.application.config import AugurConfig from augur.application.cli import DatabaseContext, test_connection, test_db_connection, with_database @@ -160,7 +161,7 @@ def add_section(ctx, section_name, file): @click.option('--section', required=True) @click.option('--setting', required=True) @click.option('--value', required=True) -@click.option('--data-type', required=True) +@click.option('--data-type') @test_connection @test_db_connection @with_database @@ -169,6 +170,12 @@ def config_set(ctx, section, setting, value, data_type): with DatabaseSession(logger, engine=ctx.obj.engine) as session: config = AugurConfig(logger, session) + + if not data_type: + result = session.query(Config).filter(Config.section_name == section, Config.setting_name == setting).all() + if not result: + return click.echo("You must specify a data-type if the setting does not already exist") + data_type = result[0].type if data_type not in config.accepted_types: print(f"Error invalid type for config. 
Please use one of these types: {config.accepted_types}") @@ -218,6 +225,22 @@ def config_get(ctx, section, setting): else: print(f"Error: {section} section not found in config") +@cli.command('get_all_json') +def config_get_all_json(): + data = {} + try: + with DatabaseSession(logger) as session: + sections = session.query(Config.section_name).distinct().all() + for section in sections: + data[section[0]] = {} + + for row in session.query(Config).all(): + data[row.section_name][row.setting_name] = row.value + except: + pass + + print(json.dumps(data, indent=4)) + @cli.command('clear') @test_connection @test_db_connection diff --git a/augur/application/cli/tasks.py b/augur/application/cli/tasks.py index d25f081ab6..d7ce4e4398 100644 --- a/augur/application/cli/tasks.py +++ b/augur/application/cli/tasks.py @@ -36,8 +36,8 @@ def start(): secondary_worker_process = None scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" - core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=20 -n core:{uuid.uuid4().hex}@%h" - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=60 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=50 -n core:{uuid.uuid4().hex}@%h" + secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=50 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" scheduling_worker_process = subprocess.Popen(scheduling_worker.split(" ")) core_worker_process = subprocess.Popen(core_worker.split(" ")) diff --git a/augur/application/config.py b/augur/application/config.py index bfda4c8773..e3e93302eb 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -161,7 +161,7 @@ def get_section(self, section_name) -> dict: Returns: The section data as a dict """ - query = self.session.query(Config).filter_by(section_name=section_name) + query = self.session.query(Config).filter_by(section_name=section_name).order_by(Config.setting_name.asc()) section_data = execute_session_query(query, 'all') section_dict = {} @@ -213,7 +213,7 @@ def load_config(self) -> dict: The config from the database """ # get all the sections in the config table - query = self.session.query(Config.section_name) + query = self.session.query(Config.section_name).order_by(Config.section_name.asc()) section_names = execute_session_query(query, 'all') config = {} diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index cc785951aa..7fb5ce0598 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -9,7 +9,7 @@ from psycopg2.errors import DeadlockDetected from typing import List, Any, Optional, Union -from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias, UnresolvedCommitEmail, Contributor, CollectionStatus +from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias,UnresolvedCommitEmail, Contributor, CollectionStatus from augur.tasks.util.collection_state import CollectionState from augur.application.db import get_session, get_engine from augur.application.db.util import execute_session_query @@ -25,7 +25,7 @@ def convert_type_of_value(config_dict, logger=None): if data_type == "str" or data_type is None: return 
config_dict - elif data_type == "int": + if data_type == "int": config_dict["value"] = int(config_dict["value"]) elif data_type == "bool": @@ -509,4 +509,31 @@ def update_issue_closed_cntrbs_by_repo_id(repo_id): WHERE issue_id = :issue_id AND repo_id = :repo_id """) - connection.execute(update_stmt, update_data) \ No newline at end of file + connection.execute(update_stmt, update_data) + +def get_core_data_last_collected(repo_id): + + with get_session() as session: + try: + return session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo_id).one().core_data_last_collected + except s.orm.exc.NoResultFound: + return None + +def get_secondary_data_last_collected(repo_id): + + with get_session() as session: + try: + return session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo_id).one().secondary_data_last_collected + except s.orm.exc.NoResultFound: + return None + +def get_updated_prs(repo_id, since): + + with get_session() as session: + return session.query(PullRequest).filter(PullRequest.repo_id == repo_id, PullRequest.pr_updated_at >= since).order_by(PullRequest.pr_src_number).all() + +def get_updated_issues(repo_id, since): + + with get_session() as session: + return session.query(Issue).filter(Issue.repo_id == repo_id, Issue.updated_at >= since).order_by(Issue.gh_issue_number).all() + diff --git a/augur/static/css/first_time.css b/augur/static/css/first_time.css index 12f8ae9f54..f2d4602399 100644 --- a/augur/static/css/first_time.css +++ b/augur/static/css/first_time.css @@ -1,50 +1,102 @@ +:root { + --color-bg: #1A233A; + --color-bg-light: #272E48; + --color-bg-contrast: #646683; + --color-fg: white; + --color-fg-dark: #b0bdd6; + --color-fg-contrast: black; + --color-accent: #6f42c1; + --color-accent-dark: #6134b3; + --color-notice: #00ddff; + --color-notice-contrast: #006979; +} + body{ margin-top:20px; - color: #bcd0f7; - background: #1A233A; + background-color: var(--color-bg); + color: var(--color-fg); } + h1 { font-size: 2rem; } + .sidebar .sidebar-top { margin: 0 0 1rem 0; padding-bottom: 1rem; text-align: center; } + .sidebar .sidebar-top .brand-logo { margin: 0 0 1rem 0; } + .sidebar .sidebar-top .brand-logo img { height: 90px; -webkit-border-radius: 100px; -moz-border-radius: 100px; border-radius: 100px; } + .sidebar .about { margin: 1rem 0 0 0; font-size: 0.8rem; text-align: center; } + +.subtitle { + color: var(--color-fg-dark); + margin-bottom: .5rem; + margin-left: 15px; +} + +.no-margin-bottom { + margin-bottom: 0; +} + .card { - background: #272E48; + background: var(--color-bg-light); -webkit-border-radius: 5px; -moz-border-radius: 5px; border-radius: 5px; border: 0; margin-bottom: 1rem; } + .form-control { border: 1px solid #596280; -webkit-border-radius: 2px; -moz-border-radius: 2px; border-radius: 2px; font-size: .825rem; - background: #1A233A; - color: #bcd0f7; + background: var(--color-bg-light); + color: var(--color-fg); +} + +.input-textbox { + color: var(--color-fg); + background-color: var(--color-bg); + border-color: var(--color-accent-dark); } + +.input-textbox::placeholder { + color: var(--color-fg-dark); +} + +.input-textbox:focus { + color: var(--color-fg); + background-color: var(--color-bg); + border-color: var(--color-accent-dark); +} + +.input-textbox:focus::placeholder { + color: var(--color-fg-dark); +} + .modal-content { - color: black; + color: var(--color-fg-contrast); } + .editor-container { height: 300px !important; } diff --git a/augur/tasks/data_analysis/__init__.py b/augur/tasks/data_analysis/__init__.py index 
b600bcac77..0db9a97ec4 100644 --- a/augur/tasks/data_analysis/__init__.py +++ b/augur/tasks/data_analysis/__init__.py @@ -1,7 +1,7 @@ from celery import chain import logging -def machine_learning_phase(repo_git): +def machine_learning_phase(repo_git, full_collection): from augur.tasks.data_analysis.clustering_worker.tasks import clustering_task from augur.tasks.data_analysis.discourse_analysis.tasks import discourse_analysis_task from augur.tasks.data_analysis.insight_worker.tasks import insight_task @@ -15,7 +15,7 @@ def machine_learning_phase(repo_git): ml_tasks.append(discourse_analysis_task.si(repo_git)) ml_tasks.append(insight_task.si(repo_git)) ml_tasks.append(message_insight_task.si(repo_git)) - ml_tasks.append(pull_request_analysis_task.si(repo_git)) + #ml_tasks.append(pull_request_analysis_task.si(repo_git)) logger.info(f"Machine learning sequence: {ml_tasks}") return chain(*ml_tasks) \ No newline at end of file diff --git a/augur/tasks/data_analysis/clustering_worker/setup.py b/augur/tasks/data_analysis/clustering_worker/setup.py index 78fb0b4b50..a197b21568 100644 --- a/augur/tasks/data_analysis/clustering_worker/setup.py +++ b/augur/tasks/data_analysis/clustering_worker/setup.py @@ -22,13 +22,13 @@ def read(filename): packages=find_packages(), install_requires=[ 'Flask==2.0.2', - 'Flask-Cors==3.0.10', + 'Flask-Cors==4.0.1', 'Flask-Login==0.5.0', 'Flask-WTF==1.0.0', - 'requests==2.28.0', + 'requests==2.32.0', 'psycopg2-binary==2.9.3', #'sklearn==0.0.0', - 'scikit-learn==1.1.3', + 'scikit-learn==1.5.0', 'numpy==1.26.0', 'nltk==3.6.6', 'seaborn==0.11.1', diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/setup.py b/augur/tasks/data_analysis/contributor_breadth_worker/setup.py index 86052e164c..805edfb36b 100644 --- a/augur/tasks/data_analysis/contributor_breadth_worker/setup.py +++ b/augur/tasks/data_analysis/contributor_breadth_worker/setup.py @@ -23,10 +23,10 @@ def read(filename): packages=find_packages(), install_requires=[ 'Flask==2.0.2', - 'Flask-Cors==3.0.10', + 'Flask-Cors==4.0.1', 'Flask-Login==0.5.0', 'Flask-WTF==1.0.0', - 'requests==2.28.0', + 'requests==2.32.0', 'psycopg2-binary==2.9.3' ], entry_points={ diff --git a/augur/tasks/data_analysis/discourse_analysis/setup.py b/augur/tasks/data_analysis/discourse_analysis/setup.py index 37d6557ec5..ca936a6000 100644 --- a/augur/tasks/data_analysis/discourse_analysis/setup.py +++ b/augur/tasks/data_analysis/discourse_analysis/setup.py @@ -22,16 +22,16 @@ def read(filename): packages=find_packages(), install_requires=[ 'Flask==2.0.2', - 'Flask-Cors==3.0.10', + 'Flask-Cors==4.0.1', 'Flask-Login==0.5.0', 'Flask-WTF==1.0.0', - 'requests==2.28.0', + 'requests==2.32.0', 'psycopg2-binary==2.9.3', 'click==8.0.3', 'scipy>=1.10.0', 'nltk==3.6.6', 'pandas==1.5.3', - 'scikit-learn==1.1.3', + 'scikit-learn==1.5.0', 'textblob==0.15.3', 'python-crfsuite>=0.9.8', 'sklearn-crfsuite>=0.3.6', diff --git a/augur/tasks/data_analysis/insight_worker/setup.py b/augur/tasks/data_analysis/insight_worker/setup.py index 1ee6e8a4bd..92d663e3ae 100644 --- a/augur/tasks/data_analysis/insight_worker/setup.py +++ b/augur/tasks/data_analysis/insight_worker/setup.py @@ -23,10 +23,10 @@ def read(filename): packages=find_packages(exclude=('tests',)), install_requires=[ 'Flask==2.0.2', - 'Flask-Cors==3.0.10', + 'Flask-Cors==4.0.1', 'Flask-Login==0.5.0', 'Flask-WTF==1.0.0', - 'requests==2.28.0', + 'requests==2.32.0', 'psycopg2-binary==2.9.3', 'click==8.0.3', 'scipy>=1.10.0', diff --git a/augur/tasks/data_analysis/message_insights/setup.py 
b/augur/tasks/data_analysis/message_insights/setup.py index a4f6a30c43..2f86701619 100644 --- a/augur/tasks/data_analysis/message_insights/setup.py +++ b/augur/tasks/data_analysis/message_insights/setup.py @@ -24,14 +24,14 @@ def read(filename): packages=find_packages(), install_requires=[ 'Flask==2.0.2', - 'Flask-Cors==3.0.10', + 'Flask-Cors==4.0.1', 'Flask-Login==0.5.0', 'Flask-WTF==1.0.0', - 'requests==2.28.0', + 'requests==2.32.0', 'psycopg2-binary==2.9.3', 'click==8.0.3', 'scipy>=1.10.0', - 'scikit-learn==1.1.3', #0.24.2', + 'scikit-learn==1.5.0', #0.24.2', 'numpy==1.26.0', 'nltk==3.6.6', 'pandas==1.5.3', diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py b/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py index 3341f24ff1..63ccbec1de 100644 --- a/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py +++ b/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py @@ -22,10 +22,10 @@ def read(filename): packages=find_packages(), install_requires=[ 'Flask==2.0.2', - 'Flask-Cors==3.0.10', + 'Flask-Cors==4.0.1', 'Flask-Login==0.5.0', 'Flask-WTF==1.0.0', - 'requests==2.28.0', + 'requests==2.32.0', 'psycopg2-binary==2.9.3', 'sklearn==0.0', 'nltk==3.6.6', diff --git a/augur/tasks/db/refresh_materialized_views.py b/augur/tasks/db/refresh_materialized_views.py index 09faffe0cb..8a06ac7a61 100644 --- a/augur/tasks/db/refresh_materialized_views.py +++ b/augur/tasks/db/refresh_materialized_views.py @@ -4,6 +4,9 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.lib import execute_sql +from augur.tasks.git.util.facade_worker.facade_worker.config import FacadeHelper +from augur.tasks.git.util.facade_worker.facade_worker.rebuildcache import invalidate_caches, rebuild_unknown_affiliation_and_web_caches + @celery.task(bind=True) def refresh_materialized_views(self): @@ -163,6 +166,36 @@ def refresh_materialized_views(self): logger.info(f"error is {e}") pass + #Now refresh facade tables + #Use this class to get all the settings and + #utility functions for facade + facade_helper = FacadeHelper(logger) + + if facade_helper.nuke_stored_affiliations: + logger.error("Nuke stored affiliations is deprecated!") + # deprecated because the UI component of facade where affiliations would be + # nuked upon change no longer exists, and this information can easily be derived + # from queries and materialized views in the current version of Augur. + # This method is also a major performance bottleneck with little value. + + if not facade_helper.limited_run or (facade_helper.limited_run and facade_helper.fix_affiliations): + logger.error("Fill empty affiliations is deprecated!") + # deprecated because the UI component of facade where affiliations would need + # to be fixed upon change no longer exists, and this information can easily be derived + # from queries and materialized views in the current version of Augur. + # This method is also a major performance bottleneck with little value. 
+ + if facade_helper.force_invalidate_caches: + try: + invalidate_caches(facade_helper) + except Exception as e: + logger.info(f"error is {e}") + + if not facade_helper.limited_run or (facade_helper.limited_run and facade_helper.rebuild_caches): + try: + rebuild_unknown_affiliation_and_web_caches(facade_helper) + except Exception as e: + logger.info(f"error is {e}") diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 979020f9cb..76c1b15098 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -95,7 +95,7 @@ def generate_scorecard(logger, repo_git): try: required_output = parse_json_from_subprocess_call(logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) except Exception as e: - session.logger.error(f"Could not parse required output! Error: {e}") + logger.error(f"Could not parse required output! Error: {e}") raise e # end diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 97a69a7574..041f4b8a5b 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -10,8 +10,6 @@ from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set from augur.tasks.git.util.facade_worker.facade_worker.analyzecommit import analyze_commit from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count -from augur.tasks.git.util.facade_worker.facade_worker.rebuildcache import fill_empty_affiliations, invalidate_caches, nuke_affiliations, rebuild_unknown_affiliation_and_web_caches - from augur.tasks.github.facade_github.tasks import * from augur.tasks.git.util.facade_worker.facade_worker.config import FacadeHelper @@ -235,37 +233,6 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: logger.info("Analysis complete") return -@celery.task -def nuke_affiliations_facade_task(): - - logger = logging.getLogger(nuke_affiliations_facade_task.__name__) - - facade_helper = FacadeHelper(logger) - nuke_affiliations(facade_helper) - -@celery.task -def fill_empty_affiliations_facade_task(): - - logger = logging.getLogger(fill_empty_affiliations_facade_task.__name__) - facade_helper = FacadeHelper(logger) - fill_empty_affiliations(facade_helper) - -@celery.task -def invalidate_caches_facade_task(): - - logger = logging.getLogger(invalidate_caches_facade_task.__name__) - - facade_helper = FacadeHelper(logger) - invalidate_caches(facade_helper) - -@celery.task -def rebuild_unknown_affiliation_and_web_caches_facade_task(): - - logger = logging.getLogger(rebuild_unknown_affiliation_and_web_caches_facade_task.__name__) - - facade_helper = FacadeHelper(logger) - rebuild_unknown_affiliation_and_web_caches(facade_helper) - # retry this task indefinitely every 5 minutes if it errors. 
Since the only way it gets scheduled is by itself, so if it stops running no more clones will happen till the instance is restarted @celery.task(autoretry_for=(Exception,), retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=None) def clone_repos(): @@ -388,29 +355,8 @@ def generate_analysis_sequence(logger,repo_git, facade_helper): -def generate_contributor_sequence(logger,repo_git, session): - - contributor_sequence = [] - #all_repo_ids = [] - repo_id = None - - #contributor_sequence.append(facade_start_contrib_analysis_task.si()) - repo = get_repo_by_repo_git(repo_git) - repo_id = repo.repo_id - - #pdb.set_trace() - #breakpoint() - #for repo in all_repos: - # contributor_sequence.append(insert_facade_contributors.si(repo['repo_id'])) - #all_repo_ids = [repo['repo_id'] for repo in all_repos] - #contrib_group = create_grouped_task_load(dataList=all_repo_ids,task=insert_facade_contributors)#group(contributor_sequence) - #contrib_group.link_error(facade_error_handler.s()) - #return contrib_group#chain(facade_start_contrib_analysis_task.si(), contrib_group) - return insert_facade_contributors.si(repo_id) - - -def facade_phase(repo_git): +def facade_phase(repo_git, full_collection): logger = logging.getLogger(facade_phase.__name__) logger.info("Generating facade sequence") facade_helper = FacadeHelper(logger) @@ -450,7 +396,7 @@ def facade_phase(repo_git): #Generate contributor analysis task group. if not limited_run or (limited_run and run_facade_contributors): - facade_core_collection.append(generate_contributor_sequence(logger,repo_git,facade_helper)) + facade_core_collection.append(insert_facade_contributors.si(repo_git)) #These tasks need repos to be cloned by facade before they can work. @@ -464,55 +410,4 @@ def facade_phase(repo_git): ) logger.info(f"Facade sequence: {facade_sequence}") - return chain(*facade_sequence) - -def generate_non_repo_domain_facade_tasks(logger): - logger.info("Generating facade sequence") - facade_helper = FacadeHelper(logger) - - # Figure out what we need to do - limited_run = facade_helper.limited_run - delete_marked_repos = facade_helper.delete_marked_repos - pull_repos = facade_helper.pull_repos - # clone_repos = facade_helper.clone_repos - check_updates = facade_helper.check_updates - # force_updates = facade_helper.force_updates - run_analysis = facade_helper.run_analysis - # force_analysis = facade_helper.force_analysis - nuke_stored_affiliations = facade_helper.nuke_stored_affiliations - fix_affiliations = facade_helper.fix_affiliations - force_invalidate_caches = facade_helper.force_invalidate_caches - rebuild_caches = facade_helper.rebuild_caches - #if abs((datetime.datetime.strptime(session.cfg.get_setting('aliases_processed')[:-3], - # '%Y-%m-%d %I:%M:%S.%f') - datetime.datetime.now()).total_seconds()) // 3600 > int(session.cfg.get_setting( - # 'update_frequency')) else 0 - force_invalidate_caches = facade_helper.force_invalidate_caches - create_xlsx_summary_files = facade_helper.create_xlsx_summary_files - multithreaded = facade_helper.multithreaded - - facade_sequence = [] - - if nuke_stored_affiliations: - #facade_sequence.append(nuke_affiliations_facade_task.si().on_error(facade_error_handler.s()))#nuke_affiliations(session.cfg) - logger.info("Nuke stored affiliations is deprecated.") - # deprecated because the UI component of facade where affiliations would be - # nuked upon change no longer exists, and this information can easily be derived - # from queries and materialized views in the current version of Augur. 
- # This method is also a major performance bottleneck with little value. - - #logger.info(session.cfg) - if not limited_run or (limited_run and fix_affiliations): - #facade_sequence.append(fill_empty_affiliations_facade_task.si().on_error(facade_error_handler.s()))#fill_empty_affiliations(session) - logger.info("Fill empty affiliations is deprecated.") - # deprecated because the UI component of facade where affiliations would need - # to be fixed upon change no longer exists, and this information can easily be derived - # from queries and materialized views in the current version of Augur. - # This method is also a major performance bottleneck with little value. - - if force_invalidate_caches: - facade_sequence.append(invalidate_caches_facade_task.si().on_error(facade_error_handler.s()))#invalidate_caches(session.cfg) - - if not limited_run or (limited_run and rebuild_caches): - facade_sequence.append(rebuild_unknown_affiliation_and_web_caches_facade_task.si().on_error(facade_error_handler.s()))#rebuild_unknown_affiliation_and_web_caches(session.cfg) - - return facade_sequence + return chain(*facade_sequence) \ No newline at end of file diff --git a/augur/tasks/git/util/facade_worker/setup.py b/augur/tasks/git/util/facade_worker/setup.py index 298baff49d..e2a1af8b75 100644 --- a/augur/tasks/git/util/facade_worker/setup.py +++ b/augur/tasks/git/util/facade_worker/setup.py @@ -23,10 +23,10 @@ def read(filename): packages=find_packages(exclude=('tests',)), install_requires=[ 'Flask==2.0.2', - 'Flask-Cors==3.0.10', + 'Flask-Cors==4.0.1', 'Flask-Login==0.5.0', 'Flask-WTF==1.0.0', - 'requests==2.28.0', + 'requests==2.32.0', 'psycopg2-binary==2.9.3', 'click==8.0.3', 'XlsxWriter==1.3.7' diff --git a/augur/tasks/github/__init__.py b/augur/tasks/github/__init__.py index 29823eafe5..63d68da41b 100644 --- a/augur/tasks/github/__init__.py +++ b/augur/tasks/github/__init__.py @@ -1,7 +1,7 @@ -from augur.tasks.github.contributors.tasks import * -from augur.tasks.github.events.tasks import * -from augur.tasks.github.issues.tasks import * -from augur.tasks.github.messages.tasks import * +from augur.tasks.github.contributors import * +from augur.tasks.github.events import * +from augur.tasks.github.issues import * +from augur.tasks.github.messages import * from augur.tasks.github.pull_requests.tasks import * from augur.tasks.github.repo_info.tasks import * from augur.tasks.github.releases.tasks import * diff --git a/augur/tasks/github/contributors/tasks.py b/augur/tasks/github/contributors.py similarity index 100% rename from augur/tasks/github/contributors/tasks.py rename to augur/tasks/github/contributors.py diff --git a/augur/tasks/github/contributors/__init__.py b/augur/tasks/github/contributors/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/contributors/core.py b/augur/tasks/github/contributors/core.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/events/tasks.py b/augur/tasks/github/events.py similarity index 89% rename from augur/tasks/github/events/tasks.py rename to augur/tasks/github/events.py index ee4f407616..44bb7e19ae 100644 --- a/augur/tasks/github/events/tasks.py +++ b/augur/tasks/github/events.py @@ -5,7 +5,7 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_paginator import GithubPaginator +from augur.tasks.github.util.github_data_access 
import GithubDataAccess from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.github.util.util import get_owner_repo from augur.tasks.util.worker_util import remove_duplicate_dicts @@ -50,27 +50,13 @@ def retrieve_all_event_data(repo_git: str, logger, key_auth): url = f"https://api.github.com/repos/{owner}/{repo}/issues/events" - # returns an iterable of all issues at this url (this essentially means you can treat the issues variable as a list of the issues) - events = GithubPaginator(url, key_auth, logger) + github_data_access = GithubDataAccess(key_auth, logger) + event_count = github_data_access.get_resource_page_count(url) - num_pages = events.get_num_pages() - all_data = [] - for page_data, page in events.iter_pages(): + logger.info(f"{owner}/{repo}: Collecting {event_count} github events") - if page_data is None: - return all_data - - elif len(page_data) == 0: - logger.debug(f"{repo.capitalize()} Events Page {page} contains no data...returning") - logger.info(f"Events Page {page} of {num_pages}") - return all_data - - logger.info(f"{repo} Events Page {page} of {num_pages}") - - all_data += page_data - - return all_data + return list(github_data_access.paginate_resource(url)) def process_events(events, task_name, repo_id, logger): diff --git a/augur/tasks/github/events/__init__.py b/augur/tasks/github/events/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/events/core.py b/augur/tasks/github/events/core.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 4a3806d507..c2a221037f 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -7,7 +7,7 @@ from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.application.db.models import Contributor from augur.tasks.github.facade_github.core import * -from augur.application.db.lib import execute_sql, get_contributor_aliases_by_email, get_unresolved_commit_emails_by_name, get_contributors_by_full_name +from augur.application.db.lib import execute_sql, get_contributor_aliases_by_email, get_unresolved_commit_emails_by_name, get_contributors_by_full_name, get_repo_by_repo_git from augur.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import * @@ -46,10 +46,10 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id contributors_with_matching_name = get_contributors_by_full_name(name) - if not contributors_with_matching_name: + if not contributors_with_matching_name or len(contributors_with_matching_name) > 1: logger.debug("Failed local login lookup") else: - login = contributors_with_matching_name.gh_login + login = contributors_with_matching_name[0].gh_login # Try to get the login from the commit sha @@ -195,10 +195,9 @@ def insert_facade_contributors(self, repo_git): # Set platform id to 1 since this task is github specific platform_id = 1 - engine = self.app.engine - logger = logging.getLogger(insert_facade_contributors.__name__) - repo_id = None + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id # Get all of the commit data's emails and names from the commit table that do not appear # in the contributors table or the contributors_aliases table. 
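Note on the pagination refactor above: the GithubPaginator page loops are replaced throughout this patch by the new GithubDataAccess helper (added under augur/tasks/github/util/github_data_access.py later in this diff), which exposes get_resource_page_count() and a paginate_resource() generator. Below is a minimal sketch of the resulting collection pattern, using only names that appear in this patch; the module-level logger and the example repository URL are illustrative assumptions, and a configured Augur install with GitHub API keys is required for it to run:

    import logging

    from augur.tasks.github.util.github_data_access import GithubDataAccess
    from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth

    logger = logging.getLogger(__name__)

    # key_auth supplies GitHub API keys, constructed the same way as in the tasks above
    key_auth = GithubRandomKeyAuth(logger)
    github_data_access = GithubDataAccess(key_auth, logger)

    # illustrative endpoint; the events task builds this URL from owner/repo
    url = "https://api.github.com/repos/chaoss/augur/issues/events"

    # log the expected volume up front, then drain the paginated endpoint lazily
    num_pages = github_data_access.get_resource_page_count(url)
    logger.info(f"Collecting {num_pages} pages of GitHub events")

    events = list(github_data_access.paginate_resource(url))

Unlike the old iter_pages() loops, paginate_resource() yields individual records rather than pages, which is why the per-page None/empty checks disappear from the rewritten tasks in this diff.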
diff --git a/augur/tasks/github/issues/tasks.py b/augur/tasks/github/issues.py similarity index 84% rename from augur/tasks/github/issues/tasks.py rename to augur/tasks/github/issues.py index 98a8067eb5..ae1fb07cd9 100644 --- a/augur/tasks/github/issues/tasks.py +++ b/augur/tasks/github/issues.py @@ -1,5 +1,6 @@ import logging import traceback +from datetime import timedelta, timezone from sqlalchemy.exc import IntegrityError @@ -7,19 +8,19 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_paginator import GithubPaginator +from augur.tasks.github.util.github_data_access import GithubDataAccess from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.application.db.models import Issue, IssueLabel, IssueAssignee, Contributor from augur.application.config import get_development_flag -from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_core_data_last_collected development = get_development_flag() @celery.task(base=AugurCoreRepoCollectionTask) -def collect_issues(repo_git : str) -> int: +def collect_issues(repo_git : str, full_collection: bool) -> int: logger = logging.getLogger(collect_issues.__name__) @@ -27,28 +28,35 @@ def collect_issues(repo_git : str) -> int: owner, repo = get_owner_repo(repo_git) + if full_collection: + core_data_last_collected = None + else: + # subtract 2 days to ensure all data is collected + core_data_last_collected = (get_core_data_last_collected(repo_id) - timedelta(days=2)).replace(tzinfo=timezone.utc) + key_auth = GithubRandomKeyAuth(logger) logger.info(f'this is the manifest.key_auth value: {str(key_auth)}') try: - issue_data = retrieve_all_issue_data(repo_git, logger, key_auth) - - if issue_data: - total_issues = len(issue_data) - process_issues(issue_data, f"{owner}/{repo}: Issue task", repo_id, logger) + issue_data = retrieve_all_issue_data(repo_git, logger, key_auth, core_data_last_collected) - return total_issues - else: + if not issue_data: logger.info(f"{owner}/{repo} has no issues") return 0 + + total_issues = len(issue_data) + process_issues(issue_data, f"{owner}/{repo}: Issue task", repo_id, logger) + + return total_issues + except Exception as e: logger.error(f"Could not collect issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") return -1 -def retrieve_all_issue_data(repo_git, logger, key_auth) -> None: +def retrieve_all_issue_data(repo_git, logger, key_auth, since) -> None: owner, repo = get_owner_repo(repo_git) @@ -56,32 +64,17 @@ def retrieve_all_issue_data(repo_git, logger, key_auth) -> None: url = f"https://api.github.com/repos/{owner}/{repo}/issues?state=all" - # returns an iterable of all issues at this url (this essentially means you can treat the issues variable as a list of the issues) - # Reference the code documenation for GithubPaginator for more details - issues = GithubPaginator(url, key_auth, logger) - - # this is defined so we can decrement it each time - # we come across a pr, so at the end we can log how - # many issues were collected - # loop through the issues - all_data = [] - num_pages = issues.get_num_pages() - 
for page_data, page in issues.iter_pages(): - - if page_data is None: - return all_data + if since: + url += f"&since={since.isoformat()}" - if len(page_data) == 0: - logger.debug( - f"{owner}/{repo}: Issues Page {page} contains no data...returning") - logger.info(f"{owner}/{repo}: Issues Page {page} of {num_pages}") - return all_data + github_data_access = GithubDataAccess(key_auth, logger) - logger.info(f"{owner}/{repo}: Issues Page {page} of {num_pages}") + num_pages = github_data_access.get_resource_page_count(url) + logger.info(f"{owner}/{repo}: Retrieving {num_pages} pages of issues") - all_data += page_data + issues_paginator = github_data_access.paginate_resource(url) - return all_data + return list(issues_paginator) def process_issues(issues, task_name, repo_id, logger) -> None: diff --git a/augur/tasks/github/issues/__init__.py b/augur/tasks/github/issues/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/issues/core.py b/augur/tasks/github/issues/core.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages.py similarity index 86% rename from augur/tasks/github/messages/tasks.py rename to augur/tasks/github/messages.py index 3e104fc6dc..d47107c163 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages.py @@ -4,7 +4,7 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_paginator import GithubPaginator +from augur.tasks.github.util.github_data_access import GithubDataAccess from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo @@ -60,34 +60,14 @@ def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, tas # define logger for task logger.info(f"Collecting github comments for {owner}/{repo}") - - # url to get issue and pull request comments - url = f"https://api.github.com/repos/{owner}/{repo}/issues/comments" - - # define database task session, that also holds authentication keys the GithubPaginator needs - # returns an iterable of all issues at this url (this essentially means you can treat the issues variable as a list of the issues) - messages = GithubPaginator(url, key_auth, logger) - - num_pages = messages.get_num_pages() - all_data = [] - for page_data, page in messages.iter_pages(): + github_data_access = GithubDataAccess(key_auth, logger) - if page_data is None: - return all_data + message_count = github_data_access.get_resource_count(url) - elif len(page_data) == 0: - logger.debug(f"{repo.capitalize()} Messages Page {page} contains no data...returning") - logger.info( - f"{task_name}: Page {page} of {num_pages}") - return all_data + logger.info(f"{task_name}: Collecting {message_count} github messages") - logger.info(f"{task_name}: Page {page} of {num_pages}") - - all_data += page_data - - - return all_data + return list(github_data_access.paginate_resource(url)) def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger, key_auth, task_name, augur_db) -> None: @@ -110,20 +90,16 @@ def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger result = connection.execute(query).fetchall() comment_urls = [x[0] for x in result] - all_data = [] - for index, 
comment_url in enumerate(comment_urls): - - logger.info(f"{task_name}: Github messages index {index+1} of {len(comment_urls)}") + github_data_access = GithubDataAccess(key_auth, logger) - messages = GithubPaginator(comment_url, key_auth, logger) - for page_data, _ in messages.iter_pages(): + logger.info(f"{task_name}: Collecting github messages for {len(comment_urls)} prs/issues") - if page_data is None or len(page_data) == 0: - break + all_data = [] + for comment_url in comment_urls: - all_data += page_data + messages = list(github_data_access.paginate_resource(comment_url)) - logger.info(f"All data size: {len(all_data)}") + all_data += messages if len(all_data) >= 20: process_messages(all_data, task_name, repo_id, logger, augur_db) diff --git a/augur/tasks/github/messages/__init__.py b/augur/tasks/github/messages/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/messages/core.py b/augur/tasks/github/messages/core.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/pull_requests/commits_model/core.py b/augur/tasks/github/pull_requests/commits_model/core.py index 7c6f36abfb..f58d875503 100644 --- a/augur/tasks/github/pull_requests/commits_model/core.py +++ b/augur/tasks/github/pull_requests/commits_model/core.py @@ -2,20 +2,40 @@ from augur.tasks.github.util.github_paginator import GithubPaginator from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.lib import bulk_insert_dicts, fetchall_data_from_sql_text +from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_secondary_data_last_collected, get_updated_prs -def pull_request_commits_model(repo,logger, key_auth): + +def pull_request_commits_model(repo_id,logger, augur_db, key_auth, full_collection=False): + + if full_collection: + # query existing PRs and the respective url we will append the commits url to + pr_url_sql = s.sql.text(""" + SELECT DISTINCT pr_url, pull_requests.pull_request_id + FROM pull_requests--, pull_request_meta + WHERE repo_id = :repo_id + """).bindparams(repo_id=repo_id) + pr_urls = [] + #pd.read_sql(pr_number_sql, self.db, params={}) + + pr_urls = augur_db.fetchall_data_from_sql_text(pr_url_sql)#session.execute_sql(pr_number_sql).fetchall() - # query existing PRs and the respective url we will append the commits url to - pr_url_sql = s.sql.text(""" - SELECT DISTINCT pr_url, pull_requests.pull_request_id - FROM pull_requests--, pull_request_meta - WHERE repo_id = :repo_id - """).bindparams(repo_id=repo.repo_id) - pr_urls = [] - #pd.read_sql(pr_number_sql, self.db, params={}) - - pr_urls = fetchall_data_from_sql_text(pr_url_sql) + else: + last_collected = get_secondary_data_last_collected(repo_id).date() + prs = get_updated_prs(repo_id, last_collected) + pr_urls = [pr.pr_url for pr in prs] + + pr_urls = [] + for pr in prs: + pr_urls.append({ + 'pr_url': pr.pr_url, + 'pull_request_id': pr.pull_request_id + }) + + + query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) + repo = execute_session_query(query, 'one') + owner, name = get_owner_repo(repo.repo_git) task_name = f"{owner}/{name} Pr commits" @@ -52,7 +72,7 @@ def pull_request_commits_model(repo,logger, key_auth): if len(all_data) > 0: logger.info(f"{task_name}: Inserting {len(all_data)} rows") pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] - bulk_insert_dicts(logger, all_data,PullRequestCommit,pr_commits_natural_keys) + 
augur_db.insert_data(all_data,PullRequestCommit,pr_commits_natural_keys) diff --git a/augur/tasks/github/pull_requests/commits_model/tasks.py b/augur/tasks/github/pull_requests/commits_model/tasks.py index b1d920e986..e6acdfa90a 100644 --- a/augur/tasks/github/pull_requests/commits_model/tasks.py +++ b/augur/tasks/github/pull_requests/commits_model/tasks.py @@ -2,18 +2,18 @@ from augur.tasks.github.pull_requests.commits_model.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.application.db.lib import get_repo_by_repo_git @celery.task(base=AugurSecondaryRepoCollectionTask) -def process_pull_request_commits(repo_git: str) -> None: +def process_pull_request_commits(repo_git: str, full_collection: bool) -> None: logger = logging.getLogger(process_pull_request_commits.__name__) repo = get_repo_by_repo_git(repo_git) - key_auth = GithubRandomKeyAuth(logger) + with GithubTaskManifest(logger) as manifest: - pull_request_commits_model(repo, logger, key_auth) + pull_request_commits_model(repo.repo_id, logger, manifest.augur_db, manifest.key_auth, full_collection) diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index 537d2bd205..983ac67595 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -1,23 +1,39 @@ import sqlalchemy as s +import httpx from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo from augur.application.db.lib import bulk_insert_dicts, execute_sql from augur.application.db.util import execute_session_query -import traceback -import httpx # Import httpx +from augur.application.db.lib import get_secondary_data_last_collected, get_updated_prs -def pull_request_files_model(repo_id,logger, augur_db, key_auth): - - # query existing PRs and the respective url we will append the commits url to - pr_number_sql = s.sql.text(""" - SELECT DISTINCT pr_src_number as pr_src_number, pull_requests.pull_request_id - FROM pull_requests--, pull_request_meta - WHERE repo_id = :repo_id - """).bindparams(repo_id=repo_id) - pr_numbers = [] - #pd.read_sql(pr_number_sql, self.db, params={}) + +def pull_request_files_model(repo_id,logger, augur_db, key_auth, full_collection=False): + if full_collection: + # query existing PRs and the respective url we will append the commits url to + pr_number_sql = s.sql.text(""" + SELECT DISTINCT pr_src_number as pr_src_number, pull_requests.pull_request_id + FROM pull_requests--, pull_request_meta + WHERE repo_id = :repo_id + """).bindparams(repo_id=repo_id) + pr_numbers = [] + #pd.read_sql(pr_number_sql, self.db, params={}) + + result = augur_db.execute_sql(pr_number_sql)#.fetchall() + pr_numbers = [dict(row) for row in result.mappings()] + + else: + last_collected = get_secondary_data_last_collected(repo_id).date() + prs = get_updated_prs(repo_id, last_collected) + + pr_numbers = [] + for pr in prs: + pr_numbers.append({ + 'pr_src_number': pr.pr_src_number, + 'pull_request_id': pr.pull_request_id + }) + query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) repo = execute_session_query(query, 'one') owner, name = 
get_owner_repo(repo.repo_git) @@ -78,4 +94,4 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth): if len(pr_file_rows) > 0: # Execute a bulk upsert with sqlalchemy pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] - bulk_insert_dicts(logger, pr_file_rows, PullRequestFile, pr_file_natural_keys) + augur_db.insert_data(pr_file_rows, PullRequestFile, pr_file_natural_keys) diff --git a/augur/tasks/github/pull_requests/files_model/tasks.py b/augur/tasks/github/pull_requests/files_model/tasks.py index 988261f6c8..be75c88a9d 100644 --- a/augur/tasks/github/pull_requests/files_model/tasks.py +++ b/augur/tasks/github/pull_requests/files_model/tasks.py @@ -6,7 +6,7 @@ from augur.application.db.util import execute_session_query @celery.task(base=AugurSecondaryRepoCollectionTask) -def process_pull_request_files(repo_git: str) -> None: +def process_pull_request_files(repo_git: str, full_collection: bool) -> None: logger = logging.getLogger(process_pull_request_files.__name__) @@ -15,4 +15,4 @@ def process_pull_request_files(repo_git: str) -> None: query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) repo = execute_session_query(query, 'one') - pull_request_files_model(repo.repo_id, logger, augur_db, manifest.key_auth) \ No newline at end of file + pull_request_files_model(repo.repo_id, logger, augur_db, manifest.key_auth, full_collection) \ No newline at end of file diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index ace6fa7d2b..dd31f726f4 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -1,20 +1,21 @@ import logging +from datetime import datetime, timedelta, timezone from augur.tasks.github.pull_requests.core import extract_data_from_pr_list from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask from augur.application.db.data_parse import * +from augur.tasks.github.util.github_data_access import GithubDataAccess from augur.tasks.github.util.github_paginator import GithubPaginator from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor, Repo from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.application.db.lib import get_session, get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id from augur.application.db.util import execute_session_query -from ..messages.tasks import process_github_comment_contributors -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth - -import httpx +from ..messages import process_github_comment_contributors +from augur.application.db.lib import get_secondary_data_last_collected, get_updated_prs, get_core_data_last_collected from typing import Generator, List, Dict @@ -22,7 +23,7 @@ platform_id = 1 @celery.task(base=AugurCoreRepoCollectionTask) -def collect_pull_requests(repo_git: str) -> int: +def collect_pull_requests(repo_git: str, full_collection: bool) -> int: logger = logging.getLogger(collect_pull_requests.__name__) @@ -36,10 +37,17 @@ def 
collect_pull_requests(repo_git: str) -> int: owner, repo = get_owner_repo(repo_git) + if full_collection: + core_data_last_collected = None + else: + # subtract 2 days to ensure all data is collected + core_data_last_collected = (get_core_data_last_collected(repo_id) - timedelta(days=2)).replace(tzinfo=timezone.utc) + total_count = 0 all_data = [] - for page in retrieve_all_pr_data(repo_git, logger, manifest.key_auth): - all_data += page + for pr in retrieve_all_pr_data(repo_git, logger, manifest.key_auth, core_data_last_collected): + + all_data.append(pr) if len(all_data) >= 1000: process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) @@ -60,31 +68,29 @@ def collect_pull_requests(repo_git: str) -> int: # TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers # TODO: Fix column names in pull request labels table -def retrieve_all_pr_data(repo_git: str, logger, key_auth): #-> Generator[List[Dict]]: +def retrieve_all_pr_data(repo_git: str, logger, key_auth, since): #-> Generator[List[Dict]]: owner, repo = get_owner_repo(repo_git) logger.info(f"Collecting pull requests for {owner}/{repo}") - url = f"https://api.github.com/repos/{owner}/{repo}/pulls?state=all&direction=desc" - # returns an iterable of all prs at this url (this essentially means you can treat the prs variable as a list of the prs) - prs = GithubPaginator(url, key_auth, logger) + url = f"https://api.github.com/repos/{owner}/{repo}/pulls?state=all&direction=desc&sort=updated" - num_pages = prs.get_num_pages() - for page_data, page in prs.iter_pages(): + github_data_access = GithubDataAccess(key_auth, logger) - if page_data is None: - return + num_pages = github_data_access.get_resource_page_count(url) - if len(page_data) == 0: - logger.debug( - f"{owner}/{repo} Prs Page {page} contains no data...returning") - logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}") - return + logger.info(f"{owner}/{repo}: Retrieving {num_pages} pages of pull requests") - logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}") - - yield page_data + # returns a generator so this method can be used by doing for x in retrieve_all_pr_data() + + for pr in github_data_access.paginate_resource(url): + + yield pr + + # return if last pr on the page was updated before the since date + if since and datetime.fromisoformat(pr["updated_at"].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) < since: + return def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): """ @@ -292,7 +298,9 @@ def collect_pull_request_review_comments(repo_git: str) -> None: logger.info(f"Inserting {len(pr_review_comment_dicts)} pr review comments") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] - message_return_data = bulk_insert_dicts(logger, pr_review_comment_dicts, Message, message_natural_keys, message_return_columns) + message_string_fields = ["msg_text"] + message_return_data = bulk_insert_dicts(logger, pr_review_comment_dicts, Message, message_natural_keys, + return_columns=message_return_columns, string_fields=message_string_fields) if message_return_data is None: return @@ -328,7 +336,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: @celery.task(base=AugurSecondaryRepoCollectionTask) -def collect_pull_request_reviews(repo_git: str) -> None: +def collect_pull_request_reviews(repo_git: str, full_collection: bool) -> None: logger = logging.getLogger(collect_pull_request_reviews.__name__) @@ -339,81 
+347,88 @@ def collect_pull_request_reviews(repo_git: str) -> None: data_source = "Github API" repo_id = get_repo_by_repo_git(repo_git).repo_id + with GithubTaskManifest(logger) as manifest: - key_auth = GithubRandomKeyAuth(logger) + augur_db = manifest.augur_db - with get_session() as session: - - query = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) - prs = execute_session_query(query, 'all') + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_id = execute_session_query(query, 'one').repo_id + + if full_collection: - pr_count = len(prs) + query = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) + prs = execute_session_query(query, 'all') + else: + last_collected = get_secondary_data_last_collected(repo_id).date() + prs = get_updated_prs(repo_id, last_collected) - all_pr_reviews = {} - for index, pr in enumerate(prs): + pr_count = len(prs) - pr_number = pr.pr_src_number - pull_request_id = pr.pull_request_id + all_pr_reviews = {} + for index, pr in enumerate(prs): - logger.info(f"{owner}/{repo} Collecting Pr Reviews for pr {index + 1} of {pr_count}") + pr_number = pr.pr_src_number + pull_request_id = pr.pull_request_id - pr_review_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews" + logger.info(f"{owner}/{repo} Collecting Pr Reviews for pr {index + 1} of {pr_count}") - pr_reviews = [] - pr_reviews_generator = GithubPaginator(pr_review_url, key_auth, logger) - for page_data, page in pr_reviews_generator.iter_pages(): - - if page_data is None: - break - - if len(page_data) == 0: - break - - if isinstance(page_data, list): - page_data = [ - element.decode('utf-8').replace('\x00', ' ') if isinstance(element, bytes) else element - for element in page_data - ] - logger.info(f"NUL characters were found in PR Reviews and replaced with spaces.") - elif isinstance(page_data, bytes): - page_data = page_data.decode('utf-8').replace('\x00', ' ') - logger.info(f"NUL characters were found in PR Reviews and replaced with spaces.") + pr_review_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews" + + pr_reviews = [] + pr_reviews_generator = GithubPaginator(pr_review_url, manifest.key_auth, logger) + for page_data, page in pr_reviews_generator.iter_pages(): + + if page_data is None: + break + + if len(page_data) == 0: + break + + if isinstance(page_data, list): + page_data = [ + element.decode('utf-8').replace('\x00', ' ') if isinstance(element, bytes) else element + for element in page_data + ] + logger.info(f"NUL characters were found in PR Reviews and replaced with spaces.") + elif isinstance(page_data, bytes): + page_data = page_data.decode('utf-8').replace('\x00', ' ') + logger.info(f"NUL characters were found in PR Reviews and replaced with spaces.") + + pr_reviews.extend(page_data) - pr_reviews.extend(page_data) - - if pr_reviews: - all_pr_reviews[pull_request_id] = pr_reviews + if pr_reviews: + all_pr_reviews[pull_request_id] = pr_reviews - if not list(all_pr_reviews.keys()): - logger.info(f"{owner}/{repo} No pr reviews for repo") - return + if not list(all_pr_reviews.keys()): + logger.info(f"{owner}/{repo} No pr reviews for repo") + return - contributors = [] - for pull_request_id in all_pr_reviews.keys(): + contributors = [] + for pull_request_id in all_pr_reviews.keys(): - reviews = all_pr_reviews[pull_request_id] - for review in reviews: - contributor = 
process_pull_request_review_contributor(review, tool_source, tool_version, data_source) - if contributor: - contributors.append(contributor) + reviews = all_pr_reviews[pull_request_id] + for review in reviews: + contributor = process_pull_request_review_contributor(review, tool_source, tool_version, data_source) + if contributor: + contributors.append(contributor) - logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") - bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) + logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") + augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) - pr_reviews = [] - for pull_request_id in all_pr_reviews.keys(): + pr_reviews = [] + for pull_request_id in all_pr_reviews.keys(): - reviews = all_pr_reviews[pull_request_id] - for review in reviews: - - if "cntrb_id" in review: - pr_reviews.append(extract_needed_pr_review_data(review, pull_request_id, repo_id, platform_id, tool_source, tool_version)) + reviews = all_pr_reviews[pull_request_id] + for review in reviews: + + if "cntrb_id" in review: + pr_reviews.append(extract_needed_pr_review_data(review, pull_request_id, repo_id, platform_id, tool_source, tool_version)) - logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") - pr_review_natural_keys = ["pr_review_src_id",] - bulk_insert_dicts(logger, pr_reviews, PullRequestReview, pr_review_natural_keys) + logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") + pr_review_natural_keys = ["pr_review_src_id",] + augur_db.insert_data(pr_reviews, PullRequestReview, pr_review_natural_keys) diff --git a/augur/tasks/github/traffic/tasks.py b/augur/tasks/github/traffic.py similarity index 100% rename from augur/tasks/github/traffic/tasks.py rename to augur/tasks/github/traffic.py diff --git a/augur/tasks/github/traffic/__init__.py b/augur/tasks/github/traffic/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/traffic/core.py b/augur/tasks/github/traffic/core.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py new file mode 100644 index 0000000000..2f4c988014 --- /dev/null +++ b/augur/tasks/github/util/github_data_access.py @@ -0,0 +1,190 @@ +import logging +import time +import httpx +from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception, RetryError +from urllib.parse import urlparse, parse_qs, urlencode + + +class RatelimitException(Exception): + + def __init__(self, response, message="Github Rate limit exceeded") -> None: + + self.response = response + + super().__init__(message) + +class UrlNotFoundException(Exception): + pass + +class GithubDataAccess: + + def __init__(self, key_manager, logger: logging.Logger): + + self.logger = logger + self.key_manager = key_manager + + def get_resource_count(self, url): + + # set per_page to 100 explicitly so we know each page is 100 long + params = {"per_page": 100} + url = self.__add_query_params(url, params) + + num_pages = self.get_resource_page_count(url) + + # get data for last page + params = {"page": num_pages} + url = self.__add_query_params(url, params) + + data = self.get_resource(url) + + return (100 * (num_pages -1)) + len(data) + + def paginate_resource(self, url): + + response = self.make_request_with_retries(url) + data = response.json() + + # need to ensure data is a list so yield from works 
properly + if not isinstance(data, list): + raise Exception(f"GithubDataAccess.paginate_resource must be used with a url that returns a list. Use GithubDataAccess.get_resource to retrieve data that is not paginated. The url of {url} returned a {type(data)}.") + + yield from data + + while 'next' in response.links.keys(): + + next_page = response.links['next']['url'] + + response = self.make_request_with_retries(next_page) + data = response.json() + + # need to ensure data is a list so yield from works properly + if not isinstance(data, list): + raise Exception(f"GithubDataAccess.paginate_resource must be used with a url that returns a list. Use GithubDataAccess.get_resource to retrieve data that is not paginated. The url of {url} returned a {type(data)}.") + + yield from data + + return + + def is_pagination_limited_by_max_github_pages(self, url): + + page_count = self.get_resource_page_count(url) + + return page_count <= 299 + + def get_resource_page_count(self, url): + + response = self.make_request_with_retries(url, method="HEAD") + + if 'last' not in response.links.keys(): + return 1 + + try: + last_page_url = response.links['last']['url'] + + parsed_url = urlparse(last_page_url) + + return int(parse_qs(parsed_url.query)['page'][0]) + except (KeyError, ValueError): + raise Exception(f"Unable to parse 'last' url from response: {response.links['last']}") + + def get_resource(self, url): + + response = self.make_request_with_retries(url) + + return response.json() + + # TODO: Handle timeout exceptions better + def make_request(self, url, method="GET", timeout=100): + + with httpx.Client() as client: + + response = client.request(method=method, url=url, auth=self.key_manager, timeout=timeout, follow_redirects=True) + + if response.status_code in [403, 429]: + raise RatelimitException(response) + + if response.status_code == 404: + raise UrlNotFoundException(f"Could not find {url}") + + response.raise_for_status() + + return response + + def make_request_with_retries(self, url, method="GET", timeout=100): + """ What this method does: + 1. Catches tenacity's RetryError and re-raises the last exception thrown by the final retry attempt + """ + + try: + return self.__make_request_with_retries(url, method, timeout) + except RetryError as e: + raise e.last_attempt.exception() + + @retry(stop=stop_after_attempt(10), wait=wait_fixed(5), retry=retry_if_exception(lambda exc: not isinstance(exc, UrlNotFoundException))) + def __make_request_with_retries(self, url, method="GET", timeout=100): + """ What this method does: + 1. Retries 10 times + 2. Waits 5 seconds between retries + 3. Does not retry UrlNotFoundException + 4. 
Catches RatelimitException and waits out the rate limit before re-raising so the request is retried + """ + + try: + return self.make_request(url, method, timeout) + except RatelimitException as e: + self.__handle_github_ratelimit_response(e.response) + raise e + + def __handle_github_ratelimit_response(self, response): + + headers = response.headers + + if "Retry-After" in headers: + + retry_after = int(headers["Retry-After"]) + self.logger.info( + f'\n\n\n\nSleeping for {retry_after} seconds due to secondary rate limit issue.\n\n\n\n') + time.sleep(retry_after) + + elif "X-RateLimit-Remaining" in headers and int(headers["X-RateLimit-Remaining"]) == 0: + current_epoch = int(time.time()) + epoch_when_key_resets = int(headers["X-RateLimit-Reset"]) + key_reset_time = epoch_when_key_resets - current_epoch + + if key_reset_time < 0: + self.logger.error(f"Key reset time was less than 0; setting it to 0.\nThe current epoch is {current_epoch} and the epoch that the key resets at is {epoch_when_key_resets}") + key_reset_time = 0 + + self.logger.info(f"\n\n\nAPI rate limit exceeded. Sleeping until the key resets ({key_reset_time} seconds)") + time.sleep(key_reset_time) + else: + time.sleep(60) + + def __add_query_params(self, url: str, additional_params: dict) -> str: + """Add query params to a url. + + Args: + url: the url that is being modified + additional_params: key value pairs specifying the parameters to be added + + Returns: + The url with the key value pairs in additional_params added as query params + """ + url_components = urlparse(url) + original_params = parse_qs(url_components.query) + # Before Python 3.5 you could update original_params with + # additional_params, but here all the variables are immutable. + merged_params = {**original_params, **additional_params} + updated_query = urlencode(merged_params, doseq=True) + # _replace() is how you can create a new NamedTuple with a changed field + return url_components._replace(query=updated_query).geturl() + + + + + + + + + + diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index e57fb674d2..da97751db9 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -26,18 +26,18 @@ 'augur.tasks.data_analysis', 'augur.tasks.util.collection_util'] -github_tasks = ['augur.tasks.github.contributors.tasks', - 'augur.tasks.github.issues.tasks', +github_tasks = ['augur.tasks.github.contributors', + 'augur.tasks.github.issues', 'augur.tasks.github.pull_requests.tasks', - 'augur.tasks.github.events.tasks', - 'augur.tasks.github.messages.tasks', + 'augur.tasks.github.events', + 'augur.tasks.github.messages', 'augur.tasks.github.facade_github.tasks', 'augur.tasks.github.releases.tasks', 'augur.tasks.github.repo_info.tasks', 'augur.tasks.github.detect_move.tasks', 'augur.tasks.github.pull_requests.files_model.tasks', 'augur.tasks.github.pull_requests.commits_model.tasks', - 'augur.tasks.github.traffic.tasks'] + 'augur.tasks.github.traffic'] gitlab_tasks = ['augur.tasks.gitlab.merge_request_task', 'augur.tasks.gitlab.issues_task', diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 6b35881d60..562069ce84 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -15,7 +15,7 @@ from augur.tasks.github.pull_requests.files_model.tasks import process_pull_request_files from augur.tasks.github.pull_requests.commits_model.tasks import process_pull_request_commits from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics -from augur.tasks.github.traffic.tasks import 
collect_github_repo_clones_data +from augur.tasks.github.traffic import collect_github_repo_clones_data from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_metadata, collect_merge_request_commits, collect_merge_request_files, collect_merge_request_comments from augur.tasks.gitlab.issues_task import collect_gitlab_issues, collect_gitlab_issue_comments from augur.tasks.gitlab.events_task import collect_gitlab_issue_events, collect_gitlab_merge_request_events @@ -29,6 +29,8 @@ from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor from augur.application.db.lib import execute_sql, get_session +RUNNING_DOCKER = os.environ.get('AUGUR_DOCKER_DEPLOY') == "1" + CELERY_GROUP_TYPE = type(group()) CELERY_CHAIN_TYPE = type(chain()) @@ -43,13 +45,13 @@ """ #Prelim phases are used to detect if where the repo has hosted has moved or not. -def prelim_phase(repo_git): +def prelim_phase(repo_git, full_collection): logger = logging.getLogger(prelim_phase.__name__) return detect_github_repo_move_core.si(repo_git) -def prelim_phase_secondary(repo_git): +def prelim_phase_secondary(repo_git, full_collection): logger = logging.getLogger(prelim_phase.__name__) return detect_github_repo_move_secondary.si(repo_git) @@ -57,14 +59,14 @@ def prelim_phase_secondary(repo_git): #This is the phase that defines the message for core augur collection #A chain is needed for each repo. -def primary_repo_collect_phase(repo_git): +def primary_repo_collect_phase(repo_git, full_collection): logger = logging.getLogger(primary_repo_collect_phase.__name__) #Define primary group of jobs for the primary collect phase: issues and pull requests. primary_repo_jobs = group( - collect_issues.si(repo_git), - collect_pull_requests.si(repo_git) + collect_issues.si(repo_git, full_collection), + collect_pull_requests.si(repo_git, full_collection) ) #Define secondary group that can't run until after primary jobs have finished. @@ -86,7 +88,7 @@ def primary_repo_collect_phase(repo_git): return repo_task_group -def primary_repo_collect_phase_gitlab(repo_git): +def primary_repo_collect_phase_gitlab(repo_git, full_collection): logger = logging.getLogger(primary_repo_collect_phase_gitlab.__name__) @@ -110,13 +112,13 @@ def primary_repo_collect_phase_gitlab(repo_git): #This phase creates the message for secondary collection tasks. #These are less important and have their own worker. 
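(Aside: the calling convention introduced in these start_tasks.py hunks — every phase builder now accepting (repo_git, full_collection) and returning a chainable task signature — can be sketched without Celery. The names Phase, primary_phase, success_phase and build_sequence below are illustrative stand-ins, not Augur or Celery APIs; this is a rough sketch of the pattern, not the implementation.)

from typing import Callable, List

# A "phase builder" mirrors the signatures above: it accepts
# (repo_git, full_collection) and returns a zero-argument callable,
# roughly playing the role of a Celery signature built with .si(...).
Phase = Callable[[str, bool], Callable[[], None]]

def primary_phase(repo_git: str, full_collection: bool) -> Callable[[], None]:
    def run() -> None:
        mode = "full" if full_collection else "incremental"
        print(f"collecting {repo_git} ({mode})")
    return run

def success_phase(repo_git: str, full_collection: bool) -> Callable[[], None]:
    # Some builders ignore full_collection but keep the same signature
    # so the scheduler can call every enabled phase uniformly.
    def run() -> None:
        print(f"marking {repo_git} as collected")
    return run

def build_sequence(repo_git: str, full_collection: bool, phases: List[Phase]) -> List[Callable[[], None]]:
    # Mirrors send_messages() appending job(repo_git, full_collection)
    # for each enabled phase before chaining the results together.
    return [phase(repo_git, full_collection) for phase in phases]

if __name__ == "__main__":
    for task in build_sequence("https://github.com/chaoss/augur", False, [primary_phase, success_phase]):
        task()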
-def secondary_repo_collect_phase(repo_git): +def secondary_repo_collect_phase(repo_git, full_collection): logger = logging.getLogger(secondary_repo_collect_phase.__name__) repo_task_group = group( - process_pull_request_files.si(repo_git), - process_pull_request_commits.si(repo_git), - chain(collect_pull_request_reviews.si(repo_git), collect_pull_request_review_comments.si(repo_git)), + process_pull_request_files.si(repo_git, full_collection), + process_pull_request_commits.si(repo_git, full_collection), + chain(collect_pull_request_reviews.si(repo_git, full_collection), collect_pull_request_review_comments.si(repo_git)), process_ossf_dependency_metrics.si(repo_git) ) @@ -140,9 +142,7 @@ def non_repo_domain_tasks(self): enabled_tasks = [] - enabled_tasks.extend(generate_non_repo_domain_facade_tasks(logger)) - - if machine_learning_phase.__name__ in enabled_phase_names: + if not RUNNING_DOCKER and machine_learning_phase.__name__ in enabled_phase_names: #enabled_tasks.extend(machine_learning_phase()) from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model enabled_tasks.append(contributor_breadth_model.si()) @@ -167,7 +167,7 @@ def build_primary_repo_collect_request(session, logger, enabled_phase_names, day primary_gitlab_enabled_phases.append(primary_repo_collect_phase_gitlab) #task success is scheduled no matter what the config says. - def core_task_success_util_gen(repo_git): + def core_task_success_util_gen(repo_git, full_collection): return core_task_success_util.si(repo_git) primary_enabled_phases.append(core_task_success_util_gen) @@ -187,7 +187,7 @@ def build_secondary_repo_collect_request(session, logger, enabled_phase_names, d secondary_enabled_phases.append(secondary_repo_collect_phase) - def secondary_task_success_util_gen(repo_git): + def secondary_task_success_util_gen(repo_git, full_collection): return secondary_task_success_util.si(repo_git) secondary_enabled_phases.append(secondary_task_success_util_gen) @@ -203,12 +203,12 @@ def build_facade_repo_collect_request(session, logger, enabled_phase_names, days facade_enabled_phases.append(facade_phase) - def facade_task_success_util_gen(repo_git): + def facade_task_success_util_gen(repo_git, full_collection): return facade_task_success_util.si(repo_git) facade_enabled_phases.append(facade_task_success_util_gen) - def facade_task_update_weight_util_gen(repo_git): + def facade_task_update_weight_util_gen(repo_git, full_collection): return git_update_commit_count_weight.si(repo_git) facade_enabled_phases.append(facade_task_update_weight_util_gen) @@ -223,7 +223,7 @@ def build_ml_repo_collect_request(session, logger, enabled_phase_names, days_unt ml_enabled_phases.append(machine_learning_phase) - def ml_task_success_util_gen(repo_git): + def ml_task_success_util_gen(repo_git, full_collection): return ml_task_success_util.si(repo_git) ml_enabled_phases.append(ml_task_success_util_gen) @@ -260,7 +260,7 @@ def augur_collection_monitor(self): #start_facade_collection(session, max_repo=30) enabled_collection_hooks.append(build_facade_repo_collect_request(session, logger, enabled_phase_names)) - if machine_learning_phase.__name__ in enabled_phase_names: + if not RUNNING_DOCKER and machine_learning_phase.__name__ in enabled_phase_names: enabled_collection_hooks.append(build_ml_repo_collect_request(session, logger, enabled_phase_names)) #start_ml_collection(session,max_repo=5) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 
f8156c8bf9..a31fbbbf2c 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -137,7 +137,8 @@ def get_valid_repos(self,session): if limit <= 0: return - collection_list = get_newly_added_repos(session, limit, hook=self.name) + new_collection_git_list = get_newly_added_repos(session, limit, hook=self.name) + collection_list = [(repo_git, True) for repo_git in new_collection_git_list] self.repo_list.extend(collection_list) limit -= len(collection_list) @@ -145,8 +146,8 @@ def get_valid_repos(self,session): if limit <= 0: return - collection_list = get_repos_for_recollection(session, limit, hook=self.name, days_until_collect_again=self.days_until_collect_again) - + recollection_git_list = get_repos_for_recollection(session, limit, hook=self.name, days_until_collect_again=self.days_until_collect_again) + collection_list = [(repo_git, False) for repo_git in recollection_git_list] self.repo_list.extend(collection_list) @@ -589,8 +590,8 @@ def send_messages(self): for col_hook in self.collection_hooks: self.logger.info(f"Starting collection on {len(col_hook.repo_list)} {col_hook.name} repos") - - for repo_git in col_hook.repo_list: + + for repo_git, full_collection in col_hook.repo_list: repo = get_repo_by_repo_git(repo_git) if "github" in repo.repo_git: @@ -598,7 +599,7 @@ def send_messages(self): for job in col_hook.phases: #Add the phase to the sequence in order as a celery task. #The preliminary task creates the larger task chain - augur_collection_sequence.append(job(repo_git)) + augur_collection_sequence.append(job(repo_git, full_collection)) #augur_collection_sequence.append(core_task_success_util.si(repo_git)) #Link all phases in a chain and send to celery @@ -616,7 +617,7 @@ def send_messages(self): for job in col_hook.gitlab_phases: #Add the phase to the sequence in order as a celery task. #The preliminary task creates the larger task chain - augur_collection_sequence.append(job(repo_git)) + augur_collection_sequence.append(job(repo_git, full_collection)) #augur_collection_sequence.append(core_task_success_util.si(repo_git)) #Link all phases in a chain and send to celery diff --git a/augur/tasks/util/worker_util.py b/augur/tasks/util/worker_util.py index 6198f1ccdb..51b0109faa 100644 --- a/augur/tasks/util/worker_util.py +++ b/augur/tasks/util/worker_util.py @@ -135,7 +135,10 @@ def parse_json_from_subprocess_call(logger, subprocess_arr, cwd=None): output = p.stdout try: - required_output = json.loads(output) + if output and output.strip(): + required_output = json.loads(output) + else: + required_output = {} except json.decoder.JSONDecodeError as e: logger.error(f"Could not parse required output! \n output: {output} \n Error: {e}") raise e diff --git a/augur/templates/first-time-config.j2 b/augur/templates/first-time-config.j2 new file mode 100644 index 0000000000..c9c2106d6c --- /dev/null +++ b/augur/templates/first-time-config.j2 @@ -0,0 +1,287 @@ +{# https://www.bootdey.com/snippets/view/dark-profile-settings #} + + + +
+No database config exists
+Take a moment to create or update the configuration for your instance.
+Default values are shown. When you are done updating, click the restart button to save the settings and bring the primary server up.
+Default values are shown. When you are done updating, click the continue button to continue to the primary configuration setup.
+Double-click an empty input field to automatically populate it with the placeholder value.
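To close out this set of changes, here is a simplified, self-contained sketch of the two collection ideas introduced above: following GitHub's Link headers page by page (what GithubDataAccess.paginate_resource does) and stopping early once results fall behind an incremental "since" cutoff (the updated_at check in retrieve_all_pr_data). This is a stand-in rather than the Augur class itself; the bearer-token header, the paginate_since name, and the example cutoff are assumptions for illustration.

from datetime import datetime, timedelta, timezone
from typing import Iterator, Optional
import httpx

def paginate_since(url: str, token: str, since: Optional[datetime]) -> Iterator[dict]:
    # Assumes the url is sorted by update time descending (sort=updated&direction=desc),
    # mirroring the early-return check in retrieve_all_pr_data above.
    headers = {"Authorization": f"Bearer {token}"}  # assumed auth scheme for illustration
    with httpx.Client(follow_redirects=True, timeout=100) as client:
        while url:
            response = client.get(url, headers=headers)
            response.raise_for_status()
            for item in response.json():
                yield item
                updated = datetime.fromisoformat(item["updated_at"].replace("Z", "+00:00")).replace(tzinfo=timezone.utc)
                if since and updated < since:
                    return  # everything beyond this point is older than the cutoff
            # GitHub advertises the next page in the Link header; httpx parses it into response.links
            url = response.links.get("next", {}).get("url")

# Example cutoff, echoing collect_pull_requests: two days before the last recorded collection.
# last_collected = datetime(2024, 1, 1, tzinfo=timezone.utc)  # placeholder value
# since = (last_collected - timedelta(days=2)).replace(tzinfo=timezone.utc)
# for pr in paginate_since("https://api.github.com/repos/chaoss/augur/pulls?state=all&direction=desc&sort=updated", "<token>", since):
#     print(pr["number"])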