Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: analytics exporter #1767

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 32 additions & 14 deletions dataeng/jobs/analytics/AnalyticsExporter.groovy
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package analytics
import static org.edx.jenkins.dsl.AnalyticsConstants.common_publishers
import static org.edx.jenkins.dsl.AnalyticsConstants.secure_scm
import static org.edx.jenkins.dsl.AnalyticsConstants.secure_scm_parameters
import static org.edx.jenkins.dsl.AnalyticsConstants.config_scm
import static org.edx.jenkins.dsl.AnalyticsConstants.opsgenie_heartbeat_publisher

class AnalyticsExporter {
Expand All @@ -19,7 +18,6 @@ class AnalyticsExporter {
stringParam('TASKS', '', 'Space separated list of tasks to process. Leave this blank to use the task list specified in the config file. Specify here only if you are running tests of a specific task.')
stringParam('PYTHON_VENV_VERSION', 'python3.7', 'Python virtual environment version to used.')
}
parameters secure_scm_parameters(allVars)

environmentVariables {
env('REMOTE_CONFIG_PROD_EDX_ROLE_ARN', allVars.get('REMOTE_CONFIG_PROD_EDX_ROLE_ARN'))
Expand All @@ -32,7 +30,7 @@ class AnalyticsExporter {
env('REMOTE_CONFIG_DECRYPTION_KEYS_VAULT_KV_VERSION', allVars.get('REMOTE_CONFIG_DECRYPTION_KEYS_VAULT_KV_VERSION'))
}

multiscm secure_scm(allVars) << {
multiscm config_scm(allVars) << {
git {
remote {
url('[email protected]:openedx/edx-platform.git')
Expand All @@ -55,6 +53,17 @@ class AnalyticsExporter {
relativeTargetDirectory('analytics-exporter')
}
}
git {
remote {
url('[email protected]:edx/analytics-tools.git')
branch('master')
credentials('1')
}
extensions {
pruneBranches()
relativeTargetDirectory('analytics-tools')
}
}

}

Expand All @@ -73,7 +82,7 @@ class AnalyticsExporter {
}
}

dslFactory.job('analytics-exporter-worker') {
dslFactory.job('analytics-exporter-worker-test') {
description('This is a worker/downstream job to the Analytics Exporter. It does all of the legwork of exporting/encrypting the data for a given org. See also: analytics-exporter-master.')
parameters {
stringParam('NOTIFY')
Expand All @@ -89,7 +98,6 @@ class AnalyticsExporter {
stringParam('EXTRA_OPTIONS')
stringParam('PYTHON_VENV_VERSION', 'python3.7', 'Python version to use for creating virtualenv.')
}
parameters secure_scm_parameters(allVars)

environmentVariables {
env('REMOTE_CONFIG_PROD_EDX_ROLE_ARN', allVars.get('REMOTE_CONFIG_PROD_EDX_ROLE_ARN'))
Expand All @@ -113,7 +121,7 @@ class AnalyticsExporter {

concurrentBuild()

multiscm secure_scm(allVars)
multiscm config_scm(allVars)

wrappers {
timestamps()
Expand All @@ -139,29 +147,29 @@ class AnalyticsExporter {
}
}

dslFactory.job('analytics-exporter-master') {
dslFactory.job('analytics-exporter-master-test') {
description('The Analytics Exporter weekly job, which exports tons of structure and state data for every course for every participating org and delivers them encrypted to our partners via S3. Specifically, this sets up the shared edx-platform execution environment, fetches a list of all the orgs, then kicks off downstream analytics-exporter-worker jobs for each one that corresponds to a partner which is configured to receive export data.')
parameters {
stringParam('ORGS', '*', 'Space separated list of organizations to process. Can use wildcards. e.g.: idbx HarvardX')
stringParam('EXPORTER_BRANCH', 'origin/master', 'Branch from the edx-analytics-exporter repository. For tags use tags/[tag-name].')
stringParam('PLATFORM_BRANCH', 'origin/2u/release', 'Branch from the edx-platform repository. For tags use tags/[tag-name].')
stringParam('EXPORTER_CONFIG_FILENAME', 'default.yaml', 'Name of configuration file in analytics-secure/analytics-exporter.')
stringParam('OUTPUT_BUCKET', allVars.get('EXPORTER_OUTPUT_BUCKET'), 'Name of the bucket for the destination of the export data. Can use a path. (eg. export-data/test).')
stringParam('NOTIFY', allVars.get('ANALYTICS_EXPORTER_NOTIFY_LIST'), 'Space separated list of emails to notify in case of failure.')
stringParam('OUTPUT_BUCKET', 's3://edx-analytics-scratch/analytics-test', 'Name of the bucket for the destination of the export data. Can use a path. (eg. export-data/test).')
stringParam('NOTIFY', '', 'Space separated list of emails to notify in case of failure.')

stringParam('DATE_MODIFIER', '', 'Used to set the date of the CWSM dump. Leave blank to use today\'s date. Set to "-d 202x-0x-0x" if that is when the CWSM dump took place. (Leave off quotes.)')
stringParam('EXTRA_OPTIONS', '--exclude-task=OrgEmailOptInTask', 'e.g. --exclude-task=OrgEmailOptInTask')
stringParam('ORG_CONFIG', 'data-czar-keys/config.yaml', 'Path to the data-czar organization config file.')
stringParam('DATA_CZAR_KEYS_BRANCH', 'master', 'Branch to use for the data-czar-keys repository.')
stringParam('PRIORITY_ORGS', allVars.get('PRIORITY_ORGS'), 'Space separated list of organizations to process first.')
}
parameters secure_scm_parameters(allVars)
environmentVariables {
env('OPSGENIE_HEARTBEAT_NAME', allVars.get('OPSGENIE_HEARTBEAT_NAME'))
env('OPSGENIE_HEARTBEAT_DURATION_NUM', allVars.get('OPSGENIE_HEARTBEAT_DURATION_NUM'))
env('OPSGENIE_HEARTBEAT_DURATION_UNIT', allVars.get('OPSGENIE_HEARTBEAT_DURATION_UNIT'))
}

multiscm secure_scm(allVars) << {
multiscm config_scm(allVars) << {
git {
remote {
url('[email protected]:openedx/edx-platform.git')
Expand Down Expand Up @@ -194,7 +202,17 @@ class AnalyticsExporter {
relativeTargetDirectory('data-czar-keys')
}
}

git {
remote {
url('[email protected]:edx/analytics-tools.git')
branch('master')
credentials('1')
}
extensions {
pruneBranches()
relativeTargetDirectory('analytics-tools')
}
}
}

triggers{
Expand All @@ -220,7 +238,7 @@ class AnalyticsExporter {
shell(dslFactory.readFileFromWorkspace('dataeng/resources/setup-platform-venv-py3.sh'))
shell(dslFactory.readFileFromWorkspace('dataeng/resources/setup-exporter.sh'))
downstreamParameterized {
trigger('analytics-exporter-worker') {
trigger('analytics-exporter-worker-test') {
block {
// Mark this build step as FAILURE if at least one of the downstream builds were marked FAILED.
buildStepFailure('FAILURE')
Expand Down
13 changes: 10 additions & 3 deletions dataeng/resources/setup-exporter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ mkdir -p /var/lib/jenkins/tmp/analytics-exporter/course-data

# Create and activate a virtualenv in shell script
EXPORTER_VENV="exporter_venv"
virtualenv --python=python3.8 --clear "${EXPORTER_VENV}"
virtualenv --python=python3.11 --clear "${EXPORTER_VENV}"
source "${EXPORTER_VENV}/bin/activate"

# Install requirements into this (exporter) virtual environment
Expand All @@ -16,8 +16,15 @@ pip install -r github_requirements.txt
pip install mysql-connector-python -e .
popd

# Configuration paths in analytics-secure
SECURE_ROOT=${WORKSPACE}/analytics-secure/analytics-exporter


cd analytics-tools/snowflake
pip install argparse boto3
python3 secrets-manager.py -w -n analytics-secure/analytics-exporter/task-auth.json -v task-auth.json
Copy link
Contributor

@HassanJaveed84 HassanJaveed84 May 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at

stringParam('EXPORTER_CONFIG_FILENAME', 'default.yaml', 'Name of configuration file in analytics-secure/analytics-exporter.')
, it looks like we also need default.yaml in addition to task-auth.json.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I've forgotten: where did we decide to keep the secrets-manager.py script?
Is it going to be part of the analytics-tools repo?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed that the path here is analytics-secure/analytics-exporter/task-auth.json; shouldn't it be analytics-config?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it goes in the analytics-tools repo. It should be in there already.

Task-auth has sensitive values, so I was going to pull it from Secrets Manager.

cd ../../

# Configuration paths in analytics-config
SECURE_ROOT=${WORKSPACE}/analytics-config/analytics-exporter
CONFIG_PATH=${SECURE_ROOT}/${EXPORTER_CONFIG_FILENAME}
GPG_KEYS_PATH=${WORKSPACE}/data-czar-keys

Expand Down
14 changes: 14 additions & 0 deletions src/main/groovy/org/edx/jenkins/dsl/AnalyticsConstants.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,20 @@ class AnalyticsConstants {
}
}

/**
 * SCM definition for the analytics-config repository.
 *
 * Takes the job's variable map and returns a closure that declares a single
 * `git` checkout: branch `master`, pruned, placed in the `analytics-config`
 * target directory.
 *
 * @param allVars job variable map; not read here, but accepted so this closure
 *                has the same shape as the neighbouring *_scm definitions
 * @return a closure containing the analytics-config `git` block
 */
public static def config_scm = { allVars ->
    return {
        git {
            remote {
                // NOTE(review): the user@host portion of this URL looks redacted in
                // this view of the file -- confirm the real SSH remote before merging.
                url('[email protected]:edx/analytics-config.git')
                branch('master')
                // Use the same credential id as the other git remotes declared by the
                // exporter jobs, so the SSH clone does not rely on agent-local keys.
                // NOTE(review): confirm credential '1' has read access to this repo.
                credentials('1')
            }
            extensions {
                pruneBranches()
                relativeTargetDirectory('analytics-config')
            }
        }
    }
}
public static def data_czar_keys_scm = { allVars ->
return {
git {
Expand Down
Loading