From b6c39e280aed407bb9dc087ab430c69c3628dcc2 Mon Sep 17 00:00:00 2001 From: yuenmichelle1 Date: Thu, 18 Apr 2024 11:48:37 -0500 Subject: [PATCH] Add cronjob to backfill corporate partner user groups' memberships and classifications (#57) * update comments on backfill scripts to give context of what previous scripts where used for * adding comments for context on old scripts * update backfill script in ruby * update env variables to use Rails.app.credentials. attempt at cron job kubernetes template * Update cron_sync.yml * Update user_group_membership_classification_backfill.py * update panoptes_membership_client.rb to create insert query using array.new vs times.map per hound * adding a more descriptive comment insert query * add manual sync * Update .github/workflows/manual_corp_user_group_sync.yml Co-authored-by: Zach Wolfenbarger * Update .github/workflows/manual_corp_user_group_sync.yml Co-authored-by: Zach Wolfenbarger * Update .github/workflows/manual_corp_user_group_sync.yml Co-authored-by: Zach Wolfenbarger * Update .github/workflows/manual_corp_user_group_sync.yml Co-authored-by: Zach Wolfenbarger * Update .github/workflows/manual_corp_user_group_sync.yml Co-authored-by: Zach Wolfenbarger --------- Co-authored-by: Zach Wolfenbarger --- .../workflows/manual_corp_user_group_sync.yml | 32 ++++++++ config/credentials/production.yml.enc | 2 +- kubernetes/corp_user_groups_cron_sync.yml | 28 +++++++ kubernetes/manual_corp_user_group_sync.yml | 25 +++++++ scripts/backfill_classifications.py | 7 ++ scripts/backfill_talk_comments.py | 5 ++ scripts/copy_classifications_from_files.py | 8 ++ scripts/panoptes_membership_client.rb | 52 +++++++++++++ .../save_classifications_chunk_in_files.py | 8 ++ ...roup_membership_classification_backfill.py | 3 +- ...roup_membership_classification_backfill.rb | 74 +++++++++++++++++++ 11 files changed, 241 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/manual_corp_user_group_sync.yml create mode 100644 kubernetes/corp_user_groups_cron_sync.yml create mode 100644 kubernetes/manual_corp_user_group_sync.yml create mode 100644 scripts/panoptes_membership_client.rb create mode 100644 scripts/user_group_membership_classification_backfill.rb diff --git a/.github/workflows/manual_corp_user_group_sync.yml b/.github/workflows/manual_corp_user_group_sync.yml new file mode 100644 index 0000000..a292d45 --- /dev/null +++ b/.github/workflows/manual_corp_user_group_sync.yml @@ -0,0 +1,32 @@ +name: Manually Sync Eras Corporate User Group Sync + +on: + workflow_dispatch: + +jobs: + manual_sync: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3.1.0 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - uses: azure/login@v2 + with: + creds: ${{ secrets.AZURE_AKS }} + + - name: Set the target AKS cluster + uses: Azure/aks-set-context@v4.0.0 + with: + cluster-name: microservices + resource-group: kubernetes + + - name: Modify & apply template + run: kubectl create -f kubernetes/manual_corp_user_group_sync.yml + \ No newline at end of file diff --git a/config/credentials/production.yml.enc b/config/credentials/production.yml.enc index 9e0b707..d5936ab 100644 --- a/config/credentials/production.yml.enc +++ b/config/credentials/production.yml.enc @@ -1 +1 @@ -lCfkIMPH/fwl6Cf1K+EWfeiHK/ITstJhGzRXMTS7XqvUG2odRAmlIlbRDibfLoGPj7kPMJzwvnpHgZN2ey09RS1dp3ZzrorY6JC3gnnsrB9rX0+dLZihtj+Tlvtzwd8H4Yv3OM3um0jWr9DD3sOq4N3a7kSm1Wqkr+UqqGrWLusESvEeDV/c6sJkjjI8q0Sbv34Mc3inSOZVcIk1DocxneYA1dSWDOWqbhF1vd2p53ubqGJIq3RPQEt7881IECek3iDuBXPPVaw6H2KaerBv0utnMhmST3mcNtAeAEzWcw/j8TztMqvaFowmp5+lo0jNJR4wNOUQaT7XiOtRFZfA6i9T0gUtqAz+gZxJshRarYpbKUb703PvQ4WAsh2Fs9g+v0PPuW2sSK6tfEzn84c9EQUCeQVKTeATsQlXAGuO9vYdKTw0ahnJzdFfql5LXUH7VJ+12cNbOanjVFjykcs3qY2N69PK1/ESAPqkyK8Fe0fosfAepUVm3rd7VLNMc2qGdWAxTAfdj59xYNaGU97+xRSATBMUdsGNoFcyv7WTUUxcn01Qz9WqyLdCzyIABEwvcXWtc3pNJ3U7bwC0fsqQkL+zqt883eYGDxU4Mn0H5IX21OfqTYDuPB3mQltuQe2KUq2KZcCLtBVce0nRGKx82dEhnjP+PRWk7moCsG8eyNqqi1i1ZqZcDemPTXL48+IgCLfmMYju7uDWTM5OWG9e+TWEqUowwUV+Gp5AoGKUt/DtfityIQmdJde7+6YaxYDsDEKZfP6na4C2Zyt6qXtCWYIBXo8sEuEE7j38lrWFgNODc8k53PgSBp6R0lPjzw5/U9HPzBKmkn705JdfRN5kYbKVpCPOQqMpjYU1jSWpqdO4ZgobEWT5n3XtHCuV6oO/P5t2lbwm+mVuVotrjv3p7/x+CbN+nTvA9Kma7mOdK8FMrQlqXD/FpnSvSLuO5/3Q7LltdKF3njmmUmsqFnbpXOaxKpgDb+ijeuwndEZ1hvyLepcxahBevbOS+ZOtdWHmfGhTlZ4uCgbgAXJ04FH8xFrf1/VG4GtMHuapT6tlJcxqBer4--8e7ThvhXjefdpI+0--QG+VUWhyp9cydSvpzydc0w== \ No newline at end of file +hD5l4bBd4gVw4DS/oeGO3XDXcwPUCSVCt+TFbmQsy+Hx4mBSTLF22nRljM/KDXPSU2b2RqbOl3AcDTeZWlNHqK0OUL68CPjrOWK0KFOqKPgqsh+ydEcjpCWkm1A5mQdxCO4GQ2zjZwZxeQvZbvGiPz1sc4WpVd5zILUcYeiK2P8lLqcLdSHroQbG9+L/0VT8d+0B1mNsdJ339S3cI0y+4Cp9evwi3l7fjIRpHN10tPFJJkuKd90syGOif1tDvtScl1bfkK3f7xmNL2lzDBMEGq2BzhFte/QZmBKYnHEeAolmMthLMQqQdZTftGUrxaZeh4OtzcKwerox81ZmUjVkubTGJEsSrB2Q8iR5RMZNscmjbzem6YEiT4mef7HeAECasZG5IbwHAMMh+s/BnKVWUlS0Srit5Zt40Judyde1K1UgjxsCsqWJS4JYcfhkmBzPi7x1paxGlWcezsEAJZFqjiqzIzvc4cvDoyCaEO1FMfbvOlHWgTH5A2xV2xnCiwE2DgkrVR9a9vaCkwxAdNV44Esq8npeVkytF39khDuaWWsn2dh+3j6vcnzGyI2gbIgyYAv7aV0+JEPBGRlPHHhVt3nUzYQUHaJHUL+yhjkIuMKxIilkOig+RhZDjdujR+kZdqP8KyVXuAg8Ui/h+/kG2XMtQR8YhHIQZFhZk9TVQQK2IC7PKmqJEo/V8zh53q37vaODthUNzACp9ww7UsZ6/35oconmuVpa1Ucijahw5Fiv1WEGG2oI0QPEhEQToG5dWHi2DbYwmQe1d/deG/CdFKMhdGaCGeOC/fZWTtUqKfDp70eytfic8I1jwsK1p3smO3/PZYNXwo78VXaZVKepqhp8IgVxPoNIY4YtmaEpigcHMmm409quT7/mX3fTpdGqcUPBVxx94w5xYVACmStGIGP2AesIgfdbxz0HzDf1aNMj8iEZuyo1gQ4rUiD6YygbzdDXOjuVa3Aq8ZvO3ixkJWFB65w6+KsLmt93zodHndH4N62PnmUw6eAJCmIxbBaJqU1kim8IQd0zXv5Bx3hMq7GVulMZccsIzZoCWDACJic+jsYM3Rp7UBP1WYsGpfOSWC14PXQGVwmXKhG3hn3a/Lim/nkMKUclx1Hy2W+qpDPKDl27uQPxF9wt4ZwkAyBdKXgDCpq11GPI2Kkj5Zvhrx+labnulixV6HfZpQJdcVk11Rqf5O8l6v/AHjSaQ6qrEztNLZL/IE5cqeWiZZZOIxUQuYVa6aldsEH9+niHzD6/U6JvHVP0dRj5PNF0TepNtvFkzUKZI+fTG6R6OYyMSBcOOW7i6brSCNM1I+Yx3+F7eACovJQhl9spb0YEbxvENYeHStlWYUnxFlqslbPHIikB/2J1BMzpscVC7AR2nJgBifoGcrMX2w9cCH2SgGW2HFkr21b51YuXQo99vBrk0dUudDC7s1f4bnudtaLZOUjrCcY0DudT8RUcDrVfJpkxOvGgcI6zxIzoXLu9SW3bS27zJWfhhZxXqljfczHgXThFWqlQd518XUGmfxGrg9C+fP2ix6ngAaIAkywavbLvfltYe6mkJGrTIjVAns6GX3QtroLJYvvCq2i0BHRJSvISJVfzZ7E6JYLdVJS9ur3C3jrIGl4iiiVJeuwjcJA7NFlrey9aN/MR/4jwj7TlMqi+47bQGThT1nzyZmvkB2EJpO271Vw1dSbUsaj5hz5dKiypav3puMXHq7IguQv12gUaRX5ccMaNqDbyXhC0jWWa9xoFsisAEqNCNCrwosivND18fbaaNUZ2G0QuuPFjYlTiEKvXuaaip/QOmYlQ8q99arps7rn6SK8rhRX3ymhxLXyx5yIjtI+BXMvylXM05OVMWmhm7ST3p7ecqENCS0Y2cA119mjUd6TG+ulLyw4l3UGIBAwLrUcraNvpiYNUQcx+y07OfHAjct4lIdseaCqnqPB37W4k9p6aYo+CRlaE/uKtckdkBCtlHxxeoREyQyepTgtI2uq+i+UEeYmqQrHIAEsT+C76cSaeuwINRYQ6z6AjrmvgvQ4LjteyV2ky1M7ckBGZMnh1mLcB6jN0zEsqwgs74yYeZnmEVN0nwOc+3M9OHwZEjIIbgedHBXhOF/y+Aa9dSomYaHrmVDzrRvUmF2z0gqNQAcbL7TVo9FmwLxBYUFqORXLlf0B6An5vpDzEslw3sKSx4+hbclYDAGKsLuPPeFD1O2D3v+WluUqwCSdkVXNNnPD0rZUT/8jVVSBDb11Vvbtl603cTaZJcRujvynJllx1jky3RiVuTkG7HBGTLIozy9Na6eQzATBZazRQC7cVobySMsyf4CKtHcAH97Vl5fSrYcBtTkgTbWrZrVfMJwT6ufnSyDXHkOXWhi37J9J5Ig/w6c4G1yQcjOa12jsHsWhRTNt8XY6b9r2vW1kqA2+bbtW9Z8o8NcskNCoPKOKzeRlEJ2cPvAFV5Fh+H9FZN6UpCxMDEuggBjAB/J3EYjFG6XEVqEOhfXMVOmRJ9ywsp7whxtk1F5kxhRDXomOsCxydd3gGAyIdwF0NHIptjaa7DIrxbCB+u+BtmytZzE00QwoeffVmLhoa8JyTMHGdK7ion+bfuFnEiP8/m1O8uvFesnfDzWp6+/F8Au4m2GACcZcjn0TsoNW7iYy6xMXn32X7rKcs12baJFbLsH4xJfdOPM0RlH16E79n6g8p7uUb8RSzrJ0cUsLC5dRVhw2hQWNbzQudEMtciIvvfGAkDBpUcpu/4ZkmS8RdblmJ+BDT0yFKddWdD217EYzrPbrpXXomPU95kEq9z9i0EKA5e8hjGt1roAgGh51/sezhrLVGmAeIPodxbxbWOWH3UTt+HUP5QpnRq8k2qILk4FuiFRlHUTNLvVtSdk1Z8XybLwmYMdoGTSQjD1mSmVSouA==--UPEZvym9PPXcBjaM--xrk0OyBCCPOBgaRp2gjxIg== \ No newline at end of file diff --git a/kubernetes/corp_user_groups_cron_sync.yml b/kubernetes/corp_user_groups_cron_sync.yml new file mode 100644 index 0000000..e90b276 --- /dev/null +++ b/kubernetes/corp_user_groups_cron_sync.yml @@ -0,0 +1,28 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: eras-corp-partners-cron-sync-job +spec: + schedule: "0 3 * * *" + jobTemplate: + spec: + template: + metadata: + name: eras-corp-partners-sync + spec: + containers: + - name: eras-corp-partners-sync + image: ghcr.io/zooniverse/eras + env: + - name: RAILS_LOG_TO_STDOUT + value: "true" + - name: RAILS_ENV + value: production + - name: RAILS_MASTER_KEY + valueFrom: + secretKeyRef: + name: eras-production + key: rails-master-key + command: ['ruby', './scripts/user_group_membership_classification_backfill.rb'] + restartPolicy: Never + backoffLimit: 2 diff --git a/kubernetes/manual_corp_user_group_sync.yml b/kubernetes/manual_corp_user_group_sync.yml new file mode 100644 index 0000000..724b6bd --- /dev/null +++ b/kubernetes/manual_corp_user_group_sync.yml @@ -0,0 +1,25 @@ +apiVersion: batch/v1 +kind: Job +metadata: + generateName: eras-corp-partners-sync- +spec: + template: + metadata: + name: eras-corp-partners-sync + spec: + containers: + - name: eras-corp-partners-sync + image: ghcr.io/zooniverse/eras + env: + - name: RAILS_LOG_TO_STDOUT + value: "true" + - name: RAILS_ENV + value: production + - name: RAILS_MASTER_KEY + valueFrom: + secretKeyRef: + name: eras-production + key: rails-master-key + command: ['ruby', './scripts/user_group_membership_classification_backfill.rb'] + restartPolicy: Never + backoffLimit: 2 diff --git a/scripts/backfill_classifications.py b/scripts/backfill_classifications.py index a5b03fb..5b00f7d 100644 --- a/scripts/backfill_classifications.py +++ b/scripts/backfill_classifications.py @@ -1,3 +1,10 @@ +## +## This script was used in VM when first introducing ERAS. We needed to backfill classifications into ERAS db. +## Unfortunately there was too much data to do a straight copy from panoptes db to copy to eras db. +## The script was followed up with save_classifications_chunk_in_files.py and copy_classifications_from_files.py, which copies from panoptes +## DB to csvs and then csvs to Eras DB. See PR: https://github.com/zooniverse/eras/pull/40 +## + import os import psycopg from datetime import datetime diff --git a/scripts/backfill_talk_comments.py b/scripts/backfill_talk_comments.py index d5d04d8..14f452f 100644 --- a/scripts/backfill_talk_comments.py +++ b/scripts/backfill_talk_comments.py @@ -1,3 +1,8 @@ +## +## This script was used in VM when first introducing ERAS. We needed to backfill talk comments into ERAS db. +## This script is a straight COPY FROM Talk DB to COPY TO ERAS DB. +## + import os import psycopg from datetime import datetime diff --git a/scripts/copy_classifications_from_files.py b/scripts/copy_classifications_from_files.py index 7595b6d..0f5a006 100644 --- a/scripts/copy_classifications_from_files.py +++ b/scripts/copy_classifications_from_files.py @@ -1,3 +1,11 @@ +## +## This script along with save_classifications_chunk_in_files.py was used in VM when first introducing ERAS. +## We needed to backfill classifications into ERAS db. +## The script was preluded with backfll_classifications.py which does a straight copy from panoptes db to copy to eras db. +## There was too much data to do a straight copy from panoptes db to copy to eras db, so we had to chunk in files. +## See PR: https://github.com/zooniverse/eras/pull/40 +## + import os import psycopg from datetime import datetime diff --git a/scripts/panoptes_membership_client.rb b/scripts/panoptes_membership_client.rb new file mode 100644 index 0000000..8a01207 --- /dev/null +++ b/scripts/panoptes_membership_client.rb @@ -0,0 +1,52 @@ +# frozen_string_literal: true + +require 'pg' +require '../config/environment' + +class PanoptesMembershipClient + def user_ids_not_in_user_group(user_group_id, domain_formats) + conn.exec( + "SELECT id FROM users + WHERE email ILIKE ANY(STRING_TO_ARRAY('#{domain_formats.join(',')}', ',')) + AND id NOT IN (SELECT user_id FROM memberships where user_group_id=#{user_group_id}) + " + ).entries.map { |res| res['id'].to_i } + end + + def insert_memberships(user_group_id, user_ids) + memberships_to_create = user_memberships(user_group_id, user_ids) + + member_creation_sql_query = memberships_insert_query(memberships_to_create) + + conn.exec_params(member_creation_sql_query, memberships_to_create.flatten) + end + + private + + def conn + @conn ||= PG.connect(Rails.application.credentials.panoptes_db_uri, sslmode: 'require') + end + + def user_memberships(user_group_id, user_ids) + memberships_to_create = [] + user_ids.each do |user_id| + # membership in array order: user_id, user_group_id, state, roles + membership = [ + user_id, + user_group_id, + 0, + '{"group_member"}' + ] + memberships_to_create << membership + end + memberships_to_create + end + + def memberships_insert_query(memberships_to_create) + # Values is a string that will look like ($1, $2, $3, $4), ($5, $6, $7, $8), ..etc.. + values = Array.new(memberships_to_create.length) do |i| + "($#{(4 * i) + 1}, $#{(4 * i) + 2}, $#{(4 * i) + 3}, $#{(4 * i) + 4})" + end.join(',') + "INSERT INTO memberships (user_id, user_group_id, state, roles) VALUES #{values}" + end +end diff --git a/scripts/save_classifications_chunk_in_files.py b/scripts/save_classifications_chunk_in_files.py index 7758aa6..fec9572 100644 --- a/scripts/save_classifications_chunk_in_files.py +++ b/scripts/save_classifications_chunk_in_files.py @@ -1,3 +1,11 @@ +## +## This script along with copy_classifications_from_files.py was used in VM when first introducing ERAS. +## We needed to backfill classifications into ERAS db. +## The script was preluded with backfll_classifications.py which does a straight copy from panoptes db to copy to eras db. +## There was too much data to do a straight copy from panoptes db to copy to eras db, so we had to chunk in files. +## See PR: https://github.com/zooniverse/eras/pull/40 +## + import os import psycopg from datetime import datetime diff --git a/scripts/user_group_membership_classification_backfill.py b/scripts/user_group_membership_classification_backfill.py index 9b26a50..647bf56 100644 --- a/scripts/user_group_membership_classification_backfill.py +++ b/scripts/user_group_membership_classification_backfill.py @@ -18,7 +18,6 @@ current_time = now.strftime("%H:%M:%S") print("BEFORE Time =", current_time) - parser = argparse.ArgumentParser() parser.add_argument("-ug", "--user_group_id", type=int) parser.add_argument('email_domain_formats') @@ -47,7 +46,7 @@ panoptes_db_conn.commit() # eras get classification_events of not_in_group_yet_user_ids that does not have user_group_id within their user_group_ids classification_event - eras_cursor.execute("SELECT classification_id, event_time, session_time, project_id, user_id, workflow_id, created_at, updated_at, user_group_ids from classification_events WHERE user_id = ANY(%s) AND %s!=ANY(user_group_ids)", (not_in_group_yet_user_ids, user_group_id)) + eras_cursor.execute("SELECT classification_id, event_time, session_time, project_id, user_id, workflow_id, created_at, updated_at, user_group_ids from classification_events WHERE user_id IN %s AND %s!=ANY(user_group_ids)", (not_in_group_yet_user_ids, user_group_id)) classification_events_to_backfill = eras_cursor.fetchall() # create classification_user_group diff --git a/scripts/user_group_membership_classification_backfill.rb b/scripts/user_group_membership_classification_backfill.rb new file mode 100644 index 0000000..bd9b2fd --- /dev/null +++ b/scripts/user_group_membership_classification_backfill.rb @@ -0,0 +1,74 @@ +# frozen_string_literal: true + +require '../config/environment' +require './panoptes_membership_client' +require 'json' + +corporate_user_groups_str = Rails.application.credentials.corporate_user_groups +corporate_partners = JSON.parse(corporate_user_groups_str) + +puts 'Starting Classification and Membership Backfill...' + +panoptes_client = PanoptesMembershipClient.new + +corporate_partners.each do |corporate_partner| + puts "Geting Ids of users that are not in group yet for #{corporate_partner['corp_name']}..." + not_yet_member_user_ids = panoptes_client.user_ids_not_in_user_group(corporate_partner['user_group_id'], corporate_partner['domain_formats']) + + puts "Query found #{not_yet_member_user_ids.length} users not in the #{corporate_partner['corp_name']} user_group..." + + next unless not_yet_member_user_ids.length.positive? + + puts "Creating Memberships for #{corporate_partner['corp_name']}..." + panoptes_client.insert_memberships(corporate_partner['user_group_id'], not_yet_member_user_ids) + + puts 'Querying Eras ClassificationEvents of newly created members...' + classification_events_to_backfill = ClassificationEvent.where('user_id IN (?)', not_yet_member_user_ids) + + next unless classification_events_to_backfill.length.positive? + + puts 'Creating Classification User Groups...' + classification_user_groups_to_create = [] + classification_events_to_backfill.each do |classification| + classification_user_group = { + classification_id: classification.classification_id, + event_time: classification.event_time, + project_id: classification.project_id, + workflow_id: classification.workflow_id, + user_id: classification.user_id, + session_time: classification.session_time, + user_group_id: corporate_partner['user_group_id'] + } + classification_user_groups_to_create << classification_user_group + end + + ClassificationUserGroup.upsert_all(classification_user_groups_to_create, + unique_by: %i[classification_id event_time user_group_id user_id]) + + puts 'ClassificationUserGroup Upsert Finished...' +end + +today = Date.today.to_s +two_days_ago = (Date.today - 2).to_s +puts 'Classification and Membership Backfill Finished. Starting CA Refresh...' +puts 'Refreshing Continuous Aggregates dealing with User Groups...' + +puts 'Refreshing Daily Group Classifications Count And Time...' +ActiveRecord::Base.connection.exec_query("CALL refresh_continuous_aggregate('daily_group_classification_count_and_time', '#{two_days_ago}', '#{today}')") + +puts 'Refreshing Daily Group Classifications Count And Time Per Project...' +ActiveRecord::Base.connection.exec_query("CALL refresh_continuous_aggregate('daily_group_classification_count_and_time_per_project', '#{two_days_ago}', '#{today}')") + +puts 'Refreshing Daily Group Classifications Count And Time Per User...' +ActiveRecord::Base.connection.exec_query("CALL refresh_continuous_aggregate('daily_group_classification_count_and_time_per_user', '#{two_days_ago}', '#{today}')") + +puts 'Refreshing Daily Group Classifications Count And Time Per User And Project...' +ActiveRecord::Base.connection.exec_query("CALL refresh_continuous_aggregate('daily_group_classification_count_and_time_per_user_per_project', '#{two_days_ago}', '#{today}')") + +puts 'Refreshing Daily Group Classifications Count And Time Per User And Workflow...' +ActiveRecord::Base.connection.exec_query("CALL refresh_continuous_aggregate('daily_group_classification_count_and_time_per_user_per_workflow', '#{two_days_ago}', '#{today}')") + +puts 'Refreshing Daily Group Classifications Count And Time Per Workflow...' +ActiveRecord::Base.connection.exec_query("CALL refresh_continuous_aggregate('daily_group_classification_count_and_time_per_workflow', '#{two_days_ago}', '#{today}')") + +puts 'Stats User Group Membership and Classification Backfill Completed'