From a82c1b3798fced3494ac80ad2a06405472efda21 Mon Sep 17 00:00:00 2001
From: John Hsu <john.hsu@ubc.ca>
Date: Mon, 13 Feb 2023 20:38:54 -0800
Subject: [PATCH] ADD kaltura migration script to cli tools

We're switching from on-prem hosted Kaltura to cloud Kaltura. When
videos are transferred to the new Kaltura instance, a new entry ID is
generated. We will be given a CSV mapping of old to new entry IDs, and
will have to update our Kaltura data accordingly.

The CSV is expected to be in the following format, where the first row
is a header and can be ignored:

old entry id, new entry id
0_someenid,0_newentid
0_someenid,0_newentid
0_someenid,0_newentid

We assume that collisions between old and new entry IDs are impossible.

Requires that these Kaltura env vars are set to the new Kaltura
environment:

* KALTURA_SERVICE_URL
* KALTURA_PARTNER_ID
* KALTURA_SECRET
* KALTURA_USER_ID
* KALTURA_PLAYER_ID

Usage options, run in app root:

    python manage.py kaltura migrate /path/to/mappingCsv.csv

-d Do a dry run, without actually making any changes to the database:

    python manage.py kaltura migrate -d /path/to/mappingCsv.csv

-n If present, tells the CSV parser not to skip the first row. By
default, we assume the first row is a header row and skip it:

    python manage.py kaltura migrate -n /path/to/mappingCsv.csv

This will migrate all the normal Kaltura integration uploaded videos.
There's a handful of special cases where people inserted Kaltura video
links into answers that this does NOT deal with. We're not sure how to
deal with that on the Kaltura side yet, as those videos were uploaded to
a different section that wasn't meant for ComPAIR.
---
 compair/manage/kaltura.py | 161 ++++++++++++++++++++++++++++++++++++++
 manage.py                 |   2 +
 requirements.txt          |   2 +
 3 files changed, 165 insertions(+)
 create mode 100644 compair/manage/kaltura.py

diff --git a/compair/manage/kaltura.py b/compair/manage/kaltura.py
new file mode 100644
index 000000000..d5db80041
--- /dev/null
+++ b/compair/manage/kaltura.py
@@ -0,0 +1,161 @@
+"""
+Migrate Kaltura media to new Kaltura instance. We're switching from on-prem
+hosted Kaltura to cloud Kaltura. When videos are transferred to the new Kaltura
+instance, a new entry ID is generated. We will be given a CSV mapping of old
+to new entry IDs, and will have to update our Kaltura data accordingly.
+
+We assume that collisions between old and new entry IDs are impossible.
+
+Requires that these Kaltura env vars are set to the new Kaltura environment:
+
+* KALTURA_SERVICE_URL
+* KALTURA_PARTNER_ID
+* KALTURA_SECRET
+* KALTURA_USER_ID
+* KALTURA_PLAYER_ID
+
+Usage options, run in app root:
+
+    python manage.py kaltura migrate /path/to/mappingCsv.csv
+
+-d Do a dry run, without actually making any changes to the database:
+
+    python manage.py kaltura migrate -d /path/to/mappingCsv.csv
+
+-n If present, tells the CSV parser not to skip the first row. By default, we
+assume the first row is a header row and skip it:
+
+    python manage.py kaltura migrate -n /path/to/mappingCsv.csv
+
+"""
+
+import csv
+from datetime import datetime
+import re
+
+from KalturaClient import KalturaClient, KalturaConfiguration
+from KalturaClient.Plugins.Core import (KalturaSessionType, KalturaMediaEntry,
+                                        KalturaMediaType)
+from flask_script import Manager
+
+from compair.core import db
+from compair.kaltura.core import KalturaCore
+from compair.models import Answer, KalturaMedia, File
+from flask import current_app
+
+manager = Manager(usage="Kaltura Migration")
+
+def readMappingCsv(mappingCsv, noHeader):
+    """Return {old entry ID: new entry ID} from the CSV; ValueError if bad/empty."""
+    oldToNewEntryIds = {}
+    idRe = re.compile(r"\d_\w{8}")
+    with open(mappingCsv, 'r', newline='') as csvFile:
+        csvReader = csv.reader(csvFile, skipinitialspace=True)
+        for row in csvReader:
+            if not noHeader and csvReader.line_num == 1:
+                continue
+            # fullmatch, not match (which accepts trailing junk); short rows fail too
+            if len(row) < 2 or not all(idRe.fullmatch(v) for v in row[:2]):
+                raise ValueError(f"Mapping file line {csvReader.line_num} has a value not in entry ID format.")
+            oldToNewEntryIds[row[0]] = row[1]
+    if not oldToNewEntryIds:
+        raise ValueError("Mapping file is empty")
+    return oldToNewEntryIds
+
+
+def msg(msg, logfile):
+    """Echo msg to stdout, then append it as one flushed line to logfile."""
+    print(msg)
+    print(msg, file=logfile, flush=True)
+
+
+def summarize(numToMigrate, numInvalid, numMigrated, numNoMapping, numTotal, logfile):
+    """Write the planned-work counters as a framed table through msg()."""
+    msg('\n'.join(( '-------- Summary --------',
+                   f'  To be Migrated: {numToMigrate}',
+                   f'   To be Deleted: {numInvalid}',
+                   f'Already Migrated: {numMigrated}',
+                   f'      No Mapping: {numNoMapping}',
+                   f'           Total: {numTotal}',
+                    '-------- ------- --------')), logfile)
+
+
+def deleteInvalidKalturaMedias(medias, logfile):  # logs, then stages each delete
+    for media in medias:
+        msg(f'Deleting invalid kaltura media id {media.id}', logfile)
+        db.session.delete(media)  # no commit here; migrate() commits afterwards
+
+
+def migrateKalturaMedias(medias, oldToNewEntryIds, logfile):
+    """Open an admin Kaltura session and repoint each media row at its new entry."""
+    kClient = KalturaClient(KalturaConfiguration(
+                            serviceUrl=KalturaCore.service_url()))
+    kSession = kClient.session.start(
+        KalturaCore.secret(),
+        KalturaCore.user_id(),
+        KalturaSessionType.ADMIN,
+        KalturaCore.partner_id(),
+        86400, # session expiry in seconds, i.e. 24 hours
+        "appID:compair"
+    )
+    kClient.setKs(kSession)
+
+    for media in medias:
+        mediaId = media.id
+        oldEntryId = media.entry_id
+        newEntryId = oldToNewEntryIds[oldEntryId]  # migrate() passes mapped rows only
+        msg(f'Processing id {mediaId}: Old {oldEntryId} to New {newEntryId}',
+            logfile)
+        newInfo = kClient.media.get(newEntryId, -1)  # -1: presumably latest version - confirm
+        media.download_url = newInfo.getDownloadUrl()
+        media.partner_id = newInfo.getPartnerId()
+        media.service_url = KalturaCore.service_url()
+        media.player_id = KalturaCore.player_id()
+        media.entry_id = newEntryId
+        # nothing is committed here; migrate() calls db.session.commit() afterwards
+
+
+@manager.command
+def migrate(mappingCsv, noHeader=False, dryRun=False):
+    """Migrate KalturaMedia rows per mappingCsv; dryRun only logs the plan."""
+    ts = datetime.now().isoformat(timespec='seconds')
+    # 'with' closes the log even if the CSV, the Kaltura API or the commit raises
+    with open(f'kaltura-migration-log-{ts}.log', 'a') as logfile:
+        msg('Starting Kaltura migration', logfile)
+        oldToNewEntryIds = readMappingCsv(mappingCsv, noHeader)
+        newEntryIds = set(oldToNewEntryIds.values())
+        invalidKalturaMedias = [] # can't be migrated, might as well delete
+        needMigrationMedias = [] # needs to be migrated
+        numAlreadyMigrated = 0
+        numNoMapping = 0
+        kalturaMedias = KalturaMedia.query.all()
+        numTotal = len(kalturaMedias)
+        # find out how much work needs to be done
+        for kalturaMedia in kalturaMedias:
+            mediaId = kalturaMedia.id
+            entryId = kalturaMedia.entry_id
+            if not entryId:
+                msg(f'Empty entry ID for id {mediaId}', logfile)
+                invalidKalturaMedias.append(kalturaMedia)
+            elif entryId in oldToNewEntryIds:
+                msg(f"Migration needed for id {mediaId}: Entry {entryId}", logfile)
+                needMigrationMedias.append(kalturaMedia)
+            elif entryId in newEntryIds:
+                msg(f"Already migrated id {mediaId}: Entry {entryId}", logfile)
+                numAlreadyMigrated += 1
+            else:
+                # didn't find a mapping, perhaps missing from migration?
+                msg(f'No mapping for id {mediaId}: Entry {entryId}', logfile)
+                numNoMapping += 1
+        # summarize what needs to be done
+        summarize(len(needMigrationMedias), len(invalidKalturaMedias),
+                  numAlreadyMigrated, numNoMapping, numTotal, logfile)
+        # do the actual work in a transaction
+        if dryRun:
+            msg('*** Dry run completed, no changes were made ***', logfile)
+        else:
+            msg('Starting database session', logfile)
+            deleteInvalidKalturaMedias(invalidKalturaMedias, logfile)
+            migrateKalturaMedias(needMigrationMedias, oldToNewEntryIds, logfile)
+            msg('Committing to database', logfile)
+            db.session.commit()
diff --git a/manage.py b/manage.py
index 02d965a84..91c9d41a8 100755
--- a/manage.py
+++ b/manage.py
@@ -9,6 +9,7 @@
 from compair.manage.score import manager as score_generator
 from compair.manage.user import manager as user_manager
 from compair.manage.utils import manager as util_manager
+from compair.manage.kaltura import manager as kaltura_manager
 from compair import create_app
 
 manager = Manager(create_app(skip_assets=True))
@@ -20,6 +21,7 @@
 manager.add_command("runserver", Server(port=8080))
 manager.add_command("user", user_manager)
 manager.add_command("util", util_manager)
+manager.add_command("kaltura", kaltura_manager)
 
 
 @manager.command
diff --git a/requirements.txt b/requirements.txt
index 037c3a545..f6e56449f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -56,3 +56,5 @@ markupsafe==2.0.1
 # json api was deprecated in latest version, so need older version, can be
 # removed once upgraded to flask 2
 itsdangerous==2.0.1
+# for the kaltura migration script
+kalturaapiclient==19.1.0