ADD kaltura migration script to cli tools

We're switching from on-prem hosted Kaltura to cloud Kaltura. When videos are transferred to the new Kaltura instance, a new entry ID is generated. We will be given a CSV mapping of old to new entry IDs, and will have to update our Kaltura data accordingly. The CSV is expected to be in the following format, where the first row is a header and can be ignored: old entry id, new entry id 0_someenid,0_newentid 0_someenid,0_newentid 0_someenid,0_newentid Assuming that collision between old and new entry IDs are impossible. Requires that these Kaltura env vars are set to the new Kaltura environment: * KALTURA_SERVICE_URL * KALTURA_PARTNER_ID * KALTURA_SECRET * KALTURA_USER_ID * KALTURA_PLAYER_ID Usage options, run in app root: python manage.py kaltura migrate /path/to/mappingCsv.csv -d Do a dry run, without actually making any changes to the database: python manage.py kaltura migrate -d /path/to/mappingCsv.csv -n If present, tells the CSV parser not to skip the first row. By default, we assume the first row is a header row and skip it: python manage.py kaltura migrate -n /path/to/mappingCsv.csv This will migrate all the normal Kaltura integration uploaded videos. There's a handful of special cases where people inserted Kaltura video links into answers that this does NOT deal with. We're not sure how to deal with that on the Kaltura side yet, as those videos were uploaded to a different section that wasn't meant for ComPAIR.
ubc · Feb 14, 2023 · a82c1b3 · a82c1b3
1 parent 0e4b738
commit a82c1b3
Show file tree

Hide file tree

Showing 3 changed files with 165 additions and 0 deletions.
diff --git a/compair/manage/kaltura.py b/compair/manage/kaltura.py
@@ -0,0 +1,161 @@
+"""
+Migrate Kaltura media to new Kaltura instance. We're switching from on-prem
+hosted Kaltura to cloud Kaltura. When videos are transferred to the new Kaltura
+instance, a new entry ID is generated. We will be given a CSV mapping of old
+to new entry IDs, and will have to update our Kaltura data accordingly.
+
+Assuming that collision between old and new entry IDs are impossible.
+
+Requires that these Kaltura env vars are set to the new Kaltura environment:
+
+* KALTURA_SERVICE_URL
+* KALTURA_PARTNER_ID
+* KALTURA_SECRET
+* KALTURA_USER_ID
+* KALTURA_PLAYER_ID
+
+Usage options, run in app root:
+
+    python manage.py kaltura migrate /path/to/mappingCsv.csv
+
+-d Do a dry run, without actually making any changes to the database:
+
+    python manage.py kaltura migrate -d /path/to/mappingCsv.csv
+
+-n If present, tells the CSV parser not to skip the first row. By default, we
+assume the first row is a header row and skip it:
+
+    python manage.py kaltura migrate -n /path/to/mappingCsv.csv
+
+"""
+
+import csv
+from datetime import datetime
+import re
+
+from KalturaClient import KalturaClient, KalturaConfiguration
+from KalturaClient.Plugins.Core import (KalturaSessionType, KalturaMediaEntry,
+                                        KalturaMediaType)
+from flask_script import Manager
+
+from compair.core import db
+from compair.kaltura.core import KalturaCore
+from compair.models import Answer, KalturaMedia, File
+from flask import current_app
+
+manager = Manager(usage="Kaltura Migration")
+
+def readMappingCsv(mappingCsv, noHeader):
+    oldToNewEntryIds = {}
+    idRe = re.compile(r"\d_\w{8}")
+    with open(mappingCsv, 'r') as csvFile:
+        csvReader = csv.reader(csvFile, skipinitialspace=True)
+        for row in csvReader:
+            if not noHeader and csvReader.line_num == 1:
+                continue
+            oldEntryId = row[0]
+            newEntryId = row[1]
+            if not (re.match(idRe, oldEntryId) and re.match(idRe, newEntryId)):
+                raise ValueError(f"Mapping file line {csvReader.line_num} has a value not in entry ID format.")
+            oldToNewEntryIds[oldEntryId] = newEntryId
+    if oldToNewEntryIds:
+        return oldToNewEntryIds
+    raise ValueError("Mapping file is empty")
+
+
+def msg(msg, logfile):
+    print(msg)
+    logfile.write(f'{msg}\n')
+    logfile.flush()
+
+
+def summarize(numToMigrate, numInvalid, numMigrated, numNoMapping, numTotal,
+              logfile):
+    msg( '-------- Summary --------', logfile)
+    msg(f'  To be Migrated: {numToMigrate}', logfile)
+    msg(f'   To be Deleted: {numInvalid}', logfile)
+    msg(f'Already Migrated: {numMigrated}', logfile)
+    msg(f'      No Mapping: {numNoMapping}', logfile)
+    msg(f'           Total: {numTotal}', logfile)
+    msg( '-------- ------- --------', logfile)
+
+
+def deleteInvalidKalturaMedias(medias, logfile):
+    for media in medias:
+        msg(f'Deleting invalid kaltura media id {media.id}', logfile)
+        db.session.delete(media)
+
+
+def migrateKalturaMedias(medias, oldToNewEntryIds, logfile):
+    # connect to the Kaltura API
+    kClient = KalturaClient(KalturaConfiguration(
+                            serviceUrl=KalturaCore.service_url()))
+    kSession = kClient.session.start(
+        KalturaCore.secret(),
+        KalturaCore.user_id(),
+        KalturaSessionType.ADMIN,
+        KalturaCore.partner_id(),
+        86400, # session expires in 1 hour
+        "appID:compair"
+    )
+    kClient.setKs(kSession)
+
+    for media in medias:
+        mediaId = media.id
+        oldEntryId = media.entry_id
+        newEntryId = oldToNewEntryIds[oldEntryId]
+        msg(f'Processing id {mediaId}: Old {oldEntryId} to New {newEntryId}',
+            logfile)
+        newInfo = kClient.media.get(newEntryId, -1)
+        media.download_url = newInfo.getDownloadUrl()
+        media.partner_id = newInfo.getPartnerId()
+        media.service_url = KalturaCore.service_url()
+        media.player_id = KalturaCore.player_id()
+        media.entry_id = newEntryId
+        #db.session.add(media)
+
+
+@manager.command
+def migrate(mappingCsv, noHeader=False, dryRun=False):
+    ts = datetime.now().isoformat(timespec='seconds')
+    logfile = open(f'kaltura-migration-log-{ts}.log', 'a')
+    msg('Starting Kaltura migration', logfile)
+    oldToNewEntryIds = readMappingCsv(mappingCsv, noHeader)
+    newToOldEntryIds = dict(map(reversed, oldToNewEntryIds.items()))
+    invalidKalturaMedias = [] # can't be migrated, might as well delete
+    needMigrationMedias = [] # needs to be migrated
+    numAlreadyMigrated = 0
+    numNoMapping = 0
+    numTotal = 0
+    kalturaMedias = KalturaMedia.query.all()
+    # find out how much work needs to be done
+    for kalturaMedia in kalturaMedias:
+        numTotal += 1
+        mediaId = kalturaMedia.id
+        entryId = kalturaMedia.entry_id
+        if not entryId:
+            msg(f'Empty entry ID for id {mediaId}', logfile)
+            invalidKalturaMedias.append(kalturaMedia)
+        elif entryId in oldToNewEntryIds:
+            msg(f"Migration needed for id {mediaId}: Entry {entryId}", logfile)
+            needMigrationMedias.append(kalturaMedia)
+        elif entryId in newToOldEntryIds:
+            msg(f"Already migrated id {mediaId}: Entry {entryId}", logfile)
+            numAlreadyMigrated += 1
+        else:
+            # didn't find a mapping, perhaps missing from migration?
+            msg(f'No mapping for id {mediaId}: Entry {entryId}', logfile)
+            numNoMapping += 1
+    # summarize what needs to be done
+    summarize(len(needMigrationMedias), len(invalidKalturaMedias),
+              numAlreadyMigrated, numNoMapping, numTotal, logfile)
+    # do the actual work in a transaction
+    if dryRun:
+        msg(f'*** Dry run completed, no changes were made ***', logfile)
+    else:
+        msg(f'Starting database session', logfile)
+        deleteInvalidKalturaMedias(invalidKalturaMedias, logfile)
+        migrateKalturaMedias(needMigrationMedias, oldToNewEntryIds, logfile)
+        msg(f'Committing to database', logfile)
+        db.session.commit()
+    logfile.close()
diff --git a/manage.py b/manage.py
@@ -9,6 +9,7 @@
 from compair.manage.score import manager as score_generator
 from compair.manage.user import manager as user_manager
 from compair.manage.utils import manager as util_manager
+from compair.manage.kaltura import manager as kaltura_manager
 from compair import create_app
 
 manager = Manager(create_app(skip_assets=True))
@@ -20,6 +21,7 @@
 manager.add_command("runserver", Server(port=8080))
 manager.add_command("user", user_manager)
 manager.add_command("util", util_manager)
+manager.add_command("kaltura", kaltura_manager)
 
 
 @manager.command

diff --git a/requirements.txt b/requirements.txt
@@ -56,3 +56,5 @@ markupsafe==2.0.1
 # json api was deprecated in latest version, so need older version, can be
 # removed once upgraded to flask 2
 itsdangerous==2.0.1
+# for the kaltura migration script
+kalturaapiclient==19.1.0