From a8cf5271cf0e0dff73067dd8f9a1c9ba0ef35cca Mon Sep 17 00:00:00 2001
From: Francois Daoust <fd@tidoust.net>
Date: Wed, 20 Sep 2023 12:06:20 +0200
Subject: [PATCH] Add a few tools to process recordings

---
 package-lock.json                     |   7 ++
 package.json                          |   1 +
 tools/create-recording-pages.mjs      | 155 ++++++++++++++++++++++++++
 tools/lib/webvtt2html.mjs             | 121 ++++++++++++++++++++
 tools/rename-recordings.mjs           |  58 ++++++++++
 tools/update-recording-thumbnails.mjs |  76 +++++++++++++
 6 files changed, 418 insertions(+)
 create mode 100644 tools/create-recording-pages.mjs
 create mode 100644 tools/lib/webvtt2html.mjs
 create mode 100644 tools/rename-recordings.mjs
 create mode 100644 tools/update-recording-thumbnails.mjs

diff --git a/package-lock.json b/package-lock.json
index 53cd211..5fa8f3d 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -9,6 +9,7 @@
         "irc": "^0.5.2",
         "puppeteer": "^20.5.0",
         "seedrandom": "^3.0.5",
+        "webvtt-parser": "^2.2.0",
         "yaml": "^2.3.1"
       }
     },
@@ -1306,6 +1307,12 @@
       "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==",
       "dev": true
     },
+    "node_modules/webvtt-parser": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/webvtt-parser/-/webvtt-parser-2.2.0.tgz",
+      "integrity": "sha512-FzmaED+jZyt8SCJPTKbSsimrrnQU8ELlViE1wuF3x1pgiQUM8Llj5XWj2j/s6Tlk71ucPfGSMFqZWBtKn/0uEA==",
+      "dev": true
+    },
     "node_modules/whatwg-url": {
       "version": "5.0.0",
       "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
diff --git a/package.json b/package.json
index ee80825..7b4433c 100644
--- a/package.json
+++ b/package.json
@@ -4,6 +4,7 @@
     "irc": "^0.5.2",
     "puppeteer": "^20.5.0",
     "seedrandom": "^3.0.5",
+    "webvtt-parser": "^2.2.0",
     "yaml": "^2.3.1"
   }
 }
diff --git a/tools/create-recording-pages.mjs b/tools/create-recording-pages.mjs
new file mode 100644
index 0000000..1e90ad1
--- /dev/null
+++ b/tools/create-recording-pages.mjs
@@ -0,0 +1,155 @@
+/**
+ * This tool is only useful once recordings of breakout sessions have been
+ * uploaded to Cloudflare. It creates HTML recording pages for each of these
+ * recordings that contain the video and an HTML rendition of the captions as
+ * a transcript.
+ * 
+ * To run the tool:
+ *
+ *  node tools/create-recording-pages.mjs
+ *
+ * Pre-requisites:
+ * 1. Recordings must have been uploaded to Cloudflare with a name that starts
+ * with a well-known prefix.
+ * 2. The well-known prefix must appear in a RECORDING_PREFIX env variable.
+ * 3. Cloudflare account info must appear in CLOUDFLARE_ACCOUNT and
+ * CLOUDFLARE_TOKEN env variables.
+ * 4. The RECORDING_FOLDER env variable must target the local folder to use to
+ * save recording pages.
+ * 5. The RECORDING_FOLDER folder must contain a "recording-template.html" page
+ * that contains the template to use for each recording page, see for example:
+ * https://www.w3.org/2023/09/breakouts/recording-template.html
+ *
+ * The tool assumes that the recordings are named prefix-xx.mp4, where xx is
+ * the breakout session number. It creates "recording-xx.html" pages in the
+ * recording folder.
+ */
+
+import path from 'path';
+import fs from 'fs/promises';
+import { convert } from './lib/webvtt2html.mjs';
+import { getEnvKey } from './lib/envkeys.mjs';
+import { fetchProject } from './lib/project.mjs';
+import { validateSession } from './lib/validate.mjs';
+import { todoStrings } from './lib/todostrings.mjs';
+
+// List the Cloudflare Stream videos returned for the given prefix whose name
+// ends with "-<session number>.mp4", sorted by name. Throws on API failure.
+async function listRecordings(accountId, authToken, recordingPrefix) {
+  const url = `https://api.cloudflare.com/client/v4/accounts/${accountId}/stream?search=${encodeURIComponent(recordingPrefix)}`;
+  const response = await fetch(url, { headers: { 'Authorization': `Bearer ${authToken}` } });
+  const json = await response.json();
+  if (!json.success || !Array.isArray(json.result)) {
+    throw new Error(`Could not list recordings: ${JSON.stringify(json.errors)}`);
+  }
+  return json.result
+    // Skip videos that are not named after a session instead of crashing
+    .filter(v => /-\d+\.mp4$/.test(v.meta.name))
+    .map(v => ({
+      sessionId: v.meta.name.match(/-(\d+)\.mp4$/)[1],
+      name: v.meta.name,
+      title: v.meta.name,
+      videoId: v.uid,
+      preview: v.preview,
+      embedUrl: v.preview.replace(/watch$/, 'iframe'),
+      captions: v.preview.replace(/watch$/, 'captions/en')
+    }))
+    .sort((v1, v2) => v1.name.localeCompare(v2.name));
+}
+
+/**
+ * Create the "recording-xx.html" page of a recording from the template in the
+ * recording folder, after setting recording.transcript to the HTML rendition
+ * of the captions. Placeholders get replaced by properties of the recording:
+ * {{{{prop}}}} by the value serialized as JSON, {{{prop}}} by the value escaped
+ * for use in HTML attributes, {{prop}} by the raw value (null and undefined
+ * give an empty string in the last two cases).
+ */
+async function createRecordingPage(recording, recordingFolder) {
+  let template = await fs.readFile(path.join(recordingFolder, 'recording-template.html'), 'utf8');
+  recording.transcript = await convert(recording.captions, { clean: true });
+  const escapeHtml = str => str
+    .replace(/&/g, '&amp;')
+    .replace(/</g, '&lt;')
+    .replace(/>/g, '&gt;')
+    .replace(/"/g, '&quot;')
+    .replace(/'/g, '&apos;');
+  // Longest placeholders first so that "{{prop}}" cannot match the inside of
+  // "{{{prop}}}". Replacer functions make sure that "$" sequences in values
+  // (e.g. "$&") are inserted as is instead of being expanded as patterns.
+  const serializers = [
+    ['{{{{', '}}}}', value => JSON.stringify(value, null, 2)],
+    ['{{{', '}}}', value => escapeHtml(String(value ?? ''))],
+    ['{{', '}}', value => String(value ?? '')]
+  ];
+  for (const [open, close, serialize] of serializers) {
+    for (const [property, value] of Object.entries(recording)) {
+      template = template.replaceAll(open + property + close, () => serialize(value));
+    }
+  }
+  await fs.writeFile(path.join(recordingFolder, `recording-${recording.sessionId}.html`), template, 'utf8');
+}
+
+/**
+ * Retrieve the project's sessions and the list of recordings on Cloudflare,
+ * then create one recording page per recording in the RECORDING_FOLDER folder.
+ * Recordings that do not match any session of the project are skipped.
+ */
+async function main() {
+  // First, retrieve known information about the project
+  const PROJECT_OWNER = await getEnvKey('PROJECT_OWNER');
+  const PROJECT_NUMBER = await getEnvKey('PROJECT_NUMBER');
+  const CHAIR_W3CID = await getEnvKey('CHAIR_W3CID', {}, true);
+  console.log();
+  console.log(`Retrieve project ${PROJECT_OWNER}/${PROJECT_NUMBER}...`);
+  const project = await fetchProject(PROJECT_OWNER, PROJECT_NUMBER);
+  if (!project) {
+    throw new Error(`Project ${PROJECT_OWNER}/${PROJECT_NUMBER} could not be retrieved`);
+  }
+  project.chairsToW3CID = CHAIR_W3CID;
+  console.log(`- ${project.sessions.length} sessions`);
+  console.log(`- ${project.rooms.length} rooms`);
+  console.log(`- ${project.slots.length} slots`);
+  console.log(`Retrieve project ${PROJECT_OWNER}/${PROJECT_NUMBER}... done`);
+
+  console.log();
+  console.log('List recordings...');
+  const CLOUDFLARE_ACCOUNT = await getEnvKey('CLOUDFLARE_ACCOUNT');
+  const CLOUDFLARE_TOKEN = await getEnvKey('CLOUDFLARE_TOKEN');
+  const RECORDING_PREFIX = await getEnvKey('RECORDING_PREFIX');
+  const RECORDING_FOLDER = await getEnvKey('RECORDING_FOLDER');
+  const recordings = await listRecordings(CLOUDFLARE_ACCOUNT, CLOUDFLARE_TOKEN, RECORDING_PREFIX);
+  console.log(`- found ${recordings.length} recordings`);
+  console.log('List recordings... done');
+
+  console.log();
+  console.log('Create recording pages...');
+  for (const recording of recordings) {
+    const session = project.sessions.find(s => s.number === parseInt(recording.sessionId, 10));
+    if (!session) {
+      // Do not crash on a recording that has no counterpart in the project
+      console.warn(`- skip ${recording.name}: session #${recording.sessionId} not found in project`);
+      continue;
+    }
+    console.log(`- create page for ${recording.sessionId} - ${session.title}`);
+    await validateSession(session.number, project);
+    const desc = session.description;
+    recording.title = session.title;
+    recording.githubIssue = `https://github.com/${session.repository}/issues/${session.number}`;
+    const links = [{ title: 'Session proposal on GitHub', url: recording.githubIssue }];
+    // Only link to materials whose value is not one of the `todoStrings`
+    for (const [key, title] of [['slides', 'Slides'], ['minutes', 'Session minutes']]) {
+      const url = desc.materials[key];
+      if (url && !todoStrings.includes(url.toUpperCase())) {
+        links.push({ title, url });
+      }
+    }
+    recording.links = links
+      .map(l => `<li><a href="${l.url}">${l.title}</a></li>`)
+      .join('\n');
+    await createRecordingPage(recording, RECORDING_FOLDER);
+  }
+  console.log('Create recording pages... done');
+}
+
+main().then(_ => process.exit(0));
\ No newline at end of file
diff --git a/tools/lib/webvtt2html.mjs b/tools/lib/webvtt2html.mjs
new file mode 100644
index 0000000..750b31c
--- /dev/null
+++ b/tools/lib/webvtt2html.mjs
@@ -0,0 +1,121 @@
+import webvttParser from 'webvtt-parser';
+
+const parser = new webvttParser.WebVTTParser();
+
+/**
+ * Convert the WebVTT captions available at the given URL into HTML markup.
+ *
+ * Cues get merged into paragraphs. A new paragraph starts when the identifier
+ * of a cue starts with "slide-" (this also starts a new slide section) or ends
+ * with "-p", or when the text of a cue contains a ":" (the cue is then assumed
+ * to be of the form "Speaker: text").
+ * @param {string} vttUrl URL of the WebVTT file to convert
+ * @param {object} [options] Conversion options
+ * @param {boolean} [options.clean] Drop "slide xx", "next slide" and similar
+ *   markers at the beginning of paragraphs, and ", you know," fillers
+ * @param {boolean} [options.splitPerSlide] Generate one block per slide
+ *   section, with numbered paragraphs, instead of a flat list of paragraphs
+ * @param {string} [options.slideset] With splitPerSlide, URL of the slide set
+ *   to reference from an <i-slide> element in each section
+ * @param {string} [options.markupStart] With splitPerSlide, markup to use
+ *   before the paragraphs of a section ("<div>" by default)
+ * @param {string} [options.markupEnd] With splitPerSlide, markup to use after
+ *   the paragraphs of a section ("</div>" by default)
+ * @returns {Promise<string>} The markup (cue text is not HTML-escaped)
+ * @throws {Error} When the parser throws while processing the captions
+ */
+export async function convert(vttUrl, options) {
+  options = options || {};
+  function cleanSentence(sentence) {
+    if (options.clean) {
+      sentence = sentence.replace(/^slide [a-z0-9]*\.?/i, '');
+      sentence = sentence.replace(/^next slide\.?/i, '');
+      sentence = sentence.replace(/^next page\.?/i, '');
+      sentence = sentence.replace(/^moving to next slide\.?/i, '');
+      sentence = sentence.replace(/^moving to next page\.?/i, '');
+      sentence = sentence.replace(/, you know, ?/g, ' ');
+    }
+    return sentence;
+  }
+
+  // Split "Speaker: text" at the first ":" so that further colons (e.g. in
+  // "10:30") remain part of the text instead of ending up in the speaker name
+  const speakerRegexp = /^(.*?):\s*(.*)$/;
+
+  const response = await fetch(vttUrl);
+  const vtt = await response.text();
+
+  let cues;
+  try {
+    ({cues} = parser.parse(vtt));
+  } catch (e) {
+    throw new Error(`Could not parse ${vttUrl} as WebVTT: ${e}`, { cause: e });
+  }
+
+  for (const cue of cues) {
+    // NOTE(review): only the first match of each pattern (incl. `"`) gets removed
+    cue.text = cue.text.replace(/<v [^>]*>/, '').replace(/<\/v>/, '').replace('"','');
+    if (options.clean) {
+      cue.text = cue.text.replace(/^slide [0-9]+$/i, '');
+    }
+  }
+
+  // Group cues into paragraphs, themselves grouped per slide section
+  const divs = [{ slide: '1', paragraphs: [] }];
+  let pending = '';
+  function flushParagraph() {
+    const paragraph = cleanSentence(pending);
+    if (paragraph) {
+      divs[divs.length - 1].paragraphs.push(paragraph);
+    }
+    pending = '';
+  }
+  for (const cue of cues) {
+    if (cue.id.startsWith('slide-')) {
+      flushParagraph();
+      divs.push({ slide: cue.id.substring('slide-'.length), paragraphs: [] });
+    } else if (cue.id.endsWith('-p') || cue.text.includes(':')) {
+      flushParagraph();
+    }
+    pending += (pending ? ' ' : '') + cue.text;
+  }
+  flushParagraph();
+
+  if (options.splitPerSlide) {
+    let content = '';
+    let pid = 1;
+    for (const div of divs) {
+      if (options.slideset) {
+        content += `<div id="ts-${div.slide}">`;
+        content += `<i-slide src="${options.slideset}#${div.slide}" class="slide">Slide ${div.slide} of ${divs.length}</i-slide>\n`;
+      }
+      content += (options.markupStart || `<div>`) + "\n";
+      for (const paragraph of div.paragraphs) {
+        const match = paragraph.match(speakerRegexp);
+        const html = match ? `<cite>${match[1]}:</cite> ${match[2]}` : paragraph;
+        content += `  <p id="tp-${pid}">${html}</p>\n`;
+        pid += 1;
+      }
+      content += (options.markupEnd || '</div>') + "\n\n";
+      if (options.slideset) {
+        content += `</div>`;
+      }
+    }
+    return content;
+  }
+
+  // Flat rendition: one well-formed <p> per paragraph, consecutive paragraphs
+  // of the same speaker being merged into a single <p>
+  const blocks = [];
+  let last = '';
+  for (const paragraph of divs.flatMap(d => d.paragraphs)) {
+    const match = paragraph.match(speakerRegexp);
+    if (match && last && match[1] === last) {
+      blocks[blocks.length - 1] += `<br/>\n  … ${match[2]}`;
+    } else {
+      blocks.push(match ? `<cite>${match[1]}:</cite> ${match[2]}` : paragraph);
+    }
+    last = match ? match[1] : '';
+  }
+  return blocks.map(block => `<p>${block}</p>`).join('\n  ');
+}
diff --git a/tools/rename-recordings.mjs b/tools/rename-recordings.mjs
new file mode 100644
index 0000000..f4bbaaf
--- /dev/null
+++ b/tools/rename-recordings.mjs
@@ -0,0 +1,58 @@
+/**
+ * This tool is only useful once there are Zoom recordings of the breakout
+ * sessions available. It pulls and renames the recordings from local storage.
+ *
+ * To run the tool:
+ *
+ *  node tools/rename-recordings.mjs
+ *
+ * Pre-requisites:
+ * 1. Zoom recordings must have been downloaded to a local folder, with one
+ * subfolder per recording. The subfolder name must start with the session
+ * number followed by a "-", e.g., "10-ecija" (the rest does not matter).
+ * 2. The local folder must appear in a RECORDING_FOLDER_RAW env variable.
+ * 3. The prefix to use to rename the recordings must be in a RECORDING_PREFIX
+ * env variable.
+ *
+ * The tool assumes that the video file to use each time has a name that ends
+ * with "_Recording_wwwwxhhhh.mp4".
+ * 
+ * The tool also extracts the captions file, provided that its name ends with
+ * "_Recording.transcript.vtt".
+ * 
+ * Renamed recordings and captions file are saved at the root of the
+ * RECORDING_FOLDER_RAW folder.
+ */
+
+import path from 'path';
+import fs from 'fs/promises';
+import { getEnvKey } from './lib/envkeys.mjs';
+
+/**
+ * Copy the video and captions files of each subfolder of RECORDING_FOLDER_RAW
+ * to its root, named RECORDING_PREFIX + "-" + start of subfolder name up to "-".
+ */
+async function main() {
+  const RECORDING_FOLDER_RAW = await getEnvKey('RECORDING_FOLDER_RAW');
+  const RECORDING_PREFIX = await getEnvKey('RECORDING_PREFIX');
+  const filesToCopy = [
+    [/_Recording_\d{3,4}x\d{3,4}\.mp4$/, '.mp4'],
+    [/_Recording\.transcript\.vtt$/, '.vtt']
+  ];
+  const entries = await fs.readdir(RECORDING_FOLDER_RAW, { withFileTypes: true });
+  // Only subfolders hold raw recordings, renamed files end up next to them
+  for (const folder of entries.filter(e => e.isDirectory()).map(e => e.name)) {
+    const files = await fs.readdir(path.join(RECORDING_FOLDER_RAW, folder));
+    const prefix = `${RECORDING_PREFIX}-${folder.split('-')[0]}`;
+    for (const [pattern, extension] of filesToCopy) {
+      const file = files.find(f => pattern.test(f));
+      if (file) {
+        await fs.copyFile(
+          path.join(RECORDING_FOLDER_RAW, folder, file),
+          path.join(RECORDING_FOLDER_RAW, prefix + extension));
+      }
+    }
+  }
+}
+
+main().then(_ => process.exit(0));
\ No newline at end of file
diff --git a/tools/update-recording-thumbnails.mjs b/tools/update-recording-thumbnails.mjs
new file mode 100644
index 0000000..19f8910
--- /dev/null
+++ b/tools/update-recording-thumbnails.mjs
@@ -0,0 +1,76 @@
+/**
+ * This tool is only useful once recordings of breakout sessions have been
+ * uploaded to Cloudflare. It updates the thumbnails of the recordings on
+ * Cloudflare to use a screenshot at 1% of the video, the goal being to avoid
+ * that the initial black screen gets used as thumbnail.
+ *
+ * To run the tool:
+ *
+ *  node tools/update-recording-thumbnails.mjs
+ *
+ * Pre-requisites:
+ * 1. Recordings must have been uploaded to Cloudflare with a name that starts
+ * with a well-known prefix.
+ * 2. The well-known prefix must appear in a RECORDING_PREFIX env variable.
+ * 3. Cloudflare account info must appear in CLOUDFLARE_ACCOUNT and
+ * CLOUDFLARE_TOKEN env variables.
+ */
+import path from 'path';
+import fs from 'fs/promises';
+import { convert } from './lib/webvtt2html.mjs';
+import { getEnvKey } from './lib/envkeys.mjs';
+
+// List the Cloudflare Stream videos returned for the given prefix whose name
+// ends with "-<session number>.mp4", sorted by name. Throws on API failure.
+async function listRecordings(accountId, authToken, recordingPrefix) {
+  const url = `https://api.cloudflare.com/client/v4/accounts/${accountId}/stream?search=${encodeURIComponent(recordingPrefix)}`;
+  const response = await fetch(url, { headers: { 'Authorization': `Bearer ${authToken}` } });
+  const json = await response.json();
+  if (!json.success || !Array.isArray(json.result)) {
+    throw new Error(`Could not list recordings: ${JSON.stringify(json.errors)}`);
+  }
+  return json.result
+    // Skip videos that are not named after a session instead of crashing
+    .filter(v => /-\d+\.mp4$/.test(v.meta.name))
+    .map(v => ({
+      sessionId: v.meta.name.match(/-(\d+)\.mp4$/)[1],
+      name: v.meta.name,
+      videoId: v.uid
+    }))
+    .sort((v1, v2) => v1.name.localeCompare(v2.name));
+}
+
+// Ask Cloudflare to use the frame at 1% of the video as thumbnail of the
+// recording. A warning is logged when Cloudflare does not report success.
+async function updateThumbnail(recording, accountId, authToken) {
+  const response = await fetch(
+    `https://api.cloudflare.com/client/v4/accounts/${accountId}/stream/${recording.videoId}`,
+    {
+      method: 'POST',
+      headers: {
+        'Authorization': `Bearer ${authToken}`,
+        // The body is JSON, fetch would label a string body as text/plain
+        'Content-Type': 'application/json'
+      },
+      // A screenshot taken after 1% of the meeting should avoid the initial black screen
+      body: JSON.stringify({ uid: recording.videoId, thumbnailTimestampPct: 0.01 })
+    }
+  );
+  const json = await response.json();
+  if (!json.success) {
+    console.warn(`Thumbnail could not be set for session #${recording.sessionId} - ${recording.videoId}`);
+  }
+}
+
+// Update the thumbnail of all recordings found on Cloudflare, one at a time
+async function main() {
+  const account = await getEnvKey('CLOUDFLARE_ACCOUNT');
+  const token = await getEnvKey('CLOUDFLARE_TOKEN');
+  const recordings = await listRecordings(account, token, await getEnvKey('RECORDING_PREFIX'));
+  for (const recording of recordings) {
+    await updateThumbnail(recording, account, token);
+  }
+}
+
+main().then(() => process.exit(0));
+