From a8cf5271cf0e0dff73067dd8f9a1c9ba0ef35cca Mon Sep 17 00:00:00 2001 From: Francois Daoust Date: Wed, 20 Sep 2023 12:06:20 +0200 Subject: [PATCH] Add a few tools to process recordings --- package-lock.json | 7 ++ package.json | 1 + tools/create-recording-pages.mjs | 155 ++++++++++++++++++++++++++ tools/lib/webvtt2html.mjs | 121 ++++++++++++++++++++ tools/rename-recordings.mjs | 58 ++++++++++ tools/update-recording-thumbnails.mjs | 76 +++++++++++++ 6 files changed, 418 insertions(+) create mode 100644 tools/create-recording-pages.mjs create mode 100644 tools/lib/webvtt2html.mjs create mode 100644 tools/rename-recordings.mjs create mode 100644 tools/update-recording-thumbnails.mjs diff --git a/package-lock.json b/package-lock.json index 53cd211..5fa8f3d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,6 +9,7 @@ "irc": "^0.5.2", "puppeteer": "^20.5.0", "seedrandom": "^3.0.5", + "webvtt-parser": "^2.2.0", "yaml": "^2.3.1" } }, @@ -1306,6 +1307,12 @@ "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", "dev": true }, + "node_modules/webvtt-parser": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/webvtt-parser/-/webvtt-parser-2.2.0.tgz", + "integrity": "sha512-FzmaED+jZyt8SCJPTKbSsimrrnQU8ELlViE1wuF3x1pgiQUM8Llj5XWj2j/s6Tlk71ucPfGSMFqZWBtKn/0uEA==", + "dev": true + }, "node_modules/whatwg-url": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", diff --git a/package.json b/package.json index ee80825..7b4433c 100644 --- a/package.json +++ b/package.json @@ -4,6 +4,7 @@ "irc": "^0.5.2", "puppeteer": "^20.5.0", "seedrandom": "^3.0.5", + "webvtt-parser": "^2.2.0", "yaml": "^2.3.1" } } diff --git a/tools/create-recording-pages.mjs b/tools/create-recording-pages.mjs new file mode 100644 index 0000000..1e90ad1 --- /dev/null +++ b/tools/create-recording-pages.mjs @@ -0,0 +1,155 @@ +/** + * This tool is only useful once recordings of breakout 
sessions have been + * uploaded to Cloudflare. It creates HTML recording pages for each of these + * recordings that contain the video and an HTML rendition of the captions as + * a transcript. + * + * To run the tool: + * + * node tools/create-recording-pages.mjs + * + * Pre-requisites: + * 1. Recordings must have been uploaded to Cloudflare with a name that starts + * with a well-known prefix. + * 2. The well-known prefix must appear in a RECORDING_PREFIX env variable. + * 3. Cloudflare account info must appear in CLOUDFLARE_ACCOUNT and + * CLOUDFLARE_TOKEN env variables. + * 4. The RECORDING_FOLDER env variable must target the local folder to use to + * save recording pages. + * 5. The RECORDING_FOLDER folder must contain a "recording-template.html" page + * that contains the template to use for each recording page, see for example: + * https://www.w3.org/2023/09/breakouts/recording-template.html + * + * The tool assumes that the recordings are named prefix-xx.mp4, where xx is + * the breakout session number. It creates "recording-xx.html" pages in the + * recording folder. 
+ */ + +import path from 'path'; +import fs from 'fs/promises'; +import { convert } from './lib/webvtt2html.mjs'; +import { getEnvKey } from './lib/envkeys.mjs'; +import { fetchProject } from './lib/project.mjs'; +import { validateSession } from './lib/validate.mjs'; +import { todoStrings } from './lib/todostrings.mjs'; + +/** + * Query the Cloudflare Stream API for the videos of the account, passing + * recordingPrefix as the "search" query parameter, and return one descriptor + * per video (sessionId, name, title, videoId, preview, embedUrl, captions), + * sorted by name. The sessionId is the number captured from the + * "-<digits>.mp4" suffix of the video name; a name without that suffix makes + * match() return null and the function throw. + */ +async function listRecordings(accountId, authToken, recordingPrefix) { + const response = await fetch( + `https://api.cloudflare.com/client/v4/accounts/${accountId}/stream?search=${recordingPrefix}`, + { + headers: { + 'Authorization': `Bearer ${authToken}` + } + } + ); + const json = await response.json(); + const recordings = json.result + .map(v => Object.assign({ + sessionId: v.meta.name.match(/-(\d+)\.mp4$/)[1], + name: v.meta.name, + title: v.meta.name, + videoId: v.uid, + preview: v.preview, + embedUrl: v.preview.replace(/watch$/, 'iframe'), + captions: v.preview.replace(/watch$/, 'captions/en') + })) + .sort((v1, v2) => v1.name.localeCompare(v2.name)); + return recordings; +} + +/** + * Create the "recording-<sessionId>.html" page for the given recording in + * recordingFolder, starting from the "recording-template.html" file read from + * that same folder. Sets recording.transcript to the result of convert() on + * the captions URL, then substitutes every property of the recording in the + * template: {{{{name}}}} as JSON, then {{{name}}} as escaped text, then + * {{name}} as raw text. + */ +async function createRecordingPage(recording, recordingFolder) { + let template = await fs.readFile(path.join(recordingFolder, 'recording-template.html'), 'utf8'); + + recording.transcript = await convert(recording.captions, { clean: true }); + + // Replace content that needs to be serialized as JSON + for (const property of Object.keys(recording)) { + const regexp = new RegExp(`\{\{\{\{${property}\}\}\}\}`, 'g'); + template = template.replace(regexp, JSON.stringify(recording[property], null, 2)); + } + + // Replace content that needs to be escaped for use in HTML attributes + // NOTE(review): as written, the first and fourth replacements below map a + // character to itself and the others do not read as valid code; presumably + // the replacement strings were HTML entities - confirm against the repository. + for (const property of Object.keys(recording)) { + const regexp = new RegExp(`\{\{\{${property}\}\}\}`, 'g'); + template = template.replace(regexp, + ('' + recording[property] || '') + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"') + .replace(/'/g, ''')); + } + + // Replace raw text content + for (const property of Object.keys(recording)) { + const regexp = new 
RegExp(`\{\{${property}\}\}`, 'g'); + template = template.replace(regexp, recording[property]); + } + + // Write resulting recording page + await fs.writeFile(path.join(recordingFolder, `recording-${recording.sessionId}.html`), template, 'utf8'); +} + +/** + * Entry point: retrieve the project through fetchProject(), list the + * recordings through listRecordings(), then validate the matching session and + * create a recording page for each recording. Settings are read through + * getEnvKey(). + */ +async function main() { + // First, retrieve known information about the project + const PROJECT_OWNER = await getEnvKey('PROJECT_OWNER'); + const PROJECT_NUMBER = await getEnvKey('PROJECT_NUMBER'); + const CHAIR_W3CID = await getEnvKey('CHAIR_W3CID', {}, true); + console.log(); + console.log(`Retrieve project ${PROJECT_OWNER}/${PROJECT_NUMBER}...`); + const project = await fetchProject(PROJECT_OWNER, PROJECT_NUMBER); + if (!project) { + throw new Error(`Project ${PROJECT_OWNER}/${PROJECT_NUMBER} could not be retrieved`); + } + project.chairsToW3CID = CHAIR_W3CID; + console.log(`- ${project.sessions.length} sessions`); + console.log(`- ${project.rooms.length} rooms`); + console.log(`- ${project.slots.length} slots`); + console.log(`Retrieve project ${PROJECT_OWNER}/${PROJECT_NUMBER}... done`); + + console.log(); + console.log('List recordings...'); + const CLOUDFLARE_ACCOUNT = await getEnvKey('CLOUDFLARE_ACCOUNT'); + const CLOUDFLARE_TOKEN = await getEnvKey('CLOUDFLARE_TOKEN'); + const RECORDING_PREFIX = await getEnvKey('RECORDING_PREFIX'); + const RECORDING_FOLDER = await getEnvKey('RECORDING_FOLDER');; + const recordings = await listRecordings(CLOUDFLARE_ACCOUNT, CLOUDFLARE_TOKEN, RECORDING_PREFIX); + console.log(`- found ${recordings.length} recordings`); + console.log('List recordings... 
done'); + + console.log(); + console.log('Create recording pages...'); + for (const recording of recordings) { + const session = project.sessions.find(s => s.number === parseInt(recording.sessionId, 10)); + // NOTE(review): find() returns undefined when no session has that number, + // in which case the next line throws on session.title. + console.log(`- create page for ${recording.sessionId} - ${session.title}`); + await validateSession(session.number, project); + const desc = session.description; + recording.title = session.title; + recording.githubIssue = `https://github.com/${session.repository}/issues/${session.number}`; + const links = [ + { + title: 'Session proposal on GitHub', + url: recording.githubIssue + } + ]; + if (desc.materials.slides && !todoStrings.includes(desc.materials.slides.toUpperCase())) { + links.push({ + title: 'Slides', + url: desc.materials.slides + }); + } + if (desc.materials.minutes && !todoStrings.includes(desc.materials.minutes.toUpperCase())) { + links.push({ + title: 'Session minutes', + url: desc.materials.minutes + }); + } + recording.links = links + .map(l => `
  • ${l.title}
  • `) + .join('\n'); + await createRecordingPage(recording, RECORDING_FOLDER); + } + console.log('Create recording pages... done'); +} + +main().then(_ => process.exit(0)); \ No newline at end of file diff --git a/tools/lib/webvtt2html.mjs b/tools/lib/webvtt2html.mjs new file mode 100644 index 0000000..750b31c --- /dev/null +++ b/tools/lib/webvtt2html.mjs @@ -0,0 +1,121 @@ +import webvttParser from 'webvtt-parser'; + +const parser = new webvttParser.WebVTTParser(); + +/** + * Fetch the WebVTT file at vttUrl, parse it, group the text of the cues into + * paragraphs and return the resulting markup as a string. + * + * A cue whose id starts with "slide-" starts a new group of paragraphs; a cue + * whose id ends with "-p" or whose text contains ":" starts a new paragraph. + * + * Options read by the code: clean (strip slide-navigation phrases and the + * ", you know," filler), splitPerSlide, slideset, markupStart and markupEnd. + * + * The process exits with code 1 when the file cannot be parsed as WebVTT. + */ +export async function convert(vttUrl, options) { + options = options || {}; + + function cleanSentence(sentence) { + if (options.clean) { + sentence = sentence.replace(/^slide [a-z0-9]*\.?/i, ''); + sentence = sentence.replace(/^next slide\.?/i, ''); + sentence = sentence.replace(/^next page\.?/i, ''); + sentence = sentence.replace(/^moving to next slide\.?/i, ''); + sentence = sentence.replace(/^moving to next page\.?/i, ''); + sentence = sentence.replace(/, you know, ?/g, ' '); + } + return sentence; + } + + const response = await fetch(vttUrl); + const vtt = await response.text(); + + let cues; + try { + ({cues} = parser.parse(vtt)); + } catch (e) { + console.error(`Could not parse ${vttUrl} as WebVTT: ` + e); + process.exit(1); + } + + cues.forEach(c => c.text = c.text + .replace(/]*>/, '') + .replace(/<\/v>/, '') + .replace('"','')); + if (options.clean) { + cues.forEach(c => c.text = c.text.replace(/^slide [0-9]+$/i, '')); + } + + const divs = [{ + slide: "1", + paragraphs: [] + }]; + let p = ''; + cues.forEach(c => { + if (c.id.startsWith("slide-")) { + if (cleanSentence(p)) { + divs[divs.length-1].paragraphs.push(cleanSentence(p)); + } + divs.push({ + slide: c.id.substring("slide-".length), + paragraphs: [] + }); + p = ''; + } else if (c.id.endsWith("-p")) { + if (cleanSentence(p)) { + divs[divs.length-1].paragraphs.push(cleanSentence(p)); + p = c.text; + } + p = ''; + } else if (c.text.match(/:/)) { + if (cleanSentence(p)) { + divs[divs.length-1].paragraphs.push(cleanSentence(p)); + p = 
c.text; + } + p = ''; + } + p += (p ? ' ' : '') + c.text; + }); + + // Output final sentence + if (cleanSentence(p)) { + divs[divs.length-1].paragraphs.push(cleanSentence(p)); + } + + let content = ''; + let pid = 1; + if (options.splitPerSlide) { + for (let i = 0 ; i < divs.length; i++) { + if (options.slideset) { + content += `
    `; + content += `Slide ${divs[i].slide} of ${divs.length}\n`; + } + content += (options.markupStart || `
    `) + "\n"; + + for (const p of divs[i].paragraphs) { + const match = p.match(/^(.*):\s*(.*)$/); + if (match) { + content += `

    ${match[1]}: ${match[2]}

    \n`; + } + else { + content += `

    ${p}

    \n`; + } + pid += 1; + } + content += (options.markupEnd || '
    ') + "\n\n"; + if (options.slideset) { + content += `
    `; + } + } + } else { + let last = ''; + content += '

    '; + for (const p of divs.map(d => d.paragraphs).flat().flat()) { + const match = p.match(/^(.*):\s*(.*)$/); + if (match) { + if (last && match[1] === last) { + content += `
    \n … ${match[2]}`; + } + else { + content += `

    \n

    ${match[1]}: ${match[2]}`; + } + last = match[1]; + } + else { + content += `

    \n ${p}`; + } + } + } + + return content; +} diff --git a/tools/rename-recordings.mjs b/tools/rename-recordings.mjs new file mode 100644 index 0000000..f4bbaaf --- /dev/null +++ b/tools/rename-recordings.mjs @@ -0,0 +1,58 @@ +/** + * This tool is only useful once there are Zoom recordings of the breakout + * sessions available. It pulls and renames the recordings from local storage. + * + * To run the tool: + * + * node tools/rename-recordings.mjs + * + * Pre-requisites: + * 1. Zoom recordings must have been downloaded to a local folder, with one + * subfolder per recording. The subfolder name must start with the session + * number followed by a "-", e.g., "10-ecija" (the rest does not matter). + * 2. The local folder must appear in a RECORDING_FOLDER_RAW env variable. + * 3. The prefix to use to rename the recordings must be in a RECORDING_PREFIX + * env variable. + * + * The tool assumes that the video file to use each time has a name that ends + * with "_Recording_wwwwxhhhh.mp4". + * + * The tool also extracts the captions file, provided that its name ends with + * "_Recording.transcript.vtt". + * + * Renamed recordings and captions file are saved at the root of the + * RECORDING_FOLDER_RAW folder. 
+ */
+
+import path from 'path';
+import fs from 'fs/promises';
+import { getEnvKey } from './lib/envkeys.mjs';
+
+/**
+ * Copy the video and captions files found in each session subfolder of the
+ * RECORDING_FOLDER_RAW folder to the root of that folder, named after the
+ * RECORDING_PREFIX prefix and the session number that starts the subfolder
+ * name: "<prefix>-<number>.mp4" and "<prefix>-<number>.vtt".
+ *
+ * Subfolders that contain no matching video or captions file are left alone.
+ */
+async function main() {
+  const RECORDING_FOLDER_RAW = await getEnvKey('RECORDING_FOLDER_RAW');
+  const RECORDING_PREFIX = await getEnvKey('RECORDING_PREFIX');
+  const folders = await fs.readdir(RECORDING_FOLDER_RAW);
+  for (const folder of folders) {
+    // Skip entries whose name contains a "." (presumably plain files, such as
+    // the recordings already copied to the root folder by a previous run)
+    if (folder.includes('.')) {
+      continue;
+    }
+    // All paths are relative to RECORDING_FOLDER_RAW (the code used to
+    // reference an undeclared "rootFolder" variable here and below)
+    const files = await fs.readdir(path.join(RECORDING_FOLDER_RAW, folder));
+    const prefix = `${RECORDING_PREFIX}-${folder.split('-')[0]}`;
+
+    const recording = files.find(f => f.match(/_Recording_\d{3,4}x\d{3,4}\.mp4$/));
+    if (recording) {
+      await fs.copyFile(
+        path.join(RECORDING_FOLDER_RAW, folder, recording),
+        path.join(RECORDING_FOLDER_RAW, prefix + '.mp4'));
+    }
+
+    const subtitles = files.find(f => f.match(/_Recording\.transcript\.vtt$/));
+    if (subtitles) {
+      await fs.copyFile(
+        path.join(RECORDING_FOLDER_RAW, folder, subtitles),
+        path.join(RECORDING_FOLDER_RAW, prefix + '.vtt'));
+    }
+  }
+}
+
+main().then(_ => process.exit(0));
\ No newline at end of file
diff --git a/tools/update-recording-thumbnails.mjs b/tools/update-recording-thumbnails.mjs
new file mode 100644
index 0000000..19f8910
--- /dev/null
+++ b/tools/update-recording-thumbnails.mjs
@@ -0,0 +1,76 @@
+/**
+ * This tool is only useful once recordings of breakout sessions have been
+ * uploaded to Cloudflare. It updates the thumbnails of the recordings on
+ * Cloudflare to use a screenshot at 1% of the video, the goal being to avoid
+ * that the initial black screen gets used as thumbnail.
+ *
+ * To run the tool:
+ *
+ *  node tools/update-recording-thumbnails.mjs
+ *
+ * Pre-requisites:
+ * 1. Recordings must have been uploaded to Cloudflare with a name that starts
+ * with a well-known prefix.
+ * 2. The well-known prefix must appear in a RECORDING_PREFIX env variable.
+ * 3. Cloudflare account info must appear in CLOUDFLARE_ACCOUNT and
+ * CLOUDFLARE_TOKEN env variables.
+ */
+
+import path from 'path';
+import fs from 'fs/promises';
+import { convert } from './lib/webvtt2html.mjs';
+import { getEnvKey } from './lib/envkeys.mjs';
+
+/**
+ * Query the Cloudflare Stream API for the videos of the account, passing
+ * recordingPrefix as the "search" query parameter, and return one descriptor
+ * per video (sessionId, name, videoId), sorted by name.
+ *
+ * The sessionId is the number captured from the "-<digits>.mp4" suffix of the
+ * video name; a name without that suffix makes the function throw.
+ */
+async function listRecordings(accountId, authToken, recordingPrefix) {
+  // The prefix is user-provided: encode it so that characters such as "&" or
+  // "#" cannot alter the query string
+  const search = encodeURIComponent(recordingPrefix);
+  const response = await fetch(
+    `https://api.cloudflare.com/client/v4/accounts/${accountId}/stream?search=${search}`,
+    {
+      headers: {
+        'Authorization': `Bearer ${authToken}`
+      }
+    }
+  );
+  const json = await response.json();
+  const recordings = json.result
+    .map(v => ({
+      sessionId: v.meta.name.match(/-(\d+)\.mp4$/)[1],
+      name: v.meta.name,
+      videoId: v.uid
+    }))
+    .sort((v1, v2) => v1.name.localeCompare(v2.name));
+  return recordings;
+}
+
+/**
+ * Ask Cloudflare to use the frame located at 1% of the video as thumbnail
+ * for the given recording. A warning gets logged when the "success" flag of
+ * the response is not set; the function does not throw in that case.
+ */
+async function updateThumbnail(recording, accountId, authToken) {
+  const response = await fetch(
+    `https://api.cloudflare.com/client/v4/accounts/${accountId}/stream/${recording.videoId}`,
+    {
+      method: 'POST',
+      headers: {
+        'Authorization': `Bearer ${authToken}`,
+        // The body is JSON: say so explicitly, fetch would otherwise label
+        // a string body as plain text
+        'Content-Type': 'application/json'
+      },
+      body: JSON.stringify({
+        uid: recording.videoId,
+        // Consider that a screenshot taken after 1% of the meeting
+        // will be a good thumbnail
+        thumbnailTimestampPct: 0.01
+      }, null, 2)
+    }
+  );
+  const json = await response.json();
+  if (!json.success) {
+    console.warn(`Thumbnail could not be set for session #${recording.sessionId} - ${recording.videoId}`);
+  }
+}
+
+/**
+ * Entry point: list the recordings whose name matches RECORDING_PREFIX and
+ * update their thumbnails one after the other. Settings are read through
+ * getEnvKey().
+ */
+async function main() {
+  const CLOUDFLARE_ACCOUNT = await getEnvKey('CLOUDFLARE_ACCOUNT');
+  const CLOUDFLARE_TOKEN = await getEnvKey('CLOUDFLARE_TOKEN');
+  const RECORDING_PREFIX = await getEnvKey('RECORDING_PREFIX');
+  const recordings = await listRecordings(CLOUDFLARE_ACCOUNT, CLOUDFLARE_TOKEN, RECORDING_PREFIX);
+  for (const recording of recordings) {
+    await updateThumbnail(recording, CLOUDFLARE_ACCOUNT, CLOUDFLARE_TOKEN);
+  }
+}
+
+main().then(_ => process.exit(0));
+