From 7e764b6ce8628dae46a1fb19124de11597b4df43 Mon Sep 17 00:00:00 2001 From: rishikanthc Date: Tue, 15 Oct 2024 18:58:33 -0700 Subject: [PATCH 1/7] fix(diarization): working diarization using word level timestamps --- src/lib/fileFuncs.ts | 1 + src/lib/queue.ts | 198 +++++++++++++++++++++++++++++++------------ 2 files changed, 147 insertions(+), 52 deletions(-) diff --git a/src/lib/fileFuncs.ts b/src/lib/fileFuncs.ts index ecce0ec..2f54183 100644 --- a/src/lib/fileFuncs.ts +++ b/src/lib/fileFuncs.ts @@ -30,6 +30,7 @@ export async function ensureCollectionExists(pb) { } }, { name: 'title', type: 'text' }, + { name: 'rttm', type: 'text' }, { name: 'summary', type: 'text' }, { name: 'processed', type: 'bool' }, { name: 'model', type: 'text' }, diff --git a/src/lib/queue.ts b/src/lib/queue.ts index 32b83b9..c5166bf 100644 --- a/src/lib/queue.ts +++ b/src/lib/queue.ts @@ -86,7 +86,7 @@ pauseAndCleanQueue(); clearQueue(); // Helper function to execute shell commands and log output -const execCommandWithLogging = (cmd: string, job: Job) => { +const execCommandWithLogging = (cmd: string, job: Job, progress: number) => { return new Promise((resolve, reject) => { const process = exec(cmd); @@ -104,11 +104,16 @@ const execCommandWithLogging = (cmd: string, job: Job) => { // Check if stderr contains a progress update from Whisper const progressMatch = data.toString().match(/progress\s*=\s*(\d+)%/); if (progressMatch) { - const progress = parseInt(progressMatch[1], 10); - if (progress == 100) { - return; - } - await job.updateProgress(progress); + const tprogress = parseInt(progressMatch[1], 10); + + // if (tprogress == 100) { + // return; + // } + + const _remaining = 100 - progress + const _prog = _remaining * tprogress / 100 + + await job.updateProgress(_prog); } }); @@ -169,70 +174,41 @@ const worker = new Worker( await execCommandWithLogging(diarizeCmd, job); await job.log(`Diarization completed successfully`); - job.updateProgress(22.5); + job.updateProgress(35); // Read and parse the RTTM file const rttmContent = fs.readFileSync(rttmPath, 'utf-8'); const segments = parseRttm(rttmContent); await job.log(`Parsed RTTM file for record ${recordId}`); - // Split the audio into segments using FFmpeg - const segmentPaths = await splitAudioIntoSegments(ffmpegPath, segments, baseUrl, job); - await job.log(`Audio split into segments for record ${recordId}`); - - job.updateProgress(30); const settingsRecords = await pb.collection('settings').getList(1, 1); const settings = settingsRecords.items[0]; - const transcriptPath = path.resolve(env.SCRIBO_FILES, 'transcripts', `${recordId}`); - fs.mkdir(transcriptPath, { recursive: true }, (err) => { + + // Execute whisper.cpp command and log output + const transcriptdir = path.resolve(env.SCRIBO_FILES, 'transcripts', `${recordId}`); + const transcriptPath= path.resolve(env.SCRIBO_FILES, 'transcripts', `${recordId}`, `${recordId}`); + fs.mkdir(transcriptdir, { recursive: true }, (err) => { if (err) throw err; }); - let transcription = []; - let prog = 0; - - for (let i = 0; i < segmentPaths.length; i++) { - prog = 30 + (70 * i) / segmentPaths.length; - job.updateProgress(prog); - const segmentFilePath = path.resolve(baseUrl, `segment_${i}.wav`); - const segmentTranscriptPath = path.resolve(transcriptPath, `segment_${i}`); - - // Execute whisper command on the segment - const whisperCmd = `./whisper.cpp/main -m ./whisper.cpp/models/ggml-${settings.model}.en.bin -f ${segmentFilePath} -of ${segmentTranscriptPath} -otxt -t ${settings.threads} -p ${settings.processors}`; - await execCommandWithLogging(whisperCmd, job); - await job.log(`Whisper transcription for segment ${i} of ${recordId} completed`); - - // Read the transcript JSON file and append it to the combined transcript - const transcriptxt = fs.readFileSync(`${segmentTranscriptPath}.txt`, 'utf-8'); - transcription.push({ - timestamps: { - from: formatRttmTimestamp(segments[i].startTime), - to: formatRttmTimestamp(segments[i].startTime + segments[i].duration) - }, - speaker: segments[i].speaker, - text: transcriptxt.trim() - }); - } - - // const settingsRecords = await pb.collection('settings').getList(1, 1); - // const settings = settingsRecords.items[0]; - - // // Execute whisper.cpp command and log output - // const transcriptPath = path.resolve(env.SCRIBO_FILES, 'transcripts', `${recordId}`); - // const whisperCmd = `whisper -m /models/ggml-${settings.model}.en.bin -f ${ffmpegPath} -oj -of ${transcriptPath} -t ${settings.threads} -p ${settings.processors} -pp`; - // await execCommandWithLogging(whisperCmd, job); - // await job.log(`Whisper transcription for ${recordId} completed`); + const whisperCmd = `./whisper.cpp/main -m ./whisper.cpp/models/ggml-${settings.model}.en.bin -f ${ffmpegPath} -oj -of ${transcriptPath} -t ${settings.threads} -p ${settings.processors} -pp -ml 1`; + await execCommandWithLogging(whisperCmd, job, 35); + await job.log(`Whisper transcription for ${recordId} completed`); // Read and update transcript - // const transcript = fs.readFileSync(`${transcriptPath}.json`, 'utf-8'); - const validTranscription = JSON.stringify({"transcription": transcription}); - console.log(validTranscription); + const transcript = fs.readFileSync(`${transcriptPath}.json`, 'utf-8'); + let transcriptJson = JSON.parse(transcript); + console.log(transcriptJson); + + const diarizedTranscript = generateTranscript(transcriptJson.transcription, rttmContent) + transcriptJson.transcription = diarizedTranscript const audioPeaks = fs.readFileSync(`${audioPath}.json`, 'utf-8'); const upd = await pb.collection('scribo').update(recordId, { // transcript: '{ "test": "hi" }', - transcript: validTranscription, + transcript: transcriptJson, + rttm: rttmContent, processed: true, peaks: JSON.parse(audioPeaks) }); @@ -302,3 +278,121 @@ async function splitAudioIntoSegments(audioPath, segments, outputDir, job) { } return segmentPaths; } + +function preprocessWordTimestamps(wordTimestamps) { + const cleanedTimestamps = []; + let previousWord = null; + + wordTimestamps.forEach((word, index) => { + const text = word.text.trim(); + + // Handle periods and other punctuation + if (text === '.') { + if (previousWord) { + // Append the period to the previous word + previousWord.text += text; + previousWord.timestamps.to = word.timestamps.to; + } + } else if (text.startsWith("'")) { + // Append apostrophe-starting words to the previous word + if (previousWord) { + previousWord.text += text; + previousWord.timestamps.to = word.timestamps.to; + } + } else if (text.length === 1 && text !== 'a') { + // Handle single character words (except "a") + // if (previousWord) { + // // Append single character to the previous word + // previousWord.text += ` ${text}`; + // previousWord.timestamps.to = word.timestamps.to; + // } else if (index + 1 < wordTimestamps.length) { + // // If no previous word, prepend to the next word + // const nextWord = wordTimestamps[index + 1]; + // nextWord.text = `${text} ${nextWord.text}`; + // nextWord.timestamps.from = word.timestamps.from; + // } + console.log('deleting char') + + } else if (text.length === 1 && text === 'a') { + // Keep "a" as a separate word + cleanedTimestamps.push(word); + previousWord = word; + } else { + // Remove other single-character symbols (e.g., parentheses, commas) + if (!/^[\.,!?;:()\[\]]$/.test(text)) { + cleanedTimestamps.push(word); + previousWord = word; + } + } + }); + + return cleanedTimestamps; +} + +function generateTranscript(wordys, rttmString) { + const speakerSegments = parseRttm(rttmString); + const wordTimestamps = preprocessWordTimestamps(wordys); + + const finalTranscript = []; + let currentSegment = { + text: "", + timestamps: { from: null, to: null }, + speaker: null + }; + + wordTimestamps.forEach(word => { + const wordStart = word.offsets.from; + const wordEnd = word.offsets.to; + + const matchingSpeakerSegment = speakerSegments.find(speakerSegment => { + const speakerStart = speakerSegment.startTime * 1000; + const speakerEnd = speakerStart + (speakerSegment.duration * 1000); + return wordEnd >= speakerStart && wordEnd <= speakerEnd; + }); + + const assignedSpeaker = matchingSpeakerSegment ? matchingSpeakerSegment.speaker : currentSegment.speaker; + + if (!matchingSpeakerSegment) { + console.log('---------> Speaker unknown') + } + + // If the current segment is for the same speaker, append the word + if (currentSegment.speaker === assignedSpeaker) { + currentSegment.text += word.text; + currentSegment.timestamps.to = word.timestamps.to; // Update end time + } else if (currentSegment === null) { + currentSegment.speaker = assignedSpeaker; + currentSegment.text += word.text + currentSegment.timestamps.to = word.timestamps.to; // Update end time + + } else { + // Push the current segment if it has text + if (currentSegment.text.length > 0) { + finalTranscript.push({ ...currentSegment }); + } + + // Start a new segment for the new speaker + currentSegment = { + text: word.text, + timestamps: { from: word.timestamps.from, to: word.timestamps.to }, + speaker: assignedSpeaker + }; + } + }); + + // Push the last segment if any + if (currentSegment.text.length > 0) { + finalTranscript.push(currentSegment); + } + + return finalTranscript; +} + +function timestampToSeconds(timestamp) { + const [hours, minutes, seconds] = timestamp.split(":"); + const [sec, ms] = seconds.split(","); + return parseFloat(hours) * 3600 + parseFloat(minutes) * 60 + parseFloat(sec) + parseFloat(ms) / 1000; +} + + + From 312e83eeec004a87e432e7777fb25293effefbc7 Mon Sep 17 00:00:00 2001 From: rishikanthc Date: Tue, 15 Oct 2024 19:14:26 -0700 Subject: [PATCH 2/7] feat(speaker-labels): Display speakers in transcript pane --- src/lib/components/DisplayPane.svelte | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/lib/components/DisplayPane.svelte b/src/lib/components/DisplayPane.svelte index df1f017..a6b3331 100644 --- a/src/lib/components/DisplayPane.svelte +++ b/src/lib/components/DisplayPane.svelte @@ -2,6 +2,7 @@ import { ScrollArea } from 'bits-ui'; import { Button } from 'bits-ui'; import { Tabs } from 'bits-ui'; + import { Volume2 } from 'lucide-svelte'; import AudioViz from '$lib/components/AudioViz.svelte'; import { Combobox } from 'bits-ui'; import { Sparkles, ChevronsUpDown, Search, Check } from 'lucide-svelte'; @@ -88,12 +89,25 @@ {#if transcript} {#each transcript as t} {#if t.text !== ''} -
-
+
+
+ {#if t.speaker} +
+ +
{t.speaker}
+
+ {/if} {t.timestamps.from.split(',')[0]}
-

{t.text}

+

{t.text}

{/if} From 5c383d0cc2b2d8d64f0b3db1dab15c6f383dc965 Mon Sep 17 00:00:00 2001 From: rishikanthc Date: Wed, 16 Oct 2024 09:46:55 -0700 Subject: [PATCH 3/7] Adding docker build Github Action Workflow --- .github/workflows/github-actions-docker.yml | 33 +++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .github/workflows/github-actions-docker.yml diff --git a/.github/workflows/github-actions-docker.yml b/.github/workflows/github-actions-docker.yml new file mode 100644 index 0000000..6fba150 --- /dev/null +++ b/.github/workflows/github-actions-docker.yml @@ -0,0 +1,33 @@ +name: ci + +on: + push: + +jobs: + docker: + runs-on: ubuntu-latest + steps: + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Build and push + uses: docker/build-push-action@v6 + with: + platforms: linux/amd64,linux/arm64 + push: true + tags: | + rishikanthc/scriberr:latest + + - name: Push image to GHCR + run: | + docker buildx imagetools create \ + --tag ghcr.io/rishikanthc/scriberr:latest From fb28efceb71ae7a46dfec3869af58fb454c0a182 Mon Sep 17 00:00:00 2001 From: rishikanthc Date: Wed, 16 Oct 2024 10:20:48 -0700 Subject: [PATCH 4/7] build action --- .github/workflows/github-actions-docker.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/github-actions-docker.yml b/.github/workflows/github-actions-docker.yml index 6fba150..83bcba8 100644 --- a/.github/workflows/github-actions-docker.yml +++ b/.github/workflows/github-actions-docker.yml @@ -25,9 +25,9 @@ jobs: platforms: linux/amd64,linux/arm64 push: true tags: | - rishikanthc/scriberr:latest + rishikanthc/scriberr:nightly - name: Push image to GHCR run: | docker buildx imagetools create \ - --tag ghcr.io/rishikanthc/scriberr:latest + --tag ghcr.io/rishikanthc/scriberr:nightly From 0dc1001a404a6700c5733e0f052854878af9b53e Mon Sep 17 00:00:00 2001 From: Rishikanth Chandrasekaran Date: Wed, 16 Oct 2024 11:09:07 -0700 Subject: [PATCH 5/7] Update github-actions-docker.yml --- .github/workflows/github-actions-docker.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/github-actions-docker.yml b/.github/workflows/github-actions-docker.yml index 83bcba8..f61b6d0 100644 --- a/.github/workflows/github-actions-docker.yml +++ b/.github/workflows/github-actions-docker.yml @@ -13,19 +13,20 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - - name: Login to Docker Hub + - name: Login to GHCR uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Build and push + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push to GHCR uses: docker/build-push-action@v6 with: platforms: linux/amd64,linux/arm64 push: true tags: | - rishikanthc/scriberr:nightly + ghcr.io/rishikanthc/scriberr:nightly - name: Push image to GHCR run: | From 0b90a78903ed1805b29197b016037af7ae283239 Mon Sep 17 00:00:00 2001 From: Rishikanth Chandrasekaran Date: Wed, 16 Oct 2024 12:25:40 -0700 Subject: [PATCH 6/7] Update github-actions-docker.yml --- .github/workflows/github-actions-docker.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/github-actions-docker.yml b/.github/workflows/github-actions-docker.yml index f61b6d0..8bff8e7 100644 --- a/.github/workflows/github-actions-docker.yml +++ b/.github/workflows/github-actions-docker.yml @@ -31,4 +31,5 @@ jobs: - name: Push image to GHCR run: | docker buildx imagetools create \ + ghcr.io/rishikanthc/scriberr:nightly \ --tag ghcr.io/rishikanthc/scriberr:nightly From b432e500ae4287d4c7d134b8ac3371c63ceba0c0 Mon Sep 17 00:00:00 2001 From: Rishikanth Chandrasekaran Date: Wed, 16 Oct 2024 13:04:20 -0700 Subject: [PATCH 7/7] fix(build): Cache binaries to reduce build time --- .github/workflows/github-actions-docker.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/workflows/github-actions-docker.yml b/.github/workflows/github-actions-docker.yml index 8bff8e7..e7590e4 100644 --- a/.github/workflows/github-actions-docker.yml +++ b/.github/workflows/github-actions-docker.yml @@ -19,6 +19,24 @@ jobs: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} + + # Cache Node.js dependencies + - name: Cache Node.js modules + uses: actions/cache@v3 + with: + path: ~/.npm + key: ${{ runner.os }}-node-${{ hashFiles('package-lock.json') }} + restore-keys: | + ${{ runner.os }}-node- + + - name: Cache compiled binaries + uses: actions/cache@v3 + with: + path: | + /usr/local/bin/ + /usr/local/share/man/man1/ + /usr/local/share/man/man5/ + key: ${{ runner.os }}-build-${{ hashFiles('Dockerfile') }} - name: Build and push to GHCR uses: docker/build-push-action@v6 @@ -27,6 +45,8 @@ jobs: push: true tags: | ghcr.io/rishikanthc/scriberr:nightly + cache-from: type=registry,ref=ghcr.io/rishikanthc/scriberr:nightly + cache-to: type=registry,ref=ghcr.io/rishikanthc/scriberr:nightly,mode=max - name: Push image to GHCR run: |