Added sensitive words censoring

tmoroney · Dec 8, 2024 · 966ae22 · 966ae22
1 parent 2394424
commit 966ae22
Show file tree

Hide file tree

Showing 7 changed files with 194 additions and 62 deletions.
diff --git a/.github/workflows/package-mac.yml b/.github/workflows/package-mac.yml
@@ -73,7 +73,7 @@ jobs:
         python3 -m venv venv
         source venv/bin/activate
         pip install -r requirements-mac.txt
-        pyinstaller package-server.spec --noconfirm
+        pyinstaller package-mac.spec --noconfirm
         deactivate
 
     - name: Move Python Server to resources folder

diff --git a/AutoSubs-App/src-tauri/resources/AutoSubs V2.lua b/AutoSubs-App/src-tauri/resources/AutoSubs V2.lua
@@ -304,25 +304,8 @@ function GetTemplateItem(folder, templateName)
     end
 end
 
--- remove sensitive words from the text and replace some letters with asterisks
-local function RemoveSensitiveWords(input_string, censor_list)
-    -- Iterate through the list of words to censor
-    for _, word in ipairs(censor_list) do
-        -- Create a pattern to match the word (case-insensitive)
-        local pattern = word:gsub("(%W)", "%%%1") -- Escape special characters
-        pattern = utf8.lower(pattern) -- Ensure the pattern matches in lower case
-        pattern = "%f[%a]" .. pattern .. "%f[%A]" -- Match whole words only
-
-        -- Replace the word with asterisks
-        input_string = utf8.gsub(input_string, pattern, function(match)
-            return string.rep("*", utf8.len(match))
-        end)
-    end
-    return input_string
-end
-
 -- Add subtitles to the timeline using the specified template
-function AddSubtitles(filePath, trackIndex, templateName, textFormat, removePunctuation, sensitiveWords)
+function AddSubtitles(filePath, trackIndex, templateName, textFormat, removePunctuation)
     local timeline = project:GetCurrentTimeline()
 
     if trackIndex == "0" or trackIndex == "" then
@@ -409,7 +392,13 @@ function AddSubtitles(filePath, trackIndex, templateName, textFormat, removePunc
 
             -- Remove punctuation if specified
             if removePunctuation then
-                subtitleText = utf8.gsub(subtitleText, "[%p%c]", "")
+                subtitleText = utf8.gsub(subtitleText, "[%p%c]", function(c)
+                    if c == "*" then
+                        return c
+                    else
+                        return ""
+                    end
+                end)
             end
 
             -- Apply text formatting
@@ -421,10 +410,6 @@ function AddSubtitles(filePath, trackIndex, templateName, textFormat, removePunc
                 subtitleText = utf8.lower(subtitleText)
             end
 
-            -- if #sensitiveWords > 0 then
-            --     subtitleText = RemoveSensitiveWords(subtitleText, sensitiveWords)
-            -- end
-
             -- Skip if text is not compatible
             if timelineItem:GetFusionCompCount() > 0 then
                 local comp = timelineItem:GetFusionCompByIndex(1)
@@ -565,7 +550,7 @@ while not quitServer do
                     elseif data.func == "AddSubtitles" then
                         print("[AutoSubs Server] Adding subtitles to timeline...")
                         AddSubtitles(data.filePath, data.trackIndex, data.templateName, data.textFormat,
-                            data.removePunctuation, data.sensitiveWords)
+                            data.removePunctuation)
                         body = json.encode({
                             message = "Job completed"
                         })

diff --git a/AutoSubs-App/src-tauri/tauri.conf.json b/AutoSubs-App/src-tauri/tauri.conf.json
@@ -46,8 +46,7 @@
       "resources/"
     ],
     "macOS": {
-      "minimumSystemVersion": "14.0",
-      "entitlements": "macos/entitlements.plist"
+      "minimumSystemVersion": "14.0"
     },
     "windows": {
       "nsis": {

diff --git a/AutoSubs-App/src/GlobalContext.tsx b/AutoSubs-App/src/GlobalContext.tsx
@@ -283,6 +283,10 @@ export function GlobalProvider({ children }: React.PropsWithChildren<{}>) {
             setProcessingStep("Preparing to transcribe...");
 
             console.log("Audio Path: ", audioInfo.path);
+            let sensitiveWordsList: string[] = [];
+            if (sensitiveWords !== "") {
+                sensitiveWordsList = sensitiveWords.split(',').map((word: string) => word.trim().toLowerCase());
+            }
 
             let body = {
                 file_path: audioInfo.path,
@@ -295,6 +299,7 @@ export function GlobalProvider({ children }: React.PropsWithChildren<{}>) {
                 align_words: alignWords,
                 max_words: maxWords,
                 max_chars: maxChars,
+                sensitive_words: sensitiveWordsList,
                 mark_in: audioInfo.markIn
             };
 
@@ -404,10 +409,7 @@ export function GlobalProvider({ children }: React.PropsWithChildren<{}>) {
             await updateTranscript(speakers);
             filePath = await getFullTranscriptPath();
         }
-        let sensitiveWordsList: string[] = [];
-        if (sensitiveWords !== "") {
-            sensitiveWordsList = sensitiveWords.split(',').map((word: string) => word.trim().toLowerCase());
-        }
+
         try {
             const response = await fetch(resolveAPI, {
                 method: 'POST',
@@ -419,7 +421,6 @@ export function GlobalProvider({ children }: React.PropsWithChildren<{}>) {
                     trackIndex: currentTrack,
                     removePunctuation,
                     textFormat,
-                    sensitiveWords: sensitiveWordsList,
                 }),
             });
 

diff --git a/AutoSubs-App/src/pages/home-page.tsx b/AutoSubs-App/src/pages/home-page.tsx
@@ -204,7 +204,7 @@ export function HomePage() {
         maxChars,
         textFormat,
         removePunctuation,
-        //sensitiveWords,
+        sensitiveWords,
         alignWords,
         audioPath,
         setTemplate,
@@ -217,7 +217,7 @@ export function HomePage() {
         setMaxChars,
         setTextFormat,
         setRemovePunctuation,
-        // setSensitiveWords,
+        setSensitiveWords,
         setAlignWords,
         setIsLoading,
         setError,
@@ -669,10 +669,10 @@ export function HomePage() {
                                 </div>
                                 <Switch checked={removePunctuation} onCheckedChange={(checked) => setRemovePunctuation(checked)} />
                             </div>
-                            {/* <div className="grid gap-3">
+                            <div className="grid gap-3">
                                 <Label htmlFor="sensitiveWords">Sensored Words</Label>
                                 <Input value={sensitiveWords} id="sensitiveWords" type="string" placeholder="bomb, gun, kill" onChange={(e) => setSensitiveWords(e.target.value)} />
-                            </div> */}
+                            </div>
                             <div className="grid grid-cols-2 gap-4">
                                 <div className="grid gap-3">
                                     <Label htmlFor="maxWords">Max words</Label>

diff --git a/Transcription-Server/package-mac.spec b/Transcription-Server/package-mac.spec
@@ -0,0 +1,100 @@
+import os
+from PyInstaller.utils.hooks import collect_data_files, collect_submodules, collect_all
+import certifi
+
+# Start with empty lists; the collect_* calls below append to them
+hiddenimports = []
+datas = []
+binaries = []
+
+# Collect all data files, binaries, and hiddenimports for 'speechbrain'
+speechbrain_datas, speechbrain_binaries, speechbrain_hiddenimports = collect_all('speechbrain')
+
+# Include 'speechbrain' components
+hiddenimports += speechbrain_hiddenimports
+datas += speechbrain_datas
+binaries += speechbrain_binaries
+
+# Add every submodule of the remaining packages as hidden imports
+hiddenimports += collect_submodules('mlx')
+hiddenimports += collect_submodules('mlx_whisper')
+hiddenimports += collect_submodules('stable_whisper')
+hiddenimports += collect_submodules('pytorch_lightning')
+hiddenimports += collect_submodules('pyannote')
+hiddenimports += collect_submodules('torch')
+hiddenimports += collect_submodules('torchaudio')
+hiddenimports += collect_submodules('transformers')
+hiddenimports += collect_submodules('huggingface_hub')
+
+# Collect data files
+datas += collect_data_files('mlx')
+datas += collect_data_files('mlx_whisper')
+datas += collect_data_files('stable_whisper')
+datas += collect_data_files('pytorch_lightning')
+datas += collect_data_files('lightning_fabric')
+datas += collect_data_files('pyannote')
+datas += [(os.path.abspath('ffmpeg_bin_mac'), 'ffmpeg_bin_mac')]
+
+# Exclude unnecessary modules to reduce size and startup time
+excludes = [
+    # Development tools
+    'jupyter', 'IPython', 'notebook', 'pytest',
+
+    # GUI-related modules
+    'tkinter', 'PyQt5', 'PyQt6', 'PySide2', 'PySide6', 'wx', 'pyglet', 'pycairo', 'pygobject', 'pyopengl',
+
+    # Scientific libraries and test suites
+    'matplotlib.tests', 'numpy.tests', 'scipy.spatial.cKDTree', 'scipy.tests',
+
+    # Unused data and legacy components
+    'torchvision',
+
+    # Large/unused packages
+    'cv2', 'Pillow', 'geopy',
+
+    # PyInstaller-related
+    'pyinstaller', 'PyInstaller.utils', 'PyInstaller.compat',
+
+    # Database clients and task queues
+    'sqlalchemy', 'psycopg2', 'pymysql', 'redis', 'celery', 'rq',
+]
+
+a = Analysis(
+    ['transcription-server.py'],
+    pathex=[],
+    binaries=binaries,   # Include collected binaries
+    datas=datas,
+    hiddenimports=hiddenimports,
+    hookspath=[],
+    hooksconfig={},
+    runtime_hooks=[],
+    excludes=excludes,
+    noarchive=False,     # Include PYZ
+    optimize=0,
+)
+
+pyz = PYZ(a.pure, a.zipped_data, cipher=None, optimize=2)
+
+exe = EXE(
+    pyz,
+    a.scripts,
+    exclude_binaries=True,    # Exclude binaries from EXE
+    name='transcription-server',  # File name of the generated executable
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,              # Set strip to False for debugging
+    upx=False,                # Disable UPX compression
+    console=False,
+    disable_windowed_traceback=False,
+)
+
+coll = COLLECT(
+    exe,
+    a.binaries,               # Include binaries here
+    a.zipfiles,
+    a.datas,                  # Include data files
+    strip=False,
+    upx=False,
+    upx_exclude=[],
+    name='Transcription-Server',  # Name of the output folder created under dist/
+)