Skip to content

Commit

Permalink
Added sensitive words censoring
Browse files Browse the repository at this point in the history
  • Loading branch information
tmoroney committed Dec 8, 2024
1 parent 2394424 commit 966ae22
Show file tree
Hide file tree
Showing 7 changed files with 194 additions and 62 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/package-mac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ jobs:
python3 -m venv venv
source venv/bin/activate
pip install -r requirements-mac.txt
pyinstaller package-server.spec --noconfirm
pyinstaller package-mac.spec --noconfirm
deactivate
- name: Move Python Server to resources folder
Expand Down
33 changes: 9 additions & 24 deletions AutoSubs-App/src-tauri/resources/AutoSubs V2.lua
Original file line number Diff line number Diff line change
Expand Up @@ -304,25 +304,8 @@ function GetTemplateItem(folder, templateName)
end
end

-- Censor sensitive words in a piece of text.
--
-- Every whole-word occurrence of an entry in `censor_list` is replaced by a
-- run of asterisks with the same character count as the matched text.
-- Matching ignores ASCII letter case, so the entry "kill" also censors "Kill"
-- and "KILL".
--
-- NOTE(review): relies on the `utf8` table in scope providing `lower`, `gsub`
-- and `len`; `lower` and `gsub` are not part of the stock Lua utf8 library, so
-- this presumably comes from a bundled UTF-8 module - confirm.
--
-- @param input_string string  text to censor
-- @param censor_list  table   array of words to censor (nil is treated as empty)
-- @return string  the text with matched words replaced by asterisks
local function RemoveSensitiveWords(input_string, censor_list)
    for _, word in ipairs(censor_list or {}) do
        -- Skip empty / non-string entries (e.g. produced by a trailing comma)
        if type(word) == "string" and word ~= "" then
            -- Build the pattern in a single pass over the lowercased word:
            --   * ASCII letters become a two-case set ("k" -> "[kK]") so the
            --     match is case-insensitive without altering the input text;
            --   * pattern magic characters are escaped with "%".
            -- Every other byte (digits, spaces, UTF-8 sequences) is left
            -- untouched so multi-byte characters are never split by an escape.
            local pattern = utf8.lower(word):gsub("[A-Za-z%^%$%(%)%%%.%[%]%*%+%-%?]", function(ch)
                if ch:find("^[A-Za-z]$") then
                    return "[" .. ch:lower() .. ch:upper() .. "]"
                end
                return "%" .. ch
            end)

            -- Frontier patterns restrict the match to whole words only
            pattern = "%f[%a]" .. pattern .. "%f[%A]"

            -- Replace the matched word with one asterisk per character
            input_string = utf8.gsub(input_string, pattern, function(match)
                return string.rep("*", utf8.len(match))
            end)
        end
    end
    return input_string
end

-- Add subtitles to the timeline using the specified template
function AddSubtitles(filePath, trackIndex, templateName, textFormat, removePunctuation, sensitiveWords)
function AddSubtitles(filePath, trackIndex, templateName, textFormat, removePunctuation)
local timeline = project:GetCurrentTimeline()

if trackIndex == "0" or trackIndex == "" then
Expand Down Expand Up @@ -409,7 +392,13 @@ function AddSubtitles(filePath, trackIndex, templateName, textFormat, removePunc

-- Remove punctuation if specified
if removePunctuation then
subtitleText = utf8.gsub(subtitleText, "[%p%c]", "")
subtitleText = utf8.gsub(subtitleText, "[%p%c]", function(c)
if c == "*" then
return c
else
return ""
end
end)
end

-- Apply text formatting
Expand All @@ -421,10 +410,6 @@ function AddSubtitles(filePath, trackIndex, templateName, textFormat, removePunc
subtitleText = utf8.lower(subtitleText)
end

-- if #sensitiveWords > 0 then
-- subtitleText = RemoveSensitiveWords(subtitleText, sensitiveWords)
-- end

-- Skip if text is not compatible
if timelineItem:GetFusionCompCount() > 0 then
local comp = timelineItem:GetFusionCompByIndex(1)
Expand Down Expand Up @@ -565,7 +550,7 @@ while not quitServer do
elseif data.func == "AddSubtitles" then
print("[AutoSubs Server] Adding subtitles to timeline...")
AddSubtitles(data.filePath, data.trackIndex, data.templateName, data.textFormat,
data.removePunctuation, data.sensitiveWords)
data.removePunctuation)
body = json.encode({
message = "Job completed"
})
Expand Down
3 changes: 1 addition & 2 deletions AutoSubs-App/src-tauri/tauri.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,7 @@
"resources/"
],
"macOS": {
"minimumSystemVersion": "14.0",
"entitlements": "macos/entitlements.plist"
"minimumSystemVersion": "14.0"
},
"windows": {
"nsis": {
Expand Down
11 changes: 6 additions & 5 deletions AutoSubs-App/src/GlobalContext.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,10 @@ export function GlobalProvider({ children }: React.PropsWithChildren<{}>) {
setProcessingStep("Preparing to transcribe...");

console.log("Audio Path: ", audioInfo.path);
let sensitiveWordsList: string[] = [];
if (sensitiveWords !== "") {
sensitiveWordsList = sensitiveWords.split(',').map((word: string) => word.trim().toLowerCase());
}

let body = {
file_path: audioInfo.path,
Expand All @@ -295,6 +299,7 @@ export function GlobalProvider({ children }: React.PropsWithChildren<{}>) {
align_words: alignWords,
max_words: maxWords,
max_chars: maxChars,
sensitive_words: sensitiveWordsList,
mark_in: audioInfo.markIn
};

Expand Down Expand Up @@ -404,10 +409,7 @@ export function GlobalProvider({ children }: React.PropsWithChildren<{}>) {
await updateTranscript(speakers);
filePath = await getFullTranscriptPath();
}
let sensitiveWordsList: string[] = [];
if (sensitiveWords !== "") {
sensitiveWordsList = sensitiveWords.split(',').map((word: string) => word.trim().toLowerCase());
}

try {
const response = await fetch(resolveAPI, {
method: 'POST',
Expand All @@ -419,7 +421,6 @@ export function GlobalProvider({ children }: React.PropsWithChildren<{}>) {
trackIndex: currentTrack,
removePunctuation,
textFormat,
sensitiveWords: sensitiveWordsList,
}),
});

Expand Down
8 changes: 4 additions & 4 deletions AutoSubs-App/src/pages/home-page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ export function HomePage() {
maxChars,
textFormat,
removePunctuation,
//sensitiveWords,
sensitiveWords,
alignWords,
audioPath,
setTemplate,
Expand All @@ -217,7 +217,7 @@ export function HomePage() {
setMaxChars,
setTextFormat,
setRemovePunctuation,
// setSensitiveWords,
setSensitiveWords,
setAlignWords,
setIsLoading,
setError,
Expand Down Expand Up @@ -669,10 +669,10 @@ export function HomePage() {
</div>
<Switch checked={removePunctuation} onCheckedChange={(checked) => setRemovePunctuation(checked)} />
</div>
{/* <div className="grid gap-3">
<div className="grid gap-3">
<Label htmlFor="sensitiveWords">Sensored Words</Label>
<Input value={sensitiveWords} id="sensitiveWords" type="string" placeholder="bomb, gun, kill" onChange={(e) => setSensitiveWords(e.target.value)} />
</div> */}
</div>
<div className="grid grid-cols-2 gap-4">
<div className="grid gap-3">
<Label htmlFor="maxWords">Max words</Label>
Expand Down
100 changes: 100 additions & 0 deletions Transcription-Server/package-mac.spec
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import os
from PyInstaller.utils.hooks import collect_data_files, collect_submodules, collect_all
import certifi

# Packages whose submodules are added to the hidden-import list, in order.
SUBMODULE_PACKAGES = (
    'mlx',
    'mlx_whisper',
    'stable_whisper',
    'pytorch_lightning',
    'pyannote',
    'torch',
    'torchaudio',
    'transformers',
    'huggingface_hub',
)

# Packages whose data files are added to the bundle, in order.
DATA_FILE_PACKAGES = (
    'mlx',
    'mlx_whisper',
    'stable_whisper',
    'pytorch_lightning',
    'lightning_fabric',
    'pyannote',
)

# 'speechbrain' is collected wholesale: data files, binaries and hidden imports.
sb_datas, sb_binaries, sb_hiddenimports = collect_all('speechbrain')

# Seed the three accumulators with the speechbrain results.
hiddenimports = list(sb_hiddenimports)
datas = list(sb_datas)
binaries = list(sb_binaries)

for package in SUBMODULE_PACKAGES:
    hiddenimports.extend(collect_submodules(package))

for package in DATA_FILE_PACKAGES:
    datas.extend(collect_data_files(package))

# Ship the local 'ffmpeg_bin_mac' directory under the same name in the bundle.
datas.append((os.path.abspath('ffmpeg_bin_mac'), 'ffmpeg_bin_mac'))

# Modules left out of the build to reduce size and startup time, by category.
DEV_TOOL_EXCLUDES = ['jupyter', 'IPython', 'notebook', 'pytest']
GUI_EXCLUDES = [
    'tkinter', 'PyQt5', 'PyQt6', 'PySide2', 'PySide6',
    'wx', 'pyglet', 'pycairo', 'pygobject', 'pyopengl',
]
SCIENTIFIC_TEST_EXCLUDES = [
    'matplotlib.tests', 'numpy.tests', 'scipy.spatial.cKDTree', 'scipy.tests',
]
LEGACY_EXCLUDES = ['torchvision']
LARGE_UNUSED_EXCLUDES = ['cv2', 'Pillow', 'geopy']
PYINSTALLER_EXCLUDES = ['pyinstaller', 'PyInstaller.utils', 'PyInstaller.compat']
THIRD_PARTY_EXCLUDES = ['sqlalchemy', 'psycopg2', 'pymysql', 'redis', 'celery', 'rq']

excludes = (
    DEV_TOOL_EXCLUDES
    + GUI_EXCLUDES
    + SCIENTIFIC_TEST_EXCLUDES
    + LEGACY_EXCLUDES
    + LARGE_UNUSED_EXCLUDES
    + PYINSTALLER_EXCLUDES
    + THIRD_PARTY_EXCLUDES
)

# Analyse the server entry script together with everything collected above.
a = Analysis(
    ['transcription-server.py'],
    pathex=[],
    hiddenimports=hiddenimports,
    datas=datas,
    binaries=binaries,
    excludes=excludes,
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    noarchive=False,
    optimize=0,
)

pyz = PYZ(a.pure, a.zipped_data, cipher=None, optimize=2)

# The executable carries only the scripts and the PYZ archive; binaries are
# excluded here and handed to COLLECT below instead.
exe = EXE(
    pyz,
    a.scripts,
    name='transcription-server',
    exclude_binaries=True,
    console=False,
    debug=False,
    strip=False,
    upx=False,
    bootloader_ignore_signals=False,
    disable_windowed_traceback=False,
)

# Gather the executable, binaries, zip files and data files under one name.
coll = COLLECT(
    exe,
    a.binaries,
    a.zipfiles,
    a.datas,
    name='Transcription-Server',
    strip=False,
    upx=False,
    upx_exclude=[],
)
Loading

0 comments on commit 966ae22

Please sign in to comment.