App denoising #852

Open · wants to merge 5 commits into base: develop

Changes from all commits
1 change: 1 addition & 0 deletions .gitignore
@@ -113,3 +113,4 @@ examples/app_yolov8_classification/xcore_flash_binary.out
examples/app_yolov8_classification/yolov8n-cls.onnx
examples/app_yolov8_classification/yolov8n-cls.pt
examples/app_yolov8_classification/yolov8n-cls_saved_model/
examples/app_denoising/data
39 changes: 39 additions & 0 deletions examples/app_denoising/README.rst
@@ -0,0 +1,39 @@
======================
De-noising model
======================

Installation
============

1. **Install Dependencies**:

.. code-block:: shell

pip install -r requirements.txt

2. **Download Dataset**:

.. code-block:: shell

bash download.sh

This script downloads a sample of the ``DNS-Challenge`` dataset for clean speech, a sample of the ``MS-SNSD`` dataset for noise, and room impulse responses from OpenSLR. Everything is saved in the ``data/`` directory.

3. **Convert Dataset**:

.. code-block:: shell

python dataset.py

This converts the downloaded datasets into (noisy, clean) training pairs and saves them as TFRecords in ``data/records_64/`` (the directory name comes from ``NUM_BINS`` in ``dataset.py``). A quick way to verify the generated records is shown below.
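
A minimal sanity-check sketch; it reuses the ``read_tfrecords`` helper and the ``data/records_64`` output path from ``dataset.py``:

.. code-block:: python

   from dataset import read_tfrecords, NUM_BINS

   train_ds, val_ds = read_tfrecords(f"data/records_{NUM_BINS}")
   noisy, clean = next(iter(train_ds))  # one batch of (noisy, clean) feature pairs
   print(noisy.shape, clean.shape)      # expected: (16, 1281, 64, 1) for each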

Training
========

1. **Initiate Training**:

.. code-block:: shell

python train.py

This script trains a de-noising and de-reverberation model on the prepared data and saves it as ``model.h5``; a rough sketch of such a script is given below.
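
``train.py`` is not part of this diff. The following is only a minimal sketch of how ``read_tfrecords`` from ``dataset.py`` and ``get_trunet`` from ``model.py`` could be wired together; the loss, optimizer, and epoch count are placeholder assumptions rather than the actual training recipe:

.. code-block:: python

   import tensorflow as tf
   from dataset import read_tfrecords, NUM_BINS
   from model import get_trunet, SAMPLES

   def set_shapes(x, y):
       # parse_tensor drops static shape information; restore it for Keras
       x = tf.ensure_shape(x, (None, SAMPLES, NUM_BINS, 1))
       y = tf.ensure_shape(y, (None, SAMPLES, NUM_BINS, 1))
       return x, y

   train_ds, val_ds = read_tfrecords(f"data/records_{NUM_BINS}")
   train_ds, val_ds = train_ds.map(set_shapes), val_ds.map(set_shapes)

   model = get_trunet(num_freqs=NUM_BINS, num_samples=SAMPLES)
   model.compile(optimizer="adam", loss="mse")  # placeholder choices
   model.fit(train_ds, validation_data=val_ds, epochs=10)
   model.save("model.h5")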
223 changes: 223 additions & 0 deletions examples/app_denoising/dataset.py
@@ -0,0 +1,223 @@
import os
import tensorflow as tf
import random
from tqdm import tqdm
import numpy as np
from scipy.io import wavfile
from scipy.signal import stft, resample, fftconvolve
import noisereduce as nr
from glob import glob

SAMPLES = None
WINDOW_SIZE = 512
HOP_SIZE = 128
FFT_SIZE = 512
FL = 512 * 8 * 40
FADE_IN_LENGTH = 512 * 50
SR = 16000
NUM_BINS = 64
POWER_FACTOR = .3
tf.keras.utils.set_random_seed(42)


def unique_log_bins(low, high, nbins):
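    # Return `nbins` unique, roughly log-spaced integer bin edges between `low` and
    # `high`, re-spacing whenever integer rounding produces duplicate edges.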
if low < 1:
bins = np.geomspace(1, high, nbins-1, dtype=int)
bins = np.concatenate(([0], bins))
else:
bins = np.geomspace(low, high, nbins, dtype=int)
while len(np.unique(bins)) != nbins:
unique_vals, counts = np.unique(bins, return_counts=True)
duplicates = np.argwhere(counts > 1)
arg_first_unique = duplicates[-1][0] + 1
first_unique = unique_vals[arg_first_unique]
total_duplicates = np.sum(unique_vals < first_unique)
next_bins = np.geomspace(
first_unique, high, nbins - total_duplicates, dtype=int)
bins = np.concatenate((unique_vals[:arg_first_unique], next_bins))
return bins


def log_filterbank(Fs, nfft, n_filters=24, f_low=0, f_high=None, window_function=np.hanning):
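    # Build an (n_filters, nfft // 2 + 1) bank of overlapping, windowed bands with
    # log-spaced edges; multiplying an STFT magnitude frame by `fbank.T` compresses
    # it to `n_filters` band energies.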
f_high = f_high or Fs/2.0
assert (f_high <= Fs/2.0), "Log filterbank higher frequency cannot exceed Fs/2!"
bin_low = np.floor(f_low*(nfft)/Fs)
bin_high = np.floor(f_high*(nfft)/Fs)
Hz_bins = unique_log_bins(bin_low, bin_high, n_filters)
fbank = np.zeros([n_filters, nfft // 2 + 1])
for n in range(n_filters-1):
dist = int(Hz_bins[n+1] - Hz_bins[n])
wind = window_function(2*dist + 1)
fbank[n, Hz_bins[n]:Hz_bins[n+1]] = wind[dist:-1]
fbank[n+1, Hz_bins[n]:Hz_bins[n+1]] = wind[:dist]
fbank[0, :Hz_bins[0]] = 1.0
fbank[-1, Hz_bins[-1]:] = 1.0
return fbank


F_BANK = log_filterbank(SR, WINDOW_SIZE, NUM_BINS)
_, NOISE_AUDIO = wavfile.read("data/noise.wav")


def infinite(gen, *args, **kwargs):
while True:
yield from gen(*args, **kwargs)


def nsf(signal, noise, snr):
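    # Noise scale factor: scaling `noise` by this value gives the requested SNR (in dB)
    # relative to `signal`.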
signal_power = np.mean(signal ** 2)
noise_power = np.mean(noise ** 2)
return np.sqrt((signal_power / noise_power) * 10 ** (-snr / 10.0))


def apply_reverb(signal, rir):
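    # Convolve with the room impulse response and blend dry/wet with a random ratio.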
ratio = np.random.uniform(0, 1)
r_signal = fftconvolve(signal, rir, mode="full")[:len(signal)]
return ratio * r_signal + (1.-ratio) * signal


def process_wave(signal, return_orig=False):
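    # STFT -> log filterbank energies -> power-law compression; optionally also
    # return the complex STFT frames.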
_, _, s = stft(signal, fs=SR, nperseg=WINDOW_SIZE,
noverlap=WINDOW_SIZE - HOP_SIZE, nfft=FFT_SIZE)
mag = np.abs(s.T) @ F_BANK.T
mag = mag[..., None].astype(np.float32)**POWER_FACTOR
if return_orig:
return mag, s.T
return mag


def pad(sig):
tot_pad = FL - len(sig)
left_pad = np.random.randint(0, tot_pad + 1)
right_pad = tot_pad - left_pad
return np.concatenate([np.zeros(left_pad), sig, np.zeros(right_pad)])


def get_input(signal, noise, rirs, return_phase=False):
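    # Build one training pair: fade the clean signal in, pad both clips to FL samples,
    # reverberate each with its own RIR channel, mix at a random SNR in [0, 30] dB,
    # and convert both the noisy mix and the clean target to filterbank features.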
snr = np.random.uniform(0, 30)
noise_factor = nsf(signal, noise, snr)
if len(signal) > FADE_IN_LENGTH:
signal[:FADE_IN_LENGTH] *= np.arange(0, 1, 1/FADE_IN_LENGTH)
signal, noise = pad(signal), pad(noise)
r_signal = apply_reverb(signal, rirs[..., 0])
r_noise = apply_reverb(noise, rirs[..., 1])
noisy_signal = r_signal + r_noise * noise_factor
if return_phase:
ins, orig = process_wave(noisy_signal, True)
outs, perf = process_wave(signal, True)
return ins, outs, orig, perf
else:
ins, outs = process_wave(noisy_signal), process_wave(signal)
return ins, outs


def signal_gen(folder, chop=True, is_clean=False):
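    # Yield audio from the wav files under `folder`, resampled to SR. Clean speech is
    # first denoised against the reference noise clip; with chop=True, FL-sample chunks
    # are yielded (the final one may be shorter), otherwise channel pairs from
    # multichannel (RIR) files.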
paths = glob(f"{folder}/**/*.wav", recursive=True)
random.shuffle(paths)
for path in paths:
fs, s = wavfile.read(path)
if is_clean:
s = nr.reduce_noise(s, sr=fs, y_noise=NOISE_AUDIO,
stationary=True, n_fft=512,
time_mask_smooth_ms=32,
freq_mask_smooth_hz=188,
n_std_thresh_stationary=.8)
s = resample(s, int(len(s) / fs * SR))
if chop:
yield from (s[i:i+FL] for i in range(0, len(s), FL))
elif s.shape[0] <= FL and len(s.shape) == 2:
yield from (s[:, i-2:i] for i in range(2, s.shape[1], 2))


def chop_silence(wav):
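    # Drop leading samples before the first strong peak in channel 0 and normalise by
    # the peak magnitude.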
aw = np.abs(wav[..., 0])
maw = np.max(aw)
index = np.where(aw / maw > 0.7)[0][0]
return wav[index:] / maw


def data_gen(sig_fol, noise_fol, rir_fol, phase=False):
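    # Pair each clean speech clip with an endless stream of noise clips and
    # silence-trimmed RIRs, then turn each triple into a training pair.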
voices = signal_gen(sig_fol, is_clean=True)
noises = infinite(signal_gen, noise_fol)
rirs = map(chop_silence, infinite(signal_gen, rir_fol, False))
combined = zip(voices, noises, rirs)
yield from (get_input(s, n, r, phase) for s, n, r in combined)


def _bytes_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[tf.io.serialize_tensor(value).numpy()]))


def serialize_example(signal, noise):
feature = {
'signal': _bytes_feature(signal),
'noise': _bytes_feature(noise),
}
example_proto = tf.train.Example(
features=tf.train.Features(feature=feature))
return example_proto.SerializeToString()


def write_tfrecords(folder_path, data_generator, samples_per_file=256):
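    # Stream (noisy, clean) feature pairs from the generator into TFRecord shards of
    # `samples_per_file` examples each.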
    os.makedirs(folder_path, exist_ok=True)  # the records directory is not created elsewhere
    file_count = samples_written = 0
tfrecord_writer = None
for signal, noise in tqdm(data_generator):
if not samples_written:
file_name = f"{folder_path}/data_{file_count}.tfrecord"
tfrecord_writer = tf.io.TFRecordWriter(file_name)
tf_example = serialize_example(signal, noise)
tfrecord_writer.write(tf_example)
samples_written += 1
if samples_written == samples_per_file:
tfrecord_writer.close()
file_count += 1
samples_written = 0
if samples_written:
tfrecord_writer.close()


def ds_from_paths(paths, batch_size):
ds = tf.data.TFRecordDataset(filenames=paths)
ds = ds.map(_parse_function).shuffle(buffer_size=100)
return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


def read_tfrecords(folder_path, bs=16):
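    # Shuffle the shards and split them ~90/10 into training and validation datasets.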
fp = glob(os.path.join(folder_path, '*.tfrecord'))
random.shuffle(fp)
nt = len(fp) // 10
return ds_from_paths(fp[nt:], bs), ds_from_paths(fp[:nt], bs)


def load_dataset(batch_size):
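    # Build the dataset directly from the generator (on the fly) instead of reading
    # it back from TFRecord shards.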
return tf.data.Dataset.from_generator(
lambda: data_gen("data/datasets_fullband/",
"data/MS-SNSD/", "data/rirs_noises/"),
output_signature=(
tf.TensorSpec(shape=(SAMPLES, NUM_BINS, 1), dtype=np.float32),
tf.TensorSpec(shape=(SAMPLES, NUM_BINS, 1), dtype=np.float32),
)
).batch(batch_size)


def _parse_function(proto):
keys_to_features = {
'signal': tf.io.FixedLenFeature([], tf.string),
'noise': tf.io.FixedLenFeature([], tf.string)
}
parsed_features = tf.io.parse_single_example(proto, keys_to_features)
parsed_features['signal'] = tf.io.parse_tensor(
parsed_features['signal'], out_type=tf.float32)
parsed_features['noise'] = tf.io.parse_tensor(
parsed_features['noise'], out_type=tf.float32)

return parsed_features['signal'], parsed_features['noise']


if __name__ == "__main__":
gen = data_gen("data/datasets_fullband/",
"data/MS-SNSD/", "data/rirs_noises/")
write_tfrecords(f"data/records_{NUM_BINS}", gen)
# for a, b in gen:
# print(a.shape, b.shape)
# break
37 changes: 37 additions & 0 deletions examples/app_denoising/download.sh
@@ -0,0 +1,37 @@
#!/usr/bin/env bash

### DOWNLOAD VOICE SAMPLES ###
BLOB_NAMES=(
clean_fullband/datasets_fullband.clean_fullband.french_speech_000_NA_NA.tar.bz2
)
AZURE_URL="https://dns4public.blob.core.windows.net/dns4archive/datasets_fullband"

DATA_DIR="data"
OUTPUT_PATH="$DATA_DIR/datasets_fullband"

mkdir -p "$OUTPUT_PATH"/{clean_fullband,noise_fullband}

for BLOB in "${BLOB_NAMES[@]}"
do
URL="$AZURE_URL/$BLOB"
echo "Download: $BLOB"
curl "$URL" | tar -C "$OUTPUT_PATH" -f - -x -j
done

### DOWNLOAD NOISE SAMPLES ###
repo_subdir="noise_train"

git -C "$DATA_DIR" init
git -C "$DATA_DIR" config core.sparseCheckout true
echo "noise_train/*" > "$DATA_DIR/.git/info/sparse-checkout"
git -C "$DATA_DIR" remote add -f origin https://github.com/microsoft/MS-SNSD.git
git -C "$DATA_DIR" pull origin master
rm -rf "$DATA_DIR/.git"

### DOWNLOAD RIRS ###
openslr_url="https://www.openslr.org/resources/28/rirs_noises.zip"
openslr_dir="$DATA_DIR/rirs_noises"
mkdir -p "$openslr_dir"
wget -O "$openslr_dir/rirs_noises.zip" "$openslr_url"
unzip "$openslr_dir/rirs_noises.zip" -d "$openslr_dir"
rm "$openslr_dir/rirs_noises.zip"
84 changes: 84 additions & 0 deletions examples/app_denoising/model.py
@@ -0,0 +1,84 @@
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import (Input, GRU, ReLU, Reshape,
BatchNormalization, Conv2D,
Conv2DTranspose, Concatenate)

TIMESTEPS = None
SAMPLES = 1281


def bn_relu(x):
return ReLU()(BatchNormalization()(x))


def simple_sigmoid(x):
return tf.clip_by_value(ReLU()(x + .5), 0, 1)


def simple_tanh(x):
return tf.clip_by_value(x, -1, 1)


def gru_block(x, num_samples, enc_f, state_input):
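    # Flatten the frequency/channel axes of each frame, run a GRU over time (optionally
    # stateful for streaming inference), then restore the (time, freq, channels) layout.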
num_chans = 32
x = Reshape([num_samples, num_chans * enc_f])(x)
if state_input is not None:
x, state = GRU(num_chans * enc_f, return_state=True,
return_sequences=True, unroll=True)(x, state_input)
# activation=simple_tanh,
# recurrent_activation=simple_sigmoid)(x, state_input)
else:
x, state = GRU(num_chans * enc_f, return_sequences=True)(x), None
# activation=simple_tanh,
# recurrent_activation=simple_sigmoid)(x), None
    x = Reshape([num_samples, enc_f, num_chans])(x)
return x, state


def get_trunet(num_freqs=64, num_samples=SAMPLES, inference=False):
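    # TRU-Net-style encoder/decoder: strided 1xK convolutions over frequency encode each
    # frame, a GRU models the time axis, and transposed convolutions with skip connections
    # decode a per-bin sigmoid mask; during training the mask is applied to the input.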
channels = [24, 32, 48, 48, 64, 64]
strides = [2, 1, 2, 1, 2, 2]
k_sizes = [5, 3, 5, 3, 5, 3]
zipped = list(zip(k_sizes, strides, channels))
inp = Input(shape=(num_samples, num_freqs, 1))
state_input = Input(shape=(64,)) if inference else None
x = BatchNormalization()(inp)
x = Conv2D(channels[0], kernel_size=[1, k_sizes[0]],
strides=[1, strides[0]], padding="same", use_bias=False)(x)
x = bn_relu(x)
xs = [x]
for k, s, c in zipped[1:]:
x = Conv2D(c, kernel_size=[1, k], strides=[
1, s], padding="same", use_bias=False)(x)
x = bn_relu(x)
xs.append(x)
x = Conv2D(32, kernel_size=[1, 2], strides=[1, 2],
padding="same", use_bias=False)(x)
x = bn_relu(x)
x, new_state = gru_block(x, num_samples, 2, state_input)
x = Conv2DTranspose(32, kernel_size=[1, 2], strides=[1, 2],
padding="same", use_bias=False)(x)
x = bn_relu(x)
for (k, s, c), skip in list(zip(zipped, xs))[:1:-1]:
cs = (c * 2) // 3
x = Concatenate()([x, skip])
x = Conv2D(cs, kernel_size=[1, 1], use_bias=False)(x)
x = BatchNormalization()(x)
x = Conv2DTranspose(cs, kernel_size=[1, k], strides=[1, s],
padding="same", use_bias=False)(x)
x = bn_relu(x)
x = Concatenate()([x, xs[0]])
x = Conv2DTranspose(1, kernel_size=[1, k_sizes[0]],
strides=[1, strides[0]], padding="same")(x)
out = tf.keras.activations.sigmoid(x)
inputs = [inp, state_input] if inference else inp
outputs = [out, new_state] if inference else out * inp
model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
return model


if __name__ == "__main__":
model = get_trunet(64, 1, inference=True)
print(np.sum([np.prod(i.shape) for i in model.trainable_weights]))
tf.keras.utils.plot_model(model, show_shapes=True)
Binary file added examples/app_denoising/samples/input_1.wav
Binary file not shown.
Binary file added examples/app_denoising/samples/input_2.wav
Binary file not shown.
Binary file added examples/app_denoising/samples/input_3.wav
Binary file not shown.
Binary file added examples/app_denoising/samples/input_4.wav
Binary file not shown.
Binary file added examples/app_denoising/samples/input_5.wav
Binary file not shown.
Binary file added examples/app_denoising/samples/input_6.wav
Binary file not shown.