-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
112 lines (85 loc) · 3.65 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import numpy as np
import pyloudnorm as pyln
from openai import OpenAI
import soundfile as sf
import sys, os
sys.path.insert(0, '../')
import utils
from utils.audio_generation import sample, get_model
from utils.audio_processing import compress_spectrogram_simple, compress_spectrogram_with_centroid
import warnings
def fxn():
    """Emit one ``DeprecationWarning`` whose message is "deprecated"."""
    warnings.warn(message="deprecated", category=DeprecationWarning)


# Call fxn() once with every warning filtered out.  catch_warnings() puts the
# previous filter configuration back when the block exits, so the "ignore"
# filter is only in effect for this single call.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore")
    fxn()
# NOTE: keep this at "false" -- the original author flags the setting as important.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Name of the model to load.
#   "audioldm_16k_crossattn_t5" -- smaller model, roughly 6-9 GB of GPU memory
#   "audioldm2-full"            -- larger model, roughly 12-15 GB of GPU memory
model_name = "audioldm_16k_crossattn_t5"

# ---- Audio parameters ----
sample_rate = 16_000
stft_channels = 1024
hop_length = 128
loudness_dblufs = -10.0

# ---- Diffusion parameters ----
ddim_steps = 100
guidance_scale = 3
batch_size = 1
n_candidates = 1
'''
Foley Interpreter
'''
def foley_interpreter(txt, client, model="gpt-3.5-turbo"):
    """Ask a chat model for a descriptive sound phrase that resembles ``txt``.

    Args:
        txt: Text describing the target sensation; it is spliced into the
            request sent to the model.
        client: Object exposing ``chat.completions.create(model=...,
            messages=...)``; ``main`` passes an ``openai.OpenAI`` instance.
        model: Name of the chat model to query.  Defaults to the value that
            used to be hard-coded here.

    Returns:
        The text between the first pair of double quotes in the reply.  When
        the reply contains no non-empty quoted part, the whole reply with
        surrounding whitespace removed is returned instead.

    Raises:
        ValueError: If the model's reply is empty or missing.
    """
    content = (
        "Describe a language phrase consisting of a noun verb adjective adverb like "
        "'a brown cat running purposefully and quickly' that would be a sound which could be also perceivable "
        "as touch sensation and resembles " + txt
        + ". Use simple words and long descriptions when generating the language phrase."
    )
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": content},
        ],
    )

    # The message content is optional in the API response, so guard against
    # None before doing any string handling.
    reply = response.choices[0].message.content or ""

    # Prefer the quoted phrase, but do not assume the model used quotes:
    # indexing split('"')[1] unconditionally raised IndexError on plain replies.
    pieces = reply.split('"')
    if len(pieces) > 1 and pieces[1].strip():
        foley_language_phrase = pieces[1]
    else:
        foley_language_phrase = reply.strip()

    if not foley_language_phrase:
        raise ValueError("chat model returned an empty reply for " + repr(txt))

    print(foley_language_phrase)
    return foley_language_phrase
'''
Audio Generator
'''
def audio_generator(foley_language_phrase, latent_diffusion, random_seed, *,
                    batch_size=1, ddim_steps=100, guidance_scale=3.0):
    """Generate audio for a text phrase by calling ``sample``.

    Thin wrapper around ``utils.audio_generation.sample``.

    Args:
        foley_language_phrase: Text prompt forwarded to ``sample``.
        latent_diffusion: Model object forwarded to ``sample``; ``main``
            obtains it from ``get_model``.
        random_seed: Seed forwarded to ``sample``.
        batch_size: Keyword-only; forwarded to ``sample``.
        ddim_steps: Keyword-only; forwarded to ``sample``.
        guidance_scale: Keyword-only; forwarded to ``sample``.

    The keyword-only defaults are the values that used to be hard-coded in
    the call, so existing three-argument callers behave exactly as before.

    Returns:
        Whatever ``sample`` returns.
    """
    return sample(
        latent_diffusion,
        foley_language_phrase,
        batch_size=batch_size,
        ddim_steps=ddim_steps,
        guidance_scale=guidance_scale,
        random_seed=random_seed,
        disable_tqdmoutput=False,
    )
'''
Dynamic Audio Converter
'''
def audio_post_processor(audio, freq_limit, hop_length, stft_channels, sample_rate, loudness):
    """Return the input audio together with two spectrogram-compressed versions.

    Both helpers from ``utils.audio_processing`` receive identical keyword
    settings and share one ``pyloudnorm`` meter created for ``sample_rate``.

    Args:
        audio: Audio data handed unchanged to both helpers.
        freq_limit: Forwarded to the helpers as ``limit``.
        hop_length: Forwarded to the helpers.
        stft_channels: Forwarded to the helpers.
        sample_rate: Used to build the loudness meter and forwarded.
        loudness: Forwarded to the helpers.

    Returns:
        dict with the keys ``'original'``, ``'simple compressed'`` and
        ``'compressed with centroid'``, in that order.
    """
    shared_settings = dict(
        limit=freq_limit,
        hop_length=hop_length,
        stft_channels=stft_channels,
        sample_rate=sample_rate,
        loudness_meter=pyln.Meter(sample_rate),
        loudness=loudness,
    )
    return {
        'original': audio,
        'simple compressed': compress_spectrogram_simple(audio, **shared_settings),
        'compressed with centroid': compress_spectrogram_with_centroid(audio, **shared_settings),
    }
def _read_env_file(path):
    """Parse ``KEY=VALUE`` lines from the file at ``path`` into a dict.

    Blank lines and lines whose first non-blank character is ``#`` are
    skipped.  Only the first ``=`` separates key from value, so values may
    contain ``=`` themselves.  Whitespace around keys and values is removed.

    Raises:
        ValueError: If a non-blank, non-comment line has no ``=``.
    """
    env = {}
    with open(path, 'r') as fh:
        for lineno, raw_line in enumerate(fh, start=1):
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            if '=' not in line:
                raise ValueError(
                    path + ':' + str(lineno) + ': expected KEY=VALUE, got ' + repr(line)
                )
            key, value = line.split('=', 1)
            env[key.strip()] = value.strip()
    return env


def main(prompt):
    """Run the whole pipeline for ``prompt`` and write the results as WAV files.

    Steps, in order:
      1. Load ``config/.env`` into ``os.environ``.
      2. Load the model named by ``model_name`` and create the OpenAI client.
      3. Turn ``prompt`` into a descriptive phrase (``foley_interpreter``).
      4. Generate audio for that phrase with a random seed in ``[0, 10000)``.
      5. Post-process the audio and write every variant to ``output_dir/``.

    Args:
        prompt: Text prompt; also used (sanitised) as the output file stem.
    """
    # Environment first: the client below is created without arguments and
    # presumably picks up its credentials from the environment.
    os.environ.update(_read_env_file('config/.env'))

    latent_diffusion = get_model(model_name)
    client = OpenAI()
    foley_language_phrase = foley_interpreter(prompt, client)
    audio = audio_generator(foley_language_phrase, latent_diffusion, np.random.randint(0, 10000))
    # NOTE(review): the literal -10 duplicates the module-level
    # ``loudness_dblufs`` (-10.0); consider passing that constant instead.
    audio_c = audio_post_processor(
        audio,
        freq_limit=1000,
        hop_length=hop_length,
        stft_channels=stft_channels,
        sample_rate=sample_rate,
        loudness=-10,
    )

    os.makedirs('output_dir', exist_ok=True)
    # A path separator inside the prompt would point the output file into a
    # directory that does not exist, so replace it like spaces are replaced.
    stem = prompt.replace(' ', '_')
    for separator in filter(None, (os.sep, os.altsep)):
        stem = stem.replace(separator, '_')
    for k, data in audio_c.items():
        out_path = os.path.join('output_dir', stem + '_' + k.replace(' ', '_') + '.wav')
        sf.write(out_path, data=data, samplerate=sample_rate)
if __name__ == '__main__':
    # The prompt is the first command-line argument.  Exit with a usage
    # message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' PROMPT')
    prompt = sys.argv[1]
    main(prompt)