-
Notifications
You must be signed in to change notification settings - Fork 0
/
tesi.bib
410 lines (370 loc) · 17.2 KB
/
tesi.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
@Article{Goodfellow,
  author        = {{Goodfellow}, Ian J. and {Pouget-Abadie}, Jean and {Mirza}, Mehdi and {Xu}, Bing and {Warde-Farley}, David and {Ozair}, Sherjil and {Courville}, Aaron and {Bengio}, Yoshua},
  journal       = {arXiv e-prints},
  title         = {{Generative Adversarial Networks}},
  year          = {2014},
  month         = jun,
  adsnote       = {Provided by the SAO/NASA Astrophysics Data System},
  adsurl        = {https://ui.adsabs.harvard.edu/abs/2014arXiv1406.2661G},
  archiveprefix = {arXiv},
  eprint        = {1406.2661},
  groups        = {GAN Related},
  keywords      = {Statistics - Machine Learning, Computer Science - Machine Learning},
  primaryclass  = {stat.ML},
}
@InProceedings{CycleGAN2017,
  author    = {Zhu, Jun-Yan and Park, Taesung and Isola, Phillip and Efros, Alexei A.},
  title     = {{Unpaired Image-to-Image Translation Using Cycle-Consistent Adversarial Networks}},
  booktitle = {2017 IEEE International Conference on Computer Vision (ICCV)},
  year      = {2017},
  pages     = {2242--2251},
  doi       = {10.1109/ICCV.2017.244},
  groups    = {GAN Related},
}
@InProceedings{MaskCyclegan-VC,
  author    = {Kaneko, Takuhiro and Kameoka, Hirokazu and Tanaka, Kou and Hojo, Nobukatsu},
  title     = {{Maskcyclegan-VC: Learning Non-Parallel Voice Conversion with Filling in Frames}},
  booktitle = {ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  year      = {2021},
  pages     = {5919--5923},
  doi       = {10.1109/ICASSP39728.2021.9414851},
  groups    = {Non Parallel VC},
}
@Article{remez1981speech,
  author    = {Remez, Robert E. and Rubin, Philip E. and Pisoni, David B. and Carrell, Thomas D.},
  journal   = {Science},
  title     = {Speech perception without traditional speech cues},
  year      = {1981},
  number    = {4497},
  pages     = {947--950},
  volume    = {212},
  doi       = {10.1126/science.7233191},
  groups    = {Psychoacoustics Related},
  publisher = {American Association for the Advancement of Science},
}
@Article{speaker-recognition,
  author  = {Kabir, Muhammad Mohsin and Mridha, M. F. and Shin, Jungpil and Jahan, Israt and Ohi, Abu Quwsar},
  title   = {{A Survey of Speaker Recognition: Fundamental Theories, Recognition Methods and Opportunities}},
  journal = {IEEE Access},
  year    = {2021},
  volume  = {9},
  pages   = {79236--79263},
  doi     = {10.1109/ACCESS.2021.3084299},
  groups  = {Overviews},
}
@Article{voice-conversion-overview,
  author  = {Sisman, Berrak and Yamagishi, Junichi and King, Simon and Li, Haizhou},
  title   = {{An Overview of Voice Conversion and Its Challenges: From Statistical Modeling to Deep Learning}},
  journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  year    = {2021},
  volume  = {29},
  pages   = {132--157},
  doi     = {10.1109/TASLP.2020.3038524},
  groups  = {Overviews},
}
@Article{Stevens1937,
  author  = {Stevens, S. S. and Volkmann, J. and Newman, E. B.},
  journal = {The Journal of the Acoustical Society of America},
  title   = {{A Scale for the Measurement of the Psychological Magnitude Pitch}},
  year    = {1937},
  number  = {3},
  pages   = {185--190},
  volume  = {8},
  doi     = {10.1121/1.1915893},
  groups  = {Psychoacoustics Related},
}
@Article{Goodfellow2020,
  author     = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},
  journal    = {Commun. ACM},
  title      = {{Generative Adversarial Networks}},
  year       = {2020},
  issn       = {0001-0782},
  month      = oct,
  number     = {11},
  pages      = {139--144},
  volume     = {63},
  abstract   = {Generative adversarial networks are a kind of artificial intelligence algorithm designed to solve the generative modeling problem. The goal of a generative model is to study a collection of training examples and learn the probability distribution that generated them. Generative Adversarial Networks (GANs) are then able to generate more examples from the estimated probability distribution. Generative models based on deep learning are common, but GANs are among the most successful generative models (especially in terms of their ability to generate realistic high-resolution images). GANs have been successfully applied to a wide variety of tasks (mostly in research settings) but continue to present unique challenges and research opportunities because they are based on game theory while most other approaches to generative modeling are based on optimization.},
  address    = {New York, NY, USA},
  doi        = {10.1145/3422622},
  groups     = {GAN Related},
  issue_date = {November 2020},
  numpages   = {6},
  publisher  = {Association for Computing Machinery},
}
@Article{CycleGAN-VC,
  author        = {Kaneko, Takuhiro and Kameoka, Hirokazu},
  journal       = {arXiv e-prints},
  title         = {{Parallel-Data-Free Voice Conversion Using Cycle-Consistent Adversarial Networks}},
  year          = {2017},
  archiveprefix = {arXiv},
  eprint        = {1711.11293},
  groups        = {Non Parallel VC},
  url           = {https://arxiv.org/abs/1711.11293},
}
@InProceedings{CycleGAN-VC2,
  author    = {Kaneko, Takuhiro and Kameoka, Hirokazu and Tanaka, Kou and Hojo, Nobukatsu},
  title     = {{Cyclegan-VC2: Improved Cyclegan-based Non-parallel Voice Conversion}},
  booktitle = {ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  year      = {2019},
  pages     = {6820--6824},
  doi       = {10.1109/ICASSP.2019.8682897},
  groups    = {Non Parallel VC},
}
@Article{CycleGAN-VC3,
  author        = {Kaneko, Takuhiro and Kameoka, Hirokazu and Tanaka, Kou and Hojo, Nobukatsu},
  title         = {{CycleGAN-VC3: Examining and Improving CycleGAN-VCs for Mel-spectrogram Conversion}},
  year          = {2020},
  archiveprefix = {arXiv},
  eprint        = {2010.11672},
  copyright     = {arXiv.org perpetual, non-exclusive license},
  doi           = {10.48550/ARXIV.2010.11672},
  groups        = {Non Parallel VC},
  keywords      = {Sound (cs.SD), Machine Learning (cs.LG), Audio and Speech Processing (eess.AS), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  publisher     = {arXiv},
  url           = {https://arxiv.org/abs/2010.11672},
}
@Misc{Ellis2004,
  author       = {Ellis, D. P. W.},
  title        = {{Sinewave Speech Analysis/Synthesis in Matlab}},
  howpublished = {Web resource},
  year         = {2004},
  groups       = {Psychoacoustics Related},
  url          = {http://www.ee.columbia.edu/ln/labrosa/matlab/sws/},
}
@Article{formants-from-lpc,
  author  = {Snell, R. C. and Milinazzo, F.},
  title   = {{Formant location from LPC analysis data}},
  journal = {IEEE Transactions on Speech and Audio Processing},
  year    = {1993},
  volume  = {1},
  number  = {2},
  pages   = {129--134},
  doi     = {10.1109/89.222882},
  groups  = {Psychoacoustics Related},
}
@Misc{haskins-laboratories,
  author       = {Rubin, P. E.},
  howpublished = {Internal memorandum, Haskins Laboratories, New Haven, CT},
  title        = {Sinewave synthesis},
  year         = {1980},
  groups       = {Psychoacoustics Related},
  url          = {https://web.archive.org/web/20151218110228/http://www.haskins.yale.edu/featured/sws/bibliography.html},
}
@Article{Perceptron,
  author  = {Rosenblatt, Frank},
  journal = {Psychological Review},
  title   = {The perceptron: a probabilistic model for information storage and organization in the brain},
  year    = {1958},
  number  = {6},
  pages   = {386--408},
  volume  = {65},
  groups  = {Deep Learning Generic},
}
@Article{ReLU,
  author        = {{Agarap}, Abien Fred},
  journal       = {arXiv e-prints},
  title         = {{Deep Learning using Rectified Linear Units (ReLU)}},
  year          = {2018},
  month         = mar,
  adsnote       = {Provided by the SAO/NASA Astrophysics Data System},
  adsurl        = {https://ui.adsabs.harvard.edu/abs/2018arXiv180308375A},
  archiveprefix = {arXiv},
  eprint        = {1803.08375},
  groups        = {Deep Learning Generic},
  keywords      = {Computer Science - Neural and Evolutionary Computing, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Statistics - Machine Learning},
  primaryclass  = {cs.NE},
}
@Book{deep-learning-book,
  author    = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
  title     = {{Deep Learning}},
  publisher = {MIT Press},
  year      = {2016},
  groups    = {Deep Learning Generic},
  url       = {https://www.deeplearningbook.org/},
}
@Article{audio-fft,
  author  = {Welch, P.},
  journal = {IEEE Transactions on Audio and Electroacoustics},
  title   = {The use of fast Fourier transform for the estimation of power spectra: A method based on time averaging over short, modified periodograms},
  year    = {1967},
  number  = {2},
  pages   = {70--73},
  volume  = {15},
  doi     = {10.1109/TAU.1967.1161901},
  groups  = {Audio},
}
@Article{time-frequency-review,
  author  = {Cohen, L.},
  journal = {Proceedings of the IEEE},
  title   = {Time-frequency distributions---a review},
  year    = {1989},
  number  = {7},
  pages   = {941--981},
  volume  = {77},
  doi     = {10.1109/5.30749},
  groups  = {Audio},
}
@Article{melgan,
  author        = {Kumar, Kundan and Kumar, Rithesh and de Boissiere, Thibault and Gestin, Lucas and Teoh, Wei Zhen and Sotelo, Jose and de Brebisson, Alexandre and Bengio, Yoshua and Courville, Aaron},
  title         = {{MelGAN: Generative Adversarial Networks for Conditional Waveform Synthesis}},
  year          = {2019},
  archiveprefix = {arXiv},
  eprint        = {1910.06711},
  copyright     = {arXiv.org perpetual, non-exclusive license},
  doi           = {10.48550/ARXIV.1910.06711},
  groups        = {Non Parallel VC},
  keywords      = {Audio and Speech Processing (eess.AS), Computation and Language (cs.CL), Machine Learning (cs.LG), Sound (cs.SD), FOS: Electrical engineering, electronic engineering, information engineering, FOS: Computer and information sciences},
  publisher     = {arXiv},
  url           = {https://arxiv.org/abs/1910.06711},
}
@Article{noise-vocoded,
  author  = {Davis, Matthew H. and Johnsrude, Ingrid S. and Hervais-Adelman, Alexis and Taylor, Karen J. and McGettigan, Carolyn},
  journal = {Journal of Experimental Psychology: General},
  title   = {Lexical information drives perceptual learning of distorted speech: evidence from the comprehension of noise-vocoded sentences},
  year    = {2005},
  number  = {2},
  pages   = {222--241},
  volume  = {134},
  groups  = {Psychoacoustics Related},
  url     = {https://www.semanticscholar.org/paper/Lexical-information-drives-perceptual-learning-of-Davis-Johnsrude/4344de03181f1b5fa841404a2c93f860a03c19dd},
}
@Article{cnn,
  author  = {Lecun, Y. and Bottou, L. and Bengio, Y. and Haffner, P.},
  journal = {Proceedings of the IEEE},
  title   = {Gradient-based learning applied to document recognition},
  year    = {1998},
  number  = {11},
  pages   = {2278--2324},
  volume  = {86},
  doi     = {10.1109/5.726791},
  groups  = {Deep Learning Generic},
}
@Article{transfer-learning-tts-vc,
  author        = {Zhang, Mingyang and Zhou, Yi and Zhao, Li and Li, Haizhou},
  title         = {{Transfer Learning from Speech Synthesis to Voice Conversion with Non-Parallel Training Data}},
  year          = {2020},
  archiveprefix = {arXiv},
  eprint        = {2009.14399},
  copyright     = {arXiv.org perpetual, non-exclusive license},
  doi           = {10.48550/ARXIV.2009.14399},
  groups        = {Other VC},
  keywords      = {Audio and Speech Processing (eess.AS), Sound (cs.SD), FOS: Electrical engineering, electronic engineering, information engineering, FOS: Computer and information sciences},
  publisher     = {arXiv},
  url           = {https://arxiv.org/abs/2009.14399},
}
@InProceedings{ppg-vc,
  author    = {Sun, Lifa and Li, Kun and Wang, Hao and Kang, Shiyin and Meng, Helen},
  booktitle = {2016 IEEE International Conference on Multimedia and Expo (ICME)},
  title     = {Phonetic posteriorgrams for many-to-one voice conversion without parallel data training},
  year      = {2016},
  pages     = {1--6},
  doi       = {10.1109/ICME.2016.7552917},
  groups    = {Other VC},
}
@InProceedings{mel-cepstral-coefficient,
  author    = {Imai, S.},
  booktitle = {ICASSP '83. IEEE International Conference on Acoustics, Speech, and Signal Processing},
  title     = {Cepstral analysis synthesis on the mel frequency scale},
  year      = {1983},
  pages     = {93--96},
  volume    = {8},
  doi       = {10.1109/ICASSP.1983.1172250},
  groups    = {Audio},
}
@Article{mel-frequency-cepstral,
  author  = {Davis, S. and Mermelstein, P.},
  journal = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  title   = {Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences},
  year    = {1980},
  number  = {4},
  pages   = {357--366},
  volume  = {28},
  doi     = {10.1109/TASSP.1980.1163420},
  groups  = {Audio},
}
@Article{instance-normalization,
  author        = {Chou, Ju-chieh and Yeh, Cheng-chieh and Lee, Hung-yi},
  title         = {{One-shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization}},
  year          = {2019},
  archiveprefix = {arXiv},
  eprint        = {1904.05742},
  copyright     = {arXiv.org perpetual, non-exclusive license},
  doi           = {10.48550/ARXIV.1904.05742},
  groups        = {Other VC},
  keywords      = {Machine Learning (cs.LG), Sound (cs.SD), Audio and Speech Processing (eess.AS), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  publisher     = {arXiv},
  url           = {https://arxiv.org/abs/1904.05742},
}
@InProceedings{vector-quantization,
  author    = {Wu, Da-Yi and Lee, Hung-yi},
  booktitle = {ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  title     = {{One-Shot Voice Conversion by Vector Quantization}},
  year      = {2020},
  pages     = {7734--7738},
  doi       = {10.1109/ICASSP40776.2020.9053854},
  groups    = {Other VC},
}
@Article{auto-encoder,
  author        = {Larsen, Anders Boesen Lindbo and Sønderby, Søren Kaae and Larochelle, Hugo and Winther, Ole},
  title         = {Autoencoding beyond pixels using a learned similarity metric},
  year          = {2015},
  archiveprefix = {arXiv},
  eprint        = {1512.09300},
  copyright     = {arXiv.org perpetual, non-exclusive license},
  doi           = {10.48550/ARXIV.1512.09300},
  groups        = {Other VC},
  keywords      = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (stat.ML), FOS: Computer and information sciences},
  publisher     = {arXiv},
  url           = {https://arxiv.org/abs/1512.09300},
}
@Article{attention-mechanism,
  author        = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
  title         = {{Attention Is All You Need}},
  year          = {2017},
  archiveprefix = {arXiv},
  eprint        = {1706.03762},
  copyright     = {arXiv.org perpetual, non-exclusive license},
  doi           = {10.48550/ARXIV.1706.03762},
  groups        = {Deep Learning Generic},
  keywords      = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher     = {arXiv},
  url           = {https://arxiv.org/abs/1706.03762},
}
@Article{burg-algorithm,
  author  = {Gray, A. and Wong, D.},
  journal = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  title   = {{The Burg algorithm for LPC speech analysis/Synthesis}},
  year    = {1980},
  number  = {6},
  pages   = {609--615},
  volume  = {28},
  doi     = {10.1109/TASSP.1980.1163489},
  groups  = {Audio},
}
@InProceedings{mel-cepstral-distance,
  author    = {Kubichek, R.},
  booktitle = {Proceedings of IEEE Pacific Rim Conference on Communications Computers and Signal Processing},
  title     = {Mel-cepstral distance measure for objective speech quality assessment},
  year      = {1993},
  pages     = {125--128},
  volume    = {1},
  doi       = {10.1109/PACRIM.1993.407206},
  groups    = {Audio Metrics},
}
@InProceedings{mosnet,
  author    = {Chen-Chou Lo and Szu-Wei Fu and Wen-Chin Huang and Xin Wang and Junichi Yamagishi and Yu Tsao and Hsin-Min Wang},
  booktitle = {Interspeech 2019},
  title     = {{MOSNet}: {Deep Learning-Based Objective Assessment for Voice Conversion}},
  year      = {2019},
  month     = sep,
  publisher = {{ISCA}},
  doi       = {10.21437/interspeech.2019-2003},
  groups    = {Audio Metrics},
}
@Article{kdsd,
  author        = {Bińkowski, Mikołaj and Donahue, Jeff and Dieleman, Sander and Clark, Aidan and Elsen, Erich and Casagrande, Norman and Cobo, Luis C. and Simonyan, Karen},
  title         = {{High Fidelity Speech Synthesis with Adversarial Networks}},
  year          = {2019},
  archiveprefix = {arXiv},
  eprint        = {1909.11646},
  copyright     = {arXiv.org perpetual, non-exclusive license},
  doi           = {10.48550/ARXIV.1909.11646},
  groups        = {Audio Metrics},
  keywords      = {Sound (cs.SD), Machine Learning (cs.LG), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  publisher     = {arXiv},
  url           = {https://arxiv.org/abs/1909.11646},
}
@Comment{jabref-meta: databaseType:bibtex;}
@Comment{jabref-meta: grouping:
0 AllEntriesGroup:;
1 StaticGroup:Non Parallel VC\;0\;1\;0xffff00ff\;\;\;;
1 StaticGroup:GAN Related\;0\;1\;0x0000ffff\;\;\;;
1 StaticGroup:Overviews\;0\;1\;0x00ff00ff\;\;\;;
1 StaticGroup:Psychoacoustics Related\;0\;1\;0x800000ff\;\;\;;
1 StaticGroup:Deep Learning Generic\;0\;1\;0xb31a1aff\;\;\;;
1 StaticGroup:Audio\;0\;1\;0x8a8a8aff\;\;\;;
1 StaticGroup:Other VC\;0\;1\;0xff00ffff\;\;\;;
1 StaticGroup:Audio Metrics\;0\;1\;0x00ffffff\;\;\;;
}