diff --git a/Cargo.lock b/Cargo.lock index a9ba449bd..56c0a14ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -83,17 +83,6 @@ dependencies = [ "opaque-debug", ] -[[package]] -name = "ahash" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" -dependencies = [ - "getrandom 0.2.7", - "once_cell", - "version_check", -] - [[package]] name = "aho-corasick" version = "0.7.19" @@ -147,7 +136,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3203e79f4dd9bdda415ed03cf14dae5a2bf775c683a00f94e9cd1faf0f596e5" dependencies = [ "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -161,6 +150,23 @@ dependencies = [ "futures-core", ] +[[package]] +name = "async-compression" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "942c7cd7ae39e91bde4820d74132e9862e62c2f386c3aa90ccf55949f5bad63a" +dependencies = [ + "bzip2", + "flate2", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd", + "zstd-safe", +] + [[package]] name = "async-executor" version = "1.4.1" @@ -260,7 +266,22 @@ checksum = "76464446b8bc32758d7e88ee1a804d9914cd9b1cb264c029899680b0be29826f" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", +] + +[[package]] +name = "async_zip" +version = "0.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c50d29ab7e2f9e808cca1a69ea56a36f4ff216f54a41a23aae1fd4afc05cc020" +dependencies = [ + "async-compression", + "chrono", + "crc32fast", + "log", + "pin-project", + "thiserror", + "tokio", ] [[package]] @@ -474,7 +495,7 @@ dependencies = [ "quote", "serde", "serde_json", - "syn", + "syn 1.0.102", "tempfile", "toml 0.5.9", ] @@ -591,7 +612,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -826,7 +847,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdffe87e1d521a10f9696f833fe502293ea446d7f256c06128293a4119bdf4cb" dependencies = [ "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -893,7 +914,7 @@ dependencies = [ "proc-macro2", "quote", "scratch", - "syn", + "syn 1.0.102", ] [[package]] @@ -910,7 +931,7 @@ checksum = "39e61fda7e62115119469c7b3591fd913ecca96fb766cfd3f2e2502ab7bc87a5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -921,7 +942,7 @@ checksum = "0c5905670fd9c320154f3a4a01c9e609733cd7b753f3c58777ab7d5ce26686b3" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -932,7 +953,7 @@ checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -1179,9 +1200,9 @@ checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" [[package]] name = "futures" -version = "0.3.25" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38390104763dc37a5145a53c29c63c1290b5d316d6086ec32c293f6736051bb0" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" dependencies = [ "futures-channel", "futures-core", @@ -1194,9 +1215,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.25" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52ba265a92256105f45b719605a571ffe2d1f0fea3807304b522c1d778f79eed" +checksum = 
"955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" dependencies = [ "futures-core", "futures-sink", @@ -1204,15 +1225,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.25" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04909a7a7e4633ae6c4a9ab280aeb86da1236243a77b694a49eacd659a4bd3ac" +checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" [[package]] name = "futures-executor" -version = "0.3.25" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7acc85df6714c176ab5edf386123fafe217be88c0840ec11f199441134a074e2" +checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" dependencies = [ "futures-core", "futures-task", @@ -1221,9 +1242,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.25" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00f5fb52a06bdcadeb54e8d3671f8888a39697dcb0b81b23b55174030427f4eb" +checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" [[package]] name = "futures-lite" @@ -1242,26 +1263,26 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.25" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdfb8ce053d86b91919aad980c220b1fb8401a9394410e1c289ed7e66b61835d" +checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] name = "futures-sink" -version = "0.3.25" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39c15cf1a4aa79df40f1bb462fb39676d0ad9e366c2a33b590d7c66f4f81fcf9" +checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" [[package]] name = "futures-task" -version = "0.3.25" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ffb393ac5d9a6eaa9d3fdf37ae2776656b706e200c8e16b1bdb227f5198e6ea" +checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" [[package]] name = "futures-timer" @@ -1271,9 +1292,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.25" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "197676987abd2f9cadff84926f410af1c183608d36641465df73ae8211dc65d6" +checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" dependencies = [ "futures-channel", "futures-core", @@ -1337,7 +1358,7 @@ checksum = "41973d4c45f7a35af8753ba3457cc99d406d863941fd7f52663cff54a5ab99b3" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -1779,9 +1800,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.134" +version = "0.2.142" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb" +checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317" [[package]] name = "libloading" @@ -1867,6 +1888,17 @@ dependencies = [ "value-bag", ] +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + 
"pkg-config", +] + [[package]] name = "matchers" version = "0.1.0" @@ -1899,18 +1931,18 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memoffset" -version = "0.6.5" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" dependencies = [ "autocfg", ] [[package]] name = "memoffset" -version = "0.7.1" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" dependencies = [ "autocfg", ] @@ -1967,6 +1999,15 @@ dependencies = [ "windows-sys 0.42.0", ] +[[package]] +name = "nanoid" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ffa00dec017b5b1a8b7cf5e2c008bfda1aa7e0697ac1508b491fdf2622fb4d8" +dependencies = [ + "rand 0.8.5", +] + [[package]] name = "ndarray" version = "0.15.6" @@ -2097,21 +2138,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" -[[package]] -name = "numpy" -version = "0.17.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a462c1af5ba1fddec1488c4646993a23ae7931f9e170ccba23e9c7c834277797" -dependencies = [ - "ahash", - "libc", - "ndarray", - "num-complex", - "num-integer", - "num-traits", - "pyo3", -] - [[package]] name = "object" version = "0.29.0" @@ -2349,7 +2375,7 @@ checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -2463,7 +2489,7 @@ dependencies = [ "proc-macro-error-attr", "proc-macro2", "quote", - "syn", + "syn 1.0.102", "version_check", ] @@ -2486,9 +2512,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.46" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b" +checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" dependencies = [ "unicode-ident", ] @@ -2504,14 +2530,14 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.17.2" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201b6887e5576bf2f945fe65172c1fcbf3fcf285b23e4d71eb171d9736e38d32" +checksum = "e3b1ac5b3731ba34fdaa9785f8d74d17448cd18f30cf19e0c7e7b1fdb5272109" dependencies = [ "cfg-if", "indoc", "libc", - "memoffset 0.6.5", + "memoffset 0.8.0", "parking_lot", "pyo3-build-config", "pyo3-ffi", @@ -2519,11 +2545,24 @@ dependencies = [ "unindent", ] +[[package]] +name = "pyo3-asyncio" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3564762e37035cfc486228e10b0528460fa026d681b5763873c693aa0d5c260" +dependencies = [ + "futures", + "once_cell", + "pin-project-lite", + "pyo3", + "tokio", +] + [[package]] name = "pyo3-build-config" -version = "0.17.2" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf0708c9ed01692635cbf056e286008e5a2927ab1a5e48cdd3aeb1ba5a6fef47" +checksum = 
"9cb946f5ac61bb61a5014924910d936ebd2b23b705f7a4a3c40b05c720b079a3" dependencies = [ "once_cell", "target-lexicon", @@ -2531,9 +2570,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.17.2" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90352dea4f486932b72ddf776264d293f85b79a1d214de1d023927b41461132d" +checksum = "fd4d7c5337821916ea2a1d21d1092e8443cf34879e53a0ac653fbb98f44ff65c" dependencies = [ "libc", "pyo3-build-config", @@ -2541,9 +2580,9 @@ dependencies = [ [[package]] name = "pyo3-log" -version = "0.7.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5695ccff5060c13ca1751cf8c857a12da9b0bf0378cb071c5e0326f7c7e4c1b" +checksum = "f9c8b57fe71fb5dcf38970ebedc2b1531cf1c14b1b9b4c560a182a57e115575c" dependencies = [ "arc-swap", "log", @@ -2552,32 +2591,32 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.17.2" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb24b804a2d9e88bfcc480a5a6dd76f006c1e3edaf064e8250423336e2cd79d" +checksum = "a9d39c55dab3fc5a4b25bbd1ac10a2da452c4aca13bb450f22818a002e29648d" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn", + "syn 1.0.102", ] [[package]] name = "pyo3-macros-backend" -version = "0.17.2" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f22bb49f6a7348c253d7ac67a6875f2dc65f36c2ae64a82c381d528972bea6d6" +checksum = "97daff08a4c48320587b5224cc98d609e3c27b6d437315bd40b605c98eeb5918" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] name = "quote" -version = "1.0.21" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" dependencies = [ "proc-macro2", ] @@ -2793,7 +2832,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version 0.4.0", - "syn", + "syn 1.0.102", ] [[package]] @@ -2952,7 +2991,7 @@ checksum = "81fa1584d3d1bcacd84c277a0dfe21f5b0f6accf4a23d04d4c6d61f1af522b4c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -2961,6 +3000,7 @@ version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44" dependencies = [ + "indexmap", "itoa", "ryu", "serde", @@ -3140,14 +3180,14 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] name = "socket2" -version = "0.4.7" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" +checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" dependencies = [ "libc", "winapi", @@ -3201,7 +3241,7 @@ dependencies = [ "quote", "serde", "serde_derive", - "syn", + "syn 1.0.102", ] [[package]] @@ -3217,7 +3257,7 @@ dependencies = [ "serde_derive", "serde_json", "sha1 0.6.1", - "syn", + "syn 1.0.102", ] [[package]] @@ -3251,7 +3291,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn", + "syn 1.0.102", ] [[package]] @@ -3294,6 +3334,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "syn" +version = "2.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "tar" version = "0.4.38" @@ -3373,7 +3424,7 @@ checksum = "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -3447,7 +3498,7 @@ dependencies = [ "proc-macro2", "quote", "standback", - "syn", + "syn 1.0.102", ] [[package]] @@ -3467,31 +3518,30 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.24.1" +version = "1.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae" +checksum = "c3c786bf8134e5a3a166db9b29ab8f48134739014a3eca7bc6bfa95d673b136f" dependencies = [ "autocfg", "bytes 1.1.0", "libc", - "memchr", "mio", "num_cpus", "pin-project-lite", "socket2", "tokio-macros", - "windows-sys 0.42.0", + "windows-sys 0.48.0", ] [[package]] name = "tokio-macros" -version = "1.8.2" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" +checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] @@ -3589,7 +3639,7 @@ checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -3684,7 +3734,7 @@ checksum = "8f9568611f0de5e83e0993b85c54679cd0afd659adcfcb0233f16280b980492e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -3807,13 +3857,17 @@ name = "voicevox_core" version = "0.0.0" dependencies = [ "anyhow", + "async_zip", "cfg-if", "derive-getters", "derive-new", "easy-ext", + "flate2", "fs-err", + "futures", "heck", "humansize", + "nanoid", "once_cell", "onnxruntime", "open_jtalk", @@ -3823,8 +3877,10 @@ dependencies = [ "rstest", "serde", "serde_json", + "tar", "test_util", "thiserror", + "tokio", "tracing", "windows", ] @@ -3837,6 +3893,7 @@ dependencies = [ "assert_cmd", "chrono", "clap 4.0.10", + "derive-getters", "duct", "easy-ext", "inventory", @@ -3853,7 +3910,9 @@ dependencies = [ "rstest", "serde", "serde_json", + "test_util", "thiserror", + "tokio", "toml 0.7.2", "tracing-subscriber", "typetag", @@ -3868,11 +3927,13 @@ dependencies = [ "easy-ext", "fs_extra", "log", - "numpy", + "once_cell", "pyo3", + "pyo3-asyncio", "pyo3-log", "serde", "serde_json", + "tokio", "tracing", "voicevox_core", ] @@ -3935,7 +3996,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 1.0.102", "wasm-bindgen-shared", ] @@ -3969,7 +4030,7 @@ checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4066,12 +4127,12 @@ version = "0.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04662ed0e3e5630dfa9b26e4cb823b817f1a9addda855d973a9458c236556244" dependencies = [ - "windows_aarch64_gnullvm", + "windows_aarch64_gnullvm 0.42.0", "windows_aarch64_msvc 0.42.0", "windows_i686_gnu 0.42.0", "windows_i686_msvc 0.42.0", "windows_x86_64_gnu 0.42.0", - "windows_x86_64_gnullvm", + "windows_x86_64_gnullvm 0.42.0", "windows_x86_64_msvc 0.42.0", ] @@ -4094,21 +4155,51 @@ version = "0.42.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ - "windows_aarch64_gnullvm", + "windows_aarch64_gnullvm 0.42.0", "windows_aarch64_msvc 0.42.0", "windows_i686_gnu 0.42.0", "windows_i686_msvc 0.42.0", "windows_x86_64_gnu 0.42.0", - "windows_x86_64_gnullvm", + "windows_x86_64_gnullvm 0.42.0", "windows_x86_64_msvc 0.42.0", ] +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +dependencies = [ + "windows_aarch64_gnullvm 0.48.0", + "windows_aarch64_msvc 0.48.0", + "windows_i686_gnu 0.48.0", + "windows_i686_msvc 0.48.0", + "windows_x86_64_gnu 0.48.0", + "windows_x86_64_gnullvm 0.48.0", + "windows_x86_64_msvc 0.48.0", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" + [[package]] name = "windows_aarch64_msvc" version = "0.36.1" @@ -4121,6 +4212,12 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" + [[package]] name = "windows_i686_gnu" version = "0.36.1" @@ -4133,6 +4230,12 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" + [[package]] name = "windows_i686_msvc" version = "0.36.1" @@ -4145,6 +4248,12 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" + [[package]] name = "windows_x86_64_gnu" version = "0.36.1" @@ -4157,12 +4266,24 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" + [[package]] name = "windows_x86_64_msvc" version = "0.36.1" @@ -4175,6 +4296,12 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" + [[package]] name = "winreg" version = "0.10.1" @@ -4204,6 +4331,15 @@ dependencies = [ "fs-err", ] +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yansi" version = "0.5.1" diff --git a/Cargo.toml b/Cargo.toml index 94b1e19f0..4a81b36dc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,12 +23,14 @@ once_cell = "1.15.0" process_path = { git = "https://github.com/VOICEVOX/process_path.git", rev = "de226a26e8e18edbdb1d6f986afe37bbbf35fbf4" } regex = "1.6.0" serde = { version = "1.0.145", features = ["derive"] } -serde_json = "1.0.85" +serde_json = { version = "1.0.85", features = ["preserve_order"] } test_util = { path = "crates/test_util" } thiserror = "1.0.37" tracing = { version = "0.1.37", features = ["log"] } tracing-subscriber = { version = "0.3.16", features = ["env-filter"] } voicevox_core = { path = "crates/voicevox_core" } +tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "macros", "sync"] } +derive-getters = "0.2.0" # min-sized-rustを元にrelease buildのサイズが小さくなるようにした # https://github.com/johnthagen/min-sized-rust diff --git a/crates/download/Cargo.toml b/crates/download/Cargo.toml index 421e90563..a14a1e066 100644 --- a/crates/download/Cargo.toml +++ b/crates/download/Cargo.toml @@ -21,7 +21,7 @@ platforms = "3.0.2" rayon = "1.6.1" reqwest = { version = "0.11.13", default-features = false, features = ["rustls-tls", "stream"] } strum = { version = "0.24.1", features = ["derive"] } -tokio = { version = "1.24.1", features = ["macros", "rt-multi-thread", "sync"] } +tokio.workspace = true tracing.workspace = true tracing-subscriber.workspace = true url = "2.3.0" diff --git a/crates/voicevox_core/Cargo.toml b/crates/voicevox_core/Cargo.toml index 049e46292..4d98fe352 100644 --- a/crates/voicevox_core/Cargo.toml +++ b/crates/voicevox_core/Cargo.toml @@ -12,7 +12,7 @@ directml = ["onnxruntime/directml"] [dependencies] anyhow.workspace = true cfg-if = "1.0.0" -derive-getters = "0.2.0" +derive-getters.workspace = true derive-new = "0.5.9" easy-ext.workspace = true fs-err.workspace = true @@ -25,10 +25,16 @@ thiserror.workspace = true tracing.workspace = true open_jtalk = { git = "https://github.com/VOICEVOX/open_jtalk-rs.git", rev="d766a52bad4ccafe18597e57bd6842f59dca881e" } regex.workspace = true +async_zip = { version = "0.0.11", features = ["full"] } +futures = "0.3.26" +nanoid = "0.4.0" +tokio.workspace = true [dev-dependencies] rstest = "0.15.0" pretty_assertions = "1.3.0" +flate2 = "1.0.24" +tar = "0.4.38" heck = "0.4.0" test_util.workspace = true diff --git a/crates/voicevox_core/src/devices.rs b/crates/voicevox_core/src/devices.rs new file mode 100644 
index 000000000..ceef4b9d3
--- /dev/null
+++ b/crates/voicevox_core/src/devices.rs
@@ -0,0 +1,49 @@
+use serde::{Deserialize, Serialize};
+
+use super::*;
+
+#[derive(Getters, Debug, Serialize, Deserialize)]
+pub struct SupportedDevices {
+    cpu: bool,
+    cuda: bool,
+    dml: bool,
+}
+
+impl SupportedDevices {
+    /// Gets information about the supported devices.
+    pub fn get_supported_devices() -> Result<Self> {
+        let mut cuda_support = false;
+        let mut dml_support = false;
+        for provider in onnxruntime::session::get_available_providers()
+            .map_err(|e| Error::GetSupportedDevices(e.into()))?
+            .iter()
+        {
+            match provider.as_str() {
+                "CUDAExecutionProvider" => cuda_support = true,
+                "DmlExecutionProvider" => dml_support = true,
+                _ => {}
+            }
+        }
+
+        Ok(SupportedDevices {
+            cpu: true,
+            cuda: cuda_support,
+            dml: dml_support,
+        })
+    }
+
+    pub fn to_json(&self) -> serde_json::Value {
+        serde_json::to_value(self).expect("should not fail")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[rstest]
+    fn supported_devices_get_supported_devices_works() {
+        let result = SupportedDevices::get_supported_devices();
+        // The result depends on the environment, so only check that the call succeeds.
+        assert!(result.is_ok(), "{result:?}");
+    }
+}
diff --git a/crates/voicevox_core/src/engine/full_context_label.rs b/crates/voicevox_core/src/engine/full_context_label.rs
index b8e3cce2d..f52b84ade 100644
--- a/crates/voicevox_core/src/engine/full_context_label.rs
+++ b/crates/voicevox_core/src/engine/full_context_label.rs
@@ -306,7 +306,7 @@ impl Utterance {
     }
 
     pub fn extract_full_context_label(
-        open_jtalk: &mut open_jtalk::OpenJtalk,
+        open_jtalk: &open_jtalk::OpenJtalk,
         text: impl AsRef<str>,
     ) -> Result<Self> {
         let labels = open_jtalk.extract_fullcontext(text)?;
diff --git a/crates/voicevox_core/src/engine/open_jtalk.rs b/crates/voicevox_core/src/engine/open_jtalk.rs
index 60c3acbf1..f07fe89cb 100644
--- a/crates/voicevox_core/src/engine/open_jtalk.rs
+++ b/crates/voicevox_core/src/engine/open_jtalk.rs
@@ -1,7 +1,12 @@
-use std::path::{Path, PathBuf};
+use std::{
+    path::{Path, PathBuf},
+    sync::Mutex,
+};
 
 use ::open_jtalk::*;
 
+use crate::Error;
+
 #[derive(thiserror::Error, Debug)]
 pub enum OpenJtalkError {
     #[error("open_jtalk load error")]
@@ -17,55 +22,74 @@ pub enum OpenJtalkError {
 pub type Result<T> = std::result::Result<T, OpenJtalkError>;
 
 pub struct OpenJtalk {
+    resources: Mutex<Resources>,
+    dict_loaded: bool,
+}
+
+struct Resources {
     mecab: ManagedResource<Mecab>,
     njd: ManagedResource<Njd>,
     jpcommon: ManagedResource<JpCommon>,
-    dict_loaded: bool,
 }
 
+#[allow(unsafe_code)]
+unsafe impl Send for Resources {}
+
 impl OpenJtalk {
-    pub fn initialize() -> Self {
+    pub fn new_without_dic() -> Self {
         Self {
-            mecab: ManagedResource::initialize(),
-            njd: ManagedResource::initialize(),
-            jpcommon: ManagedResource::initialize(),
+            resources: Mutex::new(Resources {
+                mecab: ManagedResource::initialize(),
+                njd: ManagedResource::initialize(),
+                jpcommon: ManagedResource::initialize(),
+            }),
             dict_loaded: false,
         }
     }
-
-    pub fn extract_fullcontext(&mut self, text: impl AsRef<str>) -> Result<Vec<String>> {
-        let result = self.extract_fullcontext_non_reflesh(text);
-        self.jpcommon.refresh();
-        self.njd.refresh();
-        self.mecab.refresh();
-        result
+    pub fn new_with_initialize(
+        open_jtalk_dict_dir: impl AsRef<Path>,
+    ) -> crate::result::Result<Self> {
+        let mut s = Self::new_without_dic();
+        s.load(open_jtalk_dict_dir)
+            .map_err(|_| Error::NotLoadedOpenjtalkDict)?;
+        Ok(s)
     }
 
-    fn extract_fullcontext_non_reflesh(&mut self, text: impl AsRef<str>) -> Result<Vec<String>> {
+    pub fn extract_fullcontext(&self, text: impl AsRef<str>) -> Result<Vec<String>> {
+        let Resources {
+            mecab,
+            njd,
+            jpcommon,
+        } = &mut *self.resources.lock().unwrap();
+
+        jpcommon.refresh();
+        njd.refresh();
+        mecab.refresh();
+
         let mecab_text = text2mecab(text.as_ref()).map_err(|e| OpenJtalkError::ExtractFullContext {
             text: text.as_ref().into(),
             source: Some(e.into()),
         })?;
-        if self.mecab.analysis(mecab_text) {
-            self.njd.mecab2njd(
-                self.mecab
+        if mecab.analysis(mecab_text) {
+            njd.mecab2njd(
+                mecab
                     .get_feature()
                     .ok_or(OpenJtalkError::ExtractFullContext {
                         text: text.as_ref().into(),
                         source: None,
                     })?,
-                self.mecab.get_size(),
+                mecab.get_size(),
             );
-            self.njd.set_pronunciation();
-            self.njd.set_digit();
-            self.njd.set_accent_phrase();
-            self.njd.set_accent_type();
-            self.njd.set_unvoiced_vowel();
-            self.njd.set_long_vowel();
-            self.jpcommon.njd2jpcommon(&self.njd);
-            self.jpcommon.make_label();
-            self.jpcommon
+            njd.set_pronunciation();
+            njd.set_digit();
+            njd.set_accent_phrase();
+            njd.set_accent_type();
+            njd.set_unvoiced_vowel();
+            njd.set_long_vowel();
+            jpcommon.njd2jpcommon(njd);
+            jpcommon.make_label();
+            jpcommon
                 .get_label_feature_to_iter()
                 .ok_or_else(|| OpenJtalkError::ExtractFullContext {
                     text: text.as_ref().into(),
                     source: None,
@@ -80,15 +104,20 @@ impl OpenJtalk {
         }
     }
 
-    pub fn load(&mut self, mecab_dict_dir: impl AsRef<Path>) -> Result<()> {
-        let result = self.mecab.load(mecab_dict_dir.as_ref());
+    fn load(&mut self, open_jtalk_dict_dir: impl AsRef<Path>) -> Result<()> {
+        let result = self
+            .resources
+            .lock()
+            .unwrap()
+            .mecab
+            .load(open_jtalk_dict_dir.as_ref());
         if result {
             self.dict_loaded = true;
             Ok(())
         } else {
             self.dict_loaded = false;
             Err(OpenJtalkError::Load {
-                mecab_dict_dir: mecab_dict_dir.as_ref().into(),
+                mecab_dict_dir: open_jtalk_dict_dir.as_ref().into(),
             })
         }
     }
@@ -101,7 +130,7 @@ impl OpenJtalk {
 
 #[cfg(test)]
 mod tests {
     use super::*;
-    use test_util::OPEN_JTALK_DIC_DIR;
+    use ::test_util::OPEN_JTALK_DIC_DIR;
 
     use crate::{macros::tests::assert_debug_fmt_eq, *};
 
@@ -196,8 +225,7 @@ mod tests {
     #[case("",Err(OpenJtalkError::ExtractFullContext{text:"".into(),source:None}))]
    #[case("こんにちは、ヒホです。", Ok(testdata_hello_hiho()))]
     fn extract_fullcontext_works(#[case] text: &str, #[case] expected: super::Result<Vec<String>>) {
-        let mut open_jtalk = OpenJtalk::initialize();
-        open_jtalk.load(OPEN_JTALK_DIC_DIR).unwrap();
+        let open_jtalk = OpenJtalk::new_with_initialize(OPEN_JTALK_DIC_DIR).unwrap();
         let result = open_jtalk.extract_fullcontext(text);
         assert_debug_fmt_eq!(expected, result);
     }
@@ -208,8 +236,7 @@ mod tests {
         #[case] text: &str,
         #[case] expected: super::Result<Vec<String>>,
     ) {
-        let mut open_jtalk = OpenJtalk::initialize();
-        open_jtalk.load(OPEN_JTALK_DIC_DIR).unwrap();
+        let open_jtalk = OpenJtalk::new_with_initialize(OPEN_JTALK_DIC_DIR).unwrap();
         for _ in 0..10 {
             let result = open_jtalk.extract_fullcontext(text);
             assert_debug_fmt_eq!(expected, result);
diff --git a/crates/voicevox_core/src/engine/synthesis_engine.rs b/crates/voicevox_core/src/engine/synthesis_engine.rs
index b16271a50..0adce4fe2 100644
--- a/crates/voicevox_core/src/engine/synthesis_engine.rs
+++ b/crates/voicevox_core/src/engine/synthesis_engine.rs
@@ -1,6 +1,6 @@
 use derive_new::new;
 use std::io::{Cursor, Write};
-use std::path::Path;
+use std::sync::Arc;
 
 use super::full_context_label::Utterance;
 use super::open_jtalk::OpenJtalk;
@@ -17,7 +17,7 @@ const MORA_PHONEME_LIST: &[&str] = &[
 #[derive(new)]
 pub struct SynthesisEngine {
     inference_core: InferenceCore,
-    open_jtalk: OpenJtalk,
+    open_jtalk: Arc<OpenJtalk>,
 }
 
 #[allow(unsafe_code)]
@@ -34,16 +34,16 @@ impl SynthesisEngine {
         &mut self.inference_core
     }
 
-    pub fn create_accent_phrases(
-        &mut self,
-        text: impl AsRef<str>,
-        speaker_id: u32,
+    pub async fn create_accent_phrases(
+        &self,
+        text: &str,
+        style_id: StyleId,
     ) -> Result<Vec<AccentPhraseModel>> {
-        if text.as_ref().is_empty() {
+        if text.is_empty() {
             return Ok(Vec::new());
         }
 
-        let utterance = Utterance::extract_full_context_label(&mut self.open_jtalk, text.as_ref())?;
+        let utterance = Utterance::extract_full_context_label(&self.open_jtalk, text)?;
 
         let accent_phrases: Vec<AccentPhraseModel> = utterance
             .breath_groups()
@@ -108,22 +108,24 @@
                 accum_vec
             });
 
-        self.replace_mora_data(&accent_phrases, speaker_id)
+        self.replace_mora_data(&accent_phrases, style_id).await
     }
 
-    pub fn replace_mora_data(
-        &mut self,
+    pub async fn replace_mora_data(
+        &self,
         accent_phrases: &[AccentPhraseModel],
-        speaker_id: u32,
+        style_id: StyleId,
     ) -> Result<Vec<AccentPhraseModel>> {
-        let accent_phrases = self.replace_phoneme_length(accent_phrases, speaker_id)?;
-        self.replace_mora_pitch(&accent_phrases, speaker_id)
+        let accent_phrases = self
+            .replace_phoneme_length(accent_phrases, style_id)
+            .await?;
+        self.replace_mora_pitch(&accent_phrases, style_id).await
     }
 
-    pub fn replace_phoneme_length(
-        &mut self,
+    pub async fn replace_phoneme_length(
+        &self,
         accent_phrases: &[AccentPhraseModel],
-        speaker_id: u32,
+        style_id: StyleId,
     ) -> Result<Vec<AccentPhraseModel>> {
         let (_, phoneme_data_list) = SynthesisEngine::initial_process(accent_phrases);
 
@@ -134,8 +136,9 @@
             .map(|phoneme_data| phoneme_data.phoneme_id())
             .collect();
         let phoneme_length = self
-            .inference_core_mut()
-            .predict_duration(&phoneme_list_s, speaker_id)?;
+            .inference_core()
+            .predict_duration(&phoneme_list_s, style_id)
+            .await?;
 
         let mut index = 0;
         let new_accent_phrases = accent_phrases
@@ -181,10 +184,10 @@
         Ok(new_accent_phrases)
     }
 
-    pub fn replace_mora_pitch(
-        &mut self,
+    pub async fn replace_mora_pitch(
+        &self,
         accent_phrases: &[AccentPhraseModel],
-        speaker_id: u32,
+        style_id: StyleId,
     ) -> Result<Vec<AccentPhraseModel>> {
         let (_, phoneme_data_list) = SynthesisEngine::initial_process(accent_phrases);
 
@@ -246,16 +249,19 @@
             end_accent_phrase_list.push(base_end_accent_phrase_list[vowel_index as usize]);
         }
 
-        let mut f0_list = self.inference_core_mut().predict_intonation(
-            vowel_phoneme_list.len(),
-            &vowel_phoneme_list,
-            &consonant_phoneme_list,
-            &start_accent_list,
-            &end_accent_list,
-            &start_accent_phrase_list,
-            &end_accent_phrase_list,
-            speaker_id,
-        )?;
+        let mut f0_list = self
+            .inference_core()
+            .predict_intonation(
+                vowel_phoneme_list.len(),
+                &vowel_phoneme_list,
+                &consonant_phoneme_list,
+                &start_accent_list,
+                &end_accent_list,
+                &start_accent_phrase_list,
+                &end_accent_phrase_list,
+                style_id,
+            )
+            .await?;
 
         for i in 0..vowel_phoneme_data_list.len() {
             if UNVOICED_MORA_PHONEME_LIST
@@ -308,10 +314,10 @@
         Ok(new_accent_phrases)
     }
 
-    pub fn synthesis(
-        &mut self,
+    pub async fn synthesis(
+        &self,
         query: &AudioQueryModel,
-        speaker_id: u32,
+        style_id: StyleId,
         enable_interrogative_upspeak: bool,
     ) -> Result<Vec<f32>> {
         let speed_scale = *query.speed_scale();
@@ -409,28 +415,32 @@
         // Flatten the 2D vector into 1D so that its elements are contiguous in memory.
         let flatten_phoneme = phoneme.into_iter().flatten().collect::<Vec<f32>>();
 
-        self.inference_core_mut().decode(
-            f0.len(),
-            OjtPhoneme::num_phoneme(),
-            &f0,
-            &flatten_phoneme,
-            speaker_id,
-        )
+        self.inference_core()
+            .decode(
+                f0.len(),
+                OjtPhoneme::num_phoneme(),
+                &f0,
+                &flatten_phoneme,
+                style_id,
+            )
+            .await
     }
 
-    pub fn synthesis_wave_format(
-        &mut self,
+    pub async fn synthesis_wave_format(
+        &self,
         query: &AudioQueryModel,
-        speaker_id: u32,
+        style_id: StyleId,
         enable_interrogative_upspeak: bool,
     ) -> Result<Vec<u8>> {
-        let wave = self.synthesis(query, speaker_id, enable_interrogative_upspeak)?;
-
+        let wave = self
+            .synthesis(query, style_id, enable_interrogative_upspeak)
+            .await?;
         let volume_scale = *query.volume_scale();
         let output_stereo = *query.output_stereo();
-        // TODO: support 44.1kHz etc.
         let output_sampling_rate = *query.output_sampling_rate();
+        // TODO: support 44.1kHz etc.
+
         let num_channels: u16 = if output_stereo { 2 } else { 1 };
         let bit_depth: u16 = 16;
         let repeat_count: u32 =
@@ -470,12 +480,6 @@
         Ok(cur.into_inner())
     }
 
-    pub fn load_openjtalk_dict(&mut self, mecab_dict_dir: impl AsRef<Path>) -> Result<()> {
-        self.open_jtalk
-            .load(mecab_dict_dir)
-            .map_err(|_| Error::NotLoadedOpenjtalkDict)
-    }
-
     pub fn is_openjtalk_dict_loaded(&self) -> bool {
         self.open_jtalk.dict_loaded()
     }
@@ -644,44 +648,43 @@ fn make_interrogative_mora(last_mora: &MoraModel) -> MoraModel {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use ::test_util::OPEN_JTALK_DIC_DIR;
     use pretty_assertions::assert_eq;
-    use test_util::OPEN_JTALK_DIC_DIR;
 
-    use crate::{macros::tests::assert_debug_fmt_eq, *};
+    use crate::*;
 
     #[rstest]
-    fn load_openjtalk_dict_works() {
-        let core = InferenceCore::new(false, None);
-        let mut synthesis_engine = SynthesisEngine::new(core, OpenJtalk::initialize());
-
-        let result = synthesis_engine.load_openjtalk_dict(OPEN_JTALK_DIC_DIR);
-        assert_debug_fmt_eq!(result, Ok(()));
-
-        let result = synthesis_engine.load_openjtalk_dict("");
-        assert_debug_fmt_eq!(result, Err(Error::NotLoadedOpenjtalkDict));
-    }
-
-    #[rstest]
-    fn is_openjtalk_dict_loaded_works() {
-        let core = InferenceCore::new(false, None);
-        let mut synthesis_engine = SynthesisEngine::new(core, OpenJtalk::initialize());
+    #[tokio::test]
+    async fn is_openjtalk_dict_loaded_works() {
+        let core = InferenceCore::new_with_initialize(false, 0, false)
+            .await
+            .unwrap();
+        let synthesis_engine = SynthesisEngine::new(
+            core,
+            OpenJtalk::new_with_initialize(OPEN_JTALK_DIC_DIR)
+                .unwrap()
+                .into(),
+        );
 
-        let _ = synthesis_engine.load_openjtalk_dict(OPEN_JTALK_DIC_DIR);
         assert_eq!(synthesis_engine.is_openjtalk_dict_loaded(), true);
-
-        let _ = synthesis_engine.load_openjtalk_dict("");
-        assert_eq!(synthesis_engine.is_openjtalk_dict_loaded(), false);
     }
 
     #[rstest]
-    fn create_accent_phrases_works() {
-        let mut core = InferenceCore::new(true, None);
-        core.initialize(false, 0, true).unwrap();
-        let mut synthesis_engine = SynthesisEngine::new(core, OpenJtalk::initialize());
+    #[tokio::test]
+    async fn create_accent_phrases_works() {
+        let core = InferenceCore::new_with_initialize(false, 0, true)
+            .await
+            .unwrap();
+        let synthesis_engine = SynthesisEngine::new(
+            core,
+            OpenJtalk::new_with_initialize(OPEN_JTALK_DIC_DIR)
+                .unwrap()
+                .into(),
+        );
 
-        let _ = synthesis_engine.load_openjtalk_dict(OPEN_JTALK_DIC_DIR);
         let accent_phrases = synthesis_engine
-            .create_accent_phrases("同じ、文章、です。完全に、同一です。", 0)
+            .create_accent_phrases("同じ、文章、です。完全に、同一です。", StyleId::new(1))
+            .await
             .unwrap();
 
         assert_eq!(accent_phrases.len(), 5);
diff --git a/crates/voicevox_core/src/error.rs b/crates/voicevox_core/src/error.rs
index f4e3f882b..6c4916412 100644
--- a/crates/voicevox_core/src/error.rs
+++ b/crates/voicevox_core/src/error.rs
@@ -30,6 +30,25 @@ pub enum Error {
         #[source]
         source: anyhow::Error,
     },
+    #[error("{} ({})", base_error_message(VOICEVOX_ALREADY_LOADED_MODEL_ERROR), path.display())]
+    AlreadyLoadedModel { path: PathBuf },
+
({model_id:?})", base_error_message(VOICEVOX_UNLOADED_MODEL_ERROR))] + UnloadedModel { model_id: VoiceModelId }, + + #[error("{}({path}):{source}", base_error_message(VOICEVOX_OPEN_FILE_ERROR))] + OpenFile { + path: PathBuf, + #[source] + source: anyhow::Error, + }, + + #[error("{},{filename}", base_error_message(VOICEVOX_VVM_MODEL_READ_ERROR))] + VvmRead { + filename: String, + #[source] + source: Option, + }, #[error("{},{0}", base_error_message(VOICEVOX_RESULT_LOAD_METAS_ERROR))] LoadMetas(#[source] anyhow::Error), @@ -40,14 +59,11 @@ pub enum Error { )] GetSupportedDevices(#[source] anyhow::Error), - #[error("{}", base_error_message(VOICEVOX_RESULT_UNINITIALIZED_STATUS_ERROR))] - UninitializedStatus, - #[error( - "{}: {speaker_id}", - base_error_message(VOICEVOX_RESULT_INVALID_SPEAKER_ID_ERROR) + "{}: {style_id:?}", + base_error_message(VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR) )] - InvalidSpeakerId { speaker_id: u32 }, + InvalidStyleId { style_id: StyleId }, #[error( "{}: {model_index}", @@ -69,6 +85,6 @@ pub enum Error { } fn base_error_message(result_code: VoicevoxResultCode) -> &'static str { - let c_message: &'static str = crate::error_result_to_message(result_code); + let c_message: &'static str = crate::result_code::error_result_to_message(result_code); &c_message[..(c_message.len() - 1)] } diff --git a/crates/voicevox_core/src/inference_core.rs b/crates/voicevox_core/src/inference_core.rs new file mode 100644 index 000000000..e22269486 --- /dev/null +++ b/crates/voicevox_core/src/inference_core.rs @@ -0,0 +1,231 @@ +use self::status::*; +use super::*; +use onnxruntime::{ + ndarray, + session::{AnyArray, NdArray}, +}; + +const PHONEME_LENGTH_MINIMAL: f32 = 0.01; + +pub struct InferenceCore { + status: Status, +} + +impl InferenceCore { + pub(crate) async fn new_with_initialize( + use_gpu: bool, + cpu_num_threads: u16, + load_all_models: bool, + ) -> Result { + if !use_gpu || Self::can_support_gpu_feature()? { + let mut status = Status::new(use_gpu, cpu_num_threads); + + if load_all_models { + for model in &VoiceModel::get_all_models().await? { + status.load_model(model).await?; + } + } + Ok(Self { status }) + } else { + Err(Error::GpuSupport) + } + } + + fn can_support_gpu_feature() -> Result { + let supported_devices = SupportedDevices::get_supported_devices()?; + + cfg_if! 
+            if #[cfg(feature = "directml")]{
+                Ok(*supported_devices.dml())
+            } else{
+                Ok(*supported_devices.cuda())
+            }
+        }
+    }
+
+    pub async fn load_model(&mut self, model: &VoiceModel) -> Result<()> {
+        self.status.load_model(model).await
+    }
+
+    pub fn unload_model(&mut self, voice_model_id: &VoiceModelId) -> Result<()> {
+        self.status.unload_model(voice_model_id)
+    }
+    pub fn metas(&self) -> &VoiceModelMeta {
+        self.status.metas()
+    }
+
+    pub fn is_loaded_model(&self, model_id: &VoiceModelId) -> bool {
+        self.status.is_loaded_model(model_id)
+    }
+
+    pub fn is_model_loaded_by_style_id(&self, style_id: StyleId) -> bool {
+        self.status.is_loaded_model_by_style_id(style_id)
+    }
+
+    pub async fn predict_duration(
+        &self,
+        phoneme_vector: &[i64],
+        style_id: StyleId,
+    ) -> Result<Vec<f32>> {
+        if !self.status.validate_speaker_id(style_id) {
+            return Err(Error::InvalidStyleId { style_id });
+        }
+
+        let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector));
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
+
+        let input_tensors: Vec<&mut dyn AnyArray> =
+            vec![&mut phoneme_vector_array, &mut speaker_id_array];
+
+        let mut output = self
+            .status
+            .predict_duration_session_run(style_id, input_tensors)?;
+
+        for output_item in output.iter_mut() {
+            if *output_item < PHONEME_LENGTH_MINIMAL {
+                *output_item = PHONEME_LENGTH_MINIMAL;
+            }
+        }
+
+        Ok(output)
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    pub async fn predict_intonation(
+        &self,
+        length: usize,
+        vowel_phoneme_vector: &[i64],
+        consonant_phoneme_vector: &[i64],
+        start_accent_vector: &[i64],
+        end_accent_vector: &[i64],
+        start_accent_phrase_vector: &[i64],
+        end_accent_phrase_vector: &[i64],
+        style_id: StyleId,
+    ) -> Result<Vec<f32>> {
+        if !self.status.validate_speaker_id(style_id) {
+            return Err(Error::InvalidStyleId { style_id });
+        }
+
+        let mut length_array = NdArray::new(ndarray::arr0(length as i64));
+        let mut vowel_phoneme_vector_array = NdArray::new(ndarray::arr1(vowel_phoneme_vector));
+        let mut consonant_phoneme_vector_array =
+            NdArray::new(ndarray::arr1(consonant_phoneme_vector));
+        let mut start_accent_vector_array = NdArray::new(ndarray::arr1(start_accent_vector));
+        let mut end_accent_vector_array = NdArray::new(ndarray::arr1(end_accent_vector));
+        let mut start_accent_phrase_vector_array =
+            NdArray::new(ndarray::arr1(start_accent_phrase_vector));
+        let mut end_accent_phrase_vector_array =
+            NdArray::new(ndarray::arr1(end_accent_phrase_vector));
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
+
+        let input_tensors: Vec<&mut dyn AnyArray> = vec![
+            &mut length_array,
+            &mut vowel_phoneme_vector_array,
+            &mut consonant_phoneme_vector_array,
+            &mut start_accent_vector_array,
+            &mut end_accent_vector_array,
+            &mut start_accent_phrase_vector_array,
+            &mut end_accent_phrase_vector_array,
+            &mut speaker_id_array,
+        ];
+
+        self.status
+            .predict_intonation_session_run(style_id, input_tensors)
+    }
+
+    pub async fn decode(
+        &self,
+        length: usize,
+        phoneme_size: usize,
+        f0: &[f32],
+        phoneme_vector: &[f32],
+        style_id: StyleId,
+    ) -> Result<Vec<f32>> {
+        if !self.status.validate_speaker_id(style_id) {
+            return Err(Error::InvalidStyleId { style_id });
+        }
+
+        // A workaround is in place here to keep the audio from cutting out.
+        // TODO: remove this padding logic once the underlying issue is fixed
+        const PADDING_SIZE: f64 = 0.4;
+        const DEFAULT_SAMPLING_RATE: f64 = 24000.0;
+        let padding_size = ((PADDING_SIZE * DEFAULT_SAMPLING_RATE) / 256.0).round() as usize;
+        let start_and_end_padding_size = 2 * padding_size;
+        let length_with_padding = length + start_and_end_padding_size;
+        let f0_with_padding = Self::make_f0_with_padding(f0, length_with_padding, padding_size);
+
+        let phoneme_with_padding = Self::make_phoneme_with_padding(
+            phoneme_vector,
+            phoneme_size,
+            length_with_padding,
+            padding_size,
+        );
+
+        let mut f0_array = NdArray::new(
+            ndarray::arr1(&f0_with_padding)
+                .into_shape([length_with_padding, 1])
+                .unwrap(),
+        );
+        let mut phoneme_array = NdArray::new(
+            ndarray::arr1(&phoneme_with_padding)
+                .into_shape([length_with_padding, phoneme_size])
+                .unwrap(),
+        );
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
+
+        let input_tensors: Vec<&mut dyn AnyArray> =
+            vec![&mut f0_array, &mut phoneme_array, &mut speaker_id_array];
+
+        self.status
+            .decode_session_run(style_id, input_tensors)
+            .map(|output| Self::trim_padding_from_output(output, padding_size))
+    }
+
+    fn make_f0_with_padding(
+        f0_slice: &[f32],
+        length_with_padding: usize,
+        padding_size: usize,
+    ) -> Vec<f32> {
+        // Workaround to keep the audio from cutting out.
+        // Delete this function once the underlying issue is fixed.
+        let mut f0_with_padding = Vec::with_capacity(length_with_padding);
+        let padding = vec![0.0; padding_size];
+        f0_with_padding.extend_from_slice(&padding);
+        f0_with_padding.extend_from_slice(f0_slice);
+        f0_with_padding.extend_from_slice(&padding);
+        f0_with_padding
+    }
+
+    fn make_phoneme_with_padding(
+        phoneme_slice: &[f32],
+        phoneme_size: usize,
+        length_with_padding: usize,
+        padding_size: usize,
+    ) -> Vec<f32> {
+        // Workaround to keep the audio from cutting out.
+        // Delete this function once the underlying issue is fixed.
+        let mut padding_phoneme = vec![0.0; phoneme_size];
+        padding_phoneme[0] = 1.0;
+        let padding_phoneme_len = padding_phoneme.len();
+        let padding_phonemes: Vec<f32> = padding_phoneme
+            .into_iter()
+            .cycle()
+            .take(padding_phoneme_len * padding_size)
+            .collect();
+        let mut phoneme_with_padding = Vec::with_capacity(phoneme_size * length_with_padding);
+        phoneme_with_padding.extend_from_slice(&padding_phonemes);
+        phoneme_with_padding.extend_from_slice(phoneme_slice);
+        phoneme_with_padding.extend_from_slice(&padding_phonemes);
+
+        phoneme_with_padding
+    }
+
+    fn trim_padding_from_output(mut output: Vec<f32>, padding_f0_size: usize) -> Vec<f32> {
+        // Workaround to keep the audio from cutting out.
+        // Delete this function once the underlying issue is fixed.
+        let padding_sampling_size = padding_f0_size * 256;
+        output
+            .drain(padding_sampling_size..output.len() - padding_sampling_size)
+            .collect()
+    }
+}
diff --git a/crates/voicevox_core/src/lib.rs b/crates/voicevox_core/src/lib.rs
index aa1cb021c..ec8928d79 100644
--- a/crates/voicevox_core/src/lib.rs
+++ b/crates/voicevox_core/src/lib.rs
@@ -1,23 +1,42 @@
 #![deny(unsafe_code)]
 
+mod devices;
 /// cbindgen:ignore
 mod engine;
 mod error;
+mod inference_core;
 mod macros;
+mod manifest;
+mod metas;
 mod numerics;
-mod publish;
 mod result;
 pub mod result_code;
 mod status;
+mod version;
+mod voice_model;
+mod voice_synthesizer;
 
-pub use self::publish::*;
+use self::inference_core::*;
 
-pub use self::engine::{AccentPhraseModel, AudioQueryModel};
+#[cfg(test)]
+mod test_util;
+
+#[cfg(test)]
+use self::test_util::*;
+
+pub use self::engine::{AccentPhraseModel, AudioQueryModel, OpenJtalk};
 pub use self::error::*;
+pub use self::metas::*;
 pub use self::result::*;
+pub use self::voice_model::*;
+pub use devices::*;
+pub use manifest::*;
+pub use version::*;
+pub use voice_synthesizer::*;
 
 use derive_getters::*;
 use derive_new::new;
+use nanoid::nanoid;
 #[cfg(test)]
 use rstest::*;
diff --git a/crates/voicevox_core/src/manifest.rs b/crates/voicevox_core/src/manifest.rs
new file mode 100644
index 000000000..4e9ccd158
--- /dev/null
+++ b/crates/voicevox_core/src/manifest.rs
@@ -0,0 +1,30 @@
+use std::fmt::Display;
+
+use derive_getters::Getters;
+use derive_new::new;
+use serde::Deserialize;
+
+pub type RawManifestVersion = String;
+#[derive(Deserialize, Clone, Debug, PartialEq, new)]
+pub struct ManifestVersion(RawManifestVersion);
+
+impl ManifestVersion {
+    pub fn raw_manifest_version(&self) -> &RawManifestVersion {
+        &self.0
+    }
+}
+
+impl Display for ManifestVersion {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+#[derive(Deserialize, Getters, Clone)]
+pub struct Manifest {
+    manifest_version: ManifestVersion,
+    metas_filename: String,
+    decode_filename: String,
+    predict_duration_filename: String,
+    predict_intonation_filename: String,
+}
diff --git a/crates/voicevox_core/src/metas.rs b/crates/voicevox_core/src/metas.rs
new file mode 100644
index 000000000..2ee336906
--- /dev/null
+++ b/crates/voicevox_core/src/metas.rs
@@ -0,0 +1,59 @@
+use std::fmt::Display;
+
+use super::*;
+use derive_getters::Getters;
+use serde::{Deserialize, Serialize};
+
+/// The underlying type of a style ID.
+pub type RawStyleId = u32;
+/// Style ID.
+#[derive(PartialEq, Eq, Clone, Copy, Ord, PartialOrd, Deserialize, Serialize, new, Debug)]
+pub struct StyleId(RawStyleId);
+
+impl StyleId {
+    pub fn raw_id(self) -> RawStyleId {
+        self.0
+    }
+}
+
+impl Display for StyleId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.raw_id())
+    }
+}
+
+pub type RawStyleVersion = String;
+
+#[derive(PartialEq, Eq, Clone, Ord, PartialOrd, Deserialize, Serialize, new, Debug)]
+pub struct StyleVersion(RawStyleVersion);
+
+impl StyleVersion {
+    pub fn raw_version(&self) -> &RawStyleVersion {
+        &self.0
+    }
+}
+
+impl Display for StyleVersion {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.raw_version())
+    }
+}
+
+/// Meta information for a voice synthesis model.
+pub type VoiceModelMeta = Vec<SpeakerMeta>;
+
+/// Meta information for a speaker.
+#[derive(Deserialize, Serialize, Getters, Clone)]
+pub struct SpeakerMeta {
+    name: String,
+    styles: Vec<StyleMeta>,
+    version: StyleVersion,
+    speaker_uuid: String,
+}
+
+/// Meta information for a style.
+#[derive(Deserialize, Serialize, Getters, Clone)]
+pub struct StyleMeta {
+    id: StyleId,
+    name: String,
+}
diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs
deleted file mode 100644
index 50db061fe..000000000
--- a/crates/voicevox_core/src/publish.rs
+++ /dev/null
@@ -1,1259 +0,0 @@
-use self::engine::*;
-use self::result_code::VoicevoxResultCode;
-use self::status::*;
-use super::*;
-use once_cell::sync::Lazy;
-use onnxruntime::{
-    ndarray,
-    session::{AnyArray, NdArray},
-};
-use std::ffi::{CStr, CString};
-use std::path::PathBuf;
-use std::sync::Mutex;
-
-const PHONEME_LENGTH_MINIMAL: f32 = 0.01;
-
-pub struct VoicevoxCore {
-    synthesis_engine: SynthesisEngine,
-    use_gpu: bool,
-}
-
-impl VoicevoxCore {
-    pub fn new_with_initialize(options: InitializeOptions) -> Result<Self> {
-        let mut this = Self::new();
-        this.initialize(options)?;
-        Ok(this)
-    }
-
-    pub fn new_with_mutex() -> Mutex<VoicevoxCore> {
-        Mutex::new(Self::new())
-    }
-
-    fn new() -> Self {
-        #[cfg(windows)]
-        list_windows_video_cards();
-
-        Self {
-            synthesis_engine: SynthesisEngine::new(
-                InferenceCore::new(false, None),
-                OpenJtalk::initialize(),
-            ),
-            use_gpu: false,
-        }
-    }
-
-    pub fn initialize(&mut self, options: InitializeOptions) -> Result<()> {
-        let use_gpu = match options.acceleration_mode {
-            AccelerationMode::Auto => {
-                let supported_devices = SupportedDevices::get_supported_devices()?;
-
-                cfg_if! {
-                    if #[cfg(feature="directml")]{
-                        *supported_devices.dml()
-
-                    } else {
-                        *supported_devices.cuda()
-                    }
-                }
-            }
-            AccelerationMode::Cpu => false,
-            AccelerationMode::Gpu => true,
-        };
-        self.use_gpu = use_gpu;
-        self.synthesis_engine.inference_core_mut().initialize(
-            use_gpu,
-            options.cpu_num_threads,
-            options.load_all_models,
-        )?;
-        if let Some(open_jtalk_dict_dir) = options.open_jtalk_dict_dir {
-            self.synthesis_engine
-                .load_openjtalk_dict(open_jtalk_dict_dir)?;
-        }
-        Ok(())
-    }
-
-    pub fn is_gpu_mode(&self) -> bool {
-        self.use_gpu
-    }
-
-    pub fn load_model(&mut self, speaker_id: u32) -> Result<()> {
-        self.synthesis_engine
-            .inference_core_mut()
-            .load_model(speaker_id)
-    }
-
-    pub fn is_model_loaded(&self, speaker_id: u32) -> bool {
-        self.synthesis_engine
-            .inference_core()
-            .is_model_loaded(speaker_id)
-    }
-
-    pub fn finalize(&mut self) {
-        self.synthesis_engine.inference_core_mut().finalize()
-    }
-
-    pub const fn get_version() -> &'static str {
-        env!("CARGO_PKG_VERSION")
-    }
-
-    pub fn get_metas_json(&self) -> &'static CStr {
-        &METAS_CSTRING
-    }
-
-    pub fn get_supported_devices_json(&self) -> &'static CStr {
-        &SUPPORTED_DEVICES_CSTRING
-    }
-
-    pub fn predict_duration(
-        &mut self,
-        phoneme_vector: &[i64],
-        speaker_id: u32,
-    ) -> Result<Vec<f32>> {
-        self.synthesis_engine
-            .inference_core_mut()
-            .predict_duration(phoneme_vector, speaker_id)
-    }
-
-    #[allow(clippy::too_many_arguments)]
-    pub fn predict_intonation(
-        &mut self,
-        length: usize,
-        vowel_phoneme_vector: &[i64],
-        consonant_phoneme_vector: &[i64],
-        start_accent_vector: &[i64],
-        end_accent_vector: &[i64],
-        start_accent_phrase_vector: &[i64],
-        end_accent_phrase_vector: &[i64],
-        speaker_id: u32,
-    ) -> Result<Vec<f32>> {
-        self.synthesis_engine
-            .inference_core_mut()
-            .predict_intonation(
-                length,
-                vowel_phoneme_vector,
-                consonant_phoneme_vector,
-                start_accent_vector,
-                end_accent_vector,
-                start_accent_phrase_vector,
-                end_accent_phrase_vector,
-                speaker_id,
-            )
-    }
-
-    pub fn decode(
-        &mut self,
-        length: usize,
-        phoneme_size: usize,
-        f0: &[f32],
-        phoneme_vector: &[f32],
-        speaker_id: u32,
-    ) -> Result<Vec<f32>> {
-        self.synthesis_engine.inference_core_mut().decode(
-            length,
-            phoneme_size,
-            f0,
-            phoneme_vector,
-            speaker_id,
-        )
-    }
-
-    pub fn audio_query(
-        &mut self,
-        text: &str,
-        speaker_id: u32,
-        options: AudioQueryOptions,
-    ) -> Result<AudioQueryModel> {
-        let accent_phrases = self.accent_phrases(
-            text,
-            speaker_id,
-            AccentPhrasesOptions { kana: options.kana },
-        )?;
-        let kana = create_kana(&accent_phrases);
-
-        Ok(AudioQueryModel::new(
-            accent_phrases,
-            1.,
-            0.,
-            1.,
-            1.,
-            0.1,
-            0.1,
-            SynthesisEngine::DEFAULT_SAMPLING_RATE,
-            false,
-            Some(kana),
-        ))
-    }
-
-    pub fn accent_phrases(
-        &mut self,
-        text: &str,
-        speaker_id: u32,
-        options: AccentPhrasesOptions,
-    ) -> Result<Vec<AccentPhraseModel>> {
-        if !self.synthesis_engine.is_openjtalk_dict_loaded() {
-            return Err(Error::NotLoadedOpenjtalkDict);
-        }
-
-        let accent_phrases = if options.kana {
-            self.synthesis_engine
-                .replace_mora_data(&parse_kana(text)?, speaker_id)?
-        } else {
-            self.synthesis_engine
-                .create_accent_phrases(text, speaker_id)?
-        };
-
-        Ok(accent_phrases)
-    }
-
-    pub fn mora_length(
-        &mut self,
-        speaker_id: u32,
-        accent_phrases: &[AccentPhraseModel],
-    ) -> Result<Vec<AccentPhraseModel>> {
-        let accent_phrases = self
-            .synthesis_engine
-            .replace_phoneme_length(accent_phrases, speaker_id)?;
-
-        Ok(accent_phrases)
-    }
-
-    pub fn mora_pitch(
-        &mut self,
-        speaker_id: u32,
-        accent_phrases: &[AccentPhraseModel],
-    ) -> Result<Vec<AccentPhraseModel>> {
-        let accent_phrases = self
-            .synthesis_engine
-            .replace_mora_pitch(accent_phrases, speaker_id)?;
-
-        Ok(accent_phrases)
-    }
-
-    pub fn mora_data(
-        &mut self,
-        speaker_id: u32,
-        accent_phrases: &[AccentPhraseModel],
-    ) -> Result<Vec<AccentPhraseModel>> {
-        let accent_phrases = self
-            .synthesis_engine
-            .replace_mora_data(accent_phrases, speaker_id)?;
-
-        Ok(accent_phrases)
-    }
-
-    pub fn synthesis(
-        &mut self,
-        audio_query: &AudioQueryModel,
-        speaker_id: u32,
-        options: SynthesisOptions,
-    ) -> Result<Vec<u8>> {
-        self.synthesis_engine.synthesis_wave_format(
-            audio_query,
-            speaker_id,
-            options.enable_interrogative_upspeak,
-        )
-    }
-
-    pub fn tts(&mut self, text: &str, speaker_id: u32, options: TtsOptions) -> Result<Vec<u8>> {
-        let audio_query = &self.audio_query(text, speaker_id, AudioQueryOptions::from(&options))?;
-        self.synthesis(audio_query, speaker_id, SynthesisOptions::from(&options))
-    }
-}
-
-#[derive(Default)]
-pub struct AudioQueryOptions {
-    pub kana: bool,
-}
-
-impl From<&TtsOptions> for AudioQueryOptions {
-    fn from(options: &TtsOptions) -> Self {
-        Self { kana: options.kana }
-    }
-}
-
-#[derive(Default)]
-pub struct AccentPhrasesOptions {
-    pub kana: bool,
-}
-
-impl From<&TtsOptions> for AccentPhrasesOptions {
-    fn from(options: &TtsOptions) -> Self {
-        Self { kana: options.kana }
-    }
-}
-
-#[derive(Default, Debug, PartialEq, Eq)]
-pub enum AccelerationMode {
-    #[default]
-    Auto,
-    Cpu,
-    Gpu,
-}
-
-#[derive(Default)]
-pub struct InitializeOptions {
-    pub acceleration_mode: AccelerationMode,
-    pub cpu_num_threads: u16,
-    pub load_all_models: bool,
-    pub open_jtalk_dict_dir: Option<PathBuf>,
-}
-
-pub struct SynthesisOptions {
-    pub enable_interrogative_upspeak: bool,
-}
-
-impl From<&TtsOptions> for SynthesisOptions {
-    fn from(options: &TtsOptions) -> Self {
-        Self {
-            enable_interrogative_upspeak: options.enable_interrogative_upspeak,
-        }
-    }
-}
-
-pub struct TtsOptions {
-    pub kana: bool,
-    pub enable_interrogative_upspeak: bool,
-}
-
-impl Default for TtsOptions {
-    fn default() -> Self {
-        Self {
-            enable_interrogative_upspeak: true,
-            kana: Default::default(),
-        }
-    }
-}
-
-#[derive(new)]
-pub struct InferenceCore {
-    initialized: bool,
-    status_option: Option<Status>,
-}
-
-impl InferenceCore {
-    pub fn initialize(
-        &mut self,
-        use_gpu: bool,
-        cpu_num_threads: u16,
-        load_all_models: bool,
-    ) -> Result<()> {
-        self.initialized = false;
-        if !use_gpu || self.can_support_gpu_feature()? {
-            let mut status = Status::new(use_gpu, cpu_num_threads);
-
-            status.load_metas()?;
-
-            if load_all_models {
-                for model_index in 0..MODEL_FILE_SET.models_count() {
-                    status.load_model(model_index)?;
-                }
-            }
-
-            self.status_option = Some(status);
-            self.initialized = true;
-            Ok(())
-        } else {
-            Err(Error::GpuSupport)
-        }
-    }
-    fn can_support_gpu_feature(&self) -> Result<bool> {
-        let supported_devices = SupportedDevices::get_supported_devices()?;
-
-        cfg_if! {
-#[derive(new)]
-pub struct InferenceCore {
-    initialized: bool,
-    status_option: Option<Status>,
-}
-
-impl InferenceCore {
-    pub fn initialize(
-        &mut self,
-        use_gpu: bool,
-        cpu_num_threads: u16,
-        load_all_models: bool,
-    ) -> Result<()> {
-        self.initialized = false;
-        if !use_gpu || self.can_support_gpu_feature()? {
-            let mut status = Status::new(use_gpu, cpu_num_threads);
-
-            status.load_metas()?;
-
-            if load_all_models {
-                for model_index in 0..MODEL_FILE_SET.models_count() {
-                    status.load_model(model_index)?;
-                }
-            }
-
-            self.status_option = Some(status);
-            self.initialized = true;
-            Ok(())
-        } else {
-            Err(Error::GpuSupport)
-        }
-    }
-    fn can_support_gpu_feature(&self) -> Result<bool> {
-        let supported_devices = SupportedDevices::get_supported_devices()?;
-
-        cfg_if! {
-            if #[cfg(feature = "directml")]{
-                Ok(*supported_devices.dml())
-            } else{
-                Ok(*supported_devices.cuda())
-            }
-        }
-    }
-    pub fn load_model(&mut self, speaker_id: u32) -> Result<()> {
-        if self.initialized {
-            let status = self
-                .status_option
-                .as_mut()
-                .ok_or(Error::UninitializedStatus)?;
-            if let Some((model_index, _)) = get_model_index_and_speaker_id(speaker_id) {
-                status.load_model(model_index)
-            } else {
-                Err(Error::InvalidSpeakerId { speaker_id })
-            }
-        } else {
-            Err(Error::UninitializedStatus)
-        }
-    }
-    pub fn is_model_loaded(&self, speaker_id: u32) -> bool {
-        if let Some(status) = self.status_option.as_ref() {
-            if let Some((model_index, _)) = get_model_index_and_speaker_id(speaker_id) {
-                status.is_model_loaded(model_index)
-            } else {
-                false
-            }
-        } else {
-            false
-        }
-    }
-    pub fn finalize(&mut self) {
-        self.initialized = false;
-        self.status_option = None;
-    }
-
-    pub fn predict_duration(
-        &mut self,
-        phoneme_vector: &[i64],
-        speaker_id: u32,
-    ) -> Result<Vec<f32>> {
-        if !self.initialized {
-            return Err(Error::UninitializedStatus);
-        }
-
-        let status = self
-            .status_option
-            .as_mut()
-            .ok_or(Error::UninitializedStatus)?;
-
-        if !status.validate_speaker_id(speaker_id) {
-            return Err(Error::InvalidSpeakerId { speaker_id });
-        }
-
-        let (model_index, speaker_id) =
-            if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) {
-                (model_index, speaker_id)
-            } else {
-                return Err(Error::InvalidSpeakerId { speaker_id });
-            };
-
-        if model_index >= MODEL_FILE_SET.models_count() {
-            return Err(Error::InvalidModelIndex { model_index });
-        }
-
-        let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector));
-        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64]));
-
-        let input_tensors: Vec<&mut dyn AnyArray> =
-            vec![&mut phoneme_vector_array, &mut speaker_id_array];
-
-        let mut output = status.predict_duration_session_run(model_index, input_tensors)?;
-
-        for output_item in output.iter_mut() {
-            if *output_item < PHONEME_LENGTH_MINIMAL {
-                *output_item = PHONEME_LENGTH_MINIMAL;
-            }
-        }
-
-        Ok(output)
-    }
-
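Each inference entry point repeats the same guard sequence: check initialization, validate the public speaker id, map it to a (model index, model-internal id) pair, then bounds-check the index. A hedged sketch of that sequence factored into one helper, using only names from the code above:

// Sketch only: consolidates the shared validation of predict_duration /
// predict_intonation / decode. Not how the crate actually factors it.
fn resolve_model(status: &Status, speaker_id: u32) -> Result<(usize, u32)> {
    if !status.validate_speaker_id(speaker_id) {
        return Err(Error::InvalidSpeakerId { speaker_id });
    }
    // Public speaker ids are remapped to a model index plus the id the
    // model itself was trained with.
    let (model_index, inner_id) = get_model_index_and_speaker_id(speaker_id)
        .ok_or(Error::InvalidSpeakerId { speaker_id })?;
    if model_index >= MODEL_FILE_SET.models_count() {
        return Err(Error::InvalidModelIndex { model_index });
    }
    Ok((model_index, inner_id))
}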
-    #[allow(clippy::too_many_arguments)]
-    pub fn predict_intonation(
-        &mut self,
-        length: usize,
-        vowel_phoneme_vector: &[i64],
-        consonant_phoneme_vector: &[i64],
-        start_accent_vector: &[i64],
-        end_accent_vector: &[i64],
-        start_accent_phrase_vector: &[i64],
-        end_accent_phrase_vector: &[i64],
-        speaker_id: u32,
-    ) -> Result<Vec<f32>> {
-        if !self.initialized {
-            return Err(Error::UninitializedStatus);
-        }
-
-        let status = self
-            .status_option
-            .as_mut()
-            .ok_or(Error::UninitializedStatus)?;
-
-        if !status.validate_speaker_id(speaker_id) {
-            return Err(Error::InvalidSpeakerId { speaker_id });
-        }
-
-        let (model_index, speaker_id) =
-            if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) {
-                (model_index, speaker_id)
-            } else {
-                return Err(Error::InvalidSpeakerId { speaker_id });
-            };
-
-        if model_index >= MODEL_FILE_SET.models_count() {
-            return Err(Error::InvalidModelIndex { model_index });
-        }
-
-        let mut length_array = NdArray::new(ndarray::arr0(length as i64));
-        let mut vowel_phoneme_vector_array = NdArray::new(ndarray::arr1(vowel_phoneme_vector));
-        let mut consonant_phoneme_vector_array =
-            NdArray::new(ndarray::arr1(consonant_phoneme_vector));
-        let mut start_accent_vector_array = NdArray::new(ndarray::arr1(start_accent_vector));
-        let mut end_accent_vector_array = NdArray::new(ndarray::arr1(end_accent_vector));
-        let mut start_accent_phrase_vector_array =
-            NdArray::new(ndarray::arr1(start_accent_phrase_vector));
-        let mut end_accent_phrase_vector_array =
-            NdArray::new(ndarray::arr1(end_accent_phrase_vector));
-        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64]));
-
-        let input_tensors: Vec<&mut dyn AnyArray> = vec![
-            &mut length_array,
-            &mut vowel_phoneme_vector_array,
-            &mut consonant_phoneme_vector_array,
-            &mut start_accent_vector_array,
-            &mut end_accent_vector_array,
-            &mut start_accent_phrase_vector_array,
-            &mut end_accent_phrase_vector_array,
-            &mut speaker_id_array,
-        ];
-
-        status.predict_intonation_session_run(model_index, input_tensors)
-    }
-
-    pub fn decode(
-        &mut self,
-        length: usize,
-        phoneme_size: usize,
-        f0: &[f32],
-        phoneme_vector: &[f32],
-        speaker_id: u32,
-    ) -> Result<Vec<f32>> {
-        if !self.initialized {
-            return Err(Error::UninitializedStatus);
-        }
-
-        let status = self
-            .status_option
-            .as_mut()
-            .ok_or(Error::UninitializedStatus)?;
-
-        if !status.validate_speaker_id(speaker_id) {
-            return Err(Error::InvalidSpeakerId { speaker_id });
-        }
-
-        let (model_index, speaker_id) =
-            if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) {
-                (model_index, speaker_id)
-            } else {
-                return Err(Error::InvalidSpeakerId { speaker_id });
-            };
-
-        if model_index >= MODEL_FILE_SET.models_count() {
-            return Err(Error::InvalidModelIndex { model_index });
-        }
-
-        // 音が途切れてしまうのを避けるworkaround処理が入っている
-        // TODO: 改善したらここのpadding処理を取り除く
-        const PADDING_SIZE: f64 = 0.4;
-        const DEFAULT_SAMPLING_RATE: f64 = 24000.0;
-        let padding_size = ((PADDING_SIZE * DEFAULT_SAMPLING_RATE) / 256.0).round() as usize;
-        let start_and_end_padding_size = 2 * padding_size;
-        let length_with_padding = length + start_and_end_padding_size;
-        let f0_with_padding = Self::make_f0_with_padding(f0, length_with_padding, padding_size);
-
-        let phoneme_with_padding = Self::make_phoneme_with_padding(
-            phoneme_vector,
-            phoneme_size,
-            length_with_padding,
-            padding_size,
-        );
-
-        let mut f0_array = NdArray::new(
-            ndarray::arr1(&f0_with_padding)
-                .into_shape([length_with_padding, 1])
-                .unwrap(),
-        );
-        let mut phoneme_array = NdArray::new(
-            ndarray::arr1(&phoneme_with_padding)
-                .into_shape([length_with_padding, phoneme_size])
-                .unwrap(),
-        );
-        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64]));
-
-        let input_tensors: Vec<&mut dyn AnyArray> =
-            vec![&mut f0_array, &mut phoneme_array, &mut speaker_id_array];
-
-        status
-            .decode_session_run(model_index, input_tensors)
-            .map(|output| Self::trim_padding_from_output(output, padding_size))
-    }
-
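The padding arithmetic in `decode` is worth spelling out: 0.4 s of padding at a 24 kHz sampling rate with 256 output samples per frame gives (0.4 × 24000) / 256 = 37.5, which `round()` takes to 38 frames per side; after inference, 38 × 256 = 9728 samples are trimmed from each end. A self-contained check of those numbers:

// Worked example of the padding math used by `decode` above.
fn padding_frames() -> usize {
    const PADDING_SIZE: f64 = 0.4; // seconds of padding per side
    const DEFAULT_SAMPLING_RATE: f64 = 24000.0;
    // (0.4 * 24000) / 256 = 37.5 -> rounds to 38 frames
    ((PADDING_SIZE * DEFAULT_SAMPLING_RATE) / 256.0).round() as usize
}

fn main() {
    let frames = padding_frames();
    assert_eq!(frames, 38);
    // Each frame expands to 256 waveform samples, so trim_padding_from_output
    // drops 38 * 256 = 9728 samples from each end.
    assert_eq!(frames * 256, 9728);
}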
-    fn make_f0_with_padding(
-        f0_slice: &[f32],
-        length_with_padding: usize,
-        padding_size: usize,
-    ) -> Vec<f32> {
-        // 音が途切れてしまうのを避けるworkaround処理
-        // 改善したらこの関数を削除する
-        let mut f0_with_padding = Vec::with_capacity(length_with_padding);
-        let padding = vec![0.0; padding_size];
-        f0_with_padding.extend_from_slice(&padding);
-        f0_with_padding.extend_from_slice(f0_slice);
-        f0_with_padding.extend_from_slice(&padding);
-        f0_with_padding
-    }
-
-    fn make_phoneme_with_padding(
-        phoneme_slice: &[f32],
-        phoneme_size: usize,
-        length_with_padding: usize,
-        padding_size: usize,
-    ) -> Vec<f32> {
-        // 音が途切れてしまうのを避けるworkaround処理
-        // 改善したらこの関数を削除する
-        let mut padding_phoneme = vec![0.0; phoneme_size];
-        padding_phoneme[0] = 1.0;
-        let padding_phoneme_len = padding_phoneme.len();
-        let padding_phonemes: Vec<f32> = padding_phoneme
-            .into_iter()
-            .cycle()
-            .take(padding_phoneme_len * padding_size)
-            .collect();
-        let mut phoneme_with_padding = Vec::with_capacity(phoneme_size * length_with_padding);
-        phoneme_with_padding.extend_from_slice(&padding_phonemes);
-        phoneme_with_padding.extend_from_slice(phoneme_slice);
-        phoneme_with_padding.extend_from_slice(&padding_phonemes);
-
-        phoneme_with_padding
-    }
-
-    fn trim_padding_from_output(mut output: Vec<f32>, padding_f0_size: usize) -> Vec<f32> {
-        // 音が途切れてしまうのを避けるworkaround処理
-        // 改善したらこの関数を削除する
-        let padding_sampling_size = padding_f0_size * 256;
-        output
-            .drain(padding_sampling_size..output.len() - padding_sampling_size)
-            .collect()
-    }
-}
-
-pub static METAS: &Lazy<&str> = {
-    static METAS: Lazy<&str> = Lazy::new(|| &MODEL_FILE_SET.metas_str);
-    &METAS
-};
-
-pub static METAS_CSTRING: Lazy<CString> =
-    Lazy::new(|| CString::new(&*MODEL_FILE_SET.metas_str).unwrap());
-
-pub static SUPPORTED_DEVICES: Lazy<SupportedDevices> =
-    Lazy::new(|| SupportedDevices::get_supported_devices().unwrap());
-
-pub static SUPPORTED_DEVICES_CSTRING: Lazy<CString> =
-    Lazy::new(|| CString::new(SUPPORTED_DEVICES.to_json().to_string()).unwrap());
-
-fn get_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> {
-    MODEL_FILE_SET.speaker_id_map.get(&speaker_id).copied()
-}
-
-pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static str {
-    // C APIのため、messageには必ず末尾にNULL文字を追加する
-    use VoicevoxResultCode::*;
-    match result_code {
-        VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT_ERROR => {
-            "OpenJTalkの辞書が読み込まれていません\0"
-        }
-        VOICEVOX_RESULT_LOAD_MODEL_ERROR => "modelデータ読み込みに失敗しました\0",
-        VOICEVOX_RESULT_LOAD_METAS_ERROR => "メタデータ読み込みに失敗しました\0",
-
-        VOICEVOX_RESULT_GPU_SUPPORT_ERROR => "GPU機能をサポートすることができません\0",
-        VOICEVOX_RESULT_GET_SUPPORTED_DEVICES_ERROR => {
-            "サポートされているデバイス情報取得中にエラーが発生しました\0"
-        }
-
-        VOICEVOX_RESULT_OK => "エラーが発生しませんでした\0",
-        VOICEVOX_RESULT_UNINITIALIZED_STATUS_ERROR => "Statusが初期化されていません\0",
-        VOICEVOX_RESULT_INVALID_SPEAKER_ID_ERROR => "無効なspeaker_idです\0",
-        VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR => "無効なmodel_indexです\0",
-        VOICEVOX_RESULT_INFERENCE_ERROR => "推論に失敗しました\0",
-        VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR => {
-            "入力テキストからのフルコンテキストラベル抽出に失敗しました\0"
-        }
-        VOICEVOX_RESULT_INVALID_UTF8_INPUT_ERROR => "入力テキストが無効なUTF-8データでした\0",
-        VOICEVOX_RESULT_PARSE_KANA_ERROR => {
-            "入力テキストをAquesTalkライクな読み仮名としてパースすることに失敗しました\0"
-        }
-        VOICEVOX_RESULT_INVALID_AUDIO_QUERY_ERROR => "無効なaudio_queryです\0",
-        VOICEVOX_RESULT_INVALID_ACCENT_PHRASE_ERROR => "無効なaccent_phraseです\0",
-    }
-}
-
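The message table hard-codes a trailing `\0` on every string because a `const fn` cannot allocate a `CString`; callers hand the bytes straight across the FFI boundary. A hedged sketch of the kind of check a caller (or test) can apply before doing so:

use std::ffi::CStr;

// Sketch: confirm a message from error_result_to_message is a valid
// NUL-terminated C string (exactly one NUL, at the end).
fn as_c_str(message: &'static str) -> &'static CStr {
    CStr::from_bytes_with_nul(message.as_bytes())
        .expect("every message must end with exactly one NUL byte")
}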
-#[cfg(windows)]
-fn list_windows_video_cards() {
-    use std::{ffi::OsString, os::windows::ffi::OsStringExt as _};
-
-    use humansize::BINARY;
-    use tracing::{error, info};
-    use windows::Win32::Graphics::Dxgi::{
-        CreateDXGIFactory, IDXGIFactory, DXGI_ADAPTER_DESC, DXGI_ERROR_NOT_FOUND,
-    };
-
-    info!("検出されたGPU (DirectMLには1番目のGPUが使われます):");
-    match list_windows_video_cards() {
-        Ok(descs) => {
-            for desc in descs {
-                let description = OsString::from_wide(trim_nul(&desc.Description));
-                let vram = humansize::format_size(desc.DedicatedVideoMemory, BINARY);
-                info!("  - {description:?} ({vram})");
-            }
-        }
-        Err(err) => error!("{err}"),
-    }
-
-    fn list_windows_video_cards() -> windows::core::Result<Vec<DXGI_ADAPTER_DESC>> {
-        #[allow(unsafe_code)]
-        unsafe {
-            let factory = CreateDXGIFactory::<IDXGIFactory>()?;
-            (0..)
-                .map(|i| factory.EnumAdapters(i)?.GetDesc())
-                .take_while(|r| !matches!(r, Err(e) if e.code() == DXGI_ERROR_NOT_FOUND))
-                .collect()
-        }
-    }
-
-    fn trim_nul(s: &[u16]) -> &[u16] {
-        &s[..s.iter().position(|&c| c == 0x0000).unwrap_or(s.len())]
-    }
-}
-
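DXGI's `EnumAdapters` is index-based and signals exhaustion with `DXGI_ERROR_NOT_FOUND`, which is why the code above iterates an unbounded `(0..)` and cuts the stream with `take_while`; the terminating error is dropped while any other error still fails the `collect`. A platform-neutral sketch of the same pattern, with a hypothetical `fetch` standing in for `factory.EnumAdapters`:

// Sketch of the "enumerate by index until a sentinel error" pattern.
// `fetch` is hypothetical; DXGI_ERROR_NOT_FOUND plays the role of `not_found`.
fn collect_all<T, E: PartialEq>(
    fetch: impl Fn(u32) -> Result<T, E>,
    not_found: E,
) -> Result<Vec<T>, E> {
    (0u32..)
        .map(|i| fetch(i))
        // Stop at the sentinel error without yielding it; any other Err
        // passes through and aborts the collect below.
        .take_while(|r| !matches!(r, Err(e) if *e == not_found))
        .collect()
}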
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::macros::tests::assert_debug_fmt_eq;
-    use pretty_assertions::assert_eq;
-    use test_util::OPEN_JTALK_DIC_DIR;
-
-    #[rstest]
-    fn finalize_works() {
-        let internal = VoicevoxCore::new_with_mutex();
-        let result = internal
-            .lock()
-            .unwrap()
-            .initialize(InitializeOptions::default());
-        assert_debug_fmt_eq!(Ok(()), result);
-        internal.lock().unwrap().finalize();
-        assert_eq!(
-            false,
-            internal
-                .lock()
-                .unwrap()
-                .synthesis_engine
-                .inference_core()
-                .initialized
-        );
-        assert_eq!(
-            true,
-            internal
-                .lock()
-                .unwrap()
-                .synthesis_engine
-                .inference_core()
-                .status_option
-                .is_none()
-        );
-    }
-
-    #[rstest]
-    #[case(0, Err(Error::UninitializedStatus), Ok(()))]
-    #[case(1, Err(Error::UninitializedStatus), Ok(()))]
-    #[case(999, Err(Error::UninitializedStatus), Err(Error::InvalidSpeakerId{speaker_id:999}))]
-    fn load_model_works(
-        #[case] speaker_id: u32,
-        #[case] expected_result_at_uninitialized: Result<()>,
-        #[case] expected_result_at_initialized: Result<()>,
-    ) {
-        let internal = VoicevoxCore::new_with_mutex();
-        let result = internal.lock().unwrap().load_model(speaker_id);
-        assert_debug_fmt_eq!(expected_result_at_uninitialized, result);
-
-        internal
-            .lock()
-            .unwrap()
-            .initialize(InitializeOptions {
-                acceleration_mode: AccelerationMode::Cpu,
-                ..Default::default()
-            })
-            .unwrap();
-        let result = internal.lock().unwrap().load_model(speaker_id);
-        assert_debug_fmt_eq!(
-            expected_result_at_initialized,
-            result,
-            "got load_model result",
-        );
-    }
-
-    #[rstest]
-    fn is_use_gpu_works() {
-        let internal = VoicevoxCore::new_with_mutex();
-        assert_eq!(false, internal.lock().unwrap().is_gpu_mode());
-        internal
-            .lock()
-            .unwrap()
-            .initialize(InitializeOptions {
-                acceleration_mode: AccelerationMode::Cpu,
-                ..Default::default()
-            })
-            .unwrap();
-        assert_eq!(false, internal.lock().unwrap().is_gpu_mode());
-    }
-
-    #[rstest]
-    #[case(0, true)]
-    #[case(1, true)]
-    #[case(999, false)]
-    fn is_model_loaded_works(#[case] speaker_id: u32, #[case] expected: bool) {
-        let internal = VoicevoxCore::new_with_mutex();
-        assert!(
-            !internal.lock().unwrap().is_model_loaded(speaker_id),
-            "expected is_model_loaded to return false, but got true",
-        );
-
-        internal
-            .lock()
-            .unwrap()
-            .initialize(InitializeOptions {
-                acceleration_mode: AccelerationMode::Cpu,
-                ..Default::default()
-            })
-            .unwrap();
-        assert!(
-            !internal.lock().unwrap().is_model_loaded(speaker_id),
-            "expected is_model_loaded to return false, but got true",
-        );
-
-        internal
-            .lock()
-            .unwrap()
-            .load_model(speaker_id)
-            .unwrap_or(());
-        assert_eq!(
-            internal.lock().unwrap().is_model_loaded(speaker_id),
-            expected,
-            "expected is_model_loaded return value against speaker_id `{}` is `{}`, but got `{}`",
-            speaker_id,
-            expected,
-            !expected
-        );
-    }
-
-    #[rstest]
-    fn supported_devices_works() {
-        let internal = VoicevoxCore::new_with_mutex();
-        let cstr_result = internal.lock().unwrap().get_supported_devices_json();
-        assert!(cstr_result.to_str().is_ok(), "{cstr_result:?}");
-
-        let json_result: std::result::Result<SupportedDevices, _> =
-            serde_json::from_str(cstr_result.to_str().unwrap());
-        assert!(json_result.is_ok(), "{json_result:?}");
-    }
-
-    #[rstest]
-    #[case(0, Some((0,0)))]
-    #[case(1, Some((0,1)))]
-    #[case(999, None)]
-    fn get_model_index_and_speaker_id_works(
-        #[case] speaker_id: u32,
-        #[case] expected: Option<(usize, u32)>,
-    ) {
-        let actual = get_model_index_and_speaker_id(speaker_id);
-        assert_eq!(expected, actual);
-    }
-
-    #[rstest]
-    fn predict_duration_works() {
-        let internal = VoicevoxCore::new_with_mutex();
-        internal
-            .lock()
-            .unwrap()
-            .initialize(InitializeOptions {
-                load_all_models: true,
-                acceleration_mode: AccelerationMode::Cpu,
-                ..Default::default()
-            })
-            .unwrap();
-
-        // 「こんにちは、音声合成の世界へようこそ」という文章を変換して得た phoneme_vector
-        let phoneme_vector = [
-            0, 23, 30, 4, 28, 21, 10, 21, 42, 7, 0, 30, 4, 35, 14, 14, 16, 30, 30, 35, 14, 14, 28,
-            30, 35, 14, 23, 7, 21, 14, 43, 30, 30, 23, 30, 35, 30, 0,
-        ];
-
-        let result = internal
-            .lock()
-            .unwrap()
-            .predict_duration(&phoneme_vector, 0);
-
-        assert!(result.is_ok(), "{result:?}");
-        assert_eq!(result.unwrap().len(), phoneme_vector.len());
-    }
-
-    #[rstest]
-    fn predict_intonation_works() {
-        let internal = VoicevoxCore::new_with_mutex();
-        internal
-            .lock()
-            .unwrap()
-            .initialize(InitializeOptions {
-                load_all_models: true,
-                acceleration_mode: AccelerationMode::Cpu,
-                ..Default::default()
-            })
-            .unwrap();
-
-        // 「テスト」という文章に対応する入力
-        let vowel_phoneme_vector = [0, 14, 6, 30, 0];
-        let consonant_phoneme_vector = [-1, 37, 35, 37, -1];
-        let start_accent_vector = [0, 1, 0, 0, 0];
-        let end_accent_vector = [0, 1, 0, 0, 0];
-        let start_accent_phrase_vector = [0, 1, 0, 0, 0];
-        let end_accent_phrase_vector = [0, 0, 0, 1, 0];
-
-        let result = internal.lock().unwrap().predict_intonation(
-            vowel_phoneme_vector.len(),
-            &vowel_phoneme_vector,
-            &consonant_phoneme_vector,
-            &start_accent_vector,
-            &end_accent_vector,
-            &start_accent_phrase_vector,
-            &end_accent_phrase_vector,
-            0,
-        );
-
-        assert!(result.is_ok(), "{result:?}");
-        assert_eq!(result.unwrap().len(), vowel_phoneme_vector.len());
-    }
-
-    #[rstest]
-    fn decode_works() {
-        let internal = VoicevoxCore::new_with_mutex();
-        internal
-            .lock()
-            .unwrap()
-            .initialize(InitializeOptions {
-                acceleration_mode: AccelerationMode::Cpu,
-                load_all_models: true,
-                ..Default::default()
-            })
-            .unwrap();
-
-        // 「テスト」という文章に対応する入力
-        const F0_LENGTH: usize = 69;
-        let mut f0 = [0.; F0_LENGTH];
-        f0[9..24].fill(5.905218);
-        f0[37..60].fill(5.565851);
-
-        const PHONEME_SIZE: usize = 45;
-        let mut phoneme = [0.; PHONEME_SIZE * F0_LENGTH];
-        let mut set_one = |index, range| {
-            for i in range {
-                phoneme[i * PHONEME_SIZE + index] = 1.;
-            }
-        };
-        set_one(0, 0..9);
-        set_one(37, 9..13);
-        set_one(14, 13..24);
-        set_one(35, 24..30);
-        set_one(6, 30..37);
-        set_one(37, 37..45);
-        set_one(30, 45..60);
-        set_one(0, 60..69);
-
-        let result = internal
-            .lock()
-            .unwrap()
-            .decode(F0_LENGTH, PHONEME_SIZE, &f0, &phoneme, 0);
-
-        assert!(result.is_ok(), "{result:?}");
-        assert_eq!(result.unwrap().len(), F0_LENGTH * 256);
-    }
-
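`decode_works` builds its phoneme input as a flattened, row-major [length, phoneme_size] one-hot matrix: frame `i` with active class `j` sets index `i * PHONEME_SIZE + j`. A small generalized sketch of that layout:

// Sketch of the one-hot layout assembled by decode_works above:
// one active phoneme class per frame, flattened row-major.
fn one_hot(length: usize, phoneme_size: usize, classes: &[usize]) -> Vec<f32> {
    assert_eq!(classes.len(), length);
    let mut m = vec![0.0; length * phoneme_size];
    for (frame, &class) in classes.iter().enumerate() {
        m[frame * phoneme_size + class] = 1.0;
    }
    m
}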
-    type TextConsonantVowelData =
-        [(&'static [(&'static str, &'static str, &'static str)], usize)];
-
-    // [([(テキスト, 子音, 母音), ...], アクセントの位置), ...] の形式
-    const TEXT_CONSONANT_VOWEL_DATA1: &TextConsonantVowelData = &[
-        (&[("コ", "k", "o"), ("レ", "r", "e"), ("ワ", "w", "a")], 3),
-        (
-            &[
-                ("テ", "t", "e"),
-                ("ス", "s", "U"),
-                ("ト", "t", "o"),
-                ("デ", "d", "e"),
-                ("ス", "s", "U"),
-            ],
-            1,
-        ),
-    ];
-
-    const TEXT_CONSONANT_VOWEL_DATA2: &TextConsonantVowelData = &[
-        (&[("コ", "k", "o"), ("レ", "r", "e"), ("ワ", "w", "a")], 1),
-        (
-            &[
-                ("テ", "t", "e"),
-                ("ス", "s", "U"),
-                ("ト", "t", "o"),
-                ("デ", "d", "e"),
-                ("ス", "s", "U"),
-            ],
-            3,
-        ),
-    ];
-
-    #[rstest]
-    #[case(
-        "これはテストです",
-        false,
-        TEXT_CONSONANT_VOWEL_DATA1,
-        "コレワ'/テ'_ストデ_ス"
-    )]
-    #[case(
-        "コ'レワ/テ_スト'デ_ス",
-        true,
-        TEXT_CONSONANT_VOWEL_DATA2,
-        "コ'レワ/テ_スト'デ_ス"
-    )]
-    fn audio_query_works(
-        #[case] input_text: &str,
-        #[case] input_kana_option: bool,
-        #[case] expected_text_consonant_vowel_data: &TextConsonantVowelData,
-        #[case] expected_kana_text: &str,
-    ) {
-        let core = VoicevoxCore::new_with_mutex();
-        core.lock()
-            .unwrap()
-            .initialize(InitializeOptions {
-                acceleration_mode: AccelerationMode::Cpu,
-                load_all_models: true,
-                open_jtalk_dict_dir: Some(OPEN_JTALK_DIC_DIR.into()),
-                ..Default::default()
-            })
-            .unwrap();
-
-        let query = core
-            .lock()
-            .unwrap()
-            .audio_query(
-                input_text,
-                0,
-                AudioQueryOptions {
-                    kana: input_kana_option,
-                },
-            )
-            .unwrap();
-
-        assert_eq!(
-            query.accent_phrases().len(),
-            expected_text_consonant_vowel_data.len()
-        );
-
-        for (accent_phrase, (text_consonant_vowel_slice, accent_pos)) in
-            std::iter::zip(query.accent_phrases(), expected_text_consonant_vowel_data)
-        {
-            assert_eq!(
-                accent_phrase.moras().len(),
-                text_consonant_vowel_slice.len()
-            );
-            assert_eq!(accent_phrase.accent(), accent_pos);
-
-            for (mora, (text, consonant, vowel)) in
-                std::iter::zip(accent_phrase.moras(), *text_consonant_vowel_slice)
-            {
-                assert_eq!(mora.text(), text);
-                // NOTE: 子音の長さが必ず非ゼロになるテストケースを想定している
-                assert_ne!(
-                    mora.consonant_length(),
-                    &Some(0.),
-                    "expected mora.consonant_length is not Some(0.0), but got Some(0.0)."
-                );
-                assert_eq!(mora.consonant(), &Some(consonant.to_string()));
-                assert_eq!(mora.vowel(), vowel);
-                // NOTE: 母音の長さが必ず非ゼロになるテストケースを想定している
-                assert_ne!(
-                    mora.vowel_length(),
-                    &0.,
-                    "expected mora.vowel_length is not 0.0, but got 0.0."
-                );
-            }
-        }
-
-        assert_eq!(query.kana().as_deref(), Some(expected_kana_text));
-    }
-
-    #[rstest]
-    #[case("これはテストです", false, TEXT_CONSONANT_VOWEL_DATA1)]
-    #[case("コ'レワ/テ_スト'デ_ス", true, TEXT_CONSONANT_VOWEL_DATA2)]
-    fn accent_phrases_works(
-        #[case] input_text: &str,
-        #[case] input_kana_option: bool,
-        #[case] expected_text_consonant_vowel_data: &TextConsonantVowelData,
-    ) {
-        let core = VoicevoxCore::new_with_mutex();
-        core.lock()
-            .unwrap()
-            .initialize(InitializeOptions {
-                acceleration_mode: AccelerationMode::Cpu,
-                load_all_models: true,
-                open_jtalk_dict_dir: Some(OPEN_JTALK_DIC_DIR.into()),
-                ..Default::default()
-            })
-            .unwrap();
-
-        let accent_phrases = core
-            .lock()
-            .unwrap()
-            .accent_phrases(
-                input_text,
-                0,
-                AccentPhrasesOptions {
-                    kana: input_kana_option,
-                },
-            )
-            .unwrap();
-
-        assert_eq!(
-            accent_phrases.len(),
-            expected_text_consonant_vowel_data.len()
-        );
-
-        for (accent_phrase, (text_consonant_vowel_slice, accent_pos)) in
-            std::iter::zip(accent_phrases, expected_text_consonant_vowel_data)
-        {
-            assert_eq!(
-                accent_phrase.moras().len(),
-                text_consonant_vowel_slice.len()
-            );
-            assert_eq!(accent_phrase.accent(), accent_pos);
-
-            for (mora, (text, consonant, vowel)) in
-                std::iter::zip(accent_phrase.moras(), *text_consonant_vowel_slice)
-            {
-                assert_eq!(mora.text(), text);
-                // NOTE: 子音の長さが必ず非ゼロになるテストケースを想定している
-                assert_ne!(
-                    mora.consonant_length(),
-                    &Some(0.),
-                    "expected mora.consonant_length is not Some(0.0), but got Some(0.0)."
-                );
-                assert_eq!(mora.consonant(), &Some(consonant.to_string()));
-                assert_eq!(mora.vowel(), vowel);
-                // NOTE: 母音の長さが必ず非ゼロになるテストケースを想定している
-                assert_ne!(
-                    mora.vowel_length(),
-                    &0.,
-                    "expected mora.vowel_length is not 0.0, but got 0.0."
-                );
-            }
-        }
-    }
-
-    #[rstest]
-    fn mora_length_works() {
-        let core = VoicevoxCore::new_with_mutex();
-        core.lock()
-            .unwrap()
-            .initialize(InitializeOptions {
-                acceleration_mode: AccelerationMode::Cpu,
-                load_all_models: true,
-                open_jtalk_dict_dir: Some(OPEN_JTALK_DIC_DIR.into()),
-                ..Default::default()
-            })
-            .unwrap();
-
-        let accent_phrases = core
-            .lock()
-            .unwrap()
-            .accent_phrases("これはテストです", 0, AccentPhrasesOptions { kana: false })
-            .unwrap();
-
-        let modified_accent_phrases = core
-            .lock()
-            .unwrap()
-            .mora_length(1, &accent_phrases)
-            .unwrap();
-
-        // NOTE: 一つでも母音の長さが変わっていれば、動作しているとみなす
-        assert!(
-            any_mora_param_changed(
-                &accent_phrases,
-                &modified_accent_phrases,
-                MoraModel::vowel_length
-            ),
-            "mora_length() does not work: mora.vowel_length() is not changed."
-        );
-    }
-
-    #[rstest]
-    fn mora_pitch_works() {
-        let core = VoicevoxCore::new_with_mutex();
-        core.lock()
-            .unwrap()
-            .initialize(InitializeOptions {
-                acceleration_mode: AccelerationMode::Cpu,
-                load_all_models: true,
-                open_jtalk_dict_dir: Some(OPEN_JTALK_DIC_DIR.into()),
-                ..Default::default()
-            })
-            .unwrap();
-
-        let accent_phrases = core
-            .lock()
-            .unwrap()
-            .accent_phrases("これはテストです", 0, AccentPhrasesOptions { kana: false })
-            .unwrap();
-
-        let modified_accent_phrases = core.lock().unwrap().mora_pitch(1, &accent_phrases).unwrap();
-
-        // NOTE: 一つでも音高が変わっていれば、動作しているとみなす
-        assert!(
-            any_mora_param_changed(&accent_phrases, &modified_accent_phrases, MoraModel::pitch),
-            "mora_pitch() does not work: mora.pitch() is not changed."
-        );
-    }
-
-    #[rstest]
-    fn mora_data_works() {
-        let core = VoicevoxCore::new_with_mutex();
-        core.lock()
-            .unwrap()
-            .initialize(InitializeOptions {
-                acceleration_mode: AccelerationMode::Cpu,
-                load_all_models: true,
-                open_jtalk_dict_dir: Some(OPEN_JTALK_DIC_DIR.into()),
-                ..Default::default()
-            })
-            .unwrap();
-
-        let accent_phrases = core
-            .lock()
-            .unwrap()
-            .accent_phrases("これはテストです", 0, AccentPhrasesOptions { kana: false })
-            .unwrap();
-
-        let modified_accent_phrases = core.lock().unwrap().mora_data(1, &accent_phrases).unwrap();
-
-        // NOTE: 一つでも音高が変わっていれば、動作しているとみなす
-        assert!(
-            any_mora_param_changed(&accent_phrases, &modified_accent_phrases, MoraModel::pitch),
-            "mora_data() does not work: mora.pitch() is not changed."
-        );
-        // NOTE: 一つでも母音の長さが変わっていれば、動作しているとみなす
-        assert!(
-            any_mora_param_changed(
-                &accent_phrases,
-                &modified_accent_phrases,
-                MoraModel::vowel_length
-            ),
-            "mora_data() does not work: mora.vowel_length() is not changed."
-        );
-    }
-
-    fn any_mora_param_changed<T: PartialEq>(
-        before: &[AccentPhraseModel],
-        after: &[AccentPhraseModel],
-        param: fn(&MoraModel) -> &T,
-    ) -> bool {
-        std::iter::zip(before, after)
-            .flat_map(move |(before, after)| std::iter::zip(before.moras(), after.moras()))
-            .any(|(before, after)| param(before) != param(after))
-    }
-
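The helper above works because method references coerce to plain `fn(&MoraModel) -> &T` pointers, so each test can pass an accessor for the field it cares about. A toy illustration of the same pattern (the `Mora` type here is hypothetical, not the crate's `MoraModel`):

// Toy version of the zip-and-compare pattern used by any_mora_param_changed.
struct Mora {
    pitch: f32,
}

impl Mora {
    fn pitch(&self) -> &f32 {
        &self.pitch
    }
}

fn main() {
    let before = [Mora { pitch: 5.5 }];
    let after = [Mora { pitch: 5.9 }];
    // A method reference coerces to `fn(&Mora) -> &f32`.
    let param: fn(&Mora) -> &f32 = Mora::pitch;
    let changed = std::iter::zip(&before, &after).any(|(b, a)| param(b) != param(a));
    assert!(changed);
}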
-    #[rstest]
-    fn get_version_works() {
-        assert_eq!("0.0.0", VoicevoxCore::get_version());
-    }
-}
diff --git a/crates/voicevox_core/src/result_code.rs b/crates/voicevox_core/src/result_code.rs
index 45816e468..aac908295 100644
--- a/crates/voicevox_core/src/result_code.rs
+++ b/crates/voicevox_core/src/result_code.rs
@@ -17,22 +17,65 @@ pub enum VoicevoxResultCode {
     VOICEVOX_RESULT_GPU_SUPPORT_ERROR = 4,
     /// メタ情報読み込みに失敗した
     VOICEVOX_RESULT_LOAD_METAS_ERROR = 5,
-    /// ステータスが初期化されていない
-    VOICEVOX_RESULT_UNINITIALIZED_STATUS_ERROR = 6,
-    /// 無効なspeaker_idが指定された
-    VOICEVOX_RESULT_INVALID_SPEAKER_ID_ERROR = 7,
+    /// 無効なstyle_idが指定された
+    VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR = 6,
     /// 無効なmodel_indexが指定された
-    VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR = 8,
+    VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR = 7,
     /// 推論に失敗した
-    VOICEVOX_RESULT_INFERENCE_ERROR = 9,
+    VOICEVOX_RESULT_INFERENCE_ERROR = 8,
     /// コンテキストラベル出力に失敗した
-    VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR = 10,
+    VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR = 11,
     /// 無効なutf8文字列が入力された
-    VOICEVOX_RESULT_INVALID_UTF8_INPUT_ERROR = 11,
+    VOICEVOX_RESULT_INVALID_UTF8_INPUT_ERROR = 12,
     /// aquestalk形式のテキストの解析に失敗した
-    VOICEVOX_RESULT_PARSE_KANA_ERROR = 12,
+    VOICEVOX_RESULT_PARSE_KANA_ERROR = 13,
     /// 無効なAudioQuery
-    VOICEVOX_RESULT_INVALID_AUDIO_QUERY_ERROR = 13,
+    VOICEVOX_RESULT_INVALID_AUDIO_QUERY_ERROR = 14,
     /// 無効なAccentPhrase
-    VOICEVOX_RESULT_INVALID_ACCENT_PHRASE_ERROR = 14,
+    VOICEVOX_RESULT_INVALID_ACCENT_PHRASE_ERROR = 15,
+    /// ファイルオープンエラー
+    VOICEVOX_OPEN_FILE_ERROR = 16,
+    /// Modelを読み込めなかった
+    VOICEVOX_VVM_MODEL_READ_ERROR = 17,
+    /// すでに読み込まれているModelを読み込もうとした
+    VOICEVOX_ALREADY_LOADED_MODEL_ERROR = 18,
+    /// Modelが読み込まれていない
+    VOICEVOX_UNLOADED_MODEL_ERROR = 19,
+}
+
+pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static str {
+    // C APIのため、messageには必ず末尾にNULL文字を追加する
+    use VoicevoxResultCode::*;
+    match result_code {
+        VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT_ERROR => {
+            "OpenJTalkの辞書が読み込まれていません\0"
+        }
+        VOICEVOX_RESULT_LOAD_MODEL_ERROR => "modelデータ読み込みに失敗しました\0",
+        VOICEVOX_RESULT_LOAD_METAS_ERROR => "メタデータ読み込みに失敗しました\0",
+
+        VOICEVOX_RESULT_GPU_SUPPORT_ERROR => "GPU機能をサポートすることができません\0",
+        VOICEVOX_RESULT_GET_SUPPORTED_DEVICES_ERROR => {
+            "サポートされているデバイス情報取得中にエラーが発生しました\0"
+        }
+
+        VOICEVOX_RESULT_OK => "エラーが発生しませんでした\0",
+        VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR => "無効なspeaker_idです\0",
+        VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR => "無効なmodel_indexです\0",
+        VOICEVOX_RESULT_INFERENCE_ERROR => "推論に失敗しました\0",
+        VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR => {
+            "入力テキストからのフルコンテキストラベル抽出に失敗しました\0"
+        }
+        VOICEVOX_RESULT_INVALID_UTF8_INPUT_ERROR => "入力テキストが無効なUTF-8データでした\0",
+        VOICEVOX_RESULT_PARSE_KANA_ERROR => {
+            "入力テキストをAquesTalkライクな読み仮名としてパースすることに失敗しました\0"
+        }
+        VOICEVOX_RESULT_INVALID_AUDIO_QUERY_ERROR => "無効なaudio_queryです\0",
+        VOICEVOX_RESULT_INVALID_ACCENT_PHRASE_ERROR => "無効なaccent_phraseです\0",
+        VOICEVOX_OPEN_FILE_ERROR => "ファイルオープンに失敗しました\0",
+        VOICEVOX_VVM_MODEL_READ_ERROR => "Modelを読み込めませんでした\0",
+        VOICEVOX_ALREADY_LOADED_MODEL_ERROR => {
+            "すでに読み込まれているModelを読み込もうとしました\0"
+        }
+        VOICEVOX_UNLOADED_MODEL_ERROR => "Modelが読み込まれていません\0",
+    }
+}
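With the codes renumbered, a C API shim still needs to translate the crate's internal error type into a `VoicevoxResultCode`. A hedged sketch of that mapping; the `Error` variant names here are assumed from this diff, and the catch-all arm is for brevity only:

// Sketch of an internal-error-to-result-code mapping (not the crate's
// verbatim implementation; variant names assumed from the surrounding diff).
fn to_result_code(err: &Error) -> VoicevoxResultCode {
    use VoicevoxResultCode::*;
    match err {
        Error::NotLoadedOpenjtalkDict => VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT_ERROR,
        Error::GpuSupport => VOICEVOX_RESULT_GPU_SUPPORT_ERROR,
        Error::InvalidStyleId { .. } => VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR,
        _ => VOICEVOX_RESULT_INFERENCE_ERROR, // catch-all for brevity
    }
}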
diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs
index 4c4704bd2..68f839a6b 100644
--- a/crates/voicevox_core/src/status.rs
+++ b/crates/voicevox_core/src/status.rs
@@ -1,16 +1,12 @@
 use super::*;
-use anyhow::Context as _;
 use once_cell::sync::Lazy;
 use onnxruntime::{
     environment::Environment,
     session::{AnyArray, Session},
     GraphOptimizationLevel, LoggingLevel,
 };
-use serde::{Deserialize, Serialize};
-use std::{
-    env,
-    path::{Path, PathBuf},
-};
+use std::sync::Mutex;
+use std::{env, path::Path};
 use tracing::error;
 
 mod model_file;
@@ -20,27 +16,21 @@ cfg_if! {
         use onnxruntime::CudaProviderOptions;
     }
 }
-use std::collections::{BTreeMap, BTreeSet};
-
-pub(crate) static MODEL_FILE_SET: Lazy<ModelFileSet> = Lazy::new(|| {
-    let result = ModelFileSet::new();
-    if let Err(err) = &result {
-        error!("ファイルを読み込めなかったためクラッシュします: {err}");
-    }
-    result.unwrap()
-});
+use std::collections::BTreeMap;
 
 pub struct Status {
     models: StatusModels,
+    merged_metas: VoiceModelMeta,
     light_session_options: SessionOptions, // 軽いモデルはこちらを使う
     heavy_session_options: SessionOptions, // 重いモデルはこちらを使う
-    supported_styles: BTreeSet<u32>,
+    id_relations: BTreeMap<StyleId, VoiceModelId>,
 }
 
 struct StatusModels {
-    predict_duration: BTreeMap<usize, Session<'static>>,
-    predict_intonation: BTreeMap<usize, Session<'static>>,
-    decode: BTreeMap<usize, Session<'static>>,
+    metas: BTreeMap<VoiceModelId, VoiceModelMeta>,
+    predict_duration: BTreeMap<VoiceModelId, Mutex<Session<'static>>>,
+    predict_intonation: BTreeMap<VoiceModelId, Mutex<Session<'static>>>,
+    decode: BTreeMap<VoiceModelId, Mutex<Session<'static>>>,
 }
 
 #[derive(new, Getters)]
@@ -49,107 +39,10 @@ struct SessionOptions {
     use_gpu: bool,
 }
 
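Note the new `use std::sync::Mutex` paired with the `Mutex<Session<'static>>` values: wrapping each ONNX session in its own lock lets callers share a `&Status` across threads while still getting the exclusive access a session run needs. A minimal sketch of the access pattern this layout enables; `Session` and `run` are hypothetical stand-ins here, not the onnxruntime API:

use std::collections::BTreeMap;
use std::sync::Mutex;

// Hypothetical stand-ins for the real session type.
struct Session;
impl Session {
    fn run(&mut self, _inputs: &[f32]) -> Vec<f32> {
        vec![]
    }
}

struct Models {
    decode: BTreeMap<String, Mutex<Session>>,
}

impl Models {
    // &self suffices: the per-session Mutex supplies the mutable access
    // that a run requires, one model at a time.
    fn decode(&self, model_id: &str, inputs: &[f32]) -> Option<Vec<f32>> {
        let mut session = self.decode.get(model_id)?.lock().ok()?;
        Some(session.run(inputs))
    }
}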
-pub(crate) struct ModelFileSet {
-    pub(crate) speaker_id_map: BTreeMap<u32, (usize, u32)>,
-    pub(crate) metas_str: String,
-    models: Vec<Model>,
-}
-
-impl ModelFileSet {
-    fn new() -> anyhow::Result<Self> {
-        let path = {
-            let root_dir = if cfg!(test) {
-                Path::new(env!("CARGO_WORKSPACE_DIR")).join("model")
-            } else if let Some(root_dir) = env::var_os(ROOT_DIR_ENV_NAME) {
-                root_dir.into()
-            } else {
-                process_path::get_dylib_path()
-                    .or_else(process_path::get_executable_path)
-                    .with_context(|| "Could not get the current dynamic library/executable path")?
-                    .parent()
-                    .unwrap_or_else(|| "".as_ref())
-                    .join("model")
-            };
-
-            move |rel_path| root_dir.join(rel_path)
-        };
-
-        let metas_str = fs_err::read_to_string(path("metas.json"))?;
-
-        let models = model_file::MODEL_FILE_NAMES
-            .iter()
-            .map(
-                |&ModelFileNames {
-                     predict_duration_model,
-                     predict_intonation_model,
-                     decode_model,
-                 }| {
-                    let predict_duration_model = ModelFile::new(&path(predict_duration_model))?;
-                    let predict_intonation_model = ModelFile::new(&path(predict_intonation_model))?;
-                    let decode_model = ModelFile::new(&path(decode_model))?;
-                    Ok(Model {
-                        predict_duration_model,
-                        predict_intonation_model,
-                        decode_model,
-                    })
-                },
-            )
-            .collect::<anyhow::Result<_>>()?;
-
-        return Ok(Self {
-            speaker_id_map: model_file::SPEAKER_ID_MAP.iter().copied().collect(),
-            metas_str,
-            models,
-        });
-
-        const ROOT_DIR_ENV_NAME: &str = "VV_MODELS_ROOT_DIR";
-    }
-
-    pub(crate) fn models_count(&self) -> usize {
-        self.models.len()
-    }
-}
-
-struct ModelFileNames {
-    predict_duration_model: &'static str,
-    predict_intonation_model: &'static str,
-    decode_model: &'static str,
-}
-
 #[derive(thiserror::Error, Debug)]
 #[error("不正なモデルファイルです")]
 struct DecryptModelError;
 
-struct Model {
-    predict_duration_model: ModelFile,
-    predict_intonation_model: ModelFile,
-    decode_model: ModelFile,
-}
-
-struct ModelFile {
-    path: PathBuf,
-    content: Vec<u8>,
-}
-
-impl ModelFile {
-    fn new(path: &Path) -> anyhow::Result<Self> {
-        let content = fs_err::read(path)?;
-        Ok(Self {
-            path: path.to_owned(),
-            content,
-        })
-    }
-}
-
-#[derive(Deserialize, Getters)]
-struct Meta {
-    styles: Vec