-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape2Wav.php
73 lines (62 loc) · 2.52 KB
/
scrape2Wav.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
<?php
require_once 'vendor/autoload.php';
require_once 'config/pages.php';
use Goutte\Client;
// Voice model and dictionary passed to open_jtalk via -m and -x.
$voicePath = "/usr/share/hts-voice/mei";
$voiceType = "mei_normal.htsvoice";
$dictPath = "/var/lib/mecab/dic/open-jtalk/naist-jdic";
// Output directories for the intermediate text files and the generated WAV files.
$destTxtPath = './dest/txt';
$destWavPath = './dest/wav';
// Upper bound, in bytes, for the text written to a single open_jtalk input file.
$maxBuffer = 1024;
// 1-based running page number, used as a prefix for the output file names.
$pageCount = 0;

// Create the output directories up front so the writes below do not fail on a missing path.
foreach ([$destTxtPath, $destWavPath] as $destDir) {
    if (!is_dir($destDir) && !mkdir($destDir, 0777, true) && !is_dir($destDir)) {
        throw new \RuntimeException("Unable to create output directory: {$destDir}");
    }
}

// $pages is not defined in this block; presumably it is the list of manual page
// names provided by config/pages.php -- confirm against that file.
foreach ($pages as $pageName) {
    $pageCount++;
    // Start every page with an empty list so a page that matches no selector
    // does not leave $news undefined for the conversion step below.
    $news = [];
    $client = new Client();
    try {
        $crawler = $client->request('GET', "http://php.net/manual/ja/{$pageName}.php");
        // Collect the text fragments that will be converted to speech.
        // Each lookup is best-effort: a selector that raises
        // \InvalidArgumentException (presumably because nothing matched) is skipped.
        try {
            $news[] = $crawler->filter('h2.title')->first()->text();
        } catch (\InvalidArgumentException $e) {
            // No usable title on this page; continue with the remaining selectors.
        }
        try {
            $news[] = $crawler->filter('p.simpara')->first()->text();
        } catch (\InvalidArgumentException $e) {
            // No usable summary paragraph on this page.
        }
        try {
            $crawler->filter('p.para')->each(function ($element) use (&$news) {
                $news[] = $element->text();
            });
        } catch (\InvalidArgumentException $e) {
            // No usable body paragraphs on this page.
        }
    } catch (\Exception $e) {
        echo "pageName: {$pageName} スクレイピング処理でエラーが発生しました。";
        throw $e;
    }
    try {
        // Pack the fragments into output units that stay below $maxBuffer bytes.
        $sentences = "";
        $outputUnits = [];
        foreach ($news as $sentence) {
            // Normalise first (drop spaces, surrounding whitespace and newlines) so the
            // size check measures exactly the bytes that end up in the text file.
            $cleaned = str_replace("\n", "", trim(str_replace(" ", "", $sentence)));
            if ($cleaned === "") {
                continue;
            }
            // Flush only a non-empty buffer; otherwise an oversized first fragment
            // would emit an empty unit (and an empty text/WAV pair).
            // TODO: a single fragment of $maxBuffer bytes or more still becomes one
            // oversized unit; it is not split.
            if ($sentences !== "" && strlen($sentences . $cleaned) >= $maxBuffer) {
                $outputUnits[] = $sentences;
                $sentences = "";
            }
            $sentences .= $cleaned;
        }
        if ($sentences !== "") {
            $outputUnits[] = $sentences;
        }

        // Write each unit to a text file and synthesise it with open_jtalk.
        foreach ($outputUnits as $i => $outputContext) {
            $txtFile = "{$destTxtPath}/{$pageCount}_{$pageName}_{$i}.txt";
            $wavFile = "{$destWavPath}/{$pageCount}_{$pageName}_{$i}.wav";
            if (file_put_contents($txtFile, $outputContext) === false) {
                throw new \RuntimeException("Failed to write text file: {$txtFile}");
            }
            // Quote every argument: the page name is interpolated into both file paths.
            $command = sprintf(
                'open_jtalk -m %s -ow %s -x %s %s',
                escapeshellarg("{$voicePath}/{$voiceType}"),
                escapeshellarg($wavFile),
                escapeshellarg($dictPath),
                escapeshellarg($txtFile)
            );
            // exec() appends to the output array, so hand it a fresh one per call.
            $output = [];
            $exitCode = 0;
            exec($command, $output, $exitCode);
            if ($exitCode !== 0) {
                throw new \RuntimeException("open_jtalk exited with status {$exitCode} for {$txtFile}");
            }
        }
        unset($client, $crawler, $news, $outputUnits);
    } catch (\Exception $e) {
        echo "pageName: {$pageName} 音声変換処理でエラーが発生しました。";
        throw $e;
    }
}