How to get text to speech with timestamps? #83

Zaniyar · 2024-08-16T17:52:39Z

/**
 * POST /text-to-speech-timestamps (SDK-based attempt from the issue report).
 *
 * Passes `req.body.text` to `client.textToSpeech.streamWithTimestamps` with a
 * fixed voice ID, then tries to treat the result as an event-emitting stream:
 * every 'data' chunk is collected and, on 'end', the concatenation is sent
 * back as the response body.
 *
 * Any exception thrown synchronously or by the awaited call inside the `try`
 * is logged and answered with HTTP 500.
 *
 * NOTE(review): `client` and `app` are defined outside this snippet.
 */
app.post('/text-to-speech-timestamps', async (req, res) => {
    try {
      // First argument is the voice ID; the second is the request options.
      const audioStream = await client.textToSpeech.streamWithTimestamps("pMsXgVXv3BLzUgSXRplE", {
        text: req.body.text,
        optimize_streaming_latency: 0,
        output_format: 'pcm_22050',
        voice_settings: {
          stability: 0.1,
          similarity_boost: 0.3,
          style: 0.2
        }
      });
      // NOTE(review): the `**` around the next line looks like markdown bold
      // from the issue text rather than JavaScript — presumably it must be
      // removed before this snippet can run; confirm against the original post.
      // The reporter observes that the awaited value is `undefined`. In that
      // case the first `audioStream.on(...)` below throws a TypeError, which
      // the `catch` turns into the 500 response.
      **console.log(audioStream); // audioStream is always undefined ...** 

      // Collecting chunks of JSON with timestamps and audio
      const chunks = [];
      audioStream.on('data', (chunk) => {
        chunks.push(chunk);
      });
  
      // Assumes every chunk is a Buffer/Uint8Array (required by Buffer.concat)
      // — TODO confirm against the SDK's actual return type.
      // No 'error' listener is registered, and the surrounding try/catch does
      // not cover exceptions raised later inside these callbacks.
      audioStream.on('end', () => {
        const fullResponse = Buffer.concat(chunks).toString();
        console.log(fullResponse)
        res.send(fullResponse);
      });
  
    } catch (error) {
      // Reached for a rejected SDK call or a synchronous throw in the try body.
      console.error('Error generating speech with timestamps:', error);
      res.status(500).send('Failed to generate speech with timestamps');
    }
  });

How can I do the same with the WebSocket endpoint using this npm package? I need the fastest way to get the audio plus timestamps for words/phonemes.

The text was updated successfully, but these errors were encountered:

Zaniyar · 2024-08-16T19:57:53Z

/**
 * POST /text-to-speech-timestamps
 *
 * Sends `req.body.text` to the ElevenLabs
 * `/v1/text-to-speech/{voice_id}/stream/with-timestamps` endpoint and reads
 * the streamed reply as newline-delimited JSON objects. Each object may carry
 * an `audio_base64` string and an `alignment` object.
 *
 * Replies with a single JSON document:
 *   {
 *     audio: <base64 of all decoded audio chunks, concatenated in order>,
 *     characters: [...],
 *     character_start_times_seconds: [...],
 *     character_end_times_seconds: [...]
 *   }
 *
 * Replies with HTTP 500 and a plain-text message when the upstream request
 * fails or the upstream stream emits an error.
 *
 * `app` (Express application) and `axios` are expected to be in scope.
 */
app.post('/text-to-speech-timestamps', async (req, res) => {
  const VOICE_ID = "21m00Tcm4TlvDq8ikWAM";
  // Placeholder value. Replace it with a real key, ideally loaded from the
  // environment or a secret store rather than committed to source control.
  const YOUR_XI_API_KEY = "XXX";
  const url = `https://api.elevenlabs.io/v1/text-to-speech/${VOICE_ID}/stream/with-timestamps`;

  const data = {
    text: req.body.text,
    model_id: "eleven_multilingual_v2",
    voice_settings: {
      stability: 0.5,
      similarity_boost: 0.75
    }
  };

  // Single failure path shared by the request and the stream. The
  // `headersSent` guard avoids a second write if a reply already went out.
  const fail = (error) => {
    console.error('Error:', error);
    if (!res.headersSent) {
      res.status(500).send('Failed to generate speech with timestamps');
    }
  };

  try {
    const response = await axios({
      method: 'post',
      url: url,
      headers: {
        'Content-Type': 'application/json',
        'xi-api-key': YOUR_XI_API_KEY
      },
      data: data,
      responseType: 'stream'
    });

    // Decoded audio pieces are collected and concatenated once at the end,
    // instead of re-copying the whole buffer for every chunk.
    const audioChunks = [];
    const characters = [];
    const characterStartTimesSeconds = [];
    const characterEndTimesSeconds = [];
    // Holds the trailing, not-yet-terminated line between 'data' events.
    let buffer = '';

    // Appends `items` to `target` when `items` is an array; ignores anything else.
    const appendAll = (target, items) => {
      if (Array.isArray(items)) {
        for (const item of items) {
          target.push(item);
        }
      }
    };

    // Parses one line of the stream and folds its payload into the accumulators.
    const consumeLine = (line) => {
      const jsonString = line.trim();
      if (jsonString === '') {
        return; // blank separator line
      }

      let responseDict;
      try {
        responseDict = JSON.parse(jsonString);
      } catch (e) {
        console.error('JSON parsing error:', e);
        return;
      }
      if (responseDict === null || typeof responseDict !== 'object') {
        return;
      }

      // Objects without audio are tolerated so their alignment is still kept.
      if (typeof responseDict.audio_base64 === 'string') {
        audioChunks.push(Buffer.from(responseDict.audio_base64, 'base64'));
      }

      const alignment = responseDict.alignment;
      if (alignment) {
        appendAll(characters, alignment.characters);
        appendAll(characterStartTimesSeconds, alignment.character_start_times_seconds);
        appendAll(characterEndTimesSeconds, alignment.character_end_times_seconds);
      }
    };

    const stream = response.data;
    // Let the stream do the UTF-8 decoding so a multi-byte character that is
    // split across two network chunks is reassembled instead of corrupted.
    stream.setEncoding('utf8');

    stream.on('data', (chunk) => {
      buffer += chunk;

      // A chunk may contain zero, one or several complete lines. Everything
      // after the last newline is an incomplete line kept for the next event.
      const lines = buffer.split('\n');
      buffer = lines.pop();
      lines.forEach(consumeLine);
    });

    stream.on('end', () => {
      // The final object may arrive without a trailing newline.
      consumeLine(buffer);
      buffer = '';

      res.json({
        audio: Buffer.concat(audioChunks).toString('base64'),
        characters: characters,
        character_start_times_seconds: characterStartTimesSeconds,
        character_end_times_seconds: characterEndTimesSeconds
      });
    });

    // Without this listener a mid-stream failure is an unhandled 'error'
    // event and the client never receives a reply.
    stream.on('error', fail);

  } catch (error) {
    fail(error);
  }
});

OK, it's now working with a plain HTTP request. Are there any WebSocket examples?

rayfarer · 2024-09-04T15:35:53Z

Just wanted to comment and thank you, @Zaniyar, for providing a working example of timestamps in JS.

louisjoecodes · 2024-10-12T10:20:26Z

Thanks, @Zaniyar, for the example. Here's another one that uses the WebSocket endpoint to get audio with word-alignment data: elevenlabs-websockets-demo.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

How to get text to speech with timestamps? #83

How to get text to speech with timestamps? #83

Zaniyar commented Aug 16, 2024

Zaniyar commented Aug 16, 2024

rayfarer commented Sep 4, 2024

louisjoecodes commented Oct 12, 2024

How to get text to speech with timestamps? #83

How to get text to speech with timestamps? #83

Comments

Zaniyar commented Aug 16, 2024

Zaniyar commented Aug 16, 2024

rayfarer commented Sep 4, 2024

louisjoecodes commented Oct 12, 2024