From 445542eb92d5d46439e612d3db0789ba5cbf97b0 Mon Sep 17 00:00:00 2001
From: Neko-Life <nekolife123579@gmail.com>
Date: Fri, 18 Oct 2024 00:20:58 +0700
Subject: [PATCH] feat: implement sending stop frames on pause/stop, TODO: make
 voice receive smooth while sending audio

---
 include/dpp/discordvoiceclient.h       | 35 +++++++++++++++++++++++---
 src/dpp/discordvoiceclient.cpp         | 18 ++++++++++++-
 src/dpp/voice/enabled/courier_loop.cpp | 16 +++++++++++-
 src/dpp/voice/enabled/opus.cpp         |  8 +++---
 src/dpp/voice/enabled/read_write.cpp   | 17 +++++++------
 src/dpp/voice/enabled/write_ready.cpp  |  9 ++++++-
 6 files changed, 86 insertions(+), 17 deletions(-)

diff --git a/include/dpp/discordvoiceclient.h b/include/dpp/discordvoiceclient.h
index 658ba6422a..3a5f457938 100644
--- a/include/dpp/discordvoiceclient.h
+++ b/include/dpp/discordvoiceclient.h
@@ -240,6 +240,11 @@ class DPP_EXPORT discord_voice_client : public websocket_client
 	 */
 	void cleanup();
 
+	/**
+	 * @brief A frame of silence packet
+	 */
+	static constexpr uint8_t silence_packet[3] = { 0xf8, 0xff, 0xfe };
+
 	/**
 	 * @brief Mutex for outbound packet stream
 	 */
@@ -434,6 +439,13 @@ class DPP_EXPORT discord_voice_client : public websocket_client
 	 */
 	bool paused;
 
+	/**
+	 * @brief Whether has sent 5 frame of silence before stopping on pause/stop.
+	 *
+	 * This is to avoid unintended Opus interpolation with subsequent transmissions.
+	 */
+	bool sent_stop_frames;
+
 #ifdef HAVE_VOICE
 	/**
 	 * @brief libopus encoder
@@ -650,8 +662,10 @@ class DPP_EXPORT discord_voice_client : public websocket_client
 	 * @param packet packet data
 	 * @param len length of packet
 	 * @param duration duration of opus packet
+	 * @param send_now send this packet right away without buffering.
+	 * Do NOT set send_now to true outside write_ready.
 	 */
-	void send(const char* packet, size_t len, uint64_t duration);
+	void send(const char* packet, size_t len, uint64_t duration, bool send_now = false);
 
 	/**
 	 * @brief Queue a message to be sent via the websocket
@@ -962,6 +976,10 @@ class DPP_EXPORT discord_voice_client : public websocket_client
 	 * @param duration Generally duration is 2.5, 5, 10, 20, 40 or 60
 	 * if the timescale is 1000000 (1ms) 
 	 * 
+	 * @param send_now Send this packet right away without buffering,
+	 * this will skip duration calculation for the packet being sent
+	 * and only safe to be set to true in write_ready.
+	 *
 	 * @return discord_voice_client& Reference to self
 	 * 
 	 * @note It is your responsibility to ensure that packets of data 
@@ -972,7 +990,7 @@ class DPP_EXPORT discord_voice_client : public websocket_client
 	 * 
 	 * @throw dpp::voice_exception If data length is invalid or voice support not compiled into D++
 	 */
-	discord_voice_client& send_audio_opus(uint8_t* opus_packet, const size_t length, uint64_t duration);
+	discord_voice_client& send_audio_opus(const uint8_t* opus_packet, const size_t length, uint64_t duration, bool send_now = false);
 
 	/**
 	 * @brief Send opus packets to the voice channel
@@ -999,7 +1017,7 @@ class DPP_EXPORT discord_voice_client : public websocket_client
 	 * 
 	 * @throw dpp::voice_exception If data length is invalid or voice support not compiled into D++
 	 */
-	discord_voice_client& send_audio_opus(uint8_t* opus_packet, const size_t length);
+	discord_voice_client& send_audio_opus(const uint8_t* opus_packet, const size_t length);
 
 	/**
 	 * @brief Send silence to the voice channel
@@ -1012,6 +1030,17 @@ class DPP_EXPORT discord_voice_client : public websocket_client
 	 */
 	discord_voice_client& send_silence(const uint64_t duration);
 
+	/**
+	 * @brief Send stop frames to the voice channel.
+	 *
+	 * @param send_now send this packet right away without buffering.
+	 * Do NOT set send_now to true outside write_ready.
+	 * 
+	 * @return discord_voice_client& Reference to self
+	 * @throw dpp::voice_exception if voice support is not compiled into D++
+	 */
+	discord_voice_client& send_stop_frames(bool send_now = false);
+
 	/**
 	 * @brief Sets the audio type that will be sent with send_audio_* methods.
 	 *
diff --git a/src/dpp/discordvoiceclient.cpp b/src/dpp/discordvoiceclient.cpp
index 1b8fe7b95f..d63cccc6a6 100644
--- a/src/dpp/discordvoiceclient.cpp
+++ b/src/dpp/discordvoiceclient.cpp
@@ -20,6 +20,7 @@
  *
  ************************************************************************************/
 
+#include <cstdint>
 #ifdef _WIN32
 	#include <WinSock2.h>
 	#include <WS2tcpip.h>
@@ -148,6 +149,9 @@ bool discord_voice_client::is_end_to_end_encrypted() const {
 
 discord_voice_client& discord_voice_client::pause_audio(bool pause) {
 	this->paused = pause;
+	if (!this->paused) {
+		this->sent_stop_frames = false;
+	}
 	return *this;
 }
 
@@ -176,6 +180,7 @@ discord_voice_client& discord_voice_client::stop_audio() {
 	outbuf.clear();
 	track_meta.clear();
 	tracks = 0;
+	this->send_stop_frames();
 	return *this;
 }
 
@@ -398,7 +403,6 @@ discord_voice_client& discord_voice_client::skip_to_next_marker() {
 }
 
 discord_voice_client& discord_voice_client::send_silence(const uint64_t duration) {
-	uint8_t silence_packet[3] = { 0xf8, 0xff, 0xfe };
 	send_audio_opus(silence_packet, 3, duration);
 	return *this;
 }
@@ -412,6 +416,7 @@ discord_voice_client& discord_voice_client::set_send_audio_type(send_audio_type_
 
 discord_voice_client& discord_voice_client::speak() {
 	if (!this->sending) {
+		std::cout << "Sending voice_opcode_client_speaking\n";
 		this->queue_message(json({
 		{"op", voice_opcode_client_speaking},
 		{"d", {
@@ -443,4 +448,15 @@ uint16_t discord_voice_client::get_iteration_interval() {
 	return this->iteration_interval;
 }
 
+discord_voice_client& discord_voice_client::send_stop_frames(bool send_now) {
+	uint8_t silence_frames[sizeof(silence_packet) / sizeof(*silence_packet) * 5];
+	for (size_t i = 0; i < sizeof(silence_frames) / sizeof(*silence_frames); i++) {
+		silence_frames[i] = silence_packet[i % 3];
+	}
+
+	this->send_audio_opus(silence_frames, sizeof(silence_frames) / sizeof(*silence_frames), 20, send_now);
+
+	return *this;
+}
+
 } // namespace dpp
diff --git a/src/dpp/voice/enabled/courier_loop.cpp b/src/dpp/voice/enabled/courier_loop.cpp
index 6526c7f8ba..57b5e99652 100644
--- a/src/dpp/voice/enabled/courier_loop.cpp
+++ b/src/dpp/voice/enabled/courier_loop.cpp
@@ -76,7 +76,21 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour
 					break;
 				}
 
-				shared_state.signal_iteration.wait(lk);
+                shared_state.signal_iteration.wait(lk, [&shared_state](){
+                    /* 
+                     * Actually check the state we're looking for instead of waking up
+                     * everytime read_ready was called.
+                     */
+                    for (auto &[user_id, parking_lot]: shared_state.parked_voice_payloads) {
+                        if (parking_lot.parked_payloads.empty()) {
+                            continue;
+                        }
+
+                        return true;
+                    }
+                    return false;
+                });
+
 				/*
 				 * More data came or about to terminate, or just a spurious wake.
 				 * We need to collect the payloads again to determine what to do next.
diff --git a/src/dpp/voice/enabled/opus.cpp b/src/dpp/voice/enabled/opus.cpp
index d99a3776d0..a92886ac9c 100644
--- a/src/dpp/voice/enabled/opus.cpp
+++ b/src/dpp/voice/enabled/opus.cpp
@@ -71,14 +71,14 @@ discord_voice_client& discord_voice_client::send_audio_raw(uint16_t* audio_data,
 	return *this;
 }
 
-discord_voice_client& discord_voice_client::send_audio_opus(uint8_t* opus_packet, const size_t length) {
+discord_voice_client& discord_voice_client::send_audio_opus(const uint8_t* opus_packet, const size_t length) {
 	int samples = opus_packet_get_nb_samples(opus_packet, (opus_int32)length, opus_sample_rate_hz);
 	uint64_t duration = (samples / 48) / (timescale / 1000000);
-	send_audio_opus(opus_packet, length, duration);
+	send_audio_opus(opus_packet, length, duration, false);
 	return *this;
 }
 
-discord_voice_client& discord_voice_client::send_audio_opus(uint8_t* opus_packet, const size_t length, uint64_t duration) {
+discord_voice_client& discord_voice_client::send_audio_opus(const uint8_t* opus_packet, const size_t length, uint64_t duration, bool send_now) {
 	int frame_size = (int)(48 * duration * (timescale / 1000000));
 	opus_int32 encoded_audio_max_length = (opus_int32)length;
 	std::vector<uint8_t> encoded_audio(encoded_audio_max_length);
@@ -147,7 +147,7 @@ discord_voice_client& discord_voice_client::send_audio_opus(uint8_t* opus_packet
 	/* Append the 4 byte nonce to the resulting payload */
 	std::memcpy(payload.data() + payload.size() - sizeof(noncel), &noncel, sizeof(noncel));
 
-	this->send(reinterpret_cast<const char *>(payload.data()), payload.size(), duration);
+	this->send(reinterpret_cast<const char *>(payload.data()), payload.size(), duration, send_now);
 
 	timestamp += frame_size;
 
diff --git a/src/dpp/voice/enabled/read_write.cpp b/src/dpp/voice/enabled/read_write.cpp
index c24200b49b..52a09d5a39 100644
--- a/src/dpp/voice/enabled/read_write.cpp
+++ b/src/dpp/voice/enabled/read_write.cpp
@@ -30,7 +30,7 @@ namespace dpp {
 
 dpp::socket discord_voice_client::want_write() {
 	std::lock_guard<std::mutex> lock(this->stream_mutex);
-	if (!this->paused && !outbuf.empty()) {
+	if (!this->sent_stop_frames && !outbuf.empty()) {
 		return fd;
 	}
 	return INVALID_SOCKET;
@@ -42,13 +42,16 @@ dpp::socket discord_voice_client::want_read() {
 }
 
 
-void discord_voice_client::send(const char* packet, size_t len, uint64_t duration) {
-	voice_out_packet frame;
-	frame.packet.assign(packet, packet + len);
-	frame.duration = duration;
-	{
+void discord_voice_client::send(const char* packet, size_t len, uint64_t duration, bool send_now) {
+	if (!send_now) [[likely]] {
+		voice_out_packet frame;
+		frame.packet.assign(packet, packet + len);
+		frame.duration = duration;
+
 		std::lock_guard<std::mutex> lock(this->stream_mutex);
 		outbuf.emplace_back(frame);
+	} else [[unlikely]] {
+		this->udp_send(packet, len);
 	}
 }
 
@@ -68,4 +71,4 @@ int discord_voice_client::udp_recv(char* data, size_t max_length)
 	return static_cast<int>(recv(this->fd, data, static_cast<int>(max_length), 0));
 }
 
-}
\ No newline at end of file
+}
diff --git a/src/dpp/voice/enabled/write_ready.cpp b/src/dpp/voice/enabled/write_ready.cpp
index 46c0307055..3fda474f45 100644
--- a/src/dpp/voice/enabled/write_ready.cpp
+++ b/src/dpp/voice/enabled/write_ready.cpp
@@ -37,7 +37,14 @@ void discord_voice_client::write_ready() {
 	send_audio_type_t type = satype_recorded_audio;
 	{
 		std::lock_guard<std::mutex> lock(this->stream_mutex);
-		if (!this->paused && outbuf.size()) {
+		if (this->paused) {
+			if (!this->sent_stop_frames) {
+				this->send_stop_frames(true);
+				this->sent_stop_frames = true;
+			}
+
+			/* Fallthrough if paused */
+		} else if (outbuf.size()) {
 			type = send_audio_type;
 			if (outbuf[0].packet.size() == sizeof(uint16_t) && (*(reinterpret_cast<uint16_t*>(outbuf[0].packet.data()))) == AUDIO_TRACK_MARKER) {
 				outbuf.erase(outbuf.begin());