Skip to content

Commit

Permalink
kernel: update TCP BBR to v3
Browse files Browse the repository at this point in the history
BBR v3 is an enhancement to the BBR v1 algorithm. It's designed to aim for lower
queues, lower loss, and better Reno/CUBIC coexistence than BBR v1.

More information:
https://github.com/google/bbr/blob/bbr-v3-upstream-prep-2024-02-19-01/README.md

https://github.com/google/bbr/tree/001a430b3ddbbc6d9173061a74d8b580c757b91b

Signed-off-by: Tan Zien <[email protected]>

iproute2: ss: output TCP BBRv3 diag information

Add logic for printing diag information for TCP BBRv3 congestion
control.

Signed-off-by: Tan Zien <[email protected]>

ath79: refresh patch

refresh:
target/linux/ath79/patches-6.6/900-unaligned_access_hacks.patch

Signed-off-by: Tan Zien <[email protected]>
  • Loading branch information
nasbdh9 authored and vincejv committed Dec 24, 2024
1 parent 9f1a6ae commit d7fe3cd
Show file tree
Hide file tree
Showing 21 changed files with 4,198 additions and 4 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
From ca7f11ebc4d4a99ccfd44be8555d505b26996c12 Mon Sep 17 00:00:00 2001
From: Arjun Roy <[email protected]>
Date: Mon, 25 Jul 2022 12:49:35 -0400
Subject: [PATCH 2/2] ss: output TCP BBRv3 diag information

Add logic for printing diag information for TCP BBRv3 congestion
control. This commit leaves in place the support for printing the
earlier TCP BBRv1 congestion control information.

Both BBRv1 and BBRv3 are using the same enum value. The BBRv3 struct
starts with the same data as BBRv1, so it is backward-compatible
with BBRv1, to allow older ss binaries to print basic information for
BBRv3. We use the size of the returned data and the version field to
check the version of the data.

Signed-off-by: Arjun Roy <[email protected]>
Signed-off-by: Neal Cardwell <[email protected]>
Signed-off-by: David Morley <[email protected]>
---
include/uapi/linux/inet_diag.h | 23 ++++++++++++
misc/ss.c | 66 +++++++++++++++++++++++++++++++++-
2 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index d81cb69a26a9..dca833ecb783 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -229,6 +229,29 @@ struct tcp_bbr_info {
__u32 bbr_min_rtt; /* min-filtered RTT in uSec */
__u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */
__u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */
+ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */
+ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */
+ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */
+ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */
+ __u8 bbr_mode; /* current bbr_mode in state machine */
+ __u8 bbr_phase; /* current state machine phase */
+ __u8 unused1; /* alignment padding; not used yet */
+ __u8 bbr_version; /* BBR algorithm version */
+ __u32 bbr_inflight_lo; /* lower short-term data volume bound */
+ __u32 bbr_inflight_hi; /* higher long-term data volume bound */
+ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */
+};
+
+/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. */
+enum tcp_bbr_phase {
+ BBR_PHASE_INVALID = 0,
+ BBR_PHASE_STARTUP = 1,
+ BBR_PHASE_DRAIN = 2,
+ BBR_PHASE_PROBE_RTT = 3,
+ BBR_PHASE_PROBE_BW_UP = 4,
+ BBR_PHASE_PROBE_BW_DOWN = 5,
+ BBR_PHASE_PROBE_BW_CRUISE = 6,
+ BBR_PHASE_PROBE_BW_REFILL = 7,
};

union tcp_cc_info {
diff --git a/misc/ss.c b/misc/ss.c
index e9d813596b91..5f413118f0dd 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -912,6 +912,7 @@ struct tcpstat {
bool app_limited;
struct dctcpstat *dctcp;
struct tcp_bbr_info *bbr_info;
+ unsigned int bbr_info_len;
};

/* SCTP assocs share the same inode number with their parent endpoint. So if we
@@ -2585,6 +2586,29 @@ static void sctp_stats_print(struct sctp_info *s)
out(" fraginl:%d", s->sctpi_s_frag_interleave);
}

+static const char* bbr_phase_to_str(enum tcp_bbr_phase phase)
+{
+ switch (phase) {
+ case BBR_PHASE_STARTUP:
+ return "STARTUP";
+ case BBR_PHASE_DRAIN:
+ return "DRAIN";
+ case BBR_PHASE_PROBE_RTT:
+ return "PROBE_RTT";
+ case BBR_PHASE_PROBE_BW_UP:
+ return "PROBE_BW_UP";
+ case BBR_PHASE_PROBE_BW_DOWN:
+ return "PROBE_BW_DOWN";
+ case BBR_PHASE_PROBE_BW_CRUISE:
+ return "PROBE_BW_CRUISE";
+ case BBR_PHASE_PROBE_BW_REFILL:
+ return "PROBE_BW_REFILL";
+ case BBR_PHASE_INVALID:
+ default:
+ return "INVALID";
+ }
+}
+
static void tcp_stats_print(struct tcpstat *s)
{
char b1[64];
@@ -2658,7 +2682,14 @@ static void tcp_stats_print(struct tcpstat *s)
}

if (s->bbr_info) {
- __u64 bw;
+ /* All versions of the BBR algorithm use the INET_DIAG_BBRINFO
+ * enum value. Later versions of the tcp_bbr_info struct are
+ * backward-compatible with earlier versions, to allow older ss
+ * binaries to print basic information for newer versions of
+ * the algorithm. We use the size of the returned tcp_bbr_info
+ * struct to decide how much to print.
+ */
+ __u64 bw, bw_hi, bw_lo;

bw = s->bbr_info->bbr_bw_hi;
bw <<= 32;
@@ -2673,6 +2704,38 @@ static void tcp_stats_print(struct tcpstat *s)
if (s->bbr_info->bbr_cwnd_gain)
out(",cwnd_gain:%g",
(double)s->bbr_info->bbr_cwnd_gain / 256.0);
+
+ if (s->bbr_info_len >=
+ (offsetof(struct tcp_bbr_info, bbr_extra_acked) +
+ sizeof(__u32))) {
+
+ bw_hi = s->bbr_info->bbr_bw_hi_msb;
+ bw_hi <<= 32;
+ bw_hi |= s->bbr_info->bbr_bw_hi_lsb;
+
+ bw_lo = s->bbr_info->bbr_bw_lo_msb;
+ bw_lo <<= 32;
+ bw_lo |= s->bbr_info->bbr_bw_lo_lsb;
+
+ out(",version:%u", s->bbr_info->bbr_version);
+ if (bw_hi != ~0UL)
+ out(",bw_hi:%sbps", sprint_bw(b1, bw_hi * 8.0));
+ if (bw_lo != ~0UL)
+ out(",bw_lo:%sbps", sprint_bw(b1, bw_lo * 8.0));
+ if (s->bbr_info->bbr_inflight_hi != ~0U)
+ out(",inflight_hi:%u",
+ s->bbr_info->bbr_inflight_hi);
+ if (s->bbr_info->bbr_inflight_lo != ~0U)
+ out(",inflight_lo:%u",
+ s->bbr_info->bbr_inflight_lo);
+ out(",extra_acked:%u", s->bbr_info->bbr_extra_acked);
+ out(",mode:%d", (int)s->bbr_info->bbr_mode);
+ out(",phase:%s",
+ bbr_phase_to_str(
+ (enum tcp_bbr_phase)
+ s->bbr_info->bbr_phase));
+ }
+
out(")");
}

@@ -3147,6 +3210,7 @@ static void tcp_show_info(const struct nlmsghdr *nlh, struct inet_diag_msg *r,
s.bbr_info = calloc(1, sizeof(*s.bbr_info));
if (s.bbr_info && bbr_info)
memcpy(s.bbr_info, bbr_info, len);
+ s.bbr_info_len = len;
}

if (rtt > 0 && info->tcpi_snd_mss && info->tcpi_snd_cwnd) {
--
2.41.0.487.g6d72f3e995-goog

Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ SVN-Revision: 35130
list_for_each_entry(p, head, list) {
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -622,48 +622,53 @@ static void tcp_options_write(struct tcp
@@ -625,48 +625,53 @@ static void tcp_options_write(struct tcp
u16 options = opts->options; /* mungable copy */

if (unlikely(OPTION_MD5 & options)) {
Expand Down Expand Up @@ -427,7 +427,7 @@ SVN-Revision: 35130
}

if (unlikely(opts->num_sack_blocks)) {
@@ -671,16 +676,17 @@ static void tcp_options_write(struct tcp
@@ -674,16 +679,17 @@ static void tcp_options_write(struct tcp
tp->duplicate_sack : tp->selective_acks;
int this_sack;

Expand All @@ -451,7 +451,7 @@ SVN-Revision: 35130
}

tp->rx_opt.dsack = 0;
@@ -693,13 +699,14 @@ static void tcp_options_write(struct tcp
@@ -696,13 +702,14 @@ static void tcp_options_write(struct tcp

if (foc->exp) {
len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
Expand Down Expand Up @@ -751,7 +751,7 @@ SVN-Revision: 35130
EXPORT_SYMBOL(xfrm_parse_spi);
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4262,14 +4262,16 @@ static bool tcp_parse_aligned_timestamp(
@@ -4287,14 +4287,16 @@ static bool tcp_parse_aligned_timestamp(
{
const __be32 *ptr = (const __be32 *)(th + 1);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
From 304c6ff637e53c8f3530a0bb50ba95e532d681b8 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <[email protected]>
Date: Tue, 11 Jun 2019 12:26:55 -0400
Subject: [PATCH 01/19] net-tcp_bbr: broaden app-limited rate sample detection

This commit is a bug fix for the Linux TCP app-limited
(application-limited) logic that is used for collecting rate
(bandwidth) samples.

Previously the app-limited logic only looked for "bubbles" of
silence in between application writes, by checking at the start
of each sendmsg. But "bubbles" of silence can also happen before
retransmits: e.g. bubbles can happen between an application write
and a retransmit, or between two retransmits.

Retransmits are triggered by ACKs or timers. So this commit checks
for bubbles of app-limited silence upon ACKs or timers.

Why does this commit check for app-limited state at the start of
ACKs and timer handling? Because at that point we know whether
inflight was fully using the cwnd. During processing the ACK or
timer event we often change the cwnd; after changing the cwnd we
can't know whether inflight was fully using the old cwnd.

Origin-9xx-SHA1: 3fe9b53291e018407780fb8c356adb5666722cbc
Change-Id: I37221506f5166877c2b110753d39bb0757985e68
---
net/ipv4/tcp_input.c | 1 +
net/ipv4/tcp_timer.c | 1 +
2 files changed, 2 insertions(+)

--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3915,6 +3915,7 @@ static int tcp_ack(struct sock *sk, cons

prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
rs.prior_in_flight = tcp_packets_in_flight(tp);
+ tcp_rate_check_app_limited(sk);

/* ts_recent update must be made after we are sure that the packet
* is in window.
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -677,6 +677,7 @@ void tcp_write_timer_handler(struct sock
return;
}

+ tcp_rate_check_app_limited(sk);
tcp_mstamp_refresh(tcp_sk(sk));
event = icsk->icsk_pending;

Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
From 5a47eb49ad406b439a00b90f5285359cd1e876f4 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <[email protected]>
Date: Sun, 24 Jun 2018 21:55:59 -0400
Subject: [PATCH 02/19] net-tcp_bbr: v2: shrink delivered_mstamp,
first_tx_mstamp to u32 to free up 8 bytes

Free up some space for tracking inflight and losses for each
bw sample, in upcoming commits.

These timestamps are in microseconds, and are now stored in 32
bits. So they can only hold time intervals up to roughly 2^12 = 4096
seconds. But Linux TCP RTT and RTO tracking has the same 32-bit
microsecond implementation approach and resulting deployment
limitations. So this is not introducing a new limit. And these should
not be a limitation for the foreseeable future.

Effort: net-tcp_bbr
Origin-9xx-SHA1: 238a7e6b5d51625fef1ce7769826a7b21b02ae55
Change-Id: I3b779603797263b52a61ad57c565eb91fe42680c
---
include/net/tcp.h | 9 +++++++--
net/ipv4/tcp_rate.c | 7 ++++---
2 files changed, 11 insertions(+), 5 deletions(-)

--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -822,6 +822,11 @@ static inline u32 tcp_stamp_us_delta(u64
return max_t(s64, t1 - t0, 0);
}

+static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0)
+{
+ return max_t(s32, t1 - t0, 0);
+}
+
static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
{
return tcp_ns_to_ts(skb->skb_mstamp_ns);
@@ -897,9 +902,9 @@ struct tcp_skb_cb {
/* pkts S/ACKed so far upon tx of skb, incl retrans: */
__u32 delivered;
/* start of send pipeline phase */
- u64 first_tx_mstamp;
+ u32 first_tx_mstamp;
/* when we reached the "delivered" count */
- u64 delivered_mstamp;
+ u32 delivered_mstamp;
} tx; /* only used for outgoing skbs */
union {
struct inet_skb_parm h4;
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -101,8 +101,9 @@ void tcp_rate_skb_delivered(struct sock
/* Record send time of most recently ACKed packet: */
tp->first_tx_mstamp = tx_tstamp;
/* Find the duration of the "send phase" of this window: */
- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
- scb->tx.first_tx_mstamp);
+ rs->interval_us = tcp_stamp32_us_delta(
+ tp->first_tx_mstamp,
+ scb->tx.first_tx_mstamp);

}
/* Mark off the skb delivered once it's sacked to avoid being
@@ -155,7 +156,7 @@ void tcp_rate_gen(struct sock *sk, u32 d
* longer phase.
*/
snd_us = rs->interval_us; /* send phase */
- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp,
rs->prior_mstamp); /* ack phase */
rs->interval_us = max(snd_us, ack_us);

Loading

0 comments on commit d7fe3cd

Please sign in to comment.