Skip to content

Commit

Permalink
kernel: update TCP BBR to v3
Browse files Browse the repository at this point in the history
BBR v3 is an enhancement to the BBR v1 algorithm. It's designed to aim for lower
queues, lower loss, and better Reno/CUBIC coexistence than BBR v1.

More information:
https://github.com/google/bbr/blob/bbr-v3-upstream-prep-2024-02-19-01/README.md

https://github.com/google/bbr/tree/001a430b3ddbbc6d9173061a74d8b580c757b91b

Signed-off-by: Tan Zien <[email protected]>

iproute2: ss: output TCP BBRv3 diag information

Add logic for printing diag information for TCP BBRv3 congestion
control.

Signed-off-by: Tan Zien <[email protected]>

ath79: refresh patch

refresh:
target/linux/ath79/patches-6.6/900-unaligned_access_hacks.patch

Signed-off-by: Tan Zien <[email protected]>
  • Loading branch information
nasbdh9 authored and vincejv committed Dec 24, 2024
1 parent 9f1a6ae commit d7fe3cd
Show file tree
Hide file tree
Showing 21 changed files with 4,198 additions and 4 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
From ca7f11ebc4d4a99ccfd44be8555d505b26996c12 Mon Sep 17 00:00:00 2001
From: Arjun Roy <[email protected]>
Date: Mon, 25 Jul 2022 12:49:35 -0400
Subject: [PATCH 2/2] ss: output TCP BBRv3 diag information

Add logic for printing diag information for TCP BBRv3 congestion
control. This commit leaves in place the support for printing the
earlier TCP BBRv1 congestion control information.

Both BBRv1 and BBRv3 are using the same enum value. The BBRv3 struct
starts with the same data as BBRv1, so it is backward-compatible
with BBRv1, to allow older ss binaries to print basic information for
BBRv3. We use the size of the returned data and the version field to
check the version of the data.

Signed-off-by: Arjun Roy <[email protected]>
Signed-off-by: Neal Cardwell <[email protected]>
Signed-off-by: David Morley <[email protected]>
---
include/uapi/linux/inet_diag.h | 23 ++++++++++++
misc/ss.c | 66 +++++++++++++++++++++++++++++++++-
2 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index d81cb69a26a9..dca833ecb783 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -229,6 +229,29 @@ struct tcp_bbr_info {
__u32 bbr_min_rtt; /* min-filtered RTT in uSec */
__u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */
__u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */
+ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */
+ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */
+ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */
+ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */
+ __u8 bbr_mode; /* current bbr_mode in state machine */
+ __u8 bbr_phase; /* current state machine phase */
+ __u8 unused1; /* alignment padding; not used yet */
+ __u8 bbr_version; /* BBR algorithm version */
+ __u32 bbr_inflight_lo; /* lower short-term data volume bound */
+ __u32 bbr_inflight_hi; /* higher long-term data volume bound */
+ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */
+};
+
+/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. */
+enum tcp_bbr_phase {
+ BBR_PHASE_INVALID = 0,
+ BBR_PHASE_STARTUP = 1,
+ BBR_PHASE_DRAIN = 2,
+ BBR_PHASE_PROBE_RTT = 3,
+ BBR_PHASE_PROBE_BW_UP = 4,
+ BBR_PHASE_PROBE_BW_DOWN = 5,
+ BBR_PHASE_PROBE_BW_CRUISE = 6,
+ BBR_PHASE_PROBE_BW_REFILL = 7,
};

union tcp_cc_info {
diff --git a/misc/ss.c b/misc/ss.c
index e9d813596b91..5f413118f0dd 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -912,6 +912,7 @@ struct tcpstat {
bool app_limited;
struct dctcpstat *dctcp;
struct tcp_bbr_info *bbr_info;
+ unsigned int bbr_info_len;
};

/* SCTP assocs share the same inode number with their parent endpoint. So if we
@@ -2585,6 +2586,29 @@ static void sctp_stats_print(struct sctp_info *s)
out(" fraginl:%d", s->sctpi_s_frag_interleave);
}

+static const char* bbr_phase_to_str(enum tcp_bbr_phase phase)
+{
+ switch (phase) {
+ case BBR_PHASE_STARTUP:
+ return "STARTUP";
+ case BBR_PHASE_DRAIN:
+ return "DRAIN";
+ case BBR_PHASE_PROBE_RTT:
+ return "PROBE_RTT";
+ case BBR_PHASE_PROBE_BW_UP:
+ return "PROBE_BW_UP";
+ case BBR_PHASE_PROBE_BW_DOWN:
+ return "PROBE_BW_DOWN";
+ case BBR_PHASE_PROBE_BW_CRUISE:
+ return "PROBE_BW_CRUISE";
+ case BBR_PHASE_PROBE_BW_REFILL:
+ return "PROBE_BW_REFILL";
+ case BBR_PHASE_INVALID:
+ default:
+ return "INVALID";
+ }
+}
+
static void tcp_stats_print(struct tcpstat *s)
{
char b1[64];
@@ -2658,7 +2682,14 @@ static void tcp_stats_print(struct tcpstat *s)
}

if (s->bbr_info) {
- __u64 bw;
+ /* All versions of the BBR algorithm use the INET_DIAG_BBRINFO
+ * enum value. Later versions of the tcp_bbr_info struct are
+ * backward-compatible with earlier versions, to allow older ss
+ * binaries to print basic information for newer versions of
+ * the algorithm. We use the size of the returned tcp_bbr_info
+ * struct to decide how much to print.
+ */
+ __u64 bw, bw_hi, bw_lo;

bw = s->bbr_info->bbr_bw_hi;
bw <<= 32;
@@ -2673,6 +2704,38 @@ static void tcp_stats_print(struct tcpstat *s)
if (s->bbr_info->bbr_cwnd_gain)
out(",cwnd_gain:%g",
(double)s->bbr_info->bbr_cwnd_gain / 256.0);
+
+ if (s->bbr_info_len >=
+ (offsetof(struct tcp_bbr_info, bbr_extra_acked) +
+ sizeof(__u32))) {
+
+ bw_hi = s->bbr_info->bbr_bw_hi_msb;
+ bw_hi <<= 32;
+ bw_hi |= s->bbr_info->bbr_bw_hi_lsb;
+
+ bw_lo = s->bbr_info->bbr_bw_lo_msb;
+ bw_lo <<= 32;
+ bw_lo |= s->bbr_info->bbr_bw_lo_lsb;
+
+ out(",version:%u", s->bbr_info->bbr_version);
+ if (bw_hi != ~0UL)
+ out(",bw_hi:%sbps", sprint_bw(b1, bw_hi * 8.0));
+ if (bw_lo != ~0UL)
+ out(",bw_lo:%sbps", sprint_bw(b1, bw_lo * 8.0));
+ if (s->bbr_info->bbr_inflight_hi != ~0U)
+ out(",inflight_hi:%u",
+ s->bbr_info->bbr_inflight_hi);
+ if (s->bbr_info->bbr_inflight_lo != ~0U)
+ out(",inflight_lo:%u",
+ s->bbr_info->bbr_inflight_lo);
+ out(",extra_acked:%u", s->bbr_info->bbr_extra_acked);
+ out(",mode:%d", (int)s->bbr_info->bbr_mode);
+ out(",phase:%s",
+ bbr_phase_to_str(
+ (enum tcp_bbr_phase)
+ s->bbr_info->bbr_phase));
+ }
+
out(")");
}

@@ -3147,6 +3210,7 @@ static void tcp_show_info(const struct nlmsghdr *nlh, struct inet_diag_msg *r,
s.bbr_info = calloc(1, sizeof(*s.bbr_info));
if (s.bbr_info && bbr_info)
memcpy(s.bbr_info, bbr_info, len);
+ s.bbr_info_len = len;
}

if (rtt > 0 && info->tcpi_snd_mss && info->tcpi_snd_cwnd) {
--
2.41.0.487.g6d72f3e995-goog

Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ SVN-Revision: 35130
list_for_each_entry(p, head, list) {
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -622,48 +622,53 @@ static void tcp_options_write(struct tcp
@@ -625,48 +625,53 @@ static void tcp_options_write(struct tcp
u16 options = opts->options; /* mungable copy */

if (unlikely(OPTION_MD5 & options)) {
Expand Down Expand Up @@ -427,7 +427,7 @@ SVN-Revision: 35130
}

if (unlikely(opts->num_sack_blocks)) {
@@ -671,16 +676,17 @@ static void tcp_options_write(struct tcp
@@ -674,16 +679,17 @@ static void tcp_options_write(struct tcp
tp->duplicate_sack : tp->selective_acks;
int this_sack;

Expand All @@ -451,7 +451,7 @@ SVN-Revision: 35130
}

tp->rx_opt.dsack = 0;
@@ -693,13 +699,14 @@ static void tcp_options_write(struct tcp
@@ -696,13 +702,14 @@ static void tcp_options_write(struct tcp

if (foc->exp) {
len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
Expand Down Expand Up @@ -751,7 +751,7 @@ SVN-Revision: 35130
EXPORT_SYMBOL(xfrm_parse_spi);
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4262,14 +4262,16 @@ static bool tcp_parse_aligned_timestamp(
@@ -4287,14 +4287,16 @@ static bool tcp_parse_aligned_timestamp(
{
const __be32 *ptr = (const __be32 *)(th + 1);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
From 304c6ff637e53c8f3530a0bb50ba95e532d681b8 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <[email protected]>
Date: Tue, 11 Jun 2019 12:26:55 -0400
Subject: [PATCH 01/19] net-tcp_bbr: broaden app-limited rate sample detection

This commit is a bug fix for the Linux TCP app-limited
(application-limited) logic that is used for collecting rate
(bandwidth) samples.

Previously the app-limited logic only looked for "bubbles" of
silence in between application writes, by checking at the start
of each sendmsg. But "bubbles" of silence can also happen before
retransmits: e.g. bubbles can happen between an application write
and a retransmit, or between two retransmits.

Retransmits are triggered by ACKs or timers. So this commit checks
for bubbles of app-limited silence upon ACKs or timers.

Why does this commit check for app-limited state at the start of
ACKs and timer handling? Because at that point we know whether
inflight was fully using the cwnd. During processing the ACK or
timer event we often change the cwnd; after changing the cwnd we
can't know whether inflight was fully using the old cwnd.

Origin-9xx-SHA1: 3fe9b53291e018407780fb8c356adb5666722cbc
Change-Id: I37221506f5166877c2b110753d39bb0757985e68
---
net/ipv4/tcp_input.c | 1 +
net/ipv4/tcp_timer.c | 1 +
2 files changed, 2 insertions(+)

--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3915,6 +3915,7 @@ static int tcp_ack(struct sock *sk, cons

prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
rs.prior_in_flight = tcp_packets_in_flight(tp);
+ tcp_rate_check_app_limited(sk);

/* ts_recent update must be made after we are sure that the packet
* is in window.
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -677,6 +677,7 @@ void tcp_write_timer_handler(struct sock
return;
}

+ tcp_rate_check_app_limited(sk);
tcp_mstamp_refresh(tcp_sk(sk));
event = icsk->icsk_pending;

Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
From 5a47eb49ad406b439a00b90f5285359cd1e876f4 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <[email protected]>
Date: Sun, 24 Jun 2018 21:55:59 -0400
Subject: [PATCH 02/19] net-tcp_bbr: v2: shrink delivered_mstamp,
first_tx_mstamp to u32 to free up 8 bytes

Free up some space for tracking inflight and losses for each
bw sample, in upcoming commits.

These timestamps are in microseconds, and are now stored in 32
bits. So they can only hold time intervals up to roughly 2^12 = 4096
seconds. But Linux TCP RTT and RTO tracking has the same 32-bit
microsecond implementation approach and resulting deployment
limitations. So this is not introducing a new limit. And these should
not be a limitation for the foreseeable future.

Effort: net-tcp_bbr
Origin-9xx-SHA1: 238a7e6b5d51625fef1ce7769826a7b21b02ae55
Change-Id: I3b779603797263b52a61ad57c565eb91fe42680c
---
include/net/tcp.h | 9 +++++++--
net/ipv4/tcp_rate.c | 7 ++++---
2 files changed, 11 insertions(+), 5 deletions(-)

--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -822,6 +822,11 @@ static inline u32 tcp_stamp_us_delta(u64
return max_t(s64, t1 - t0, 0);
}

+static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0)
+{
+ return max_t(s32, t1 - t0, 0);
+}
+
static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
{
return tcp_ns_to_ts(skb->skb_mstamp_ns);
@@ -897,9 +902,9 @@ struct tcp_skb_cb {
/* pkts S/ACKed so far upon tx of skb, incl retrans: */
__u32 delivered;
/* start of send pipeline phase */
- u64 first_tx_mstamp;
+ u32 first_tx_mstamp;
/* when we reached the "delivered" count */
- u64 delivered_mstamp;
+ u32 delivered_mstamp;
} tx; /* only used for outgoing skbs */
union {
struct inet_skb_parm h4;
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -101,8 +101,9 @@ void tcp_rate_skb_delivered(struct sock
/* Record send time of most recently ACKed packet: */
tp->first_tx_mstamp = tx_tstamp;
/* Find the duration of the "send phase" of this window: */
- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
- scb->tx.first_tx_mstamp);
+ rs->interval_us = tcp_stamp32_us_delta(
+ tp->first_tx_mstamp,
+ scb->tx.first_tx_mstamp);

}
/* Mark off the skb delivered once it's sacked to avoid being
@@ -155,7 +156,7 @@ void tcp_rate_gen(struct sock *sk, u32 d
* longer phase.
*/
snd_us = rs->interval_us; /* send phase */
- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp,
rs->prior_mstamp); /* ack phase */
rs->interval_us = max(snd_us, ack_us);

Loading

0 comments on commit d7fe3cd

Please sign in to comment.