diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index dbe87fe8cc56..88c315841f3d 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -433,9 +433,9 @@ tcp_ecn_option - INTEGER tcp_ecn_option_beacon - INTEGER Control Accurate ECN (AccECN) option sending frequency per RTT and it - take effect only when tcp_ecn_option is set to 2. + takes effect only when tcp_ecn_option is set to 2. - Default: 3 (AccECN will be send at least 3 times per RTT) + Default: 1 (AccECN will be sent at least 1 time per RTT) tcp_ecn_fallback - BOOLEAN If the kernel detects that ECN connection misbehaves, enable fall diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1557cf0241c8..73ab3e9c7942 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -126,8 +126,7 @@ struct tcp_request_sock { #if IS_ENABLED(CONFIG_MPTCP) bool drop_req; #endif - u8 noect : 1, - accecn_ok : 1, + u8 accecn_ok : 1, saw_accecn_opt : 2, syn_ect_snt: 2, syn_ect_rcv: 2; @@ -234,6 +233,9 @@ struct tcp_sock { syn_ect_snt:2, /* AccECN ECT memory, only */ syn_ect_rcv:2, /* ... needed durign 3WHS + first seqno */ ecn_fail:1; /* ECN reflector detected path mangling */ + u8 accecn_no_respond:1, /* AccECN no response on feedback */ + accecn_no_options:1, /* AccECN no options sent out */ + first_data_ack:1; /* Check for first data ack */ u8 saw_accecn_opt:2, /* An AccECN option was seen */ fast_ack_mode:2, /* which fast ack mode ? 
*/ unused:4; diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 29e41ff3ec93..d382e540f329 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -62,7 +62,8 @@ struct request_sock { u16 mss; u8 num_retrans; /* number of retransmits */ u8 syncookie:1; /* syncookie: encode tcpopts in timestamp */ - u8 num_timeout:7; /* number of timeouts */ + u8 num_timeout:7, /* number of timeouts */ + is_rtx:1; /* set once a SYN/ACK retransmission is attempted */ u32 ts_recent; struct timer_list rsk_timer; const struct request_sock_ops *rsk_ops; @@ -105,6 +106,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, sk_tx_queue_clear(req_to_sk(req)); req->saved_syn = NULL; req->num_timeout = 0; + req->is_rtx = 0; req->num_retrans = 0; req->sk = NULL; refcount_set(&req->rsk_refcnt, 0); diff --git a/include/net/tcp.h b/include/net/tcp.h index f26a931d2ae2..6745b64ca705 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -432,8 +432,8 @@ static inline int tcp_accecn_extract_syn_ect(u8 ace) } bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, u8 sent_ect); -bool tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, - struct request_sock *req, u8 syn_ect_snt); +void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, + u8 syn_ect_snt); u8 tcp_accecn_option_init(const struct sk_buff *skb, u8 opt_offset); void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, u32 payload_len); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a53f9bf7886f..9d20f2456cf2 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -688,6 +688,7 @@ static void syn_ack_recalc(struct request_sock *req, int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) { + req->is_rtx = 1; int err = req->rsk_ops->rtx_syn_ack(parent, req); if (!err) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9721d7f0db9b..3f7a83eb6fae 100644 --- a/net/ipv4/tcp.c +++ 
b/net/ipv4/tcp.c @@ -3033,6 +3033,9 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered_ce = 0; tp->saw_accecn_opt = 0; tp->ecn_fail = 0; + tp->accecn_no_respond = 0; + tp->accecn_no_options = 0; + tp->first_data_ack = 0; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fdd58702b736..c2c72a2688ef 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -449,11 +449,11 @@ static void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, else tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); break; - // [CY] 3.1.2. Backward Compatibility - If a TCP Client has sent a SYN requesting AccECN feedback with - // (AE,CWR,ECE) = (1,1,1) then receives a SYN/ACK with the currently reserved combination (AE,CWR,ECE) - // = (1,0,1) but it does not have logic specific to such a combination, the Client MUST enable AccECN - // mode as if the SYN/ACK confirmed that the Server supported AccECN and as if it fed back that the - // IP-ECN field on the SYN had arrived unchanged. + /* [CY] 3.1.2. Backward Compatibility - If a TCP Client has sent a SYN requesting AccECN feedback with (AE,CWR,ECE) = + * (1,1,1) then receives a SYN/ACK with the currently reserved combination (AE,CWR,ECE) = (1,0,1) but it does not + * have logic specific to such a combination, the Client MUST enable AccECN mode as if the SYN/ACK confirmed that the + * Server supported AccECN and as if it fed back that the IP-ECN field on the SYN had arrived unchanged. 
+ */ case 0x5: if (tcp_ecn_mode_pending(tp)) { tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); @@ -595,7 +595,7 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, bool order1, res; unsigned int i; - if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL) + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL || tp->accecn_no_respond) return false; if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { @@ -703,6 +703,22 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, if (flag & FLAG_SYN_ACKED) return 0; + /* [CY] 3.2.2.4. Testing for Zeroing of the ACE Field - If AccECN has been successfully negotiated, the Data Sender + * MAY check the value of the ACE counter in the first feedback packet (with or without data) that arrives after the + * 3-way handshake. If the value of this ACE field is found to be zero (0b000), for the remainder of the half- + * connection the Data Sender ought to send non-ECN-capable packets and it is advised not to respond to any feedback + * of CE markings. + */ + if (!tp->first_data_ack) { + tp->first_data_ack = 1; + if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0) { + tp->ecn_fail = 1; + INET_ECN_dontxmit(sk); + tp->accecn_no_respond = 1; + return 0; + } + } + if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA) inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; @@ -4893,8 +4909,18 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * DSACK state and change the txhash to re-route speculatively. */ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && - sk_rethink_txhash(sk)) + sk_rethink_txhash(sk)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); + /* [CY] 3.2.3.2.2. Testing for Loss of Packets Carrying the AccECN Option - If a middlebox is dropping + * packets with options it does not recognize, a host that is sending little or no data but mostly pure + * ACKs will not inherently detect such losses. 
Such a host MAY detect loss of ACKs carrying the AccECN + * Option by detecting whether the acknowledged data always reappears as a retransmission. In such cases, + * the host SHOULD disable the sending of the AccECN Option for this half-connection. + */ + if (tcp_ecn_mode_accecn(tcp_sk(sk))) + tcp_sk(sk)->accecn_no_options = 1; + + } } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) @@ -6235,6 +6261,11 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { send_accecn_reflector = true; + /* [CY] 3.1.5. Implications of AccECN Mode - A host in AccECN mode that is feeding back the IP-ECN + * field on a SYN or SYN/ACK: MUST feed back the IP-ECN field on the latest valid SYN or acceptable + * SYN/ACK to arrive. + */ + tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; if (tp->rx_opt.accecn && tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { tp->saw_accecn_opt = tcp_accecn_option_init(skb, @@ -7017,7 +7048,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_initialize_rcv_mss(sk); if (tcp_ecn_mode_accecn(tp)) - tcp_accecn_third_ack(sk, skb, req, tp->syn_ect_snt); + tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); tcp_fast_path_on(tp); break; @@ -7218,7 +7249,6 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->last_oow_ack_time = 0; - tcp_rsk(req)->noect = 0; tcp_rsk(req)->accecn_ok = 0; tcp_rsk(req)->saw_accecn_opt = 0; tcp_rsk(req)->syn_ect_rcv = 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8450748d6873..ba104fb82c7d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -397,26 +397,21 @@ void tcp_openreq_init_rwin(struct request_sock *req, } EXPORT_SYMBOL(tcp_openreq_init_rwin); -bool tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, - struct request_sock *req, u8 syn_ect_snt) +void 
tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, + u8 syn_ect_snt) { u8 ace = tcp_accecn_ace(tcp_hdr(skb)); struct tcp_sock *tp = tcp_sk(sk); - bool verify_ace = true; switch (ace) { case 0x0: + /* [CY] 3.2.2.1. ACE Field on the ACK of the SYN/ACK - If the Server is in AccECN mode and in SYN-RCVD + * state, and if it receives a value of zero on a pure ACK with SYN=0 and no SACK blocks, for the rest + * of the connection the Server MUST NOT set ECT on outgoing packets and MUST NOT respond to AccECN + * feedback. Nonetheless, as a Data Receiver it MUST NOT disable AccECN feedback. + */ tp->ecn_fail = 1; - // [CY] 3.2.2.1. ACE Field on the ACK of the SYN/ACK - If the Server is in AccECN mode and in SYN-RCVD - // state, and if it receives a value of zero on a pure ACK with SYN=0 and no SACK blocks, for the rest - // of the connection the Server MUST NOT set ECT on outgoing packets and MUST NOT respond to AccECN - // feedback. Nonetheless, as a Data Receiver it MUST NOT disable AccECN feedback. - if (!TCP_SKB_CB(skb)->sacked) { - inet_rsk(req)->ecn_ok = 0; - tcp_rsk(req)->accecn_ok = 0; - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); - verify_ace = false; - } + tp->accecn_no_respond = 1; break; case 0x7: case 0x5: @@ -434,37 +429,32 @@ bool tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, } break; } - return verify_ace; } static void tcp_ecn_openreq_child(struct sock *sk, - struct request_sock *req, + const struct request_sock *req, const struct sk_buff *skb) { - struct tcp_request_sock *treq = tcp_rsk(req); + const struct tcp_request_sock *treq = tcp_rsk(req); struct tcp_sock *tp = tcp_sk(sk); - // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on - // any packet for the rest of the connection, if it has received or sent at least one valid - // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. 
- if (treq->noect) { - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); - } else { - if (treq->accecn_ok) { + /* [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on + * any packet for the rest of the connection, if it has received or sent at least one valid + * SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. + */ + if (treq->accecn_ok) { const struct tcphdr *th = (const struct tcphdr *)skb->data; - if (tcp_accecn_third_ack(sk, skb, req, treq->syn_ect_snt)) { - tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); - tp->syn_ect_snt = treq->syn_ect_snt; - tp->saw_accecn_opt = treq->saw_accecn_opt; - tp->prev_ecnfield = treq->syn_ect_rcv; - tp->accecn_opt_demand = 1; - tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); - } - } else { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_snt = treq->syn_ect_snt; + tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->saw_accecn_opt = treq->saw_accecn_opt; + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); + } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok && !tcp_ca_no_fallback_rfc3168(sk) ? TCP_ECN_MODE_RFC3168 : - TCP_ECN_DISABLED); - } + TCP_ECN_DISABLED); } } @@ -717,17 +707,17 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, &tcp_rsk(req)->last_oow_ack_time)) { if (tcp_rsk(req)->accecn_ok) { + /* [CY] 3.1.5 Implications of AccECN Mode - A host in AccECN mode that is feeding back the IP-ECN + * field on a SYN or SYN/ACK: MUST feed back the IP-ECN field on the latest valid SYN or acceptable + * SYN/ACK to arrive. + */ + tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0) { - // [CY] 3.1.5. 
Implications of AccECN Mode - A TCP Server already in AccECN mode: SHOULD - // acknowledge a valid SYN arriving with (AE,CWR,ECE) =(0,0,0) by emitting an AccECN SYN/ACK (with - // the appropriate combination of TCP-ECN flags to feed back the IP-ECN field of this latest SYN) - tcp_sk(sk)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; - - // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on - // any packet for the rest of the connection, if it has received or sent at least one valid - // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. - tcp_rsk(req)->noect = 1; - INET_ECN_dontxmit(sk); + /* [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on + * any packet for the rest of the connection, if it has received or sent at least one valid + * SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake + */ + tcp_sk(sk)->ecn_fail = 1; } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 99ae746753eb..4806686bcb49 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -378,26 +378,26 @@ static void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) } static void -tcp_ecn_make_synack(struct sock *sk, struct request_sock *req, struct tcphdr *th) +tcp_ecn_make_synack(struct sock *sk, const struct request_sock *req, struct tcphdr *th) { - if (req->num_timeout < 2) { + if (!req->is_rtx || req->num_timeout < 1) { if (tcp_rsk(req)->accecn_ok) tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); else if (inet_rsk(req)->ecn_ok) th->ece = 1; } else if (tcp_rsk(req)->accecn_ok) { - // [CY] 3.2.3.2.2. Testing for Loss of Packets Carrying the AccECN Option - If this retransmission times out, - // to expedite connection setup, the TCP Server SHOULD retransmit the SYN/ACK with (AE,CWR,ECE) = (0,0,0) and - // no AccECN Option, but it remains in AccECN feedback mode + /* [CY] 3.2.3.2.2. 
Testing for Loss of Packets Carrying the AccECN Option - If this retransmission times out, + * to expedite connection setup, the TCP Server SHOULD retransmit the SYN/ACK with (AE,CWR,ECE) = (0,0,0) and + * no AccECN Option, but it remains in AccECN feedback mode + */ th->ae = 0; th->cwr = 0; th->ece = 0; - - // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on - // any packet for the rest of the connection, if it has received or sent at least one valid - // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. - tcp_rsk(req)->noect = 1; - INET_ECN_dontxmit(sk); + /* [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on any packet for + * the rest of the connection, if it has received or sent at least one valid SYN or Acceptable SYN/ACK with + * (AE,CWR,ECE) = (0,0,0) during the handshake. + */ + tcp_sk(sk)->ecn_fail = 1; } } @@ -1105,10 +1105,11 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); - // [CY] 3.2.3.2.2. Testing for Loss of Packets Carrying the AccECN Option - TCP Server SHOULD retransmit the - // SYN/ACK, but with no AccECN Option + /* [CY] 3.2.3.2.2. 
Testing for Loss of Packets Carrying the AccECN Option - TCP Server SHOULD retransmit the + * SYN/ACK, but with no AccECN Option + */ if (treq->accecn_ok && sock_net(sk)->ipv4.sysctl_tcp_ecn_option && - req->num_timeout < 1 && (remaining >= TCPOLEN_ACCECN_BASE)) { + !req->is_rtx && (remaining >= TCPOLEN_ACCECN_BASE)) { opts->ecn_bytes = synack_ecn_bytes; remaining -= tcp_options_fit_accecn(opts, 0, remaining, tcp_synack_options_combine_saving(opts)); @@ -1188,7 +1189,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (tcp_ecn_mode_accecn(tp) && sock_net(sk)->ipv4.sysctl_tcp_ecn_option && - (tp->saw_accecn_opt && tp->saw_accecn_opt != TCP_ACCECN_OPT_FAIL)) { + (tp->saw_accecn_opt && tp->saw_accecn_opt != TCP_ACCECN_OPT_FAIL && !tp->accecn_no_options)) { if (sock_net(sk)->ipv4.sysctl_tcp_ecn_option >= 2 || tp->accecn_opt_demand || tcp_accecn_option_beacon_check(sk)) { @@ -3452,12 +3453,20 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) tcp_retrans_try_collapse(sk, skb, avail_wnd); } - /* RFC3168, section 6.1.1.1. ECN fallback - * As AccECN uses the same SYN flags (+ AE), this check covers both - * cases. + /* [CY] 3.1.4.1. Retransmitted SYNs - If the sender of an AccECN SYN (the TCP Client) times out before receiving the SYN/ACK, + * it SHOULD attempt to negotiate the use of AccECN at least one more time by continuing to set all three TCP ECN flags + * (AE,CWR,ECE) = (1,1,1) on the first retransmitted SYN (using the usual retransmission time-outs). If this first + * retransmission also fails to be acknowledged, in deployment scenarios where AccECN path traversal might be problematic, the + * TCP Client SHOULD send subsequent retransmissions of the SYN with the three TCP-ECN flags cleared (AE,CWR,ECE) = (0,0,0). 
*/ - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) - tcp_ecn_clear_syn(sk, skb); + if (!tcp_ecn_mode_pending(tp) || icsk->icsk_retransmits > 1) { + /* RFC3168, section 6.1.1.1. ECN fallback + * As AccECN uses the same SYN flags (+ AE), this check covers both + * cases. + */ + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) + tcp_ecn_clear_syn(sk, skb); + } /* Update global and local TCP statistics. */ segs = tcp_skb_pcount(skb);