Skip to content

Commit

Permalink
tc: add dualpi2 scheduler module
Browse files Browse the repository at this point in the history
DUALPI2 AQM is a combination of the DUALQ Coupled-AQM with a PI2
base-AQM. The PI2 AQM is in turn both an extension and a simplification
of the PIE AQM. PI2 makes quite some PIE heuristics unnecessary, while
being able to control scalable congestion controls like TCP-Prague.
With PI2, both Reno/Cubic can be used in parallel with Prague,
maintaining window fairness. DUALQ provides latency separation between
low latency Prague flows and Reno/Cubic flows that need a bigger queue.

This patch adds support to tc to configure it through its netlink
interface.

Signed-off-by: Olga Albisser <[email protected]>
Co-developed-by: Koen De Schepper <[email protected]>
Signed-off-by: Koen De Schepper <[email protected]>
Co-developed-by: Oliver Tilmans <[email protected]>
Signed-off-by: Oliver Tilmans <[email protected]>
Signed-off-by: Bob Briscoe <[email protected]>
Co-developed-by: Henrik Steen <[email protected]>
Signed-off-by: Henrik Steen <[email protected]>
Co-developed-by: Chia-Yu Chang <[email protected]>
Signed-off-by: Chia-Yu Chang <[email protected]>
  • Loading branch information
olgaalb authored and minuscat committed Oct 29, 2024
1 parent 2d35b77 commit 6ed3fa2
Show file tree
Hide file tree
Showing 8 changed files with 783 additions and 15 deletions.
9 changes: 8 additions & 1 deletion bash-completion/tc
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

# Space-delimited list of qdisc kinds. Keep a space before the closing quote,
# as FILTER_KIND and ACTION_KIND do, so the last entry stays delimited.
QDISC_KIND=' choke codel bfifo pfifo pfifo_head_drop fq fq_codel gred hhf \
mqprio multiq netem pfifo_fast pie fq_pie red sfb sfq tbf \
drr hfsc htb prio qfq '
drr hfsc htb prio qfq dualpi2 '
FILTER_KIND=' basic bpf cgroup flow flower fw route u32 matchall '
ACTION_KIND=' gact mirred bpf sample '

Expand Down Expand Up @@ -366,6 +366,13 @@ _tc_qdisc_options()
_tc_once_attr 'default r2q direct_qlen debug'
return 0
;;
dualpi2)
# Options that take a value.
_tc_once_attr 'limit coupling_factor step_thresh classic_protection \
max_rtt typical_rtt target tupdate alpha beta'
# Mutually exclusive keyword pairs, as listed in tc-dualpi2(8).
_tc_one_of_list 'drop_on_overload overflow'
_tc_one_of_list 'drop_enqueue drop_dequeue'
_tc_one_of_list 'l4s_ect any_ect'
_tc_one_of_list 'split_gso no_split_gso'
# Report success explicitly, like the other qdisc arms do.
return 0
;;
multiq|pfifo_fast|drr|qfq)
return 0
;;
Expand Down
34 changes: 34 additions & 0 deletions include/uapi/linux/pkt_sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -1208,4 +1208,38 @@ enum {

#define TCA_ETS_MAX (__TCA_ETS_MAX - 1)

/* DUALPI2: attribute types used to configure the dualpi2 qdisc over netlink */
enum {
TCA_DUALPI2_UNSPEC,
TCA_DUALPI2_LIMIT, /* Packets */
TCA_DUALPI2_TARGET, /* us */
TCA_DUALPI2_TUPDATE, /* us */
TCA_DUALPI2_ALPHA, /* Hz scaled up by 256 */
TCA_DUALPI2_BETA, /* Hz scaled up by 256 */
TCA_DUALPI2_STEP_THRESH, /* Packets or us */
TCA_DUALPI2_STEP_PACKETS, /* Whether STEP_THRESH is in packets */
TCA_DUALPI2_COUPLING, /* Coupling factor between queues */
TCA_DUALPI2_DROP_OVERLOAD, /* Whether to drop on overload */
TCA_DUALPI2_DROP_EARLY, /* Whether to drop on enqueue */
TCA_DUALPI2_C_PROTECTION, /* Percentage */
TCA_DUALPI2_ECN_MASK, /* L4S queue classification mask */
TCA_DUALPI2_SPLIT_GSO, /* Split aggregated packets */
TCA_DUALPI2_PAD, /* NOTE(review): presumably an alignment-padding attribute; confirm */
__TCA_DUALPI2_MAX /* Sentinel: one past the last attribute type */
};

/* Highest attribute type defined in the enum above */
#define TCA_DUALPI2_MAX (__TCA_DUALPI2_MAX - 1)

/*
 * DUALPI2 statistics record. "C" is the classic queue and "L" the L4S queue.
 * NOTE(review): presumably filled in by the kernel qdisc and printed by
 * "tc -s"; units of the delay fields are not stated here - confirm.
 */
struct tc_dualpi2_xstats {
__u32 prob; /* current probability */
__u32 delay_c; /* current delay in C queue */
__u32 delay_l; /* current delay in L queue */
__s32 credit; /* current c_protection credit */
__u32 packets_in_c; /* number of packets enqueued in C queue */
__u32 packets_in_l; /* number of packets enqueued in L queue */
__u32 maxq; /* maximum queue size */
__u32 ecn_mark; /* packets marked with ECN */
__u32 step_marks; /* ECN marks due to the step AQM */
};

#endif
2 changes: 2 additions & 0 deletions include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ int read_prop(const char *dev, char *prop, long *value);
int get_long(long *val, const char *arg, int base);
int get_integer(int *val, const char *arg, int base);
int get_unsigned(unsigned *val, const char *arg, int base);
int get_float(float *val, const char *arg);
int get_float_min_max(float *val, const char *arg, float min, float max);
int get_time_rtt(unsigned *val, const char *arg, int *raw);
#define get_byte get_u8
#define get_ushort get_u16
Expand Down
14 changes: 0 additions & 14 deletions ip/iplink_can.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,20 +61,6 @@ static void usage(void)
print_usage(stderr);
}

/*
 * Convert the whole of @arg to a float and store it in @val.
 * Returns 0 on success; returns -1 and leaves @val untouched when @arg is
 * NULL, empty, or not entirely consumed by the conversion.
 */
static int get_float(float *val, const char *arg)
{
	char *tail;
	float number;

	if (arg && *arg) {
		number = strtof(arg, &tail);
		/* Accept only if something was parsed and nothing is left over. */
		if (tail && tail != arg && *tail == '\0') {
			*val = number;
			return 0;
		}
	}
	return -1;
}

static void set_ctrlmode(char *name, char *arg,
struct can_ctrlmode *cm, __u32 flags)
{
Expand Down
30 changes: 30 additions & 0 deletions lib/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,36 @@ int get_unsigned(unsigned int *val, const char *arg, int base)
return 0;
}

/*
 * Parse @arg as a float and store the result in @val.
 *
 * The entire string must be consumed by the conversion. Returns 0 on
 * success; returns -1 without modifying @val if @arg is NULL or empty,
 * contains no number, or has trailing characters after the number.
 */
int get_float(float *val, const char *arg)
{
	char *end = NULL;
	float parsed;

	if (arg == NULL || arg[0] == '\0')
		return -1;

	parsed = strtof(arg, &end);

	/* Nothing converted, or garbage following the number. */
	if (end == NULL || end == arg || end[0] != '\0')
		return -1;

	*val = parsed;
	return 0;
}

/*
 * Parse @arg as a float and store it in @val if it lies within [min, max].
 *
 * Returns 0 on success. Returns -1, leaving @val untouched, when @arg is
 * NULL or empty, is not entirely a valid float, or the parsed value falls
 * outside [min, max]. NaN is treated as out of range.
 */
int get_float_min_max(float *val, const char *arg, float min, float max)
{
	float res;
	char *ptr;

	if (!arg || !*arg)
		return -1;
	res = strtof(arg, &ptr);
	if (!ptr || ptr == arg || *ptr)
		return -1;
	/*
	 * Deliberately a negated "in range" test: every ordered comparison
	 * with NaN is false, so "res < min || res > max" would let "nan"
	 * slip through the bounds check.
	 */
	if (!(res >= min && res <= max))
		return -1;
	*val = res;
	return 0;
}

/*
* get_time_rtt is "translated" from a similar routine "get_time" in
* tc_util.c. We don't use the exact same routine because tc passes
Expand Down
231 changes: 231 additions & 0 deletions man/man8/tc-dualpi2.8
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
.TH DUALPI2 8 "29 Oct 2024" "iproute2" "Linux"

.SH NAME
DUALPI2 \- Dual Queue Proportional Integral Controller AQM - Improved with a square
.SH SYNOPSIS
.sp
.ad l
.in +8
.ti -8
.BR tc " " qdisc " ... " dualpi2
.br
.RB "[ " limit
.IR PACKETS " ]"
.br
.RB "[ " coupling_factor
.IR NUMBER " ]"
.br
.RB "[ " step_thresh
.IR TIME | PACKETS " ]"
.br
.RB "[ " drop_on_overload " | " overflow " ]"
.br
.RB "[ " drop_enqueue " | " drop_dequeue " ]"
.br
.RB "[ " l4s_ect " | " any_ect " ]"
.br
.RB "[ " classic_protection
.IR PERCENTAGE " ] "
.br
.RB "[ " max_rtt
.IR TIME
.RB " [ " typical_rtt
.IR TIME " ]] "
.br
.RB "[ " target
.IR TIME " ]"
.br
.RB "[ " tupdate
.IR TIME " ]"
.br
.RB "[ " alpha
.IR float " ]"
.br
.RB "[ " beta
.IR float " ] "
.br
.RB "[ " split_gso " | " no_split_gso " ]"

.SH DESCRIPTION
DUALPI2 AQM is a combination of the DUALQ Coupled-AQM with a PI2 base-AQM. The PI2 AQM (details can be found in the paper cited below) is in turn both an extension and a simplification of the PIE AQM. PI2 makes many of the PIE heuristics unnecessary, while also being able to control scalable congestion controls like TCP-Prague. With PI2, classic congestion controls such as Reno and Cubic can be used in parallel with Prague while maintaining window fairness. DUALQ provides latency separation between low-latency Prague flows and Reno/Cubic flows that need a bigger queue. The main design goals are:
.PD 0
.IP \(bu 4
L4S - Low Loss, Low Latency and Scalable congestion control support
.IP \(bu 4
DualQ option to separate the L4S traffic in a low latency queue, without harming remaining traffic that is scheduled in classic queue due to congestion-coupling
.IP \(bu 4
Configurable overload strategies
.IP \(bu 4
Use of sojourn time to reliably estimate queue delay
.IP \(bu 4
Simple implementation
.IP \(bu 4
Guaranteed stability and fast responsiveness
.PP
The detailed PI2 parameters (alpha, beta, and tupdate) of DualPI2 are hard to get right and typically give bad results if just tried or guessed. These parameters need to be calculated to a coherent set with a typical objective in mind. DualPI2 has a set of default parameters that can be used for the general Internet, where the maximum RTT is around 100ms and the typical RTT is around 15ms. It is highly recommended to use
.I "" max_rtt
and
.I "" typical_rtt
(or target) helper parameters if your deployment is deviating from the above objectives (e.g., in a data center). These helpers are used to provide the theoretically optimal PI2 parameters (alpha, beta, and tupdate) for those objectives, and that can be used as a basis for further finetuning, experimentation, and testing if desired.

.SH ALGORITHM
DUALPI2 is designed to provide low loss and low latency to L4S traffic, without harming classic traffic. Every update interval, a new internal base probability is calculated based on queue delay. The base probability is updated with a delta based on the difference between the current queue delay and the
.I "" target
delay, and the queue growth compared with the queuing delay during the previous
.I "" tupdate
interval. The integral gain factor
.RB "" alpha
is used to gradually correct any persistent standing queue error relative to the user-specified target delay, while the proportional gain factor
.RB "" beta
is used to quickly compensate for queue changes (growth or shrink).

The updated base probability is used as input to decide to mark and drop packets. DUALPI2 scales the calculated probability for each of the two queues accordingly. For the L4S queue, the probability is multiplied by a
.RB "" coupling_factor
, while for the classic queue, it is squared to compensate for the square-root rate equation of Reno/Cubic. The ECT identifier (
.RB "" l4s_ect | any_ect
) is used to classify traffic into respective queues.

If DUALPI2 AQM has detected overload (when excessive non-responsive traffic is sent), it can either signal congestion solely by dropping packets, irrespective of the ECN field (
.RB "" drop_on_overload
), or limit the drop probability and let the queue grow and eventually overflow like tail-drop (
.RB "" overflow
).

Additional details can be found in the RFC cited below.

.SH PARAMETERS
.TP
.BI limit " PACKETS"
Limit the number of packets that can be enqueued. Incoming packets are dropped when this limit is reached. This limit is common for the L4S and Classic queues. Defaults to
.I 10000
packets. This is about 125ms delay on a 1Gbps link.
.PD
.TP
.BI coupling_factor " NUMBER"
Set the coupling rate factor between Classic and L4S. Defaults to
.I 2
.PD
.TP
.BI l4s_ect | any_ect
Configures the ECT classifier. Packets whose ECT codepoint matches this are sent to the L4S queue, where they receive a scalable marking. Defaults to
.I l4s_ect
, i.e., the L4S identifier ECT(1). Setting this to
.I any_ect
causes all packets whose ECN field is not zero to be sent to the L4S queue. This enables it to be backward compatible with, e.g., DCTCP. Note DCTCP should only be used for intra-DC traffic with very low RTTs and AQM delay targets bigger than those RTTs, separated from Internet traffic (also if Prague compliant CC), as it does not support all Prague requirements that make sure that a congestion control can work well with the range of RTTs on the Internet.
.PD
.TP
.BI step_thresh " TIME | PACKETS"
Set the step threshold for the L4S queue. This will cause packets with a sojourn time exceeding the threshold to always be marked. This value can either be specified using time units (i.e., us, ms, s), or in packets (p, pkt, packet(s)). A value without units is assumed to be in time (us). If defining the step in packets, be sure to disable GRO on the ingress interfaces. Defaults to
.I 1ms
.PD
.TP
.B drop_on_overload | overflow
Control the overload strategy.
.I drop_on_overload
preserves the delay in the L4S queue by dropping in both queues on overload.
.I overflow
sacrifices delay to avoid losses, eventually resulting in a taildrop behavior once the
.I limit
is reached. Defaults to
.I drop_on_overload
.PD
.TP
.B drop_enqueue | drop_dequeue
Decide when packets are PI-based dropped or marked. The
.I step_thresh
based L4S marking is always at dequeue. Defaults to
.I drop_dequeue
.PD
.TP
.BI classic_protection " PERCENTAGE"
Protects the classic queue from unresponsive traffic in the L4S queue. This bounds the maximal scheduling delay in the C queue to be
.I (100 - PERCENTAGE)
times greater than the one in the L queue. Defaults to
.I 10
.TP
.BI typical_rtt " TIME"
.PD 0
.TP
.PD
.BI max_rtt " TIME"
Specify the maximum round trip time (RTT) and/or the typical RTT of the traffic that will be controlled by DUALPI2. These values are specified using time units (i.e., us, ms, s). A value without units is assumed to be in us. If either
.I max_rtt
or
.I typical_rtt
is not specified, the missing value will be computed from the following relationship:
.I max_rtt = typical_rtt * 6.
If any of these parameters is given, it will be used to automatically compute suitable values for
.I alpha, beta, target, and tupdate,
according to the relationships in Appendix A.1 of the IETF RFC cited below, to achieve stable control. Consequently, these derived values will override any user-provided ones. The default range of operation for the qdisc uses
.I max_rtt = 100ms
and
.I typical_rtt = 15ms
, which is suited to controlling Internet traffic.
.TP
.BI target " TIME"
Set the expected queue delay. Defaults to
.I 15
ms. A value without units is assumed to be in us.
.TP
.BI tupdate " TIME"
Set the interval at which the system drop probability is recalculated. Defaults to
.I 16
ms. A value without units is assumed to be in us. This should be less than a third of the max RTT supported.
.TP
.BI alpha " float"
.PD 0
.TP
.PD
.BI beta " float"
Set alpha and beta, the integral and proportional gain factors in Hz for the PI controller. These can be calculated based on control theory. Defaults are
.I 0.16
and
.I 3.2
Hz, which provide stable control for RTT's up to 100ms with tupdate of 16ms. Be aware, unlike with PIE, these are the real unscaled gain factors. If not provided, they will be automatically derived from
.I typical_rtt and max_rtt
, if one of them or both are provided.
.PD
.TP
.B split_gso | no_split_gso
Decide how to handle aggregated packets. Either treat the aggregate as a single packet (thus all share fate with respect to marks and drops) with
.I no_split_gso
, trading some tail latency for CPU usage, or treat each packet individually (i.e., split them) with
.I split_gso
to finely mark/drop and control queueing latencies. Defaults to
.I split_gso

.SH EXAMPLES
Setting DUALPI2 for the Internet with default parameters:
# sudo tc qdisc add dev eth0 root dualpi2

Setting DUALPI2 for datacenter with legacy DCTCP using ECT(0):
# sudo tc qdisc add dev eth0 root dualpi2 any_ect

.SH FILTERS
This qdisc can be used in conjunction with tc-filters. More precisely, it will honor filters "stealing packets", as well as accept other classification schemes.
.BR
.TP
Packets whose priority/classid are set to
.I 1
will be enqueued in the L queue, alongside L4S traffic, and thus subject to the increased marking probability (or drops if they are marked not-ECT).
.BR
.TP
Packets whose priority/classid are set to
.I 2
will also be enqueued in the L queue, but will never be dropped if they are not-ECT (unless the qdisc is full and thus resorts to taildrop).
.BR
.TP
Finally, all the other classid/priority map to the classic queue.

.SH SEE ALSO
.BR tc (8),
.BR tc-pie (8)

.SH SOURCES
.IP \(bu 4
IETF RFC9332 : https://datatracker.ietf.org/doc/html/rfc9332
.IP \(bu 4
CoNEXT '16 Proceedings of the 12th International on Conference on emerging Networking EXperiments and Technologies : "PI2: A Linearized AQM for both Classic and Scalable TCP"

.SH AUTHORS
DUALPI2 was implemented by Koen De Schepper, Olga Albisser, Henrik Steen, Olivier Tilmans, and Chia-Yu Chang, also the authors of this man page. Please report bugs and corrections to the Linux networking development mailing list at <[email protected]>.
1 change: 1 addition & 0 deletions tc/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ SHARED_LIBS ?= y

TCMODULES :=
TCMODULES += q_fifo.o
TCMODULES += q_dualpi2.o
TCMODULES += q_sfq.o
TCMODULES += q_red.o
TCMODULES += q_prio.o
Expand Down
Loading

0 comments on commit 6ed3fa2

Please sign in to comment.