From a394ebb79add03c401b879c2c363cfb99ebf8a87 Mon Sep 17 00:00:00 2001
From: Gray Liang <gray.liang@isovalent.com>
Date: Thu, 11 Apr 2024 01:32:51 +0800
Subject: [PATCH 01/11] bpf: parse_transport() accepts *l3proto

---
 control/kern/tproxy.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/control/kern/tproxy.c b/control/kern/tproxy.c
index 5b74c299e..0905f83b9 100644
--- a/control/kern/tproxy.c
+++ b/control/kern/tproxy.c
@@ -553,7 +553,7 @@ static __always_inline int
 parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
 		struct ethhdr *ethh, struct iphdr *iph, struct ipv6hdr *ipv6h,
 		struct icmp6hdr *icmp6h, struct tcphdr *tcph,
-		struct udphdr *udph, __u8 *ihl, __u8 *l4proto)
+		struct udphdr *udph, __u8 *ihl, __u16 *l3proto, __u8 *l4proto)
 {
 	__u32 offset = 0;
 	int ret;
@@ -571,6 +571,7 @@ parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
 		__builtin_memset(ethh, 0, sizeof(struct ethhdr));
 		ethh->h_proto = skb->protocol;
 	}
+	*l3proto = ethh->h_proto;
 
 	*ihl = 0;
 	*l4proto = 0;
@@ -994,13 +995,14 @@ int tproxy_lan_egress(struct __sk_buff *skb)
 	struct tcphdr tcph;
 	struct udphdr udph;
 	__u8 ihl;
+	__u16 l3proto;
 	__u8 l4proto;
 	__u32 link_h_len;
 
 	if (get_link_h_len(skb->ifindex, &link_h_len))
 		return TC_ACT_OK;
 	int ret = parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
-				  &tcph, &udph, &ihl, &l4proto);
+				  &tcph, &udph, &ihl, &l3proto, &l4proto);
 	if (ret) {
 		bpf_printk("parse_transport: %d", ret);
 		return TC_ACT_OK;
@@ -1022,13 +1024,14 @@ int tproxy_lan_ingress(struct __sk_buff *skb)
 	struct tcphdr tcph;
 	struct udphdr udph;
 	__u8 ihl;
+	__u16 l3proto;
 	__u8 l4proto;
 	__u32 link_h_len;
 
 	if (get_link_h_len(skb->ifindex, &link_h_len))
 		return TC_ACT_OK;
 	int ret = parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
-				  &tcph, &udph, &ihl, &l4proto);
+				  &tcph, &udph, &ihl, &l3proto, &l4proto);
 	if (ret) {
 		bpf_printk("parse_transport: %d", ret);
 		return TC_ACT_OK;
@@ -1311,13 +1314,14 @@ int tproxy_wan_ingress(struct __sk_buff *skb)
 	struct tcphdr tcph;
 	struct udphdr udph;
 	__u8 ihl;
+	__u16 l3proto;
 	__u8 l4proto;
 	__u32 link_h_len;
 
 	if (get_link_h_len(skb->ifindex, &link_h_len))
 		return TC_ACT_OK;
 	int ret = parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
-				  &tcph, &udph, &ihl, &l4proto);
+				  &tcph, &udph, &ihl, &l3proto, &l4proto);
 	if (ret)
 		return TC_ACT_OK;
 	if (l4proto != IPPROTO_UDP)
@@ -1354,6 +1358,7 @@ int tproxy_wan_egress(struct __sk_buff *skb)
 	struct tcphdr tcph;
 	struct udphdr udph;
 	__u8 ihl;
+	__u16 l3proto;
 	__u8 l4proto;
 	__u32 link_h_len;
 
@@ -1361,7 +1366,7 @@ int tproxy_wan_egress(struct __sk_buff *skb)
 		return TC_ACT_OK;
 	bool tcp_state_syn;
 	int ret = parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
-				  &tcph, &udph, &ihl, &l4proto);
+				  &tcph, &udph, &ihl, &l3proto, &l4proto);
 	if (ret)
 		return TC_ACT_OK;
 	if (l4proto == IPPROTO_ICMPV6)
@@ -1635,11 +1640,12 @@ int tproxy_dae0_ingress(struct __sk_buff *skb)
 	struct tcphdr tcph;
 	struct udphdr udph;
 	__u8 ihl;
+	__u16 l3proto;
 	__u8 l4proto;
 	__u32 link_h_len = 14;
 
 	if (parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
-			    &tcph, &udph, &ihl, &l4proto))
+			    &tcph, &udph, &ihl, &l3proto, &l4proto))
 		return TC_ACT_OK;
 	struct tuples tuples;
 

From 301c9fbd1d953745593104df0960cb141115a1df Mon Sep 17 00:00:00 2001
From: Gray Liang <gray.liang@isovalent.com>
Date: Thu, 11 Apr 2024 01:35:51 +0800
Subject: [PATCH 02/11] bpf: get_tuples() doesn't need skb parameter

Because we pass l3proto instead, which is parsed by parse_transport().
---
 control/kern/tproxy.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/control/kern/tproxy.c b/control/kern/tproxy.c
index 0905f83b9..1afd8c976 100644
--- a/control/kern/tproxy.c
+++ b/control/kern/tproxy.c
@@ -413,14 +413,15 @@ static __always_inline __u8 ipv6_get_dscp(const struct ipv6hdr *ipv6h)
 }
 
 static __always_inline void
-get_tuples(const struct __sk_buff *skb, struct tuples *tuples,
+get_tuples(struct tuples *tuples,
 	   const struct iphdr *iph, const struct ipv6hdr *ipv6h,
-	   const struct tcphdr *tcph, const struct udphdr *udph, __u8 l4proto)
+	   const struct tcphdr *tcph, const struct udphdr *udph,
+	   __u16 l3proto, __u8 l4proto)
 {
 	__builtin_memset(tuples, 0, sizeof(*tuples));
 	tuples->five.l4proto = l4proto;
 
-	if (skb->protocol == bpf_htons(ETH_P_IP)) {
+	if (l3proto == bpf_htons(ETH_P_IP)) {
 		tuples->five.sip.u6_addr32[2] = bpf_htonl(0x0000ffff);
 		tuples->five.sip.u6_addr32[3] = iph->saddr;
 
@@ -1042,7 +1043,7 @@ int tproxy_lan_ingress(struct __sk_buff *skb)
 	// Prepare five tuples.
 	struct tuples tuples;
 
-	get_tuples(skb, &tuples, &iph, &ipv6h, &tcph, &udph, l4proto);
+	get_tuples(&tuples, &iph, &ipv6h, &tcph, &udph, l3proto, l4proto);
 
 	/*
    * ip rule add fwmark 0x8000000/0x8000000 table 2023
@@ -1330,7 +1331,7 @@ int tproxy_wan_ingress(struct __sk_buff *skb)
 	struct tuples tuples;
 	struct tuples_key reversed_tuples_key;
 
-	get_tuples(skb, &tuples, &iph, &ipv6h, &tcph, &udph, l4proto);
+	get_tuples(&tuples, &iph, &ipv6h, &tcph, &udph, l3proto, l4proto);
 	copy_reversed_tuples(&tuples.five, &reversed_tuples_key);
 
 	if (refresh_udp_conn_state_timer(&reversed_tuples_key))
@@ -1375,7 +1376,7 @@ int tproxy_wan_egress(struct __sk_buff *skb)
 	// Backup for further use.
 	struct tuples tuples;
 
-	get_tuples(skb, &tuples, &iph, &ipv6h, &tcph, &udph, l4proto);
+	get_tuples(&tuples, &iph, &ipv6h, &tcph, &udph, l3proto, l4proto);
 
 	// Normal packets.
 	if (l4proto == IPPROTO_TCP) {
@@ -1649,7 +1650,7 @@ int tproxy_dae0_ingress(struct __sk_buff *skb)
 		return TC_ACT_OK;
 	struct tuples tuples;
 
-	get_tuples(skb, &tuples, &iph, &ipv6h, &tcph, &udph, l4proto);
+	get_tuples(&tuples, &iph, &ipv6h, &tcph, &udph, l3proto, l4proto);
 
 	// reverse the tuple!
 	struct redirect_tuple redirect_tuple = {};

From 40e8a0844bd6ba566cba5a49028d5ef2aaac9fd7 Mon Sep 17 00:00:00 2001
From: Gray Liang <gray.liang@isovalent.com>
Date: Thu, 11 Apr 2024 01:46:36 +0800
Subject: [PATCH 03/11] bpf: redirect_to_control_plane() accepts l3proto

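Rename prep_redirect_to_control_plane() to redirect_to_control_plane(),
since it now calls bpf_redirect() itself and returns its result.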
---
 control/kern/tproxy.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/control/kern/tproxy.c b/control/kern/tproxy.c
index 1afd8c976..fef3c63e7 100644
--- a/control/kern/tproxy.c
+++ b/control/kern/tproxy.c
@@ -937,14 +937,14 @@ static __always_inline int assign_listener(struct __sk_buff *skb, __u8 l4proto)
 	return ret;
 }
 
-static __always_inline void prep_redirect_to_control_plane(
-	struct __sk_buff *skb, __u32 link_h_len, struct tuples *tuples,
-	__u8 l4proto, struct ethhdr *ethh, __u8 from_wan, struct tcphdr *tcph)
+static __always_inline int
+redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len,
+			  struct tuples *tuples,
+			  struct ethhdr *ethh, struct tcphdr *tcph,
+			  __u8 from_wan, __u16 l3proto, __u8 l4proto)
 {
 	/* Redirect from L3 dev to L2 dev, e.g. wg0 -> veth */
 	if (!link_h_len) {
-		__u16 l3proto = skb->protocol;
-
 		bpf_skb_change_head(skb, sizeof(struct ethhdr), 0);
 		bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_proto),
 				    &l3proto, sizeof(l3proto), 0);
@@ -956,7 +956,7 @@ static __always_inline void prep_redirect_to_control_plane(
 
 	struct redirect_tuple redirect_tuple = {};
 
-	if (skb->protocol == bpf_htons(ETH_P_IP)) {
+	if (l3proto == bpf_htons(ETH_P_IP)) {
 		redirect_tuple.sip.u6_addr32[3] = tuples->five.sip.u6_addr32[3];
 		redirect_tuple.dip.u6_addr32[3] = tuples->five.dip.u6_addr32[3];
 	} else {
@@ -981,6 +981,8 @@ static __always_inline void prep_redirect_to_control_plane(
 	skb->cb[1] = 0;
 	if ((l4proto == IPPROTO_TCP && tcph->syn) || l4proto == IPPROTO_UDP)
 		skb->cb[1] = l4proto;
+
+	return bpf_redirect(PARAM.dae0_ifindex, 0);
 }
 
 SEC("tc/egress")
@@ -1192,9 +1194,8 @@ int tproxy_lan_ingress(struct __sk_buff *skb)
 
 	// Assign to control plane.
 control_plane:
-	prep_redirect_to_control_plane(skb, link_h_len, &tuples, l4proto, &ethh,
-				       0, &tcph);
-	return bpf_redirect(PARAM.dae0_ifindex, 0);
+	return redirect_to_control_plane(skb, link_h_len, &tuples, &ethh, &tcph,
+					 0, l3proto, l4proto);
 
 direct:
 	return TC_ACT_OK;
@@ -1601,9 +1602,8 @@ int tproxy_wan_egress(struct __sk_buff *skb)
 		}
 	}
 
-	prep_redirect_to_control_plane(skb, link_h_len, &tuples, l4proto, &ethh,
-				       1, &tcph);
-	return bpf_redirect(PARAM.dae0_ifindex, 0);
+	return redirect_to_control_plane(skb, link_h_len, &tuples, &ethh, &tcph,
+					 1, l3proto, l4proto);
 }
 
 SEC("tc/dae0peer_ingress")

From 468dc2e1ef5763f8eb5e505112ba16a869be8c3e Mon Sep 17 00:00:00 2001
From: Gray Liang <gray.liang@isovalent.com>
Date: Fri, 12 Apr 2024 18:53:43 +0800
Subject: [PATCH 04/11] bpf: set meta in ethhdr instead of skb->cb

This is because:
1. XDP can't operate on skb->cb.
2. Further handling doesn't need the L2 info at all.
---
 control/kern/tproxy.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/control/kern/tproxy.c b/control/kern/tproxy.c
index fef3c63e7..0ab326a94 100644
--- a/control/kern/tproxy.c
+++ b/control/kern/tproxy.c
@@ -171,6 +171,12 @@ struct tuples {
 	__u8 dscp;
 };
 
+struct redirect_meta {
+	__u32 mark;
+	__u8 l4proto;
+	__u8 pad[3];
+};
+
 struct dae_param {
 	__u32 tproxy_port;
 	__u32 control_plane_pid;
@@ -950,10 +956,6 @@ redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len,
 				    &l3proto, sizeof(l3proto), 0);
 	}
 
-	bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_dest),
-			    (void *)&PARAM.dae0peer_mac, sizeof(ethh->h_dest),
-			    0);
-
 	struct redirect_tuple redirect_tuple = {};
 
 	if (l3proto == bpf_htons(ETH_P_IP)) {
@@ -977,10 +979,14 @@ redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len,
 	bpf_map_update_elem(&redirect_track, &redirect_tuple, &redirect_entry,
 			    BPF_ANY);
 
-	skb->cb[0] = TPROXY_MARK;
-	skb->cb[1] = 0;
+	struct redirect_meta *meta = (void *)(long)skb->data;
+	if ((void *)(meta + 1) > (void *)(long)skb->data_end)
+		return TC_ACT_SHOT;
+
+	__builtin_memset(meta, 0, sizeof(*meta));
+	meta->mark = TPROXY_MARK;
 	if ((l4proto == IPPROTO_TCP && tcph->syn) || l4proto == IPPROTO_UDP)
-		skb->cb[1] = l4proto;
+		meta->l4proto = l4proto;
 
 	return bpf_redirect(PARAM.dae0_ifindex, 0);
 }
@@ -1609,9 +1615,14 @@ int tproxy_wan_egress(struct __sk_buff *skb)
 SEC("tc/dae0peer_ingress")
 int tproxy_dae0peer_ingress(struct __sk_buff *skb)
 {
-	/* Only packets redirected from wan_egress or lan_ingress have this cb mark.
+	/* Only packets redirected from wan_egress or lan_ingress have this mark.
    */
-	if (skb->cb[0] != TPROXY_MARK)
+	struct redirect_meta *meta = (void *)(long)skb->data;
+
+	if ((void *)(meta + 1) > (void *)(long)skb->data_end)
+		return TC_ACT_SHOT;
+
+	if (meta->mark != TPROXY_MARK)
 		return TC_ACT_SHOT;
 
 	/* ip rule add fwmark 0x8000000/0x8000000 table 2023
@@ -1620,11 +1631,11 @@ int tproxy_dae0peer_ingress(struct __sk_buff *skb)
 	skb->mark = TPROXY_MARK;
 	bpf_skb_change_type(skb, PACKET_HOST);
 
-	/* l4proto is stored in skb->cb[1] only for UDP and new TCP. As for
+	/* l4proto is stored in meta only for UDP and new TCP. As for
    * established TCP, kernel can take care of socket lookup, so just
    * return them to stack without calling bpf_sk_assign.
    */
-	__u8 l4proto = skb->cb[1];
+	__u8 l4proto = meta->l4proto;
 
 	if (l4proto != 0)
 		assign_listener(skb, l4proto);

From 0d425f905b2f2270768b6baa844b95dd1cf62b40 Mon Sep 17 00:00:00 2001
From: Gray Liang <gray.liang@isovalent.com>
Date: Fri, 12 Apr 2024 19:01:30 +0800
Subject: [PATCH 05/11] bpf: Move three functions to lib/skb.h

We will implement the corresponding functions for XDP soon.
---
 control/kern/lib/skb.h | 226 ++++++++++++++++++++++++++++++++++++++++
 control/kern/tproxy.c  | 228 +----------------------------------------
 2 files changed, 227 insertions(+), 227 deletions(-)
 create mode 100644 control/kern/lib/skb.h

diff --git a/control/kern/lib/skb.h b/control/kern/lib/skb.h
new file mode 100644
index 000000000..2e9a497be
--- /dev/null
+++ b/control/kern/lib/skb.h
@@ -0,0 +1,226 @@
+static __always_inline int
+handle_ipv6_extensions(const struct __sk_buff *skb, __u32 offset, __u32 hdr,
+		       struct icmp6hdr *icmp6h, struct tcphdr *tcph,
+		       struct udphdr *udph, __u8 *ihl, __u8 *l4proto)
+{
+	__u8 hdr_length = 0;
+	__u8 nexthdr = 0;
+	*ihl = sizeof(struct ipv6hdr) / 4;
+	int ret;
+	// We only process TCP and UDP traffic.
+
+	// Unroll can give less instructions but more memory consumption when loading.
+	// We disable it here to support more poor memory devices.
+	// #pragma unroll
+	for (int i = 0; i < IPV6_MAX_EXTENSIONS;
+	     i++, offset += hdr_length, hdr = nexthdr, *ihl += hdr_length / 4) {
+		if (hdr_length % 4) {
+			bpf_printk(
+				"IPv6 extension length is not multiples of 4");
+			return 1;
+		}
+		// See control/control_plane.go.
+
+		switch (hdr) {
+		case IPPROTO_ICMPV6:
+			*l4proto = hdr;
+			hdr_length = sizeof(struct icmp6hdr);
+			// Assume ICMPV6 as a level 4 protocol.
+			ret = bpf_skb_load_bytes(skb, offset, icmp6h,
+						 hdr_length);
+			if (ret) {
+				bpf_printk("not a valid IPv6 packet");
+				return -EFAULT;
+			}
+			return 0;
+
+		case IPPROTO_HOPOPTS:
+		case IPPROTO_ROUTING:
+			ret = bpf_skb_load_bytes(skb, offset + 1, &hdr_length,
+						 sizeof(hdr_length));
+			if (ret) {
+				bpf_printk("not a valid IPv6 packet");
+				return -EFAULT;
+			}
+
+special_n1:
+			ret = bpf_skb_load_bytes(skb, offset, &nexthdr,
+						 sizeof(nexthdr));
+			if (ret) {
+				bpf_printk("not a valid IPv6 packet");
+				return -EFAULT;
+			}
+			break;
+		case IPPROTO_FRAGMENT:
+			hdr_length = 4;
+			goto special_n1;
+		case IPPROTO_TCP:
+		case IPPROTO_UDP:
+			*l4proto = hdr;
+			if (hdr == IPPROTO_TCP) {
+				// Upper layer;
+				ret = bpf_skb_load_bytes(skb, offset, tcph,
+							 sizeof(struct tcphdr));
+				if (ret) {
+					bpf_printk("not a valid IPv6 packet");
+					return -EFAULT;
+				}
+			} else if (hdr == IPPROTO_UDP) {
+				// Upper layer;
+				ret = bpf_skb_load_bytes(skb, offset, udph,
+							 sizeof(struct udphdr));
+				if (ret) {
+					bpf_printk("not a valid IPv6 packet");
+					return -EFAULT;
+				}
+			} else {
+				// Unknown hdr.
+				bpf_printk("Unexpected hdr.");
+				return 1;
+			}
+			return 0;
+		default:
+			/// EXPECTED: Maybe ICMP, etc.
+			// bpf_printk("IPv6 but unrecognized extension protocol: %u", hdr);
+			return 1;
+		}
+	}
+	bpf_printk("exceeds IPV6_MAX_EXTENSIONS limit");
+	return 1;
+}
+
+static __always_inline int
+parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
+		struct ethhdr *ethh, struct iphdr *iph, struct ipv6hdr *ipv6h,
+		struct icmp6hdr *icmp6h, struct tcphdr *tcph,
+		struct udphdr *udph, __u8 *ihl, __u16 *l3proto, __u8 *l4proto)
+{
+	__u32 offset = 0;
+	int ret;
+
+	if (link_h_len == ETH_HLEN) {
+		ret = bpf_skb_load_bytes(skb, offset, ethh,
+					 sizeof(struct ethhdr));
+		if (ret) {
+			bpf_printk("not ethernet packet");
+			return 1;
+		}
+		// Skip ethhdr for next hdr.
+		offset += sizeof(struct ethhdr);
+	} else {
+		__builtin_memset(ethh, 0, sizeof(struct ethhdr));
+		ethh->h_proto = skb->protocol;
+	}
+	*l3proto = ethh->h_proto;
+
+	*ihl = 0;
+	*l4proto = 0;
+	__builtin_memset(iph, 0, sizeof(struct iphdr));
+	__builtin_memset(ipv6h, 0, sizeof(struct ipv6hdr));
+	__builtin_memset(icmp6h, 0, sizeof(struct icmp6hdr));
+	__builtin_memset(tcph, 0, sizeof(struct tcphdr));
+	__builtin_memset(udph, 0, sizeof(struct udphdr));
+
+	// bpf_printk("parse_transport: h_proto: %u ? %u %u", ethh->h_proto,
+	//						bpf_htons(ETH_P_IP),
+	// bpf_htons(ETH_P_IPV6));
+	if (ethh->h_proto == bpf_htons(ETH_P_IP)) {
+		ret = bpf_skb_load_bytes(skb, offset, iph,
+					 sizeof(struct iphdr));
+		if (ret)
+			return -EFAULT;
+		// Skip ipv4hdr and options for next hdr.
+		offset += iph->ihl * 4;
+
+		// We only process TCP and UDP traffic.
+		*l4proto = iph->protocol;
+		switch (iph->protocol) {
+		case IPPROTO_TCP: {
+			ret = bpf_skb_load_bytes(skb, offset, tcph,
+						 sizeof(struct tcphdr));
+			if (ret) {
+				// Not a complete tcphdr.
+				return -EFAULT;
+			}
+		} break;
+		case IPPROTO_UDP: {
+			ret = bpf_skb_load_bytes(skb, offset, udph,
+						 sizeof(struct udphdr));
+			if (ret) {
+				// Not a complete udphdr.
+				return -EFAULT;
+			}
+		} break;
+		default:
+			return 1;
+		}
+		*ihl = iph->ihl;
+		return 0;
+	} else if (ethh->h_proto == bpf_htons(ETH_P_IPV6)) {
+		ret = bpf_skb_load_bytes(skb, offset, ipv6h,
+					 sizeof(struct ipv6hdr));
+		if (ret) {
+			bpf_printk("not a valid IPv6 packet");
+			return -EFAULT;
+		}
+
+		offset += sizeof(struct ipv6hdr);
+
+		return handle_ipv6_extensions(skb, offset, ipv6h->nexthdr,
+					      icmp6h, tcph, udph, ihl, l4proto);
+	} else {
+		/// EXPECTED: Maybe ICMP, MPLS, etc.
+		// bpf_printk("IP but not supported packet: protocol is %u",
+		// iph->protocol);
+		// bpf_printk("unknown link proto: %u", bpf_ntohl(skb->protocol));
+		return 1;
+	}
+}
+
+static __always_inline int
+redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len,
+			  struct tuples *tuples,
+			  struct ethhdr *ethh, struct tcphdr *tcph,
+			  __u8 from_wan, __u16 l3proto, __u8 l4proto)
+{
+	/* Redirect from L3 dev to L2 dev, e.g. wg0 -> veth */
+	if (!link_h_len) {
+		bpf_skb_change_head(skb, sizeof(struct ethhdr), 0);
+		bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_proto),
+				    &l3proto, sizeof(l3proto), 0);
+	}
+
+	struct redirect_tuple redirect_tuple = {};
+
+	if (l3proto == bpf_htons(ETH_P_IP)) {
+		redirect_tuple.sip.u6_addr32[3] = tuples->five.sip.u6_addr32[3];
+		redirect_tuple.dip.u6_addr32[3] = tuples->five.dip.u6_addr32[3];
+	} else {
+		__builtin_memcpy(&redirect_tuple.sip, &tuples->five.sip,
+				 IPV6_BYTE_LENGTH);
+		__builtin_memcpy(&redirect_tuple.dip, &tuples->five.dip,
+				 IPV6_BYTE_LENGTH);
+	}
+	redirect_tuple.l4proto = l4proto;
+	struct redirect_entry redirect_entry = {};
+
+	redirect_entry.ifindex = skb->ifindex;
+	redirect_entry.from_wan = from_wan;
+	__builtin_memcpy(redirect_entry.smac, ethh->h_source,
+			 sizeof(ethh->h_source));
+	__builtin_memcpy(redirect_entry.dmac, ethh->h_dest,
+			 sizeof(ethh->h_dest));
+	bpf_map_update_elem(&redirect_track, &redirect_tuple, &redirect_entry,
+			    BPF_ANY);
+
+	struct redirect_meta *meta = (void *)(long)skb->data;
+	if ((void *)(meta + 1) > (void *)(long)skb->data_end)
+		return TC_ACT_SHOT;
+
+	__builtin_memset(meta, 0, sizeof(*meta));
+	meta->mark = TPROXY_MARK;
+	if ((l4proto == IPPROTO_TCP && tcph->syn) || l4proto == IPPROTO_UDP)
+		meta->l4proto = l4proto;
+
+	return bpf_redirect(PARAM.dae0_ifindex, 0);
+}
diff --git a/control/kern/tproxy.c b/control/kern/tproxy.c
index 0ab326a94..bbfdeb4ef 100644
--- a/control/kern/tproxy.c
+++ b/control/kern/tproxy.c
@@ -407,6 +407,7 @@ struct {
 } udp_conn_state_map SEC(".maps");
 
 // Functions:
+#include "lib/skb.h"
 
 static __always_inline __u8 ipv4_get_dscp(const struct iphdr *iph)
 {
@@ -465,185 +466,6 @@ static __always_inline bool equal16(const __be32 x[4], const __be32 y[4])
 #endif
 }
 
-static __always_inline int
-handle_ipv6_extensions(const struct __sk_buff *skb, __u32 offset, __u32 hdr,
-		       struct icmp6hdr *icmp6h, struct tcphdr *tcph,
-		       struct udphdr *udph, __u8 *ihl, __u8 *l4proto)
-{
-	__u8 hdr_length = 0;
-	__u8 nexthdr = 0;
-	*ihl = sizeof(struct ipv6hdr) / 4;
-	int ret;
-	// We only process TCP and UDP traffic.
-
-	// Unroll can give less instructions but more memory consumption when loading.
-	// We disable it here to support more poor memory devices.
-	// #pragma unroll
-	for (int i = 0; i < IPV6_MAX_EXTENSIONS;
-	     i++, offset += hdr_length, hdr = nexthdr, *ihl += hdr_length / 4) {
-		if (hdr_length % 4) {
-			bpf_printk(
-				"IPv6 extension length is not multiples of 4");
-			return 1;
-		}
-		// See control/control_plane.go.
-
-		switch (hdr) {
-		case IPPROTO_ICMPV6:
-			*l4proto = hdr;
-			hdr_length = sizeof(struct icmp6hdr);
-			// Assume ICMPV6 as a level 4 protocol.
-			ret = bpf_skb_load_bytes(skb, offset, icmp6h,
-						 hdr_length);
-			if (ret) {
-				bpf_printk("not a valid IPv6 packet");
-				return -EFAULT;
-			}
-			return 0;
-
-		case IPPROTO_HOPOPTS:
-		case IPPROTO_ROUTING:
-			ret = bpf_skb_load_bytes(skb, offset + 1, &hdr_length,
-						 sizeof(hdr_length));
-			if (ret) {
-				bpf_printk("not a valid IPv6 packet");
-				return -EFAULT;
-			}
-
-special_n1:
-			ret = bpf_skb_load_bytes(skb, offset, &nexthdr,
-						 sizeof(nexthdr));
-			if (ret) {
-				bpf_printk("not a valid IPv6 packet");
-				return -EFAULT;
-			}
-			break;
-		case IPPROTO_FRAGMENT:
-			hdr_length = 4;
-			goto special_n1;
-		case IPPROTO_TCP:
-		case IPPROTO_UDP:
-			*l4proto = hdr;
-			if (hdr == IPPROTO_TCP) {
-				// Upper layer;
-				ret = bpf_skb_load_bytes(skb, offset, tcph,
-							 sizeof(struct tcphdr));
-				if (ret) {
-					bpf_printk("not a valid IPv6 packet");
-					return -EFAULT;
-				}
-			} else if (hdr == IPPROTO_UDP) {
-				// Upper layer;
-				ret = bpf_skb_load_bytes(skb, offset, udph,
-							 sizeof(struct udphdr));
-				if (ret) {
-					bpf_printk("not a valid IPv6 packet");
-					return -EFAULT;
-				}
-			} else {
-				// Unknown hdr.
-				bpf_printk("Unexpected hdr.");
-				return 1;
-			}
-			return 0;
-		default:
-			/// EXPECTED: Maybe ICMP, etc.
-			// bpf_printk("IPv6 but unrecognized extension protocol: %u", hdr);
-			return 1;
-		}
-	}
-	bpf_printk("exceeds IPV6_MAX_EXTENSIONS limit");
-	return 1;
-}
-
-static __always_inline int
-parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
-		struct ethhdr *ethh, struct iphdr *iph, struct ipv6hdr *ipv6h,
-		struct icmp6hdr *icmp6h, struct tcphdr *tcph,
-		struct udphdr *udph, __u8 *ihl, __u16 *l3proto, __u8 *l4proto)
-{
-	__u32 offset = 0;
-	int ret;
-
-	if (link_h_len == ETH_HLEN) {
-		ret = bpf_skb_load_bytes(skb, offset, ethh,
-					 sizeof(struct ethhdr));
-		if (ret) {
-			bpf_printk("not ethernet packet");
-			return 1;
-		}
-		// Skip ethhdr for next hdr.
-		offset += sizeof(struct ethhdr);
-	} else {
-		__builtin_memset(ethh, 0, sizeof(struct ethhdr));
-		ethh->h_proto = skb->protocol;
-	}
-	*l3proto = ethh->h_proto;
-
-	*ihl = 0;
-	*l4proto = 0;
-	__builtin_memset(iph, 0, sizeof(struct iphdr));
-	__builtin_memset(ipv6h, 0, sizeof(struct ipv6hdr));
-	__builtin_memset(icmp6h, 0, sizeof(struct icmp6hdr));
-	__builtin_memset(tcph, 0, sizeof(struct tcphdr));
-	__builtin_memset(udph, 0, sizeof(struct udphdr));
-
-	// bpf_printk("parse_transport: h_proto: %u ? %u %u", ethh->h_proto,
-	//						bpf_htons(ETH_P_IP),
-	// bpf_htons(ETH_P_IPV6));
-	if (ethh->h_proto == bpf_htons(ETH_P_IP)) {
-		ret = bpf_skb_load_bytes(skb, offset, iph,
-					 sizeof(struct iphdr));
-		if (ret)
-			return -EFAULT;
-		// Skip ipv4hdr and options for next hdr.
-		offset += iph->ihl * 4;
-
-		// We only process TCP and UDP traffic.
-		*l4proto = iph->protocol;
-		switch (iph->protocol) {
-		case IPPROTO_TCP: {
-			ret = bpf_skb_load_bytes(skb, offset, tcph,
-						 sizeof(struct tcphdr));
-			if (ret) {
-				// Not a complete tcphdr.
-				return -EFAULT;
-			}
-		} break;
-		case IPPROTO_UDP: {
-			ret = bpf_skb_load_bytes(skb, offset, udph,
-						 sizeof(struct udphdr));
-			if (ret) {
-				// Not a complete udphdr.
-				return -EFAULT;
-			}
-		} break;
-		default:
-			return 1;
-		}
-		*ihl = iph->ihl;
-		return 0;
-	} else if (ethh->h_proto == bpf_htons(ETH_P_IPV6)) {
-		ret = bpf_skb_load_bytes(skb, offset, ipv6h,
-					 sizeof(struct ipv6hdr));
-		if (ret) {
-			bpf_printk("not a valid IPv6 packet");
-			return -EFAULT;
-		}
-
-		offset += sizeof(struct ipv6hdr);
-
-		return handle_ipv6_extensions(skb, offset, ipv6h->nexthdr,
-					      icmp6h, tcph, udph, ihl, l4proto);
-	} else {
-		/// EXPECTED: Maybe ICMP, MPLS, etc.
-		// bpf_printk("IP but not supported packet: protocol is %u",
-		// iph->protocol);
-		// bpf_printk("unknown link proto: %u", bpf_ntohl(skb->protocol));
-		return 1;
-	}
-}
-
 // Do not use __always_inline here because this function is too heavy.
 // low -> high: outbound(8b) mark(32b) unused(23b) sign(1b)
 static __s64 __attribute__((noinline))
@@ -943,54 +765,6 @@ static __always_inline int assign_listener(struct __sk_buff *skb, __u8 l4proto)
 	return ret;
 }
 
-static __always_inline int
-redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len,
-			  struct tuples *tuples,
-			  struct ethhdr *ethh, struct tcphdr *tcph,
-			  __u8 from_wan, __u16 l3proto, __u8 l4proto)
-{
-	/* Redirect from L3 dev to L2 dev, e.g. wg0 -> veth */
-	if (!link_h_len) {
-		bpf_skb_change_head(skb, sizeof(struct ethhdr), 0);
-		bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_proto),
-				    &l3proto, sizeof(l3proto), 0);
-	}
-
-	struct redirect_tuple redirect_tuple = {};
-
-	if (l3proto == bpf_htons(ETH_P_IP)) {
-		redirect_tuple.sip.u6_addr32[3] = tuples->five.sip.u6_addr32[3];
-		redirect_tuple.dip.u6_addr32[3] = tuples->five.dip.u6_addr32[3];
-	} else {
-		__builtin_memcpy(&redirect_tuple.sip, &tuples->five.sip,
-				 IPV6_BYTE_LENGTH);
-		__builtin_memcpy(&redirect_tuple.dip, &tuples->five.dip,
-				 IPV6_BYTE_LENGTH);
-	}
-	redirect_tuple.l4proto = l4proto;
-	struct redirect_entry redirect_entry = {};
-
-	redirect_entry.ifindex = skb->ifindex;
-	redirect_entry.from_wan = from_wan;
-	__builtin_memcpy(redirect_entry.smac, ethh->h_source,
-			 sizeof(ethh->h_source));
-	__builtin_memcpy(redirect_entry.dmac, ethh->h_dest,
-			 sizeof(ethh->h_dest));
-	bpf_map_update_elem(&redirect_track, &redirect_tuple, &redirect_entry,
-			    BPF_ANY);
-
-	struct redirect_meta *meta = (void *)(long)skb->data;
-	if ((void *)(meta + 1) > (void *)(long)skb->data_end)
-		return TC_ACT_SHOT;
-
-	__builtin_memset(meta, 0, sizeof(*meta));
-	meta->mark = TPROXY_MARK;
-	if ((l4proto == IPPROTO_TCP && tcph->syn) || l4proto == IPPROTO_UDP)
-		meta->l4proto = l4proto;
-
-	return bpf_redirect(PARAM.dae0_ifindex, 0);
-}
-
 SEC("tc/egress")
 int tproxy_lan_egress(struct __sk_buff *skb)
 {

From 30558947b39a6c893770d6bfbccdfc4262d0bd17 Mon Sep 17 00:00:00 2001
From: Gray Liang <gray.liang@isovalent.com>
Date: Thu, 11 Apr 2024 01:55:20 +0800
Subject: [PATCH 06/11] bpf: Rename lib functions with prefix "skb_"

To distinguish them from the XDP lib functions.
---
 control/kern/lib/skb.h | 26 +++++++++++++-------------
 control/kern/tproxy.c  | 32 ++++++++++++++++----------------
 2 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/control/kern/lib/skb.h b/control/kern/lib/skb.h
index 2e9a497be..78529735a 100644
--- a/control/kern/lib/skb.h
+++ b/control/kern/lib/skb.h
@@ -1,7 +1,7 @@
 static __always_inline int
-handle_ipv6_extensions(const struct __sk_buff *skb, __u32 offset, __u32 hdr,
-		       struct icmp6hdr *icmp6h, struct tcphdr *tcph,
-		       struct udphdr *udph, __u8 *ihl, __u8 *l4proto)
+skb_handle_ipv6_extensions(const struct __sk_buff *skb, __u32 offset, __u32 hdr,
+			   struct icmp6hdr *icmp6h, struct tcphdr *tcph,
+			   struct udphdr *udph, __u8 *ihl, __u8 *l4proto)
 {
 	__u8 hdr_length = 0;
 	__u8 nexthdr = 0;
@@ -90,10 +90,10 @@ handle_ipv6_extensions(const struct __sk_buff *skb, __u32 offset, __u32 hdr,
 }
 
 static __always_inline int
-parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
-		struct ethhdr *ethh, struct iphdr *iph, struct ipv6hdr *ipv6h,
-		struct icmp6hdr *icmp6h, struct tcphdr *tcph,
-		struct udphdr *udph, __u8 *ihl, __u16 *l3proto, __u8 *l4proto)
+skb_parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
+		    struct ethhdr *ethh, struct iphdr *iph, struct ipv6hdr *ipv6h,
+		    struct icmp6hdr *icmp6h, struct tcphdr *tcph,
+		    struct udphdr *udph, __u8 *ihl, __u16 *l3proto, __u8 *l4proto)
 {
 	__u32 offset = 0;
 	int ret;
@@ -166,8 +166,8 @@ parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
 
 		offset += sizeof(struct ipv6hdr);
 
-		return handle_ipv6_extensions(skb, offset, ipv6h->nexthdr,
-					      icmp6h, tcph, udph, ihl, l4proto);
+		return skb_handle_ipv6_extensions(skb, offset, ipv6h->nexthdr,
+						  icmp6h, tcph, udph, ihl, l4proto);
 	} else {
 		/// EXPECTED: Maybe ICMP, MPLS, etc.
 		// bpf_printk("IP but not supported packet: protocol is %u",
@@ -178,10 +178,10 @@ parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
 }
 
 static __always_inline int
-redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len,
-			  struct tuples *tuples,
-			  struct ethhdr *ethh, struct tcphdr *tcph,
-			  __u8 from_wan, __u16 l3proto, __u8 l4proto)
+skb_redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len,
+			      struct tuples *tuples,
+			      struct ethhdr *ethh, struct tcphdr *tcph,
+			      __u8 from_wan, __u16 l3proto, __u8 l4proto)
 {
 	/* Redirect from L3 dev to L2 dev, e.g. wg0 -> veth */
 	if (!link_h_len) {
diff --git a/control/kern/tproxy.c b/control/kern/tproxy.c
index bbfdeb4ef..bdb2486ef 100644
--- a/control/kern/tproxy.c
+++ b/control/kern/tproxy.c
@@ -784,10 +784,10 @@ int tproxy_lan_egress(struct __sk_buff *skb)
 
 	if (get_link_h_len(skb->ifindex, &link_h_len))
 		return TC_ACT_OK;
-	int ret = parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
-				  &tcph, &udph, &ihl, &l3proto, &l4proto);
+	int ret = skb_parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
+				      &tcph, &udph, &ihl, &l3proto, &l4proto);
 	if (ret) {
-		bpf_printk("parse_transport: %d", ret);
+		bpf_printk("skb_parse_transport: %d", ret);
 		return TC_ACT_OK;
 	}
 	if (l4proto == IPPROTO_ICMPV6 && icmp6h.icmp6_type == NDP_REDIRECT) {
@@ -813,10 +813,10 @@ int tproxy_lan_ingress(struct __sk_buff *skb)
 
 	if (get_link_h_len(skb->ifindex, &link_h_len))
 		return TC_ACT_OK;
-	int ret = parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
-				  &tcph, &udph, &ihl, &l3proto, &l4proto);
+	int ret = skb_parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
+				      &tcph, &udph, &ihl, &l3proto, &l4proto);
 	if (ret) {
-		bpf_printk("parse_transport: %d", ret);
+		bpf_printk("skb_parse_transport: %d", ret);
 		return TC_ACT_OK;
 	}
 	if (l4proto == IPPROTO_ICMPV6)
@@ -974,8 +974,8 @@ int tproxy_lan_ingress(struct __sk_buff *skb)
 
 	// Assign to control plane.
 control_plane:
-	return redirect_to_control_plane(skb, link_h_len, &tuples, &ethh, &tcph,
-					 0, l3proto, l4proto);
+	return skb_redirect_to_control_plane(skb, link_h_len, &tuples, &ethh, &tcph,
+					     0, l3proto, l4proto);
 
 direct:
 	return TC_ACT_OK;
@@ -1102,8 +1102,8 @@ int tproxy_wan_ingress(struct __sk_buff *skb)
 
 	if (get_link_h_len(skb->ifindex, &link_h_len))
 		return TC_ACT_OK;
-	int ret = parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
-				  &tcph, &udph, &ihl, &l3proto, &l4proto);
+	int ret = skb_parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
+				      &tcph, &udph, &ihl, &l3proto, &l4proto);
 	if (ret)
 		return TC_ACT_OK;
 	if (l4proto != IPPROTO_UDP)
@@ -1147,8 +1147,8 @@ int tproxy_wan_egress(struct __sk_buff *skb)
 	if (get_link_h_len(skb->ifindex, &link_h_len))
 		return TC_ACT_OK;
 	bool tcp_state_syn;
-	int ret = parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
-				  &tcph, &udph, &ihl, &l3proto, &l4proto);
+	int ret = skb_parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
+				      &tcph, &udph, &ihl, &l3proto, &l4proto);
 	if (ret)
 		return TC_ACT_OK;
 	if (l4proto == IPPROTO_ICMPV6)
@@ -1382,8 +1382,8 @@ int tproxy_wan_egress(struct __sk_buff *skb)
 		}
 	}
 
-	return redirect_to_control_plane(skb, link_h_len, &tuples, &ethh, &tcph,
-					 1, l3proto, l4proto);
+	return skb_redirect_to_control_plane(skb, link_h_len, &tuples, &ethh, &tcph,
+					     1, l3proto, l4proto);
 }
 
 SEC("tc/dae0peer_ingress")
@@ -1430,8 +1430,8 @@ int tproxy_dae0_ingress(struct __sk_buff *skb)
 	__u8 l4proto;
 	__u32 link_h_len = 14;
 
-	if (parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
-			    &tcph, &udph, &ihl, &l3proto, &l4proto))
+	if (skb_parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
+				&tcph, &udph, &ihl, &l3proto, &l4proto))
 		return TC_ACT_OK;
 	struct tuples tuples;
 

From d499e6de3d5e7458c9af25ede83f57178fa65a1c Mon Sep 17 00:00:00 2001
From: Gray Liang <gray.liang@isovalent.com>
Date: Fri, 12 Apr 2024 19:02:09 +0800
Subject: [PATCH 07/11] bpf: Copy lib/skb.h to lib/xdp.h

Just copy without a single character change.
---
 control/kern/lib/xdp.h | 226 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 226 insertions(+)
 create mode 100644 control/kern/lib/xdp.h

diff --git a/control/kern/lib/xdp.h b/control/kern/lib/xdp.h
new file mode 100644
index 000000000..78529735a
--- /dev/null
+++ b/control/kern/lib/xdp.h
@@ -0,0 +1,226 @@
+static __always_inline int
+skb_handle_ipv6_extensions(const struct __sk_buff *skb, __u32 offset, __u32 hdr,
+			   struct icmp6hdr *icmp6h, struct tcphdr *tcph,
+			   struct udphdr *udph, __u8 *ihl, __u8 *l4proto)
+{
+	__u8 hdr_length = 0;
+	__u8 nexthdr = 0;
+	*ihl = sizeof(struct ipv6hdr) / 4;
+	int ret;
+	// We only process TCP and UDP traffic.
+
+	// Unroll can give less instructions but more memory consumption when loading.
+	// We disable it here to support more poor memory devices.
+	// #pragma unroll
+	for (int i = 0; i < IPV6_MAX_EXTENSIONS;
+	     i++, offset += hdr_length, hdr = nexthdr, *ihl += hdr_length / 4) {
+		if (hdr_length % 4) {
+			bpf_printk(
+				"IPv6 extension length is not multiples of 4");
+			return 1;
+		}
+		// See control/control_plane.go.
+
+		switch (hdr) {
+		case IPPROTO_ICMPV6:
+			*l4proto = hdr;
+			hdr_length = sizeof(struct icmp6hdr);
+			// Assume ICMPV6 as a level 4 protocol.
+			ret = bpf_skb_load_bytes(skb, offset, icmp6h,
+						 hdr_length);
+			if (ret) {
+				bpf_printk("not a valid IPv6 packet");
+				return -EFAULT;
+			}
+			return 0;
+
+		case IPPROTO_HOPOPTS:
+		case IPPROTO_ROUTING:
+			ret = bpf_skb_load_bytes(skb, offset + 1, &hdr_length,
+						 sizeof(hdr_length));
+			if (ret) {
+				bpf_printk("not a valid IPv6 packet");
+				return -EFAULT;
+			}
+
+special_n1:
+			ret = bpf_skb_load_bytes(skb, offset, &nexthdr,
+						 sizeof(nexthdr));
+			if (ret) {
+				bpf_printk("not a valid IPv6 packet");
+				return -EFAULT;
+			}
+			break;
+		case IPPROTO_FRAGMENT:
+			hdr_length = 4;
+			goto special_n1;
+		case IPPROTO_TCP:
+		case IPPROTO_UDP:
+			*l4proto = hdr;
+			if (hdr == IPPROTO_TCP) {
+				// Upper layer;
+				ret = bpf_skb_load_bytes(skb, offset, tcph,
+							 sizeof(struct tcphdr));
+				if (ret) {
+					bpf_printk("not a valid IPv6 packet");
+					return -EFAULT;
+				}
+			} else if (hdr == IPPROTO_UDP) {
+				// Upper layer;
+				ret = bpf_skb_load_bytes(skb, offset, udph,
+							 sizeof(struct udphdr));
+				if (ret) {
+					bpf_printk("not a valid IPv6 packet");
+					return -EFAULT;
+				}
+			} else {
+				// Unknown hdr.
+				bpf_printk("Unexpected hdr.");
+				return 1;
+			}
+			return 0;
+		default:
+			/// EXPECTED: Maybe ICMP, etc.
+			// bpf_printk("IPv6 but unrecognized extension protocol: %u", hdr);
+			return 1;
+		}
+	}
+	bpf_printk("exceeds IPV6_MAX_EXTENSIONS limit");
+	return 1;
+}
+
+static __always_inline int
+skb_parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
+		    struct ethhdr *ethh, struct iphdr *iph, struct ipv6hdr *ipv6h,
+		    struct icmp6hdr *icmp6h, struct tcphdr *tcph,
+		    struct udphdr *udph, __u8 *ihl, __u16 *l3proto, __u8 *l4proto)
+{
+	__u32 offset = 0;
+	int ret;
+
+	if (link_h_len == ETH_HLEN) {
+		ret = bpf_skb_load_bytes(skb, offset, ethh,
+					 sizeof(struct ethhdr));
+		if (ret) {
+			bpf_printk("not ethernet packet");
+			return 1;
+		}
+		// Skip ethhdr for next hdr.
+		offset += sizeof(struct ethhdr);
+	} else {
+		__builtin_memset(ethh, 0, sizeof(struct ethhdr));
+		ethh->h_proto = skb->protocol;
+	}
+	*l3proto = ethh->h_proto;
+
+	*ihl = 0;
+	*l4proto = 0;
+	__builtin_memset(iph, 0, sizeof(struct iphdr));
+	__builtin_memset(ipv6h, 0, sizeof(struct ipv6hdr));
+	__builtin_memset(icmp6h, 0, sizeof(struct icmp6hdr));
+	__builtin_memset(tcph, 0, sizeof(struct tcphdr));
+	__builtin_memset(udph, 0, sizeof(struct udphdr));
+
+	// bpf_printk("parse_transport: h_proto: %u ? %u %u", ethh->h_proto,
+	//						bpf_htons(ETH_P_IP),
+	// bpf_htons(ETH_P_IPV6));
+	if (ethh->h_proto == bpf_htons(ETH_P_IP)) {
+		ret = bpf_skb_load_bytes(skb, offset, iph,
+					 sizeof(struct iphdr));
+		if (ret)
+			return -EFAULT;
+		// Skip ipv4hdr and options for next hdr.
+		offset += iph->ihl * 4;
+
+		// We only process TCP and UDP traffic.
+		*l4proto = iph->protocol;
+		switch (iph->protocol) {
+		case IPPROTO_TCP: {
+			ret = bpf_skb_load_bytes(skb, offset, tcph,
+						 sizeof(struct tcphdr));
+			if (ret) {
+				// Not a complete tcphdr.
+				return -EFAULT;
+			}
+		} break;
+		case IPPROTO_UDP: {
+			ret = bpf_skb_load_bytes(skb, offset, udph,
+						 sizeof(struct udphdr));
+			if (ret) {
+				// Not a complete udphdr.
+				return -EFAULT;
+			}
+		} break;
+		default:
+			return 1;
+		}
+		*ihl = iph->ihl;
+		return 0;
+	} else if (ethh->h_proto == bpf_htons(ETH_P_IPV6)) {
+		ret = bpf_skb_load_bytes(skb, offset, ipv6h,
+					 sizeof(struct ipv6hdr));
+		if (ret) {
+			bpf_printk("not a valid IPv6 packet");
+			return -EFAULT;
+		}
+
+		offset += sizeof(struct ipv6hdr);
+
+		return skb_handle_ipv6_extensions(skb, offset, ipv6h->nexthdr,
+						  icmp6h, tcph, udph, ihl, l4proto);
+	} else {
+		/// EXPECTED: Maybe ICMP, MPLS, etc.
+		// bpf_printk("IP but not supported packet: protocol is %u",
+		// iph->protocol);
+		// bpf_printk("unknown link proto: %u", bpf_ntohl(skb->protocol));
+		return 1;
+	}
+}
+
+static __always_inline int
+skb_redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len,
+			      struct tuples *tuples,
+			      struct ethhdr *ethh, struct tcphdr *tcph,
+			      __u8 from_wan, __u16 l3proto, __u8 l4proto)
+{
+	/* Redirect from L3 dev to L2 dev, e.g. wg0 -> veth */
+	if (!link_h_len) {
+		bpf_skb_change_head(skb, sizeof(struct ethhdr), 0);
+		bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_proto),
+				    &l3proto, sizeof(l3proto), 0);
+	}
+
+	struct redirect_tuple redirect_tuple = {};
+
+	if (l3proto == bpf_htons(ETH_P_IP)) {
+		redirect_tuple.sip.u6_addr32[3] = tuples->five.sip.u6_addr32[3];
+		redirect_tuple.dip.u6_addr32[3] = tuples->five.dip.u6_addr32[3];
+	} else {
+		__builtin_memcpy(&redirect_tuple.sip, &tuples->five.sip,
+				 IPV6_BYTE_LENGTH);
+		__builtin_memcpy(&redirect_tuple.dip, &tuples->five.dip,
+				 IPV6_BYTE_LENGTH);
+	}
+	redirect_tuple.l4proto = l4proto;
+	struct redirect_entry redirect_entry = {};
+
+	redirect_entry.ifindex = skb->ifindex;
+	redirect_entry.from_wan = from_wan;
+	__builtin_memcpy(redirect_entry.smac, ethh->h_source,
+			 sizeof(ethh->h_source));
+	__builtin_memcpy(redirect_entry.dmac, ethh->h_dest,
+			 sizeof(ethh->h_dest));
+	bpf_map_update_elem(&redirect_track, &redirect_tuple, &redirect_entry,
+			    BPF_ANY);
+
+	struct redirect_meta *meta = (void *)(long)skb->data;
+	if ((void *)(meta + 1) > (void *)(long)skb->data_end)
+		return TC_ACT_SHOT;
+
+	__builtin_memset(meta, 0, sizeof(*meta));
+	meta->mark = TPROXY_MARK;
+	if ((l4proto == IPPROTO_TCP && tcph->syn) || l4proto == IPPROTO_UDP)
+		meta->l4proto = l4proto;
+
+	return bpf_redirect(PARAM.dae0_ifindex, 0);
+}

From 3d7e6a28defda3c86fa4b61beade362d540e6c49 Mon Sep 17 00:00:00 2001
From: Gray Liang <gray.liang@isovalent.com>
Date: Fri, 12 Apr 2024 19:09:34 +0800
Subject: [PATCH 08/11] bpf: Finalize xdp.h

1. XDP is never attached to an L3 netdev, so a zero link-header length no longer needs handling.
2. Use the XDP helpers instead of the skb helpers.
---
 control/kern/lib/xdp.h | 62 +++++++++++++++++-------------------------
 control/kern/tproxy.c  |  1 +
 2 files changed, 26 insertions(+), 37 deletions(-)

diff --git a/control/kern/lib/xdp.h b/control/kern/lib/xdp.h
index 78529735a..85b35b40b 100644
--- a/control/kern/lib/xdp.h
+++ b/control/kern/lib/xdp.h
@@ -1,5 +1,5 @@
 static __always_inline int
-skb_handle_ipv6_extensions(const struct __sk_buff *skb, __u32 offset, __u32 hdr,
+xdp_handle_ipv6_extensions(struct xdp_md *ctx, __u32 offset, __u32 hdr,
 			   struct icmp6hdr *icmp6h, struct tcphdr *tcph,
 			   struct udphdr *udph, __u8 *ihl, __u8 *l4proto)
 {
@@ -26,7 +26,7 @@ skb_handle_ipv6_extensions(const struct __sk_buff *skb, __u32 offset, __u32 hdr,
 			*l4proto = hdr;
 			hdr_length = sizeof(struct icmp6hdr);
 			// Assume ICMPV6 as a level 4 protocol.
-			ret = bpf_skb_load_bytes(skb, offset, icmp6h,
+			ret = bpf_xdp_load_bytes(ctx, offset, icmp6h,
 						 hdr_length);
 			if (ret) {
 				bpf_printk("not a valid IPv6 packet");
@@ -36,7 +36,7 @@ skb_handle_ipv6_extensions(const struct __sk_buff *skb, __u32 offset, __u32 hdr,
 
 		case IPPROTO_HOPOPTS:
 		case IPPROTO_ROUTING:
-			ret = bpf_skb_load_bytes(skb, offset + 1, &hdr_length,
+			ret = bpf_xdp_load_bytes(ctx, offset + 1, &hdr_length,
 						 sizeof(hdr_length));
 			if (ret) {
 				bpf_printk("not a valid IPv6 packet");
@@ -44,7 +44,7 @@ skb_handle_ipv6_extensions(const struct __sk_buff *skb, __u32 offset, __u32 hdr,
 			}
 
 special_n1:
-			ret = bpf_skb_load_bytes(skb, offset, &nexthdr,
+			ret = bpf_xdp_load_bytes(ctx, offset, &nexthdr,
 						 sizeof(nexthdr));
 			if (ret) {
 				bpf_printk("not a valid IPv6 packet");
@@ -59,7 +59,7 @@ skb_handle_ipv6_extensions(const struct __sk_buff *skb, __u32 offset, __u32 hdr,
 			*l4proto = hdr;
 			if (hdr == IPPROTO_TCP) {
 				// Upper layer;
-				ret = bpf_skb_load_bytes(skb, offset, tcph,
+				ret = bpf_xdp_load_bytes(ctx, offset, tcph,
 							 sizeof(struct tcphdr));
 				if (ret) {
 					bpf_printk("not a valid IPv6 packet");
@@ -67,7 +67,7 @@ skb_handle_ipv6_extensions(const struct __sk_buff *skb, __u32 offset, __u32 hdr,
 				}
 			} else if (hdr == IPPROTO_UDP) {
 				// Upper layer;
-				ret = bpf_skb_load_bytes(skb, offset, udph,
+				ret = bpf_xdp_load_bytes(ctx, offset, udph,
 							 sizeof(struct udphdr));
 				if (ret) {
 					bpf_printk("not a valid IPv6 packet");
@@ -90,7 +90,7 @@ skb_handle_ipv6_extensions(const struct __sk_buff *skb, __u32 offset, __u32 hdr,
 }
 
 static __always_inline int
-skb_parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
+xdp_parse_transport(struct xdp_md *ctx,
 		    struct ethhdr *ethh, struct iphdr *iph, struct ipv6hdr *ipv6h,
 		    struct icmp6hdr *icmp6h, struct tcphdr *tcph,
 		    struct udphdr *udph, __u8 *ihl, __u16 *l3proto, __u8 *l4proto)
@@ -98,19 +98,14 @@ skb_parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
 	__u32 offset = 0;
 	int ret;
 
-	if (link_h_len == ETH_HLEN) {
-		ret = bpf_skb_load_bytes(skb, offset, ethh,
-					 sizeof(struct ethhdr));
-		if (ret) {
-			bpf_printk("not ethernet packet");
-			return 1;
-		}
-		// Skip ethhdr for next hdr.
-		offset += sizeof(struct ethhdr);
-	} else {
-		__builtin_memset(ethh, 0, sizeof(struct ethhdr));
-		ethh->h_proto = skb->protocol;
+	ret = bpf_xdp_load_bytes(ctx, offset, ethh,
+				 sizeof(struct ethhdr));
+	if (ret) {
+		bpf_printk("not ethernet packet");
+		return 1;
 	}
+	// Skip ethhdr for next hdr.
+	offset += sizeof(struct ethhdr);
 	*l3proto = ethh->h_proto;
 
 	*ihl = 0;
@@ -125,7 +120,7 @@ skb_parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
 	//						bpf_htons(ETH_P_IP),
 	// bpf_htons(ETH_P_IPV6));
 	if (ethh->h_proto == bpf_htons(ETH_P_IP)) {
-		ret = bpf_skb_load_bytes(skb, offset, iph,
+		ret = bpf_xdp_load_bytes(ctx, offset, iph,
 					 sizeof(struct iphdr));
 		if (ret)
 			return -EFAULT;
@@ -136,7 +131,7 @@ skb_parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
 		*l4proto = iph->protocol;
 		switch (iph->protocol) {
 		case IPPROTO_TCP: {
-			ret = bpf_skb_load_bytes(skb, offset, tcph,
+			ret = bpf_xdp_load_bytes(ctx, offset, tcph,
 						 sizeof(struct tcphdr));
 			if (ret) {
 				// Not a complete tcphdr.
@@ -144,7 +139,7 @@ skb_parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
 			}
 		} break;
 		case IPPROTO_UDP: {
-			ret = bpf_skb_load_bytes(skb, offset, udph,
+			ret = bpf_xdp_load_bytes(ctx, offset, udph,
 						 sizeof(struct udphdr));
 			if (ret) {
 				// Not a complete udphdr.
@@ -157,7 +152,7 @@ skb_parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
 		*ihl = iph->ihl;
 		return 0;
 	} else if (ethh->h_proto == bpf_htons(ETH_P_IPV6)) {
-		ret = bpf_skb_load_bytes(skb, offset, ipv6h,
+		ret = bpf_xdp_load_bytes(ctx, offset, ipv6h,
 					 sizeof(struct ipv6hdr));
 		if (ret) {
 			bpf_printk("not a valid IPv6 packet");
@@ -166,30 +161,23 @@ skb_parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
 
 		offset += sizeof(struct ipv6hdr);
 
-		return skb_handle_ipv6_extensions(skb, offset, ipv6h->nexthdr,
+		return xdp_handle_ipv6_extensions(ctx, offset, ipv6h->nexthdr,
 						  icmp6h, tcph, udph, ihl, l4proto);
 	} else {
 		/// EXPECTED: Maybe ICMP, MPLS, etc.
 		// bpf_printk("IP but not supported packet: protocol is %u",
 		// iph->protocol);
-		// bpf_printk("unknown link proto: %u", bpf_ntohl(skb->protocol));
+		// bpf_printk("unknown link proto: %u", bpf_ntohs(ethh->h_proto));
 		return 1;
 	}
 }
 
 static __always_inline int
-skb_redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len,
+xdp_redirect_to_control_plane(struct xdp_md *ctx,
 			      struct tuples *tuples,
 			      struct ethhdr *ethh, struct tcphdr *tcph,
 			      __u8 from_wan, __u16 l3proto, __u8 l4proto)
 {
-	/* Redirect from L3 dev to L2 dev, e.g. wg0 -> veth */
-	if (!link_h_len) {
-		bpf_skb_change_head(skb, sizeof(struct ethhdr), 0);
-		bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_proto),
-				    &l3proto, sizeof(l3proto), 0);
-	}
-
 	struct redirect_tuple redirect_tuple = {};
 
 	if (l3proto == bpf_htons(ETH_P_IP)) {
@@ -204,7 +192,7 @@ skb_redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len,
 	redirect_tuple.l4proto = l4proto;
 	struct redirect_entry redirect_entry = {};
 
-	redirect_entry.ifindex = skb->ifindex;
+	redirect_entry.ifindex = ctx->ingress_ifindex;
 	redirect_entry.from_wan = from_wan;
 	__builtin_memcpy(redirect_entry.smac, ethh->h_source,
 			 sizeof(ethh->h_source));
@@ -213,9 +201,9 @@ skb_redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len,
 	bpf_map_update_elem(&redirect_track, &redirect_tuple, &redirect_entry,
 			    BPF_ANY);
 
-	struct redirect_meta *meta = (void *)(long)skb->data;
-	if ((void *)(meta + 1) > (void *)(long)skb->data_end)
-		return TC_ACT_SHOT;
+	struct redirect_meta *meta = (void *)(long)ctx->data;
+	if ((void *)(meta + 1) > (void *)(long)ctx->data_end)
+		return XDP_DROP;
 
 	__builtin_memset(meta, 0, sizeof(*meta));
 	meta->mark = TPROXY_MARK;
diff --git a/control/kern/tproxy.c b/control/kern/tproxy.c
index bdb2486ef..5ae759d75 100644
--- a/control/kern/tproxy.c
+++ b/control/kern/tproxy.c
@@ -408,6 +408,7 @@ struct {
 
 // Functions:
 #include "lib/skb.h"
+#include "lib/xdp.h"
 
 static __always_inline __u8 ipv4_get_dscp(const struct iphdr *iph)
 {

From 939b6fae5fb2f3999a42333b6ee05591ffa4829d Mon Sep 17 00:00:00 2001
From: Gray Liang <gray.liang@isovalent.com>
Date: Thu, 11 Apr 2024 02:16:23 +0800
Subject: [PATCH 09/11] bpf: Copy tc/ingress to xdp/ingress

No adjustment.
---
 control/control_plane_core.go |   2 +-
 control/kern/tproxy.c         | 189 +++++++++++++++++++++++++++++++++-
 2 files changed, 189 insertions(+), 2 deletions(-)

diff --git a/control/control_plane_core.go b/control/control_plane_core.go
index ab84170c4..b59fbff5d 100644
--- a/control/control_plane_core.go
+++ b/control/control_plane_core.go
@@ -318,7 +318,7 @@ func (c *controlPlaneCore) _bindLan(ifname string) error {
 			// Priority should be behind of WAN's
 			Priority: 2,
 		},
-		Fd:           c.bpf.bpfPrograms.TproxyLanIngress.FD(),
+		Fd:           c.bpf.bpfPrograms.TcTproxyLanIngress.FD(),
 		Name:         consts.AppName + "_lan_ingress",
 		DirectAction: true,
 	}
diff --git a/control/kern/tproxy.c b/control/kern/tproxy.c
index 5ae759d75..053447214 100644
--- a/control/kern/tproxy.c
+++ b/control/kern/tproxy.c
@@ -799,7 +799,194 @@ int tproxy_lan_egress(struct __sk_buff *skb)
 }
 
 SEC("tc/ingress")
-int tproxy_lan_ingress(struct __sk_buff *skb)
+int tc_tproxy_lan_ingress(struct __sk_buff *skb)
+{
+	struct ethhdr ethh;
+	struct iphdr iph;
+	struct ipv6hdr ipv6h;
+	struct icmp6hdr icmp6h;
+	struct tcphdr tcph;
+	struct udphdr udph;
+	__u8 ihl;
+	__u16 l3proto;
+	__u8 l4proto;
+	__u32 link_h_len;
+
+	if (get_link_h_len(skb->ifindex, &link_h_len))
+		return TC_ACT_OK;
+	int ret = skb_parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
+				      &tcph, &udph, &ihl, &l3proto, &l4proto);
+	if (ret) {
+		bpf_printk("skb_parse_transport: %d", ret);
+		return TC_ACT_OK;
+	}
+	if (l4proto == IPPROTO_ICMPV6)
+		return TC_ACT_OK;
+
+	// Prepare five tuples.
+	struct tuples tuples;
+
+	get_tuples(&tuples, &iph, &ipv6h, &tcph, &udph, l3proto, l4proto);
+
+	/*
+   * ip rule add fwmark 0x8000000/0x8000000 table 2023
+   * ip route add local default dev lo table 2023
+   * ip -6 rule add fwmark 0x8000000/0x8000000 table 2023
+   * ip -6 route add local default dev lo table 2023
+
+   * ip rule del fwmark 0x8000000/0x8000000 table 2023
+   * ip route del local default dev lo table 2023
+   * ip -6 rule del fwmark 0x8000000/0x8000000 table 2023
+   * ip -6 route del local default dev lo table 2023
+   */
+	// Socket lookup and assign skb to existing socket connection.
+	struct bpf_sock_tuple tuple = { 0 };
+	__u32 tuple_size;
+	struct bpf_sock *sk;
+	__u32 flag[8];
+	void *l4hdr;
+
+	if (skb->protocol == bpf_htons(ETH_P_IP)) {
+		tuple.ipv4.daddr = tuples.five.dip.u6_addr32[3];
+		tuple.ipv4.saddr = tuples.five.sip.u6_addr32[3];
+		tuple.ipv4.dport = tuples.five.dport;
+		tuple.ipv4.sport = tuples.five.sport;
+		tuple_size = sizeof(tuple.ipv4);
+	} else {
+		__builtin_memcpy(tuple.ipv6.daddr, &tuples.five.dip,
+				 IPV6_BYTE_LENGTH);
+		__builtin_memcpy(tuple.ipv6.saddr, &tuples.five.sip,
+				 IPV6_BYTE_LENGTH);
+		tuple.ipv6.dport = tuples.five.dport;
+		tuple.ipv6.sport = tuples.five.sport;
+		tuple_size = sizeof(tuple.ipv6);
+	}
+
+	if (l4proto == IPPROTO_TCP) {
+		// TCP.
+		if (tcph.syn && !tcph.ack)
+			goto new_connection;
+
+		sk = bpf_skc_lookup_tcp(skb, &tuple, tuple_size,
+					PARAM.dae_netns_id, 0);
+		if (sk) {
+			if (sk->state != BPF_TCP_LISTEN) {
+				bpf_sk_release(sk);
+				goto control_plane;
+			}
+			bpf_sk_release(sk);
+		}
+	}
+
+// Routing for new connection.
+new_connection:
+	__builtin_memset(flag, 0, sizeof(flag));
+	if (l4proto == IPPROTO_TCP) {
+		if (!(tcph.syn && !tcph.ack)) {
+			// Not a new TCP connection.
+			// Perhaps single-arm.
+			return TC_ACT_OK;
+		}
+		l4hdr = &tcph;
+		flag[0] = L4ProtoType_TCP;
+	} else {
+		l4hdr = &udph;
+		flag[0] = L4ProtoType_UDP;
+	}
+	if (skb->protocol == bpf_htons(ETH_P_IP))
+		flag[1] = IpVersionType_4;
+	else
+		flag[1] = IpVersionType_6;
+	flag[6] = tuples.dscp;
+	__be32 mac[4] = {
+		0,
+		0,
+		bpf_htonl((ethh.h_source[0] << 8) + (ethh.h_source[1])),
+		bpf_htonl((ethh.h_source[2] << 24) + (ethh.h_source[3] << 16) +
+			  (ethh.h_source[4] << 8) + (ethh.h_source[5])),
+	};
+	__s64 s64_ret;
+
+	s64_ret = route(flag, l4hdr, tuples.five.sip.u6_addr32,
+			tuples.five.dip.u6_addr32, mac);
+	if (s64_ret < 0) {
+		bpf_printk("shot routing: %d", s64_ret);
+		return TC_ACT_SHOT;
+	}
+	struct routing_result routing_result = { 0 };
+
+	routing_result.outbound = s64_ret;
+	routing_result.mark = s64_ret >> 8;
+	routing_result.must = (s64_ret >> 40) & 1;
+	routing_result.dscp = tuples.dscp;
+	__builtin_memcpy(routing_result.mac, ethh.h_source,
+			 sizeof(routing_result.mac));
+	/// NOTICE: No pid pname info for LAN packet.
+	// // Maybe this packet is also in the host (such as docker) ?
+	// // I tried and it is false.
+	//__u64 cookie = bpf_get_socket_cookie(skb);
+	//struct pid_pname *pid_pname =
+	//	bpf_map_lookup_elem(&cookie_pid_map, &cookie);
+	//if (pid_pname) {
+	//	__builtin_memcpy(routing_result.pname, pid_pname->pname,
+	//			 TASK_COMM_LEN);
+	//	routing_result.pid = pid_pname->pid;
+	//}
+
+	// Save routing result.
+	ret = bpf_map_update_elem(&routing_tuples_map, &tuples.five,
+				  &routing_result, BPF_ANY);
+	if (ret) {
+		bpf_printk("shot save routing result: %d", ret);
+		return TC_ACT_SHOT;
+	}
+#if defined(__DEBUG_ROUTING) || defined(__PRINT_ROUTING_RESULT)
+	if (l4proto == IPPROTO_TCP) {
+		bpf_printk("tcp(lan): outbound: %u, target: %pI6:%u", ret,
+			   tuples.five.dip.u6_addr32,
+			   bpf_ntohs(tuples.five.dport));
+	} else {
+		bpf_printk("udp(lan): outbound: %u, target: %pI6:%u",
+			   routing_result.outbound, tuples.five.dip.u6_addr32,
+			   bpf_ntohs(tuples.five.dport));
+	}
+#endif
+	if (routing_result.outbound == OUTBOUND_DIRECT) {
+		skb->mark = routing_result.mark;
+		goto direct;
+	} else if (unlikely(routing_result.outbound == OUTBOUND_BLOCK)) {
+		goto block;
+	}
+
+	// Check outbound connectivity in specific ipversion and l4proto.
+	struct outbound_connectivity_query q = { 0 };
+
+	q.outbound = routing_result.outbound;
+	q.ipversion = skb->protocol == bpf_htons(ETH_P_IP) ? 4 : 6;
+	q.l4proto = l4proto;
+	__u32 *alive;
+
+	alive = bpf_map_lookup_elem(&outbound_connectivity_map, &q);
+	if (alive && *alive == 0 &&
+	    !(l4proto == IPPROTO_UDP && tuples.five.dport == bpf_htons(53))) {
+		// Outbound is not alive. Dns is an exception.
+		goto block;
+	}
+
+	// Assign to control plane.
+control_plane:
+	return skb_redirect_to_control_plane(skb, link_h_len, &tuples, &ethh, &tcph,
+					     0, l3proto, l4proto);
+
+direct:
+	return TC_ACT_OK;
+
+block:
+	return TC_ACT_SHOT;
+}
+
+SEC("xdp/ingress")
+int xdp_tproxy_lan_ingress(struct __sk_buff *skb)
 {
 	struct ethhdr ethh;
 	struct iphdr iph;

From dadec6601327b7909fa4b381c5008794c89bc5ed Mon Sep 17 00:00:00 2001
From: Gray Liang <gray.liang@isovalent.com>
Date: Thu, 11 Apr 2024 02:21:25 +0800
Subject: [PATCH 10/11] bpf: Finalize xdp/ingress

---
 control/kern/tproxy.c | 40 ++++++++++++++++++----------------------
 1 file changed, 18 insertions(+), 22 deletions(-)

diff --git a/control/kern/tproxy.c b/control/kern/tproxy.c
index 053447214..142eee006 100644
--- a/control/kern/tproxy.c
+++ b/control/kern/tproxy.c
@@ -986,7 +986,7 @@ int tc_tproxy_lan_ingress(struct __sk_buff *skb)
 }
 
 SEC("xdp/ingress")
-int xdp_tproxy_lan_ingress(struct __sk_buff *skb)
+int xdp_tproxy_lan_ingress(struct xdp_md *ctx)
 {
 	struct ethhdr ethh;
 	struct iphdr iph;
@@ -997,18 +997,14 @@ int xdp_tproxy_lan_ingress(struct __sk_buff *skb)
 	__u8 ihl;
 	__u16 l3proto;
 	__u8 l4proto;
-	__u32 link_h_len;
 
-	if (get_link_h_len(skb->ifindex, &link_h_len))
-		return TC_ACT_OK;
-	int ret = skb_parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
+	int ret = xdp_parse_transport(ctx, &ethh, &iph, &ipv6h, &icmp6h,
 				      &tcph, &udph, &ihl, &l3proto, &l4proto);
-	if (ret) {
-		bpf_printk("skb_parse_transport: %d", ret);
-		return TC_ACT_OK;
-	}
+	if (ret)
+		return XDP_PASS;
+
 	if (l4proto == IPPROTO_ICMPV6)
-		return TC_ACT_OK;
+		return XDP_PASS;
 
 	// Prepare five tuples.
 	struct tuples tuples;
@@ -1033,7 +1029,7 @@ int xdp_tproxy_lan_ingress(struct __sk_buff *skb)
 	__u32 flag[8];
 	void *l4hdr;
 
-	if (skb->protocol == bpf_htons(ETH_P_IP)) {
+	if (l3proto == bpf_htons(ETH_P_IP)) {
 		tuple.ipv4.daddr = tuples.five.dip.u6_addr32[3];
 		tuple.ipv4.saddr = tuples.five.sip.u6_addr32[3];
 		tuple.ipv4.dport = tuples.five.dport;
@@ -1054,7 +1050,7 @@ int xdp_tproxy_lan_ingress(struct __sk_buff *skb)
 		if (tcph.syn && !tcph.ack)
 			goto new_connection;
 
-		sk = bpf_skc_lookup_tcp(skb, &tuple, tuple_size,
+		sk = bpf_skc_lookup_tcp(ctx, &tuple, tuple_size,
 					PARAM.dae_netns_id, 0);
 		if (sk) {
 			if (sk->state != BPF_TCP_LISTEN) {
@@ -1072,7 +1068,7 @@ int xdp_tproxy_lan_ingress(struct __sk_buff *skb)
 		if (!(tcph.syn && !tcph.ack)) {
 			// Not a new TCP connection.
 			// Perhaps single-arm.
-			return TC_ACT_OK;
+			return XDP_PASS;
 		}
 		l4hdr = &tcph;
 		flag[0] = L4ProtoType_TCP;
@@ -1080,7 +1076,7 @@ int xdp_tproxy_lan_ingress(struct __sk_buff *skb)
 		l4hdr = &udph;
 		flag[0] = L4ProtoType_UDP;
 	}
-	if (skb->protocol == bpf_htons(ETH_P_IP))
+	if (l3proto == bpf_htons(ETH_P_IP))
 		flag[1] = IpVersionType_4;
 	else
 		flag[1] = IpVersionType_6;
@@ -1098,7 +1094,7 @@ int xdp_tproxy_lan_ingress(struct __sk_buff *skb)
 			tuples.five.dip.u6_addr32, mac);
 	if (s64_ret < 0) {
 		bpf_printk("shot routing: %d", s64_ret);
-		return TC_ACT_SHOT;
+		return XDP_DROP;
 	}
 	struct routing_result routing_result = { 0 };
 
@@ -1125,7 +1121,7 @@ int xdp_tproxy_lan_ingress(struct __sk_buff *skb)
 				  &routing_result, BPF_ANY);
 	if (ret) {
 		bpf_printk("shot save routing result: %d", ret);
-		return TC_ACT_SHOT;
+		return XDP_DROP;
 	}
 #if defined(__DEBUG_ROUTING) || defined(__PRINT_ROUTING_RESULT)
 	if (l4proto == IPPROTO_TCP) {
@@ -1139,7 +1135,7 @@ int xdp_tproxy_lan_ingress(struct __sk_buff *skb)
 	}
 #endif
 	if (routing_result.outbound == OUTBOUND_DIRECT) {
-		skb->mark = routing_result.mark;
+		// struct xdp_md has no mark field, so routing_result.mark cannot be applied here.
 		goto direct;
 	} else if (unlikely(routing_result.outbound == OUTBOUND_BLOCK)) {
 		goto block;
@@ -1149,7 +1145,7 @@ int xdp_tproxy_lan_ingress(struct __sk_buff *skb)
 	struct outbound_connectivity_query q = { 0 };
 
 	q.outbound = routing_result.outbound;
-	q.ipversion = skb->protocol == bpf_htons(ETH_P_IP) ? 4 : 6;
+	q.ipversion = l3proto == bpf_htons(ETH_P_IP) ? 4 : 6;
 	q.l4proto = l4proto;
 	__u32 *alive;
 
@@ -1162,14 +1158,14 @@ int xdp_tproxy_lan_ingress(struct __sk_buff *skb)
 
 	// Assign to control plane.
 control_plane:
-	return skb_redirect_to_control_plane(skb, link_h_len, &tuples, &ethh, &tcph,
-					     0, l3proto, l4proto);
+	return xdp_redirect_to_control_plane(ctx, &tuples, &ethh,
+					     &tcph, 0, l3proto, l4proto);
 
 direct:
-	return TC_ACT_OK;
+	return XDP_PASS;
 
 block:
-	return TC_ACT_SHOT;
+	return XDP_DROP;
 }
 
 // Cookie will change after the first packet, so we just use it for

From 71a6d3462640a63bf079909705fdce774ba875be Mon Sep 17 00:00:00 2001
From: Gray Liang <gray.liang@isovalent.com>
Date: Fri, 12 Apr 2024 19:19:33 +0800
Subject: [PATCH 11/11] control: Attach XDP to lan

---
 control/control_plane_core.go | 77 ++++++++++++++++++++---------------
 1 file changed, 45 insertions(+), 32 deletions(-)

diff --git a/control/control_plane_core.go b/control/control_plane_core.go
index b59fbff5d..bfd2a4313 100644
--- a/control/control_plane_core.go
+++ b/control/control_plane_core.go
@@ -284,7 +284,7 @@ func (c *controlPlaneCore) _bindLan(ifname string) error {
 	}
 	c.log.Infof("Bind to LAN: %v", ifname)
 
-	link, err := netlink.LinkByName(ifname)
+	iface, err := netlink.LinkByName(ifname)
 	if err != nil {
 		return err
 	}
@@ -297,52 +297,65 @@ func (c *controlPlaneCore) _bindLan(ifname string) error {
 	_ = c.addQdisc(ifname)
 	_ = c.mapLinkType(ifname)
 	/// Insert an elem into IfindexParamsMap.
-	ifParams, err := getIfParamsFromLink(link)
+	ifParams, err := getIfParamsFromLink(iface)
 	if err != nil {
 		return err
 	}
 	if err = ifParams.CheckVersionRequirement(c.kernelVersion); err != nil {
 		return err
 	}
-	if err := c.bpf.IfindexParamsMap.Update(uint32(link.Attrs().Index), ifParams, ebpf.UpdateAny); err != nil {
+	if err := c.bpf.IfindexParamsMap.Update(uint32(iface.Attrs().Index), ifParams, ebpf.UpdateAny); err != nil {
 		return fmt.Errorf("update IfindexIpsMap: %w", err)
 	}
 
-	// Insert filters.
-	filterIngress := &netlink.BpfFilter{
-		FilterAttrs: netlink.FilterAttrs{
-			LinkIndex: link.Attrs().Index,
-			Parent:    netlink.HANDLE_MIN_INGRESS,
-			Handle:    netlink.MakeHandle(0x2023, 0b100+uint16(c.flip)),
-			Protocol:  unix.ETH_P_ALL,
-			// Priority should be behind of WAN's
-			Priority: 2,
-		},
-		Fd:           c.bpf.bpfPrograms.TcTproxyLanIngress.FD(),
-		Name:         consts.AppName + "_lan_ingress",
-		DirectAction: true,
-	}
-	// Remove and add.
-	_ = netlink.FilterDel(filterIngress)
-	if !c.isReload {
-		// Clean up thoroughly.
-		filterIngressFlipped := deepcopy.Copy(filterIngress).(*netlink.BpfFilter)
-		filterIngressFlipped.FilterAttrs.Handle ^= 1
-		_ = netlink.FilterDel(filterIngressFlipped)
-	}
-	if err := netlink.FilterAdd(filterIngress); err != nil {
-		return fmt.Errorf("cannot attach ebpf object to filter ingress: %w", err)
+	// Attach in XDP generic mode explicitly (see Flags), which also works on drivers without native XDP support.
+	l, err := link.AttachXDP(link.XDPOptions{
+		Program:   c.bpf.bpfPrograms.XdpTproxyLanIngress,
+		Interface: iface.Attrs().Index,
+		Flags:     link.XDPGenericMode,
+	})
+	if err != nil {
+		return fmt.Errorf("AttachXDP: %w", err)
 	}
 	c.deferFuncs = append(c.deferFuncs, func() error {
-		if err := netlink.FilterDel(filterIngress); err != nil {
-			return fmt.Errorf("FilterDel(%v:%v): %w", ifname, filterIngress.Name, err)
-		}
-		return nil
+		return l.Close()
 	})
 
+	// Insert filters.
+	//filterIngress := &netlink.BpfFilter{
+	//	FilterAttrs: netlink.FilterAttrs{
+	//		LinkIndex: iface.Attrs().Index,
+	//		Parent:    netlink.HANDLE_MIN_INGRESS,
+	//		Handle:    netlink.MakeHandle(0x2023, 0b100+uint16(c.flip)),
+	//		Protocol:  unix.ETH_P_ALL,
+	//		// Priority should be behind of WAN's
+	//		Priority: 2,
+	//	},
+	//	Fd:           c.bpf.bpfPrograms.TcTproxyLanIngress.FD(),
+	//	Name:         consts.AppName + "_lan_ingress",
+	//	DirectAction: true,
+	//}
+	//// Remove and add.
+	//_ = netlink.FilterDel(filterIngress)
+	//if !c.isReload {
+	//	// Clean up thoroughly.
+	//	filterIngressFlipped := deepcopy.Copy(filterIngress).(*netlink.BpfFilter)
+	//	filterIngressFlipped.FilterAttrs.Handle ^= 1
+	//	_ = netlink.FilterDel(filterIngressFlipped)
+	//}
+	//if err := netlink.FilterAdd(filterIngress); err != nil {
+	//	return fmt.Errorf("cannot attach ebpf object to filter ingress: %w", err)
+	//}
+	//c.deferFuncs = append(c.deferFuncs, func() error {
+	//	if err := netlink.FilterDel(filterIngress); err != nil {
+	//		return fmt.Errorf("FilterDel(%v:%v): %w", ifname, filterIngress.Name, err)
+	//	}
+	//	return nil
+	//})
+
 	filterEgress := &netlink.BpfFilter{
 		FilterAttrs: netlink.FilterAttrs{
-			LinkIndex: link.Attrs().Index,
+			LinkIndex: iface.Attrs().Index,
 			Parent:    netlink.HANDLE_MIN_EGRESS,
 			Handle:    netlink.MakeHandle(0x2023, 0b010+uint16(c.flip)),
 			Protocol:  unix.ETH_P_ALL,