Skip to content

Commit

Permalink
fix(bootstrap): correct some bugs in cilium compatibility
Browse files Browse the repository at this point in the history
  • Loading branch information
HoKim98 committed Aug 11, 2024
1 parent c394b82 commit dc62c31
Show file tree
Hide file tree
Showing 13 changed files with 127 additions and 30 deletions.
4 changes: 4 additions & 0 deletions crates/kiss/ansible/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ pub struct KissConfig {
pub group_enforce_ansible_control_planes: bool,
pub group_force_reset: bool,
pub group_force_reset_os: bool,
pub group_reset_storage: bool,
pub kiss_cluster_name: String,
pub kubespray_image: String,
pub network_interface_mtu_size: u16,
Expand All @@ -27,6 +28,7 @@ pub struct KissConfig {
pub network_ipv4_subnet: Ipv4Net,
pub network_nameserver_incluster_ipv4: Ipv4Addr,
pub os_default: String,
pub os_kernel: String,
}

impl KissConfig {
Expand Down Expand Up @@ -55,6 +57,7 @@ impl KissConfig {
)?,
group_force_reset: infer(&config, "group_force_reset")?,
group_force_reset_os: infer(&config, "group_force_reset_os")?,
group_reset_storage: infer(&config, "group_reset_storage")?,
kiss_cluster_name: infer(&config, "kiss_cluster_name")?,
kubespray_image: infer(&config, "kubespray_image")?,
network_interface_mtu_size: infer(&config, "network_interface_mtu_size")?,
Expand All @@ -65,6 +68,7 @@ impl KissConfig {
network_ipv4_subnet: infer(&config, "network_ipv4_subnet")?,
network_nameserver_incluster_ipv4: infer(&config, "network_nameserver_incluster_ipv4")?,
os_default: infer(&config, "os_default")?,
os_kernel: infer(&config, "os_kernel")?,
})
}
}
Expand Down
18 changes: 18 additions & 0 deletions crates/kiss/ansible/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,14 @@ impl AnsibleClient {
}),
spec: Some(PodSpec {
affinity: Some(crate::job::affinity()),
dns_config: Some(PodDNSConfig {
nameservers: Some(vec![
self.kiss.bootstrapper_network_dns_server_ns1.to_string(),
self.kiss.bootstrapper_network_dns_server_ns2.to_string(),
]),
..Default::default()
}),
host_network: Some(true),
priority_class_name: Some(priority_class_name.into()),
restart_policy: Some("OnFailure".into()),
service_account: Some("ansible-playbook".into()),
Expand Down Expand Up @@ -312,6 +320,11 @@ impl AnsibleClient {
value: Some(self.kiss.group_force_reset_os.to_string()),
..Default::default()
},
EnvVar {
name: "kiss_group_reset_storage".into(),
value: Some(self.kiss.group_reset_storage.to_string()),
..Default::default()
},
EnvVar {
name: "kiss_group_role".into(),
value: Some(group.role.to_string()),
Expand Down Expand Up @@ -420,6 +433,11 @@ impl AnsibleClient {
value: Some(self.kiss.os_default.to_string()),
..Default::default()
},
EnvVar {
name: "kiss_os_kernel".into(),
value: Some(self.kiss.os_kernel.to_string()),
..Default::default()
},
EnvVar {
name: "kiss_power_intel_amt_host".into(),
value: job
Expand Down
2 changes: 1 addition & 1 deletion templates/bootstrap/bootstrap.sh
Original file line number Diff line number Diff line change
Expand Up @@ -732,7 +732,7 @@ function install_kiss_cluster() {
kubectl -n kiss rollout status deployment operator

# Register the boxes
for node in ${nodes}; do
for node in $@; do
__log 'INFO' "Registering the box: ${node} ..."
cat <<EOF | $(__shell "${node_first}") kubectl create -f -
---
Expand Down
2 changes: 2 additions & 0 deletions templates/bootstrap/kiss-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ data:
group_enforce_ansible_control_planes: "false"
group_force_reset: "false"
group_force_reset_os: "false"
group_reset_storage: "true"

###########################################################################
# Bootstrapper Node Configuration
Expand Down Expand Up @@ -83,6 +84,7 @@ data:
# OS Configuration
###########################################################################
os_default: rocky9 # One of: flatcar, rocky9 (default)
os_kernel: stable # One of: edge, stable (default)

###########################################################################
# Service/CSI Configuration
Expand Down
17 changes: 13 additions & 4 deletions templates/bootstrap/node/ansible/defaults/all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ all:
# @schema
# -- Enable installation of PodCIDR routes between worker
# nodes if worker nodes share a common L2 network segment.
autoDirectNodeRoutes: false
autoDirectNodeRoutes: true

# -- Enable bandwidth manager to optimize TCP and UDP workloads and allow
# for rate-limiting traffic from individual Pods with EDT (Earliest Departure
Expand Down Expand Up @@ -248,13 +248,22 @@ all:
# preallocateMaps: false
# -- (string) Mode for Pod devices for the core datapath (veth, netkit, netkit-l2, lb-only)
# @default -- `veth`
datapathMode: netkit
datapathMode: netkit # FIXME: migrate OS from Rocky to Ubuntu that supports `CONFIG_NETKIT`
# @schema
# type: [null, boolean]
# @schema
# -- (bool) Enable native IP masquerade support in eBPF
# @default -- `false`
masquerade: true
# @schema
# type: [null, boolean]
# @schema
# -- (bool) Configure whether direct routing mode should route traffic via
# host stack (true) or directly and more efficiently out of BPF (false) if
# the kernel supports it. The latter has the implication that it will also
# bypass netfilter in the host namespace.
# @default -- `false`
hostLegacyRouting: false
# -- Allow cluster external access to ClusterIP services.
lbExternalClusterIP: true

Expand Down Expand Up @@ -532,7 +541,7 @@ all:
# direct routing and the Kubernetes CIDR is included in the native routing CIDR,
# the user must configure the routes to reach pods, either manually or by
# setting the auto-direct-node-routes flag.
# ipv4NativeRoutingCIDR: "{{ kiss_network_ipv4_subnet }}"
ipv4NativeRoutingCIDR: "10.0.0.0/8"

ipv6:
# -- Enable IPv6 support.
Expand All @@ -548,7 +557,7 @@ all:
# direct routing and the Kubernetes CIDR is included in the native routing CIDR,
# the user must configure the routes to reach pods, either manually or by
# setting the auto-direct-node-routes flag.
# ipv6NativeRoutingCIDR: "{{ kiss_network_ipv6_subnet }}"
ipv6NativeRoutingCIDR: "fd00::/100"

# -- Configure Kubernetes specific configuration
k8s:
Expand Down
21 changes: 21 additions & 0 deletions templates/contrib/speed-test/iperf3/daemonset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
# Network speed-test helper: schedules one iperf3-capable pod on every node
# so node-to-node throughput can be measured (e.g. `iperf3 -c <pod-ip>`).
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: iperf3-ds
  # NOTE(review): deployed into `default` — confirm this contrib manifest is
  # intended for the default namespace rather than a dedicated test namespace.
  namespace: default
spec:
  selector:
    matchLabels:
      app: iperf3
  template:
    metadata:
      labels:
        app: iperf3
    spec:
      containers:
        - name: iperf3
          # swiss-army-knife image bundles iperf3 alongside other network
          # debugging tools; `latest` tag is intentional for a contrib/test tool.
          image: docker.io/leodotcloud/swiss-army-knife:latest
          imagePullPolicy: Always
          ports:
            - containerPort: 5201 # iperf3 default server port
9 changes: 0 additions & 9 deletions templates/kiss/matchbox/boot/rocky9.ks
Original file line number Diff line number Diff line change
Expand Up @@ -196,15 +196,6 @@ echo 'timeout=300' >>/etc/dnf/dnf.conf
echo 'fastestmirror=True' >>/etc/dnf/dnf.conf
echo 'max_parallel_downloads=5' >>/etc/dnf/dnf.conf
# Kernel Configuration
## Install Bleeding-edge kernel
dnf install --enablerepo='elrepo-kernel' -y \
kernel-ml \
kernel-ml-core \
kernel-ml-devel \
kernel-ml-modules \
kernel-ml-modules-extra
# Advanced Network configuration
mkdir -p /etc/NetworkManager/system-connections/
## Wireless - WIFI
Expand Down
1 change: 1 addition & 0 deletions templates/kiss/tasks/commission/storage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@

- name: Cleanup disks
when:
- kiss_group_reset_storage is not defined or kiss_group_reset_storage
- kiss_group_role_is_member is defined and kiss_group_role_is_member
- not kiss_os_exists or kiss_os_dirty
- not kiss_storage_exists
Expand Down
56 changes: 40 additions & 16 deletions templates/kiss/tasks/commission/system-kernel-upgrade.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
---
- name: Check whether the latest kernel is running
when: kiss_os_kernel is defined and kiss_os_kernel == 'edge'
block:
- name: Check whether the latest kernel is running - Flatcar Container Linux
when: kiss_os_default == 'flatcar'
Expand All @@ -8,11 +9,12 @@
- name: Check whether the latest kernel is running - RockyLinux
when: kiss_os_default == 'rocky9'
set_fact:
kiss_is_kernel_latest: "{{ '.elrepo.' in ansible_facts.kernel }}"
kiss_is_kernel_latest: "{{ '.fc' in ansible_facts.kernel }}"
register: boot_file

- name: Upgrade kernel - RockyLinux
when:
- kiss_os_kernel is defined and kiss_os_kernel == 'edge'
- not kiss_is_kernel_latest
- kiss_os_default == 'rocky9'
block:
Expand All @@ -21,32 +23,54 @@
name: "*"
state: latest

- name: Upgrade kernel - RockyLinux - Add ELRepo Repository
- name: Upgrade kernel - RockyLinux - Install dependencies
dnf:
name:
- elrepo-release
- koji
- grubby
- sqlite
enablerepo: elrepo-kernel
enablerepo: epel
state: latest

- name: Upgrade kernel - RockyLinux - Remove Stable Kernel
dnf:
name:
- kernel
- kernel-core
- kernel-devel
- kernel-modules
- kernel-modules-core
state: absent
autoremove: no
- name: Upgrade kernel - RockyLinux - Get the latest kernel version
shell:
cmd: >
koji list-builds --package=kernel --state=COMPLETE
| awk '{print $1}'
| grep -P '^kernel-[0-9\.-]+\.fc[0-9]+$'
| sort -V
| tail -n1
register: kiss_kernel_version

- name: Upgrade kernel - RockyLinux - Install Bleeding-edge Kernel ({{ kiss_kernel_version.stdout }})
vars:
kiss_kernel_dir: /tmp/kernel-koji
shell:
cmd: >
mkdir -p "{{ kiss_kernel_dir }}"
&& cd "{{ kiss_kernel_dir }}"
&& koji download-build --noprogress --arch="{{ ansible_architecture }}" "{{ kiss_kernel_version.stdout }}"
&& dnf install -y
"{{ kiss_kernel_dir }}/kernel-{{ kiss_kernel_version.stdout[7:] }}.{{ ansible_architecture }}.rpm"
"{{ kiss_kernel_dir }}/kernel-core-{{ kiss_kernel_version.stdout[7:] }}.{{ ansible_architecture }}.rpm"
"{{ kiss_kernel_dir }}/kernel-devel-{{ kiss_kernel_version.stdout[7:] }}.{{ ansible_architecture }}.rpm"
"{{ kiss_kernel_dir }}/kernel-modules-{{ kiss_kernel_version.stdout[7:] }}.{{ ansible_architecture }}.rpm"
"{{ kiss_kernel_dir }}/kernel-modules-core-{{ kiss_kernel_version.stdout[7:] }}.{{ ansible_architecture }}.rpm"
&& cd /
&& rm -rf "{{ kiss_kernel_dir }}"
&& grubby --set-default="/boot/vmlinuz-{{ kiss_kernel_version.stdout[7:] }}.{{ ansible_architecture }}"
&& grub2-mkconfig -o /boot/grub2/grub.cfg
- name: Reboot the boxes if upgraded (The task will be restarted)
when: not kiss_is_kernel_latest
when:
- kiss_os_kernel is defined and kiss_os_kernel == 'edge'
- not kiss_is_kernel_latest
reboot:
reboot_timeout: 3600 # 1h (booting can take a long time)

- name: Assert rebooting
when: not kiss_is_kernel_latest
when:
- kiss_os_kernel is defined and kiss_os_kernel == 'edge'
- not kiss_is_kernel_latest
fail:
msg: The nodes should be rebooted!
8 changes: 8 additions & 0 deletions templates/kiss/tasks/commission/system-systemd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
# Commission task: disable the irqbalance daemon, as recommended by the Cilium
# performance tuning guide (irqbalance can migrate NIC interrupts between CPUs
# and disturb latency-sensitive datapath workloads).
# See: https://docs.cilium.io/en/v1.16/operations/performance/tuning/#stop-irqbalance-and-pin-the-nic-interrupts-to-specific-cpus
- name: Disable irqbalance
  # Stop the unit immediately and prevent it from starting on boot.
  # NOTE(review): the referenced guide also pins NIC IRQs to specific CPUs;
  # no pinning is done here — confirm whether that is handled elsewhere or
  # intentionally omitted.
  systemd:
    name: irqbalance.service
    state: stopped
    enabled: false
    daemon_reload: true
3 changes: 3 additions & 0 deletions templates/kiss/tasks/commission/system.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
---
- name: Provision Kernel
include_tasks: system-kernel.yaml

- name: Provision SystemD
include_tasks: system-systemd.yaml
2 changes: 2 additions & 0 deletions templates/kiss/tasks/common/playbook-common.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
kiss_group_enable_default_cluster: "{{ lookup('env', 'kiss_group_enable_default_cluster') == 'true' }}"
kiss_group_force_reset: "{{ lookup('env', 'kiss_group_force_reset') == 'true' }}"
kiss_group_force_reset_os: "{{ lookup('env', 'kiss_group_force_reset_os') == 'true' }}"
kiss_group_reset_storage: "{{ lookup('env', 'kiss_group_reset_storage') == 'true' }}"
kiss_group_role: "{{ lookup('env', 'kiss_group_role') }}"
kiss_group_role_is_domain_specific: "{{ lookup('env', 'kiss_group_role_is_domain_specific') }}"
kiss_group_role_is_member: "{{ lookup('env', 'kiss_group_role_is_member') == 'true' }}"
Expand All @@ -71,6 +72,7 @@
kiss_network_wireless_wifi_ssid: "{{ lookup('env', 'kiss_network_wireless_wifi_ssid') }}"
kiss_os_default: "{{ lookup('env', 'kiss_os_default') }}"
kiss_os_hot_install: "{{ lookup('env', 'kiss_os_default') in ['flatcar'] }}"
kiss_os_kernel: "{{ lookup('env', 'kiss_os_kernel') }}"
kiss_power_intel_amt_host: "{{ lookup('env', 'kiss_power_intel_amt_host') }}"
kiss_power_intel_amt_username: "{{ lookup('env', 'kiss_power_intel_amt_username') }}"
kiss_power_intel_amt_password: "{{ lookup('env', 'kiss_power_intel_amt_password') }}"
Expand Down
14 changes: 14 additions & 0 deletions templates/kiss/tasks/join/add-node-role.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,18 @@
---
- name: Post-install | Wait for apiserver to be operated
hosts: target
tasks:
- command: "{{ bin_dir }}/kubectl version"
delegate_to: "{{ groups['kube_control_plane'] | first }}"
retries: 10
delay: 5

- command: >
{{ bin_dir }}/kubectl get nodes {{ inventory_hostname }} --output name
delegate_to: "{{ groups['kube_control_plane'] | first }}"
retries: 10
delay: 5
- hosts: target
tasks:
- name: Mark the bootstrapped node as "{{ kiss_group_role }}"
Expand Down

0 comments on commit dc62c31

Please sign in to comment.