https://github.com/cilium/cilium
Tip revision: c191ef6f9b5b1dd87bb4f513bfc67641b421dad0 authored by Maciej Kwiek on 10 August 2023, 16:54:57 UTC
Prepare for release v1.14.1
Prepare for release v1.14.1
Tip revision: c191ef6
init.sh
#!/bin/bash
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
# Copyright Authors of Cilium
# Positional arguments. Positions that appear only as commented-out
# assignments are not read anywhere in this script.
# LIB=${1}
# Directory holding encap.state, device.state and globals/node_config.h.
RUNDIR=${2}
# Root of the per-device sysctl tree; files are written below
# <root>/ipv4/conf/<dev>/ and <root>/ipv6/conf/<dev>/.
PROCSYSNETDIR=${3}
# Root under which <dev>/ifindex is read.
SYSCLASSNETDIR=${4}
# Host addresses. The literal string "<nil>" turns off the handling of the
# corresponding address family throughout the script.
IP4_HOST=${5}
IP6_HOST=${6}
# Compared against "ipip" and "direct" further down.
MODE=${7}
# One of "vxlan", "geneve" or "<nil>"; any other value aborts the script.
TUNNEL_PROTOCOL=${8}
# Only set if TUNNEL_PROTOCOL = "vxlan", "geneve"
# ("<nil>" otherwise, in which case no dstport option is passed).
TUNNEL_PORT=${9}
# Only set if MODE = "direct"
# ("<nil>" otherwise). Split on ';' when compared against interface names.
NATIVE_DEVS=${10}
# HOST_DEV1 is the device named in the legacy proxy route deletions.
# HOST_DEV2 is the device whose first inet6 address is looked up and whose
# egress tc filter is removed when the two names differ.
HOST_DEV1=${11}
HOST_DEV2=${12}
# MTU applied to the tunnel device.
MTU=${13}
# SOCKETLB=${14}
# SOCKETLB_PEER=${15}
# CGROUP_ROOT=${16}
# BPFFS_ROOT=${17}
# "true" makes the script record NATIVE_DEVS in device.state even when MODE
# is not "direct".
NODE_PORT=${18}
# NODE_PORT_BIND=${19}
# MCPU=${20}
# NR_CPUS=${21}
# NOTE(review): ENDPOINT_ROUTES and FILTER_PRIO are assigned but not
# referenced again in the visible part of this script.
ENDPOINT_ROUTES=${22}
# "true" enables move_local_rules and setup_proxy_rules.
PROXY_RULE=${23}
FILTER_PRIO=${24}
# Passed as the "proto" of every ip rule / ip route this script installs.
DEFAULT_RTPROTO=${25}
# Preference the local-table lookup rule is moved to.
LOCAL_RULE_PRIO=${26}
# If the value below is changed, be sure to update bugtool/cmd/configuration.go
# as well when dumping the routing table in bugtool. See GH-5828.
# PROXY_RT_TABLE is only cleaned up by this script; TO_PROXY_RT_TABLE is the
# table that receives the local default routes for proxy traffic.
PROXY_RT_TABLE=2005
TO_PROXY_RT_TABLE=2004
# Abort on the first unhandled command failure, trace every command, and
# make a pipeline fail when any of its stages fails.
set -e
set -x
set -o pipefail
# Remove old legacy files
# (best effort: errors, e.g. a missing file, are ignored).
rm $RUNDIR/encap.state 2> /dev/null || true
# Bring a device up and apply the per-device forwarding sysctls.
# Globals:   IP4_HOST, IP6_HOST, PROCSYSNETDIR (read)
# Arguments: $1 - name of the network device
# Returns:   status of the last sysctl write that was attempted (0 when the
#            IPv4 section is skipped)
function setup_dev()
{
  local -r dev=$1
  local knob

  ip link set $dev up

  if [ "$IP6_HOST" != "<nil>" ]; then
    echo 1 > "${PROCSYSNETDIR}/ipv6/conf/${dev}/forwarding"
  fi

  if [ "$IP4_HOST" != "<nil>" ]; then
    # Each entry is <sysctl file>=<value>; the order of the writes matters
    # only for the returned status, which is that of the last write.
    for knob in forwarding=1 rp_filter=0 accept_local=1 send_redirects=0; do
      echo "${knob#*=}" > "${PROCSYSNETDIR}/ipv4/conf/${dev}/${knob%%=*}"
    done
  fi
}
# Move the "from all lookup local" rule of one address family from pref 0
# to pref LOCAL_RULE_PRIO.
# Globals:   LOCAL_RULE_PRIO, DEFAULT_RTPROTO (read)
# Arguments: $1 - ip command including the family flag, e.g. "ip -4"
# Outputs:   the matching rule (if already present) and, on failure, an
#            error message plus the current local rules on stdout
# Returns:   0 on success or when the family has no rules; exits the script
#            with status 1 when the rule could not be moved
function move_local_rules_af()
{
  # $IP holds a command plus its family flag and is expanded unquoted on
  # purpose so that it word-splits into "ip" and "-4"/"-6".
  local IP=$1

  # Do not move the rule if we don't support the address family
  if [ -z "$($IP rule list)" ]; then
    return
  fi

  # move the local table lookup rule from pref 0 to pref LOCAL_RULE_PRIO so we
  # can insert the cilium ip rules before the local table. It is strictly
  # required to add the new local rule before deleting the old one as
  # otherwise local addresses will not be reachable for a short period of
  # time.
  #
  # The priority is matched against the leading "PRIO:" column only. A plain
  # substring match would also hit unrelated rules (e.g. a selector such as
  # "from 10.0.0.100" when the priority is 100) and skip the add.
  $IP rule list | grep "^${LOCAL_RULE_PRIO}:" | grep "lookup local" || {
    $IP rule add from all lookup local pref ${LOCAL_RULE_PRIO} proto $DEFAULT_RTPROTO
  }
  $IP rule del from all lookup local pref 0 2> /dev/null || true

  # check if the move of the local table move was successful and restore
  # it otherwise
  if [ "$($IP rule list | grep -c "lookup local")" -eq "0" ]; then
    $IP rule add from all lookup local pref 0 proto $DEFAULT_RTPROTO
    $IP rule del from all lookup local pref ${LOCAL_RULE_PRIO}
    echo "Error: The kernel does not support moving the local table routing rule"
    echo "Local routing rules:"
    $IP rule list lookup local
    exit 1
  fi
}
# Run move_local_rules_af for every address family whose host address is
# not the literal "<nil>", IPv4 first and IPv6 second.
# Globals: IP4_HOST, IP6_HOST (read)
function move_local_rules()
{
  local family host_var

  for family in 4 6; do
    # Resolve IP4_HOST / IP6_HOST through indirect expansion.
    host_var="IP${family}_HOST"
    if [ "${!host_var}" != "<nil>" ]; then
      move_local_rules_af "ip -${family}"
    fi
  done
}
# Install (or, for a disabled address family, remove) the policy-routing
# rule and local route used for traffic destined to the proxy, and clean up
# the legacy $PROXY_RT_TABLE rule and routes.
# Globals: IP4_HOST, IP6_HOST, HOST_DEV1, HOST_DEV2, DEFAULT_RTPROTO,
#          PROXY_RT_TABLE, TO_PROXY_RT_TABLE (read)
function setup_proxy_rules()
{
  # Keep the helper variables out of the global scope. The rule specs are
  # expanded unquoted on purpose so they split into separate ip arguments.
  local from_ingress_rulespec to_proxy_rulespec ip6_lladdr

  # TODO(brb): remove $PROXY_RT_TABLE -related code in v1.15
  from_ingress_rulespec="fwmark 0xA00/0xF00 pref 10 lookup $PROXY_RT_TABLE proto $DEFAULT_RTPROTO"

  # Any packet to an ingress or egress proxy uses a separate routing table
  # that routes the packet to the loopback device regardless of the destination
  # address in the packet. For this to work the ctx must have a socket set
  # (e.g., via TPROXY).
  to_proxy_rulespec="fwmark 0x200/0xF00 pref 9 lookup $TO_PROXY_RT_TABLE proto $DEFAULT_RTPROTO"

  if [ "$IP4_HOST" != "<nil>" ]; then
    if [ -n "$(ip -4 rule list)" ]; then
      if [ -z "$(ip -4 rule list $to_proxy_rulespec)" ]; then
        ip -4 rule add $to_proxy_rulespec
      fi
      # Best effort: the legacy rule is normally absent, so the error is
      # silenced just like for the other legacy deletions in this function.
      ip -4 rule delete $from_ingress_rulespec 2> /dev/null || true
    fi

    # Traffic to the host proxy is local
    ip route replace table $TO_PROXY_RT_TABLE local 0.0.0.0/0 dev lo proto $DEFAULT_RTPROTO

    # The $PROXY_RT_TABLE is no longer in use, so delete it
    ip route delete table $PROXY_RT_TABLE $IP4_HOST/32 dev $HOST_DEV1 2>/dev/null || true
    ip route delete table $PROXY_RT_TABLE default via $IP4_HOST 2>/dev/null || true
  else
    ip -4 rule del $to_proxy_rulespec 2> /dev/null || true
    ip -4 rule del $from_ingress_rulespec 2> /dev/null || true
  fi

  if [ "$IP6_HOST" != "<nil>" ]; then
    if [ -n "$(ip -6 rule list)" ]; then
      if [ -z "$(ip -6 rule list $to_proxy_rulespec)" ]; then
        ip -6 rule add $to_proxy_rulespec
      fi
      # Best effort, see the IPv4 branch.
      ip -6 rule delete $from_ingress_rulespec 2> /dev/null || true
    fi

    # First inet6 address listed for HOST_DEV2, without the prefix length.
    # Declared separately above so that a failing pipeline is not masked by
    # the "local" builtin.
    ip6_lladdr=$(ip -6 addr show dev $HOST_DEV2 | grep inet6 | head -1 | awk '{print $2}' | awk -F'/' '{print $1}')
    if [ -n "$ip6_lladdr" ]; then
      # Traffic to the host proxy is local
      ip -6 route replace table $TO_PROXY_RT_TABLE local ::/0 dev lo proto $DEFAULT_RTPROTO

      # The $PROXY_RT_TABLE is no longer in use, so delete it
      ip -6 route delete table $PROXY_RT_TABLE ${ip6_lladdr}/128 dev $HOST_DEV1 2>/dev/null || true
      ip -6 route delete table $PROXY_RT_TABLE default via $ip6_lladdr dev $HOST_DEV1 2>/dev/null || true
    fi
  else
    ip -6 rule del $to_proxy_rulespec 2> /dev/null || true
    ip -6 rule del $from_ingress_rulespec 2> /dev/null || true
  fi
}
# Print a random MAC address (aa:bb:cc:dd:ee:ff, lower-case hex) read from
# /dev/urandom. In the first octet the lowest bit is cleared and the second
# lowest bit is set.
# Outputs: the address on stdout, without a trailing newline
# Returns: 0 on success, 1 when no random bytes could be read
function rnd_mac_addr()
{
  # Declared separately from the assignments: "local var=$(cmd)" would
  # always succeed and hide a failing read.
  local lower upper

  # od prints " xx xx xx xx xx"; turning each space into ':' yields the
  # last five octets, including the leading separator.
  lower=$(od /dev/urandom -N5 -t x1 -An | sed 's/ /:/g') || return 1
  upper=$(od /dev/urandom -N1 -t x1 -An | cut -d' ' -f2) || return 1

  # Without pipefail a failing od is hidden by sed/cut, so also reject
  # empty results.
  if [ -z "$lower" ] || [ -z "$upper" ]; then
    return 1
  fi

  printf '%02x%s' "$(( 0x$upper & 0xfe | 0x02 ))" "$lower"
}
# Create the tunnel device $ENCAP_DEV of type $TUNNEL_PROTOCOL with a random
# MAC address; calls encap_fail when the device cannot be created.
# Globals: ENCAP_DEV, TUNNEL_PROTOCOL, TUNNEL_PORT (read), TUNNEL_OPTS (written)
function create_encap_dev()
{
  # "external" is always requested; a destination port is prepended only
  # when one was configured.
  case "${TUNNEL_PORT}" in
    "<nil>") TUNNEL_OPTS="external" ;;
    *)       TUNNEL_OPTS="dstport $TUNNEL_PORT external" ;;
  esac

  # TUNNEL_OPTS is left unquoted so that it splits into separate arguments.
  if ! ip link add name $ENCAP_DEV address $(rnd_mac_addr) type $TUNNEL_PROTOCOL $TUNNEL_OPTS; then
    encap_fail
  fi
}
# Report a failed encapsulation device setup on stderr, list the existing
# devices of type $MODE, and terminate the script with status 1.
# Globals: ENCAP_DEV, MODE (read)
function encap_fail()
{
  # One redirection for the whole group sends all three outputs to stderr.
  {
    echo "ERROR: Setup of encapsulation device $ENCAP_DEV has failed. Is another program using a $MODE device?"
    echo "Configured $MODE devices on the system:"
    ip link show type $MODE
  } >&2
  exit 1
}
# The proxy routing setup only runs when PROXY_RULE is exactly "true".
if [[ "$PROXY_RULE" == "true" ]]; then
  # Lower the priority of the rule that identifies local addresses first ...
  move_local_rules

  # ... so that the proxy rules can be installed ahead of it, ensuring that
  # packets from the proxy use a separate routing table.
  setup_proxy_rules
fi
# ipip device handling: outside of ipip mode every ipip-related device is
# removed; in ipip mode one device per enabled address family is prepared
# and its ifindex is recorded in node_config.h.
if [ "$MODE" != "ipip" ]; then
  ip link del cilium_ipip4 2> /dev/null || true
  ip link del cilium_ipip6 2> /dev/null || true
  ip link del cilium_sit 2> /dev/null || true
else
  if [ "$IP4_HOST" = "<nil>" ]; then
    ip link del cilium_ipip4 2> /dev/null || true
  else
    ENCAP_DEV="cilium_ipip4"
    if ! ip link show $ENCAP_DEV; then
      # Loading the module creates a non-removable tunl0 device. Rather
      # than ending up with an additional, useless device, tunl0 itself
      # is renamed to the cilium name in a second step; running
      # 'ip link add name $ENCAP_DEV [...]' directly would create two
      # devices.
      ip link add name tunl0 type ipip external || true
      ip link set tunl0 name $ENCAP_DEV
    fi
    setup_dev $ENCAP_DEV || encap_fail

    # Replace any previous ENCAP4_IFINDEX definition with the current one.
    ENCAP_IDX=$(cat "${SYSCLASSNETDIR}/${ENCAP_DEV}/ifindex")
    sed -i '/^#.*ENCAP4_IFINDEX.*$/d' $RUNDIR/globals/node_config.h
    echo "#define ENCAP4_IFINDEX $ENCAP_IDX" >> $RUNDIR/globals/node_config.h
  fi

  if [ "$IP6_HOST" = "<nil>" ]; then
    ip link del cilium_ipip6 2> /dev/null || true
    ip link del cilium_sit 2> /dev/null || true
  else
    ENCAP_DEV="cilium_ipip6"
    if ! ip link show $ENCAP_DEV; then
      # The cilium_ipip4 workaround cannot be reused here: an existing
      # tunl0 can be switched into collect_md mode, but the default
      # ip6tnl0 (if present) cannot. If v6 was built into the kernel we
      # may simply have to live with that device. Its creation can still
      # be avoided via a boot parameter if the sysctl set by the agent
      # does not do it.
      ip link add name $ENCAP_DEV type ip6tnl external || true
      ip link set sit0 name cilium_sit || true
    fi
    setup_dev $ENCAP_DEV || encap_fail

    # Replace any previous ENCAP6_IFINDEX definition with the current one.
    ENCAP_IDX=$(cat "${SYSCLASSNETDIR}/${ENCAP_DEV}/ifindex")
    sed -i '/^#.*ENCAP6_IFINDEX.*$/d' $RUNDIR/globals/node_config.h
    echo "#define ENCAP6_IFINDEX $ENCAP_IDX" >> $RUNDIR/globals/node_config.h
  fi
fi
# Reject unknown tunnel protocols before touching any device.
case "${TUNNEL_PROTOCOL}" in
  "<nil>" | "vxlan" | "geneve")
    ;;
  *)
    (>&2 echo "ERROR: Unknown tunnel mode")
    exit 1
    ;;
esac

# Remove the encapsulation devices of every protocol that is not the
# selected one (both of them when tunneling is "<nil>"); they may be left
# over from a previous run.
for stale_proto in vxlan geneve; do
  if [ "$stale_proto" != "${TUNNEL_PROTOCOL}" ]; then
    ip link del "cilium_${stale_proto}" 2> /dev/null || true
  fi
done
# Create (or reuse) the tunnel device for the selected protocol, make sure
# it uses the configured destination port and MTU, and record its ifindex
# in node_config.h.
if [ "${TUNNEL_PROTOCOL}" != "<nil>" ]; then
  ENCAP_DEV="cilium_${TUNNEL_PROTOCOL}"

  ip link show $ENCAP_DEV || create_encap_dev

  if [ "${TUNNEL_PORT}" != "<nil>" ]; then
    # Recreate the device when it is bound to a different port. -w makes
    # grep match the port as a whole word; without it a configured port of
    # e.g. 847 would also be "found" on a device using dstport 8472 and the
    # wrong port would be kept.
    ip -details link show $ENCAP_DEV | grep -w "dstport $TUNNEL_PORT" || {
      ip link delete name $ENCAP_DEV type $TUNNEL_PROTOCOL
      create_encap_dev
    }
  fi

  ip link set $ENCAP_DEV mtu $MTU || encap_fail
  setup_dev $ENCAP_DEV || encap_fail

  # Replace any previous ENCAP_IFINDEX definition with the current one.
  ENCAP_IDX=$(cat "${SYSCLASSNETDIR}/${ENCAP_DEV}/ifindex")
  sed -i '/^#.*ENCAP_IFINDEX.*$/d' $RUNDIR/globals/node_config.h
  echo "#define ENCAP_IFINDEX $ENCAP_IDX" >> $RUNDIR/globals/node_config.h
fi
# Delete the clsact qdisc from every device recorded in a device.state file
# and remove the file afterwards. Does nothing when the file is missing.
# Arguments: $1 - path of the state file
# Outputs:   one "Removed BPF program ..." line per device on stdout
function remove_device_state()
{
  local -r state_file=$1
  local devs dev

  if [ ! -f "$state_file" ]; then
    return 0
  fi

  devs=$(cat "$state_file")
  # The file is written from NATIVE_DEVS, which the rest of this script
  # splits on ';'. Accept ';' as well as ',' so that every recorded device
  # is processed instead of one bogus "dev1;dev2" name.
  for dev in ${devs//[,;]/ }; do
    echo "Removed BPF program from device $dev"
    tc qdisc del dev $dev clsact 2> /dev/null || true
  done
  rm "$state_file"
}

# With native devices in use (direct routing or NodePort) remember them in
# device.state; otherwise detach from the devices remembered by an earlier
# run.
if [ "$MODE" = "direct" ] || [ "$NODE_PORT" = "true" ]; then
  if [ "$NATIVE_DEVS" == "<nil>" ]; then
    echo "No device specified for $MODE mode, ignoring..."
  else
    if [ "$IP6_HOST" != "<nil>" ]; then
      echo 1 > "${PROCSYSNETDIR}/ipv6/conf/all/forwarding"
    fi
    echo "$NATIVE_DEVS" > "$RUNDIR/device.state"
  fi
else
  remove_device_state "$RUNDIR/device.state"
fi
# Detach bpf_host programs from devices that were used previously: every
# interface whose name does not contain "cilium" and which is not listed in
# NATIVE_DEVS is inspected.
for iface in $(ip -o -a l | awk '{ sub(/[:@].*/, "", $2); print $2 }' | grep -v cilium); do
  # Skip interfaces that are part of the ';'-separated NATIVE_DEVS list.
  is_native=0
  for NATIVE_DEV in ${NATIVE_DEVS//;/ }; do
    [ "${iface}" != "$NATIVE_DEV" ] || { is_native=1; break; }
  done
  if [ "$is_native" -eq 1 ]; then
    continue
  fi

  for direction in ingress egress; do
    # iproute2 names a filter after the object file and section
    # (bpf_overlay.o:[from-overlay]), while filters created by the Go bpf
    # loader carry the bpf function and interface name, e.g.
    # cil_from_netdev-eth0. Only programs known to be attached to
    # 'physical' network devices are detached.
    tc filter show dev "$iface" "$direction" | grep -qE "\b(bpf_host|cil_from_netdev|cil_to_netdev)" || continue

    echo "Removing $direction TC filter from interface $iface"
    tc filter del dev "$iface" "$direction" || true
  done
done
# When the two host devices are distinct, drop the egress filter of the
# second one (best effort, errors are ignored).
[ "$HOST_DEV1" = "$HOST_DEV2" ] || tc filter del dev $HOST_DEV2 "egress" 2> /dev/null || true