From fca59bea770346cf1c1f9b0e00cb48a61b44a8f3 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Sun, 25 Oct 2015 21:00:20 +0100 Subject: import of old now defunct presentation slides svn repo --- 2002/netfilter-failover-ols2002/abstract | 31 ++ 2002/netfilter-failover-ols2002/biography | 22 + .../netfilter-failover-ols2002.mgp | 294 ++++++++++++ .../netfilter-failover-ols2002.tex | 504 +++++++++++++++++++++ 2002/netfilter-failover-ols2002/ols.sty | 56 +++ 5 files changed, 907 insertions(+) create mode 100644 2002/netfilter-failover-ols2002/abstract create mode 100644 2002/netfilter-failover-ols2002/biography create mode 100644 2002/netfilter-failover-ols2002/netfilter-failover-ols2002.mgp create mode 100644 2002/netfilter-failover-ols2002/netfilter-failover-ols2002.tex create mode 100644 2002/netfilter-failover-ols2002/ols.sty (limited to '2002/netfilter-failover-ols2002') diff --git a/2002/netfilter-failover-ols2002/abstract b/2002/netfilter-failover-ols2002/abstract new file mode 100644 index 0000000..9cd4ef3 --- /dev/null +++ b/2002/netfilter-failover-ols2002/abstract @@ -0,0 +1,31 @@ +How to replicate the fire - HA for netfilter based firewalls. + + With traditional, stateless firewalling (such as ipfwadm, ipchains) there is +no need for special HA support in the firewalling subsystem. As long as all +packet filtering rules and routing table entries are configured in exactly the +same way, one can use any available tool for IP-Address takeover to accomplish +the goal of failing over from one node to the other. + + With Linux 2.4.x netfilter/iptables, the Linux firewalling code moves beyond +traditional packet filtering. Netfilter provides a modular connection tracking +susbsystem which can be employed for stateful firewalling. The connection +tracking subsystem gathers information about the state of all current network +flows (connections). Packet filtering decisions and NAT information is +associated with this state information. 
+ + In a high availability scenario, this connection tracking state needs to be +replicated from the currently active firewall node to all standby slave +firewall nodes. Only when all connection tracking state is replicated, the +slave node will have all necessarry state information at the time a failover +event occurs. + + The netfilter/iptables does currently not have any functionality for +replicating connection tracking state accross multiple nodes. However, +the author of this presentation, Harald Welte, has started a project for +connection tracking state replication with netfilter/iptables. + + The presentation will cover the architectural design and implementation +of the connection tracking failover sytem. With respect to the date of +the conference, it is to be expected that the project is still a +work-in-progress at that time. + diff --git a/2002/netfilter-failover-ols2002/biography b/2002/netfilter-failover-ols2002/biography new file mode 100644 index 0000000..27b77bd --- /dev/null +++ b/2002/netfilter-failover-ols2002/biography @@ -0,0 +1,22 @@ + Harald Welte is one +of the five netfilter/iptables core +team members, and the current Linux 2.4.x firewalling maintainer. + + His main interest in computing has always been networking. In the few time +left besides netfilter/iptables related work, he's writing obscure documents +like the UUCP +over SSL HOWTO. Other kernel-related projects he has been contributing are +user mode linux and the international (crypto) kernel patch. + + In the past he has been working as an independent IT Consultant working on +closed-source projecst for various companies ranging from banks to +manufacturers of networking gear. During the year 2001 he was living in +Curitiba (Brazil), where he got sponsored for his Linux related work by +Conectiva Inc.. + + Starting with February 2002, Harald has been contracted part-time by +Astaro AG, who are sponsoring him for his +current netfilter/iptables work. 
+ + Harald is living in Erlangen, Germany. + diff --git a/2002/netfilter-failover-ols2002/netfilter-failover-ols2002.mgp b/2002/netfilter-failover-ols2002/netfilter-failover-ols2002.mgp new file mode 100644 index 0000000..468d974 --- /dev/null +++ b/2002/netfilter-failover-ols2002/netfilter-failover-ols2002.mgp @@ -0,0 +1,294 @@ +%include "default.mgp" +%default 1 bgrad +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%page +%nodefault +%back "blue" + +%center +%size 7 + + +How to replicate the fire +HA for netfilter-based firewalls + + +%center +%size 4 +by + +Harald Welte + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%page +HA for netfillter/iptables +Contents + + + Introduction + Connection Tracking Subsystem + Packet selection based on IP Tables + The Connection Tracking Subsystem + The NAT Subsystem + Poor man's failover + Real state replication + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%page +HA for netfillter/iptables +Introduction + +What is special about firewall failover? + + Nothing, in case of the stateless packet filter + Common IP takeover solutions can be used + VRRP + Hartbeat + + Distribution of packet filtering ruleset no problem + can be done manually + or implemented with simple userspace process + + Problems arise with stateful packet filters + Connection state only on active node + NAT mappings only on active node + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%page +HA for netfillter/iptables +Connection Tracking Subsystem + +Connection tracking... 
+ + implemented seperately from NAT + enables stateful filtering + implementation + hooks into NF_IP_PRE_ROUTING to track packets + hooks into NF_IP_POST_ROUTING and NF_IP_LOCAL_IN to see if packet passed filtering rules + protocol modules (currently TCP/UDP/ICMP) + application helpers currently (FTP,IRC,H.323,talk,SNMP) + divides packets in the following four categories + NEW - would establish new connection + ESTABLISHED - part of already established connection + RELATED - is related to established connection + INVALID - (multicast, errors...) + does _NOT_ filter packets itself + can be utilized by iptables using the 'state' match + is used by NAT Subsystem + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%page +HA for netfillter/iptables +Connection Tracking Subsystem + +Common structures + struct ip_conntrack_tuple, representing unidirectional flow + layer 3 src + dst + layer 4 protocol + layer 4 src + dst + + + connetions represented as struct ip_conntrack + original tuple + reply tuple + timeout + l4 state private data + app helper + app helper private data + expected connections + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%page +HA for netfillter/iptables +Connection Tracking Subsystem + +Flow of events for new packet + packet enters NF_IP_PRE_ROUTING + tuple is derived from packet + lookup conntrack hash table with hash(tuple) -> fails + new ip_conntrack is allocated + fill in original and reply == inverted(original) tuple + initialize timer + assign app helper if applicable + see if we've been expected -> fails + call layer 4 helper 'new' function + + ... 
+ + packet enters NF_IP_POST_ROUTING + do hashtable lookup for packet -> fails + place struct ip_conntrack in hashtable + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%page +HA for netfillter/iptables +Connection Tracking Subsystem + +Flow of events for packet part of existing connection + packet enters NF_IP_PRE_ROUTING + tuple is derived from packet + lookup conntrack hash table with hash(tuple) + assosiate conntrack entry with skb->nfct + call l4 protocol helper 'packet' function + do l4 state tracking + update timeouts as needed [i.e. TCP TIME_WAIT,...] + + ... + + packet enters NF_IP_POST_ROUTING + do hashtable lookup for packet -> succeds + do nothing else + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%page +HA for netfillter/iptables +Network Address Translation + +Overview + Previous Linux Kernels only implemented one special case of NAT: Masquerading + Linux 2.4.x can do any kind of NAT. + NAT subsystem implemented on top of netfilter, iptables and conntrack + NAT subsystem registers with all five netfilter hooks + 'nat' Table registers chains PREROUTING, POSTROUTING and OUTPUT + Following targets available within 'nat' Table + SNAT changes the packet's source whille passing NF_IP_POST_ROUTING + DNAT changes the packet's destination while passing NF_IP_PRE_ROUTING + MASQUERADE is a special case of SNAT + REDIRECT is a special case of DNAT + NAT bindings determined only for NEW packet and saved in ip_conntrack + Further packets within connection NATed according NAT bindings + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%page +HA for netfillter/iptables +Poor man's failover + +Poor man's failover + principle + let every node do it's own tracking rather than replicating state + two possible implementations + connect every node to shared media (i.e. 
real ethernet) + forwarding only turned on on active node + slave nodes use promiscuous mode to sniff packets + copy all traffic to slave nodes + active master needs to copy all traffic to other nodes + disadvantage: high load, sync traffic == payload traffic + IMHO stupid way of solving the problem + advantages + very easy implementation + only addition of sniffing mode to conntrack needed + existing means of address takeover can be used + same load on active master and slave nodes + no additional load on active master + disadvantages + can only be used with real shared media (no switches, ...) + can not be used with NAT + remaining problem + no initial state sync after reboot of slave node! + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%page +HA for netfillter/iptables +Real state replication + +Parts needed + state replication protocol + multicast based + sequence numbers for detection of packet loss + NACK-based retransmission + no security, since private ethernet segment to be used + event interface on active node + calling out to callback function at all state changes + exported interface to manipulate conntrack hash table + kernel thread for sending conntrack state protocol messages + registers with event interface + creates and accumulates state replication packets + sends them via in-kernel sockets api + kernel thread for receiving conntrack state replication messages + receives state replication packets via in-kernel sockets + uses conntrack hashtable manipulation interface + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%page +HA for netfillter/iptables +Real state replication + + Flow of events in chronological order: + on active node, inside the network RX softirq + connection tracking code is analyzing a forwarded packet + connection tracking gathers some new state information + connection tracking updates local connection tracking database + connection tracking sends event message to event 
API + on active node, inside the conntrack-sync kernel thread + conntrack sync daemon receives event through event API + conntrack sync daemon aggregates multiple event messages into a state replication protocol message, removing possible redundancy + conntrack sync daemon generates state replication protocol message + conntrack sync daemon sends state replication protocol message + on slave node(s), inside network RX softirq + connection tracking code ignores packets coming from the interface attached to the private conntrac sync network + state replication protocol messages is appended to socket receive queue of conntrack-sync kernel thread + on slave node(s), inside conntrack-sync kernel thread + conntrack sync daemon receives state replication message + conntrack sync daemon creates/updates conntrack entry + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%page +HA for netfillter/iptables +Neccessary changes to kernel + +Neccessary changes to current conntrack core + + event generation (callback functions) for all state changes + + conntrack hashtable manipulation API + is needed (and already implemented) for 'ctnetlink' API + + conntrack exemptions + needed to _not_ track conntrack state replication packets + is needed for other cases as well + currently being developed by Jozsef Kadlecsik + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%page +HA for netfillter/iptables +Thanks + + Thanks to + the BBS people, Z-Netz, FIDO, ... + for heavily increasing my computer usage in 1992 + + KNF + for bringing me in touch with the internet as early as 1994 + for providing a playground for technical people + for telling me about the existance of Linux! + + Alan Cox, Alexey Kuznetsov, David Miller, Andi Kleen + for implementing (one of?) 
the world's best TCP/IP stacks + + Paul 'Rusty' Russell + for starting the netfilter/iptables project + for trusting me to maintain it today + + Astaro AG + for sponsoring parts of my netfilter work + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%page +HA for netfillter/iptables +Availability of slides / Links + +The slides and the an according paper of this presentation are available at + http://www.gnumonks.org/ + +The netfilter homepage + http://www.netfilter.org/ + diff --git a/2002/netfilter-failover-ols2002/netfilter-failover-ols2002.tex b/2002/netfilter-failover-ols2002/netfilter-failover-ols2002.tex new file mode 100644 index 0000000..bf8d142 --- /dev/null +++ b/2002/netfilter-failover-ols2002/netfilter-failover-ols2002.tex @@ -0,0 +1,504 @@ +\documentclass[twocolumn]{article} +\usepackage{ols} +\begin{document} + +\date{} + +\title{\Large \bf How to replicate the fire - HA for netfilter based firewalls} + +\author{ +Harald Welte\\ +{\em Netfilter Core Team + Astaro AG}\\ +{\normalsize laforge@gnumonks.org/laforge@astaro.com, http://www.gnumonks.org/} +} + +\maketitle + +\thispagestyle{empty} + +\subsection*{Abstract} + With traditional, stateless firewalling (such as ipfwadm, ipchains) there is +no need for special HA support in the firewalling subsystem. As long as all +packet filtering rules and routing table entries are configured in exactly the +same way, one can use any available tool for IP-Address takeover to accomplish +the goal of failing over from one node to the other. + + With Linux 2.4.x netfilter/iptables, the Linux firewalling code moves beyond +traditional packet filtering. Netfilter provides a modular connection tracking +susbsystem which can be employed for stateful firewalling. The connection +tracking subsystem gathers information about the state of all current network +flows (connections). Packet filtering decisions and NAT information is +associated with this state information. 
+ + In a high availability scenario, this connection tracking state needs to be +replicated from the currently active firewall node to all standby slave +firewall nodes. Only when all connection tracking state is replicated, the +slave node will have all necessarry state information at the time a failover +event occurs. + + The netfilter/iptables does currently not have any functionality for +replicating connection tracking state accross multiple nodes. However, +the author of this presentation, Harald Welte, has started a project for +connection tracking state replication with netfilter/iptables. + + The presentation will cover the architectural design and implementation +of the connection tracking failover sytem. With respect to the date of +the conference, it is to be expected that the project is still a +work-in-progress at that time. + +\section{Failover of stateless firewalls} + +There are no special precautions when installing a highly available +stateless packet filter. Since there is no state kept, all information +needed for filtering is the ruleset and the individual, seperate packets. + +Building a set of highly available stateless packet filters can thus be +achieved by using any traditional means of IP-address takeover, such +as Hartbeat or VRRPd. + +The only remaining issue is to make sure the firewalling ruleset is +exactly the same on both machines. This should be ensured by the firewall +administrator every time he updates the ruleset. + +If this is not applicable, because a very dynamic ruleset is employed, one +can build a very easy solution using iptables-supplied tools iptables-save +and iptables-restore. The output of iptables-save can be piped over ssh +to iptables-restore on a different host. 
+ +Limitations +\begin{itemize} +\item +no state tracking +\item +not possible in combination with NAT +\item +no counter consistency of per-rule packet/byte counters +\end{itemize} + +\section{Failover of stateful firewalls} + +Modern firewalls implement state tracking (aka connection tracking) in order +to keep some state about the currently active sessions. The amount of +per-connection state kept at the firewall depends on the particular +implementation. + +As soon as {\bf any} state is kept at the packet filter, this state information +needs to be replicated to the slave/backup nodes within the failover setup. + +In Linux 2.4.x, all relevant state is kept within the {\it connection tracking +subsystem}. In order to understand how this state could possibly be +replicated, we need to understand the architecture of this conntrack subsystem. + +\subsection{Architecture of the Linux Connection Tracking Subsystem} + +Connection tracking within Linux is implemented as a netfilter module, called +ip\_conntrack.o. + +Before describing the connection tracking subsystem, we need to describe a +couple of definitions and primitives used throughout the conntrack code. + +A connection is represented within the conntrack subsystem using {\it struct +ip\_conntrack}, also called {\it connection tracking entry}. + +Connection tracking is utilizing {\it conntrack tuples}, which are tuples +consisting out of (srcip, srcport, dstip, dstport, l4prot). A connection is +uniquely identified by two tuples: The tuple in the original direction +(IP\_CT\_DIR\_ORIGINAL) and the tuple for the reply direction +(IP\_CT\_DIR\_REPLY). + +Connection tracking itself does not drop packets\footnote{well, in some rare +cases in combination with NAT it needs to drop. But don't tell anyone, this is +secret.} or impose any policy. It just associates every packet with a +connection tracking entry, which in turn has a particular state. 
All other +kernel code can use this state information\footnote{state information is +internally represented via the {\it struct sk\_buff.nfct} structure member of a +packet.}. + +\subsubsection{Integration of conntrack with netfilter} + +If the ip\_conntrack.o module is registered with netfilter, it attaches to the +NF\_IP\_PRE\_ROUTING, NF\_IP\_POST\_ROUTING, NF\_IP\_LOCAL\_IN and +NF\_IP\_LOCAL\_OUT hooks. + +Because forwarded packets are the most common case on firewalls, I will only +describe how connection tracking works for forwarded packets. The two relevant +hooks for forwarded packets are NF\_IP\_PRE\_ROUTING and NF\_IP\_POST\_ROUTING. + +Every time a packet arrives at the NF\_IP\_PRE\_ROUTING hook, connection +tracking creates a conntrack tuple from the packet. It then compares this +tuple to the original and reply tuples of all already-seen connections +\footnote{Of course this is not implemented as a linear search over all existing connections.} to find out if this just-arrived packet belongs to any existing +connection. If there is no match, a new conntrack table entry (struct +ip\_conntrack) is created. + +Let's assume the case where we have already existing connections but are +starting from scratch. + +The first packet comes in, we derive the tuple from the packet headers, look up +the conntrack hash table, don't find any matching entry. As a result, we +create a new struct ip\_conntrack. This struct ip\_conntrack is filled with +all necessarry data, like the original and reply tuple of the connection. +How do we know the reply tuple? By inverting the source and destination +parts of the original tuple.\footnote{So why do we need two tuples, if they can +be derived from each other? Wait until we discuss NAT.} +Please note that this new struct ip\_conntrack is {\bf not} yet placed +into the conntrack hash table. + +The packet is now passed on to other callback functions which have registered +with a lower priority at NF\_IP\_PRE\_ROUTING. 
It then continues traversal of +the network stack as usual, including all respective netfilter hooks. + +If the packet survives (i.e. is not dropped by the routing code, network stack, +firewall ruleset, ...), it re-appears at NF\_IP\_POST\_ROUTING. In this case, +we can now safely assume that this packet will be sent off on the outgoing +interface, and thus put the connection tracking entry which we created at +NF\_IP\_PRE\_ROUTING into the conntrack hash table. This process is called +{\it confirming the conntrack}. + +The connection tracking code itself is not monolithic, but consists out of a +couple of seperate modules\footnote{They don't actually have to be seperate +kernel modules; e.g. TCP, UDP and ICMP tracking modules are all part of +the linux kernel module ip\_conntrack.o}. Besides the conntrack core, there +are two important kind of modules: Protocol helpers and application helpers. + +Protocol helpers implement the layer-4-protocol specific parts. They currently +exist for TCP, UDP and ICMP (an experimental helper for GRE exists). + +\subsubsection{TCP connection tracking} + +As TCP is a connection oriented protocol, it is not very difficult to imagine +how conntection tracking for this protocol could work. There are well-defined +state transitions possible, and conntrack can decide which state transitions +are valid within the TCP specification. In reality it's not all that easy, +since we cannot assume that all packets that pass the packet filter actually +arrive at the receiving end, ... + +It is noteworthy that the standard connection tracking code does {\bf not} +do TCP sequence number and window tracking. A well-maintained patch to add +this feature exists almost as long as connection tracking itself. It will +be integrated with the 2.5.x kernel. The problem with window tracking is +it's bad interaction with connection pickup. The TCP conntrack code is able to +pick up already existing connections, e.g. in case your firewall was rebooted. 
+However, connection pickup is conflicting with TCP window tracking: The TCP +window scaling option is only transferred at connection setup time, and we +don't know about it in case of pickup... + +\subsubsection{ICMP tracking} + +ICMP is not really a connection oriented protocol. So how is it possible to +do connection tracking for ICMP? + +The ICMP protocol can be split in two groups of messages + +\begin{itemize} +\item +ICMP error messages, which sort-of belong to a different connection +ICMP error messages are associated {\it RELATED} to a different connection. +(ICMP\_DEST\_UNREACH, ICMP\_SOURCE\_QUENCH, ICMP\_TIME\_EXCEEDED, +ICMP\_PARAMETERPROB, ICMP\_REDIRECT). +\item +ICMP queries, which have a request->reply character. So what the conntrack +code does, is let the request have a state of {\it NEW}, and the reply +{\it ESTABLISHED}. The reply closes the connection immediately. +(ICMP\_ECHO, ICMP\_TIMESTAMP, ICMP\_INFO\_REQUEST, ICMP\_ADDRESS) +\end{itemize} + +\subsubsection{UDP connection tracking} + +UDP is designed as a connectionless datagram protocol. But most common +protocols using UDP as layer 4 protocol have bi-directional UDP communication. +Imagine a DNS query, where the client sends an UDP frame to port 53 of the +nameserver, and the nameserver sends back a DNS reply packet from it's UDP +port 53 to the client. + +Netfilter trats this as a connection. The first packet (the DNS request) is +assigned a state of {\it NEW}, because the packet is expected to create a new +'connection'. The dns servers' reply packet is marked as {\it ESTABLISHED}. + +\subsubsection{conntrack application helpers} + +More complex application protocols involving multiple connections need special +support by a so-called ``conntrack application helper module''. Modules in +the stock kernel come for FTP and IRC(DCC). Netfilter CVS currently contains +patches for PPTP, H.323, Eggdrop botnet, tftp ald talk. We're still lacking +a lot of protocols (e.g. 
SIP, SMB/CIFS) - but they are unlikely to appear +until somebody really needs them and either develops them on his own or +funds development. + +\subsubsection{Integration of connection tracking with iptables} + +As stated earlier, conntrack doesn't impose any policy on packets. It just +determines the relation of a packet to already existing connections. To base +packet filtering decision on this sate information, the iptables {\it state} +match can be used. Every packet is within one of the following categories: + +\begin{itemize} +\item +{\bf NEW}: packet would create a new connection, if it survives +\item +{\bf ESTABLISHED}: packet is part of an already established connection +(either direction) +\item +{\bf RELATED}: packet is in some way related to an already established connection, e.g. ICMP errors or FTP data sessions +\item +{\bf INVALID}: conntrack is unable to derive conntrack information from this packet. Please note that all multicast or broadcast packets fall in this category. +\end{itemize} + + +\subsection{Poor man's conntrack failover} + +When thinking about failover of stateful firewalls, one usually thinks about +replication of state. This presumes that the state is gathered at one +firewalling node (the currently active node), and replicated to several other +passive standby nodes. There is, howeve, a very different approach to +replication: concurrent state tracking on all firewalling nodes. + +The basic assumption of this approach is: In a setup where all firewalling +nodes receive exactly the same traffic, all nodes will deduct the same state +information. + +The implementability of this approach is totally dependent on fulfillment of +this assumption. + +\begin{itemize} +\item +{\it All packets need to be seen by all nodes}. This is not always true, but +can be achieved by using shared media like traditional ethernet (no switches!!) +and promiscuous mode on all ethernet interfaces. 
+\item +{\it All nodes need to be able to process all packets}. This cannot be +universally guaranteed. Even if the hardware (CPU, RAM, Chipset, NIC's) and +software (Linux kernel) are exactly the same, they might behave different, +especially under high load. To avoid those effects, the hardware should be +able to deal with way more traffic than seen during operation. Also, there +should be no userspace processes (like proxes, etc.) running on the firewalling +nodes at all. WARNING: Nobody guarantees this behaviour. However, the poor +man is usually not interested in scientific proof but in usability in his +particular practical setup. +\end{itemize} + +However, even if those conditions are fulfilled, ther are remaining issues: +\begin{itemize} +\item +{\it No resynchronization after reboot}. If a node is rebooted (because of +a hardware fault, software bug, software update, ..) it will loose all state +information until the event of the reboot. This means, the state information +of this node after reboot will not contain any old state, gathered before the +reboot. The effect depend on the traffic. Generally, it is only assured that +state information about all connections initiated after the reboot will be +present. If there are short-lived connections (like http), the state +information on the just rebooted node will approximate the state information of +an older node. Only after all sessions active at the time of reboot have +terminated, state information is guaranteed to be resynchronized. +\item +{\it Only possible with shared medium}. The practical implication is that no +switched ethernet (and thus no full duplex) can be used. +\end{itemize} + +The major advantage of the poor man's approach is implementation simplicity. +No state transfer mechanism needs to be developed. Only very little changes +to the existing conntrack code would be needed in order to be able to +do tracking based on packets received from promiscuous interfaces. 
The active +node would have packet forwarding turned on, the passive nodes off. + +I'm not proposing this as a real solution to the failover problem. It's +hackish, buggy and likely to break very easily. But considering it can be +implemented in very little programming time, it could be an option for very +small installations with low reliability criteria. + +\subsection{Conntrack state replication} + +The preferred solution to the failover problem is, without any doubt, +replication of the connection tracking state. + +The proposed conntrack state replication soltution consists out of several +parts: +\begin{itemize} +\item +A connection tracking state replication protocol +\item +An event interface generating event messages as soon as state information +changes on the active node +\item +An interface for explicit generation of connection tracking table entries on +the standby slaves +\item +Some code (preferrably a kernel thread) running on the active node, receiving +state updates by the event interface and generating conntrack state replication +protocol messages +\item +Some code (preferrably a kernel thread) running on the slave node(s), receiving +conntrack state replication protocol messages and updating the local conntrack +table accordingly +\end{itemize} + +Flow of events in chronological order: +\begin{itemize} +\item +{\it on active node, inside the network RX softirq} +\begin{itemize} +\item + connection tracking code is analyzing a forwarded packet +\item + connection tracking gathers some new state information +\item + connection tracking updates local connection tracking database +\item + connection tracking sends event message to event API +\end{itemize} +\item +{\it on active node, inside the conntrack-sync kernel thread} + \begin{itemize} + \item + conntrack sync daemon receives event through event API + \item + conntrack sync daemon aggregates multiple event messages into a state replication protocol message, removing possible redundancy + 
\item + conntrack sync daemon generates state replication protocol message + \item + conntrack sync daemon sends state replication protocol message +(private network between firewall nodes) + \end{itemize} +\item +{\it on slave node(s), inside network RX softirq} + \begin{itemize} + \item + connection tracking code ignores packets coming from the interface attached to the private conntrac sync network + \item + state replication protocol messages is appended to socket receive queue of conntrack-sync kernel thread + \end{itemize} +\item +{\it on slave node(s), inside conntrack-sync kernel thread} + \begin{itemize} + \item + conntrack sync daemon receives state replication message + \item + conntrack sync daemon creates/updates conntrack entry + \end{itemize} +\end{itemize} + + +\subsubsection{Connection tracking state replication protocol} + + + In order to be able to replicate the state between two or more firewalls, a +state replication protocol is needed. This protocol is used over a private +network segment shared by all nodes for state replication. It is designed to +work over IP unicast and IP multicast transport. IP unicast will be used for +direct point-to-point communication between one active firewall and one +standby firewall. IP multicast will be used when the state needs to be +replicated to more than one standby firewall. + + + The principle design criteria of this protocol are: +\begin{itemize} +\item + {\bf reliable against data loss}, as the underlying UDP layer does only + provide checksumming against data corruption, but doesn't employ any + means against data loss +\item + {\bf lightweight}, since generating the state update messages is + already a very expensive process for the sender, eating additional CPU, + memory and IO bandwith. +\item + {\bf easy to parse}, to minimize overhead at the receiver(s) +\end{itemize} + +The protocol does not employ any security mechanism like encryption, +authentication or reliability against spoofing attacks. 
It is
+assumed that the private conntrack sync network is a secure communications
+channel, not accessible to any malicious 3rd party.
+
+To achieve the reliability against data loss, an easy sequence numbering
+scheme is used. All protocol messages are prefixed by a sequence number,
+determined by the sender. If the slave detects packet loss by discontinuous
+sequence numbers, it can request the retransmission of the missing packets
+by stating the missing sequence number(s). Since there is no acknowledgement
+for successfully received packets, the sender has to keep a reasonably-sized
+backlog of recently-sent packets in order to be able to fulfill retransmission
+requests.
+
+The different state replication protocol message types are:
+\begin{itemize}
+\item
+{\bf NF\_CTSRP\_NEW}: New conntrack entry has been created (and
+confirmed\footnote{See the above description of the conntrack code for what is
+meant by {\it confirming} a conntrack entry})
+\item
+{\bf NF\_CTSRP\_UPDATE}: State information of existing conntrack entry has
+changed
+\item
+{\bf NF\_CTSRP\_EXPIRE}: Existing conntrack entry has been expired
+\end{itemize}
+
+To uniquely identify (and later reference) a conntrack entry, a
+{\it conntrack\_id} is assigned to every conntrack entry transferred
+using a NF\_CTSRP\_NEW message. This conntrack\_id must be saved at the
+receiver(s) together with the conntrack entry, since it is used by the sender
+for subsequent NF\_CTSRP\_UPDATE and NF\_CTSRP\_EXPIRE messages.
+
+The protocol itself does not care about the source of this conntrack\_id,
+but since the current netfilter connection tracking implementation never
+changes the address of a conntrack entry, the memory address of the entry can be
+used, since it comes for free.
+
+
+\subsubsection{Connection tracking state synchronization sender}
+
+Maximum care needs to be taken for the implementation of the ctsyncd sender.
+
+The normal workload of the active firewall node is likely to be already very
+high, so generating and sending the conntrack state replication messages needs
+to be highly efficient.
+
+\begin{itemize}
+\item
+	{\bf NF\_CTSRP\_NEW} will be generated at the NF\_IP\_POST\_ROUTING
+	hook, at the time ip\_conntrack\_confirm() is called. Delaying
+	this message until conntrack confirmation happens saves us from
+	replicating otherwise unneeded state information.
+\item
+	{\bf NF\_CTSRP\_UPDATE} needs to be created automagically by the
+	conntrack core. It is not possible to have any failover-specific
+	code within conntrack protocol and/or application helpers.
+	The easiest way involving the least changes to the conntrack core
+	code is to copy parts of the conntrack entry before calling any
+	helper functions, and then use memcmp() to find out if the helper
+	has changed any information.
+\item
+	{\bf NF\_CTSRP\_EXPIRE} can be added very easily to the existing
+	conntrack destroy function.
+\end{itemize}
+
+
+\subsubsection{Connection tracking state synchronization receiver}
+
+Implementation of the receiver is very straightforward.
+
+Apart from dealing with lost CTSRP packets, it just needs to call the
+respective conntrack add/modify/delete functions offered by the core.
+
+
+\subsubsection{Necessary changes within netfilter conntrack core}
+
+To be able to implement the described conntrack state replication mechanism,
+the following changes to the conntrack core are needed:
+\begin{itemize}
+\item
+	Ability to exclude certain packets from being tracked. This is a
+	long-wanted feature on the TODO list of the netfilter project and will
+	be implemented by having a ``prestate'' table in combination with a
+	``NOTRACK'' target.
+\item
+	Ability to register callback functions to be called every time a new
+	conntrack entry is created or an existing entry modified.
+\item
+	Export an API to externally add, modify and remove conntrack
+	entries.
Since the needed ip\_conntrack\_lock is exported,
+	implementation could even reside outside the conntrack core code.
+\end{itemize}
+
+Since the number of changes is very low, it is very likely that the
+modifications will go into the mainstream kernel without any big hassle.
+
+\end{document}
diff --git a/2002/netfilter-failover-ols2002/ols.sty b/2002/netfilter-failover-ols2002/ols.sty
new file mode 100644
index 0000000..5e5fe49
--- /dev/null
+++ b/2002/netfilter-failover-ols2002/ols.sty
@@ -0,0 +1,56 @@
+
+% TEMPLATE for Usenix papers, specifically to meet requirements of
+% TCL97 committee.
+% originally a template for producing IEEE-format articles using LaTeX.
+% written by Matthew Ward, CS Department, Worcester Polytechnic Institute.
+% adapted by David Beazley for his excellent SWIG paper in Proceedings,
+% Tcl 96
+% turned into a smartass generic template by De Clarke, with thanks to
+% both the above pioneers
+% use at your own risk. Complaints to /dev/null.
+% make it two column with no page numbering, default is 10 point

+% adapted for Ottawa Linux Symposium

+% include following in document.
+%\documentclass[twocolumn]{article} +%\usepackage{usits,epsfig} +\pagestyle{empty} + +%set dimensions of columns, gap between columns, and space between paragraphs +%\setlength{\textheight}{8.75in} +\setlength{\textheight}{9.0in} +\setlength{\columnsep}{0.25in} +\setlength{\textwidth}{6.45in} +\setlength{\footskip}{0.0in} +\setlength{\topmargin}{0.0in} +\setlength{\headheight}{0.0in} +\setlength{\headsep}{0.0in} +\setlength{\oddsidemargin}{0in} +%\setlength{\oddsidemargin}{-.065in} +%\setlength{\oddsidemargin}{-.17in} +\setlength{\parindent}{0pc} +\setlength{\parskip}{\baselineskip} + +% started out with art10.sty and modified params to conform to IEEE format +% further mods to conform to Usenix standard + +\makeatletter +%as Latex considers descenders in its calculation of interline spacing, +%to get 12 point spacing for normalsize text, must set it to 10 points +\def\@normalsize{\@setsize\normalsize{12pt}\xpt\@xpt +\abovedisplayskip 10pt plus2pt minus5pt\belowdisplayskip \abovedisplayskip +\abovedisplayshortskip \z@ plus3pt\belowdisplayshortskip 6pt plus3pt +minus3pt\let\@listi\@listI} + +%need a 12 pt font size for subsection and abstract headings +\def\subsize{\@setsize\subsize{12pt}\xipt\@xipt} + +%make section titles bold and 12 point, 2 blank lines before, 1 after +\def\section{\@startsection {section}{1}{\z@}{24pt plus 2pt minus 2pt} +{12pt plus 2pt minus 2pt}{\large\bf}} + +%make subsection titles bold and 11 point, 1 blank line before, 1 after +\def\subsection{\@startsection {subsection}{2}{\z@}{12pt plus 2pt minus 2pt} +{12pt plus 2pt minus 2pt}{\subsize\bf}} +\makeatother -- cgit v1.2.3