/* IPsec IKE Dead Peer Detection code. * Copyright (C) 2003 Ken Bantoft * Copyright (C) 2003-2006 Michael Richardson * Copyright (C) 2008-2010 Paul Wouters * Copyright (C) 2010 FURUSO Shinichi * Copyright (C) 2012 Avesh Agarwal * Copyright (C) 2012 Andrey Alexandrenko * Copyright (C) 2012 Paul Wouters * Copyright (C) 2013 Paul Wouters * Copyright (C) 2013 Matt Rogers * Copyright (C) 2013 D. Hugh Redelmeier * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. See . * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. */ #include #include #include #include #include #include #include #include #include #include /* for gettimeofday */ #include #include "sysdep.h" #include "constants.h" #include "defs.h" #include "state.h" #include "id.h" #include "x509.h" #include "certs.h" #include "connections.h" /* needs id.h */ #include "keys.h" #include "packet.h" #include "demux.h" /* needs packet.h */ #include "adns.h" /* needs */ #include "dnskey.h" /* needs keys.h and adns.h */ #include "kernel.h" /* needs connections.h */ #include "log.h" #include "cookie.h" #include "server.h" #include "spdb.h" #include "timer.h" #include "rnd.h" #include "ipsec_doi.h" /* needs demux.h and state.h */ #include "whack.h" #include "pending.h" /* for flush_pending_by_connection */ #include "ikev1_dpd.h" #include "x509more.h" /** * Initialize RFC 3706 Dead Peer Detection * * @param st An initialized state structure * @return void * * How DPD works. * * There are two kinds of events that can be scheduled. * At most one of them is schedule at any given time. 
*
 * The EVENT_DPD_TIMEOUT event, if it ever goes off, means that
 * neither the ISAKMP SA nor the IPsec SA has *RECEIVED* any DPD
 * events lately.
 *
 * 0) So, every time we receive a DPD (R_U_THERE or R_U_THERE_ACK), then
 *    we delete any DPD event (EVENT_DPD or EVENT_DPD_TIMEOUT), and
 *    we schedule a new EVENT_DPD (sending) for "delay" in the future.
 *
 * 1) When the EVENT_DPD goes off, we check the phase 2 (if there is one)
 *    SA to see if there was incoming traffic. If there was, then we are happy,
 *    we set a new EVENT_DPD, and we are done.
 *
 * 2) If there was no phase 2 activity, we check if there was a recent enough
 *    DPD activity (st->st_last_dpd). If so, we just reschedule, and do
 *    nothing.
 *
 * 3) Otherwise, we send a DPD R_U_THERE message, and set the
 *    EVENT_DPD_TIMEOUT on the phase 1.
 *
 * One thing to realize when looking at "ipsec whack --listevents" output,
 * is that there will only be EVENT_DPD_TIMEOUT events if there are
 * outstanding R_U_THERE messages.
 *
 * The above is the basic idea, but things are a bit more complicated because
 * multiple phase 2s can share the same phase 1 ISAKMP SA. Each phase 2 state
 * has its own EVENT_DPD. Further, we start an EVENT_DPD for phase 1 when it
 * gets established. This is because the phase 2 may never actually succeed
 * (usually due to authorization issues, which may be DNS or otherwise related)
 * and if the responding end dies (gets restarted, or the conn gets reloaded
 * with the right policy), then we may have a bum phase 1 SA, and we cannot
 * re-negotiate. (This happens WAY too often)
 *
 * The phase 2 dpd_init() will attempt to kill the phase 1 EVENT_DPD, if it
 * can, to reduce the amount of work.
 *
 * The st_last_dpd member which is used is always the one from the phase 1.
 * So, if there are multiple phase 2s, then if any of them receive DPD data
 * they will update the st_last_dpd, so the test in #2 will avoid the traffic
 * for all but one phase 2.
* * Note that the EVENT_DPD are attached to phase 2s (typically), while the * EVENT_DPD_TIMEOUT are attached to phase 1s only. * * Finally, if the connection is using NAT-T, then we ignore the phase 2 * activity check, because in the case of a unidirectional stream (VoIP for * a conference call, for instance), we may not send enough traffic to keep * the NAT port mapping valid. * */ stf_status dpd_init(struct state *st) { /** * Used to store the 1st state */ #ifdef HAVE_LABELED_IPSEC if (st->st_connection->loopback) { libreswan_log( "dpd is not required for ipsec connections over loopback"); return STF_OK; } #endif struct state *p1st; /* find the related Phase 1 state */ p1st = find_state_ikev1(st->st_icookie, st->st_rcookie, 0); if (p1st == NULL) { loglog(RC_LOG_SERIOUS, "could not find phase 1 state for DPD"); /* * if the phase 1 state has gone away, it really should have * deleted all of its children. * Why would this happen? because a quick mode SA can take * some time to create (DNS lookups for instance), and the phase 1 * might have been taken down for some reason in the meantime. * We really cannot do anything here --- attempting to invoke * the DPD action would be a good idea, but we really should * do that outside this function. 
*/ return STF_FAIL; } /* if it was enabled, and we haven't turned it on already */ if (p1st->hidden_variables.st_peer_supports_dpd) { libreswan_log("Dead Peer Detection (RFC 3706): enabled"); if (st->st_dpd_event == NULL || monobefore(monotimesum(mononow(), st->st_connection->dpd_delay), st->st_dpd_event->ev_time)) { if (st->st_dpd_event != NULL) delete_dpd_event(st); event_schedule(EVENT_DPD, deltasecs(st->st_connection->dpd_delay), st); } } else { libreswan_log( "Dead Peer Detection (RFC 3706): not enabled because peer did not advertise it"); } if (p1st != st) { /* st was not a phase 1 SA, so kill the DPD_EVENT on the phase 1 */ if (p1st->st_dpd_event != NULL && p1st->st_dpd_event->ev_type == EVENT_DPD) delete_dpd_event(p1st); } return STF_OK; } /* * Only schedule a new timeout if there isn't one currently, * or if it would be sooner than the current timeout. */ static void dpd_sched_timeout(struct state *p1st, monotime_t nw, deltatime_t timeout) { passert(deltasecs(timeout) > 0); if (p1st->st_dpd_event == NULL || monobefore(monotimesum(nw, timeout), p1st->st_dpd_event->ev_time)) { DBG(DBG_DPD, DBG_log("DPD: scheduling timeout to %ld", (long)deltasecs(timeout))); if (p1st->st_dpd_event != NULL) delete_dpd_event(p1st); event_schedule(EVENT_DPD_TIMEOUT, deltasecs(timeout), p1st); } } /** * DPD Out Initiator * * @param p2st A state struct that is already in phase2 * @return void */ static void dpd_outI(struct state *p1st, struct state *st, bool eroute_care, deltatime_t delay, deltatime_t timeout) { monotime_t nw; monotime_t last; deltatime_t nextdelay; u_int32_t seqno; DBG(DBG_DPD, DBG_log("DPD: processing for state #%lu (\"%s\")", st->st_serialno, st->st_connection->name)); /* If no DPD, then get out of here */ if (!st->hidden_variables.st_peer_supports_dpd) { DBG(DBG_DPD, DBG_log("DPD: peer does not support dpd")); return; } /* If there is no state, there can be no DPD */ if (!IS_ISAKMP_SA_ESTABLISHED(p1st->st_state)) { DBG(DBG_DPD, DBG_log("DPD: no phase1 state, 
so no DPD")); return; } /* find out when now is */ nw = mononow(); /* * pick least recent activity value, since with multiple phase 2s, * it may well be that one phase 2 is very active, while the other * for some reason, gets stomped upon by some network screw up. * * (this would only happen if the network was sensitive to different * SPI#, since for NAT-T, all traffic should be on the same UDP port. * At worst, this means that we send a bit more traffic then we need * to when there are multiple SAs and one is much less active. * * ??? the code actually picks the most recent. So much for comments. */ last = !monobefore(p1st->st_last_dpd, st->st_last_dpd) ? p1st->st_last_dpd : st->st_last_dpd; nextdelay = monotimediff(monotimesum(last, delay), nw); /* has there been enough activity of late? */ if (deltasecs(nextdelay) > 0) { /* Yes, just reschedule "phase 2" */ DBG(DBG_DPD, DBG_log("DPD: not yet time for dpd event: %ld < %ld", (long)nw.mono_secs, (long)(last.mono_secs + deltasecs(delay)))); event_schedule(EVENT_DPD, deltasecs(nextdelay), st); return; } /* now plan next check time */ /* ??? this test is nuts: it will always succeed! */ if (deltasecs(nextdelay) < 1) nextdelay = delay; /* * check the phase 2, if we are supposed to, * and return if it is active recently */ if (eroute_care && st->hidden_variables.st_nat_traversal == LEMPTY && !was_eroute_idle(st, delay)) { DBG(DBG_DPD, DBG_log("DPD: out event not sent, phase 2 active")); /* update phase 2 time stamp only */ st->st_last_dpd = nw; /* * Since there was activity, kill any EVENT_DPD_TIMEOUT that might * be waiting. This can happen when a R_U_THERE_ACK is lost, and * subsequently traffic started flowing over the SA again, and no * more DPD packets are sent to cancel the outstanding DPD timer. 
*/ if (p1st->st_dpd_event != NULL && p1st->st_dpd_event->ev_type == EVENT_DPD_TIMEOUT) { DBG(DBG_DPD, DBG_log("DPD: deleting p1st DPD event")); delete_dpd_event(p1st); } event_schedule(EVENT_DPD, deltasecs(nextdelay), st); return; } if (st != p1st) { /* * reschedule next event, since we cannot do it from the activity * routine. */ event_schedule(EVENT_DPD, deltasecs(nextdelay), st); } if (p1st->st_dpd_seqno == 0) { /* Get a non-zero random value that has room to grow */ get_rnd_bytes((u_char *)&p1st->st_dpd_seqno, sizeof(p1st->st_dpd_seqno)); p1st->st_dpd_seqno &= 0x7fff; p1st->st_dpd_seqno++; } seqno = htonl(p1st->st_dpd_seqno); /* make sure that the timeout occurs. We do this before the send, * because the send may fail due to network issues, etc, and * the timeout has to occur anyway */ dpd_sched_timeout(p1st, nw, timeout); DBG(DBG_DPD, { ipstr_buf b; DBG_log("DPD: sending R_U_THERE %u to %s:%d (state #%lu)", p1st->st_dpd_seqno, ipstr(&p1st->st_remoteaddr, &b), p1st->st_remoteport, p1st->st_serialno); }); if (send_isakmp_notification(p1st, R_U_THERE, &seqno, sizeof(seqno)) != STF_IGNORE) { loglog(RC_LOG_SERIOUS, "DPD: could not send R_U_THERE"); return; } st->st_last_dpd = nw; p1st->st_last_dpd = nw; p1st->st_dpd_expectseqno = p1st->st_dpd_seqno++; } static void p1_dpd_outI1(struct state *p1st) { deltatime_t delay = p1st->st_connection->dpd_delay; deltatime_t timeout = p1st->st_connection->dpd_timeout; dpd_outI(p1st, p1st, TRUE, delay, timeout); } static void p2_dpd_outI1(struct state *p2st) { struct state *st; deltatime_t delay = p2st->st_connection->dpd_delay; deltatime_t timeout = p2st->st_connection->dpd_timeout; /* find the related Phase 1 state */ st = find_phase1_state(p2st->st_connection, ISAKMP_SA_ESTABLISHED_STATES); if (st == NULL) { loglog(RC_LOG_SERIOUS, "DPD: could not find newest phase 1 state"); return; } dpd_outI(st, p2st, TRUE, delay, timeout); } void dpd_event(struct state *st) { passert(st != NULL); if (IS_PHASE1(st->st_state) || 
IS_PHASE15(st->st_state )) p1_dpd_outI1(st); else p2_dpd_outI1(st); } /** * DPD in Initiator, out Responder * * @param st A state structure (the phase 1 state) * @param n A notification (isakmp_notification) * @param pbs A PB Stream * @return stf_status */ stf_status dpd_inI_outR(struct state *p1st, struct isakmp_notification *const n, pb_stream *pbs) { monotime_t nw = mononow(); u_int32_t seqno; if (!IS_ISAKMP_SA_ESTABLISHED(p1st->st_state)) { loglog(RC_LOG_SERIOUS, "DPD: received R_U_THERE for unestablished ISKAMP SA"); return STF_IGNORE; } if (n->isan_spisize != COOKIE_SIZE * 2 || pbs_left(pbs) < COOKIE_SIZE * 2) { loglog(RC_LOG_SERIOUS, "DPD: R_U_THERE has invalid SPI length (%d)", n->isan_spisize); return STF_FAIL + PAYLOAD_MALFORMED; } if (!memeq(pbs->cur, p1st->st_icookie, COOKIE_SIZE)) { /* RFC states we *SHOULD* check cookies, not MUST. So invalid cookies are technically valid, as per Geoffrey Huang */ DBG(DBG_DPD, DBG_log("DPD: R_U_THERE has invalid icookie (tolerated)")); } pbs->cur += COOKIE_SIZE; if (!memeq(pbs->cur, p1st->st_rcookie, COOKIE_SIZE)) { DBG(DBG_DPD, DBG_log("DPD: R_U_THERE has invalid rcookie (tolerated)")); } pbs->cur += COOKIE_SIZE; if (pbs_left(pbs) != sizeof(seqno)) { loglog(RC_LOG_SERIOUS, "DPD: R_U_THERE has invalid data length (%d)", (int) pbs_left( pbs)); return STF_FAIL + PAYLOAD_MALFORMED; } seqno = ntohl(*(u_int32_t *)pbs->cur); if (p1st->st_dpd_peerseqno && seqno <= p1st->st_dpd_peerseqno) { loglog(RC_LOG_SERIOUS, "DPD: received old or duplicate R_U_THERE"); if (p1st->st_dpd_rdupcount >= DPD_RETRANS_MAX) { loglog(RC_LOG_SERIOUS, "DPD: received %d or more duplicate R_U_THERE's - will no longer answer", DPD_RETRANS_MAX); return STF_IGNORE; } else { /* * Needed to work around openbsd bug (isakmpd/dpd.c * around line 350) where they forget to increase * isakmp_sa->dpd_seq on unanswered DPD probe violating * RFC 3706 Section 7 "Security Considerations" */ loglog(RC_LOG_SERIOUS, "DPD: received less than %d duplicate R_U_THERE's - 
will reluctantly answer", DPD_RETRANS_MAX); p1st->st_dpd_rdupcount++; } } else { p1st->st_dpd_rdupcount = 0; } DBG(DBG_DPD, DBG_log("DPD: received R_U_THERE seq:%u monotime:%ld (state=#%lu name=\"%s\")", seqno, (long)nw.mono_secs, p1st->st_serialno, p1st->st_connection->name)); p1st->st_dpd_peerseqno = seqno; if (send_isakmp_notification(p1st, R_U_THERE_ACK, pbs->cur, pbs_left(pbs)) != STF_IGNORE) { loglog(RC_LOG_SERIOUS, "DPD: could not send R_U_THERE_ACK"); return STF_IGNORE; } /* update the time stamp */ p1st->st_last_dpd = nw; /* * since there was activity, kill any EVENT_DPD_TIMEOUT that might * be waiting. */ if (p1st->st_dpd_event != NULL && p1st->st_dpd_event->ev_type == EVENT_DPD_TIMEOUT) delete_dpd_event(p1st); return STF_IGNORE; } /** * DPD out Responder * * @param st A state structure (phase 1) * @param n A notification (isakmp_notification) * @param pbs A PB Stream * @return stf_status */ stf_status dpd_inR(struct state *p1st, struct isakmp_notification *const n, pb_stream *pbs) { u_int32_t seqno; if (!IS_ISAKMP_SA_ESTABLISHED(p1st->st_state)) { loglog(RC_LOG_SERIOUS, "DPD: recevied R_U_THERE_ACK for unestablished ISKAMP SA"); return STF_FAIL; } if (n->isan_spisize != COOKIE_SIZE * 2 || pbs_left(pbs) < COOKIE_SIZE * 2) { loglog(RC_LOG_SERIOUS, "DPD: R_U_THERE_ACK has invalid SPI length (%d)", n->isan_spisize); return STF_FAIL + PAYLOAD_MALFORMED; } if (!memeq(pbs->cur, p1st->st_icookie, COOKIE_SIZE)) { /* RFC states we *SHOULD* check cookies, not MUST. So invalid cookies are technically valid, as per Geoffrey Huang */ DBG(DBG_DPD, DBG_log("DPD: R_U_THERE_ACK has invalid icookie")); } pbs->cur += COOKIE_SIZE; if (!memeq(pbs->cur, p1st->st_rcookie, COOKIE_SIZE)) { /* RFC states we *SHOULD* check cookies, not MUST. 
So invalid cookies are technically valid, as per Geoffrey Huang */ DBG(DBG_DPD, DBG_log("DPD: R_U_THERE_ACK has invalid rcookie")); } pbs->cur += COOKIE_SIZE; if (pbs_left(pbs) != sizeof(seqno)) { loglog(RC_LOG_SERIOUS, "DPD: R_U_THERE_ACK has invalid data length (%d)", (int) pbs_left( pbs)); return STF_FAIL + PAYLOAD_MALFORMED; } seqno = ntohl(*(u_int32_t *)pbs->cur); DBG(DBG_DPD, DBG_log("DPD: R_U_THERE_ACK, seqno received: %u expected: %u (state=#%lu)", seqno, p1st->st_dpd_expectseqno, p1st->st_serialno)); if (seqno == p1st->st_dpd_expectseqno) { /* update the time stamp */ p1st->st_last_dpd = mononow(); p1st->st_dpd_expectseqno = 0; } else if (!p1st->st_dpd_expectseqno) { loglog(RC_LOG_SERIOUS, "DPD: unexpected R_U_THERE_ACK packet with sequence number %u", seqno); /* do not update time stamp, so we'll send a new one sooner */ } /* * since there was activity, kill any EVENT_DPD_TIMEOUT that might * be waiting. */ if (p1st->st_dpd_event != NULL && p1st->st_dpd_event->ev_type == EVENT_DPD_TIMEOUT) delete_dpd_event(p1st); return STF_IGNORE; } /** * DPD Timeout Function * * This function is called when a timeout DPD_EVENT occurs. We set clear/trap * both the SA and the eroutes, depending on what the connection definition * tells us (either 'hold' or 'clear') * * @param st A state structure that is fully negotiated * @return void */ void dpd_timeout(struct state *st) { struct connection *c = st->st_connection; enum dpd_action action = c->dpd_action; /** delete the state, which is probably in phase 2 */ set_cur_connection(c); libreswan_log("DPD: No response from peer - declaring peer dead"); switch (action) { case DPD_ACTION_HOLD: /** dpdaction=hold - Wipe the SA's but %trap the eroute so we don't leak traffic. Also, being in %trap means new packets will force an initiation of the conn again. 
*/ libreswan_log("DPD: Putting connection into %%trap"); if (c->kind == CK_INSTANCE) { DBG(DBG_DPD, DBG_log("DPD: warning dpdaction=hold on instance futile - will be deleted")); } delete_states_by_connection(c, TRUE); break; case DPD_ACTION_CLEAR: /** dpdaction=clear - Wipe the SA & eroute - everything */ libreswan_log("DPD: Clearing Connection"); /* * For CK_INSTANCE, delete_states_by_connection() will clear * Note that delete_states_by_connection changes c->kind but we need * to remember what it was to know if we still need to unroute after delete */ if (c->kind == CK_INSTANCE) { delete_states_by_connection(c, TRUE); } else { flush_pending_by_connection(c); /* remove any partial negotiations that are failing */ delete_states_by_connection(c, TRUE); DBG(DBG_DPD, DBG_log("DPD: unrouting connection (%s)", enum_name(&connection_kind_names, c->kind))); unroute_connection(c); /* --unroute */ } break; case DPD_ACTION_RESTART: /* dpdaction=restart - immediately renegotiate connections to the same peer. */ libreswan_log( "DPD: Restarting all connections that share this peer"); restart_connections_by_peer(c); break; default: bad_case(action); } reset_cur_connection(); }