/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.80 2000/10/03 07:29:01 anton Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <net/tcp.h>

int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
int sysctl_tcp_orphan_retries;

static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer (unsigned long data);

const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";

/*
 * Using different timers for retransmit, delayed acks and probes.
 * We may wish to use just one timer maintaining a list of expire jiffies
 * to optimize.
 */

void tcp_init_xmit_timers(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	init_timer(&tp->retransmit_timer);
	tp->retransmit_timer.function = &tcp_write_timer;
	tp->retransmit_timer.data = (unsigned long) sk;
	tp->pending = 0;

	init_timer(&tp->delack_timer);
	tp->delack_timer.function = &tcp_delack_timer;
	tp->delack_timer.data = (unsigned long) sk;
	tp->ack.pending = 0;

	init_timer(&sk->timer);
	sk->timer.function = &tcp_keepalive_timer;
	sk->timer.data = (unsigned long) sk;
}

void tcp_clear_xmit_timers(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	tp->pending = 0;
	if (timer_pending(&tp->retransmit_timer) &&
	    del_timer(&tp->retransmit_timer))
		__sock_put(sk);

	tp->ack.pending = 0;
	tp->ack.blocked = 0;
	if (timer_pending(&tp->delack_timer) &&
	    del_timer(&tp->delack_timer))
		__sock_put(sk);

	if (timer_pending(&sk->timer) && del_timer(&sk->timer))
		__sock_put(sk);
}
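/* A note on the pattern above, used throughout this file: the socket holds
 * one extra reference for each pending timer.  The reference is taken with
 * sock_hold() when mod_timer() reports the timer was not already pending,
 * and dropped with __sock_put() only when del_timer() reports it actually
 * stopped a pending timer.  The guarded userspace sketch below (not kernel
 * code; fake_sock, fake_mod_timer and fake_del_timer are made-up names)
 * models that discipline with a plain counter.
 */
#if 0	/* illustrative sketch only, not compiled */
#include <assert.h>
#include <stdio.h>

struct fake_sock {
	int refcnt;		/* object lifetime, like sk->refcnt */
	int timer_pending;	/* like timer_pending(&sk->timer) */
};

/* Models mod_timer(): returns nonzero if the timer was already pending,
 * in which case the timer's reference is already held. */
static int fake_mod_timer(struct fake_sock *sk)
{
	int was_pending = sk->timer_pending;
	sk->timer_pending = 1;
	return was_pending;
}

/* Models del_timer(): returns nonzero only if a pending timer was stopped. */
static int fake_del_timer(struct fake_sock *sk)
{
	int was_pending = sk->timer_pending;
	sk->timer_pending = 0;
	return was_pending;
}

int main(void)
{
	struct fake_sock sk = { 1, 0 };

	/* Arming: hold a reference unless the timer already owned one. */
	if (!fake_mod_timer(&sk))
		sk.refcnt++;			/* sock_hold() */

	/* Clearing: drop the reference only if a pending timer was really
	 * stopped, exactly as tcp_clear_xmit_timers() does above. */
	if (sk.timer_pending && fake_del_timer(&sk))
		sk.refcnt--;			/* __sock_put() */

	assert(sk.refcnt == 1 && !sk.timer_pending);
	printf("refcnt=%d pending=%d\n", sk.refcnt, sk.timer_pending);
	return 0;
}
#endif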
static void tcp_write_err(struct sock *sk)
{
	sk->err = sk->err_soft ? : ETIMEDOUT;
	sk->error_report(sk);

	tcp_done(sk);
	NET_INC_STATS_BH(TCPAbortOnTimeout);
}

/* Do not allow orphaned sockets to eat all our resources.
 * This is a direct violation of the TCP specs, but it is required
 * to prevent DoS attacks. It is called when a retransmission timeout
 * or zero probe timeout occurs on an orphaned socket.
 *
 * Criterion is still not confirmed experimentally and may change.
 * We kill the socket if:
 * 1. The number of orphaned sockets exceeds an administratively configured
 *    limit.
 * 2. We are under strong memory pressure.
 */
static int tcp_out_of_resources(struct sock *sk, int do_reset)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int orphans = atomic_read(&tcp_orphan_count);

	/* If the peer does not open the window for a long time,
	 * or did not transmit anything for a long time, penalize it.
	 */
	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
		orphans <<= 1;

	/* If some dubious ICMP arrived, penalize even more. */
	if (sk->err_soft)
		orphans <<= 1;

	if (orphans >= sysctl_tcp_max_orphans ||
	    (sk->wmem_queued > SOCK_MIN_SNDBUF &&
	     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
		if (net_ratelimit())
			printk(KERN_INFO "Out of socket memory\n");

		/* Catch exceptional cases, when connection requires reset.
		 *      1. Last segment was sent recently. */
		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
		    /*  2. Window is closed. */
		    (!tp->snd_wnd && !tp->packets_out))
			do_reset = 1;
		if (do_reset)
			tcp_send_active_reset(sk, GFP_ATOMIC);
		tcp_done(sk);
		NET_INC_STATS_BH(TCPAbortOnMemory);
		return 1;
	}
	return 0;
}

/* Calculate maximal number of retries on an orphaned socket. */
static int tcp_orphan_retries(struct sock *sk, int alive)
{
	int retries = sysctl_tcp_orphan_retries; /* May be zero. */

	/* We know from an ICMP that something is wrong. */
	if (sk->err_soft && !alive)
		retries = 0;

	/* However, if socket sent something recently, select some safe
	 * number of retries. 8 corresponds to >100 seconds with minimal
	 * RTO of 200msec.
	 */
	if (retries == 0 && alive)
		retries = 8;
	return retries;
}

/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int retry_until;

	if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
		if (tp->retransmits)
			dst_negative_advice(&sk->dst_cache);
		retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
	} else {
		if (tp->retransmits >= sysctl_tcp_retries1) {
			/* NOTE: draft-ietf-tcpimpl-pmtud-01.txt requires PMTU
			   black hole detection. :-(

			   This is the place to implement it.  It is not
			   implemented.  I do not want to implement it.  It is
			   disgusting.  It does not work in any case.  Let me
			   cite the same draft, which requires us to implement
			   this:

   "The one security concern raised by this memo is that ICMP black holes
   are often caused by over-zealous security administrators who block
   all ICMP messages.  It is vitally important that those who design and
   deploy security systems understand the impact of strict filtering on
   upper-layer protocols.  The safest web site in the world is worthless
   if most TCP implementations cannot transfer data from it.  It would
   be far nicer to have all of the black holes fixed rather than fixing
   all of the TCP implementations."

			   Golden words :-).
			 */

			dst_negative_advice(&sk->dst_cache);
		}

		retry_until = sysctl_tcp_retries2;

		if (sk->dead) {
			int alive = (tp->rto < TCP_RTO_MAX);

			retry_until = tcp_orphan_retries(sk, alive);

			if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
				return 1;
		}
	}

	if (tp->retransmits >= retry_until) {
		/* Has it gone just too far? */
		tcp_write_err(sk);
		return 1;
	}
	return 0;
}
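/* A guarded sketch of the accounting in tcp_out_of_resources() above: the
 * effective orphan count is doubled once for a stale or unresettable
 * connection and once more for a soft error, so a "dubious" orphan hits
 * sysctl_tcp_max_orphans up to 4x sooner than a clean one.  Userspace C,
 * made-up names and an example limit of 8192; illustrative only.
 */
#if 0	/* illustrative sketch only, not compiled */
#include <stdio.h>

static int would_kill_orphan(int orphan_count, int max_orphans,
			     int stale, int soft_err)
{
	int orphans = orphan_count;

	if (stale)		/* idle > 2*TCP_RTO_MAX, or !do_reset */
		orphans <<= 1;
	if (soft_err)		/* dubious ICMP seen on the socket */
		orphans <<= 1;
	return orphans >= max_orphans;
}

int main(void)
{
	/* With a limit of 8192: a clean orphan survives at 4096 live
	 * orphans, a stale one with a soft error does not. */
	printf("clean: %d\n", would_kill_orphan(4096, 8192, 0, 0)); /* 0 */
	printf("dirty: %d\n", would_kill_orphan(4096, 8192, 1, 1)); /* 1 */
	return 0;
}
#endif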
static void tcp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later. */
		tp->ack.blocked = 1;
		NET_INC_STATS_BH(DelayedACKLocked);
		if (!mod_timer(&tp->delack_timer, jiffies + TCP_DELACK_MIN))
			sock_hold(sk);
		goto out_unlock;
	}

	tcp_mem_reclaim(sk);

	if (sk->state == TCP_CLOSE || !(tp->ack.pending&TCP_ACK_TIMER))
		goto out;

	if ((long)(tp->ack.timeout - jiffies) > 0) {
		if (!mod_timer(&tp->delack_timer, tp->ack.timeout))
			sock_hold(sk);
		goto out;
	}
	tp->ack.pending &= ~TCP_ACK_TIMER;

	if (skb_queue_len(&tp->ucopy.prequeue)) {
		struct sk_buff *skb;

		net_statistics[smp_processor_id()*2].TCPSchedulerFailed +=
			skb_queue_len(&tp->ucopy.prequeue);

		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk->backlog_rcv(sk, skb);

		tp->ucopy.memory = 0;
	}

	if (tcp_ack_scheduled(tp)) {
		if (!tp->ack.pingpong) {
			/* Delayed ACK missed: inflate ATO. */
			tp->ack.ato = min(tp->ack.ato<<1, tp->rto);
		} else {
			/* Delayed ACK missed: leave pingpong mode and
			 * deflate ATO.
			 */
			tp->ack.pingpong = 0;
			tp->ack.ato = TCP_ATO_MIN;
		}
		tcp_send_ack(sk);
		NET_INC_STATS_BH(DelayedACKs);
	}
	TCP_CHECK_TIMER(sk);

out:
	if (tcp_memory_pressure)
		tcp_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

static void tcp_probe_timer(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int max_probes;

	if (tp->packets_out || !tp->send_head) {
		tp->probes_out = 0;
		return;
	}

	/* *WARNING* RFC 1122 forbids this
	 *
	 * It doesn't AFAIK, because we kill the retransmit timer  -AK
	 *
	 * FIXME: We ought not to do it, Solaris 2.5 actually lists fixing
	 * this behaviour as a bug fix. [AC]
	 *
	 * Let me explain. probes_out is zeroed by incoming ACKs even if
	 * they advertise a zero window. Hence, the connection is killed
	 * only if we received no ACKs for the normal connection timeout.
	 * It is not killed merely because the window stays zero for some
	 * time; the window may be zero until armageddon and even later.
	 * We are in full accordance with the RFCs, except that the probe
	 * timer combines both retransmission timeout and probe timeout
	 * in one bottle.				--ANK
	 */
	max_probes = sysctl_tcp_retries2;

	if (sk->dead) {
		int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);

		max_probes = tcp_orphan_retries(sk, alive);

		if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
			return;
	}

	if (tp->probes_out > max_probes) {
		tcp_write_err(sk);
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	}
}
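/* A guarded sketch of the delayed-ACK timeout (ATO) dynamics in
 * tcp_delack_timer() above: each missed delayed ACK outside pingpong mode
 * doubles the ATO, clamped by the current RTO; a miss in pingpong mode
 * drops back to TCP_ATO_MIN instead.  Units below are jiffies with HZ=100
 * assumed; FAKE_ATO_MIN and next_ato are made-up names, illustrative only.
 */
#if 0	/* illustrative sketch only, not compiled */
#include <stdio.h>

#define FAKE_ATO_MIN	4	/* stand-in for TCP_ATO_MIN */

static unsigned long next_ato(unsigned long ato, unsigned long rto,
			      int *pingpong)
{
	if (!*pingpong)
		return ato << 1 > rto ? rto : ato << 1;	/* inflate, clamp */
	*pingpong = 0;					/* leave pingpong */
	return FAKE_ATO_MIN;				/* deflate */
}

int main(void)
{
	unsigned long ato = FAKE_ATO_MIN, rto = 60;
	int pingpong = 0;
	int i;

	for (i = 0; i < 6; i++) {	/* prints: 8 16 32 60 60 60 */
		ato = next_ato(ato, rto, &pingpong);
		printf("%lu ", ato);
	}
	printf("\n");
	return 0;
}
#endif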
/*
 *	The TCP retransmit timer.
 */

static void tcp_retransmit_timer(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	if (tp->packets_out == 0)
		goto out;

	BUG_TRAP(!skb_queue_empty(&sk->write_queue));

	if (tcp_write_timeout(sk))
		goto out;

	if (tp->retransmits == 0) {
		if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
			if (tp->sack_ok) {
				if (tp->ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(TCPSackRecoveryFail);
				else
					NET_INC_STATS_BH(TCPSackFailures);
			} else {
				if (tp->ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(TCPRenoRecoveryFail);
				else
					NET_INC_STATS_BH(TCPRenoFailures);
			}
		} else if (tp->ca_state == TCP_CA_Loss) {
			NET_INC_STATS_BH(TCPLossFailures);
		} else {
			NET_INC_STATS_BH(TCPTimeouts);
		}
	}

	tcp_enter_loss(sk, 0);

	if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) {
		/* Retransmission failed because of local congestion,
		 * do not back off.
		 */
		if (!tp->retransmits)
			tp->retransmits = 1;
		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
				     min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
		goto out;
	}

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
	tp->backoff++;
	tp->retransmits++;
	tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
	if (tp->retransmits > sysctl_tcp_retries1)
		__sk_dst_reset(sk);

out:;
}

static void tcp_write_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int event;

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later */
		if (!mod_timer(&tp->retransmit_timer, jiffies + (HZ/20)))
			sock_hold(sk);
		goto out_unlock;
	}

	if (sk->state == TCP_CLOSE || !tp->pending)
		goto out;

	if ((long)(tp->timeout - jiffies) > 0) {
		if (!mod_timer(&tp->retransmit_timer, tp->timeout))
			sock_hold(sk);
		goto out;
	}

	event = tp->pending;
	tp->pending = 0;

	switch (event) {
	case TCP_TIME_RETRANS:
		tcp_retransmit_timer(sk);
		break;
	case TCP_TIME_PROBE0:
		tcp_probe_timer(sk);
		break;
	}
	TCP_CHECK_TIMER(sk);

out:
	tcp_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}
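/* A guarded sketch of the backoff performed at the end of
 * tcp_retransmit_timer() above: rto doubles on every timer expiry and is
 * clamped at TCP_RTO_MAX (120 sec).  Assuming a 3 sec initial timeout
 * (TCP_TIMEOUT_INIT) and HZ=100, the retransmissions land at roughly
 * 3, 6, 12, ... seconds, and the default sysctl_tcp_retries2 of 15 works
 * out to about twenty minutes of trying in total.  The FAKE_* names are
 * made up; illustrative only.
 */
#if 0	/* illustrative sketch only, not compiled */
#include <stdio.h>

#define FAKE_HZ		100
#define FAKE_RTO_INIT	(3 * FAKE_HZ)	/* stand-in for TCP_TIMEOUT_INIT */
#define FAKE_RTO_MAX	(120 * FAKE_HZ)	/* stand-in for TCP_RTO_MAX */

int main(void)
{
	unsigned long rto = FAKE_RTO_INIT, total = 0;
	int retransmits;

	for (retransmits = 1; retransmits <= 15; retransmits++) {
		total += rto;
		printf("retransmit %2d after %3lu sec (total %4lu sec)\n",
		       retransmits, rto / FAKE_HZ, total / FAKE_HZ);
		rto <<= 1;			/* exponential backoff... */
		if (rto > FAKE_RTO_MAX)
			rto = FAKE_RTO_MAX;	/* ...clamped at 120 sec */
	}
	return 0;
}
#endif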
/*
 *	Timer for listening sockets
 */

static void tcp_synack_timer(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct tcp_listen_opt *lopt = tp->listen_opt;
	int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
	int thresh = max_retries;
	unsigned long now = jiffies;
	struct open_request **reqp, *req;
	int i, budget;

	if (lopt == NULL || lopt->qlen == 0)
		return;

	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to established sockets) within the first timeout.
	 * If a SYNACK was not acknowledged for 3 seconds, it means one of
	 * the following: the SYNACK was lost, the ACK was lost, the RTT is
	 * high, or nobody planned to ack (i.e. synflood).
	 * When the server is a bit loaded, the queue is populated with old
	 * open requests, reducing the effective size of the queue.
	 * When the server is well loaded, the queue size reduces to zero
	 * after several minutes of work. It is not synflood,
	 * it is normal operation. The solution is to prune entries that
	 * are too old, overriding the normal timeout, when the situation
	 * becomes dangerous.
	 *
	 * Essentially, we reserve half of the room for young
	 * embryos, and abort old ones without pity if they
	 * are about to clog our table.
	 */
	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
		int young = (lopt->qlen_young<<1);

		while (thresh > 2) {
			if (lopt->qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}

	if (tp->defer_accept)
		max_retries = tp->defer_accept;

	budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
	i = lopt->clock_hand;

	do {
		reqp=&lopt->syn_table[i];
		while ((req = *reqp) != NULL) {
			if ((long)(now - req->expires) >= 0) {
				if ((req->retrans < thresh ||
				     (req->acked && req->retrans < max_retries))
				    && !req->class->rtx_syn_ack(sk, req, NULL)) {
					unsigned long timeo;

					if (req->retrans++ == 0)
						lopt->qlen_young--;
					timeo = min((TCP_TIMEOUT_INIT << req->retrans), TCP_RTO_MAX);
					req->expires = now + timeo;
					reqp = &req->dl_next;
					continue;
				}

				/* Drop this request */
				write_lock(&tp->syn_wait_lock);
				*reqp = req->dl_next;
				write_unlock(&tp->syn_wait_lock);
				lopt->qlen--;
				if (req->retrans == 0)
					lopt->qlen_young--;
				tcp_openreq_free(req);
				continue;
			}
			reqp = &req->dl_next;
		}

		i = (i+1)&(TCP_SYNQ_HSIZE-1);

	} while (--budget > 0);

	lopt->clock_hand = i;

	if (lopt->qlen)
		tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
}

void tcp_delete_keepalive_timer (struct sock *sk)
{
	if (timer_pending(&sk->timer) && del_timer (&sk->timer))
		__sock_put(sk);
}

void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
{
	if (!mod_timer(&sk->timer, jiffies+len))
		sock_hold(sk);
}

void tcp_set_keepalive(struct sock *sk, int val)
{
	if ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))
		return;

	if (val && !sk->keepopen)
		tcp_reset_keepalive_timer(sk, keepalive_time_when(&sk->tp_pinfo.af_tcp));
	else if (!val)
		tcp_delete_keepalive_timer(sk);
}


static void tcp_keepalive_timer (unsigned long data)
{
	struct sock *sk = (struct sock *) data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	__u32 elapsed;

	/* Only process if socket is not in use. */
	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later. */
		tcp_reset_keepalive_timer (sk, HZ/20);
		goto out;
	}

	if (sk->state == TCP_LISTEN) {
		tcp_synack_timer(sk);
		goto out;
	}

	if (sk->state == TCP_FIN_WAIT2 && sk->dead) {
		if (tp->linger2 >= 0) {
			int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;

			if (tmo > 0) {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
		tcp_send_active_reset(sk, GFP_ATOMIC);
		goto death;
	}

	if (!sk->keepopen || sk->state == TCP_CLOSE)
		goto out;

	elapsed = keepalive_time_when(tp);

	/* It is alive without keepalive 8) */
	if (tp->packets_out || tp->send_head)
		goto resched;

	elapsed = tcp_time_stamp - tp->rcv_tstamp;

	if (elapsed >= keepalive_time_when(tp)) {
		if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
		    (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
			tcp_send_active_reset(sk, GFP_ATOMIC);
			tcp_write_err(sk);
			goto out;
		}
		if (tcp_write_wakeup(sk) <= 0) {
			tp->probes_out++;
			elapsed = keepalive_intvl_when(tp);
		} else {
			/* If keepalive was lost due to local congestion,
			 * try harder.
			 */
			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
		}
	} else {
		/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
		elapsed = keepalive_time_when(tp) - elapsed;
	}

	TCP_CHECK_TIMER(sk);
	tcp_mem_reclaim(sk);

resched:
	tcp_reset_keepalive_timer (sk, elapsed);
	goto out;

death:
	tcp_done(sk);

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
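/* The keepalive machinery above is driven per-socket by sk->keepopen and
 * by the tp->keepalive_* overrides of the sysctl defaults defined at the
 * top of this file.  The guarded sketch below shows the matching userspace
 * side: SO_KEEPALIVE (which reaches tcp_set_keepalive()) plus the
 * TCP_KEEPIDLE / TCP_KEEPINTVL / TCP_KEEPCNT socket options.  Error
 * handling is abbreviated; illustrative only.
 */
#if 0	/* illustrative sketch only, not compiled */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int on = 1;
	int idle = 600;	/* secs before first probe      (tp->keepalive_time)   */
	int intvl = 60;	/* secs between probes          (tp->keepalive_intvl)  */
	int cnt = 5;	/* unanswered probes before err (tp->keepalive_probes) */

	if (fd < 0)
		return 1;

	/* Sets sk->keepopen and arms the keepalive timer. */
	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));

	/* Per-socket overrides; the sysctl defaults apply otherwise. */
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));

	printf("keepalive configured on fd %d\n", fd);
	return 0;
}
#endif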