/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.80 2000/10/03 07:29:01 anton Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <net/tcp.h>

int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
int sysctl_tcp_orphan_retries;

static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer (unsigned long data);

const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";

/*
 * Using different timers for retransmit, delayed acks and probes.
 * We may wish to use just one timer maintaining a list of expire jiffies
 * to optimize.
 */

void tcp_init_xmit_timers(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	init_timer(&tp->retransmit_timer);
	tp->retransmit_timer.function = &tcp_write_timer;
	tp->retransmit_timer.data = (unsigned long) sk;
	tp->pending = 0;

	init_timer(&tp->delack_timer);
	tp->delack_timer.function = &tcp_delack_timer;
	tp->delack_timer.data = (unsigned long) sk;
	tp->ack.pending = 0;

	init_timer(&sk->timer);
	sk->timer.function = &tcp_keepalive_timer;
	sk->timer.data = (unsigned long) sk;
}

void tcp_clear_xmit_timers(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	tp->pending = 0;
	if (timer_pending(&tp->retransmit_timer) &&
	    del_timer(&tp->retransmit_timer))
		__sock_put(sk);

	tp->ack.pending = 0;
	tp->ack.blocked = 0;
	if (timer_pending(&tp->delack_timer) &&
	    del_timer(&tp->delack_timer))
		__sock_put(sk);

	if (timer_pending(&sk->timer) && del_timer(&sk->timer))
		__sock_put(sk);
}
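/* A note on the pattern above, used throughout this file: the socket holds
 * one extra reference for each pending timer.  The reference is taken with
 * sock_hold() when mod_timer() reports the timer was not already pending,
 * and dropped with __sock_put() only when del_timer() reports it actually
 * stopped a pending timer.  The guarded userspace sketch below (not kernel
 * code; fake_sock, fake_mod_timer and fake_del_timer are made-up names)
 * models that discipline with a plain counter.
 */
#if 0	/* illustrative sketch only, not compiled */
#include <assert.h>
#include <stdio.h>

struct fake_sock {
	int refcnt;		/* object lifetime, like sk->refcnt */
	int timer_pending;	/* like timer_pending(&sk->timer) */
};

/* Models mod_timer(): returns nonzero if the timer was already pending,
 * in which case the timer's reference is already held. */
static int fake_mod_timer(struct fake_sock *sk)
{
	int was_pending = sk->timer_pending;
	sk->timer_pending = 1;
	return was_pending;
}

/* Models del_timer(): returns nonzero only if a pending timer was stopped. */
static int fake_del_timer(struct fake_sock *sk)
{
	int was_pending = sk->timer_pending;
	sk->timer_pending = 0;
	return was_pending;
}

int main(void)
{
	struct fake_sock sk = { 1, 0 };

	/* Arming: hold a reference unless the timer already owned one. */
	if (!fake_mod_timer(&sk))
		sk.refcnt++;			/* sock_hold() */

	/* Clearing: drop the reference only if a pending timer was really
	 * stopped, exactly as tcp_clear_xmit_timers() does above. */
	if (sk.timer_pending && fake_del_timer(&sk))
		sk.refcnt--;			/* __sock_put() */

	assert(sk.refcnt == 1 && !sk.timer_pending);
	printf("refcnt=%d pending=%d\n", sk.refcnt, sk.timer_pending);
	return 0;
}
#endif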
static void tcp_write_err(struct sock *sk)
{
	sk->err = sk->err_soft ? : ETIMEDOUT;
	sk->error_report(sk);

	tcp_done(sk);
	NET_INC_STATS_BH(TCPAbortOnTimeout);
}

/* Do not allow orphaned sockets to eat all our resources.
 * This is a direct violation of the TCP specs, but it is required
 * to prevent DoS attacks. It is called when a retransmission timeout
 * or zero probe timeout occurs on an orphaned socket.
 *
 * Criterion is still not confirmed experimentally and may change.
 * We kill the socket if:
 * 1. The number of orphaned sockets exceeds an administratively configured
 *    limit.
 * 2. We are under strong memory pressure.
 */
static int tcp_out_of_resources(struct sock *sk, int do_reset)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int orphans = atomic_read(&tcp_orphan_count);

	/* If the peer does not open the window for a long time,
	 * or did not transmit anything for a long time, penalize it.
	 */
	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
		orphans <<= 1;

	/* If some dubious ICMP arrived, penalize even more. */
	if (sk->err_soft)
		orphans <<= 1;

	if (orphans >= sysctl_tcp_max_orphans ||
	    (sk->wmem_queued > SOCK_MIN_SNDBUF &&
	     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
		if (net_ratelimit())
			printk(KERN_INFO "Out of socket memory\n");

		/* Catch exceptional cases, when connection requires reset.
		 *      1. Last segment was sent recently. */
		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
		    /*  2. Window is closed. */
		    (!tp->snd_wnd && !tp->packets_out))
			do_reset = 1;
		if (do_reset)
			tcp_send_active_reset(sk, GFP_ATOMIC);
		tcp_done(sk);
		NET_INC_STATS_BH(TCPAbortOnMemory);
		return 1;
	}
	return 0;
}

/* Calculate maximal number of retries on an orphaned socket. */
static int tcp_orphan_retries(struct sock *sk, int alive)
{
	int retries = sysctl_tcp_orphan_retries; /* May be zero. */

	/* We know from an ICMP that something is wrong. */
	if (sk->err_soft && !alive)
		retries = 0;

	/* However, if socket sent something recently, select some safe
	 * number of retries. 8 corresponds to >100 seconds with minimal
	 * RTO of 200msec.
	 */
	if (retries == 0 && alive)
		retries = 8;
	return retries;
}

/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int retry_until;

	if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
		if (tp->retransmits)
			dst_negative_advice(&sk->dst_cache);
		retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
	} else {
		if (tp->retransmits >= sysctl_tcp_retries1) {
			/* NOTE: draft-ietf-tcpimpl-pmtud-01.txt requires PMTU
			   black hole detection. :-(

			   This is the place to implement it.  It is not
			   implemented.  I do not want to implement it.  It is
			   disgusting.  It does not work in any case.  Let me
			   cite the same draft, which requires us to implement
			   this:

   "The one security concern raised by this memo is that ICMP black holes
   are often caused by over-zealous security administrators who block
   all ICMP messages.  It is vitally important that those who design and
   deploy security systems understand the impact of strict filtering on
   upper-layer protocols.  The safest web site in the world is worthless
   if most TCP implementations cannot transfer data from it.  It would
   be far nicer to have all of the black holes fixed rather than fixing
   all of the TCP implementations."

			   Golden words :-).
			 */

			dst_negative_advice(&sk->dst_cache);
		}

		retry_until = sysctl_tcp_retries2;

		if (sk->dead) {
			int alive = (tp->rto < TCP_RTO_MAX);

			retry_until = tcp_orphan_retries(sk, alive);

			if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
				return 1;
		}
	}

	if (tp->retransmits >= retry_until) {
		/* Has it gone just too far? */
		tcp_write_err(sk);
		return 1;
	}
	return 0;
}
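/* A guarded sketch of the accounting in tcp_out_of_resources() above: the
 * effective orphan count is doubled once for a stale or unresettable
 * connection and once more for a soft error, so a "dubious" orphan hits
 * sysctl_tcp_max_orphans up to 4x sooner than a clean one.  Userspace C,
 * made-up names and an example limit of 8192; illustrative only.
 */
#if 0	/* illustrative sketch only, not compiled */
#include <stdio.h>

static int would_kill_orphan(int orphan_count, int max_orphans,
			     int stale, int soft_err)
{
	int orphans = orphan_count;

	if (stale)		/* idle > 2*TCP_RTO_MAX, or !do_reset */
		orphans <<= 1;
	if (soft_err)		/* dubious ICMP seen on the socket */
		orphans <<= 1;
	return orphans >= max_orphans;
}

int main(void)
{
	/* With a limit of 8192: a clean orphan survives at 4096 live
	 * orphans, a stale one with a soft error does not. */
	printf("clean: %d\n", would_kill_orphan(4096, 8192, 0, 0)); /* 0 */
	printf("dirty: %d\n", would_kill_orphan(4096, 8192, 1, 1)); /* 1 */
	return 0;
}
#endif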
static void tcp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later. */
		tp->ack.blocked = 1;
		NET_INC_STATS_BH(DelayedACKLocked);
		if (!mod_timer(&tp->delack_timer, jiffies + TCP_DELACK_MIN))
			sock_hold(sk);
		goto out_unlock;
	}

	tcp_mem_reclaim(sk);

	if (sk->state == TCP_CLOSE || !(tp->ack.pending&TCP_ACK_TIMER))
		goto out;

	if ((long)(tp->ack.timeout - jiffies) > 0) {
		if (!mod_timer(&tp->delack_timer, tp->ack.timeout))
			sock_hold(sk);
		goto out;
	}
	tp->ack.pending &= ~TCP_ACK_TIMER;

	if (skb_queue_len(&tp->ucopy.prequeue)) {
		struct sk_buff *skb;

		net_statistics[smp_processor_id()*2].TCPSchedulerFailed +=
			skb_queue_len(&tp->ucopy.prequeue);

		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk->backlog_rcv(sk, skb);

		tp->ucopy.memory = 0;
	}

	if (tcp_ack_scheduled(tp)) {
		if (!tp->ack.pingpong) {
			/* Delayed ACK missed: inflate ATO. */
			tp->ack.ato = min(tp->ack.ato<<1, tp->rto);
		} else {
			/* Delayed ACK missed: leave pingpong mode and
			 * deflate ATO.
			 */
			tp->ack.pingpong = 0;
			tp->ack.ato = TCP_ATO_MIN;
		}
		tcp_send_ack(sk);
		NET_INC_STATS_BH(DelayedACKs);
	}
	TCP_CHECK_TIMER(sk);

out:
	if (tcp_memory_pressure)
		tcp_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

static void tcp_probe_timer(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int max_probes;

	if (tp->packets_out || !tp->send_head) {
		tp->probes_out = 0;
		return;
	}

	/* *WARNING* RFC 1122 forbids this
	 *
	 * It doesn't AFAIK, because we kill the retransmit timer  -AK
	 *
	 * FIXME: We ought not to do it, Solaris 2.5 actually lists fixing
	 * this behaviour as a bug fix. [AC]
	 *
	 * Let me explain. probes_out is zeroed by incoming ACKs even if
	 * they advertise a zero window. Hence, the connection is killed
	 * only if we received no ACKs for the normal connection timeout.
	 * It is not killed merely because the window stays zero for some
	 * time; the window may be zero until armageddon and even later.
	 * We are in full accordance with the RFCs, except that the probe
	 * timer combines both retransmission timeout and probe timeout
	 * in one bottle.				--ANK
	 */
	max_probes = sysctl_tcp_retries2;

	if (sk->dead) {
		int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);

		max_probes = tcp_orphan_retries(sk, alive);

		if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
			return;
	}

	if (tp->probes_out > max_probes) {
		tcp_write_err(sk);
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	}
}
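/* A guarded sketch of the delayed-ACK timeout (ATO) dynamics in
 * tcp_delack_timer() above: each missed delayed ACK outside pingpong mode
 * doubles the ATO, clamped by the current RTO; a miss in pingpong mode
 * drops back to TCP_ATO_MIN instead.  Units below are jiffies with HZ=100
 * assumed; FAKE_ATO_MIN and next_ato are made-up names, illustrative only.
 */
#if 0	/* illustrative sketch only, not compiled */
#include <stdio.h>

#define FAKE_ATO_MIN	4	/* stand-in for TCP_ATO_MIN */

static unsigned long next_ato(unsigned long ato, unsigned long rto,
			      int *pingpong)
{
	if (!*pingpong)
		return ato << 1 > rto ? rto : ato << 1;	/* inflate, clamp */
	*pingpong = 0;					/* leave pingpong */
	return FAKE_ATO_MIN;				/* deflate */
}

int main(void)
{
	unsigned long ato = FAKE_ATO_MIN, rto = 60;
	int pingpong = 0;
	int i;

	for (i = 0; i < 6; i++) {	/* prints: 8 16 32 60 60 60 */
		ato = next_ato(ato, rto, &pingpong);
		printf("%lu ", ato);
	}
	printf("\n");
	return 0;
}
#endif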
/*
 *	The TCP retransmit timer.
 */

static void tcp_retransmit_timer(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	if (tp->packets_out == 0)
		goto out;

	BUG_TRAP(!skb_queue_empty(&sk->write_queue));

	if (tcp_write_timeout(sk))
		goto out;

	if (tp->retransmits == 0) {
		if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
			if (tp->sack_ok) {
				if (tp->ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(TCPSackRecoveryFail);
				else
					NET_INC_STATS_BH(TCPSackFailures);
			} else {
				if (tp->ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(TCPRenoRecoveryFail);
				else
					NET_INC_STATS_BH(TCPRenoFailures);
			}
		} else if (tp->ca_state == TCP_CA_Loss) {
			NET_INC_STATS_BH(TCPLossFailures);
		} else {
			NET_INC_STATS_BH(TCPTimeouts);
		}
	}

	tcp_enter_loss(sk, 0);

	if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) {
		/* Retransmission failed because of local congestion,
		 * do not back off.
		 */
		if (!tp->retransmits)
			tp->retransmits = 1;
		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
				     min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
		goto out;
	}

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
	tp->backoff++;
	tp->retransmits++;
	tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
	if (tp->retransmits > sysctl_tcp_retries1)
		__sk_dst_reset(sk);

out:;
}

static void tcp_write_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int event;

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later */
		if (!mod_timer(&tp->retransmit_timer, jiffies + (HZ/20)))
			sock_hold(sk);
		goto out_unlock;
	}

	if (sk->state == TCP_CLOSE || !tp->pending)
		goto out;

	if ((long)(tp->timeout - jiffies) > 0) {
		if (!mod_timer(&tp->retransmit_timer, tp->timeout))
			sock_hold(sk);
		goto out;
	}

	event = tp->pending;
	tp->pending = 0;

	switch (event) {
	case TCP_TIME_RETRANS:
		tcp_retransmit_timer(sk);
		break;
	case TCP_TIME_PROBE0:
		tcp_probe_timer(sk);
		break;
	}
	TCP_CHECK_TIMER(sk);

out:
	tcp_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}
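/* A guarded sketch of the backoff performed at the end of
 * tcp_retransmit_timer() above: rto doubles on every timer expiry and is
 * clamped at TCP_RTO_MAX (120 sec).  Assuming a 3 sec initial timeout
 * (TCP_TIMEOUT_INIT) and HZ=100, the retransmissions land at roughly
 * 3, 6, 12, ... seconds, and the default sysctl_tcp_retries2 of 15 works
 * out to about twenty minutes of trying in total.  The FAKE_* names are
 * made up; illustrative only.
 */
#if 0	/* illustrative sketch only, not compiled */
#include <stdio.h>

#define FAKE_HZ		100
#define FAKE_RTO_INIT	(3 * FAKE_HZ)	/* stand-in for TCP_TIMEOUT_INIT */
#define FAKE_RTO_MAX	(120 * FAKE_HZ)	/* stand-in for TCP_RTO_MAX */

int main(void)
{
	unsigned long rto = FAKE_RTO_INIT, total = 0;
	int retransmits;

	for (retransmits = 1; retransmits <= 15; retransmits++) {
		total += rto;
		printf("retransmit %2d after %3lu sec (total %4lu sec)\n",
		       retransmits, rto / FAKE_HZ, total / FAKE_HZ);
		rto <<= 1;			/* exponential backoff... */
		if (rto > FAKE_RTO_MAX)
			rto = FAKE_RTO_MAX;	/* ...clamped at 120 sec */
	}
	return 0;
}
#endif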
/*
 *	Timer for listening sockets
 */

static void tcp_synack_timer(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct tcp_listen_opt *lopt = tp->listen_opt;
	int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
	int thresh = max_retries;
	unsigned long now = jiffies;
	struct open_request **reqp, *req;
	int i, budget;

	if (lopt == NULL || lopt->qlen == 0)
		return;

	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to established sockets) within the first timeout.
	 * If a SYNACK was not acknowledged for 3 seconds, it means one of
	 * the following: the SYNACK was lost, the ACK was lost, the RTT is
	 * high, or nobody planned to ack (i.e. synflood).
	 * When the server is a bit loaded, the queue is populated with old
	 * open requests, reducing the effective size of the queue.
	 * When the server is well loaded, the queue size reduces to zero
	 * after several minutes of work. It is not synflood,
	 * it is normal operation. The solution is to prune entries that
	 * are too old, overriding the normal timeout, when the situation
	 * becomes dangerous.
	 *
	 * Essentially, we reserve half of the room for young
	 * embryos, and abort old ones without pity if they
	 * are about to clog our table.
	 */
	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
		int young = (lopt->qlen_young<<1);

		while (thresh > 2) {
			if (lopt->qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}

	if (tp->defer_accept)
		max_retries = tp->defer_accept;

	budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
	i = lopt->clock_hand;

	do {
		reqp=&lopt->syn_table[i];
		while ((req = *reqp) != NULL) {
			if ((long)(now - req->expires) >= 0) {
				if ((req->retrans < thresh ||
				     (req->acked && req->retrans < max_retries))
				    && !req->class->rtx_syn_ack(sk, req, NULL)) {
					unsigned long timeo;

					if (req->retrans++ == 0)
						lopt->qlen_young--;
					timeo = min((TCP_TIMEOUT_INIT << req->retrans), TCP_RTO_MAX);
					req->expires = now + timeo;
					reqp = &req->dl_next;
					continue;
				}

				/* Drop this request */
				write_lock(&tp->syn_wait_lock);
				*reqp = req->dl_next;
				write_unlock(&tp->syn_wait_lock);
				lopt->qlen--;
				if (req->retrans == 0)
					lopt->qlen_young--;
				tcp_openreq_free(req);
				continue;
			}
			reqp = &req->dl_next;
		}

		i = (i+1)&(TCP_SYNQ_HSIZE-1);

	} while (--budget > 0);

	lopt->clock_hand = i;

	if (lopt->qlen)
		tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
}

void tcp_delete_keepalive_timer (struct sock *sk)
{
	if (timer_pending(&sk->timer) && del_timer (&sk->timer))
		__sock_put(sk);
}

void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
{
	if (!mod_timer(&sk->timer, jiffies+len))
		sock_hold(sk);
}

void tcp_set_keepalive(struct sock *sk, int val)
{
	if ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))
		return;

	if (val && !sk->keepopen)
		tcp_reset_keepalive_timer(sk, keepalive_time_when(&sk->tp_pinfo.af_tcp));
	else if (!val)
		tcp_delete_keepalive_timer(sk);
}


static void tcp_keepalive_timer (unsigned long data)
{
	struct sock *sk = (struct sock *) data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	__u32 elapsed;

	/* Only process if socket is not in use. */
	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later. */
		tcp_reset_keepalive_timer (sk, HZ/20);
		goto out;
	}

	if (sk->state == TCP_LISTEN) {
		tcp_synack_timer(sk);
		goto out;
	}

	if (sk->state == TCP_FIN_WAIT2 && sk->dead) {
		if (tp->linger2 >= 0) {
			int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;

			if (tmo > 0) {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
		tcp_send_active_reset(sk, GFP_ATOMIC);
		goto death;
	}

	if (!sk->keepopen || sk->state == TCP_CLOSE)
		goto out;

	elapsed = keepalive_time_when(tp);

	/* It is alive without keepalive 8) */
	if (tp->packets_out || tp->send_head)
		goto resched;

	elapsed = tcp_time_stamp - tp->rcv_tstamp;

	if (elapsed >= keepalive_time_when(tp)) {
		if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
		    (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
			tcp_send_active_reset(sk, GFP_ATOMIC);
			tcp_write_err(sk);
			goto out;
		}
		if (tcp_write_wakeup(sk) <= 0) {
			tp->probes_out++;
			elapsed = keepalive_intvl_when(tp);
		} else {
			/* If keepalive was lost due to local congestion,
			 * try harder.
			 */
			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
		}
	} else {
		/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
		elapsed = keepalive_time_when(tp) - elapsed;
	}

	TCP_CHECK_TIMER(sk);
	tcp_mem_reclaim(sk);

resched:
	tcp_reset_keepalive_timer (sk, elapsed);
	goto out;

death:
	tcp_done(sk);

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
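/* The keepalive machinery above is driven per-socket by sk->keepopen and
 * by the tp->keepalive_* overrides of the sysctl defaults defined at the
 * top of this file.  The guarded sketch below shows the matching userspace
 * side: SO_KEEPALIVE (which reaches tcp_set_keepalive()) plus the
 * TCP_KEEPIDLE / TCP_KEEPINTVL / TCP_KEEPCNT socket options.  Error
 * handling is abbreviated; illustrative only.
 */
#if 0	/* illustrative sketch only, not compiled */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int on = 1;
	int idle = 600;	/* secs before first probe      (tp->keepalive_time)   */
	int intvl = 60;	/* secs between probes          (tp->keepalive_intvl)  */
	int cnt = 5;	/* unanswered probes before err (tp->keepalive_probes) */

	if (fd < 0)
		return 1;

	/* Sets sk->keepopen and arms the keepalive timer. */
	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));

	/* Per-socket overrides; the sysctl defaults apply otherwise. */
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));

	printf("keepalive configured on fd %d\n", fd);
	return 0;
}
#endif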