tcp: fix possible socket refcount problem
authorEric Dumazet <edumazet@google.com>
Mon, 20 Aug 2012 00:22:46 +0000 (00:22 +0000)
committerDavid S. Miller <davem@davemloft.net>
Tue, 21 Aug 2012 21:42:23 +0000 (14:42 -0700)
Commit 6f458dfb40 (tcp: improve latencies of timer triggered events)
added bug leading to following trace :

[ 2866.131281] IPv4: Attempt to release TCP socket in state 1 ffff880019ec0000
[ 2866.131726]
[ 2866.132188] =========================
[ 2866.132281] [ BUG: held lock freed! ]
[ 2866.132281] 3.6.0-rc1+ #622 Not tainted
[ 2866.132281] -------------------------
[ 2866.132281] kworker/0:1/652 is freeing memory ffff880019ec0000-ffff880019ec0a1f, with a lock still held there!
[ 2866.132281]  (sk_lock-AF_INET-RPC){+.+...}, at: [<ffffffff81903619>] tcp_sendmsg+0x29/0xcc6
[ 2866.132281] 4 locks held by kworker/0:1/652:
[ 2866.132281]  #0:  (rpciod){.+.+.+}, at: [<ffffffff81083567>] process_one_work+0x1de/0x47f
[ 2866.132281]  #1:  ((&task->u.tk_work)){+.+.+.}, at: [<ffffffff81083567>] process_one_work+0x1de/0x47f
[ 2866.132281]  #2:  (sk_lock-AF_INET-RPC){+.+...}, at: [<ffffffff81903619>] tcp_sendmsg+0x29/0xcc6
[ 2866.132281]  #3:  (&icsk->icsk_retransmit_timer){+.-...}, at: [<ffffffff81078017>] run_timer_softirq+0x1ad/0x35f
[ 2866.132281]
[ 2866.132281] stack backtrace:
[ 2866.132281] Pid: 652, comm: kworker/0:1 Not tainted 3.6.0-rc1+ #622
[ 2866.132281] Call Trace:
[ 2866.132281]  <IRQ>  [<ffffffff810bc527>] debug_check_no_locks_freed+0x112/0x159
[ 2866.132281]  [<ffffffff818a0839>] ? __sk_free+0xfd/0x114
[ 2866.132281]  [<ffffffff811549fa>] kmem_cache_free+0x6b/0x13a
[ 2866.132281]  [<ffffffff818a0839>] __sk_free+0xfd/0x114
[ 2866.132281]  [<ffffffff818a08c0>] sk_free+0x1c/0x1e
[ 2866.132281]  [<ffffffff81911e1c>] tcp_write_timer+0x51/0x56
[ 2866.132281]  [<ffffffff81078082>] run_timer_softirq+0x218/0x35f
[ 2866.132281]  [<ffffffff81078017>] ? run_timer_softirq+0x1ad/0x35f
[ 2866.132281]  [<ffffffff810f5831>] ? rb_commit+0x58/0x85
[ 2866.132281]  [<ffffffff81911dcb>] ? tcp_write_timer_handler+0x148/0x148
[ 2866.132281]  [<ffffffff81070bd6>] __do_softirq+0xcb/0x1f9
[ 2866.132281]  [<ffffffff81a0a00c>] ? _raw_spin_unlock+0x29/0x2e
[ 2866.132281]  [<ffffffff81a1227c>] call_softirq+0x1c/0x30
[ 2866.132281]  [<ffffffff81039f38>] do_softirq+0x4a/0xa6
[ 2866.132281]  [<ffffffff81070f2b>] irq_exit+0x51/0xad
[ 2866.132281]  [<ffffffff81a129cd>] do_IRQ+0x9d/0xb4
[ 2866.132281]  [<ffffffff81a0a3ef>] common_interrupt+0x6f/0x6f
[ 2866.132281]  <EOI>  [<ffffffff8109d006>] ? sched_clock_cpu+0x58/0xd1
[ 2866.132281]  [<ffffffff81a0a172>] ? _raw_spin_unlock_irqrestore+0x4c/0x56
[ 2866.132281]  [<ffffffff81078692>] mod_timer+0x178/0x1a9
[ 2866.132281]  [<ffffffff818a00aa>] sk_reset_timer+0x19/0x26
[ 2866.132281]  [<ffffffff8190b2cc>] tcp_rearm_rto+0x99/0xa4
[ 2866.132281]  [<ffffffff8190dfba>] tcp_event_new_data_sent+0x6e/0x70
[ 2866.132281]  [<ffffffff8190f7ea>] tcp_write_xmit+0x7de/0x8e4
[ 2866.132281]  [<ffffffff818a565d>] ? __alloc_skb+0xa0/0x1a1
[ 2866.132281]  [<ffffffff8190f952>] __tcp_push_pending_frames+0x2e/0x8a
[ 2866.132281]  [<ffffffff81904122>] tcp_sendmsg+0xb32/0xcc6
[ 2866.132281]  [<ffffffff819229c2>] inet_sendmsg+0xaa/0xd5
[ 2866.132281]  [<ffffffff81922918>] ? inet_autobind+0x5f/0x5f
[ 2866.132281]  [<ffffffff810ee7f1>] ? trace_clock_local+0x9/0xb
[ 2866.132281]  [<ffffffff8189adab>] sock_sendmsg+0xa3/0xc4
[ 2866.132281]  [<ffffffff810f5de6>] ? rb_reserve_next_event+0x26f/0x2d5
[ 2866.132281]  [<ffffffff8103e6a9>] ? native_sched_clock+0x29/0x6f
[ 2866.132281]  [<ffffffff8103e6f8>] ? sched_clock+0x9/0xd
[ 2866.132281]  [<ffffffff810ee7f1>] ? trace_clock_local+0x9/0xb
[ 2866.132281]  [<ffffffff8189ae03>] kernel_sendmsg+0x37/0x43
[ 2866.132281]  [<ffffffff8199ce49>] xs_send_kvec+0x77/0x80
[ 2866.132281]  [<ffffffff8199cec1>] xs_sendpages+0x6f/0x1a0
[ 2866.132281]  [<ffffffff8107826d>] ? try_to_del_timer_sync+0x55/0x61
[ 2866.132281]  [<ffffffff8199d0d2>] xs_tcp_send_request+0x55/0xf1
[ 2866.132281]  [<ffffffff8199bb90>] xprt_transmit+0x89/0x1db
[ 2866.132281]  [<ffffffff81999bcd>] ? call_connect+0x3c/0x3c
[ 2866.132281]  [<ffffffff81999d92>] call_transmit+0x1c5/0x20e
[ 2866.132281]  [<ffffffff819a0d55>] __rpc_execute+0x6f/0x225
[ 2866.132281]  [<ffffffff81999bcd>] ? call_connect+0x3c/0x3c
[ 2866.132281]  [<ffffffff819a0f33>] rpc_async_schedule+0x28/0x34
[ 2866.132281]  [<ffffffff810835d6>] process_one_work+0x24d/0x47f
[ 2866.132281]  [<ffffffff81083567>] ? process_one_work+0x1de/0x47f
[ 2866.132281]  [<ffffffff819a0f0b>] ? __rpc_execute+0x225/0x225
[ 2866.132281]  [<ffffffff81083a6d>] worker_thread+0x236/0x317
[ 2866.132281]  [<ffffffff81083837>] ? process_scheduled_works+0x2f/0x2f
[ 2866.132281]  [<ffffffff8108b7b8>] kthread+0x9a/0xa2
[ 2866.132281]  [<ffffffff81a12184>] kernel_thread_helper+0x4/0x10
[ 2866.132281]  [<ffffffff81a0a4b0>] ? retint_restore_args+0x13/0x13
[ 2866.132281]  [<ffffffff8108b71e>] ? __init_kthread_worker+0x5a/0x5a
[ 2866.132281]  [<ffffffff81a12180>] ? gs_change+0x13/0x13
[ 2866.308506] IPv4: Attempt to release TCP socket in state 1 ffff880019ec0000
[ 2866.309689] =============================================================================
[ 2866.310254] BUG TCP (Not tainted): Object already free
[ 2866.310254] -----------------------------------------------------------------------------
[ 2866.310254]

The bug comes from the fact that timer set in sk_reset_timer() can run
before we actually do the sock_hold(). socket refcount reaches zero and
we free the socket too soon.

timer handler is not allowed to reduce socket refcnt if socket is owned
by the user, or we need to change sk_reset_timer() implementation.

We should take a reference on the socket in case TCP_DELACK_TIMER_DEFERRED
or TCP_DELACK_TIMER_DEFERRED bit are set in tsq_flags

Also fix a typo in tcp_delack_timer(), where TCP_WRITE_TIMER_DEFERRED
was used instead of TCP_DELACK_TIMER_DEFERRED.

For consistency, use same socket refcount change for TCP_MTU_REDUCED_DEFERRED,
even if not fired from a timer.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_output.c
net/ipv4/tcp_timer.c

index 5bf2040..00a748d 100644 (file)
@@ -417,10 +417,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 
                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        tp->mtu_info = info;
-                       if (!sock_owned_by_user(sk))
+                       if (!sock_owned_by_user(sk)) {
                                tcp_v4_mtu_reduced(sk);
-                       else
-                               set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags);
+                       } else {
+                               if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+                                       sock_hold(sk);
+                       }
                        goto out;
                }
 
index 20dfd89..d046326 100644 (file)
@@ -910,14 +910,18 @@ void tcp_release_cb(struct sock *sk)
        if (flags & (1UL << TCP_TSQ_DEFERRED))
                tcp_tsq_handler(sk);
 
-       if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED))
+       if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
                tcp_write_timer_handler(sk);
-
-       if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED))
+               __sock_put(sk);
+       }
+       if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
                tcp_delack_timer_handler(sk);
-
-       if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED))
+               __sock_put(sk);
+       }
+       if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
                sk->sk_prot->mtu_reduced(sk);
+               __sock_put(sk);
+       }
 }
 EXPORT_SYMBOL(tcp_release_cb);
 
index 6df36ad..b774a03 100644 (file)
@@ -252,7 +252,8 @@ static void tcp_delack_timer(unsigned long data)
                inet_csk(sk)->icsk_ack.blocked = 1;
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
                /* deleguate our work to tcp_release_cb() */
-               set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
+               if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+                       sock_hold(sk);
        }
        bh_unlock_sock(sk);
        sock_put(sk);
@@ -481,7 +482,8 @@ static void tcp_write_timer(unsigned long data)
                tcp_write_timer_handler(sk);
        } else {
                /* deleguate our work to tcp_release_cb() */
-               set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
+               if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+                       sock_hold(sk);
        }
        bh_unlock_sock(sk);
        sock_put(sk);