Merge tag 'tif-task_work.arch-2020-12-14' of git://git.kernel.dk/linux-block
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2135ee7..ed42d21 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -418,6 +418,8 @@ void tcp_init_sock(struct sock *sk)
        INIT_LIST_HEAD(&tp->tsorted_sent_queue);
 
        icsk->icsk_rto = TCP_TIMEOUT_INIT;
+       icsk->icsk_rto_min = TCP_RTO_MIN;
+       icsk->icsk_delack_max = TCP_DELACK_MAX;
        tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
        minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
 
@@ -483,6 +485,8 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
                        return true;
                if (tcp_rmem_pressure(sk))
                        return true;
+               if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss)
+                       return true;
        }
        if (sk->sk_prot->stream_memory_read)
                return sk->sk_prot->stream_memory_read(sk);
@@ -562,7 +566,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
                        mask |= EPOLLIN | EPOLLRDNORM;
 
                if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
-                       if (sk_stream_is_writeable(sk)) {
+                       if (__sk_stream_is_writeable(sk, 1)) {
                                mask |= EPOLLOUT | EPOLLWRNORM;
                        } else {  /* send SIGIO later */
                                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -574,7 +578,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
                                 * pairs with the input side.
                                 */
                                smp_mb__after_atomic();
-                               if (sk_stream_is_writeable(sk))
+                               if (__sk_stream_is_writeable(sk, 1))
                                        mask |= EPOLLOUT | EPOLLWRNORM;
                        }
                } else
@@ -950,7 +954,7 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
  * importantly be able to generate EPOLLOUT for Edge Trigger epoll()
  * users.
  */
-static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
+void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
 {
        if (skb && !skb->len) {
                tcp_unlink_write_queue(skb, sk);
@@ -960,6 +964,68 @@ static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
        }
 }
 
+struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
+                              struct page *page, int offset, size_t *size)
+{
+       struct sk_buff *skb = tcp_write_queue_tail(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+       bool can_coalesce;
+       int copy, i;
+
+       if (!skb || (copy = size_goal - skb->len) <= 0 ||
+           !tcp_skb_can_collapse_to(skb)) {
+new_segment:
+               if (!sk_stream_memory_free(sk))
+                       return NULL;
+
+               skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
+                                         tcp_rtx_and_write_queues_empty(sk));
+               if (!skb)
+                       return NULL;
+
+#ifdef CONFIG_TLS_DEVICE
+               skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
+               skb_entail(sk, skb);
+               copy = size_goal;
+       }
+
+       if (copy > *size)
+               copy = *size;
+
+       i = skb_shinfo(skb)->nr_frags;
+       can_coalesce = skb_can_coalesce(skb, i, page, offset);
+       if (!can_coalesce && i >= sysctl_max_skb_frags) {
+               tcp_mark_push(tp, skb);
+               goto new_segment;
+       }
+       if (!sk_wmem_schedule(sk, copy))
+               return NULL;
+
+       if (can_coalesce) {
+               skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+       } else {
+               get_page(page);
+               skb_fill_page_desc(skb, i, page, offset, copy);
+       }
+
+       if (!(flags & MSG_NO_SHARED_FRAGS))
+               skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+
+       skb->len += copy;
+       skb->data_len += copy;
+       skb->truesize += copy;
+       sk_wmem_queued_add(sk, copy);
+       sk_mem_charge(sk, copy);
+       skb->ip_summed = CHECKSUM_PARTIAL;
+       WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
+       TCP_SKB_CB(skb)->end_seq += copy;
+       tcp_skb_pcount_set(skb, 0);
+
+       *size = copy;
+       return skb;
+}
+
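
tcp_build_frag() now owns the per-segment work that used to sit inline in do_tcp_sendpages(): it appends up to size_goal bytes of the given page to the tail skb (allocating and queueing a fresh one when the tail cannot be collapsed into), updates *size to the bytes actually charged, and returns NULL when send-buffer memory is unavailable. A minimal caller sketch, mirroring the rewritten loop below (bytes_left and wait_for_space are placeholder names):

        size_t copy = bytes_left;       /* in: bytes we would like to append */
        struct sk_buff *skb;

        skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
        if (!skb)
                goto wait_for_space;    /* no memory or no sndbuf space */
        /* on success, 'copy' holds the bytes actually appended to 'skb' */
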
 ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
                         size_t size, int flags)
 {
@@ -995,59 +1061,12 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
                goto out_err;
 
        while (size > 0) {
-               struct sk_buff *skb = tcp_write_queue_tail(sk);
-               int copy, i;
-               bool can_coalesce;
-
-               if (!skb || (copy = size_goal - skb->len) <= 0 ||
-                   !tcp_skb_can_collapse_to(skb)) {
-new_segment:
-                       if (!sk_stream_memory_free(sk))
-                               goto wait_for_sndbuf;
-
-                       skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
-                                       tcp_rtx_and_write_queues_empty(sk));
-                       if (!skb)
-                               goto wait_for_memory;
-
-#ifdef CONFIG_TLS_DEVICE
-                       skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
-#endif
-                       skb_entail(sk, skb);
-                       copy = size_goal;
-               }
-
-               if (copy > size)
-                       copy = size;
-
-               i = skb_shinfo(skb)->nr_frags;
-               can_coalesce = skb_can_coalesce(skb, i, page, offset);
-               if (!can_coalesce && i >= sysctl_max_skb_frags) {
-                       tcp_mark_push(tp, skb);
-                       goto new_segment;
-               }
-               if (!sk_wmem_schedule(sk, copy))
-                       goto wait_for_memory;
-
-               if (can_coalesce) {
-                       skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
-               } else {
-                       get_page(page);
-                       skb_fill_page_desc(skb, i, page, offset, copy);
-               }
+               struct sk_buff *skb;
+               size_t copy = size;
 
-               if (!(flags & MSG_NO_SHARED_FRAGS))
-                       skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
-
-               skb->len += copy;
-               skb->data_len += copy;
-               skb->truesize += copy;
-               sk_wmem_queued_add(sk, copy);
-               sk_mem_charge(sk, copy);
-               skb->ip_summed = CHECKSUM_PARTIAL;
-               WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
-               TCP_SKB_CB(skb)->end_seq += copy;
-               tcp_skb_pcount_set(skb, 0);
+               skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
+               if (!skb)
+                       goto wait_for_space;
 
                if (!copied)
                        TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
@@ -1068,9 +1087,8 @@ new_segment:
                        tcp_push_one(sk, mss_now);
                continue;
 
-wait_for_sndbuf:
+wait_for_space:
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
                tcp_push(sk, flags & ~MSG_MORE, mss_now,
                         TCP_NAGLE_PUSH, size_goal);
 
@@ -1281,7 +1299,7 @@ restart:
 
 new_segment:
                        if (!sk_stream_memory_free(sk))
-                               goto wait_for_sndbuf;
+                               goto wait_for_space;
 
                        if (unlikely(process_backlog >= 16)) {
                                process_backlog = 0;
@@ -1292,7 +1310,7 @@ new_segment:
                        skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
                                                  first_skb);
                        if (!skb)
-                               goto wait_for_memory;
+                               goto wait_for_space;
 
                        process_backlog++;
                        skb->ip_summed = CHECKSUM_PARTIAL;
@@ -1325,7 +1343,7 @@ new_segment:
                        struct page_frag *pfrag = sk_page_frag(sk);
 
                        if (!sk_page_frag_refill(sk, pfrag))
-                               goto wait_for_memory;
+                               goto wait_for_space;
 
                        if (!skb_can_coalesce(skb, i, pfrag->page,
                                              pfrag->offset)) {
@@ -1339,7 +1357,7 @@ new_segment:
                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
 
                        if (!sk_wmem_schedule(sk, copy))
-                               goto wait_for_memory;
+                               goto wait_for_space;
 
                        err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
                                                       pfrag->page,
@@ -1392,9 +1410,8 @@ new_segment:
                        tcp_push_one(sk, mss_now);
                continue;
 
-wait_for_sndbuf:
+wait_for_space:
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
                if (copied)
                        tcp_push(sk, flags & ~MSG_MORE, mss_now,
                                 TCP_NAGLE_PUSH, size_goal);
@@ -1526,7 +1543,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
  * calculation of whether or not we must ACK for the sake of
  * a window update.
  */
-static void tcp_cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        bool time_to_ack = false;
@@ -1539,10 +1556,8 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
 
        if (inet_csk_ack_scheduled(sk)) {
                const struct inet_connection_sock *icsk = inet_csk(sk);
-                  /* Delayed ACKs frequently hit locked sockets during bulk
-                   * receive. */
-               if (icsk->icsk_ack.blocked ||
-                   /* Once-per-two-segments ACK was not sent by tcp_input.c */
+
+               if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
                    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
                    /*
                     * If this read emptied read buffer, we send ACK, if
@@ -1743,52 +1758,272 @@ int tcp_mmap(struct file *file, struct socket *sock,
 }
 EXPORT_SYMBOL(tcp_mmap);
 
+static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
+                                      u32 *offset_frag)
+{
+       skb_frag_t *frag;
+
+       offset_skb -= skb_headlen(skb);
+       if ((int)offset_skb < 0 || skb_has_frag_list(skb))
+               return NULL;
+
+       frag = skb_shinfo(skb)->frags;
+       while (offset_skb) {
+               if (skb_frag_size(frag) > offset_skb) {
+                       *offset_frag = offset_skb;
+                       return frag;
+               }
+               offset_skb -= skb_frag_size(frag);
+               ++frag;
+       }
+       *offset_frag = 0;
+       return frag;
+}
+
+static bool can_map_frag(const skb_frag_t *frag)
+{
+       return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag);
+}
+
+static int find_next_mappable_frag(const skb_frag_t *frag,
+                                  int remaining_in_skb)
+{
+       int offset = 0;
+
+       if (likely(can_map_frag(frag)))
+               return 0;
+
+       while (offset < remaining_in_skb && !can_map_frag(frag)) {
+               offset += skb_frag_size(frag);
+               ++frag;
+       }
+       return offset;
+}
+
+static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
+                                         struct tcp_zerocopy_receive *zc,
+                                         struct sk_buff *skb, u32 offset)
+{
+       u32 frag_offset, partial_frag_remainder = 0;
+       int mappable_offset;
+       skb_frag_t *frag;
+
+       /* worst case: skip to next skb. try to improve on this case below */
+       zc->recv_skip_hint = skb->len - offset;
+
+       /* Find the frag containing this offset (and how far into that frag) */
+       frag = skb_advance_to_frag(skb, offset, &frag_offset);
+       if (!frag)
+               return;
+
+       if (frag_offset) {
+               struct skb_shared_info *info = skb_shinfo(skb);
+
+               /* We read part of the last frag, must recvmsg() rest of skb. */
+               if (frag == &info->frags[info->nr_frags - 1])
+                       return;
+
+               /* Else, we must at least read the remainder in this frag. */
+               partial_frag_remainder = skb_frag_size(frag) - frag_offset;
+               zc->recv_skip_hint -= partial_frag_remainder;
+               ++frag;
+       }
+
+       /* partial_frag_remainder: If part way through a frag, must read rest.
+        * mappable_offset: Bytes till next mappable frag, *not* counting bytes
+        * in partial_frag_remainder.
+        */
+       mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
+       zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
+}
+
+static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
+                             int nonblock, int flags,
+                             struct scm_timestamping_internal *tss,
+                             int *cmsg_flags);
+static int receive_fallback_to_copy(struct sock *sk,
+                                   struct tcp_zerocopy_receive *zc, int inq)
+{
+       unsigned long copy_address = (unsigned long)zc->copybuf_address;
+       struct scm_timestamping_internal tss_unused;
+       int err, cmsg_flags_unused;
+       struct msghdr msg = {};
+       struct iovec iov;
+
+       zc->length = 0;
+       zc->recv_skip_hint = 0;
+
+       if (copy_address != zc->copybuf_address)
+               return -EINVAL;
+
+       err = import_single_range(READ, (void __user *)copy_address,
+                                 inq, &iov, &msg.msg_iter);
+       if (err)
+               return err;
+
+       err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0,
+                                &tss_unused, &cmsg_flags_unused);
+       if (err < 0)
+               return err;
+
+       zc->copybuf_len = err;
+       if (likely(zc->copybuf_len)) {
+               struct sk_buff *skb;
+               u32 offset;
+
+               skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
+               if (skb)
+                       tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
+       }
+       return 0;
+}
+
+static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
+                                  struct sk_buff *skb, u32 copylen,
+                                  u32 *offset, u32 *seq)
+{
+       unsigned long copy_address = (unsigned long)zc->copybuf_address;
+       struct msghdr msg = {};
+       struct iovec iov;
+       int err;
+
+       if (copy_address != zc->copybuf_address)
+               return -EINVAL;
+
+       err = import_single_range(READ, (void __user *)copy_address,
+                                 copylen, &iov, &msg.msg_iter);
+       if (err)
+               return err;
+       err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
+       if (err)
+               return err;
+       zc->recv_skip_hint -= copylen;
+       *offset += copylen;
+       *seq += copylen;
+       return (__s32)copylen;
+}
+
+static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
+                                            struct sock *sk,
+                                            struct sk_buff *skb,
+                                            u32 *seq,
+                                            s32 copybuf_len)
+{
+       u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
+
+       if (!copylen)
+               return 0;
+       /* skb is null if inq < PAGE_SIZE. */
+       if (skb)
+               offset = *seq - TCP_SKB_CB(skb)->seq;
+       else
+               skb = tcp_recv_skb(sk, *seq, &offset);
+
+       zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
+                                                 seq);
+       return zc->copybuf_len < 0 ? 0 : copylen;
+}
+
+static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
+                                             struct page **pending_pages,
+                                             unsigned long pages_remaining,
+                                             unsigned long *address,
+                                             u32 *length,
+                                             u32 *seq,
+                                             struct tcp_zerocopy_receive *zc,
+                                             u32 total_bytes_to_map,
+                                             int err)
+{
+       /* At least one page did not map. Try zapping if we skipped earlier. */
+       if (err == -EBUSY &&
+           zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
+               u32 maybe_zap_len;
+
+               maybe_zap_len = total_bytes_to_map -  /* All bytes to map */
+                               *length + /* Mapped or pending */
+                               (pages_remaining * PAGE_SIZE); /* Failed map. */
+               zap_page_range(vma, *address, maybe_zap_len);
+               err = 0;
+       }
+
+       if (!err) {
+               unsigned long leftover_pages = pages_remaining;
+               int bytes_mapped;
+
+               /* We called zap_page_range, try to reinsert. */
+               err = vm_insert_pages(vma, *address,
+                                     pending_pages,
+                                     &pages_remaining);
+               bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
+               *seq += bytes_mapped;
+               *address += bytes_mapped;
+       }
+       if (err) {
+               /* Either we were unable to zap, OR we zapped, retried an
+                * insert, and still had an issue. Either way, pages_remaining
+                * is the number of pages we were unable to map, and we unroll
+                * some state we speculatively touched before.
+                */
+               const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
+
+               *length -= bytes_not_mapped;
+               zc->recv_skip_hint += bytes_not_mapped;
+       }
+       return err;
+}
+
 static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
                                        struct page **pages,
-                                       unsigned long pages_to_map,
-                                       unsigned long *insert_addr,
-                                       u32 *length_with_pending,
+                                       unsigned int pages_to_map,
+                                       unsigned long *address,
+                                       u32 *length,
                                        u32 *seq,
-                                       struct tcp_zerocopy_receive *zc)
+                                       struct tcp_zerocopy_receive *zc,
+                                       u32 total_bytes_to_map)
 {
        unsigned long pages_remaining = pages_to_map;
-       int bytes_mapped;
-       int ret;
+       unsigned int pages_mapped;
+       unsigned int bytes_mapped;
+       int err;
 
-       ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
-       bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
+       err = vm_insert_pages(vma, *address, pages, &pages_remaining);
+       pages_mapped = pages_to_map - (unsigned int)pages_remaining;
+       bytes_mapped = PAGE_SIZE * pages_mapped;
        /* Even if vm_insert_pages fails, it may have partially succeeded in
         * mapping (some but not all of the pages).
         */
        *seq += bytes_mapped;
-       *insert_addr += bytes_mapped;
-       if (ret) {
-               /* But if vm_insert_pages did fail, we have to unroll some state
-                * we speculatively touched before.
-                */
-               const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
-               *length_with_pending -= bytes_not_mapped;
-               zc->recv_skip_hint += bytes_not_mapped;
-       }
-       return ret;
+       *address += bytes_mapped;
+
+       if (likely(!err))
+               return 0;
+
+       /* Error: maybe zap and retry + rollback state for failed inserts. */
+       return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
+               pages_remaining, address, length, seq, zc, total_bytes_to_map,
+               err);
 }
 
+#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
 static int tcp_zerocopy_receive(struct sock *sk,
                                struct tcp_zerocopy_receive *zc)
 {
+       u32 length = 0, offset, vma_len, avail_len, copylen = 0;
        unsigned long address = (unsigned long)zc->address;
-       u32 length = 0, seq, offset, zap_len;
-       #define PAGE_BATCH_SIZE 8
-       struct page *pages[PAGE_BATCH_SIZE];
+       struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
+       s32 copybuf_len = zc->copybuf_len;
+       struct tcp_sock *tp = tcp_sk(sk);
        const skb_frag_t *frags = NULL;
+       unsigned int pages_to_map = 0;
        struct vm_area_struct *vma;
        struct sk_buff *skb = NULL;
-       unsigned long pg_idx = 0;
-       unsigned long curr_addr;
-       struct tcp_sock *tp;
-       int inq;
+       u32 seq = tp->copied_seq;
+       u32 total_bytes_to_map;
+       int inq = tcp_inq(sk);
        int ret;
 
+       zc->copybuf_len = 0;
+
        if (address & (PAGE_SIZE - 1) || address != zc->address)
                return -EINVAL;
 
@@ -1797,7 +2032,16 @@ static int tcp_zerocopy_receive(struct sock *sk,
 
        sock_rps_record_flow(sk);
 
-       tp = tcp_sk(sk);
+       if (inq && inq <= copybuf_len)
+               return receive_fallback_to_copy(sk, zc, inq);
+
+       if (inq < PAGE_SIZE) {
+               zc->length = 0;
+               zc->recv_skip_hint = inq;
+               if (!inq && sock_flag(sk, SOCK_DONE))
+                       return -EIO;
+               return 0;
+       }
 
        mmap_read_lock(current->mm);
 
@@ -1806,33 +2050,26 @@ static int tcp_zerocopy_receive(struct sock *sk,
                mmap_read_unlock(current->mm);
                return -EINVAL;
        }
-       zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
-
-       seq = tp->copied_seq;
-       inq = tcp_inq(sk);
-       zc->length = min_t(u32, zc->length, inq);
-       zap_len = zc->length & ~(PAGE_SIZE - 1);
-       if (zap_len) {
-               zap_page_range(vma, address, zap_len);
+       vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
+       avail_len = min_t(u32, vma_len, inq);
+       total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
+       if (total_bytes_to_map) {
+               if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
+                       zap_page_range(vma, address, total_bytes_to_map);
+               zc->length = total_bytes_to_map;
                zc->recv_skip_hint = 0;
        } else {
-               zc->recv_skip_hint = zc->length;
+               zc->length = avail_len;
+               zc->recv_skip_hint = avail_len;
        }
        ret = 0;
-       curr_addr = address;
        while (length + PAGE_SIZE <= zc->length) {
+               int mappable_offset;
+               struct page *page;
+
                if (zc->recv_skip_hint < PAGE_SIZE) {
-                       /* If we're here, finish the current batch. */
-                       if (pg_idx) {
-                               ret = tcp_zerocopy_vm_insert_batch(vma, pages,
-                                                                  pg_idx,
-                                                                  &curr_addr,
-                                                                  &length,
-                                                                  &seq, zc);
-                               if (ret)
-                                       goto out;
-                               pg_idx = 0;
-                       }
+                       u32 offset_frag;
+
                        if (skb) {
                                if (zc->recv_skip_hint > 0)
                                        break;
@@ -1842,56 +2079,57 @@ static int tcp_zerocopy_receive(struct sock *sk,
                                skb = tcp_recv_skb(sk, seq, &offset);
                        }
                        zc->recv_skip_hint = skb->len - offset;
-                       offset -= skb_headlen(skb);
-                       if ((int)offset < 0 || skb_has_frag_list(skb))
+                       frags = skb_advance_to_frag(skb, offset, &offset_frag);
+                       if (!frags || offset_frag)
                                break;
-                       frags = skb_shinfo(skb)->frags;
-                       while (offset) {
-                               if (skb_frag_size(frags) > offset)
-                                       goto out;
-                               offset -= skb_frag_size(frags);
-                               frags++;
-                       }
                }
-               if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
-                       int remaining = zc->recv_skip_hint;
 
-                       while (remaining && (skb_frag_size(frags) != PAGE_SIZE ||
-                                            skb_frag_off(frags))) {
-                               remaining -= skb_frag_size(frags);
-                               frags++;
-                       }
-                       zc->recv_skip_hint -= remaining;
+               mappable_offset = find_next_mappable_frag(frags,
+                                                         zc->recv_skip_hint);
+               if (mappable_offset) {
+                       zc->recv_skip_hint = mappable_offset;
                        break;
                }
-               pages[pg_idx] = skb_frag_page(frags);
-               pg_idx++;
+               page = skb_frag_page(frags);
+               prefetchw(page);
+               pages[pages_to_map++] = page;
                length += PAGE_SIZE;
                zc->recv_skip_hint -= PAGE_SIZE;
                frags++;
-               if (pg_idx == PAGE_BATCH_SIZE) {
-                       ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
-                                                          &curr_addr, &length,
-                                                          &seq, zc);
+               if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
+                   zc->recv_skip_hint < PAGE_SIZE) {
+                       /* Either full batch, or we're about to go to next skb
+                        * (and we cannot unroll failed ops across skbs).
+                        */
+                       ret = tcp_zerocopy_vm_insert_batch(vma, pages,
+                                                          pages_to_map,
+                                                          &address, &length,
+                                                          &seq, zc,
+                                                          total_bytes_to_map);
                        if (ret)
                                goto out;
-                       pg_idx = 0;
+                       pages_to_map = 0;
                }
        }
-       if (pg_idx) {
-               ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
-                                                  &curr_addr, &length, &seq,
-                                                  zc);
+       if (pages_to_map) {
+               ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
+                                                  &address, &length, &seq,
+                                                  zc, total_bytes_to_map);
        }
 out:
        mmap_read_unlock(current->mm);
-       if (length) {
+       /* Try to copy straggler data. */
+       if (!ret)
+               copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq,
+                                                           copybuf_len);
+
+       if (length + copylen) {
                WRITE_ONCE(tp->copied_seq, seq);
                tcp_rcv_space_adjust(sk);
 
                /* Clean up data we have read: This will do ACK frames. */
                tcp_recv_skb(sk, seq, &offset);
-               tcp_cleanup_rbuf(sk, length);
+               tcp_cleanup_rbuf(sk, length + copylen);
                ret = 0;
                if (length == zc->length)
                        zc->recv_skip_hint = 0;
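
Together with the copybuf handling above, a single TCP_ZEROCOPY_RECEIVE call can now both map whole pages into a previously mmap()ed region and copy the sub-page remainder into a user buffer. A rough userspace sketch, assuming uapi headers that already carry the copybuf_address/copybuf_len fields used here (zc_receive, map and copybuf are illustrative names):

        #include <string.h>
        #include <sys/socket.h>
        #include <netinet/in.h>
        #include <linux/tcp.h>  /* TCP_ZEROCOPY_RECEIVE, struct tcp_zerocopy_receive */

        /* One receive step: map what we can, let the small tail be copied. */
        static int zc_receive(int fd, void *map, size_t map_len,
                              char *copybuf, socklen_t copybuf_sz)
        {
                struct tcp_zerocopy_receive zc;
                socklen_t zc_len = sizeof(zc);

                memset(&zc, 0, sizeof(zc));
                zc.address = (__u64)(unsigned long)map; /* page-aligned mmap() of the socket */
                zc.length = map_len;                    /* bytes we are willing to map */
                zc.copybuf_address = (__u64)(unsigned long)copybuf;
                zc.copybuf_len = copybuf_sz;            /* small reads fall back to a copy */

                if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len))
                        return -1;
                /* zc.length: bytes mapped at 'map'; zc.copybuf_len: bytes copied
                 * into 'copybuf'; zc.recv_skip_hint: bytes left for plain recvmsg().
                 */
                return 0;
        }
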
@@ -2013,36 +2251,28 @@ static int tcp_inq_hint(struct sock *sk)
  *     Probably, code can be easily improved even more.
  */
 
-int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
-               int flags, int *addr_len)
+static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
+                             int nonblock, int flags,
+                             struct scm_timestamping_internal *tss,
+                             int *cmsg_flags)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        int copied = 0;
        u32 peek_seq;
        u32 *seq;
        unsigned long used;
-       int err, inq;
+       int err;
        int target;             /* Read at least this many bytes */
        long timeo;
        struct sk_buff *skb, *last;
        u32 urg_hole = 0;
-       struct scm_timestamping_internal tss;
-       int cmsg_flags;
-
-       if (unlikely(flags & MSG_ERRQUEUE))
-               return inet_recv_error(sk, msg, len, addr_len);
-
-       if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
-           (sk->sk_state == TCP_ESTABLISHED))
-               sk_busy_loop(sk, nonblock);
-
-       lock_sock(sk);
 
        err = -ENOTCONN;
        if (sk->sk_state == TCP_LISTEN)
                goto out;
 
-       cmsg_flags = tp->recvmsg_inq ? 1 : 0;
+       if (tp->recvmsg_inq)
+               *cmsg_flags = 1;
        timeo = sock_rcvtimeo(sk, nonblock);
 
        /* Urgent data needs to be handled specially. */
@@ -2222,8 +2452,8 @@ skip_copy:
                }
 
                if (TCP_SKB_CB(skb)->has_rxtstamp) {
-                       tcp_update_recv_tstamps(skb, &tss);
-                       cmsg_flags |= 2;
+                       tcp_update_recv_tstamps(skb, tss);
+                       *cmsg_flags |= 2;
                }
 
                if (used + offset < skb->len)
@@ -2249,22 +2479,9 @@ found_fin_ok:
 
        /* Clean up data we have read: This will do ACK frames. */
        tcp_cleanup_rbuf(sk, copied);
-
-       release_sock(sk);
-
-       if (cmsg_flags) {
-               if (cmsg_flags & 2)
-                       tcp_recv_timestamp(msg, sk, &tss);
-               if (cmsg_flags & 1) {
-                       inq = tcp_inq_hint(sk);
-                       put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
-               }
-       }
-
        return copied;
 
 out:
-       release_sock(sk);
        return err;
 
 recv_urg:
@@ -2275,6 +2492,36 @@ recv_sndq:
        err = tcp_peek_sndq(sk, msg, len);
        goto out;
 }
+
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
+               int flags, int *addr_len)
+{
+       int cmsg_flags = 0, ret, inq;
+       struct scm_timestamping_internal tss;
+
+       if (unlikely(flags & MSG_ERRQUEUE))
+               return inet_recv_error(sk, msg, len, addr_len);
+
+       if (sk_can_busy_loop(sk) &&
+           skb_queue_empty_lockless(&sk->sk_receive_queue) &&
+           sk->sk_state == TCP_ESTABLISHED)
+               sk_busy_loop(sk, nonblock);
+
+       lock_sock(sk);
+       ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
+                                &cmsg_flags);
+       release_sock(sk);
+
+       if (cmsg_flags && ret >= 0) {
+               if (cmsg_flags & 2)
+                       tcp_recv_timestamp(msg, sk, &tss);
+               if (cmsg_flags & 1) {
+                       inq = tcp_inq_hint(sk);
+                       put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+               }
+       }
+       return ret;
+}
 EXPORT_SYMBOL(tcp_recvmsg);
 
 void tcp_set_state(struct sock *sk, int state)
@@ -2405,13 +2652,12 @@ bool tcp_check_oom(struct sock *sk, int shift)
        return too_many_orphans || out_of_socket_memory;
 }
 
-void tcp_close(struct sock *sk, long timeout)
+void __tcp_close(struct sock *sk, long timeout)
 {
        struct sk_buff *skb;
        int data_was_unread = 0;
        int state;
 
-       lock_sock(sk);
        sk->sk_shutdown = SHUTDOWN_MASK;
 
        if (sk->sk_state == TCP_LISTEN) {
@@ -2575,6 +2821,12 @@ adjudge_to_death:
 out:
        bh_unlock_sock(sk);
        local_bh_enable();
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+       lock_sock(sk);
+       __tcp_close(sk, timeout);
        release_sock(sk);
        sock_put(sk);
 }
@@ -2686,6 +2938,8 @@ int tcp_disconnect(struct sock *sk, int flags)
        icsk->icsk_backoff = 0;
        icsk->icsk_probes_out = 0;
        icsk->icsk_rto = TCP_TIMEOUT_INIT;
+       icsk->icsk_rto_min = TCP_RTO_MIN;
+       icsk->icsk_delack_max = TCP_DELACK_MAX;
        tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
        tp->snd_cwnd = TCP_INIT_CWND;
        tp->snd_cwnd_cnt = 0;
@@ -2695,6 +2949,7 @@ int tcp_disconnect(struct sock *sk, int flags)
        if (icsk->icsk_ca_ops->release)
                icsk->icsk_ca_ops->release(sk);
        memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+       icsk->icsk_ca_initialized = 0;
        tcp_set_ca_state(sk, TCP_CA_Open);
        tp->is_sack_reneg = 0;
        tcp_clear_retrans(tp);
@@ -3019,6 +3274,21 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_sock_set_keepcnt);
 
+int tcp_set_window_clamp(struct sock *sk, int val)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       if (!val) {
+               if (sk->sk_state != TCP_CLOSE)
+                       return -EINVAL;
+               tp->window_clamp = 0;
+       } else {
+               tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
+                       SOCK_MIN_RCVBUF / 2 : val;
+       }
+       return 0;
+}
+
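
tcp_set_window_clamp() factors the existing TCP_WINDOW_CLAMP rules out of do_tcp_setsockopt() so in-kernel callers can apply them too; the userspace-visible behaviour is unchanged. For reference, a sketch of the option from an application's side (fd is a placeholder; needs <sys/socket.h>, <netinet/in.h> and <netinet/tcp.h>):

        int clamp = 64 * 1024;

        /* Non-zero values below SOCK_MIN_RCVBUF / 2 are raised to that floor;
         * 0 is only accepted while the socket is in TCP_CLOSE.
         */
        if (setsockopt(fd, IPPROTO_TCP, TCP_WINDOW_CLAMP, &clamp, sizeof(clamp)))
                /* handle error */;
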
 /*
  *     Socket option code for TCP.
  */
@@ -3046,7 +3316,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
                name[val] = 0;
 
                lock_sock(sk);
-               err = tcp_set_congestion_control(sk, name, true, true,
+               err = tcp_set_congestion_control(sk, name, true,
                                                 ns_capable(sock_net(sk)->user_ns,
                                                            CAP_NET_ADMIN));
                release_sock(sk);
@@ -3208,7 +3478,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
                break;
 
        case TCP_SAVE_SYN:
-               if (val < 0 || val > 1)
+               /* 0: disable, 1: enable, 2: start from ether_header */
+               if (val < 0 || val > 2)
                        err = -EINVAL;
                else
                        tp->save_syn = val;
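
Value 2 makes the saved SYN start at the link-layer (ether) header instead of the network header. Roughly, from an application (listen_fd and conn_fd are placeholders; TCP_SAVE_SYN is set on the listener, TCP_SAVED_SYN is read back from the accepted socket):

        int save = 2;           /* 2: keep the SYN starting at the ether header */
        char syn[512];
        socklen_t len = sizeof(syn);

        setsockopt(listen_fd, IPPROTO_TCP, TCP_SAVE_SYN, &save, sizeof(save));
        /* ... listen(), accept() -> conn_fd ... */
        if (!getsockopt(conn_fd, IPPROTO_TCP, TCP_SAVED_SYN, syn, &len))
                /* 'syn' now holds 'len' bytes of the received SYN's headers */;
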
@@ -3231,15 +3502,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
                break;
 
        case TCP_WINDOW_CLAMP:
-               if (!val) {
-                       if (sk->sk_state != TCP_CLOSE) {
-                               err = -EINVAL;
-                               break;
-                       }
-                       tp->window_clamp = 0;
-               } else
-                       tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
-                                               SOCK_MIN_RCVBUF / 2 : val;
+               err = tcp_set_window_clamp(sk, val);
                break;
 
        case TCP_QUICKACK:
@@ -3789,20 +4052,21 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 
                lock_sock(sk);
                if (tp->saved_syn) {
-                       if (len < tp->saved_syn[0]) {
-                               if (put_user(tp->saved_syn[0], optlen)) {
+                       if (len < tcp_saved_syn_len(tp->saved_syn)) {
+                               if (put_user(tcp_saved_syn_len(tp->saved_syn),
+                                            optlen)) {
                                        release_sock(sk);
                                        return -EFAULT;
                                }
                                release_sock(sk);
                                return -EINVAL;
                        }
-                       len = tp->saved_syn[0];
+                       len = tcp_saved_syn_len(tp->saved_syn);
                        if (put_user(len, optlen)) {
                                release_sock(sk);
                                return -EFAULT;
                        }
-                       if (copy_to_user(optval, tp->saved_syn + 1, len)) {
+                       if (copy_to_user(optval, tp->saved_syn->data, len)) {
                                release_sock(sk);
                                return -EFAULT;
                        }
@@ -3818,7 +4082,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
        }
 #ifdef CONFIG_MMU
        case TCP_ZEROCOPY_RECEIVE: {
-               struct tcp_zerocopy_receive zc;
+               struct tcp_zerocopy_receive zc = {};
                int err;
 
                if (get_user(len, optlen))
@@ -3835,7 +4099,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
                lock_sock(sk);
                err = tcp_zerocopy_receive(sk, &zc);
                release_sock(sk);
-               if (len == sizeof(zc))
+               if (len >= offsetofend(struct tcp_zerocopy_receive, err))
                        goto zerocopy_rcv_sk_err;
                switch (len) {
                case offsetofend(struct tcp_zerocopy_receive, err):