OpenCores
URL: https://opencores.org/ocsvn/or1k_soc_on_altera_embedded_dev_kit/or1k_soc_on_altera_embedded_dev_kit/trunk

Subversion repository: or1k_soc_on_altera_embedded_dev_kit
File: or1k_soc_on_altera_embedded_dev_kit/tags/linux-2.6/linux-2.6.24_or32_unified_v2.3/net/ipv4/tcp_output.c (rev 8)


/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:     Pedro Roque     :       Retransmit queue handled by TCP.
 *                              :       Fragmentation on mtu decrease
 *                              :       Segment collapse on retransmit
 *                              :       AF independence
 *
 *              Linus Torvalds  :       send_delayed_ack
 *              David S. Miller :       Charge memory using the right skb
 *                                      during syn/ack processing.
 *              David S. Miller :       Output engine completely rewritten.
 *              Andrea Arcangeli:       SYNACK carry ts_recent in tsecr.
 *              Cacophonix Gaul :       draft-minshall-nagle-01
 *              J Hadi Salim    :       ECN support
 *
 */

#include <net/tcp.h>

#include <linux/compiler.h>
#include <linux/module.h>

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse __read_mostly = 1;

/* People can turn this on to  work with those rare, broken TCPs that
 * interpret the window field as a signed quantity.
 */
int sysctl_tcp_workaround_signed_windows __read_mostly = 0;

/* This limits the percentage of the congestion window which we
 * will allow a single TSO frame to consume.  Building TSO frames
 * which are too large can cause TCP streams to be bursty.
 */
int sysctl_tcp_tso_win_divisor __read_mostly = 3;

int sysctl_tcp_mtu_probing __read_mostly = 0;
int sysctl_tcp_base_mss __read_mostly = 512;

/* By default, RFC2861 behavior.  */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;

static inline void tcp_packets_out_inc(struct sock *sk,
                                       const struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int orig = tp->packets_out;

        tp->packets_out += tcp_skb_pcount(skb);
        if (!orig)
                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                          inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}

static void update_send_head(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);

        tcp_advance_send_head(sk, skb);
        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
        tcp_packets_out_inc(sk, skb);
}

/* SND.NXT, if window was not shrunk.
 * If window has been shrunk, what should we make? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static inline __u32 tcp_acceptable_seq(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
                return tp->snd_nxt;
        else
                return tp->snd_una+tp->snd_wnd;
}

/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MSS.
 * 4. We do not make 3, we advertise MSS, calculated from first
 *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
 *    This may be overridden via information stored in routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct dst_entry *dst = __sk_dst_get(sk);
        int mss = tp->advmss;

        if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
                mss = dst_metric(dst, RTAX_ADVMSS);
                tp->advmss = mss;
        }

        return (__u16)mss;
}

/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
 * This is the first part of cwnd validation mechanism. */
static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
{
        struct tcp_sock *tp = tcp_sk(sk);
        s32 delta = tcp_time_stamp - tp->lsndtime;
        u32 restart_cwnd = tcp_init_cwnd(tp, dst);
        u32 cwnd = tp->snd_cwnd;

        tcp_ca_event(sk, CA_EVENT_CWND_RESTART);

        tp->snd_ssthresh = tcp_current_ssthresh(sk);
        restart_cwnd = min(restart_cwnd, cwnd);

        while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
                cwnd >>= 1;
        tp->snd_cwnd = max(cwnd, restart_cwnd);
        tp->snd_cwnd_stamp = tcp_time_stamp;
        tp->snd_cwnd_used = 0;
}
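
/*
 * Editorial note (illustrative example, not part of the original source):
 * assuming an icsk_rto of 200 ms, snd_cwnd of 10 and a restart window of 3,
 * an idle gap of ~500 ms lets the loop above halve cwnd twice (10 -> 5 -> 2)
 * before the final max() clamps snd_cwnd back up to restart_cwnd = 3.
 */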

static void tcp_event_data_sent(struct tcp_sock *tp,
                                struct sk_buff *skb, struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        const u32 now = tcp_time_stamp;

        if (sysctl_tcp_slow_start_after_idle &&
            (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
                tcp_cwnd_restart(sk, __sk_dst_get(sk));

        tp->lsndtime = now;

        /* If it is a reply for ato after last received
         * packet, enter pingpong mode.
         */
        if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
                icsk->icsk_ack.pingpong = 1;
}

static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
        tcp_dec_quickack_mode(sk, pkts);
        inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}

/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered. Store the results in the tp structure.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible. We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 */
void tcp_select_initial_window(int __space, __u32 mss,
                               __u32 *rcv_wnd, __u32 *window_clamp,
                               int wscale_ok, __u8 *rcv_wscale)
{
        unsigned int space = (__space < 0 ? 0 : __space);

        /* If no clamp set the clamp to the max possible scaled window */
        if (*window_clamp == 0)
                (*window_clamp) = (65535 << 14);
        space = min(*window_clamp, space);

        /* Quantize space offering to a multiple of mss if possible. */
        if (space > mss)
                space = (space / mss) * mss;

        /* NOTE: offering an initial window larger than 32767
         * will break some buggy TCP stacks. If the admin tells us
         * it is likely we could be speaking with such a buggy stack
         * we will truncate our initial window offering to 32K-1
         * unless the remote has sent us a window scaling option,
         * which we interpret as a sign the remote TCP is not
         * misinterpreting the window field as a signed quantity.
         */
        if (sysctl_tcp_workaround_signed_windows)
                (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
        else
                (*rcv_wnd) = space;

        (*rcv_wscale) = 0;
        if (wscale_ok) {
                /* Set window scaling on max possible window
                 * See RFC1323 for an explanation of the limit to 14
                 */
                space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
                space = min_t(u32, space, *window_clamp);
                while (space > 65535 && (*rcv_wscale) < 14) {
                        space >>= 1;
                        (*rcv_wscale)++;
                }
        }

        /* Set initial window to value enough for senders,
         * following RFC2414. Senders, not following this RFC,
         * will be satisfied with 2.
         */
        if (mss > (1<<*rcv_wscale)) {
                int init_cwnd = 4;
                if (mss > 1460*3)
                        init_cwnd = 2;
                else if (mss > 1460)
                        init_cwnd = 3;
                if (*rcv_wnd > init_cwnd*mss)
                        *rcv_wnd = init_cwnd*mss;
        }

        /* Set the clamp no higher than max representable value */
        (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
}
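
/*
 * Editorial note (illustrative example, not part of the original source):
 * with a hypothetical tcp_rmem[2] of 4 MB and no tighter clamp, the scaling
 * loop above halves 4194304 seven times before it fits into 16 bits
 * (4194304 -> ... -> 32768), so rcv_wscale = 7 is offered and windows of up
 * to 65535 << 7 bytes can later be advertised through the 16-bit field.
 */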

/* Choose a new window to advertise, update state in tcp_sock for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static u16 tcp_select_window(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 cur_win = tcp_receive_window(tp);
        u32 new_win = __tcp_select_window(sk);

        /* Never shrink the offered window */
        if (new_win < cur_win) {
                /* Danger Will Robinson!
                 * Don't update rcv_wup/rcv_wnd here or else
                 * we will not be able to advertise a zero
                 * window in time.  --DaveM
                 *
                 * Relax Will Robinson.
                 */
                new_win = cur_win;
        }
        tp->rcv_wnd = new_win;
        tp->rcv_wup = tp->rcv_nxt;

        /* Make sure we do not exceed the maximum possible
         * scaled window.
         */
        if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
                new_win = min(new_win, MAX_TCP_WINDOW);
        else
                new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));

        /* RFC1323 scaling applied */
        new_win >>= tp->rx_opt.rcv_wscale;

        /* If we advertise zero window, disable fast path. */
        if (new_win == 0)
                tp->pred_flags = 0;

        return new_win;
}

static inline void TCP_ECN_send_synack(struct tcp_sock *tp,
                                       struct sk_buff *skb)
{
        TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
        if (!(tp->ecn_flags&TCP_ECN_OK))
                TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
}

static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);

        tp->ecn_flags = 0;
        if (sysctl_tcp_ecn) {
                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR;
                tp->ecn_flags = TCP_ECN_OK;
        }
}

static __inline__ void
TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th)
{
        if (inet_rsk(req)->ecn_ok)
                th->ece = 1;
}

static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
                                int tcp_header_len)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (tp->ecn_flags & TCP_ECN_OK) {
                /* Not-retransmitted data segment: set ECT and inject CWR. */
                if (skb->len != tcp_header_len &&
                    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
                        INET_ECN_xmit(sk);
                        if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) {
                                tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
                                tcp_hdr(skb)->cwr = 1;
                                skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
                        }
                } else {
                        /* ACK or retransmitted segment: clear ECT|CE */
                        INET_ECN_dontxmit(sk);
                }
                if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
                        tcp_hdr(skb)->ece = 1;
        }
}

static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp,
                                         __u32 tstamp, __u8 **md5_hash)
{
        if (tp->rx_opt.tstamp_ok) {
                *ptr++ = htonl((TCPOPT_NOP << 24) |
                               (TCPOPT_NOP << 16) |
                               (TCPOPT_TIMESTAMP << 8) |
                               TCPOLEN_TIMESTAMP);
                *ptr++ = htonl(tstamp);
                *ptr++ = htonl(tp->rx_opt.ts_recent);
        }
        if (tp->rx_opt.eff_sacks) {
                struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
                int this_sack;

                *ptr++ = htonl((TCPOPT_NOP  << 24) |
                               (TCPOPT_NOP  << 16) |
                               (TCPOPT_SACK <<  8) |
                               (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
                                                     TCPOLEN_SACK_PERBLOCK)));

                for (this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
                        *ptr++ = htonl(sp[this_sack].start_seq);
                        *ptr++ = htonl(sp[this_sack].end_seq);
                }

                if (tp->rx_opt.dsack) {
                        tp->rx_opt.dsack = 0;
                        tp->rx_opt.eff_sacks--;
                }
        }
#ifdef CONFIG_TCP_MD5SIG
        if (md5_hash) {
                *ptr++ = htonl((TCPOPT_NOP << 24) |
                               (TCPOPT_NOP << 16) |
                               (TCPOPT_MD5SIG << 8) |
                               TCPOLEN_MD5SIG);
                *md5_hash = (__u8 *)ptr;
        }
#endif
}

/* Construct a tcp options header for a SYN or SYN_ACK packet.
 * If this is ever changed make sure to change the definition of
378
 * MAX_SYN_SIZE to match the new maximum number of options that you
379
 * can generate.
380
 *
381
 * Note - that with the RFC2385 TCP option, we make room for the
382
 * 16 byte MD5 hash. This will be filled in later, so the pointer for the
383
 * location to be filled is passed back up.
384
 */
385
static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack,
386
                                  int offer_wscale, int wscale, __u32 tstamp,
387
                                  __u32 ts_recent, __u8 **md5_hash)
388
{
389
        /* We always get an MSS option.
390
         * The option bytes which will be seen in normal data
391
         * packets should timestamps be used, must be in the MSS
392
         * advertised.  But we subtract them from tp->mss_cache so
393
         * that calculations in tcp_sendmsg are simpler etc.
394
         * So account for this fact here if necessary.  If we
395
         * don't do this correctly, as a receiver we won't
396
         * recognize data packets as being full sized when we
397
         * should, and thus we won't abide by the delayed ACK
398
         * rules correctly.
399
         * SACKs don't matter, we never delay an ACK when we
400
         * have any of those going out.
401
         */
402
        *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
403
        if (ts) {
404
                if (sack)
405
                        *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
406
                                       (TCPOLEN_SACK_PERM << 16) |
407
                                       (TCPOPT_TIMESTAMP << 8) |
408
                                       TCPOLEN_TIMESTAMP);
409
                else
410
                        *ptr++ = htonl((TCPOPT_NOP << 24) |
411
                                       (TCPOPT_NOP << 16) |
412
                                       (TCPOPT_TIMESTAMP << 8) |
413
                                       TCPOLEN_TIMESTAMP);
414
                *ptr++ = htonl(tstamp);         /* TSVAL */
415
                *ptr++ = htonl(ts_recent);      /* TSECR */
416
        } else if (sack)
417
                *ptr++ = htonl((TCPOPT_NOP << 24) |
418
                               (TCPOPT_NOP << 16) |
419
                               (TCPOPT_SACK_PERM << 8) |
420
                               TCPOLEN_SACK_PERM);
421
        if (offer_wscale)
422
                *ptr++ = htonl((TCPOPT_NOP << 24) |
423
                               (TCPOPT_WINDOW << 16) |
424
                               (TCPOLEN_WINDOW << 8) |
425
                               (wscale));
426
#ifdef CONFIG_TCP_MD5SIG
427
        /*
428
         * If MD5 is enabled, then we set the option, and include the size
429
         * (always 18). The actual MD5 hash is added just before the
430
         * packet is sent.
431
         */
432
        if (md5_hash) {
433
                *ptr++ = htonl((TCPOPT_NOP << 24) |
434
                               (TCPOPT_NOP << 16) |
435
                               (TCPOPT_MD5SIG << 8) |
436
                               TCPOLEN_MD5SIG);
437
                *md5_hash = (__u8 *) ptr;
438
        }
439
#endif
440
}
441
 
442
/* This routine actually transmits TCP packets queued in by
443
 * tcp_do_sendmsg().  This is used by both the initial
444
 * transmission and possible later retransmissions.
445
 * All SKB's seen here are completely headerless.  It is our
446
 * job to build the TCP header, and pass the packet down to
447
 * IP so it can do the same plus pass the packet off to the
448
 * device.
449
 *
450
 * We are working here with either a clone of the original
451
 * SKB, or a fresh unique copy made by the retransmit engine.
452
 */
453
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
454
{
455
        const struct inet_connection_sock *icsk = inet_csk(sk);
456
        struct inet_sock *inet;
457
        struct tcp_sock *tp;
458
        struct tcp_skb_cb *tcb;
459
        int tcp_header_size;
460
#ifdef CONFIG_TCP_MD5SIG
461
        struct tcp_md5sig_key *md5;
462
        __u8 *md5_hash_location;
463
#endif
464
        struct tcphdr *th;
465
        int sysctl_flags;
466
        int err;
467
 
468
        BUG_ON(!skb || !tcp_skb_pcount(skb));
469
 
470
        /* If congestion control is doing timestamping, we must
471
         * take such a timestamp before we potentially clone/copy.
472
         */
473
        if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
474
                __net_timestamp(skb);
475
 
476
        if (likely(clone_it)) {
477
                if (unlikely(skb_cloned(skb)))
478
                        skb = pskb_copy(skb, gfp_mask);
479
                else
480
                        skb = skb_clone(skb, gfp_mask);
481
                if (unlikely(!skb))
482
                        return -ENOBUFS;
483
        }
484
 
485
        inet = inet_sk(sk);
486
        tp = tcp_sk(sk);
487
        tcb = TCP_SKB_CB(skb);
488
        tcp_header_size = tp->tcp_header_len;
489
 
490
#define SYSCTL_FLAG_TSTAMPS     0x1
491
#define SYSCTL_FLAG_WSCALE      0x2
492
#define SYSCTL_FLAG_SACK        0x4
493
 
494
        sysctl_flags = 0;
495
        if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
496
                tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
497
                if (sysctl_tcp_timestamps) {
498
                        tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
499
                        sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
500
                }
501
                if (sysctl_tcp_window_scaling) {
502
                        tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
503
                        sysctl_flags |= SYSCTL_FLAG_WSCALE;
504
                }
505
                if (sysctl_tcp_sack) {
506
                        sysctl_flags |= SYSCTL_FLAG_SACK;
507
                        if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
508
                                tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
509
                }
510
        } else if (unlikely(tp->rx_opt.eff_sacks)) {
511
                /* A SACK is 2 pad bytes, a 2 byte header, plus
512
                 * 2 32-bit sequence numbers for each SACK block.
513
                 */
514
                tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
515
                                    (tp->rx_opt.eff_sacks *
516
                                     TCPOLEN_SACK_PERBLOCK));
517
        }
518
 
519
        if (tcp_packets_in_flight(tp) == 0)
520
                tcp_ca_event(sk, CA_EVENT_TX_START);
521
 
522
#ifdef CONFIG_TCP_MD5SIG
523
        /*
524
         * Are we doing MD5 on this segment? If so - make
525
         * room for it.
526
         */
527
        md5 = tp->af_specific->md5_lookup(sk, sk);
528
        if (md5)
529
                tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
530
#endif
531
 
532
        skb_push(skb, tcp_header_size);
533
        skb_reset_transport_header(skb);
534
        skb_set_owner_w(skb, sk);
535
 
536
        /* Build TCP header and checksum it. */
537
        th = tcp_hdr(skb);
538
        th->source              = inet->sport;
539
        th->dest                = inet->dport;
540
        th->seq                 = htonl(tcb->seq);
541
        th->ack_seq             = htonl(tp->rcv_nxt);
542
        *(((__be16 *)th) + 6)   = htons(((tcp_header_size >> 2) << 12) |
543
                                        tcb->flags);
544
 
545
        if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
546
                /* RFC1323: The window in SYN & SYN/ACK segments
547
                 * is never scaled.
548
                 */
549
                th->window      = htons(min(tp->rcv_wnd, 65535U));
550
        } else {
551
                th->window      = htons(tcp_select_window(sk));
552
        }
553
        th->check               = 0;
554
        th->urg_ptr             = 0;
555
 
556
        if (unlikely(tp->urg_mode &&
557
                     between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
558
                th->urg_ptr             = htons(tp->snd_up-tcb->seq);
559
                th->urg                 = 1;
560
        }
561
 
562
        if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
563
                tcp_syn_build_options((__be32 *)(th + 1),
564
                                      tcp_advertise_mss(sk),
565
                                      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
566
                                      (sysctl_flags & SYSCTL_FLAG_SACK),
567
                                      (sysctl_flags & SYSCTL_FLAG_WSCALE),
568
                                      tp->rx_opt.rcv_wscale,
569
                                      tcb->when,
570
                                      tp->rx_opt.ts_recent,
571
 
572
#ifdef CONFIG_TCP_MD5SIG
573
                                      md5 ? &md5_hash_location :
574
#endif
575
                                      NULL);
576
        } else {
577
                tcp_build_and_update_options((__be32 *)(th + 1),
578
                                             tp, tcb->when,
579
#ifdef CONFIG_TCP_MD5SIG
580
                                             md5 ? &md5_hash_location :
581
#endif
582
                                             NULL);
583
                TCP_ECN_send(sk, skb, tcp_header_size);
584
        }
585
 
586
#ifdef CONFIG_TCP_MD5SIG
587
        /* Calculate the MD5 hash, as we have all we need now */
588
        if (md5) {
589
                tp->af_specific->calc_md5_hash(md5_hash_location,
590
                                               md5,
591
                                               sk, NULL, NULL,
592
                                               tcp_hdr(skb),
593
                                               sk->sk_protocol,
594
                                               skb->len);
595
        }
596
#endif
597
 
598
        icsk->icsk_af_ops->send_check(sk, skb->len, skb);
599
 
600
        if (likely(tcb->flags & TCPCB_FLAG_ACK))
601
                tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
602
 
603
        if (skb->len != tcp_header_size)
604
                tcp_event_data_sent(tp, skb, sk);
605
 
606
        if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
607
                TCP_INC_STATS(TCP_MIB_OUTSEGS);
608
 
609
        err = icsk->icsk_af_ops->queue_xmit(skb, 0);
610
        if (likely(err <= 0))
611
                return err;
612
 
613
        tcp_enter_cwr(sk, 1);
614
 
615
        return net_xmit_eval(err);
616
 
617
#undef SYSCTL_FLAG_TSTAMPS
618
#undef SYSCTL_FLAG_WSCALE
619
#undef SYSCTL_FLAG_SACK
620
}
621
 
622
 
/* This routine just queues the buffer
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Advance write_seq and place onto the write_queue. */
        tp->write_seq = TCP_SKB_CB(skb)->end_seq;
        skb_header_release(skb);
        tcp_add_write_queue_tail(sk, skb);
        sk_charge_skb(sk, skb);
}

static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
{
        if (skb->len <= mss_now || !sk_can_gso(sk)) {
                /* Avoid the costly divide in the normal
                 * non-TSO case.
                 */
                skb_shinfo(skb)->gso_segs = 1;
                skb_shinfo(skb)->gso_size = 0;
                skb_shinfo(skb)->gso_type = 0;
        } else {
                skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
                skb_shinfo(skb)->gso_size = mss_now;
                skb_shinfo(skb)->gso_type = sk->sk_gso_type;
        }
}

/* When a modification to fackets out becomes necessary, we need to check
 * skb is counted to fackets_out or not. Another important thing is to
 * tweak SACK fastpath hint too as it would overwrite all changes unless
 * hint is also changed.
 */
static void tcp_adjust_fackets_out(struct tcp_sock *tp, struct sk_buff *skb,
                                   int decr)
{
        if (!tp->sacked_out || tcp_is_reno(tp))
                return;

        if (!before(tp->highest_sack, TCP_SKB_CB(skb)->seq))
                tp->fackets_out -= decr;

        /* cnt_hint is "off-by-one" compared with fackets_out (see sacktag) */
        if (tp->fastpath_skb_hint != NULL &&
            after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq, TCP_SKB_CB(skb)->seq))
                tp->fastpath_cnt_hint -= decr;
}

/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *buff;
        int nsize, old_factor;
        int nlen;
        u16 flags;

        BUG_ON(len > skb->len);

        tcp_clear_retrans_hints_partial(tp);
        nsize = skb_headlen(skb) - len;
        if (nsize < 0)
                nsize = 0;

        if (skb_cloned(skb) &&
            skb_is_nonlinear(skb) &&
            pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
                return -ENOMEM;

        /* Get a new skb... force flag on. */
        buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
        if (buff == NULL)
                return -ENOMEM; /* We'll just try again later. */

        sk_charge_skb(sk, buff);
        nlen = skb->len - len - nsize;
        buff->truesize += nlen;
        skb->truesize -= nlen;

        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

        if (tcp_is_sack(tp) && tp->sacked_out &&
            (TCP_SKB_CB(skb)->seq == tp->highest_sack))
                tp->highest_sack = TCP_SKB_CB(buff)->seq;

        /* PSH and FIN should only be set in the second packet. */
        flags = TCP_SKB_CB(skb)->flags;
        TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
        TCP_SKB_CB(buff)->flags = flags;
        TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
        TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;

        if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
                /* Copy and checksum data tail into the new buffer. */
                buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
                                                       nsize, 0);

                skb_trim(skb, len);

                skb->csum = csum_block_sub(skb->csum, buff->csum, len);
        } else {
                skb->ip_summed = CHECKSUM_PARTIAL;
                skb_split(skb, buff, len);
        }

        buff->ip_summed = skb->ip_summed;

        /* Looks stupid, but our code really uses when of
         * skbs, which it never sent before. --ANK
         */
        TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
        buff->tstamp = skb->tstamp;

        old_factor = tcp_skb_pcount(skb);

        /* Fix up tso_factor for both original and new SKB.  */
        tcp_set_skb_tso_segs(sk, skb, mss_now);
        tcp_set_skb_tso_segs(sk, buff, mss_now);

        /* If this packet has been sent out already, we must
         * adjust the various packet counters.
         */
        if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
                int diff = old_factor - tcp_skb_pcount(skb) -
                        tcp_skb_pcount(buff);

                tp->packets_out -= diff;

                if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
                        tp->sacked_out -= diff;
                if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
                        tp->retrans_out -= diff;

                if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
                        tp->lost_out -= diff;

                /* Adjust Reno SACK estimate. */
                if (tcp_is_reno(tp) && diff > 0) {
                        tcp_dec_pcount_approx_int(&tp->sacked_out, diff);
                        tcp_verify_left_out(tp);
                }
                tcp_adjust_fackets_out(tp, skb, diff);
        }

        /* Link BUFF into the send queue. */
        skb_header_release(buff);
        tcp_insert_write_queue_after(skb, buff, sk);

        return 0;
}

/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
 * eventually). The difference is that pulled data is not copied, but
 * immediately discarded.
 */
static void __pskb_trim_head(struct sk_buff *skb, int len)
{
        int i, k, eat;

        eat = len;
        k = 0;
        for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
                if (skb_shinfo(skb)->frags[i].size <= eat) {
                        put_page(skb_shinfo(skb)->frags[i].page);
                        eat -= skb_shinfo(skb)->frags[i].size;
                } else {
                        skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
                        if (eat) {
                                skb_shinfo(skb)->frags[k].page_offset += eat;
                                skb_shinfo(skb)->frags[k].size -= eat;
                                eat = 0;
                        }
                        k++;
                }
        }
        skb_shinfo(skb)->nr_frags = k;

        skb_reset_tail_pointer(skb);
        skb->data_len -= len;
        skb->len = skb->data_len;
}

int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
        if (skb_cloned(skb) &&
            pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
                return -ENOMEM;

        /* If len == headlen, we avoid __skb_pull to preserve alignment. */
        if (unlikely(len < skb_headlen(skb)))
                __skb_pull(skb, len);
        else
                __pskb_trim_head(skb, len - skb_headlen(skb));

        TCP_SKB_CB(skb)->seq += len;
        skb->ip_summed = CHECKSUM_PARTIAL;

        skb->truesize        -= len;
        sk->sk_wmem_queued   -= len;
        sk->sk_forward_alloc += len;
        sock_set_flag(sk, SOCK_QUEUE_SHRUNK);

        /* Any change of skb->len requires recalculation of tso
         * factor and mss.
         */
        if (tcp_skb_pcount(skb) > 1)
                tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));

        return 0;
}

/* Not accounting for SACKs here. */
int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        int mss_now;

        /* Calculate base mss without TCP options:
           It is MMS_S - sizeof(tcphdr) of rfc1122
         */
        mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);

        /* Clamp it (mss_clamp does not include tcp options) */
        if (mss_now > tp->rx_opt.mss_clamp)
                mss_now = tp->rx_opt.mss_clamp;

        /* Now subtract optional transport overhead */
        mss_now -= icsk->icsk_ext_hdr_len;

        /* Then reserve room for full set of TCP options and 8 bytes of data */
        if (mss_now < 48)
                mss_now = 48;

        /* Now subtract TCP options size, not including SACKs */
        mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

        return mss_now;
}
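
/*
 * Editorial note (illustrative example, not part of the original source):
 * for an IPv4 path MTU of 1500 with no IP options, the arithmetic above
 * gives 1500 - 20 - 20 = 1460; if timestamps are in use, tcp_header_len is
 * 20 + 12, so the final subtraction brings mss_now down to 1448.
 */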

/* Inverse of above */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        int mtu;

        mtu = mss +
              tp->tcp_header_len +
              icsk->icsk_ext_hdr_len +
              icsk->icsk_af_ops->net_header_len;

        return mtu;
}

void tcp_mtup_init(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);

        icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
        icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
                               icsk->icsk_af_ops->net_header_len;
        icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
        icsk->icsk_mtup.probe_size = 0;
}

/* This function synchronizes snd mss to current pmtu/exthdr set.

   tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT count
   for TCP options, but includes only bare TCP header.

   tp->rx_opt.mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->rx_opt.mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
   are READ ONLY outside this function.         --ANK (980731)
 */

unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        int mss_now;

        if (icsk->icsk_mtup.search_high > pmtu)
                icsk->icsk_mtup.search_high = pmtu;

        mss_now = tcp_mtu_to_mss(sk, pmtu);

        /* Bound mss with half of window */
        if (tp->max_window && mss_now > (tp->max_window>>1))
                mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);

        /* And store cached results */
        icsk->icsk_pmtu_cookie = pmtu;
        if (icsk->icsk_mtup.enabled)
                mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
        tp->mss_cache = mss_now;

        return mss_now;
}

/* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
 *
 * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
 * cannot be large. However, taking into account rare use of URG, this
 * is not a big flaw.
 */
unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct dst_entry *dst = __sk_dst_get(sk);
        u32 mss_now;
        u16 xmit_size_goal;
        int doing_tso = 0;

        mss_now = tp->mss_cache;

        if (large_allowed && sk_can_gso(sk) && !tp->urg_mode)
                doing_tso = 1;

        if (dst) {
                u32 mtu = dst_mtu(dst);
                if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
                        mss_now = tcp_sync_mss(sk, mtu);
        }

        if (tp->rx_opt.eff_sacks)
                mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
                            (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));

#ifdef CONFIG_TCP_MD5SIG
        if (tp->af_specific->md5_lookup(sk, sk))
                mss_now -= TCPOLEN_MD5SIG_ALIGNED;
#endif

        xmit_size_goal = mss_now;

        if (doing_tso) {
                xmit_size_goal = (65535 -
                                  inet_csk(sk)->icsk_af_ops->net_header_len -
                                  inet_csk(sk)->icsk_ext_hdr_len -
                                  tp->tcp_header_len);

                if (tp->max_window &&
                    (xmit_size_goal > (tp->max_window >> 1)))
                        xmit_size_goal = max((tp->max_window >> 1),
                                             68U - tp->tcp_header_len);

                xmit_size_goal -= (xmit_size_goal % mss_now);
        }
        tp->xmit_size_goal = xmit_size_goal;

        return mss_now;
}

/* Congestion window validation. (RFC2861) */

static void tcp_cwnd_validate(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        __u32 packets_out = tp->packets_out;

        if (packets_out >= tp->snd_cwnd) {
                /* Network is fed fully. */
                tp->snd_cwnd_used = 0;
                tp->snd_cwnd_stamp = tcp_time_stamp;
        } else {
                /* Network starves. */
                if (tp->packets_out > tp->snd_cwnd_used)
                        tp->snd_cwnd_used = tp->packets_out;

                if (sysctl_tcp_slow_start_after_idle &&
                    (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
                        tcp_cwnd_application_limited(sk);
        }
}

static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
{
        u32 window, cwnd_len;

        window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
        cwnd_len = mss_now * cwnd;
        return min(window, cwnd_len);
}

/* Can at least one segment of SKB be sent right now, according to the
 * congestion window rules?  If so, return how many segments are allowed.
 */
static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
{
        u32 in_flight, cwnd;

        /* Don't be strict about the congestion window for the final FIN.  */
        if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
            tcp_skb_pcount(skb) == 1)
                return 1;

        in_flight = tcp_packets_in_flight(tp);
        cwnd = tp->snd_cwnd;
        if (in_flight < cwnd)
                return (cwnd - in_flight);

        return 0;
}
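
/*
 * Editorial note (illustrative example, not part of the original source):
 * with snd_cwnd = 10 and 7 packets currently in flight, tcp_cwnd_test()
 * returns 3, i.e. up to three more segments may be pushed out now.
 */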

/* This must be invoked the first time we consider transmitting
 * SKB onto the wire.
 */
static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
{
        int tso_segs = tcp_skb_pcount(skb);

        if (!tso_segs ||
            (tso_segs > 1 &&
             tcp_skb_mss(skb) != mss_now)) {
                tcp_set_skb_tso_segs(sk, skb, mss_now);
                tso_segs = tcp_skb_pcount(skb);
        }
        return tso_segs;
}

static inline int tcp_minshall_check(const struct tcp_sock *tp)
{
        return after(tp->snd_sml,tp->snd_una) &&
                !after(tp->snd_sml, tp->snd_nxt);
}

/* Return 0, if packet can be sent now without violating Nagle's rules:
 * 1. It is full sized.
 * 2. Or it contains FIN. (already checked by caller)
 * 3. Or TCP_NODELAY was set.
 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
 *    With Minshall's modification: all sent small packets are ACKed.
 */

static inline int tcp_nagle_check(const struct tcp_sock *tp,
                                  const struct sk_buff *skb,
                                  unsigned mss_now, int nonagle)
{
        return (skb->len < mss_now &&
                ((nonagle&TCP_NAGLE_CORK) ||
                 (!nonagle &&
                  tp->packets_out &&
                  tcp_minshall_check(tp))));
}

/* Return non-zero if the Nagle test allows this packet to be
 * sent now.
 */
static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
                                 unsigned int cur_mss, int nonagle)
{
        /* Nagle rule does not apply to frames, which sit in the middle of the
         * write_queue (they have no chances to get new data).
         *
         * This is implemented in the callers, where they modify the 'nonagle'
         * argument based upon the location of SKB in the send queue.
         */
        if (nonagle & TCP_NAGLE_PUSH)
                return 1;

        /* Don't use the nagle rule for urgent data (or for the final FIN).
         * Nagle can be ignored during F-RTO too (see RFC4138).
         */
        if (tp->urg_mode || (tp->frto_counter == 2) ||
            (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
                return 1;

        if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
                return 1;

        return 0;
}

/* Does at least the first segment of SKB fit into the send window? */
static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
{
        u32 end_seq = TCP_SKB_CB(skb)->end_seq;

        if (skb->len > cur_mss)
                end_seq = TCP_SKB_CB(skb)->seq + cur_mss;

        return !after(end_seq, tp->snd_una + tp->snd_wnd);
}
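
/*
 * Editorial note (illustrative example, not part of the original source):
 * for a 3000-byte skb with cur_mss = 1460 only the first segment has to fit,
 * so the test above checks seq + 1460 rather than end_seq against
 * snd_una + snd_wnd.
 */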

/* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
 * should be put on the wire right now.  If so, it returns the number of
 * packets allowed by the congestion window.
 */
static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
                                 unsigned int cur_mss, int nonagle)
{
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int cwnd_quota;

        tcp_init_tso_segs(sk, skb, cur_mss);

        if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
                return 0;

        cwnd_quota = tcp_cwnd_test(tp, skb);
        if (cwnd_quota &&
            !tcp_snd_wnd_test(tp, skb, cur_mss))
                cwnd_quota = 0;

        return cwnd_quota;
}

int tcp_may_send_now(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb = tcp_send_head(sk);

        return (skb &&
                tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
                             (tcp_skb_is_last(sk, skb) ?
                              tp->nonagle : TCP_NAGLE_PUSH)));
}

/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
 * which is put after SKB on the list.  It is very much like
 * tcp_fragment() except that it may make several kinds of assumptions
 * in order to speed up the splitting operation.  In particular, we
 * know that all the data is in scatter-gather pages, and that the
 * packet has never been sent out before (and thus is not cloned).
 */
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
{
        struct sk_buff *buff;
        int nlen = skb->len - len;
        u16 flags;

        /* All of a TSO frame must be composed of paged data.  */
        if (skb->len != skb->data_len)
                return tcp_fragment(sk, skb, len, mss_now);

        buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
        if (unlikely(buff == NULL))
                return -ENOMEM;

        sk_charge_skb(sk, buff);
        buff->truesize += nlen;
        skb->truesize -= nlen;

        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

        /* PSH and FIN should only be set in the second packet. */
        flags = TCP_SKB_CB(skb)->flags;
        TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
        TCP_SKB_CB(buff)->flags = flags;

        /* This packet was never sent out yet, so no SACK bits. */
        TCP_SKB_CB(buff)->sacked = 0;

        buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
        skb_split(skb, buff, len);

        /* Fix up tso_factor for both original and new SKB.  */
        tcp_set_skb_tso_segs(sk, skb, mss_now);
        tcp_set_skb_tso_segs(sk, buff, mss_now);

        /* Link BUFF into the send queue. */
        skb_header_release(buff);
        tcp_insert_write_queue_after(skb, buff, sk);

        return 0;
}

/* Try to defer sending, if possible, in order to minimize the amount
 * of TSO splitting we do.  View it as a kind of TSO Nagle test.
 *
 * This algorithm is from John Heffner.
 */
static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        const struct inet_connection_sock *icsk = inet_csk(sk);
        u32 send_win, cong_win, limit, in_flight;

        if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
                goto send_now;

        if (icsk->icsk_ca_state != TCP_CA_Open)
                goto send_now;

        /* Defer for less than two clock ticks. */
        if (!tp->tso_deferred && ((jiffies<<1)>>1) - (tp->tso_deferred>>1) > 1)
                goto send_now;

        in_flight = tcp_packets_in_flight(tp);

        BUG_ON(tcp_skb_pcount(skb) <= 1 ||
               (tp->snd_cwnd <= in_flight));

        send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;

        /* From in_flight test above, we know that cwnd > in_flight.  */
        cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;

        limit = min(send_win, cong_win);

        /* If a full-sized TSO skb can be sent, do it. */
        if (limit >= 65536)
                goto send_now;

        if (sysctl_tcp_tso_win_divisor) {
                u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);

                /* If at least some fraction of a window is available,
                 * just use it.
                 */
                chunk /= sysctl_tcp_tso_win_divisor;
                if (limit >= chunk)
                        goto send_now;
        } else {
                /* Different approach, try not to defer past a single
                 * ACK.  Receiver should ACK every other full sized
                 * frame, so if we have space for more than 3 frames
                 * then send now.
                 */
                if (limit > tcp_max_burst(tp) * tp->mss_cache)
                        goto send_now;
        }

        /* Ok, it looks like it is advisable to defer.  */
        tp->tso_deferred = 1 | (jiffies<<1);

        return 1;

send_now:
        tp->tso_deferred = 0;
        return 0;
}
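
/*
 * Editorial note (illustrative example, not part of the original source):
 * assuming snd_cwnd = 20, in_flight = 5, mss_cache = 1460, snd_wnd = 65535
 * and tcp_tso_win_divisor = 3: cong_win = 15 * 1460 = 21900, limit = 21900,
 * and the divisor test compares it against min(65535, 20 * 1460) / 3 = 9733,
 * so the segment is sent immediately instead of being deferred.
 */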
1285
 
1286
/* Create a new MTU probe if we are ready.
1287
 * Returns 0 if we should wait to probe (no cwnd available),
1288
 *         1 if a probe was sent,
1289
 *         -1 otherwise */
1290
static int tcp_mtu_probe(struct sock *sk)
1291
{
1292
        struct tcp_sock *tp = tcp_sk(sk);
1293
        struct inet_connection_sock *icsk = inet_csk(sk);
1294
        struct sk_buff *skb, *nskb, *next;
1295
        int len;
1296
        int probe_size;
1297
        int size_needed;
1298
        unsigned int pif;
1299
        int copy;
1300
        int mss_now;
1301
 
1302
        /* Not currently probing/verifying,
1303
         * not in recovery,
1304
         * have enough cwnd, and
1305
         * not SACKing (the variable headers throw things off) */
1306
        if (!icsk->icsk_mtup.enabled ||
1307
            icsk->icsk_mtup.probe_size ||
1308
            inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
1309
            tp->snd_cwnd < 11 ||
1310
            tp->rx_opt.eff_sacks)
1311
                return -1;
1312
 
1313
        /* Very simple search strategy: just double the MSS. */
1314
        mss_now = tcp_current_mss(sk, 0);
1315
        probe_size = 2*tp->mss_cache;
1316
        size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
1317
        if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
1318
                /* TODO: set timer for probe_converge_event */
1319
                return -1;
1320
        }
1321
 
1322
        /* Have enough data in the send queue to probe? */
1323
        if (tp->write_seq - tp->snd_nxt < size_needed)
1324
                return -1;
1325
 
1326
        if (tp->snd_wnd < size_needed)
1327
                return -1;
1328
        if (after(tp->snd_nxt + size_needed, tp->snd_una + tp->snd_wnd))
1329
                return 0;
1330
 
1331
        /* Do we need to wait to drain cwnd? */
1332
        pif = tcp_packets_in_flight(tp);
1333
        if (pif + 2 > tp->snd_cwnd) {
1334
                /* With no packets in flight, don't stall. */
1335
                if (pif == 0)
1336
                        return -1;
1337
                else
1338
                        return 0;
1339
        }
1340
 
1341
        /* We're allowed to probe.  Build it now. */
1342
        if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
1343
                return -1;
1344
        sk_charge_skb(sk, nskb);
1345
 
1346
        skb = tcp_send_head(sk);
1347
        tcp_insert_write_queue_before(nskb, skb, sk);
1348
 
1349
        TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
1350
        TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
1351
        TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
1352
        TCP_SKB_CB(nskb)->sacked = 0;
1353
        nskb->csum = 0;
1354
        nskb->ip_summed = skb->ip_summed;
1355
 
1356
        len = 0;
1357
        while (len < probe_size) {
1358
                next = tcp_write_queue_next(sk, skb);
1359
 
1360
                copy = min_t(int, skb->len, probe_size - len);
1361
                if (nskb->ip_summed)
1362
                        skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
1363
                else
1364
                        nskb->csum = skb_copy_and_csum_bits(skb, 0,
1365
                                         skb_put(nskb, copy), copy, nskb->csum);
1366
 
1367
                if (skb->len <= copy) {
1368
                        /* We've eaten all the data from this skb.
1369
                         * Throw it away. */
1370
                        TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
1371
                        tcp_unlink_write_queue(skb, sk);
1372
                        sk_stream_free_skb(sk, skb);
1373
                } else {
1374
                        TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
1375
                                                   ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
1376
                        if (!skb_shinfo(skb)->nr_frags) {
1377
                                skb_pull(skb, copy);
1378
                                if (skb->ip_summed != CHECKSUM_PARTIAL)
1379
                                        skb->csum = csum_partial(skb->data, skb->len, 0);
1380
                        } else {
1381
                                __pskb_trim_head(skb, copy);
1382
                                tcp_set_skb_tso_segs(sk, skb, mss_now);
1383
                        }
1384
                        TCP_SKB_CB(skb)->seq += copy;
1385
                }
1386
 
1387
                len += copy;
1388
                skb = next;
1389
        }
1390
        tcp_init_tso_segs(sk, nskb, nskb->len);
1391
 
1392
        /* We're ready to send.  If this fails, the probe will
1393
         * be resegmented into mss-sized pieces by tcp_write_xmit(). */
1394
        TCP_SKB_CB(nskb)->when = tcp_time_stamp;
1395
        if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
1396
                /* Decrement cwnd here because we are sending
1397
                * effectively two packets. */
1398
                tp->snd_cwnd--;
1399
                update_send_head(sk, nskb);
1400
 
1401
                icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
1402
                tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
1403
                tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
1404
 
1405
                return 1;
1406
        }
1407
 
1408
        return -1;
1409
}
1410
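/*
 * Illustrative sketch, not part of the original file: the probe sizing
 * arithmetic used by tcp_mtu_probe() above, shown with assumed example
 * values (mss_cache = 1460, reordering = 3, the usual defaults):
 *
 *   probe_size  = 2 * 1460              = 2920 bytes
 *   size_needed = 2920 + (3 + 1) * 1460 = 8760 bytes
 *
 * so a probe is only built once at least 8760 bytes sit queued beyond
 * snd_nxt and the offered window covers them; on success snd_cwnd is
 * decremented because the probe effectively occupies two cwnd slots.
 */
static inline unsigned int tcp_mtu_probe_size_needed_example(unsigned int mss_cache,
                                                             unsigned int reordering)
{
        unsigned int probe_size = 2 * mss_cache;

        return probe_size + (reordering + 1) * mss_cache;
}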
 
1411
 
1412
/* This routine writes packets to the network.  It advances the
1413
 * send_head.  This happens as incoming acks open up the remote
1414
 * window for us.
1415
 *
1416
 * Returns 1, if no segments are in flight and we have queued segments, but
1417
 * cannot send anything now because of SWS or another problem.
1418
 */
1419
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
1420
{
1421
        struct tcp_sock *tp = tcp_sk(sk);
1422
        struct sk_buff *skb;
1423
        unsigned int tso_segs, sent_pkts;
1424
        int cwnd_quota;
1425
        int result;
1426
 
1427
        /* If we are closed, the bytes will have to remain here.
1428
         * In time closedown will finish, we empty the write queue and all
1429
         * will be happy.
1430
         */
1431
        if (unlikely(sk->sk_state == TCP_CLOSE))
1432
                return 0;
1433
 
1434
        sent_pkts = 0;
1435
 
1436
        /* Do MTU probing. */
1437
        if ((result = tcp_mtu_probe(sk)) == 0) {
1438
                return 0;
1439
        } else if (result > 0) {
1440
                sent_pkts = 1;
1441
        }
1442
 
1443
        while ((skb = tcp_send_head(sk))) {
1444
                unsigned int limit;
1445
 
1446
                tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1447
                BUG_ON(!tso_segs);
1448
 
1449
                cwnd_quota = tcp_cwnd_test(tp, skb);
1450
                if (!cwnd_quota)
1451
                        break;
1452
 
1453
                if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
1454
                        break;
1455
 
1456
                if (tso_segs == 1) {
1457
                        if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
1458
                                                     (tcp_skb_is_last(sk, skb) ?
1459
                                                      nonagle : TCP_NAGLE_PUSH))))
1460
                                break;
1461
                } else {
1462
                        if (tcp_tso_should_defer(sk, skb))
1463
                                break;
1464
                }
1465
 
1466
                limit = mss_now;
1467
                if (tso_segs > 1) {
1468
                        limit = tcp_window_allows(tp, skb,
1469
                                                  mss_now, cwnd_quota);
1470
 
1471
                        if (skb->len < limit) {
1472
                                unsigned int trim = skb->len % mss_now;
1473
 
1474
                                if (trim)
1475
                                        limit = skb->len - trim;
1476
                        }
1477
                }
1478
 
1479
                if (skb->len > limit &&
1480
                    unlikely(tso_fragment(sk, skb, limit, mss_now)))
1481
                        break;
1482
 
1483
                TCP_SKB_CB(skb)->when = tcp_time_stamp;
1484
 
1485
                if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
1486
                        break;
1487
 
1488
                /* Advance the send_head.  This one is sent out.
1489
                 * This call will increment packets_out.
1490
                 */
1491
                update_send_head(sk, skb);
1492
 
1493
                tcp_minshall_update(tp, mss_now, skb);
1494
                sent_pkts++;
1495
        }
1496
 
1497
        if (likely(sent_pkts)) {
1498
                tcp_cwnd_validate(sk);
1499
                return 0;
1500
        }
1501
        return !tp->packets_out && tcp_send_head(sk);
1502
}
1503
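/*
 * Illustrative sketch, not part of the original file: how tcp_write_xmit()
 * above trims a TSO frame to a whole number of segments.  With assumed
 * example values mss_now = 1460 and skb->len = 10000 (window and cwnd not
 * limiting):
 *
 *   trim  = 10000 % 1460 = 1240
 *   limit = 10000 - 1240 = 8760    (exactly six full-sized segments)
 *
 * tso_fragment() then splits the skb at 8760 bytes and the 1240-byte tail
 * stays queued for a later pass.
 */
static inline unsigned int tcp_tso_limit_example(unsigned int len,
                                                 unsigned int mss_now)
{
        unsigned int trim = len % mss_now;

        return trim ? len - trim : len;
}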
 
1504
/* Push out any pending frames which were held back due to
1505
 * TCP_CORK or attempt at coalescing tiny packets.
1506
 * The socket must be locked by the caller.
1507
 */
1508
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
1509
                               int nonagle)
1510
{
1511
        struct sk_buff *skb = tcp_send_head(sk);
1512
 
1513
        if (skb) {
1514
                if (tcp_write_xmit(sk, cur_mss, nonagle))
1515
                        tcp_check_probe_timer(sk);
1516
        }
1517
}
1518
 
1519
/* Send _single_ skb sitting at the send head. This function requires
1520
 * a true push of pending frames to set up the probe timer etc.
1521
 */
1522
void tcp_push_one(struct sock *sk, unsigned int mss_now)
1523
{
1524
        struct tcp_sock *tp = tcp_sk(sk);
1525
        struct sk_buff *skb = tcp_send_head(sk);
1526
        unsigned int tso_segs, cwnd_quota;
1527
 
1528
        BUG_ON(!skb || skb->len < mss_now);
1529
 
1530
        tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1531
        cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
1532
 
1533
        if (likely(cwnd_quota)) {
1534
                unsigned int limit;
1535
 
1536
                BUG_ON(!tso_segs);
1537
 
1538
                limit = mss_now;
1539
                if (tso_segs > 1) {
1540
                        limit = tcp_window_allows(tp, skb,
1541
                                                  mss_now, cwnd_quota);
1542
 
1543
                        if (skb->len < limit) {
1544
                                unsigned int trim = skb->len % mss_now;
1545
 
1546
                                if (trim)
1547
                                        limit = skb->len - trim;
1548
                        }
1549
                }
1550
 
1551
                if (skb->len > limit &&
1552
                    unlikely(tso_fragment(sk, skb, limit, mss_now)))
1553
                        return;
1554
 
1555
                /* Send it out now. */
1556
                TCP_SKB_CB(skb)->when = tcp_time_stamp;
1557
 
1558
                if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
1559
                        update_send_head(sk, skb);
1560
                        tcp_cwnd_validate(sk);
1561
                        return;
1562
                }
1563
        }
1564
}
1565
 
1566
/* This function returns the amount that we can raise the
1567
 * usable window based on the following constraints
1568
 *
1569
 * 1. The window can never be shrunk once it is offered (RFC 793)
1570
 * 2. We limit memory per socket
1571
 *
1572
 * RFC 1122:
1573
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
1574
 *  RCV.NXT + RCV.WND fixed until:
1575
 *  RCV.BUFF - RCV.USER - RCV.WND >= min(1/2 RCV.BUFF, MSS)"
1576
 *
1577
 * i.e. don't raise the right edge of the window until you can raise
1578
 * it at least MSS bytes.
1579
 *
1580
 * Unfortunately, the recommended algorithm breaks header prediction,
1581
 * since header prediction assumes th->window stays fixed.
1582
 *
1583
 * Strictly speaking, keeping th->window fixed violates the receiver
1584
 * side SWS prevention criteria. The problem is that under this rule
1585
 * a stream of single byte packets will cause the right side of the
1586
 * window to always advance by a single byte.
1587
 *
1588
 * Of course, if the sender implements sender side SWS prevention
1589
 * then this will not be a problem.
1590
 *
1591
 * BSD seems to make the following compromise:
1592
 *
1593
 *      If the free space is less than 1/4 of the maximum
1594
 *      space available and the free space is less than 1/2 mss,
1595
 *      then set the window to 0.
1596
 *      [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
1597
 *      Otherwise, just prevent the window from shrinking
1598
 *      and from being larger than the largest representable value.
1599
 *
1600
 * This prevents incremental opening of the window in the regime
1601
 * where TCP is limited by the speed of the reader side taking
1602
 * data out of the TCP receive queue. It does nothing about
1603
 * those cases where the window is constrained on the sender side
1604
 * because the pipeline is full.
1605
 *
1606
 * BSD also seems to "accidentally" limit itself to windows that are a
1607
 * multiple of MSS, at least until the free space gets quite small.
1608
 * This would appear to be a side effect of the mbuf implementation.
1609
 * Combining these two algorithms results in the observed behavior
1610
 * of having a fixed window size at almost all times.
1611
 *
1612
 * Below we obtain similar behavior by forcing the offered window to
1613
 * a multiple of the mss when it is feasible to do so.
1614
 *
1615
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
1616
 * Regular options like TIMESTAMP are taken into account.
1617
 */
1618
u32 __tcp_select_window(struct sock *sk)
1619
{
1620
        struct inet_connection_sock *icsk = inet_csk(sk);
1621
        struct tcp_sock *tp = tcp_sk(sk);
1622
        /* MSS for the peer's data.  Previous versions used mss_clamp
1623
         * here.  I don't know if the value based on our guesses
1624
         * of peer's MSS is better for the performance.  It's more correct
1625
         * but may be worse for the performance because of rcv_mss
1626
         * fluctuations.  --SAW  1998/11/1
1627
         */
1628
        int mss = icsk->icsk_ack.rcv_mss;
1629
        int free_space = tcp_space(sk);
1630
        int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
1631
        int window;
1632
 
1633
        if (mss > full_space)
1634
                mss = full_space;
1635
 
1636
        if (free_space < full_space/2) {
1637
                icsk->icsk_ack.quick = 0;
1638
 
1639
                if (tcp_memory_pressure)
1640
                        tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
1641
 
1642
                if (free_space < mss)
1643
                        return 0;
1644
        }
1645
 
1646
        if (free_space > tp->rcv_ssthresh)
1647
                free_space = tp->rcv_ssthresh;
1648
 
1649
        /* Don't do rounding if we are using window scaling, since the
1650
         * scaled window will not line up with the MSS boundary anyway.
1651
         */
1652
        window = tp->rcv_wnd;
1653
        if (tp->rx_opt.rcv_wscale) {
1654
                window = free_space;
1655
 
1656
                /* Advertise enough space so that it won't get scaled away.
1657
                 * Important case: prevent zero window announcement if
1658
                 * 1<<rcv_wscale > mss.
1659
                 */
1660
                if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
1661
                        window = (((window >> tp->rx_opt.rcv_wscale) + 1)
1662
                                  << tp->rx_opt.rcv_wscale);
1663
        } else {
1664
                /* Get the largest window that is a nice multiple of mss.
1665
                 * Window clamp already applied above.
1666
                 * If our current window offering is within 1 mss of the
1667
                 * free space we just keep it. This prevents the divide
1668
                 * and multiply from happening most of the time.
1669
                 * We also don't do any window rounding when the free space
1670
                 * is too small.
1671
                 */
1672
                if (window <= free_space - mss || window > free_space)
1673
                        window = (free_space/mss)*mss;
1674
                else if (mss == full_space &&
1675
                         free_space > window + full_space/2)
1676
                        window = free_space;
1677
        }
1678
 
1679
        return window;
1680
}
1681
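/*
 * Illustrative sketch, not part of the original file: the two rounding
 * paths of __tcp_select_window() above, with assumed example values.
 *
 * Without window scaling (mss = 1460, free_space = 10000):
 *   window = (10000 / 1460) * 1460 = 8760, a whole multiple of the mss.
 *
 * With window scaling (rcv_wscale = 7, free_space = 70000):
 *   (70000 >> 7) << 7 = 69888 != 70000, so the window is rounded *up* to
 *   (546 + 1) << 7 = 70016; rounding down instead could let a small window
 *   scale away to zero when (1 << rcv_wscale) > mss.
 */
static inline unsigned int tcp_round_window_example(unsigned int free_space,
                                                    unsigned int mss,
                                                    unsigned int rcv_wscale)
{
        if (rcv_wscale) {
                if (((free_space >> rcv_wscale) << rcv_wscale) != free_space)
                        free_space = ((free_space >> rcv_wscale) + 1) << rcv_wscale;
                return free_space;
        }
        return (free_space / mss) * mss;
}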
 
1682
/* Attempt to collapse two adjacent SKB's during retransmission. */
1683
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
1684
{
1685
        struct tcp_sock *tp = tcp_sk(sk);
1686
        struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
1687
 
1688
        /* The first test we must make is that neither of these two
1689
         * SKB's are still referenced by someone else.
1690
         */
1691
        if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
1692
                int skb_size = skb->len, next_skb_size = next_skb->len;
1693
                u16 flags = TCP_SKB_CB(skb)->flags;
1694
 
1695
                /* Also punt if next skb has been SACK'd. */
1696
                if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
1697
                        return;
1698
 
1699
                /* Next skb is out of window. */
1700
                if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
1701
                        return;
1702
 
1703
                /* Punt if not enough space exists in the first SKB for
1704
                 * the data in the second, or the total combined payload
1705
                 * would exceed the MSS.
1706
                 */
1707
                if ((next_skb_size > skb_tailroom(skb)) ||
1708
                    ((skb_size + next_skb_size) > mss_now))
1709
                        return;
1710
 
1711
                BUG_ON(tcp_skb_pcount(skb) != 1 ||
1712
                       tcp_skb_pcount(next_skb) != 1);
1713
 
1714
                if (WARN_ON(tcp_is_sack(tp) && tp->sacked_out &&
1715
                    (TCP_SKB_CB(next_skb)->seq == tp->highest_sack)))
1716
                        return;
1717
 
1718
                /* Ok.  We will be able to collapse the packet. */
1719
                tcp_unlink_write_queue(next_skb, sk);
1720
 
1721
                skb_copy_from_linear_data(next_skb,
1722
                                          skb_put(skb, next_skb_size),
1723
                                          next_skb_size);
1724
 
1725
                if (next_skb->ip_summed == CHECKSUM_PARTIAL)
1726
                        skb->ip_summed = CHECKSUM_PARTIAL;
1727
 
1728
                if (skb->ip_summed != CHECKSUM_PARTIAL)
1729
                        skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
1730
 
1731
                /* Update sequence range on original skb. */
1732
                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
1733
 
1734
                /* Merge over control information. */
1735
                flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
1736
                TCP_SKB_CB(skb)->flags = flags;
1737
 
1738
                /* All done, get rid of second SKB and account for it so
1739
                 * packet counting does not break.
1740
                 */
1741
                TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
1742
                if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
1743
                        tp->retrans_out -= tcp_skb_pcount(next_skb);
1744
                if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST)
1745
                        tp->lost_out -= tcp_skb_pcount(next_skb);
1746
                /* Reno case is special. Sigh... */
1747
                if (tcp_is_reno(tp) && tp->sacked_out)
1748
                        tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
1749
 
1750
                tcp_adjust_fackets_out(tp, next_skb, tcp_skb_pcount(next_skb));
1751
                tp->packets_out -= tcp_skb_pcount(next_skb);
1752
 
1753
                /* changed transmit queue under us so clear hints */
1754
                tcp_clear_retrans_hints_partial(tp);
1755
                /* manually tune sacktag skb hint */
1756
                if (tp->fastpath_skb_hint == next_skb) {
1757
                        tp->fastpath_skb_hint = skb;
1758
                        tp->fastpath_cnt_hint -= tcp_skb_pcount(skb);
1759
                }
1760
 
1761
                sk_stream_free_skb(sk, next_skb);
1762
        }
1763
}
1764
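/*
 * Illustrative sketch, not part of the original file: the size test that
 * gates the collapse above.  With an assumed mss_now of 1460, two
 * retransmittable skbs of 400 and 700 bytes (1100 <= 1460) may be merged,
 * provided the first skb has at least 700 bytes of tailroom; 900 + 800
 * bytes would not be merged.
 */
static inline int tcp_collapse_size_ok_example(int skb_size, int next_skb_size,
                                               int tailroom, int mss_now)
{
        return next_skb_size <= tailroom &&
               skb_size + next_skb_size <= mss_now;
}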
 
1765
/* Do a simple retransmit without using the backoff mechanisms in
1766
 * tcp_timer. This is used for path mtu discovery.
1767
 * The socket is already locked here.
1768
 */
1769
void tcp_simple_retransmit(struct sock *sk)
1770
{
1771
        const struct inet_connection_sock *icsk = inet_csk(sk);
1772
        struct tcp_sock *tp = tcp_sk(sk);
1773
        struct sk_buff *skb;
1774
        unsigned int mss = tcp_current_mss(sk, 0);
1775
        int lost = 0;
1776
 
1777
        tcp_for_write_queue(skb, sk) {
1778
                if (skb == tcp_send_head(sk))
1779
                        break;
1780
                if (skb->len > mss &&
1781
                    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
1782
                        if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
1783
                                TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1784
                                tp->retrans_out -= tcp_skb_pcount(skb);
1785
                        }
1786
                        if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
1787
                                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1788
                                tp->lost_out += tcp_skb_pcount(skb);
1789
                                lost = 1;
1790
                        }
1791
                }
1792
        }
1793
 
1794
        tcp_clear_all_retrans_hints(tp);
1795
 
1796
        if (!lost)
1797
                return;
1798
 
1799
        tcp_verify_left_out(tp);
1800
 
1801
        /* Don't muck with the congestion window here.
1802
         * Reason is that we do not increase the amount of _data_
1803
         * in the network, but the units changed and the effective
1804
         * cwnd/ssthresh are really reduced now.
1805
         */
1806
        if (icsk->icsk_ca_state != TCP_CA_Loss) {
1807
                tp->high_seq = tp->snd_nxt;
1808
                tp->snd_ssthresh = tcp_current_ssthresh(sk);
1809
                tp->prior_ssthresh = 0;
1810
                tp->undo_marker = 0;
1811
                tcp_set_ca_state(sk, TCP_CA_Loss);
1812
        }
1813
        tcp_xmit_retransmit_queue(sk);
1814
}
1815
 
1816
/* This retransmits one SKB.  Policy decisions and retransmit queue
1817
 * state updates are done by the caller.  Returns non-zero if an
1818
 * error occurred which prevented the send.
1819
 */
1820
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1821
{
1822
        struct tcp_sock *tp = tcp_sk(sk);
1823
        struct inet_connection_sock *icsk = inet_csk(sk);
1824
        unsigned int cur_mss = tcp_current_mss(sk, 0);
1825
        int err;
1826
 
1827
        /* Inconclusive MTU probe */
1828
        if (icsk->icsk_mtup.probe_size) {
1829
                icsk->icsk_mtup.probe_size = 0;
1830
        }
1831
 
1832
        /* Do not send more than we queued. 1/4 is reserved for possible
1833
         * copying overhead: fragmentation, tunneling, mangling etc.
1834
         */
1835
        if (atomic_read(&sk->sk_wmem_alloc) >
1836
            min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
1837
                return -EAGAIN;
1838
 
1839
        if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
1840
                if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1841
                        BUG();
1842
                if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
1843
                        return -ENOMEM;
1844
        }
1845
 
1846
        /* If receiver has shrunk his window, and skb is out of
1847
         * new window, do not retransmit it. The exception is the
1848
         * case when the window is shrunk to zero. In this case
1849
         * our retransmit serves as a zero window probe.
1850
         */
1851
        if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
1852
            && TCP_SKB_CB(skb)->seq != tp->snd_una)
1853
                return -EAGAIN;
1854
 
1855
        if (skb->len > cur_mss) {
1856
                if (tcp_fragment(sk, skb, cur_mss, cur_mss))
1857
                        return -ENOMEM; /* We'll try again later. */
1858
        }
1859
 
1860
        /* Collapse two adjacent packets if worthwhile and we can. */
1861
        if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
1862
            (skb->len < (cur_mss >> 1)) &&
1863
            (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
1864
            (!tcp_skb_is_last(sk, skb)) &&
1865
            (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
1866
            (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) &&
1867
            (sysctl_tcp_retrans_collapse != 0))
1868
                tcp_retrans_try_collapse(sk, skb, cur_mss);
1869
 
1870
        if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
1871
                return -EHOSTUNREACH; /* Routing failure or similar. */
1872
 
1873
        /* Some Solaris stacks overoptimize and ignore the FIN on a
1874
         * retransmit when old data is attached.  So strip it off
1875
         * since it is cheap to do so and saves bytes on the network.
1876
         */
1877
        if (skb->len > 0 &&
1878
            (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
1879
            tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
1880
                if (!pskb_trim(skb, 0)) {
1881
                        TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
1882
                        skb_shinfo(skb)->gso_segs = 1;
1883
                        skb_shinfo(skb)->gso_size = 0;
1884
                        skb_shinfo(skb)->gso_type = 0;
1885
                        skb->ip_summed = CHECKSUM_NONE;
1886
                        skb->csum = 0;
1887
                }
1888
        }
1889
 
1890
        /* Make a copy, if the first transmission SKB clone we made
1891
         * is still in somebody's hands, else make a clone.
1892
         */
1893
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
1894
 
1895
        err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
1896
 
1897
        if (err == 0) {
1898
                /* Update global TCP statistics. */
1899
                TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
1900
 
1901
                tp->total_retrans++;
1902
 
1903
#if FASTRETRANS_DEBUG > 0
1904
                if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
1905
                        if (net_ratelimit())
1906
                                printk(KERN_DEBUG "retrans_out leaked.\n");
1907
                }
1908
#endif
1909
                if (!tp->retrans_out)
1910
                        tp->lost_retrans_low = tp->snd_nxt;
1911
                TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
1912
                tp->retrans_out += tcp_skb_pcount(skb);
1913
 
1914
                /* Save stamp of the first retransmit. */
1915
                if (!tp->retrans_stamp)
1916
                        tp->retrans_stamp = TCP_SKB_CB(skb)->when;
1917
 
1918
                tp->undo_retrans++;
1919
 
1920
                /* snd_nxt is stored to detect loss of retransmitted segment,
1921
                 * see tcp_input.c tcp_sacktag_write_queue().
1922
                 */
1923
                TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
1924
        }
1925
        return err;
1926
}
1927
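/*
 * Illustrative sketch, not part of the original file: the "1/4 reserved for
 * copying overhead" test at the top of tcp_retransmit_skb().  With assumed
 * values wmem_queued = 100000 and sndbuf = 200000 the limit is
 * min(100000 + 25000, 200000) = 125000 bytes; if sk_wmem_alloc already
 * exceeds that, the retransmit is deferred with -EAGAIN.
 */
static inline int tcp_retrans_wmem_limit_example(int wmem_queued, int sndbuf)
{
        int limit = wmem_queued + (wmem_queued >> 2);

        return limit < sndbuf ? limit : sndbuf;
}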
 
1928
/* This gets called after a retransmit timeout, and the initially
1929
 * retransmitted data is acknowledged.  It tries to continue
1930
 * resending the rest of the retransmit queue, until either
1931
 * we've sent it all or the congestion window limit is reached.
1932
 * If doing SACK, the first ACK which comes back for a timeout
1933
 * based retransmit packet might feed us FACK information again.
1934
 * If so, we use it to avoid unnecessary retransmissions.
1935
 */
1936
void tcp_xmit_retransmit_queue(struct sock *sk)
1937
{
1938
        const struct inet_connection_sock *icsk = inet_csk(sk);
1939
        struct tcp_sock *tp = tcp_sk(sk);
1940
        struct sk_buff *skb;
1941
        int packet_cnt;
1942
 
1943
        if (tp->retransmit_skb_hint) {
1944
                skb = tp->retransmit_skb_hint;
1945
                packet_cnt = tp->retransmit_cnt_hint;
1946
        } else {
1947
                skb = tcp_write_queue_head(sk);
1948
                packet_cnt = 0;
1949
        }
1950
 
1951
        /* First pass: retransmit lost packets. */
1952
        if (tp->lost_out) {
1953
                tcp_for_write_queue_from(skb, sk) {
1954
                        __u8 sacked = TCP_SKB_CB(skb)->sacked;
1955
 
1956
                        if (skb == tcp_send_head(sk))
1957
                                break;
1958
                        /* we could do better than to assign each time */
1959
                        tp->retransmit_skb_hint = skb;
1960
                        tp->retransmit_cnt_hint = packet_cnt;
1961
 
1962
                        /* Assume this retransmit will generate
1963
                         * only one packet for congestion window
1964
                         * calculation purposes.  This works because
1965
                         * tcp_retransmit_skb() will chop up the
1966
                         * packet to be MSS sized and all the
1967
                         * packet counting works out.
1968
                         */
1969
                        if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1970
                                return;
1971
 
1972
                        if (sacked & TCPCB_LOST) {
1973
                                if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
1974
                                        if (tcp_retransmit_skb(sk, skb)) {
1975
                                                tp->retransmit_skb_hint = NULL;
1976
                                                return;
1977
                                        }
1978
                                        if (icsk->icsk_ca_state != TCP_CA_Loss)
1979
                                                NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
1980
                                        else
1981
                                                NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
1982
 
1983
                                        if (skb == tcp_write_queue_head(sk))
1984
                                                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
1985
                                                                          inet_csk(sk)->icsk_rto,
1986
                                                                          TCP_RTO_MAX);
1987
                                }
1988
 
1989
                                packet_cnt += tcp_skb_pcount(skb);
1990
                                if (packet_cnt >= tp->lost_out)
1991
                                        break;
1992
                        }
1993
                }
1994
        }
1995
 
1996
        /* OK, demanded retransmission is finished. */
1997
 
1998
        /* Forward retransmissions are possible only during Recovery. */
1999
        if (icsk->icsk_ca_state != TCP_CA_Recovery)
2000
                return;
2001
 
2002
        /* No forward retransmissions in Reno are possible. */
2003
        if (tcp_is_reno(tp))
2004
                return;
2005
 
2006
        /* Yeah, we have to make a difficult choice between forward transmission
2007
         * and retransmission... Both ways have their merits...
2008
         *
2009
         * For now we do not retransmit anything, while we have some new
2010
         * segments to send. In the other cases, follow rule 3 for
2011
         * NextSeg() specified in RFC3517.
2012
         */
2013
 
2014
        if (tcp_may_send_now(sk))
2015
                return;
2016
 
2017
        /* If nothing is SACKed, highest_sack in the loop won't be valid */
2018
        if (!tp->sacked_out)
2019
                return;
2020
 
2021
        if (tp->forward_skb_hint)
2022
                skb = tp->forward_skb_hint;
2023
        else
2024
                skb = tcp_write_queue_head(sk);
2025
 
2026
        tcp_for_write_queue_from(skb, sk) {
2027
                if (skb == tcp_send_head(sk))
2028
                        break;
2029
                tp->forward_skb_hint = skb;
2030
 
2031
                if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack))
2032
                        break;
2033
 
2034
                if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
2035
                        break;
2036
 
2037
                if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
2038
                        continue;
2039
 
2040
                /* Ok, retransmit it. */
2041
                if (tcp_retransmit_skb(sk, skb)) {
2042
                        tp->forward_skb_hint = NULL;
2043
                        break;
2044
                }
2045
 
2046
                if (skb == tcp_write_queue_head(sk))
2047
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2048
                                                  inet_csk(sk)->icsk_rto,
2049
                                                  TCP_RTO_MAX);
2050
 
2051
                NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
2052
        }
2053
}
2054
 
2055
 
2056
/* Send a fin.  The caller locks the socket for us.  This cannot be
2057
 * allowed to fail queueing a FIN frame under any circumstances.
2058
 */
2059
void tcp_send_fin(struct sock *sk)
2060
{
2061
        struct tcp_sock *tp = tcp_sk(sk);
2062
        struct sk_buff *skb = tcp_write_queue_tail(sk);
2063
        int mss_now;
2064
 
2065
        /* Optimization, tack on the FIN if we have a queue of
2066
         * unsent frames.  But be careful about outgoing SACKS
2067
         * and IP options.
2068
         */
2069
        mss_now = tcp_current_mss(sk, 1);
2070
 
2071
        if (tcp_send_head(sk) != NULL) {
2072
                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
2073
                TCP_SKB_CB(skb)->end_seq++;
2074
                tp->write_seq++;
2075
        } else {
2076
                /* Socket is locked, keep trying until memory is available. */
2077
                for (;;) {
2078
                        skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
2079
                        if (skb)
2080
                                break;
2081
                        yield();
2082
                }
2083
 
2084
                /* Reserve space for headers and prepare control bits. */
2085
                skb_reserve(skb, MAX_TCP_HEADER);
2086
                skb->csum = 0;
2087
                TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
2088
                TCP_SKB_CB(skb)->sacked = 0;
2089
                skb_shinfo(skb)->gso_segs = 1;
2090
                skb_shinfo(skb)->gso_size = 0;
2091
                skb_shinfo(skb)->gso_type = 0;
2092
 
2093
                /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
2094
                TCP_SKB_CB(skb)->seq = tp->write_seq;
2095
                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
2096
                tcp_queue_skb(sk, skb);
2097
        }
2098
        __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
2099
}
2100
 
2101
/* We get here when a process closes a file descriptor (either due to
2102
 * an explicit close() or as a byproduct of exit()'ing) and there
2103
 * was unread data in the receive queue.  This behavior is recommended
2104
 * by RFC 2525, section 2.17.  -DaveM
2105
 */
2106
void tcp_send_active_reset(struct sock *sk, gfp_t priority)
2107
{
2108
        struct sk_buff *skb;
2109
 
2110
        /* NOTE: No TCP options attached and we never retransmit this. */
2111
        skb = alloc_skb(MAX_TCP_HEADER, priority);
2112
        if (!skb) {
2113
                NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
2114
                return;
2115
        }
2116
 
2117
        /* Reserve space for headers and prepare control bits. */
2118
        skb_reserve(skb, MAX_TCP_HEADER);
2119
        skb->csum = 0;
2120
        TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
2121
        TCP_SKB_CB(skb)->sacked = 0;
2122
        skb_shinfo(skb)->gso_segs = 1;
2123
        skb_shinfo(skb)->gso_size = 0;
2124
        skb_shinfo(skb)->gso_type = 0;
2125
 
2126
        /* Send it off. */
2127
        TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk);
2128
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
2129
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
2130
        if (tcp_transmit_skb(sk, skb, 0, priority))
2131
                NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
2132
}
2133
 
2134
/* WARNING: This routine must only be called when we have already sent
2135
 * a SYN packet that crossed the incoming SYN that caused this routine
2136
 * to get called. If this assumption fails then the initial rcv_wnd
2137
 * and rcv_wscale values will not be correct.
2138
 */
2139
int tcp_send_synack(struct sock *sk)
2140
{
2141
        struct sk_buff* skb;
2142
 
2143
        skb = tcp_write_queue_head(sk);
2144
        if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
2145
                printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
2146
                return -EFAULT;
2147
        }
2148
        if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
2149
                if (skb_cloned(skb)) {
2150
                        struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
2151
                        if (nskb == NULL)
2152
                                return -ENOMEM;
2153
                        tcp_unlink_write_queue(skb, sk);
2154
                        skb_header_release(nskb);
2155
                        __tcp_add_write_queue_head(sk, nskb);
2156
                        sk_stream_free_skb(sk, skb);
2157
                        sk_charge_skb(sk, nskb);
2158
                        skb = nskb;
2159
                }
2160
 
2161
                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
2162
                TCP_ECN_send_synack(tcp_sk(sk), skb);
2163
        }
2164
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
2165
        return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2166
}
2167
 
2168
/*
2169
 * Prepare a SYN-ACK.
2170
 */
2171
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2172
                                 struct request_sock *req)
2173
{
2174
        struct inet_request_sock *ireq = inet_rsk(req);
2175
        struct tcp_sock *tp = tcp_sk(sk);
2176
        struct tcphdr *th;
2177
        int tcp_header_size;
2178
        struct sk_buff *skb;
2179
#ifdef CONFIG_TCP_MD5SIG
2180
        struct tcp_md5sig_key *md5;
2181
        __u8 *md5_hash_location;
2182
#endif
2183
 
2184
        skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
2185
        if (skb == NULL)
2186
                return NULL;
2187
 
2188
        /* Reserve space for headers. */
2189
        skb_reserve(skb, MAX_TCP_HEADER);
2190
 
2191
        skb->dst = dst_clone(dst);
2192
 
2193
        tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
2194
                           (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
2195
                           (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
2196
                           /* SACK_PERM is in the place of NOP NOP of TS */
2197
                           ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
2198
 
2199
#ifdef CONFIG_TCP_MD5SIG
2200
        /* Are we doing MD5 on this segment? If so - make room for it */
2201
        md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
2202
        if (md5)
2203
                tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
2204
#endif
2205
        skb_push(skb, tcp_header_size);
2206
        skb_reset_transport_header(skb);
2207
 
2208
        th = tcp_hdr(skb);
2209
        memset(th, 0, sizeof(struct tcphdr));
2210
        th->syn = 1;
2211
        th->ack = 1;
2212
        TCP_ECN_make_synack(req, th);
2213
        th->source = inet_sk(sk)->sport;
2214
        th->dest = ireq->rmt_port;
2215
        TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
2216
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
2217
        TCP_SKB_CB(skb)->sacked = 0;
2218
        skb_shinfo(skb)->gso_segs = 1;
2219
        skb_shinfo(skb)->gso_size = 0;
2220
        skb_shinfo(skb)->gso_type = 0;
2221
        th->seq = htonl(TCP_SKB_CB(skb)->seq);
2222
        th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
2223
        if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
2224
                __u8 rcv_wscale;
2225
                /* Set this up on the first call only */
2226
                req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2227
                /* tcp_full_space because it is guaranteed to be the first packet */
2228
                tcp_select_initial_window(tcp_full_space(sk),
2229
                        dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
2230
                        &req->rcv_wnd,
2231
                        &req->window_clamp,
2232
                        ireq->wscale_ok,
2233
                        &rcv_wscale);
2234
                ireq->rcv_wscale = rcv_wscale;
2235
        }
2236
 
2237
        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2238
        th->window = htons(min(req->rcv_wnd, 65535U));
2239
 
2240
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
2241
        tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok,
2242
                              ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale,
2243
                              TCP_SKB_CB(skb)->when,
2244
                              req->ts_recent,
2245
                              (
2246
#ifdef CONFIG_TCP_MD5SIG
2247
                               md5 ? &md5_hash_location :
2248
#endif
2249
                               NULL)
2250
                              );
2251
 
2252
        skb->csum = 0;
2253
        th->doff = (tcp_header_size >> 2);
2254
        TCP_INC_STATS(TCP_MIB_OUTSEGS);
2255
 
2256
#ifdef CONFIG_TCP_MD5SIG
2257
        /* Okay, we have all we need - do the md5 hash if needed */
2258
        if (md5) {
2259
                tp->af_specific->calc_md5_hash(md5_hash_location,
2260
                                               md5,
2261
                                               NULL, dst, req,
2262
                                               tcp_hdr(skb), sk->sk_protocol,
2263
                                               skb->len);
2264
        }
2265
#endif
2266
 
2267
        return skb;
2268
}
2269
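/*
 * Illustrative sketch, not part of the original file: the SYN-ACK header
 * size computed above, written out with the usual option lengths (tcphdr 20,
 * MSS 4, aligned timestamps 12, aligned wscale 4, aligned SACK-permitted 4
 * bytes).  With MSS, timestamps, wscale and SACK all negotiated this gives
 * 20 + 4 + 12 + 4 = 40 bytes (SACK-permitted rides in the timestamp NOPs);
 * without timestamps it is 20 + 4 + 4 + 4 = 32 bytes.  MD5, if configured,
 * adds another 20.
 */
static inline int tcp_synack_header_size_example(int tstamp_ok, int wscale_ok,
                                                 int sack_ok)
{
        return 20                                   /* struct tcphdr */
             + 4                                    /* TCPOLEN_MSS */
             + (tstamp_ok ? 12 : 0)                 /* TCPOLEN_TSTAMP_ALIGNED */
             + (wscale_ok ? 4 : 0)                  /* TCPOLEN_WSCALE_ALIGNED */
             + (sack_ok && !tstamp_ok ? 4 : 0);     /* TCPOLEN_SACKPERM_ALIGNED */
}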
 
2270
/*
2271
 * Do all connect socket setups that can be done AF independent.
2272
 */
2273
static void tcp_connect_init(struct sock *sk)
2274
{
2275
        struct dst_entry *dst = __sk_dst_get(sk);
2276
        struct tcp_sock *tp = tcp_sk(sk);
2277
        __u8 rcv_wscale;
2278
 
2279
        /* We'll fix this up when we get a response from the other end.
2280
         * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
2281
         */
2282
        tp->tcp_header_len = sizeof(struct tcphdr) +
2283
                (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
2284
 
2285
#ifdef CONFIG_TCP_MD5SIG
2286
        if (tp->af_specific->md5_lookup(sk, sk) != NULL)
2287
                tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
2288
#endif
2289
 
2290
        /* If user gave his TCP_MAXSEG, record it to clamp */
2291
        if (tp->rx_opt.user_mss)
2292
                tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
2293
        tp->max_window = 0;
2294
        tcp_mtup_init(sk);
2295
        tcp_sync_mss(sk, dst_mtu(dst));
2296
 
2297
        if (!tp->window_clamp)
2298
                tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2299
        tp->advmss = dst_metric(dst, RTAX_ADVMSS);
2300
        tcp_initialize_rcv_mss(sk);
2301
 
2302
        tcp_select_initial_window(tcp_full_space(sk),
2303
                                  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
2304
                                  &tp->rcv_wnd,
2305
                                  &tp->window_clamp,
2306
                                  sysctl_tcp_window_scaling,
2307
                                  &rcv_wscale);
2308
 
2309
        tp->rx_opt.rcv_wscale = rcv_wscale;
2310
        tp->rcv_ssthresh = tp->rcv_wnd;
2311
 
2312
        sk->sk_err = 0;
2313
        sock_reset_flag(sk, SOCK_DONE);
2314
        tp->snd_wnd = 0;
2315
        tcp_init_wl(tp, tp->write_seq, 0);
2316
        tp->snd_una = tp->write_seq;
2317
        tp->snd_sml = tp->write_seq;
2318
        tp->rcv_nxt = 0;
2319
        tp->rcv_wup = 0;
2320
        tp->copied_seq = 0;
2321
 
2322
        inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
2323
        inet_csk(sk)->icsk_retransmits = 0;
2324
        tcp_clear_retrans(tp);
2325
}
2326
 
2327
/*
2328
 * Build a SYN and send it off.
2329
 */
2330
int tcp_connect(struct sock *sk)
2331
{
2332
        struct tcp_sock *tp = tcp_sk(sk);
2333
        struct sk_buff *buff;
2334
 
2335
        tcp_connect_init(sk);
2336
 
2337
        buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
2338
        if (unlikely(buff == NULL))
2339
                return -ENOBUFS;
2340
 
2341
        /* Reserve space for headers. */
2342
        skb_reserve(buff, MAX_TCP_HEADER);
2343
 
2344
        TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
2345
        TCP_ECN_send_syn(sk, buff);
2346
        TCP_SKB_CB(buff)->sacked = 0;
2347
        skb_shinfo(buff)->gso_segs = 1;
2348
        skb_shinfo(buff)->gso_size = 0;
2349
        skb_shinfo(buff)->gso_type = 0;
2350
        buff->csum = 0;
2351
        tp->snd_nxt = tp->write_seq;
2352
        TCP_SKB_CB(buff)->seq = tp->write_seq++;
2353
        TCP_SKB_CB(buff)->end_seq = tp->write_seq;
2354
 
2355
        /* Send it off. */
2356
        TCP_SKB_CB(buff)->when = tcp_time_stamp;
2357
        tp->retrans_stamp = TCP_SKB_CB(buff)->when;
2358
        skb_header_release(buff);
2359
        __tcp_add_write_queue_tail(sk, buff);
2360
        sk_charge_skb(sk, buff);
2361
        tp->packets_out += tcp_skb_pcount(buff);
2362
        tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
2363
 
2364
        /* We change tp->snd_nxt after the tcp_transmit_skb() call
2365
         * in order to make this packet get counted in tcpOutSegs.
2366
         */
2367
        tp->snd_nxt = tp->write_seq;
2368
        tp->pushed_seq = tp->write_seq;
2369
        TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
2370
 
2371
        /* Timer for repeating the SYN until an answer. */
2372
        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2373
                                  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
2374
        return 0;
2375
}
2376
 
2377
/* Send out a delayed ack, the caller does the policy checking
2378
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
2379
 * for details.
2380
 */
2381
void tcp_send_delayed_ack(struct sock *sk)
2382
{
2383
        struct inet_connection_sock *icsk = inet_csk(sk);
2384
        int ato = icsk->icsk_ack.ato;
2385
        unsigned long timeout;
2386
 
2387
        if (ato > TCP_DELACK_MIN) {
2388
                const struct tcp_sock *tp = tcp_sk(sk);
2389
                int max_ato = HZ/2;
2390
 
2391
                if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
2392
                        max_ato = TCP_DELACK_MAX;
2393
 
2394
                /* Slow path, intersegment interval is "high". */
2395
 
2396
                /* If some rtt estimate is known, use it to bound delayed ack.
2397
                 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
2398
                 * directly.
2399
                 */
2400
                if (tp->srtt) {
2401
                        int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
2402
 
2403
                        if (rtt < max_ato)
2404
                                max_ato = rtt;
2405
                }
2406
 
2407
                ato = min(ato, max_ato);
2408
        }
2409
 
2410
        /* Stay within the limit we were given */
2411
        timeout = jiffies + ato;
2412
 
2413
        /* Use the new timeout only if there wasn't an older one earlier. */
2414
        if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
2415
                /* If delack timer was blocked or is about to expire,
2416
                 * send ACK now.
2417
                 */
2418
                if (icsk->icsk_ack.blocked ||
2419
                    time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
2420
                        tcp_send_ack(sk);
2421
                        return;
2422
                }
2423
 
2424
                if (!time_before(timeout, icsk->icsk_ack.timeout))
2425
                        timeout = icsk->icsk_ack.timeout;
2426
        }
2427
        icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
2428
        icsk->icsk_ack.timeout = timeout;
2429
        sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
2430
}
2431
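/*
 * Illustrative sketch, not part of the original file: how the delayed-ACK
 * timeout is bounded by the measured RTT above.  tp->srtt stores the
 * smoothed RTT scaled by 8, so with HZ = 1000 and an RTT of roughly 100 ms
 * (srtt = 800) the bound is 800 >> 3 = 100 jiffies, well below the HZ/2
 * slow-path ceiling, and ato is clamped to about one RTT.
 */
static inline int tcp_delack_bound_example(int ato, int srtt, int max_ato)
{
        int rtt = srtt >> 3;            /* undo the 8x scaling */

        if (rtt && rtt < max_ato)
                max_ato = rtt;
        return ato < max_ato ? ato : max_ato;
}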
 
2432
/* This routine sends an ack and also updates the window. */
2433
void tcp_send_ack(struct sock *sk)
2434
{
2435
        /* If we have been reset, we may not send again. */
2436
        if (sk->sk_state != TCP_CLOSE) {
2437
                struct sk_buff *buff;
2438
 
2439
                /* We are not putting this on the write queue, so
2440
                 * tcp_transmit_skb() will set the ownership to this
2441
                 * sock.
2442
                 */
2443
                buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
2444
                if (buff == NULL) {
2445
                        inet_csk_schedule_ack(sk);
2446
                        inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
2447
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
2448
                                                  TCP_DELACK_MAX, TCP_RTO_MAX);
2449
                        return;
2450
                }
2451
 
2452
                /* Reserve space for headers and prepare control bits. */
2453
                skb_reserve(buff, MAX_TCP_HEADER);
2454
                buff->csum = 0;
2455
                TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
2456
                TCP_SKB_CB(buff)->sacked = 0;
2457
                skb_shinfo(buff)->gso_segs = 1;
2458
                skb_shinfo(buff)->gso_size = 0;
2459
                skb_shinfo(buff)->gso_type = 0;
2460
 
2461
                /* Send it off, this clears delayed acks for us. */
2462
                TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk);
2463
                TCP_SKB_CB(buff)->when = tcp_time_stamp;
2464
                tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
2465
        }
2466
}
2467
 
2468
/* This routine sends a packet with an out of date sequence
2469
 * number. It assumes the other end will try to ack it.
2470
 *
2471
 * Question: what should we do while in urgent mode?
2472
 * 4.4BSD forces sending single byte of data. We cannot send
2473
 * out of window data, because we have SND.NXT==SND.MAX...
2474
 *
2475
 * Current solution: to send TWO zero-length segments in urgent mode:
2476
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
2477
 * out-of-date with SND.UNA-1 to probe window.
2478
 */
2479
static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
2480
{
2481
        struct tcp_sock *tp = tcp_sk(sk);
2482
        struct sk_buff *skb;
2483
 
2484
        /* We don't queue it, tcp_transmit_skb() sets ownership. */
2485
        skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
2486
        if (skb == NULL)
2487
                return -1;
2488
 
2489
        /* Reserve space for headers and set control bits. */
2490
        skb_reserve(skb, MAX_TCP_HEADER);
2491
        skb->csum = 0;
2492
        TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
2493
        TCP_SKB_CB(skb)->sacked = urgent;
2494
        skb_shinfo(skb)->gso_segs = 1;
2495
        skb_shinfo(skb)->gso_size = 0;
2496
        skb_shinfo(skb)->gso_type = 0;
2497
 
2498
        /* Use a previous sequence.  This should cause the other
2499
         * end to send an ack.  Don't queue or clone SKB, just
2500
         * send it.
2501
         */
2502
        TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
2503
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
2504
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
2505
        return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
2506
}
2507
 
2508
int tcp_write_wakeup(struct sock *sk)
2509
{
2510
        if (sk->sk_state != TCP_CLOSE) {
2511
                struct tcp_sock *tp = tcp_sk(sk);
2512
                struct sk_buff *skb;
2513
 
2514
                if ((skb = tcp_send_head(sk)) != NULL &&
2515
                    before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
2516
                        int err;
2517
                        unsigned int mss = tcp_current_mss(sk, 0);
2518
                        unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
2519
 
2520
                        if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
2521
                                tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
2522
 
2523
                        /* We are probing the opening of a window
2524
                         * but the window size is != 0
2525
                         * which must have been a result of SWS avoidance (sender).
2526
                         */
2527
                        if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
2528
                            skb->len > mss) {
2529
                                seg_size = min(seg_size, mss);
2530
                                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
2531
                                if (tcp_fragment(sk, skb, seg_size, mss))
2532
                                        return -1;
2533
                        } else if (!tcp_skb_pcount(skb))
2534
                                tcp_set_skb_tso_segs(sk, skb, mss);
2535
 
2536
                        TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
2537
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
2538
                        err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2539
                        if (!err) {
2540
                                update_send_head(sk, skb);
2541
                        }
2542
                        return err;
2543
                } else {
2544
                        if (tp->urg_mode &&
2545
                            between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
2546
                                tcp_xmit_probe_skb(sk, TCPCB_URG);
2547
                        return tcp_xmit_probe_skb(sk, 0);
2548
                }
2549
        }
2550
        return -1;
2551
}
2552
 
2553
/* A window probe timeout has occurred.  If window is not closed send
2554
 * a partial packet else a zero probe.
2555
 */
2556
void tcp_send_probe0(struct sock *sk)
2557
{
2558
        struct inet_connection_sock *icsk = inet_csk(sk);
2559
        struct tcp_sock *tp = tcp_sk(sk);
2560
        int err;
2561
 
2562
        err = tcp_write_wakeup(sk);
2563
 
2564
        if (tp->packets_out || !tcp_send_head(sk)) {
2565
                /* Cancel probe timer, if it is not required. */
2566
                icsk->icsk_probes_out = 0;
2567
                icsk->icsk_backoff = 0;
2568
                return;
2569
        }
2570
 
2571
        if (err <= 0) {
2572
                if (icsk->icsk_backoff < sysctl_tcp_retries2)
2573
                        icsk->icsk_backoff++;
2574
                icsk->icsk_probes_out++;
2575
                inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
2576
                                          min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
2577
                                          TCP_RTO_MAX);
2578
        } else {
2579
                /* If the packet was not sent due to local congestion,
2580
                 * do not back off and do not remember icsk_probes_out.
2581
                 * Let local senders fight for local resources.
2582
                 *
2583
                 * Still use the accumulated backoff, though.
2584
                 */
2585
                if (!icsk->icsk_probes_out)
2586
                        icsk->icsk_probes_out = 1;
2587
                inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
2588
                                          min(icsk->icsk_rto << icsk->icsk_backoff,
2589
                                              TCP_RESOURCE_PROBE_INTERVAL),
2590
                                          TCP_RTO_MAX);
2591
        }
2592
}
2593
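/*
 * Illustrative sketch, not part of the original file: the exponential probe
 * backoff used above.  With an assumed icsk_rto of 200 ms, successive zero
 * window probes fire after roughly 200, 400, 800, 1600 ms, ... up to the
 * TCP_RTO_MAX cap; when the previous probe failed only for local reasons,
 * the interval is capped at TCP_RESOURCE_PROBE_INTERVAL instead and the
 * backoff counter is not advanced.
 */
static inline unsigned long tcp_probe0_when_example(unsigned long rto,
                                                    int backoff,
                                                    unsigned long cap)
{
        unsigned long when = rto << backoff;

        return when < cap ? when : cap;
}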
 
2594
EXPORT_SYMBOL(tcp_connect);
2595
EXPORT_SYMBOL(tcp_make_synack);
2596
EXPORT_SYMBOL(tcp_simple_retransmit);
2597
EXPORT_SYMBOL(tcp_sync_mss);
2598
EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
2599
EXPORT_SYMBOL(tcp_mtup_init);
