/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_output.c,v 1.1.1.1 2004-04-15 01:13:57 phoenix Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *		Cacophonix Gaul :	draft-minshall-nagle-01
 *		J Hadi Salim	:	ECN support
 *
 */

#include <net/tcp.h>

#include <linux/compiler.h>
#include <linux/smp_lock.h>

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;

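/* Advance the send head past an skb that has just been transmitted,
 * update snd_nxt to the end of that segment, and arm the retransmit
 * timer when it is the first packet put in flight.
 */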
static __inline__
void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
{
	tp->send_head = skb->next;
	if (tp->send_head == (struct sk_buff *) &sk->write_queue)
		tp->send_head = NULL;
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
	if (tp->packets_out++ == 0)
		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}

/* SND.NXT, if window was not shrunk.
 * If window has been shrunk, what should we make? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
{
	if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
		return tp->snd_nxt;
	else
		return tp->snd_una+tp->snd_wnd;
}

/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MSS.
 * 4. We do not make 3, we advertise MSS, calculated from first
 *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
 *    This may be overridden via information stored in routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct dst_entry *dst = __sk_dst_get(sk);
	int mss = tp->advmss;

	if (dst && dst->advmss < mss) {
		mss = dst->advmss;
		tp->advmss = mss;
	}

	return (__u16)mss;
}

/* RFC2861. Reset CWND after an idle period longer than RTO to the
 * "restart window". This is the first part of the cwnd validation
 * mechanism.
 */
static void tcp_cwnd_restart(struct tcp_opt *tp)
{
	s32 delta = tcp_time_stamp - tp->lsndtime;
	u32 restart_cwnd = tcp_init_cwnd(tp);
	u32 cwnd = tp->snd_cwnd;

	tp->snd_ssthresh = tcp_current_ssthresh(tp);
	restart_cwnd = min(restart_cwnd, cwnd);

	while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_time_stamp;
	tp->snd_cwnd_used = 0;
}

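/* Bookkeeping after new data has been sent: restart the congestion
 * window if the connection has been idle for more than one RTO
 * (RFC2861), record the send time, and enter pingpong (delayed ACK)
 * mode when this looks like a reply to recently received data.
 */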
static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
{
	u32 now = tcp_time_stamp;

	if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
		tcp_cwnd_restart(tp);

	tp->lsndtime = now;

	/* If this is a reply sent within 'ato' after the last
	 * received packet, enter pingpong mode.
	 */
	if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
		tp->ack.pingpong = 1;
}

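/* An ACK has just been sent (possibly piggybacked on data): consume
 * one quick-ACK credit and cancel the delayed ACK timer.
 */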
static __inline__ void tcp_event_ack_sent(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	tcp_dec_quickack_mode(tp);
	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
}

/* Choose a new window to advertise, update state in tcp_opt for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static __inline__ u16 tcp_select_window(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if(new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time.  --DaveM
		 *
		 * Relax Will Robinson.
		 */
		new_win = cur_win;
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* RFC1323 scaling applied */
	new_win >>= tp->rcv_wscale;

	/* If we advertise zero window, disable fast path. */
	if (new_win == 0)
		tp->pred_flags = 0;

	return new_win;
}


/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	if(skb != NULL) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
		int tcp_header_size = tp->tcp_header_len;
		struct tcphdr *th;
		int sysctl_flags;
		int err;

#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

		sysctl_flags = 0;
		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
			if(sysctl_tcp_timestamps) {
				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
			}
			if(sysctl_tcp_window_scaling) {
				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_WSCALE;
			}
			if(sysctl_tcp_sack) {
				sysctl_flags |= SYSCTL_FLAG_SACK;
				if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
					tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
			}
		} else if (tp->eff_sacks) {
			/* A SACK is 2 pad bytes, a 2 byte header, plus
			 * 2 32-bit sequence numbers for each SACK block.
			 */
			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
					    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
		}
		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
		skb->h.th = th;
		skb_set_owner_w(skb, sk);

		/* Build TCP header and checksum it. */
		th->source		= sk->sport;
		th->dest		= sk->dport;
		th->seq			= htonl(tcb->seq);
		th->ack_seq		= htonl(tp->rcv_nxt);
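		/* Write the data offset (header length in 32-bit words) and
		 * the flag bits with a single 16-bit store into the header.
		 */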
		*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | tcb->flags);
		if (tcb->flags & TCPCB_FLAG_SYN) {
			/* RFC1323: The window in SYN & SYN/ACK segments
			 * is never scaled.
			 */
			th->window	= htons(tp->rcv_wnd);
		} else {
			th->window	= htons(tcp_select_window(sk));
		}
		th->check		= 0;
		th->urg_ptr		= 0;

		if (tp->urg_mode &&
		    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
			th->urg_ptr		= htons(tp->snd_up-tcb->seq);
			th->urg			= 1;
		}

		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_syn_build_options((__u32 *)(th + 1),
					      tcp_advertise_mss(sk),
					      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
					      (sysctl_flags & SYSCTL_FLAG_SACK),
					      (sysctl_flags & SYSCTL_FLAG_WSCALE),
					      tp->rcv_wscale,
					      tcb->when,
					      tp->ts_recent);
		} else {
			tcp_build_and_update_options((__u32 *)(th + 1),
						     tp, tcb->when);

			TCP_ECN_send(sk, tp, skb, tcp_header_size);
		}
		tp->af_specific->send_check(sk, th, skb->len, skb);

		if (tcb->flags & TCPCB_FLAG_ACK)
			tcp_event_ack_sent(sk);

		if (skb->len != tcp_header_size)
			tcp_event_data_sent(tp, skb);

		TCP_INC_STATS(TcpOutSegs);

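		/* Hand the finished segment to the AF-specific output
		 * routine (ip_queue_xmit() for IPv4).
		 */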
		err = tp->af_specific->queue_xmit(skb, 0);
		if (err <= 0)
			return err;

		tcp_enter_cwr(tp);

		/* NET_XMIT_CN is special. It does not guarantee that this
		 * packet is lost. It tells us that the device is about to
		 * start dropping packets, or already drops some packets of
		 * the same priority, and invokes us to send less
		 * aggressively.
		 */
		return err == NET_XMIT_CN ? 0 : err;
	}
	return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}


/* This is the main buffer sending routine. We queue the buffer
 * and decide whether to queue or transmit now.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
	__skb_queue_tail(&sk->write_queue, skb);
	tcp_charge_skb(sk, skb);

	if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, tp->nonagle)) {
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tcp_minshall_update(tp, cur_mss, skb);
			if (tp->packets_out++ == 0)
				tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
			return;
		}
	}
	/* Queue it, remembering where we must start sending. */
	if (tp->send_head == NULL)
		tp->send_head = skb;
}

/* Send _single_ skb sitting at the send head. This function requires
 * true push pending frames to setup probe timer etc.
 */
void tcp_push_one(struct sock *sk, unsigned cur_mss)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb = tp->send_head;

	if (tcp_snd_test(tp, skb, cur_mss, 1)) {
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
			tp->send_head = NULL;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			if (tp->packets_out++ == 0)
				tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
			return;
		}
	}
}

/* Split a fragmented skb into two parts at length len. */

static void skb_split(struct sk_buff *skb, struct sk_buff *skb1, u32 len)
{
	int i;
	int pos = skb->len - skb->data_len;

	if (len < pos) {
		/* Split line is inside header. */
		memcpy(skb_put(skb1, pos-len), skb->data + len, pos-len);

		/* And move data appendix as is. */
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];

		skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
		skb_shinfo(skb)->nr_frags = 0;

		skb1->data_len = skb->data_len;
		skb1->len += skb1->data_len;
		skb->data_len = 0;
		skb->len = len;
		skb->tail = skb->data+len;
	} else {
		int k = 0;
		int nfrags = skb_shinfo(skb)->nr_frags;

		/* Second chunk has no header, nothing to copy. */

		skb_shinfo(skb)->nr_frags = 0;
		skb1->len = skb1->data_len = skb->len - len;
		skb->len = len;
		skb->data_len = len - pos;

		for (i=0; i<nfrags; i++) {
			int size = skb_shinfo(skb)->frags[i].size;
			if (pos + size > len) {
				skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];

				if (pos < len) {
					/* Split frag.
					 * We have two variants in this case:
					 * 1. Move all the frag to the second
					 *    part, if it is possible. F.e.
					 *    this approach is mandatory for TUX,
					 *    where splitting is expensive.
					 * 2. Split accurately. This is what we do.
					 */
					get_page(skb_shinfo(skb)->frags[i].page);
					skb_shinfo(skb1)->frags[0].page_offset += (len-pos);
					skb_shinfo(skb1)->frags[0].size -= (len-pos);
					skb_shinfo(skb)->frags[i].size = len-pos;
					skb_shinfo(skb)->nr_frags++;
				}
				k++;
			} else {
				skb_shinfo(skb)->nr_frags++;
			}
			pos += size;
		}
		skb_shinfo(skb1)->nr_frags = k;
	}
}

/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct sk_buff *buff;
	int nsize = skb->len - len;
	u16 flags;

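	/* If the skb is a shared clone with paged data, make the header
	 * private first so that it can be trimmed safely below.
	 */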
	if (skb_cloned(skb) &&
	    skb_is_nonlinear(skb) &&
	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	/* Get a new skb... force flag on. */
	buff = tcp_alloc_skb(sk, nsize, GFP_ATOMIC);
	if (buff == NULL)
		return -ENOMEM; /* We'll just try again later. */
	tcp_charge_skb(sk, buff);

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
	TCP_SKB_CB(buff)->flags = flags;
|
448 |
|
|
if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
|
449 |
|
|
tp->lost_out++;
|
450 |
|
|
tp->left_out++;
|
451 |
|
|
}
|
452 |
|
|
TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
|
453 |
|
|
|
454 |
|
|
if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
|
455 |
|
|
/* Copy and checksum data tail into the new buffer. */
|
456 |
|
|
buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
|
457 |
|
|
nsize, 0);
|
458 |
|
|
|
459 |
|
|
skb_trim(skb, len);
|
460 |
|
|
|
461 |
|
|
skb->csum = csum_block_sub(skb->csum, buff->csum, len);
|
462 |
|
|
} else {
|
463 |
|
|
skb->ip_summed = CHECKSUM_HW;
|
464 |
|
|
skb_split(skb, buff, len);
|
465 |
|
|
}
|
466 |
|
|
|
467 |
|
|
buff->ip_summed = skb->ip_summed;
|
468 |
|
|
|
469 |
|
|
/* Looks stupid, but our code really uses when of
|
470 |
|
|
* skbs, which it never sent before. --ANK
|
471 |
|
|
*/
|
472 |
|
|
TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
|
473 |
|
|
|
474 |
|
|
/* Link BUFF into the send queue. */
|
475 |
|
|
__skb_append(skb, buff);
|
476 |
|
|
|
477 |
|
|
return 0;
|
478 |
|
|
}
|
479 |
|
|
|
480 |
|
|
/* This function synchronize snd mss to current pmtu/exthdr set.
|
481 |
|
|
|
482 |
|
|
tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
|
483 |
|
|
for TCP options, but includes only bare TCP header.
|
484 |
|
|
|
485 |
|
|
tp->mss_clamp is mss negotiated at connection setup.
|
486 |
|
|
It is minumum of user_mss and mss received with SYN.
|
487 |
|
|
It also does not include TCP options.
|
488 |
|
|
|
489 |
|
|
tp->pmtu_cookie is last pmtu, seen by this function.
|
490 |
|
|
|
491 |
|
|
tp->mss_cache is current effective sending mss, including
|
492 |
|
|
all tcp options except for SACKs. It is evaluated,
|
493 |
|
|
taking into account current pmtu, but never exceeds
|
494 |
|
|
tp->mss_clamp.
|
495 |
|
|
|
496 |
|
|
NOTE1. rfc1122 clearly states that advertised MSS
|
497 |
|
|
DOES NOT include either tcp or ip options.
|
498 |
|
|
|
499 |
|
|
NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
|
500 |
|
|
this function. --ANK (980731)
|
501 |
|
|
*/
|
502 |
|
|
|
503 |
|
|
int tcp_sync_mss(struct sock *sk, u32 pmtu)
|
504 |
|
|
{
|
505 |
|
|
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
|
506 |
|
|
int mss_now;
|
507 |
|
|
|
508 |
|
|
/* Calculate base mss without TCP options:
|
509 |
|
|
It is MMS_S - sizeof(tcphdr) of rfc1122
|
510 |
|
|
*/
|
511 |
|
|
|
512 |
|
|
mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
|
513 |
|
|
|
514 |
|
|
/* Clamp it (mss_clamp does not include tcp options) */
|
515 |
|
|
if (mss_now > tp->mss_clamp)
|
516 |
|
|
mss_now = tp->mss_clamp;
|
517 |
|
|
|
518 |
|
|
/* Now subtract optional transport overhead */
|
519 |
|
|
mss_now -= tp->ext_header_len;
|
520 |
|
|
|
521 |
|
|
/* Then reserve room for full set of TCP options and 8 bytes of data */
|
522 |
|
|
if (mss_now < 48)
|
523 |
|
|
mss_now = 48;
|
524 |
|
|
|
525 |
|
|
/* Now subtract TCP options size, not including SACKs */
|
526 |
|
|
mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
|
527 |
|
|
|
528 |
|
|
/* Bound mss with half of window */
|
529 |
|
|
if (tp->max_window && mss_now > (tp->max_window>>1))
|
530 |
|
|
mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
|
531 |
|
|
|
532 |
|
|
/* And store cached results */
|
533 |
|
|
tp->pmtu_cookie = pmtu;
|
534 |
|
|
tp->mss_cache = mss_now;
|
535 |
|
|
return mss_now;
|
536 |
|
|
}
|
537 |
|
|
|
538 |
|
|
|
539 |
|
|
/* This routine writes packets to the network. It advances the
|
540 |
|
|
* send_head. This happens as incoming acks open up the remote
|
541 |
|
|
* window for us.
|
542 |
|
|
*
|
543 |
|
|
* Returns 1, if no segments are in flight and we have queued segments, but
|
544 |
|
|
* cannot send anything now because of SWS or another problem.
|
545 |
|
|
*/
|
546 |
|
|
int tcp_write_xmit(struct sock *sk, int nonagle)
|
547 |
|
|
{
|
548 |
|
|
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
|
549 |
|
|
unsigned int mss_now;
|
550 |
|
|
|
551 |
|
|
/* If we are closed, the bytes will have to remain here.
|
552 |
|
|
* In time closedown will finish, we empty the write queue and all
|
553 |
|
|
* will be happy.
|
554 |
|
|
*/
|
555 |
|
|
if(sk->state != TCP_CLOSE) {
|
556 |
|
|
struct sk_buff *skb;
|
557 |
|
|
int sent_pkts = 0;
|
558 |
|
|
|
559 |
|
|
/* Account for SACKS, we may need to fragment due to this.
|
560 |
|
|
* It is just like the real MSS changing on us midstream.
|
561 |
|
|
* We also handle things correctly when the user adds some
|
562 |
|
|
* IP options mid-stream. Silly to do, but cover it.
|
563 |
|
|
*/
|
564 |
|
|
mss_now = tcp_current_mss(sk);
|
565 |
|
|
|
566 |
|
|
while((skb = tp->send_head) &&
|
567 |
|
|
tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? nonagle : 1)) {
|
568 |
|
|
if (skb->len > mss_now) {
|
569 |
|
|
if (tcp_fragment(sk, skb, mss_now))
|
570 |
|
|
break;
|
571 |
|
|
}
|
572 |
|
|
|
573 |
|
|
TCP_SKB_CB(skb)->when = tcp_time_stamp;
|
574 |
|
|
if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
|
575 |
|
|
break;
|
576 |
|
|
/* Advance the send_head. This one is sent out. */
|
577 |
|
|
update_send_head(sk, tp, skb);
|
578 |
|
|
tcp_minshall_update(tp, mss_now, skb);
|
579 |
|
|
sent_pkts = 1;
|
580 |
|
|
}
|
581 |
|
|
|
582 |
|
|
if (sent_pkts) {
|
583 |
|
|
tcp_cwnd_validate(sk, tp);
|
584 |
|
|
return 0;
|
585 |
|
|
}
|
586 |
|
|
|
587 |
|
|
return !tp->packets_out && tp->send_head;
|
588 |
|
|
}
|
589 |
|
|
return 0;
|
590 |
|
|
}
|
591 |
|
|
|
592 |
|
|
/* This function returns the amount that we can raise the
|
593 |
|
|
* usable window based on the following constraints
|
594 |
|
|
*
|
595 |
|
|
* 1. The window can never be shrunk once it is offered (RFC 793)
|
596 |
|
|
* 2. We limit memory per socket
|
597 |
|
|
*
|
598 |
|
|
* RFC 1122:
|
599 |
|
|
* "the suggested [SWS] avoidance algorithm for the receiver is to keep
|
600 |
|
|
* RECV.NEXT + RCV.WIN fixed until:
|
601 |
|
|
* RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
|
602 |
|
|
*
|
603 |
|
|
* i.e. don't raise the right edge of the window until you can raise
|
604 |
|
|
* it at least MSS bytes.
|
605 |
|
|
*
|
606 |
|
|
* Unfortunately, the recommended algorithm breaks header prediction,
|
607 |
|
|
* since header prediction assumes th->window stays fixed.
|
608 |
|
|
*
|
609 |
|
|
* Strictly speaking, keeping th->window fixed violates the receiver
|
610 |
|
|
* side SWS prevention criteria. The problem is that under this rule
|
611 |
|
|
* a stream of single byte packets will cause the right side of the
|
612 |
|
|
* window to always advance by a single byte.
|
613 |
|
|
*
|
614 |
|
|
* Of course, if the sender implements sender side SWS prevention
|
615 |
|
|
* then this will not be a problem.
|
616 |
|
|
*
|
617 |
|
|
* BSD seems to make the following compromise:
|
618 |
|
|
*
|
619 |
|
|
* If the free space is less than the 1/4 of the maximum
|
620 |
|
|
* space available and the free space is less than 1/2 mss,
|
621 |
|
|
* then set the window to 0.
|
622 |
|
|
* [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
|
623 |
|
|
* Otherwise, just prevent the window from shrinking
|
624 |
|
|
* and from being larger than the largest representable value.
|
625 |
|
|
*
|
626 |
|
|
* This prevents incremental opening of the window in the regime
|
627 |
|
|
* where TCP is limited by the speed of the reader side taking
|
628 |
|
|
* data out of the TCP receive queue. It does nothing about
|
629 |
|
|
* those cases where the window is constrained on the sender side
|
630 |
|
|
* because the pipeline is full.
|
631 |
|
|
*
|
632 |
|
|
* BSD also seems to "accidentally" limit itself to windows that are a
|
633 |
|
|
* multiple of MSS, at least until the free space gets quite small.
|
634 |
|
|
* This would appear to be a side effect of the mbuf implementation.
|
635 |
|
|
* Combining these two algorithms results in the observed behavior
|
636 |
|
|
* of having a fixed window size at almost all times.
|
637 |
|
|
*
|
638 |
|
|
* Below we obtain similar behavior by forcing the offered window to
|
639 |
|
|
* a multiple of the mss when it is feasible to do so.
|
640 |
|
|
*
|
641 |
|
|
* Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
|
642 |
|
|
* Regular options like TIMESTAMP are taken into account.
|
643 |
|
|
*/
|
644 |
|
|
u32 __tcp_select_window(struct sock *sk)
|
645 |
|
|
{
|
646 |
|
|
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
|
647 |
|
|
/* MSS for the peer's data. Previous verions used mss_clamp
|
648 |
|
|
* here. I don't know if the value based on our guesses
|
649 |
|
|
* of peer's MSS is better for the performance. It's more correct
|
650 |
|
|
* but may be worse for the performance because of rcv_mss
|
651 |
|
|
* fluctuations. --SAW 1998/11/1
|
652 |
|
|
*/
|
653 |
|
|
int mss = tp->ack.rcv_mss;
|
654 |
|
|
int free_space = tcp_space(sk);
|
655 |
|
|
int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
|
656 |
|
|
int window;
|
657 |
|
|
|
658 |
|
|
if (mss > full_space)
|
659 |
|
|
mss = full_space;
|
660 |
|
|
|
661 |
|
|
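	/* Receiver-side SWS avoidance: once less than half of the receive
	 * buffer is free, stop quick ACKs, clamp rcv_ssthresh under memory
	 * pressure, and offer a zero window if not even one MSS is free.
	 */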
	if (free_space < full_space/2) {
		tp->ack.quick = 0;

		if (tcp_memory_pressure)
			tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);

		if (free_space < mss)
			return 0;
	}

	if (free_space > tp->rcv_ssthresh)
		free_space = tp->rcv_ssthresh;

	/* Get the largest window that is a nice multiple of mss.
	 * Window clamp already applied above.
	 * If our current window offering is within 1 mss of the
	 * free space we just keep it. This prevents the divide
	 * and multiply from happening most of the time.
	 * We also don't do any window rounding when the free space
	 * is too small.
	 */
	window = tp->rcv_wnd;
	if (window <= free_space - mss || window > free_space)
		window = (free_space/mss)*mss;

	return window;
}

/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct sk_buff *next_skb = skb->next;

	/* The first test we must make is that neither of these two
	 * SKB's are still referenced by someone else.
	 */
	if(!skb_cloned(skb) && !skb_cloned(next_skb)) {
		int skb_size = skb->len, next_skb_size = next_skb->len;
		u16 flags = TCP_SKB_CB(skb)->flags;

		/* Also punt if next skb has been SACK'd. */
		if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
			return;

		/* Next skb is out of window. */
		if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
			return;

		/* Punt if not enough space exists in the first SKB for
		 * the data in the second, or the total combined payload
		 * would exceed the MSS.
		 */
		if ((next_skb_size > skb_tailroom(skb)) ||
		    ((skb_size + next_skb_size) > mss_now))
			return;

		/* Ok.  We will be able to collapse the packet. */
		__skb_unlink(next_skb, next_skb->list);

		memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);

		if (next_skb->ip_summed == CHECKSUM_HW)
			skb->ip_summed = CHECKSUM_HW;

		if (skb->ip_summed != CHECKSUM_HW)
			skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);

		/* Update sequence range on original skb. */
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

		/* Merge over control information. */
		flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
		TCP_SKB_CB(skb)->flags = flags;

		/* All done, get rid of second SKB and account for it so
		 * packet counting does not break.
		 */
		TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
			tp->retrans_out--;
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
			tp->lost_out--;
			tp->left_out--;
		}
		/* Reno case is special. Sigh... */
		if (!tp->sack_ok && tp->sacked_out) {
			tp->sacked_out--;
			tp->left_out--;
		}

		/* Not quite right: it can be > snd.fack, but
		 * it is better to underestimate fackets.
		 */
		if (tp->fackets_out)
			tp->fackets_out--;
		tcp_free_skb(sk, next_skb);
		tp->packets_out--;
	}
}

/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer. This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;
	unsigned int mss = tcp_current_mss(sk);
	int lost = 0;

	for_retrans_queue(skb, sk, tp) {
		if (skb->len > mss &&
		    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
			if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				tp->retrans_out--;
			}
			if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
				tp->lost_out++;
				lost = 1;
			}
		}
	}

	if (!lost)
		return;

	tcp_sync_left_out(tp);

	/* Don't muck with the congestion window here.
	 * Reason is that we do not increase amount of _data_
	 * in network, but units changed and effective
	 * cwnd/ssthresh really reduced now.
	 */
	if (tp->ca_state != TCP_CA_Loss) {
		tp->high_seq = tp->snd_nxt;
		tp->snd_ssthresh = tcp_current_ssthresh(tp);
		tp->prior_ssthresh = 0;
		tp->undo_marker = 0;
		tp->ca_state = TCP_CA_Loss;
	}
	tcp_xmit_retransmit_queue(sk);
}

/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int cur_mss = tcp_current_mss(sk);
	int err;

	/* Do not send more than we queued. 1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
	if (atomic_read(&sk->wmem_alloc) > min(sk->wmem_queued+(sk->wmem_queued>>2),sk->sndbuf))
		return -EAGAIN;

	/* If receiver has shrunk his window, and skb is out of
	 * new window, do not retransmit it. The exception is the
	 * case, when window is shrunk to zero. In this case
	 * our retransmit serves as a zero window probe.
	 */
	if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
	    && TCP_SKB_CB(skb)->seq != tp->snd_una)
		return -EAGAIN;

	if(skb->len > cur_mss) {
		if(tcp_fragment(sk, skb, cur_mss))
			return -ENOMEM; /* We'll try again later. */

		/* New SKB created, account for it. */
		tp->packets_out++;
	}

	/* Collapse two adjacent packets if worthwhile and we can. */
	if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
	   (skb->len < (cur_mss >> 1)) &&
	   (skb->next != tp->send_head) &&
	   (skb->next != (struct sk_buff *)&sk->write_queue) &&
	   (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
	   (sysctl_tcp_retrans_collapse != 0))
		tcp_retrans_try_collapse(sk, skb, cur_mss);

	if(tp->af_specific->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	/* Some Solaris stacks overoptimize and ignore the FIN on a
	 * retransmit when old data is attached.  So strip it off
	 * since it is cheap to do so and saves bytes on the network.
	 */
	if(skb->len > 0 &&
	   (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
	   tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
		if (!pskb_trim(skb, 0)) {
			TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
		}
	}

	/* Make a copy, if the first transmission SKB clone we made
	 * is still in somebody's hands, else make a clone.
	 */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;

	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
				    pskb_copy(skb, GFP_ATOMIC):
				    skb_clone(skb, GFP_ATOMIC)));

	if (err == 0) {
		/* Update global TCP statistics. */
		TCP_INC_STATS(TcpRetransSegs);

#if FASTRETRANS_DEBUG > 0
		if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
			if (net_ratelimit())
				printk(KERN_DEBUG "retrans_out leaked.\n");
		}
#endif
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		tp->retrans_out++;

		/* Save stamp of the first retransmit. */
		if (!tp->retrans_stamp)
			tp->retrans_stamp = TCP_SKB_CB(skb)->when;

		tp->undo_retrans++;

		/* snd_nxt is stored to detect loss of retransmitted segment,
		 * see tcp_input.c tcp_sacktag_write_queue().
		 */
		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
	}
	return err;
}

/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;
	int packet_cnt = tp->lost_out;

	/* First pass: retransmit lost packets. */
	if (packet_cnt) {
		for_retrans_queue(skb, sk, tp) {
			__u8 sacked = TCP_SKB_CB(skb)->sacked;

			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
				return;

			if (sacked&TCPCB_LOST) {
				if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
					if (tcp_retransmit_skb(sk, skb))
						return;
					if (tp->ca_state != TCP_CA_Loss)
						NET_INC_STATS_BH(TCPFastRetrans);
					else
						NET_INC_STATS_BH(TCPSlowStartRetrans);

					if (skb == skb_peek(&sk->write_queue))
						tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
				}

				if (--packet_cnt <= 0)
					break;
			}
		}
	}

	/* OK, demanded retransmission is finished. */

	/* Forward retransmissions are possible only during Recovery. */
	if (tp->ca_state != TCP_CA_Recovery)
		return;

	/* No forward retransmissions in Reno are possible. */
	if (!tp->sack_ok)
		return;

	/* Yeah, we have to make a difficult choice between forward
	 * transmission and retransmission... Both ways have their
	 * merits...
	 *
	 * For now we do not retransmit anything, while we have some new
	 * segments to send.
	 */

	if (tcp_may_send_now(sk, tp))
		return;

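	/* Second pass: forward-retransmit segments that are still untagged,
	 * staying within the FACK'ed region and the congestion window.
	 */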
	packet_cnt = 0;

	for_retrans_queue(skb, sk, tp) {
		if(++packet_cnt > tp->fackets_out)
			break;

		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
			break;

		if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
			continue;

		/* Ok, retransmit it. */
		if(tcp_retransmit_skb(sk, skb))
			break;

		if (skb == skb_peek(&sk->write_queue))
			tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);

		NET_INC_STATS_BH(TCPForwardRetrans);
	}
}


/* Send a fin.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
	unsigned int mss_now;

	/* Optimization, tack on the FIN if we have a queue of
	 * unsent frames.  But be careful about outgoing SACKS
	 * and IP options.
	 */
	mss_now = tcp_current_mss(sk);

	if(tp->send_head != NULL) {
		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
		TCP_SKB_CB(skb)->end_seq++;
		tp->write_seq++;
	} else {
		/* Socket is locked, keep trying until memory is available. */
		for (;;) {
			skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
			if (skb)
				break;
			yield();
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(skb, MAX_TCP_HEADER);
		skb->csum = 0;
		TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
		TCP_SKB_CB(skb)->sacked = 0;

		/* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
		TCP_SKB_CB(skb)->seq = tp->write_seq;
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
		tcp_send_skb(sk, skb, 1, mss_now);
	}
	__tcp_push_pending_frames(sk, tp, mss_now, 1);
}

/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk, int priority)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	/* NOTE: No TCP options attached and we never retransmit this. */
	skb = alloc_skb(MAX_TCP_HEADER, priority);
	if (!skb) {
		NET_INC_STATS(TCPAbortFailed);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
	TCP_SKB_CB(skb)->sacked = 0;

	/* Send it off. */
	TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	if (tcp_transmit_skb(sk, skb))
		NET_INC_STATS(TCPAbortFailed);
}

/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
	struct sk_buff* skb;

	skb = skb_peek(&sk->write_queue);
	if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
		return -EFAULT;
	}
	if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
		if (skb_cloned(skb)) {
			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
			if (nskb == NULL)
				return -ENOMEM;
			__skb_unlink(skb, &sk->write_queue);
			__skb_queue_head(&sk->write_queue, nskb);
			tcp_free_skb(sk, skb);
			tcp_charge_skb(sk, nskb);
			skb = nskb;
		}

		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
		TCP_ECN_send_synack(&sk->tp_pinfo.af_tcp, skb);
	}
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
}

/*
 * Prepare a SYN-ACK.
 */
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
				 struct open_request *req)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct tcphdr *th;
	int tcp_header_size;
	struct sk_buff *skb;

	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
	if (skb == NULL)
		return NULL;

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_TCP_HEADER);

	skb->dst = dst_clone(dst);

	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
			   (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
			   (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
			   /* SACK_PERM is in the place of NOP NOP of TS */
			   ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
	skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

	memset(th, 0, sizeof(struct tcphdr));
	th->syn = 1;
	th->ack = 1;
	TCP_ECN_make_synack(req, th);
	th->source = sk->sport;
	th->dest = req->rmt_port;
	TCP_SKB_CB(skb)->seq = req->snt_isn;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
	th->seq = htonl(TCP_SKB_CB(skb)->seq);
	th->ack_seq = htonl(req->rcv_isn + 1);
	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
		__u8 rcv_wscale;
		/* Set this up on the first call only */
		req->window_clamp = tp->window_clamp ? : dst->window;
		/* tcp_full_space because it is guaranteed to be the first packet */
		tcp_select_initial_window(tcp_full_space(sk),
			dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
			&req->rcv_wnd,
			&req->window_clamp,
			req->wscale_ok,
			&rcv_wscale);
		req->rcv_wscale = rcv_wscale;
	}

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(req->rcv_wnd);

	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
			      req->sack_ok, req->wscale_ok, req->rcv_wscale,
			      TCP_SKB_CB(skb)->when,
			      req->ts_recent);

	skb->csum = 0;
	th->doff = (tcp_header_size >> 2);
	TCP_INC_STATS(TcpOutSegs);
	return skb;
}

/*
 * Do all connect socket setups that can be done AF independent.
 */
static inline void tcp_connect_init(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr) +
		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->user_mss)
		tp->mss_clamp = tp->user_mss;
	tp->max_window = 0;
	tcp_sync_mss(sk, dst->pmtu);

	if (!tp->window_clamp)
		tp->window_clamp = dst->window;
	tp->advmss = dst->advmss;
	tcp_initialize_rcv_mss(sk);

	tcp_select_initial_window(tcp_full_space(sk),
		tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
		&tp->rcv_wnd,
		&tp->window_clamp,
		sysctl_tcp_window_scaling,
		&tp->rcv_wscale);

	tp->rcv_ssthresh = tp->rcv_wnd;

	sk->err = 0;
	sk->done = 0;
	tp->snd_wnd = 0;
	tcp_init_wl(tp, tp->write_seq, 0);
	tp->snd_una = tp->write_seq;
	tp->snd_sml = tp->write_seq;
	tp->rcv_nxt = 0;
	tp->rcv_wup = 0;
	tp->copied_seq = 0;

	tp->rto = TCP_TIMEOUT_INIT;
	tp->retransmits = 0;
	tcp_clear_retrans(tp);
}

/*
 * Build a SYN and send it off.
 */
int tcp_connect(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *buff;

	tcp_connect_init(sk);

	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation);
	if (unlikely(buff == NULL))
		return -ENOBUFS;

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_TCP_HEADER);

	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
	TCP_ECN_send_syn(tp, buff);
	TCP_SKB_CB(buff)->sacked = 0;
	buff->csum = 0;
	TCP_SKB_CB(buff)->seq = tp->write_seq++;
	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;

	/* Send it off. */
	TCP_SKB_CB(buff)->when = tcp_time_stamp;
	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
	__skb_queue_tail(&sk->write_queue, buff);
	tcp_charge_skb(sk, buff);
	tp->packets_out++;
	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
	TCP_INC_STATS(TcpActiveOpens);

	/* Timer for repeating the SYN until an answer. */
	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
	return 0;
}

/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int ato = tp->ack.ato;
	unsigned long timeout;

	if (ato > TCP_DELACK_MIN) {
		int max_ato = HZ/2;

		if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
			max_ato = TCP_DELACK_MAX;

		/* Slow path, intersegment interval is "high". */

		/* If some rtt estimate is known, use it to bound delayed ack.
		 * Do not use tp->rto here, use results of rtt measurements
		 * directly.
		 */
		if (tp->srtt) {
			int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);

			if (rtt < max_ato)
				max_ato = rtt;
		}

		ato = min(ato, max_ato);
	}

	/* Stay within the limit we were given */
	timeout = jiffies + ato;

	/* Use new timeout only if there wasn't an older one earlier. */
	if (tp->ack.pending&TCP_ACK_TIMER) {
		/* If delack timer was blocked or is about to expire,
		 * send ACK now.
		 */
		if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
			tcp_send_ack(sk);
			return;
		}

		if (!time_before(timeout, tp->ack.timeout))
			timeout = tp->ack.timeout;
	}
	tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
	tp->ack.timeout = timeout;
	if (!mod_timer(&tp->delack_timer, timeout))
		sock_hold(sk);
}

/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
	/* If we have been reset, we may not send again. */
	if(sk->state != TCP_CLOSE) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct sk_buff *buff;

		/* We are not putting this on the write queue, so
		 * tcp_transmit_skb() will set the ownership to this
		 * sock.
		 */
		buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
		if (buff == NULL) {
			tcp_schedule_ack(tp);
			tp->ack.ato = TCP_ATO_MIN;
			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
			return;
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(buff, MAX_TCP_HEADER);
		buff->csum = 0;
		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
		TCP_SKB_CB(buff)->sacked = 0;

		/* Send it off, this clears delayed acks for us. */
		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
		TCP_SKB_CB(buff)->when = tcp_time_stamp;
		tcp_transmit_skb(sk, buff);
	}
}

/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 *
 * Question: what should we do while in urgent mode?
 * 4.4BSD forces sending single byte of data. We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: to send TWO zero-length segments in urgent mode:
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
 * out-of-date with SND.UNA-1 to probe window.
 */
static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	/* We don't queue it, tcp_transmit_skb() sets ownership. */
	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
	if (skb == NULL)
		return -1;

	/* Reserve space for headers and set control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
	TCP_SKB_CB(skb)->sacked = urgent;

	/* Use a previous sequence.  This should cause the other
	 * end to send an ack.  Don't queue or clone SKB, just
	 * send it.
	 */
	TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb);
}

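/* Window probe helper: if the peer's window allows any of the data at
 * the send head, push (a fragment of) that segment; otherwise emit a
 * zero window probe with an out-of-date sequence number.
 */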
int tcp_write_wakeup(struct sock *sk)
{
	if (sk->state != TCP_CLOSE) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct sk_buff *skb;

		if ((skb = tp->send_head) != NULL &&
		    before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
			int err;
			int mss = tcp_current_mss(sk);
			int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;

			if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
				tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

			/* We are probing the opening of a window
			 * but the window size is != 0;
			 * must have been a result of SWS avoidance (sender)
			 */
			if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
			    skb->len > mss) {
				seg_size = min(seg_size, mss);
				TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
				if (tcp_fragment(sk, skb, seg_size))
					return -1;
			}
			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			if (!err) {
				update_send_head(sk, tp, skb);
			}
			return err;
		} else {
			if (tp->urg_mode &&
			    between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
				tcp_xmit_probe_skb(sk, TCPCB_URG);
			return tcp_xmit_probe_skb(sk, 0);
		}
	}
	return -1;
}

/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int err;

	err = tcp_write_wakeup(sk);

	if (tp->packets_out || !tp->send_head) {
		/* Cancel probe timer, if it is not required. */
		tp->probes_out = 0;
		tp->backoff = 0;
		return;
	}

	if (err <= 0) {
		if (tp->backoff < sysctl_tcp_retries2)
			tp->backoff++;
		tp->probes_out++;
		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
				      min(tp->rto << tp->backoff, TCP_RTO_MAX));
	} else {
		/* If packet was not sent due to local congestion,
		 * do not backoff and do not remember probes_out.
		 * Let local senders fight for local resources.
		 *
		 * Still use the accumulated backoff, though.
		 */
		if (!tp->probes_out)
			tp->probes_out=1;
		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
				      min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
	}
}