struct tcp_sock {
	/* inet_connection_sock has to be the first member of tcp_sock */
	struct inet_connection_sock	inet_conn;
	u16	tcp_header_len;	/* Bytes of tcp header to send */
	u16	xmit_size_goal_segs; /* Goal for segmenting output packets */

	/*
	 *	Header prediction flags
	 *	0x5?10 << 16 + snd_wnd in net byte order
	 */
	__be32	pred_flags;

	/*
	 *	RFC793 variables by their proper names. This means you can
	 *	read the code and the spec side by side (and laugh ...)
	 *	See RFC793 and RFC1122. The RFC writes these in capitals.
	 */
	u32	rcv_nxt;	/* What we want to receive next */
	u32	copied_seq;	/* Head of yet unread data */
	u32	rcv_wup;	/* rcv_nxt on last window update sent */
	u32	snd_nxt;	/* Next sequence we send */

	u32	snd_una;	/* First byte we want an ack for */
	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
	u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */

	/* Data for direct copy to user */
	struct {
		struct sk_buff_head	prequeue;
		struct task_struct	*task;
		struct iovec		*iov;
		int			memory;
		int			len;
#ifdef CONFIG_NET_DMA
		/* members for async copy */
		struct dma_chan		*dma_chan;
		int			wakeup;
		struct dma_pinned_list	*pinned_list;
		dma_cookie_t		dma_cookie;
#endif
	} ucopy;

	u32	snd_wl1;	/* Sequence for window update */
	u32	snd_wnd;	/* The window we expect to receive */
	u32	max_window;	/* Maximal window ever seen from peer */
	u32	mss_cache;	/* Cached effective mss, not including SACKS */

	u32	window_clamp;	/* Maximal window to advertise */
	u32	rcv_ssthresh;	/* Current window clamp */

	u32	frto_highmark;	/* snd_nxt when RTO occurred */
	u16	advmss;		/* Advertised MSS */
	u8	frto_counter;	/* Number of new acks after RTO */
	u8	nonagle     : 4,/* Disable Nagle algorithm? */
		thin_lto    : 1,/* Use linear timeouts for thin streams */
		thin_dupack : 1,/* Fast retransmit on first dupack */
		repair      : 1,
		unused      : 1;
	u8	repair_queue;
	u8	do_early_retrans:1,	/* Enable RFC5827 early-retransmit */
		early_retrans_delayed:1; /* Delayed ER timer installed */

	/* RTT measurement */
	u32	srtt;		/* smoothed round trip time << 3 */
	u32	mdev;		/* medium deviation */
	u32	mdev_max;	/* maximal mdev for the last rtt period */
	u32	rttvar;		/* smoothed mdev_max */
	u32	rtt_seq;	/* sequence number to update rttvar */

	u32	packets_out;	/* Packets which are "in flight" */
	u32	retrans_out;	/* Retransmitted packets out */

	u16	urg_data;	/* Saved octet of OOB data and control flags */
	u8	ecn_flags;	/* ECN status bits. */
	u8	reordering;	/* Packet reordering metric. */
	u32	snd_up;		/* Urgent pointer */

	u8	keepalive_probes; /* num of allowed keep alive probes */

	/*
	 *	Options received (usually on last packet, some only on SYN packets).
	 */
	struct tcp_options_received rx_opt;

	/*
	 *	Slow start and congestion control (see also Nagle, and Karn & Partridge)
	 */
	u32	snd_ssthresh;	/* Slow start size threshold */
	u32	snd_cwnd;	/* Sending congestion window */
	u32	snd_cwnd_cnt;	/* Linear increase counter */
	u32	snd_cwnd_clamp;	/* Do not allow snd_cwnd to grow above this */
	u32	snd_cwnd_used;
	u32	snd_cwnd_stamp;
	u32	prior_cwnd;	/* Congestion window at start of Recovery. */
	u32	prr_delivered;	/* Number of newly delivered packets to
				 * receiver in Recovery. */
	u32	prr_out;	/* Total number of pkts sent during Recovery. */

	u32	rcv_wnd;	/* Current receiver window */
	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
	u32	pushed_seq;	/* Last pushed seq, required to talk to windows */
	u32	lost_out;	/* Lost packets */
	u32	sacked_out;	/* SACK'd packets */
	u32	fackets_out;	/* FACK'd packets */
	u32	tso_deferred;
	u32	bytes_acked;	/* Appropriate Byte Counting - RFC3465 */

	struct sk_buff_head	out_of_order_queue; /* Out of order segments go here */

	/* SACKs data, these 2 need to be together (see tcp_options_write) */
	struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
	struct tcp_sack_block selective_acks[4]; /* The SACKS themselves */

	struct tcp_sack_block recv_sack_cache[4];

	struct sk_buff *highest_sack;	/* skb just after the highest
					 * skb with SACKed bit set
					 * (validity guaranteed only if
					 * sacked_out > 0)
					 */

	int	lost_cnt_hint;
	u32	retransmit_high;	/* L-bits may be on up to this seqno */

	u32	lost_retrans_low;	/* Sent seq after any rxmit (lowest) */

	u32	prior_ssthresh;	/* ssthresh saved at recovery start */
	u32	high_seq;	/* snd_nxt at onset of congestion */

	u32	retrans_stamp;	/* Timestamp of the last retransmit,
				 * also used in SYN-SENT to remember stamp of
				 * the first SYN. */
	u32	undo_marker;	/* tracking retrans started here. */
	int	undo_retrans;	/* number of undoable retransmissions. */
	u32	total_retrans;	/* Total retransmits for entire connection */

	u32	urg_seq;	/* Seq of received urgent pointer */
	unsigned int	keepalive_time;		/* time before keep alive takes place */
	unsigned int	keepalive_intvl;	/* time interval between keep alive probes */

	/* When the cookie options are generated and exchanged, then this
	 * object holds a reference to them (cookie_values->kref). Also
	 * contains related tcp_cookie_transactions fields. */
	struct tcp_cookie_values *cookie_values;
};
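Note the fixed-point convention flagged in the RTT fields above: srtt is stored left-shifted by 3, and mdev is kept with a similar scaling (by 4, although the comment does not spell that out), so the estimator runs on integer shifts instead of divisions. The standalone sketch below illustrates that convention with RFC 6298-style smoothing (alpha = 1/8, beta = 1/4). It is only an illustration: struct rtt_est and rtt_sample() are made-up names, not the kernel's tcp_rtt_estimator(), and the mdev scaling is an assumption.

#include <stdint.h>
#include <stdio.h>

struct rtt_est {
	uint32_t srtt;	/* smoothed RTT, stored << 3 like tcp_sock.srtt */
	uint32_t mdev;	/* mean deviation, stored << 2 (assumed scaling) */
};

/* Feed one RTT measurement m (unit does not matter: ms, jiffies, ...). */
static void rtt_sample(struct rtt_est *e, uint32_t m)
{
	if (e->srtt == 0) {		/* first measurement, RFC 6298 2.2 */
		e->srtt = m << 3;	/* SRTT   = R          */
		e->mdev = m << 1;	/* RTTVAR = R/2 (scaled by 4) */
		return;
	}
	int32_t err = (int32_t)m - (int32_t)(e->srtt >> 3);	/* R' - SRTT */
	e->srtt += err;			/* SRTT += 1/8 * (R' - SRTT) */
	if (err < 0)
		err = -err;
	/* RTTVAR += 1/4 * (|R' - SRTT| - RTTVAR), in the x4 fixed point */
	e->mdev += err - (int32_t)(e->mdev >> 2);
}

int main(void)
{
	struct rtt_est e = { 0, 0 };
	uint32_t samples[] = { 100, 110, 95, 300, 105 };
	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		rtt_sample(&e, samples[i]);
		/* RTO = SRTT + 4*RTTVAR == (srtt >> 3) + mdev in this scaling */
		printf("rtt=%3u  srtt=%3u  rto=%3u\n",
		       samples[i], e.srtt >> 3, (e.srtt >> 3) + e.mdev);
	}
	return 0;
}

With this scaling the classic RTO formula SRTT + 4*RTTVAR collapses to (srtt >> 3) + mdev, which is exactly why keeping mdev pre-multiplied by four is convenient.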
	switch (icsk->icsk_ca_state) {
	......
		/* Loss is undone; fall through to processing in Open state. */
	// If we reach the default case the state can be Disorder, Open, CWR, or Loss (after the fall-through above).
	default:
		// If SACK is disabled, emulate it with Reno-style dupack accounting.
		if (tcp_is_reno(tp)) {
			if (flag & FLAG_SND_UNA_ADVANCED)
				tcp_reset_reno_sack(tp);
			if (is_dupack)
				tcp_add_reno_sack(sk);
		}

		// Try to undo the reduction based on D-SACK information.
		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
			tcp_try_undo_dsack(sk);

		// Is it time to enter the Recovery state?
		if (!tcp_time_to_recover(sk, flag)) {
			// If not, check whether we should move to CWR or Disorder instead.
			tcp_try_to_open(sk, flag);
			return;
		}

		/* MTU probe failure: don't reduce cwnd */
		if (icsk->icsk_ca_state < TCP_CA_CWR &&
		    icsk->icsk_mtup.probe_size &&
		    tp->snd_una == tp->mtu_probe.probe_seq_start) {
			tcp_mtup_probe_failed(sk);
			/* Restores the reduction we did in tcp_mtup_probe() */
			tp->snd_cwnd++;
			tcp_simple_retransmit(sk);
			return;
		}

		// None of the early exits applied, so we finally enter Recovery.
		/* Otherwise enter Recovery state */
		tcp_enter_recovery(sk, (flag & FLAG_ECE));
		fast_rexmit = 1;
	}
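For orientation, here is roughly the bookkeeping that entering Recovery has to do, reconstructed only from the field comments in struct tcp_sock above (undo_marker, undo_retrans, prior_ssthresh, prior_cwnd, prr_delivered, prr_out, high_seq). The names mini_tcp and enter_recovery() are hypothetical; this is a simplified sketch, not the kernel's tcp_enter_recovery().

#include <stdint.h>

/* Hypothetical, simplified model; field meanings follow the struct
 * tcp_sock comments quoted earlier. */
struct mini_tcp {
	uint32_t snd_una, snd_nxt;
	uint32_t snd_cwnd, snd_ssthresh;
	uint32_t retrans_out;
	uint32_t prior_ssthresh;	/* ssthresh saved at recovery start */
	uint32_t prior_cwnd;		/* cwnd at start of Recovery (for PRR) */
	uint32_t prr_delivered, prr_out;
	uint32_t undo_marker;		/* tracking retrans started here */
	uint32_t undo_retrans;		/* retransmits that must prove spurious */
	uint32_t high_seq;		/* snd_nxt at onset of congestion */
	enum { CA_Open, CA_Disorder, CA_CWR, CA_Recovery, CA_Loss } ca_state;
};

static void enter_recovery(struct mini_tcp *tp, int ece_ack)
{
	/* Remember what is needed to undo the reduction later, as
	 * tcp_try_undo_recovery()/tcp_try_undo_partial() below rely on. */
	tp->undo_marker = tp->snd_una;
	tp->undo_retrans = tp->retrans_out;
	/* After an ECE the reduction is mandated, so forbid undoing ssthresh. */
	tp->prior_ssthresh = ece_ack ? 0 : tp->snd_ssthresh;

	/* Bookkeeping used by proportional rate reduction during Recovery. */
	tp->prior_cwnd = tp->snd_cwnd;
	tp->prr_delivered = 0;
	tp->prr_out = 0;

	tp->high_seq = tp->snd_nxt;		/* Recovery ends once this is ACKed */
	tp->snd_ssthresh = tp->snd_cwnd / 2;	/* placeholder: the real value
						 * comes from the CC module */
	tp->ca_state = CA_Recovery;
}

The point of undo_marker, undo_retrans and prior_ssthresh is exactly what the undo functions below depend on: if later evidence (D-SACK or timestamps) shows the retransmissions were spurious, the reduction recorded here can be rolled back.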
	/* Trick#3 : when we use RFC2988 timer restart, fast
	 * retransmit can be triggered by timeout of queue head.
	 */
	// The head of the write queue has timed out (the retransmit timer is
	// reset on every (re)transmission), so go into Recovery.
	if (tcp_is_fack(tp) && tcp_head_timedout(sk))
		return true;

	/* Trick#4: It is still not OK... But will it be useful to delay
	 * recovery more?
	 */
	packets_out = tp->packets_out;
	// Not an obvious check: we cannot send anything new (limited by the
	// receiver window or by the application) and at least half of what is
	// in flight has already been SACKed, so delaying recovery gains nothing.
	if (packets_out <= tp->reordering &&
	    tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
	    !tcp_may_send_now(sk)) {
		/* We have nothing to send. This connection is limited
		 * either by receiver window or by application.
		 */
		return true;
	}

	/* If a thin stream is detected, retransmit after first
	 * received dupack. Employ only if SACK is supported in order
	 * to avoid possible corner-case series of spurious retransmissions
	 * Use only if there are no unsent data.
	 */
	// Handle thin streams.
	if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
	    tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
	    tcp_is_sack(tp) && !tcp_send_head(sk))
		return true;

	/* Trick#6: TCP early retransmit, per RFC5827.  To avoid spurious
	 * retransmissions due to small network reorderings, we implement
	 * Mitigation A.3 in the RFC and delay the retransmission for a short
	 * interval if appropriate.
	 */
	// Handle early retransmit (RFC 5827).
	if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
	    (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
	    !tcp_may_send_now(sk))
		return !tcp_pause_early_retransmit(sk, flag);
static int tcp_try_undo_partial(struct sock *sk, int acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* Partial ACK arrived. Force Hoe's retransmit. */
	int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering);

	// Check whether the cwnd reduction can be undone.
	if (tcp_may_undo(tp)) {
		/* Plain luck! Hole is filled with delayed
		 * packet, rather than with a retransmit.
		 */
		if (!tcp_any_retrans_done(sk))
			tp->retrans_stamp = 0;

		// Update the reordering metric.
		tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);

		/* So... Do not make Hoe's retransmit yet.
		 * If the first packet was delayed, the rest
		 * ones are most probably delayed as well.
		 */
		failed = 0;
	}
	return failed;
}
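tcp_may_undo() itself does not appear in these excerpts. Paraphrased from the same kernel generation (simplified and from memory, so treat it as an approximation rather than a verbatim quote), the check is: an undo point must have been recorded, and either every retransmission has already been proven unnecessary by D-SACKs (undo_retrans has dropped to 0), or the echoed timestamp shows the peer ACKed data sent before our first retransmission.

/* Approximate paraphrase, not a verbatim kernel quote. */
static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
	/* The echoed timestamp predates our first retransmission, so the
	 * original transmission, not the retransmit, is what got ACKed. */
	return !tp->retrans_stamp ||
	       (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
		before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
}

static inline bool tcp_may_undo(const struct tcp_sock *tp)
{
	/* undo_marker was set when the reduction started; undo_retrans
	 * counts retransmits not yet proven spurious by D-SACK. */
	return tp->undo_marker &&
	       (!tp->undo_retrans || tcp_packet_delayed(tp));
}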
	case TCP_CA_Loss:
		icsk->icsk_retransmits = 0;
		// Try to undo the congestion-state change.
		if (tcp_try_undo_recovery(sk))
			return;
		break;

	case TCP_CA_CWR:
		/* CWR is to be held until something *above* high_seq
		 * is ACKed, so that the CWR bit reaches the receiver. */
		if (tp->snd_una != tp->high_seq) {
			tcp_complete_cwr(sk);
			// Return to the Open state.
			tcp_set_ca_state(sk, TCP_CA_Open);
		}
		break;

	case TCP_CA_Recovery:
		if (tcp_is_reno(tp))
			tcp_reset_reno_sack(tp);
		// Try to undo the congestion-state change.
		if (tcp_try_undo_recovery(sk))
			return;
		// Complete the CWR phase.
		tcp_complete_cwr(sk);
		break;
	}
static bool tcp_try_undo_recovery(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	// Can the cwnd/ssthresh reduction be undone?
	if (tcp_may_undo(tp)) {
		int mib_idx;

		/* Happy end! We did not retransmit anything
		 * or our original transmission succeeded.
		 */
		DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
		// Undo the reduction.
		tcp_undo_cwr(sk, true);
		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
			mib_idx = LINUX_MIB_TCPLOSSUNDO;
		else
			mib_idx = LINUX_MIB_TCPFULLUNDO;

		NET_INC_STATS_BH(sock_net(sk), mib_idx);
		tp->undo_marker = 0;
	}
	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
		/* Hold old state until something *above* high_seq
		 * is ACKed. For Reno it is MUST to prevent false
		 * fast retransmits (RFC2582). SACK TCP is safe. */
		tcp_moderate_cwnd(tp);
		return true;
	}
	// Set the state to Open.
	tcp_set_ca_state(sk, TCP_CA_Open);
	return false;
}