001 // vim: set ft=c:
002 
003 // https://tools.ietf.org/html/rfc793
004 
005 // See https://en.wikipedia.org/wiki/File:Tcp_state_diagram_fixed_new.svg
// Connection states, named after the RFC 793 state machine.
#define TCP_STATE_CLOSED        0
#define TCP_STATE_LISTEN        1
#define TCP_STATE_SYN_SENT      2
#define TCP_STATE_SYN_RECEIVED  3
#define TCP_STATE_ESTABLISHED   4
#define TCP_STATE_FIN_WAIT_1    5
#define TCP_STATE_FIN_WAIT_2    6
#define TCP_STATE_CLOSE_WAIT    7
#define TCP_STATE_CLOSING       8
#define TCP_STATE_LAST_ACK      9
#define TCP_STATE_TIME_WAIT     10

// Handshake timeout in milliseconds (converted to jiffies by connect/accept).
#define TCP_CONNECT_TIMEOUT     10000

// Maximum segment size assigned to every new socket.
#define TCP_DEFAULT_MSS         536

// Size in bytes of each socket's receive ring buffer and initial rcv_wnd.
// The ring buffer index arithmetic masks with (size - 1), so this must stay
// a power of two.
#define TCP_WINDOW_SIZE         8192

// Bit masks for CTcpHeader.flags.
#define TCP_FLAG_FIN            0x01
#define TCP_FLAG_SYN            0x02
#define TCP_FLAG_RST            0x04
#define TCP_FLAG_PSH            0x08
#define TCP_FLAG_ACK            0x10
#define TCP_FLAG_URG            0x20

// Retransmission timing. SRTT_ALPHA is the weight kept from the previous
// smoothed RTT; the timeout is RTO_BETA * srtt clamped to [RTO_MIN, RTO_MAX]
// (same unit as tS, i.e. seconds).
#define TCP_SRTT_ALPHA          0.9
#define TCP_RTO_MIN             0.2
#define TCP_RTO_MAX             10
#define TCP_RTO_BETA            2
035 
// On-wire TCP header without options. Multi-byte fields are stored in
// network byte order; this file converts them with htons/htonl on send and
// ntohs/ntohl on receive.
class CTcpHeader {
  U16 source_port;
  U16 dest_port;
  U32 seq;
  U32 ack;
  U8 data_offset;     // upper 4 bits: header length in 32-bit words
  U8 flags;           // combination of TCP_FLAG_*
  U16 window_size;
  U16 checksum;
  U16 urgent_pointer;
};
047 
// Bookkeeping for one sent segment that is kept for possible retransmission.
// The saved segment bytes follow this header directly in the same allocation
// (see TcpPacketFinish). Entries form a singly linked list per socket.
class CTcpSendBufHeader {
  CTcpSendBufHeader* next;

  F64 time_sent;    // tS at the time of the most recent transmission
  U32 length;       // number of saved segment bytes following this header
  U32 retries;
  U32 seq_start;    // sequence number taken from the segment's header
  U32 seq_end;      // snd_nxt right after the segment was queued
};
057 
// Per-connection TCP state. Begins with the generic CSocket so that the
// socket layer can dispatch through the callbacks installed by TcpSocket().
class CTcpSocket {
  CSocket sock;

  I64 state;      // one of TCP_STATE_*

  U32 local_addr;
  U16 local_port;

  U32 remote_addr;
  U32 remote_port;  // NOTE(review): U32 here while local_port is U16 - confirm intent

  U32 snd_una;    // seq number of first unacknowledged octet
  U32 snd_nxt;    // seq number of next octet to send
  U32 snd_wnd;    // allowed number of unacknowledged outgoing octets
  U32 mss;        // maximum segment size

  U32 rcv_nxt;    // seq number of next octet to receive
  U32 rcv_wnd;    // allowed number of unacknowledged incoming octets

  F64 conntime;   // tS when the handshake was started
  F64 srtt;       // smoothed round-trip time, same unit as tS

  // Receive ring buffer; positions wrap with (recv_buf_size - 1) as a mask.
  // read_pos == write_pos means the buffer is empty.
  I64 recv_buf_size;
  U8* recv_buf;
  I64 recv_buf_read_pos;
  I64 recv_buf_write_pos;

  // Accept backlog: a listening socket owns the list first..last, and each
  // queued socket links to the next one through backlog_next.
  CTcpSocket* backlog_next;
  CTcpSocket* backlog_first;
  CTcpSocket* backlog_last;
  I64 backlog_remaining;    // free slots left in the backlog

  // Sent-but-unacknowledged segments kept for retransmission.
  CTcpSendBufHeader* send_buf_first;
  CTcpSendBufHeader* send_buf_last;

  //I64 rcvtimeo_ms;
  //I64 recv_maxtime;
};
096 
// Pseudo header that is summed into the TCP checksum ahead of the real
// segment (see TcpPacketFinish). It is never transmitted itself.
class CTcpPseudoHeader {
  U32 source_addr;
  U32 dest_addr;
  U8 zeros;         // always 0
  U8 protocol;      // IP_PROTO_TCP
  U16 tcp_length;   // TCP header + payload length, network byte order
};
104 
// Lookup table from local port number to the socket that receives segments
// for that port; allocated and zeroed by TcpInit (65536 pointer entries).
// TODO: this takes up half a meg, change it to a binary tree or something
static CTcpSocket** tcp_bound_sockets;

// Counter used by TcpSocketConnect to choose a local port in 0x8000..0xffff.
static U16 tcp_next_source_port = RandU16();
109 
// Tells whether 'state' is one of the synchronized connection states:
// ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT, CLOSING, LAST_ACK or
// TIME_WAIT. Returns TRUE for those and FALSE for every other value.
static Bool TcpIsSynchronizedState(I64 state) {
  switch (state) {
    case TCP_STATE_ESTABLISHED:
    case TCP_STATE_FIN_WAIT_1:
    case TCP_STATE_FIN_WAIT_2:
    case TCP_STATE_CLOSE_WAIT:
    case TCP_STATE_CLOSING:
    case TCP_STATE_LAST_ACK:
    case TCP_STATE_TIME_WAIT:
      return TRUE;
  }

  return FALSE;
}
116 
// Adds the 16-bit words of 'header' to the running one's-complement sum
// 'sum' WITHOUT folding, so that the result can be fed to
// TcpFinalChecksum as its starting value.
//
//   sum    - running sum so far (0 to start)
//   header - bytes to add
//   length - number of bytes; a trailing odd byte is ignored, so this is
//            meant for even-sized prefixes such as CTcpPseudoHeader
//
// Returns the unfolded 32-bit sum. The return type is U32 (not U16) so the
// carries accumulated above bit 15 are preserved until the final fold.
static U32 TcpPartialChecksum(U32 sum, U8* header, I64 length) {
  I64 nleft = length;
  U16* w = header;

  while (nleft > 1)  {
    sum += *(w++);
    nleft -= 2;
  }

  return sum;
}
128 
// Finishes an Internet checksum: adds the 16-bit words of 'header' to 'sum',
// includes a trailing odd byte if there is one, folds the carries back into
// the low 16 bits and returns the one's complement of the result.
//
//   sum    - starting value, e.g. the result of TcpPartialChecksum
//   header - bytes to add
//   length - number of bytes (may be odd)
static U16 TcpFinalChecksum(U32 sum, U8* header, I64 length) {
  I64 nleft = length;
  U16* w = header;

  while (nleft > 1)  {
    sum += *(w++);
    nleft -= 2;
  }

  // mop up an odd byte, if necessary; read it as a single byte so that we
  // never touch memory past the end of the buffer
  if (nleft == 1) {
    sum += *(w(U8*));
  }

  // add back carry outs from top 16 bits to low 16 bits
  sum = (sum >> 16) + (sum & 0xffff); // add hi 16 to low 16
  sum += (sum >> 16);         // add carry
  return (~sum) & 0xffff;
}
148 
// Reserves an IPv4 packet large enough for a TCP header plus 'length'
// payload bytes and fills in the TCP header (checksum left at 0).
//
// On success *frame_out points at the payload area right behind the header
// and the packet index from IPv4PacketAlloc is returned; it must later be
// handed to TcpPacketFinish. On failure the negative result of
// IPv4PacketAlloc is returned and *frame_out is not written.
I64 TcpPacketAlloc(U8** frame_out, U32 source_ip, U16 source_port, U32 dest_ip, U16 dest_port,
    U32 seq, U32 ack, U8 flags, I64 length) {
  U8* segment;
  I64 handle = IPv4PacketAlloc(&segment, IP_PROTO_TCP, source_ip, dest_ip,
      sizeof(CTcpHeader) + length);

  if (handle >= 0) {
    CTcpHeader* tcp = segment;

    tcp->source_port = htons(source_port);
    tcp->dest_port = htons(dest_port);
    tcp->seq = htonl(seq);
    tcp->ack = htonl(ack);
    // Header length in 32-bit words lives in the upper nibble.
    tcp->data_offset = (sizeof(CTcpHeader) / 4) << 4;
    tcp->flags = flags;
    tcp->window_size = htons(TCP_WINDOW_SIZE / 2);    // FIXME
    tcp->checksum = 0;
    tcp->urgent_pointer = 0;

    *frame_out = segment + sizeof(CTcpHeader);
  }

  return handle;
}
172 
// Completes a segment started with TcpPacketAlloc: computes the TCP checksum
// over the pseudo header, TCP header and payload, optionally saves a copy of
// the whole segment for retransmission, and hands the packet to the IPv4
// layer.
//
//   index        - packet index returned by TcpPacketAlloc
//   source_ip    - local address (host byte order)
//   dest_ip      - remote address (host byte order)
//   frame        - payload pointer as returned by TcpPacketAlloc; the TCP
//                  header sits immediately before it
//   length       - payload length in bytes
//   send_buf_out - if not NULL, receives a newly MAlloc'd CTcpSendBufHeader
//                  followed by a copy of header + payload. seq_end is left
//                  at 0 and must be set by the caller, which also owns the
//                  allocation.
//
// Returns the result of IPv4PacketFinish.
I64 TcpPacketFinish(I64 index, U32 source_ip, U32 dest_ip, U8* frame, I64 length,
    CTcpSendBufHeader** send_buf_out) {
  CTcpHeader* hdr = frame - sizeof(CTcpHeader);

  CTcpPseudoHeader pseudo;
  pseudo.source_addr = htonl(source_ip);
  pseudo.dest_addr = htonl(dest_ip);
  pseudo.zeros = 0;
  pseudo.protocol = IP_PROTO_TCP;
  pseudo.tcp_length = htons(sizeof(CTcpHeader) + length);

  U32 sum = TcpPartialChecksum(0, &pseudo, sizeof(CTcpPseudoHeader));
  hdr->checksum = TcpFinalChecksum(sum, hdr, sizeof(CTcpHeader) + length);

  if (send_buf_out) {
    CTcpSendBufHeader* sb = MAlloc(sizeof(CTcpSendBufHeader) + sizeof(CTcpHeader) + length);
    sb->next = NULL;
    sb->time_sent = tS;
    sb->length = sizeof(CTcpHeader) + length;
    sb->retries = 0;
    sb->seq_start = ntohl(hdr->seq);
    sb->seq_end = 0;      // NEEDS TO BE SET UPSTREAM

    // The saved copy is retransmitted verbatim as a complete TCP segment, so
    // it has to start at the TCP header, not at the payload.
    MemCpy((sb(U8*)) + sizeof(CTcpSendBufHeader), hdr, sizeof(CTcpHeader) + length);
    *send_buf_out = sb;
  }

  return IPv4PacketFinish(index);
}
202 
// Sends a single payload-less segment with explicitly given addresses,
// ports, sequence/acknowledgment numbers and flags. No socket state is read
// or modified and nothing is kept for retransmission.
// Returns a negative value if the packet could not be allocated, otherwise
// the result of TcpPacketFinish.
I64 TcpSend(U32 local_addr, U16 local_port, U32 remote_addr, U16 remote_port, U32 seq, U32 ack, U8 flags) {
  U8* payload;
  I64 result = TcpPacketAlloc(&payload,
      local_addr, local_port, remote_addr, remote_port,
      seq, ack, flags, 0);

  if (result >= 0)
    result = TcpPacketFinish(result, local_addr, remote_addr, payload, 0, NULL);

  return result;
}
214 
// Sends a payload-less segment on socket 's' using its current snd_nxt and
// rcv_nxt. SYN and FIN each occupy one sequence number, so snd_nxt is
// advanced by one for each of them that is set in 'flags'.
// Returns a negative value if the packet could not be allocated (snd_nxt is
// then untouched), otherwise the result of TcpPacketFinish.
I64 TcpSend2(CTcpSocket* s, U8 flags) {
  U8* payload;
  I64 handle = TcpPacketAlloc(&payload,
      s->local_addr, s->local_port, s->remote_addr, s->remote_port,
      s->snd_nxt, s->rcv_nxt, flags, 0);

  if (handle < 0)
    return handle;

  I64 seq_consumed = 0;

  if (flags & TCP_FLAG_SYN)
    seq_consumed++;

  if (flags & TCP_FLAG_FIN)
    seq_consumed++;

  s->snd_nxt += seq_consumed;

  //"Sent #%d, to %08X, err = %d\n", s->seq, s->remote_addr, error;
  // FIXME: If the packet is SYN or FIN, we also need to queue for retransmit!
  return TcpPacketFinish(handle, s->local_addr, s->remote_addr, payload, 0, NULL);
}
234 
// Sends 'length' bytes of 'data' on socket 's' in one segment and appends a
// copy of the segment to the socket's retransmission chain.
//
// snd_nxt is advanced by the payload length plus one for each of SYN/FIN set
// in 'flags'.
//
// Returns a negative value if the packet could not be allocated (nothing is
// queued and snd_nxt is untouched), otherwise the result of TcpPacketFinish.
// The segment is queued for retransmission even if that result is an error.
I64 TcpSendData2(CTcpSocket* s, U8 flags, U8* data, I64 length) {
  U8* frame;
  I64 index = TcpPacketAlloc(&frame,
      s->local_addr, s->local_port, s->remote_addr, s->remote_port,
      s->snd_nxt, s->rcv_nxt, flags, length);

  if (index < 0)
    return index;

  if (length)
    MemCpy(frame, data, length);

  if (flags & TCP_FLAG_SYN)
    s->snd_nxt++;

  s->snd_nxt += length;

  if (flags & TCP_FLAG_FIN)
    s->snd_nxt++;

  //"Sent #%d, to %08X, err = %d\n", s->seq, s->remote_addr, error;

  CTcpSendBufHeader* sb;
  I64 error = TcpPacketFinish(index, s->local_addr, s->remote_addr, frame, length, &sb);
  sb->seq_end = s->snd_nxt;

  // Append to SendBuf chain
  if (s->send_buf_first)
    s->send_buf_last->next = sb;
  else
    s->send_buf_first = sb;

  s->send_buf_last = sb;

  return error;
}
269 
// Locates the TCP header and payload inside a received IPv4 packet.
//
//   header_out - receives a pointer to the TCP header inside packet->data
//   data_out   - receives a pointer to the first payload byte
//   length_out - receives the payload length in bytes (never negative)
//   packet     - received IPv4 packet
//
// Returns 0 on success. Returns -1 without writing the outputs if the packet
// is not TCP, is shorter than a TCP header, or carries a data offset that is
// smaller than the fixed header or larger than the packet.
I64 TcpParsePacket(CTcpHeader** header_out, U8** data_out, I64* length_out, CIPv4Packet* packet) {
  if (packet->proto != IP_PROTO_TCP)
    return -1;

  // FIXME: checksum

  // The fixed header must be present before any of its fields are read.
  if (packet->length < sizeof(CTcpHeader))
    return -1;

  CTcpHeader* hdr = packet->data;
  I64 header_length = (hdr->data_offset >> 4) * 4;

  // Reject offsets that would put the payload inside the fixed header or
  // past the end of the packet (which would yield a negative length).
  if (header_length < sizeof(CTcpHeader) || header_length > packet->length)
    return -1;

  //"TCP: in hdr %d, flags %02Xh, seq %d, ack %d, len %d, chksum %d\n",
  //    header_length, hdr->flags, ntohl(hdr->seq), ntohl(hdr->ack),
  //    packet->length - header_length, ntohs(hdr->checksum);

  *header_out = hdr;
  *data_out = packet->data + header_length;
  *length_out = packet->length - header_length;
  return 0;
}
289 
290 /*
291 class CTcpSendBufHeader {
292   CTcpSendBufHeader* next;
293 
294   F64 time_sent;
295   U32 length;
296   U32 retries;
297   U32 seq_start;
298   U32 seq_end;
299 };
300 */
301 
// Releases saved segments from the front of the retransmission chain that
// are covered by the acknowledgment number 'seg_ack', folding each one's
// round-trip time into the socket's smoothed RTT. Stops at the first
// segment that is not covered.
static U0 TcpSocketAckSendBufs(CTcpSocket* s, U32 seg_ack) {
  F64 now = tS;
  CTcpSendBufHeader* head = s->send_buf_first;

  while (head) {
    // Sequence numbers wrap modulo 2^32, so "less than" is meaningless on
    // its own; we can only test whether a number lies inside a range.
    // Measuring both seg_ack and snd_nxt relative to the segment's end
    // checks
    //   head->seq_end <= seg_ack <= s->snd_nxt
    // which holds for all meaningful ACKs.
    I64 ack_offset = (seg_ack - head->seq_end) & 0xffffffff;
    I64 nxt_offset = (s->snd_nxt - head->seq_end) & 0xffffffff;

    if (ack_offset > nxt_offset)
      break;

    // Update smoothed RTT
    F64 rtt = now - head->time_sent;
    s->srtt = (s->srtt * TCP_SRTT_ALPHA) + ((1.0 - TCP_SRTT_ALPHA) * rtt);
    //"ACK'd %d->%d (RTT %f ms)", sb->seq_start, sb->seq_end, rtt * 1000;

    // Unlink the acknowledged segment and release it
    s->send_buf_first = head->next;

    if (!s->send_buf_first)
      s->send_buf_last = NULL;

    Free(head);
    head = s->send_buf_first;
  }
}
334 
// Retransmits saved segments whose retransmission timeout has expired.
//
// The timeout is TCP_RTO_BETA * srtt clamped to [TCP_RTO_MIN, TCP_RTO_MAX].
// Starting at the head of the chain, every expired segment is sent again,
// gets a fresh time_sent, has its retry counter bumped and is moved to the
// tail of the chain. Processing stops at the first segment that has not
// expired yet, or when no packet can be allocated.
static U0 TcpSocketCheckSendBufs(CTcpSocket* s) {
  F64 time = tS;

  F64 rto = TCP_RTO_BETA * s->srtt;

  if (rto < TCP_RTO_MIN) rto = TCP_RTO_MIN;
  if (rto > TCP_RTO_MAX) rto = TCP_RTO_MAX;

  while (s->send_buf_first) {
    CTcpSendBufHeader* sb = s->send_buf_first;

    if (time > sb->time_sent + rto) {
      // Retransmit
      "Retransmit %d->%d (%f ms)!\n", sb->seq_start, sb->seq_end, (time - sb->time_sent) * 1000;
      U8* frame;
      I64 index = IPv4PacketAlloc(&frame, IP_PROTO_TCP, s->local_addr, s->remote_addr, sb->length);

      if (index < 0)
        return;   // retry later I guess

      MemCpy(frame, (sb(U8*)) + sizeof(CTcpSendBufHeader), sb->length);
      IPv4PacketFinish(index);

      // Refreshing time_sent (>= 'time') also guarantees that this entry is
      // not picked again during the current call.
      sb->time_sent = tS;
      sb->retries++;

      // Move to the end of the chain
      s->send_buf_first = sb->next;
      sb->next = NULL;

      if (s->send_buf_first)
        s->send_buf_last->next = sb;
      else
        s->send_buf_first = sb;

      s->send_buf_last = sb;
    }
    else
      break;
  }
}
375 
// Blocks (yielding) until a connection is available in the backlog of the
// listening socket 's', removes it from the backlog and waits up to
// TCP_CONNECT_TIMEOUT ms for its handshake to complete.
//
// Returns the new socket (as I64) once it is ESTABLISHED. Returns -1 if 's'
// is not listening, or if the dequeued connection did not become
// ESTABLISHED in time; in the latter case that connection is closed.
// 'addr' and 'addrlen' are currently ignored.
I64 TcpSocketAccept(CTcpSocket* s, sockaddr* addr, I64 addrlen) {
  no_warn addr;   // FIXME
  no_warn addrlen;

  if (s->state != TCP_STATE_LISTEN)
    return -1;

  // TODO: Thread safe?
  while (!s->backlog_first)
    Yield;

  // Pop the oldest pending connection; this frees up one backlog slot.
  CTcpSocket* pending = s->backlog_first;
  "Retr %p\n", pending;

  s->backlog_first = pending->backlog_next;
  if (!s->backlog_first)
    s->backlog_last = NULL;

  s->backlog_remaining++;

  // TODO: this should be done in a way that doesn't block on accept()
  I64 deadline = cnts.jiffies + TCP_CONNECT_TIMEOUT * JIFFY_FREQ / 1000;

  while (cnts.jiffies < deadline
      && pending->state != TCP_STATE_ESTABLISHED
      && pending->state != TCP_STATE_CLOSED)
    Yield;

  if (pending->state == TCP_STATE_ESTABLISHED)
    return pending;

  close(pending);
  return -1;
}
417 
// Binds the closed socket 's' to the local port given in 'addr' (a
// sockaddr_in; only sin_port is used) and registers it in the port table.
// Returns 0 on success, or -1 if 'addrlen' is too small, the socket is not
// in the CLOSED state, or the port already has a socket registered.
I64 TcpSocketBind(CTcpSocket* s, sockaddr* addr, I64 addrlen) {
  if (addrlen < sizeof(sockaddr_in) || s->state != TCP_STATE_CLOSED)
    return -1;

  sockaddr_in* requested = addr;
  U16 port = ntohs(requested->sin_port);

  // TODO: address & stuff
  if (tcp_bound_sockets[port])
    return -1;

  tcp_bound_sockets[port] = s;

  s->local_addr = IPv4GetAddress();
  s->local_port = port;

  return 0;
}
440 
// Tears down socket 's' and frees all memory it owns.
//
// If the connection is in a synchronized state an RST is sent to the peer.
// Every connection still waiting in the accept backlog is closed, all saved
// segments awaiting acknowledgment are freed, and the port table entry is
// cleared - but only if it actually refers to this socket, because accepted
// connections share their local port with the listening socket.
//
// Always returns 0. 's' must not be used afterwards.
I64 TcpSocketClose(CTcpSocket* s) {
  if (TcpIsSynchronizedState(s->state)) {
    TcpSend2(s, TCP_FLAG_RST);
  }

  // Free backlog
  CTcpSocket* backlog = s->backlog_first;
  CTcpSocket* backlog2;

  while (backlog) {
    backlog2 = backlog->backlog_next;
    close(backlog);
    backlog = backlog2;
  }

  // Free segments that were still waiting for an acknowledgment
  CTcpSendBufHeader* sb = s->send_buf_first;
  CTcpSendBufHeader* sb2;

  while (sb) {
    sb2 = sb->next;
    Free(sb);
    sb = sb2;
  }

  s->send_buf_first = NULL;
  s->send_buf_last = NULL;

  // Only unregister the port if this socket is the one registered for it
  if (s->local_port && tcp_bound_sockets[s->local_port] == s)
    tcp_bound_sockets[s->local_port] = NULL;

  Free(s->recv_buf);
  Free(s);
  return 0;
}
463 
// Actively opens a connection from the closed socket 's' to the peer
// described by 'addr' (a sockaddr_in).
//
// A local port in 0x8000..0xffff is derived from tcp_next_source_port and
// registered in the port table, the sequence state is reset, a SYN is sent
// and the caller then yields until the socket becomes ESTABLISHED or CLOSED,
// or until TCP_CONNECT_TIMEOUT ms have passed.
//
// Returns 0 once ESTABLISHED; -1 if 'addrlen' is too small, the socket is
// not CLOSED, the chosen local port is taken, or the handshake did not
// complete.
I64 TcpSocketConnect(CTcpSocket* s, sockaddr* addr, I64 addrlen) {
  if (addrlen < sizeof(sockaddr_in) || s->state != TCP_STATE_CLOSED)
    return -1;

  sockaddr_in* peer = addr;

  U16 port = 0x8000 + (tcp_next_source_port & 0x7fff);
  tcp_next_source_port++;

  // TODO: address & stuff
  if (tcp_bound_sockets[port])
    return -1;

  tcp_bound_sockets[port] = s;

  // Endpoints
  s->local_addr = IPv4GetAddress();
  s->local_port = port;
  s->remote_addr = ntohl(peer->sin_addr.s_addr);
  s->remote_port = ntohs(peer->sin_port);

  // Send side
  s->snd_una = 0;
  s->snd_nxt = 0;
  s->snd_wnd = 0;
  s->mss = TCP_DEFAULT_MSS;

  // Receive side
  s->rcv_nxt = 0;
  s->rcv_wnd = TCP_WINDOW_SIZE;

  s->conntime = tS;

  TcpSend2(s, TCP_FLAG_SYN);
  s->state = TCP_STATE_SYN_SENT;

  // TODO: TcpSetTimeout
  I64 deadline = cnts.jiffies + TCP_CONNECT_TIMEOUT * JIFFY_FREQ / 1000;

  while (cnts.jiffies < deadline
      && s->state != TCP_STATE_ESTABLISHED
      && s->state != TCP_STATE_CLOSED)
    Yield;

  if (s->state == TCP_STATE_ESTABLISHED)
    return 0;

  return -1;
}
515 
// Switches the closed socket 's' into the LISTEN state with room for
// 'backlog' pending connections. From then on, incoming SYN segments are
// handled by TcpSocketHandle, which creates the new connection and queues it
// in this socket's accept backlog.
// Returns 0 on success, -1 if the socket is not in the CLOSED state.
I64 TcpSocketListen(CTcpSocket* s, I64 backlog) {
  if (s->state == TCP_STATE_CLOSED) {
    s->state = TCP_STATE_LISTEN;
    s->backlog_remaining = backlog;
    return 0;
  }

  return -1;
}
527 
// Copies up to 'len' received bytes from the socket's ring buffer to 'buf'.
//
// While the connection is ESTABLISHED and the buffer is empty the caller
// yields, also giving pending retransmissions a chance to run. Returns the
// number of bytes copied; 0 if 'len' is 0 or if the buffer is empty and the
// connection is no longer ESTABLISHED. 'flags', 'src_addr' and 'addrlen' are
// ignored.
I64 TcpSocketRecvfrom(CTcpSocket* s, U8* buf, I64 len, I64 flags, sockaddr* src_addr, I64 addrlen) {
  no_warn flags;
  no_warn src_addr;   // FIXME
  no_warn addrlen;
  //"TcpSocketRecvfrom\n";

  while (s->state == TCP_STATE_ESTABLISHED && s->recv_buf_read_pos == s->recv_buf_write_pos) {
    TcpSocketCheckSendBufs(s);
    Yield;
  }

  if (len == 0)
    return 0;

  // TODO: this works for now, but we should be still able to receive data
  //       in connection-closing states
  if (s->state != TCP_STATE_ESTABLISHED && s->recv_buf_read_pos == s->recv_buf_write_pos)
    return 0;

  I64 head = s->recv_buf_read_pos;
  I64 tail = s->recv_buf_write_pos;
  I64 copied = 0;
  I64 chunk;

  if (tail < head) {
    // The data wraps around: first drain the part up to the end of the buffer
    chunk = s->recv_buf_size - head;

    if (chunk > len)
      chunk = len;

    //"Read %d from %d..end\n", step, read_pos;
    MemCpy(buf, s->recv_buf + head, chunk);
    buf += chunk;
    len -= chunk;
    copied += chunk;
    head = (head + chunk) & (s->recv_buf_size - 1);

    // from here on either len == 0 or head has wrapped to 0
  }

  if (len) {
    // Contiguous region between the read and the write position
    chunk = tail - head;

    if (chunk > len)
      chunk = len;

    //"Read %d from start+%d..\n", step, read_pos;
    MemCpy(buf, s->recv_buf + head, chunk);
    head += chunk;
    copied += chunk;
  }

  s->recv_buf_read_pos = head;
  return copied;
}
585 
// Sends up to 'len' bytes from 'buf' on the established connection 's'.
//
// Data is split into segments no larger than the socket's mss and no larger
// than what the peer's send window currently allows. While the window is
// closed and nothing has been sent yet, the caller yields (running pending
// retransmissions); once at least one byte has been sent, a closed window
// ends the call with a partial count instead.
//
// Returns the number of bytes handed to the network, which may be less than
// 'len'; 0 if the connection is not ESTABLISHED. 'flags', 'dest_addr' and
// 'addrlen' are ignored.
I64 TcpSocketSendto(CTcpSocket* s, U8* buf, I64 len, I64 flags, sockaddr_in* dest_addr, I64 addrlen) {
  no_warn dest_addr;
  no_warn addrlen;
  no_warn flags;

  I64 sent_total = 0;

  while (s->state == TCP_STATE_ESTABLISHED && len) {
    // Room left in the send window, computed modulo 2^32
    I64 can_send = (s->snd_una + s->snd_wnd - s->snd_nxt) & 0xffffffff;

    // TODO: Keep trying
    // Must be tied to a timeout; see RFC793/Managing-the-Window
    //if (s->snd_wnd == 0)
    //  can_send = 1;

    if (can_send == 0) {
      if (sent_total > 0)
        break;
      else {
        TcpSocketCheckSendBufs(s);
        Yield;
      }
    }
    else {
      if (can_send > len)
        can_send = len;

      if (can_send > s->mss)
        can_send = s->mss;

      TcpSendData2(s, TCP_FLAG_ACK, buf, can_send);
      buf += can_send;
      len -= can_send;
      sent_total += can_send;   // account for what was just sent
    }
  }

  return sent_total;
}
624 
// Socket option hook. No options are implemented for TCP sockets at the
// moment, so every request is rejected with -1 and all arguments are unused.
I64 TcpSocketSetsockopt(CTcpSocket* s, I64 level, I64 optname, U8* optval, I64 optlen) {
  no_warn s;
  no_warn level;
  no_warn optname;
  no_warn optval;
  no_warn optlen;

  // Disabled receive-timeout support, kept for reference:
  /*if (level == SOL_SOCKET && optname == SO_RCVTIMEO_MS && optlen == 8) {
    s->rcvtimeo_ms = *(optval(I64*));
    return 0;
  }*/

  return -1;
}
639 
// Creates a new TCP socket in the CLOSED state.
//
// Only (AF_INET, SOCK_STREAM) is supported; any other combination yields
// NULL. The returned socket has its CSocket callbacks pointing at the
// TcpSocket* functions of this file, an empty receive ring buffer of
// TCP_WINDOW_SIZE bytes, and empty backlog and retransmission chains. All
// remaining fields (addresses, ports, sequence numbers, timing) start at
// zero. Release the socket with its close callback.
CTcpSocket* TcpSocket(U16 domain, U16 type) {
  if (domain != AF_INET || type != SOCK_STREAM)
    return NULL;

  CTcpSocket* s =       MAlloc(sizeof(CTcpSocket));

  // MAlloc does not clear memory. Zero the whole structure so that fields
  // which are read before being assigned elsewhere (e.g. local_port in
  // TcpSocketClose on a socket that was never bound) have a defined value.
  MemSet(s, 0, sizeof(CTcpSocket));

  s->sock.accept =      &TcpSocketAccept;
  s->sock.bind =        &TcpSocketBind;
  s->sock.close =       &TcpSocketClose;
  s->sock.connect =     &TcpSocketConnect;
  s->sock.listen =      &TcpSocketListen;
  s->sock.recvfrom =    &TcpSocketRecvfrom;
  s->sock.sendto =      &TcpSocketSendto;
  s->sock.setsockopt =  &TcpSocketSetsockopt;

  s->state = TCP_STATE_CLOSED;

  s->send_buf_first = NULL;
  s->send_buf_last = NULL;

  // The ring buffer index math masks with (size - 1): size must be 2^n
  s->recv_buf_size = TCP_WINDOW_SIZE;
  s->recv_buf = MAlloc(s->recv_buf_size);
  s->recv_buf_read_pos = 0;
  s->recv_buf_write_pos = 0;

  s->backlog_next = NULL;
  s->backlog_first = NULL;
  s->backlog_last = NULL;
  s->backlog_remaining = 0;

  return s;
}
678 
// Processes one received TCP segment for socket 's'.
//
//   s      - socket registered for the segment's destination port
//   packet - the IPv4 packet the segment arrived in
//   hdr    - TCP header inside the packet (fields in network byte order)
//   data   - first payload byte
//   length - payload length in bytes
//
// LISTEN: a SYN creates a new socket in SYN_RECEIVED, answers with SYN|ACK
// and queues the socket in the accept backlog (if there is room); anything
// else is answered with a reset.
// CLOSED: the segment is ignored.
// Otherwise, in this order: SYN is processed, the sequence number is
// validated against the receive window, ACK is processed (releasing
// acknowledged send buffers and completing the handshake), RST is processed,
// and finally - for ESTABLISHED sockets and acceptable sequence numbers -
// the payload is stored in the receive ring buffer and FIN is handled.
// An ACK is sent at the end whenever one of the steps requested it.
U0 TcpSocketHandle(CTcpSocket* s, CIPv4Packet* packet, CTcpHeader* hdr, U8* data, I64 length) {
  // Segment length in sequence space: payload plus one each for SYN and FIN
  U32 seg_len = length;

  if (hdr->flags & TCP_FLAG_FIN) seg_len++;
  if (hdr->flags & TCP_FLAG_SYN) seg_len++;

  U32 seg_seq = ntohl(hdr->seq);

  if (s->state == TCP_STATE_LISTEN) {
    // A new connection is being opened.

    if ((hdr->flags & TCP_FLAG_SYN) && s->backlog_remaining > 0) {
      //"SYN in from %08X:%d => %08X:%d.\n", packet->source_ip, ntohs(hdr->source_port),
      //    packet->dest_ip, ntohs(hdr->dest_port);
      CTcpSocket* new_socket = TcpSocket(AF_INET, SOCK_STREAM);

      new_socket->local_addr = IPv4GetAddress();
      new_socket->local_port = s->local_port;
      new_socket->remote_addr = packet->source_ip;
      new_socket->remote_port = ntohs(hdr->source_port);

      new_socket->snd_una = 0;
      new_socket->snd_nxt = 0;
      new_socket->snd_wnd = 0;
      new_socket->mss = TCP_DEFAULT_MSS;

      // The peer's SYN occupies one sequence number
      new_socket->rcv_nxt = ++seg_seq;
      new_socket->rcv_wnd= TCP_WINDOW_SIZE;

      new_socket->conntime = tS;

      TcpSend2(new_socket, TCP_FLAG_SYN | TCP_FLAG_ACK);
      new_socket->state = TCP_STATE_SYN_RECEIVED;

      // FIXME FIXME FIXME FIXME
      // (this replaces the listening socket in the port table)
      tcp_bound_sockets[new_socket->local_port] = new_socket;

      // Append to the listener's accept backlog
      if (s->backlog_last)
        s->backlog_last->backlog_next = new_socket;
      else
        s->backlog_first = new_socket;

      s->backlog_last = new_socket;
      s->backlog_remaining--;
    }
    else {
      //"REJ %08X:%d (as %08X:%d)\n", packet->source_ip, ntohs(hdr->source_port),
      //    packet->dest_ip, ntohs(hdr->dest_port);
      TcpSend(packet->dest_ip, ntohs(hdr->dest_port), packet->source_ip, ntohs(hdr->source_port),
          seg_seq + 1, seg_seq + 1, TCP_FLAG_ACK | TCP_FLAG_RST);
    }

    return;
  }

  if (s->state == TCP_STATE_CLOSED)
    return;

  Bool must_ack = FALSE;

  // Process SYN
  if (hdr->flags & TCP_FLAG_SYN) {
    s->rcv_nxt = ++seg_seq;
    //"Reset ACK to %d\n", s->ack;

    must_ack = TRUE;
  }

  // Validate SEQ
  Bool valid_seq;

  if (seg_len == 0 && s->rcv_wnd == 0) {
    valid_seq = (seg_seq == s->rcv_nxt);
  }
  else {
    // At least one of these must be true:
    //   RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
    //   RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
    // Both are evaluated relative to rcv_nxt, modulo 2^32.
    I64 rel_seq = ((seg_seq - s->rcv_nxt) & 0xffffffff);
    I64 rel_seq_end = ((seg_seq + seg_len - 1 - s->rcv_nxt) & 0xffffffff);

    if (rel_seq < s->rcv_wnd || rel_seq_end < s->rcv_wnd)
      valid_seq = TRUE;
    else
      valid_seq = FALSE;
  }

  if (!valid_seq)
    "SEQ error: seg_seq %d, seg_len %d, rcv_nxt %d, rcv_wnd %d\n", seg_seq, seg_len, s->rcv_nxt, s->rcv_wnd;

  // Process ACK
  if (hdr->flags & TCP_FLAG_ACK) {
    U32 seg_ack = ntohl(hdr->ack);
    // ACK is acceptable iff SND.UNA < SEG.ACK =< SND.NXT

    I64 rel_ack = ((seg_ack - s->snd_una) & 0xffffffff);
    I64 rel_nxt = ((s->snd_nxt - s->snd_una) & 0xffffffff);

    // RFC 793 is poorly worded in this regard, unacceptable ACK
    // is not the opposite of an acceptible (= new) ACK!
    // TODO: Instead of zero, we should compare rel_ack to some NEGATIVE_CONSTANT,
    // so that we don't unnecessarily try to correct every slightly delayed ACK
    if (/*0 < rel_ack &&*/ rel_ack <= rel_nxt) {
      TcpSocketAckSendBufs(s, seg_ack);

      // Accept ACK
      s->snd_una = seg_ack;

      if (s->state == TCP_STATE_SYN_SENT && (hdr->flags & TCP_FLAG_SYN)) {
        s->state = TCP_STATE_ESTABLISHED;
        s->srtt = tS - s->conntime;
        //"Initial RTT: %f ms", s->srtt * 1000;
      }
      else if (s->state == TCP_STATE_SYN_RECEIVED) {
        //"Connection established.\n";
        s->state = TCP_STATE_ESTABLISHED;
        s->srtt = tS - s->conntime;
        //"Initial RTT: %f ms", s->srtt * 1000;
      }
    }
    else {
      // Unacceptable ACK
      "Bad ACK; state %d, seg_ack %d, snd_nxt %d\n", s->state, seg_ack, s->snd_nxt;

      if (s->state == TCP_STATE_LISTEN || s->state == TCP_STATE_SYN_SENT
          || s->state == TCP_STATE_SYN_RECEIVED) {
        // Reset
        TcpSend(packet->dest_ip, ntohs(hdr->dest_port), packet->source_ip, ntohs(hdr->source_port),
            seg_ack, seg_seq + seg_len, TCP_FLAG_ACK | TCP_FLAG_RST);
      }
      else if (TcpIsSynchronizedState(s->state)) {
        // Send a 'corrective' ACK
        must_ack = TRUE;
      }
    }
  }

  // Process RST
  if (hdr->flags & TCP_FLAG_RST) {
    if ((s->state == TCP_STATE_SYN_SENT)) {
      // If acknowledged
      if (s->snd_una == s->snd_nxt) {
        "Connection refused\n";
        s->state = TCP_STATE_CLOSED;
        return;
      }
    }
    else {
      if (valid_seq) {
        "Connection reset by peer\n";
        s->state = TCP_STATE_CLOSED;
        return;
      }
    }

    "Spurious RST\n";
  }

  // FIXME check remote addr & port

  // Process data
  if (valid_seq) {
    // The header field is in network byte order like every other field
    s->snd_wnd = ntohs(hdr->window_size);

    if (s->state == TCP_STATE_ESTABLISHED) {
      I64 write_pos = s->recv_buf_write_pos;
      //"%d in @ %d", length, write_pos;

      // Skip retransmitted bytes
      while (length && seg_seq != s->rcv_nxt) {
        seg_seq = (seg_seq + 1) & 0xffffffff;
        data++;
        length--;
      }

      // Copy byte by byte into the ring buffer, stopping when it is full
      // (one slot always stays free so that full and empty are distinct)
      I64 i = 0;
      for (i = 0; i < length; i++) {
        I64 next_pos = (write_pos + 1) & (s->recv_buf_size - 1);

        if (next_pos == s->recv_buf_read_pos)
          break;

        s->recv_buf[write_pos] = data[i];
        write_pos = next_pos;
      }

      s->recv_buf_write_pos = write_pos;
      s->rcv_nxt += i;
      //"; %d saved\n", i;

      if (i > 0)
        must_ack = TRUE;

      if (hdr->flags & TCP_FLAG_FIN) {
        s->rcv_nxt++;
        s->state = TCP_STATE_CLOSE_WAIT;
        must_ack = TRUE;
      }
    }
  }

  if (must_ack) {
    TcpSend2(s, TCP_FLAG_ACK);
  }
}
885 
// Layer-4 entry point for received IPv4 packets carrying TCP. Parses the
// segment and passes it to the socket registered for its destination port,
// if there is one. Returns the result of TcpParsePacket (negative when the
// packet was rejected).
I64 TcpHandler(CIPv4Packet* packet) {
  CTcpHeader* hdr;
  U8* data;
  I64 length;

  I64 error = TcpParsePacket(&hdr, &data, &length, packet);

  if (error >= 0) {
    //"%u => %p\n", dest_port, tcp_bound_sockets[dest_port];
    U16 port = ntohs(hdr->dest_port);
    CTcpSocket* target = tcp_bound_sockets[port];

    // FIXME: should also check that bound address is INADDR_ANY,
    //        OR packet dest IP matches bound address
    if (target)
      TcpSocketHandle(target, packet, hdr, data, length);

    // TODO: when no socket is registered,
    //       send RST as per RFC793/Reset-Generation
  }

  return error;
}
912 
// Allocates the port-to-socket lookup table (one pointer for each of the
// 65536 ports) and clears it so that no port is registered initially.
U0 TcpInit() {
  I64 table_bytes = 65536 * sizeof(CTcpSocket*);

  tcp_bound_sockets = MAlloc(table_bytes);
  MemSet(tcp_bound_sockets, 0, table_bytes);
}
917 
// Executed when this file is loaded: set up the port table, then register
// the packet handler for IP_PROTO_TCP and the socket constructor for
// (AF_INET, SOCK_STREAM).
TcpInit;
RegisterL4Protocol(IP_PROTO_TCP, &TcpHandler);
RegisterSocketClass(AF_INET, SOCK_STREAM, &TcpSocket);