001 #ifdef NetSocketNew
//Reads one line from sock.
//
//Bytes are pulled one at a time with NetRead until a '\r' is seen; one more
//byte is then read (meant to swallow the following '\n') and the line is
//returned without its terminator.
//
//Returns a heap string the caller must Free, or NULL when
//  * nothing was received within one second of the call, or
//  * NetPollForHangup reports something other than -1 for the socket.
//If NetRead delivers nothing while some bytes are already buffered, or the
//one second window closes with bytes buffered, the partial line is returned.
U8 *NetReadLn(I64 sock) {
  I64 len=0,ch=0,got;
  U8 *line=StrNew(""),*bigger;
  F64 start=tS;
  while(start+1.>tS) {
    got=NetRead(sock,&ch,1);
    if(got==1) {
      if(ch=='\r') {
        NetRead(sock,&ch,1); //Skip '\n'?
        return line;
      }
      //Make room for the new byte and the terminator
      while(MSize(line)<=len+1) {
        bigger=MAlloc(MSize(line)*3);
        MemCpy(bigger,line,len+1);
        Free(line);
        line=bigger;
      }
      line[len++]=ch;
      line[len]=0;
    } else if(len)
      return line;
    Yield;
    if(-1!=NetPollForHangup(1,&sock)) {
      Free(line);
      return NULL;
    }
  }
  //Timed out
  if(len)
    return line;
  Free(line);
  return NULL;
}
//Formats fmt with the variadic arguments and writes the result to sock,
//followed by "\r\n". An empty formatted string sends only the "\r\n".
U0 NetPrintLn(I64 sock,U8 *fmt,...) {
  U8 *line=StrPrintJoin(NULL,fmt,argc,argv);
  I64 n=StrLen(line);
  if(n)
    NetWrite(sock,line,n);
  NetWrite(sock,"\r\n",2);
  Free(line);
}
//Fetches url over plain HTTP on port 80 and returns the response body.
//
//url may carry a "scheme://" prefix, which is dropped. Everything up to the
//first '/' is the host, the rest is the path ("/" when absent). When the
//path has a '?', the text after it is split off and sent as the body of a
//form-encoded POST; otherwise a GET is issued.
//
//Response lines are read with NetReadLn. Lines up to and including the first
//empty one (the headers) are dropped, the remaining lines are concatenated
//WITHOUT their line terminators.
//
//Returns a heap string the caller must Free ("" when there is no body).
U8 *FetchURL(U8 *url) {
  Bool body=FALSE;
  U8 *ret,*location,*path,*post,*tmp,*tmp2;
  I64 cnt=0,len;
  if(StrMatch("//",url)) {
    location=StrNew(url=StrMatch("//",url)+2);
  } else {
    location=StrNew(url);
  }
  if(StrOcc(location,'/'))
    *StrFirstOcc(location,"/")=0;
  if(StrOcc(url,'/')) {
    url=StrFirstOcc(url,"/");
    path=StrNew(url);
  } else
    path=StrNew("/");
  if(StrOcc(path,'?')) {
    post=StrFirstOcc(path,"?");
    *post=0;
    post=StrNew(post+1);
  } else
    post=NULL;
  I64 sock=NetSocketNew;
  CNetAddr *addr=NetAddrNew(location,80);
  NetConnect(sock,addr);
  if(post)
    NetPrintLn(sock,"POST %s HTTP/1.1",path);
  else
    NetPrintLn(sock,"GET %s HTTP/1.1",path);
  NetPrintLn(sock,"Host: %s",location);
  NetPrintLn(sock,"Accept: */*");
  if(post) {
    NetPrintLn(sock,"Content-Length: %d",StrLen(post));
    NetPrintLn(sock,"Content-Type: application/x-www-form-urlencoded");
  }
  NetPrintLn(sock,"");
  //The advertised Content-Length bytes have to follow the blank line
  if(post&&StrLen(post))
    NetWrite(sock,post,StrLen(post));
  ret=StrNew("");
  while(TRUE) {
    tmp=NetReadLn(sock);
    if(!tmp) break;
    if(body) {
      len=StrLen(tmp);
      //Need room for what is already stored, the new line and the terminator
      if(MSize(ret)<cnt+len+1) {
        tmp2=ret;
        ret=MAlloc((cnt+len+1)<<1);
        MemCpy(ret,tmp2,cnt+1);
        Free(tmp2);
      }
      StrCpy(ret+cnt,tmp);
      cnt+=len;
    } else if(!StrLen(tmp))
      body=TRUE; //Blank line ends the headers
    Free(tmp);
  }
  NetAddrDel(addr);
  NetClose(sock);
  Free(location),Free(path),Free(post);
  return ret;
}
107 #endif
//Decodes the UTF-8 sequence starting at st.
//
//st  Points at the first byte of the sequence.
//en  Optional; receives a pointer just past the bytes that were consumed.
//
//Returns the decoded code point. A byte that cannot start a sequence, or a
//lead byte whose continuation bytes are missing/invalid, is returned as-is
//and only that single byte is consumed. This keeps a truncated sequence from
//stepping over the terminating NUL.
I64 ReadUTF8(U8 *st,U8 **en=NULL) {
  I64 ch=*st,extra,i;
  if((ch&0b10000000)==0)
    extra=0; //ASCII
  else if((ch&0b11100000)==0b11000000) {
    extra=1;
    ch&=0b11111;
  } else if((ch&0b11110000)==0b11100000) {
    extra=2;
    ch&=0b1111;
  } else if((ch&0b11111000)==0b11110000) {
    extra=3;
    ch&=0b111; //Top bits come from the LEAD byte
  } else {
    //Stray continuation byte or invalid lead byte
    if(en) *en=st+1;
    return ch;
  }
  for(i=1;i<=extra;i++) {
    if((st[i]&0b11000000)!=0b10000000) {
      if(en) *en=st+1;
      return *st;
    }
    ch=(ch<<6)|(st[i]&0b111111);
  }
  if(en) *en=st+1+extra;
  return ch;
}
//Values stored in CHTMLNode.type
#define H_NODE_TEXT 1
#define H_NODE_ELEMENT 2
#define H_NODE_ATTRIBUTE 3

//Common header of every node: queue links plus one of the H_NODE_* tags.
class CHTMLNode:CQue
{
  I64 type;
};

//name=value pair found inside a tag (H_NODE_ATTRIBUTE).
class CHTMLAttr:CHTMLNode
{
  U8 *name;
  U8 *value;
};

//Run of text (H_NODE_TEXT).
class CHTMLText:CHTMLNode
{
  U8 *str;
};

//Tag (H_NODE_ELEMENT). type2 is the tag name; attributes and children are
//queue heads for the nodes hanging off this element.
class CHTMLElem:CHTMLNode
{
  U8 *type2;
  CQue attributes;
  CQue children;
};

//Overlay of all node kinds so one pointer type can address any of them.
union CHTML
{
  CHTMLNode node;
  CHTMLAttr attr;
  CHTMLElem elem;
  CHTMLText text;
};
//Prints a debug dump of the node n and, for elements, of everything below it.
//
//Text nodes print as STR:(...), attributes as name=value (just the name when
//no value was parsed), elements as ELEM:<tag> followed by their children and
//then their attributes, indented by two columns.
U0 DumpHTML(CHTML *n) {
  CHTML *head,*cur;
  switch(n->elem.type) {
    case H_NODE_TEXT:
      "STR:(%Q)\n",n->text.str;
      break;
    case H_NODE_ATTRIBUTE:
      //One specifier per argument so the value is shown too
      if(n->attr.value) {
        "%Q=%Q\n",n->attr.name,n->attr.value;
      } else {
        "%Q\n",n->attr.name;
      }
      break;
    case H_NODE_ELEMENT:
      head=&n->elem.children;
      "ELEM:%s\n",n->elem.type2;
      "$ID,2$";
      for(cur=head->elem.next;cur!=head;cur=cur->elem.next)
        DumpHTML(cur);
      head=&n->elem.attributes;
      for(cur=head->elem.next;cur!=head;cur=cur->elem.next)
        DumpHTML(cur);
      "$ID,-2$";
      break;
  }
}
//Unlinks n from whatever queue it is in and frees it together with the
//strings it owns. For elements, all children and attributes are deleted
//first.
U0 HTMLNodeDel(CHTML *n) {
  CHTML *head,*cur,*next;
  switch(n->elem.type) {
    case H_NODE_TEXT:
      Free(n->text.str);
      break;
    case H_NODE_ATTRIBUTE:
      Free(n->attr.name);
      Free(n->attr.value);
      break;
    case H_NODE_ELEMENT:
      Free(n->elem.type2);
      //The recursive call frees cur, so grab the successor beforehand
      head=&n->elem.children;
      for(cur=head->elem.next;cur!=head;cur=next) {
        next=cur->elem.next;
        HTMLNodeDel(cur);
      }
      head=&n->elem.attributes;
      for(cur=head->elem.next;cur!=head;cur=next) {
        next=cur->elem.next;
        HTMLNodeDel(cur);
      }
      break;
  }
  QueRem(n);
  Free(n);
}
193 
194 
//Skips one HTML comment, honouring nested "<!--" ... "-->" pairs.
//
//st  Text to look at.
//
//Returns st unchanged when it does not start with "<!--". Otherwise returns
//a pointer just past the "-->" that closes the outermost comment. Scanning
//stops right there, so text between two consecutive comments is preserved.
//If the comment is never closed, a pointer to the terminating NUL is
//returned.
U8 *SkipComment(U8 *st) {
  U8 *open,*close;
  I64 depth=1;
  if(StrNCmp("<!--",st,4)) return st;
  st+=4;
  while(depth) {
    close=StrMatch("-->",st);
    if(!close)
      return st+StrLen(st); //Unterminated: the comment runs to the end
    open=StrMatch("<!--",st);
    if(open&&open<close) {
      depth++;
      st=open+4;
    } else {
      depth--;
      st=close+3;
    }
  }
  return st;
}
225 
//Advances past leading whitespace (per char_bmp_white_space). If a comment
//starts right after it, that comment is skipped as well. Whitespace that
//follows the comment is left in place.
U8 *SkipWhitespace(U8 *s) {
  U8 *cur=s;
  while(TRUE) {
    if(!*cur) break;
    if(!Bt(char_bmp_white_space,*cur)) break;
    cur++;
  }
  if(StrNCmp("<!--",cur,4))
    return cur;
  return SkipComment(cur);
}
233 
//Returns a fresh heap copy of the len bytes at st, NUL terminated.
U8 *_HTMLSubStr(U8 *st,I64 len) {
  U8 *res=MAlloc(len+1);
  MemCpy(res,st,len);
  res[len]=0;
  return res;
}

//Returns a pointer just past the run of alphanumeric characters at st.
U8 *_HTMLSkipAlnum(U8 *st) {
  while(Bt(char_bmp_alpha_numeric,*st))
    st++;
  return st;
}

//TRUE when name is a tag that never has content nor an end tag.
Bool _HTMLIsVoidTag(U8 *name) {
  return !StrICmp(name,"area")||
        !StrICmp(name,"!DOCTYPE")||
        !StrICmp(name,"base")||
        !StrICmp(name,"br")||
        !StrICmp(name,"col")||
        !StrICmp(name,"embed")||
        !StrICmp(name,"hr")||
        !StrICmp(name,"img")||
        !StrICmp(name,"input")||
        !StrICmp(name,"link")||
        !StrICmp(name,"meta")||
        !StrICmp(name,"source")||
        !StrICmp(name,"track")||
        !StrICmp(name,"wbr");
}

//Maps the entity name spanning len bytes at name to its character.
//Unknown entities become a space.
I64 _HTMLEntity(U8 *name,I64 len) {
  if(len==2&&!StrNCmp(name,"lt",2)) return '<';
  if(len==2&&!StrNCmp(name,"gt",2)) return '>';
  if(len==3&&!StrNCmp(name,"amp",3)) return '&';
  if(len==4&&!StrNCmp(name,"quot",4)) return '"';
  if(len==4&&!StrNCmp(name,"apos",4)) return '\'';
  return ' ';
}

//Folds Latin Extended-A letters (U+0100..U+017F) to a plain lower case
//ASCII letter; everything else is returned unchanged.
//https://en.wiktionary.org/wiki/Appendix:Unicode/Latin_Extended-A
I64 _HTMLFoldLatinExtA(I64 ch) {
  switch(ch) {
    case 0x100...0x105: return 'a';
    case 0x106...0x10d: return 'c';
    case 0x10e...0x111: return 'd';
    case 0x112...0x11b: return 'e';
    case 0x11c...0x123: return 'g';
    case 0x124...0x127: return 'h';
    case 0x128...0x133: return 'i'; //Includes the IJ ligature
    case 0x134...0x135: return 'j';
    case 0x136...0x138: return 'k';
    case 0x139...0x142: return 'l';
    case 0x143...0x14b: return 'n';
    case 0x14c...0x153: return 'o'; //Includes the OE ligature
    case 0x154...0x159: return 'r';
    case 0x15a...0x161: return 's';
    case 0x162...0x167: return 't';
    case 0x168...0x173: return 'u';
    case 0x174...0x175: return 'w';
    case 0x176...0x178: return 'y';
    case 0x179...0x17e: return 'z';
    case 0x17f: return 's'; //Long s
  }
  return ch;
}

//Parses one node starting at st.
//
//st           Text to parse. Leading comments are skipped.
//en           Optional; receives the position right after what was parsed.
//allow_types  Bit mask of (1<<H_NODE_*) values naming the kinds of node the
//             caller accepts. They are tried in the order element, text,
//             attribute.
//
//Returns a self-linked (QueInit'ed) heap node to be released with
//HTMLNodeDel, or NULL when nothing of an accepted kind is found at st
//(empty text, no attribute name).
//
//Malformed input prints a message and throws 'HTML'. Nodes built up to that
//point are not released.
CHTMLNode *ParseHTMLNode(U8 *st,U8 **en=NULL,I64 allow_types=1<<H_NODE_ELEMENT|1<<H_NODE_TEXT) {
  CHTML *e,*e2;
  U8 *tmp,*tmp2;
  I64 ch,len;
  //Comments never produce a node
  while(!StrNCmp(st,"<!--",4))
    st=SkipComment(st);
  if(*st=='<'&&(allow_types&(1<<H_NODE_ELEMENT))) {
    e=CAlloc(sizeof(CHTMLElem));
    e->elem.type=H_NODE_ELEMENT;
    QueInit(e);
    QueInit(&e->elem.attributes);
    QueInit(&e->elem.children);
    st++;
    //Tag name, copied straight from the input so its length is unbounded
    tmp=st;
    while(Bt(char_bmp_alpha_numeric,*st)||*st=='!') //Account for DOCTYPE html
      st++;
    e->elem.type2=_HTMLSubStr(tmp,st-tmp);
    st=SkipWhitespace(st);
    while(*st!='>') {
      e2=ParseHTMLNode(st,&st,1<<H_NODE_ATTRIBUTE);
      if(!e2) {
        "Expected an attribute\n";
        throw('HTML');
      }
      QueIns(e2,e->elem.attributes.last);
      st=SkipWhitespace(st);
    }
    st++;
    if(!_HTMLIsVoidTag(e->elem.type2)) {
      while(TRUE) {
        //A comment right before the end tag must not be taken for a child
        while(!StrNCmp(st,"<!--",4))
          st=SkipComment(st);
        if(st(U16*)[0]=='</')
          break;
        e2=ParseHTMLNode(st,&st,1<<H_NODE_ELEMENT|1<<H_NODE_TEXT);
        if(!e2) {
          throw('HTML');
        }
        QueIns(e2,e->elem.children.last);
      }
      st+=2;
      tmp=st;
      st=_HTMLSkipAlnum(st);
      tmp2=_HTMLSubStr(tmp,st-tmp);
      if(StrICmp(tmp2,e->elem.type2)) {
        "Got unexpected end tag(expected %s,got %s)\n",e->elem.type2,tmp2;
        Free(tmp2);
        throw('HTML');
      }
      Free(tmp2);
      st=SkipWhitespace(st);
      if(*st!='>') {
        "Expected a '>'\n";
        throw('HTML');
      }
      st++;
    }
    if(en) *en=st;
    return e;
  }
  if(allow_types&(1<<H_NODE_TEXT)) {
    len=0;
    tmp=StrNew("");
    while(*st&&*st!='<') {
      if(*st=='&') {
        st++;
        tmp2=st;
        st=_HTMLSkipAlnum(st);
        if(*st!=';') {
          "Expected a ';'\n";
          Free(tmp);
          throw('HTML');
        }
        ch=_HTMLEntity(tmp2,st-tmp2);
        st++;
      } else
        ch=ReadUTF8(st,&st);
      ch=_HTMLFoldLatinExtA(ch);
      if(MSize(tmp)<=len+1) {
        tmp2=MAlloc(MSize(tmp)*2);
        MemCpy(tmp2,tmp,len);
        Free(tmp);
        tmp=tmp2;
      }
      tmp[len++]=ch;
    }
    tmp[len]=0;
    if(en) *en=st;
    if(!len) {
      Free(tmp);
      return NULL;
    }
    e=CAlloc(sizeof(CHTMLText));
    e->text.str=tmp;
    e->elem.type=H_NODE_TEXT;
    QueInit(e);
    return e;
  }
//Last one to try
  if(allow_types&(1<<H_NODE_ATTRIBUTE)) {
    st=SkipWhitespace(st);
    tmp=st;
    st=_HTMLSkipAlnum(st);
    if(st==tmp) {
      //No name here, so there is no node to initialise
      if(en) *en=st;
      return NULL;
    }
    e=CAlloc(sizeof(CHTMLAttr));
    e->elem.type=H_NODE_ATTRIBUTE;
    QueInit(e);
    e->attr.name=_HTMLSubStr(tmp,st-tmp);
    st=SkipWhitespace(st);
    if(*st=='=') {
      st=SkipWhitespace(st+1);
      if(Bt(char_bmp_alpha_numeric,*st)) {
        //Unquoted value
        tmp=st;
        st=_HTMLSkipAlnum(st);
        e->attr.value=_HTMLSubStr(tmp,st-tmp);
      } else if(*st=='"'||*st=='\'') {
        ch=*st++;
        tmp=st;
        while(*st!=ch) {
          if(!*st) {
            "Expected a '%c'\n",ch;
            throw('HTML');
          }
          st++;
        }
        e->attr.value=_HTMLSubStr(tmp,st-tmp);
        st++; //Go past the closing quote
      }
    }
    if(en) *en=st;
    return e;
  }
  return NULL;
}
470 /*U8 *src=
471 "<!DOCTYPE html>"
472 "<!-- Comment -->"
473 "<!-- Comment <!-- Comment2 --> -->"
474 "<HTML><H1>Potatoes<A HREF=\"duck\">Link</A></H1>"
475 "</HTML>";
476 CHTML *n=ParseHTMLNode(src,&src);
477 DumpHTML(n);
478 HTMLNodeDel(n);
479 n=ParseHTMLNode(src,&src);
480 DumpHTML(n);
481 HTMLNodeDel(n);*/
482 
//Appends the visible text below node h to the document tmp.
//
//Elements recurse into their children, except STYLE and SCRIPT which are
//dropped entirely. Text nodes are printed after two clean-ups on a private
//copy: every '\d' becomes 'D', and a space that is directly followed by
//another space is removed. Other node kinds are ignored.
U0 _HTML2Text(CDoc *tmp,CHTML *h) {
  CQue *kids,*kid;
  I64 n,i;
  U8 *buf,*out;
  switch(h->elem.type) {
    case H_NODE_ELEMENT:
      if(!StrICmp("STYLE",h->elem.type2)||!StrICmp("SCRIPT",h->elem.type2))
        break; //Dont dump style
      kids=&h->elem.children;
      kid=kids->next;
      while(kid!=kids) {
        _HTML2Text(tmp,kid);
        kid=kid->next;
      }
      break;
    case H_NODE_TEXT:
      buf=StrNew(h->text.str);
      out=buf;
      n=StrLen(buf);
      //Compact in place; out never runs ahead of the read position
      for(i=0;i<n;i++) {
        if(buf[i]=='\d')
          buf[i]='D';
        if(buf[i]!=' '||buf[i+1]!=' ')
          *out++=buf[i];
      }
      *out=0;
      DocPrint(tmp,"%s",buf);
      Free(buf);
      break;
  }
}
//Converts the HTML in src to text.
//
//Nodes are parsed one after another until ParseHTMLNode returns NULL; each
//is rendered into a scratch document and deleted. len is accepted but not
//used. Returns what DocSave produces for the scratch document.
U8 *HTML2Text(U8 *src,I64 len) {
  CDoc *doc=DocNew;
  CHTML *node;
  U8 *res;
  while(TRUE) {
    node=ParseHTMLNode(src,&src);
    if(!node)
      break;
    _HTML2Text(doc,node);
    HTMLNodeDel(node);
  }
  res=DocSave(doc);
  DocDel(doc);
  return res;
}
//Driver: converts out.utf8 to text and stores the result in Latin.TXT.
//Nothing is parsed or written when the input cannot be read.
I64 len;
U8 *src=FileRead("out.utf8",&len);
U8 *text=NULL;
if(src) {
  text=HTML2Text(src,len);
  Free(src);
  FileWrite("Latin.TXT",text,StrLen(text));
  Free(text);
} else {
  "Could not read out.utf8\n";
}