//Minimal HTTP fetcher and HTML-to-text converter.
//The networking helpers are only compiled when the runtime provides
//NetSocketNew.
#ifdef NetSocketNew
//Reads one line from sock and returns it as a freshly allocated,
//NUL-terminated string that the caller must Free.
//A line ends at '\r'; the byte that follows it is consumed and
//discarded. Gives up after roughly one second (tS based).
//Returns NULL when nothing was read before the timeout, or when
//NetPollForHangup reports something other than -1.
//A partial line is returned as soon as a read yields no byte.
U8 *NetReadLn(I64 sock) {
  I64 cnt=0,ch=0,c;
  U8 *tmp=StrNew(""),*tmp2;
  F64 st=tS;
  while(st+1.>tS) {
    c=NetRead(sock,&ch,1);
    if(c==1) {
      if(ch=='\r') {
        NetRead(sock,&ch,1); //Skip '\n'?
        return tmp;
      }
      //Grow so that both the new byte and the terminator fit
      if(MSize(tmp)<=cnt+1) {
        tmp2=tmp;
        tmp=MAlloc(MSize(tmp2)*3);
        MemCpy(tmp,tmp2,cnt+1);
        Free(tmp2);
      }
      tmp[cnt++]=ch;
      tmp[cnt]=0;
    } else if(cnt)
      return tmp;
    Yield;
    if(-1!=NetPollForHangup(1,&sock)) {
      Free(tmp);
      return NULL;
    }
  }
  if(cnt)
    return tmp;
  Free(tmp);
  return NULL;
}

//Formats fmt with the variadic arguments and writes the result to
//sock followed by "\r\n".
U0 NetPrintLn(I64 sock,U8 *fmt,...) {
  U8 *tmp=StrPrintJoin(NULL,fmt,argc,argv);
  I64 len=StrLen(tmp);
  if(len)
    NetWrite(sock,tmp,len);
  NetWrite(sock,"\r\n",2);
  Free(tmp);
}

//Fetches url over HTTP on port 80 and returns the response body as a
//newly allocated string (caller Frees).
//Anything up to and including "//" is dropped from url. The text up
//to the first '/' is the host, the rest is the path ("/" if absent).
//If the path contains '?', the text after it is sent as the body of a
//form-urlencoded POST; otherwise a GET is issued.
//Response lines are read with NetReadLn and concatenated, so line
//terminators are not part of the returned text. Headers are skipped.
U8 *FetchURL(U8 *url) {
  Bool body=FALSE;
  U8 *ret,*location,*path,*post,*tmp,*tmp2;
  I64 cnt=0,len,sock;
  CNetAddr *addr;
  tmp=StrMatch("//",url);
  if(tmp)
    url=tmp+2;
  location=StrNew(url);
  tmp=StrFirstOcc(location,"/");
  if(tmp)
    *tmp=0;
  tmp=StrFirstOcc(url,"/");
  if(tmp)
    path=StrNew(tmp);
  else
    path=StrNew("/");
  post=StrFirstOcc(path,"?");
  if(post) {
    *post=0;
    post=StrNew(post+1);
  }
  sock=NetSocketNew;
  addr=NetAddrNew(location,80);
  NetConnect(sock,addr);
  if(post)
    NetPrintLn(sock,"POST %s HTTP/1.1",path);
  else
    NetPrintLn(sock,"GET %s HTTP/1.1",path);
  NetPrintLn(sock,"Host: %s",location);
  NetPrintLn(sock,"Accept: */*");
  if(post) {
    NetPrintLn(sock,"Content-Length: %d",StrLen(post));
    NetPrintLn(sock,"Content-Type: application/x-www-form-urlencoded");
  }
  NetPrintLn(sock,"");
  //The announced Content-Length must be followed by that many bytes
  if(post&&StrLen(post))
    NetWrite(sock,post,StrLen(post));
  ret=StrNew("");
  while(TRUE) {
    tmp=NetReadLn(sock);
    if(!tmp)
      break;
    if(body) {
      len=StrLen(tmp);
      //Room is needed for what is already stored plus the new line
      if(MSize(ret)<=cnt+len+1) {
        tmp2=ret;
        ret=MAlloc((cnt+len+1)<<1);
        MemCpy(ret,tmp2,cnt+1);
        Free(tmp2);
      }
      StrCpy(ret+cnt,tmp);
      cnt+=len;
    } else if(!StrLen(tmp))
      body=TRUE; //Blank line ends the headers
    Free(tmp);
  }
  NetAddrDel(addr);
  NetClose(sock);
  Free(location);
  Free(path);
  Free(post);
  return ret;
}
#endif

//Decodes the UTF-8 sequence at st and returns its code point.
//If en is given, *en receives the address just past the sequence.
//A lead byte that is not followed by the required continuation bytes
//(or is not a valid lead byte at all) is returned as-is and consumes
//exactly one byte, so *en never moves past the string terminator.
I64 ReadUTF8(U8 *st,U8 **en=NULL) {
  I64 ch=*st,len=1;
  if((ch&0b11100000)==0b11000000) {
    if((st[1]&0b11000000)==0b10000000) {
      ch=((st[0]&0b11111)<<6)|(st[1]&0b111111);
      len=2;
    }
  } else if((ch&0b11110000)==0b11100000) {
    if(((st[1]&0b11000000)==0b10000000)&&
       ((st[2]&0b11000000)==0b10000000)) {
      ch=((st[0]&0b1111)<<12)|((st[1]&0b111111)<<6)|(st[2]&0b111111);
      len=3;
    }
  } else if((ch&0b11111000)==0b11110000) {
    if(((st[1]&0b11000000)==0b10000000)&&
       ((st[2]&0b11000000)==0b10000000)&&
       ((st[3]&0b11000000)==0b10000000)) {
      ch=((st[0]&0b111)<<18)|((st[1]&0b111111)<<12)|
         ((st[2]&0b111111)<<6)|(st[3]&0b111111);
      len=4;
    }
  }
  if(en)
    *en=st+len;
  return ch;
}

//Node kinds stored in CHTMLNode.type
#define H_NODE_TEXT 1
#define H_NODE_ELEMENT 2
#define H_NODE_ATTRIBUTE 3

//Common header of every node: queue links plus the kind tag.
class CHTMLNode:CQue {
  I64 type;
};
//name="value" pair; value stays NULL when the attribute has none.
class CHTMLAttr:CHTMLNode {
  U8 *name,*value;
};
//Run of character data.
class CHTMLText:CHTMLNode {
  U8 *str;
};
//Element; type2 is the tag name, the two CQue members are the heads
//of the attribute and child lists.
class CHTMLElem:CHTMLNode {
  U8 *type2;
  CQue attributes;
  CQue children;
};
//Lets one pointer be viewed as any node kind.
union CHTML {
  CHTMLNode node;
  CHTMLAttr attr;
  CHTMLElem elem;
  CHTMLText text;
};

//Prints n and, for elements, its children followed by its attributes,
//indented by two columns.
U0 DumpHTML(CHTML *n) {
  CHTML *head,*cur;
  switch(n->elem.type) {
    case H_NODE_TEXT:
      "STR:(%Q)\n",n->text.str;
      break;
    case H_NODE_ATTRIBUTE:
      if(n->attr.value)
        "%Q=%Q\n",n->attr.name,n->attr.value;
      else
        "%Q\n",n->attr.name;
      break;
    case H_NODE_ELEMENT:
      "ELEM:%s\n",n->elem.type2;
      "$ID,2$";
      head=&n->elem.children;
      for(cur=head->elem.next;cur!=head;cur=cur->elem.next)
        DumpHTML(cur);
      head=&n->elem.attributes;
      for(cur=head->elem.next;cur!=head;cur=cur->elem.next)
        DumpHTML(cur);
      "$ID,-2$";
      break;
  }
}

//Unlinks n from its queue and frees it together with everything it
//owns (strings, attributes, children).
U0 HTMLNodeDel(CHTML *n) {
  CHTML *head,*cur,*next;
  switch(n->elem.type) {
    case H_NODE_TEXT:
      Free(n->text.str);
      break;
    case H_NODE_ATTRIBUTE:
      Free(n->attr.name);
      Free(n->attr.value);
      break;
    case H_NODE_ELEMENT:
      Free(n->elem.type2);
      //The successor is fetched before the node is freed
      head=&n->elem.children;
      for(cur=head->elem.next;cur!=head;cur=next) {
        next=cur->elem.next;
        HTMLNodeDel(cur);
      }
      head=&n->elem.attributes;
      for(cur=head->elem.next;cur!=head;cur=next) {
        next=cur->elem.next;
        HTMLNodeDel(cur);
      }
      break;
  }
  QueRem(n);
  Free(n);
}

//If st starts with "<!--", returns the address just past the "-->"
//that closes it; "<!--" seen inside the comment nests. Otherwise
//returns st unchanged. An unterminated comment runs to the end of
//the string.
U8 *SkipComment(U8 *st) {
  U8 *open,*close;
  I64 depth=1;
  if(StrNCmp("<!--",st,4))
    return st;
  st+=4;
  while(depth) {
    open=StrMatch("<!--",st);
    close=StrMatch("-->",st);
    if(!close)
      return st+StrLen(st);
    if(open&&open<close) {
      depth++;
      st=open+4;
    } else {
      depth--;
      st=close+3;
    }
  }
  return st;
}

//Skips leading white space and then, if one starts there, a single
//comment.
U8 *SkipWhitespace(U8 *s) {
  while(*s&&Bt(char_bmp_white_space,*s))
    s++;
  if(!StrNCmp("<!--",s,4))
    return SkipComment(s);
  return s;
}

//TRUE for tags that never have content or a closing tag.
Bool HTMLIsVoidTag(U8 *name) {
  return !StrICmp(name,"area")||
         !StrICmp(name,"!DOCTYPE")||
         !StrICmp(name,"base")||
         !StrICmp(name,"br")||
         !StrICmp(name,"col")||
         !StrICmp(name,"embed")||
         !StrICmp(name,"hr")||
         !StrICmp(name,"img")||
         !StrICmp(name,"input")||
         !StrICmp(name,"link")||
         !StrICmp(name,"meta")||
         !StrICmp(name,"source")||
         !StrICmp(name,"track")||
         !StrICmp(name,"wbr");
}

//Maps Latin Extended-A code points (U+0100..U+017E) to a lower-case
//ASCII base letter; every other value is returned unchanged.
//https://en.wiktionary.org/wiki/Appendix:Unicode/Latin_Extended-A
I64 HTMLLatinFold(I64 ch) {
  switch(ch) {
    case 0x100...0x105:
      return 'a';
    case 0x106...0x10D:
      return 'c';
    case 0x10E...0x111:
      return 'd';
    case 0x112...0x11B:
      return 'e';
    case 0x11C...0x123:
      return 'g';
    case 0x124...0x127:
      return 'h';
    case 0x128...0x133:
      return 'i';
    case 0x134...0x135:
      return 'j';
    case 0x136...0x138:
      return 'k';
    case 0x139...0x142:
      return 'l';
    case 0x143...0x14B:
      return 'n';
    case 0x14C...0x151:
      return 'o';
    case 0x154...0x159:
      return 'r';
    case 0x15A...0x161:
      return 's';
    case 0x162...0x167:
      return 't';
    case 0x168...0x173:
      return 'u';
    case 0x174...0x175:
      return 'w';
    case 0x176...0x178:
      return 'y';
    case 0x179...0x17E:
      return 'z';
  }
  return ch;
}

//Parses one node starting at st.
//  en          if given, receives the address just past the node
//  allow_types bit mask (1<<H_NODE_xxx) of the kinds that may be
//              produced
//Leading comments are skipped. Returns the new node (release it with
//HTMLNodeDel) or NULL when nothing of an allowed kind starts at st.
//Prints a message and throws 'HTML' on malformed input.
//Tag, attribute and entity names longer than STR_LEN-1 characters
//are truncated.
CHTMLNode *ParseHTMLNode(U8 *st,U8 **en=NULL,I64 allow_types=1<<H_NODE_ELEMENT|1<<H_NODE_TEXT) {
  CHTML *e,*e2;
  U8 name[STR_LEN],*tmp,*tmp2;
  I64 i,ch,len;
  while(!StrNCmp(st,"<!--",4))
    st=SkipComment(st);
  if(*st=='<'&&(allow_types&(1<<H_NODE_ELEMENT))) {
    e=CAlloc(sizeof(CHTMLElem));
    QueInit(e);
    QueInit(&e->elem.attributes);
    QueInit(&e->elem.children);
    e->elem.type=H_NODE_ELEMENT;
    st++;
    i=0;
    while(Bt(char_bmp_alpha_numeric,*st)||*st=='!') { //Account for DOCTYPE html
      if(i<STR_LEN-1)
        name[i++]=*st;
      st++;
    }
    name[i]=0;
    e->elem.type2=StrNew(name);
    st=SkipWhitespace(st);
    while(*st!='>') {
      if(st[0]=='/'&&st[1]=='>')
        st++; //Accept the "/>" spelling
      else {
        e2=ParseHTMLNode(st,&st,1<<H_NODE_ATTRIBUTE);
        if(!e2) {
          "Expected an attribute\n";
          throw('HTML');
        }
        QueIns(e2,e->elem.attributes.last);
        st=SkipWhitespace(st);
      }
    }
    st++;
    if(!HTMLIsVoidTag(name)) {
      while(TRUE) {
        //A comment directly before the end tag must not hide it
        while(!StrNCmp(st,"<!--",4))
          st=SkipComment(st);
        if(st[0]=='<'&&st[1]=='/')
          break;
        e2=ParseHTMLNode(st,&st,1<<H_NODE_ELEMENT|1<<H_NODE_TEXT);
        if(!e2)
          throw('HTML');
        QueIns(e2,e->elem.children.last);
      }
      st+=2;
      i=0;
      while(Bt(char_bmp_alpha_numeric,*st)) {
        if(i<STR_LEN-1)
          name[i++]=*st;
        st++;
      }
      name[i]=0;
      if(StrICmp(name,e->elem.type2)) {
        "Got unexpected end tag(expected %s,got %s)\n",e->elem.type2,name;
        throw('HTML');
      }
      st=SkipWhitespace(st);
      if(*st!='>') {
        "Expected a '>'\n";
        throw('HTML');
      }
      st++;
    }
    if(en)
      *en=st;
    return e;
  }
  if(allow_types&(1<<H_NODE_TEXT)) {
    len=0;
    tmp=StrNew("");
    while(*st&&*st!='<') {
      if(*st=='&') {
        //Named character reference: &name;
        st++;
        i=0;
        while(*st!=';') {
          if(!Bt(char_bmp_alpha_numeric,*st)) {
            "Expected a ';'\n";
            throw('HTML');
          }
          if(i<STR_LEN-1)
            name[i++]=*st;
          st++;
        }
        name[i]=0;
        st++;
        if(!StrCmp(name,"lt"))
          ch='<';
        else if(!StrCmp(name,"gt"))
          ch='>';
        else if(!StrCmp(name,"amp"))
          ch='&';
        else if(!StrCmp(name,"quot"))
          ch='"';
        else if(!StrCmp(name,"apos"))
          ch='\'';
        else
          ch=' '; //Unknown references become a blank
      } else
        ch=ReadUTF8(st,&st);
      ch=HTMLLatinFold(ch);
      if(MSize(tmp)<=len+1) {
        tmp2=MAlloc(MSize(tmp)*2);
        MemCpy(tmp2,tmp,len);
        Free(tmp);
        tmp=tmp2;
      }
      tmp[len++]=ch; //Only the low byte of ch is kept
    }
    tmp[len]=0;
    if(en)
      *en=st;
    if(!len) {
      Free(tmp);
      return NULL;
    }
    e=CAlloc(sizeof(CHTMLText));
    e->text.str=tmp;
    e->elem.type=H_NODE_TEXT;
    QueInit(e);
    return e;
  }
  //Last one to try
  if(allow_types&(1<<H_NODE_ATTRIBUTE)) {
    st=SkipWhitespace(st);
    e=NULL;
    i=0;
    while(Bt(char_bmp_alpha_numeric,*st)) {
      if(i<STR_LEN-1)
        name[i++]=*st;
      st++;
    }
    if(i) {
      name[i]=0;
      e=CAlloc(sizeof(CHTMLAttr));
      e->elem.type=H_NODE_ATTRIBUTE;
      e->attr.name=StrNew(name);
      QueInit(e);
      st=SkipWhitespace(st);
      if(*st=='=') {
        st=SkipWhitespace(st+1);
        if(Bt(char_bmp_alpha_numeric,*st)) {
          //Unquoted value
          i=0;
          while(Bt(char_bmp_alpha_numeric,*st)) {
            if(i<STR_LEN-1)
              name[i++]=*st;
            st++;
          }
          name[i]=0;
          e->attr.value=StrNew(name);
        } else if(*st=='"'||*st=='\'') {
          //Quoted value, closed by the same quote character
          ch=*st++;
          tmp=st;
          while(*st!=ch) {
            if(!*st) {
              "Expected a '%c'\n",ch;
              throw('HTML');
            }
            st++;
          }
          i=st-tmp;
          st++; //Go past the closing quote
          tmp2=MAlloc(i+1);
          MemCpy(tmp2,tmp,i);
          tmp2[i]=0;
          e->attr.value=tmp2;
        }
      }
    }
    if(en)
      *en=st;
    return e;
  }
  return NULL;
}
/*U8 *src=
"<!DOCTYPE html>"
"<!-- Comment -->"
"<!-- Comment <!-- Comment2 --> -->"
"<HTML><H1>Potatoes<A HREF=\"duck\">Link</A></H1>"
"</HTML>";
CHTML *n=ParseHTMLNode(src,&src);
DumpHTML(n);
HTMLNodeDel(n);
n=ParseHTMLNode(src,&src);
DumpHTML(n);
HTMLNodeDel(n);*/

//Appends the text content of h (recursively) to the document tmp.
//The contents of STYLE and SCRIPT elements are left out. In text
//nodes every '\d' character is replaced by 'D' and runs of blanks
//are collapsed to one.
//NOTE(review): '\d' is presumably the escape for the dollar sign,
//which would otherwise start a document command - confirm.
U0 _HTML2Text(CDoc *tmp,CHTML *h) {
  CQue *head,*c;
  I64 len,idx;
  U8 *t,*dst;
  if(h->elem.type==H_NODE_ELEMENT) {
    if(StrICmp("STYLE",h->elem.type2)&&StrICmp("SCRIPT",h->elem.type2)) {
      head=&h->elem.children;
      for(c=head->next;c!=head;c=c->next)
        _HTML2Text(tmp,c);
    }
  } else if(h->elem.type==H_NODE_TEXT) {
    dst=t=StrNew(h->text.str);
    len=StrLen(t);
    for(idx=0;idx!=len;idx++) {
      if(t[idx]=='\d')
        t[idx]='D';
      //A blank that is followed by another blank is dropped
      if(!(t[idx]==' '&&t[idx+1]==' '))
        *dst++=t[idx];
    }
    *dst=0;
    DocPrint(tmp,"%s",t);
    Free(t);
  }
}

//Converts the HTML in src to plain text and returns the saved
//document contents (caller Frees). len is accepted but not used;
//src must be NUL-terminated.
U8 *HTML2Text(U8 *src,I64 len) {
  CDoc *tmp=DocNew;
  CHTML *h;
  while(h=ParseHTMLNode(src,&src)) {
    _HTML2Text(tmp,h);
    HTMLNodeDel(h);
  }
  src=DocSave(tmp);
  DocDel(tmp);
  return src;
}

//Script part: convert out.utf8 to Latin.TXT
I64 len;
U8 *src=FileRead("out.utf8",&len),*text;
if(src) {
  text=HTML2Text(src,len);
  Free(src);
  FileWrite("Latin.TXT",text,StrLen(text));
  Free(text);
}