// Decode one UTF-8 sequence starting at st and return its code point.
// If en is given, *en receives the address of the first byte after the
// sequence. A malformed or truncated sequence yields U+FFFD and consumes
// only the lead byte, so decoding never steps past the string terminator.
I64 ReadUTF8(U8 *st,U8 **en=NULL)
{
  I64 ch=*st,len,i;
  if ((ch&0b10000000)==0)
    len=1;
  else if ((ch&0b11100000)==0b11000000) {
    len=2;
    ch&=0b11111;
  } else if ((ch&0b11110000)==0b11100000) {
    len=3;
    ch&=0b1111;
  } else if ((ch&0b11111000)==0b11110000) {
    len=4;
    ch&=0b111;
  } else {
    // Stray continuation byte or invalid lead byte
    len=1;
    ch=0xFFFD;
  }
  for (i=1;i<len;i++) {
    if ((st[i]&0b11000000)!=0b10000000) {
      // Missing continuation byte (possibly the terminating NUL)
      len=1;
      ch=0xFFFD;
      break;
    }
    ch=(ch<<6)|(st[i]&0b111111);
  }
  if (en)
    *en=st+len;
  return ch;
}

// Reduce a code point to one byte: Latin Extended-A letters become their
// unaccented lowercase base letter, values in 31..128 pass through
// unchanged, and everything else becomes '?'.
U8 Transliterate(U64 ch)
{
//https://en.wiktionary.org/wiki/Appendix:Unicode/Latin_Extended-A
  switch (ch) {
    case 0x100...0x105: ch='a'; break;
    case 0x106...0x10d: ch='c'; break;
    case 0x10e...0x111: ch='d'; break;
    case 0x112...0x11b: ch='e'; break;
    case 0x11c...0x123: ch='g'; break;
    case 0x124...0x127: ch='h'; break;
    case 0x128...0x133: ch='i'; break;
    case 0x134...0x135: ch='j'; break;
    case 0x136...0x138: ch='k'; break;
    case 0x139...0x142: ch='l'; break;
    case 0x143...0x14b: ch='n'; break;
    case 0x14c...0x151: ch='o'; break;
    case 0x154...0x159: ch='r'; break;
    case 0x15a...0x161: ch='s'; break;
    case 0x162...0x167: ch='t'; break;
    case 0x168...0x173: ch='u'; break;
    case 0x174...0x175: ch='w'; break;
    case 0x176...0x178: ch='y'; break;
    case 0x179...0x17e: ch='z'; break;
    default:
      if (!(31<=ch<=128))
        ch='?';
  }
  return ch;
}

// Values stored in CJson.type
#define J_STR 1
#define J_NUM 2
#define J_ARRAY 3
#define J_NODE 4

// One parsed JSON value.
class CJson {
  I64 type; // One of the J_* constants above
  union {
    U8 *str;                // J_STR: heap-allocated text
    F64 num;                // J_NUM
    CHashTable *hash_table; // J_NODE/J_ARRAY: children, keyed by name or index
  }
  I64 cnt; // Number of elements appended so far (used for array indices)
};

// Shared implementation of sqstring()/dqstring().
// Reads a string delimited by `quote` starting at ptr. A backslash makes the
// following character literal. Each decoded code point is transliterated to
// one byte; at most 255 bytes are stored, the rest of the string is still
// consumed. Returns a newly allocated buffer, or NULL (with *en==ptr) when
// ptr does not start with `quote`. On success *en points just past the
// closing quote, or at the terminator if the string is unterminated.
U8 *JsonQuotedStr(U8 *ptr,U8 **en,U8 quote)
{
  U8 *buf;
  I64 len=0,cp;
  if (en)
    *en=ptr;
  if (*ptr!=quote)
    return NULL;
  // Allocate only after the check so the NULL path does not leak
  buf=MAlloc(256);
  ptr++;
  while (*ptr&&*ptr!=quote) {
    if (*ptr=='\\') {
      ptr++;
      if (!*ptr) // Backslash was the last character
        break;
    }
    cp=ReadUTF8(ptr,&ptr);
    if (len<255)
      buf[len++]=Transliterate(cp);
  }
  buf[len]=0;
  if (*ptr==quote) // Skip the closing quote only when it is really there
    ptr++;
  if (en)
    *en=ptr;
  return buf;
}

// Read a single-quoted string; see JsonQuotedStr().
U8 *sqstring(U8 *ptr,U8 **en)
{
  return JsonQuotedStr(ptr,en,'\'');
}

// Read a double-quoted string; see JsonQuotedStr().
U8 *dqstring(U8 *ptr,U8 **en)
{
  return JsonQuotedStr(ptr,en,'"');
}

// Return the first position at or after s that is not whitespace
// (or the terminator).
U8 *SkipWhitespace(U8 *s)
{
  for (;*s&&Bt(char_bmp_white_space,*s);s++);
  return s;
}

// Read a run of alphanumeric characters (an unquoted key).
// Returns a newly allocated copy, truncated to 255 bytes, or NULL (with
// *en==ptr) when ptr does not start with an alphanumeric character.
// On success *en points just past the run.
U8 *word(U8 *ptr,U8 **en)
{
  U8 *buf;
  I64 len=0;
  if (en)
    *en=ptr;
  if (!Bt(char_bmp_alpha_numeric,*ptr))
    return NULL;
  buf=MAlloc(256);
  while (Bt(char_bmp_alpha_numeric,*ptr)) {
    if (len<255) // Never write past the 256-byte buffer
      buf[len++]=*ptr;
    ptr++;
  }
  buf[len]=0;
  if (en)
    *en=ptr;
  return buf;
}

// One block of the on-disk word index: a 26-way node keyed by letter.
// NOTE(review): pad is declared I64 although its length expression looks
// like a byte count (BLK_SIZE minus the preceding fields) -- confirm whether
// U8 was intended.
class CIndexBlk {
  I64 ln;                 // Line number recorded for the word ending here
  U8 *body['z'-'a'+1];    // Per-letter child block number (0 = none)
  I64 pad[BLK_SIZE-8-8*('z'-'a'+1)];
};

// Parse one JSON-like value starting at st.
// Accepts objects (keys may be single-quoted, double-quoted or bare words),
// arrays, single- or double-quoted strings, and numbers. Objects and arrays
// store their children in a hash table; array elements are keyed by their
// decimal index. If en is given, *en receives the position after the value.
// Throws 'JSON' on malformed input; values built before the error are not
// released.
CJson *ParseJson(U8 *st,U8 **en=NULL)
{
  CJson *ret=NULL;
  U8 *name;
  F64 num;
  CHashGeneric *g;
  st=SkipWhitespace(st);
  if (*st=='{') {
    ret=CAlloc(sizeof CJson);
    ret->type=J_NODE;
    ret->hash_table=HashTableNew(0x8);
    st=SkipWhitespace(st+1);
    while (*st!='}') {
      if (!*st)
        throw('JSON');
      switch (*st) {
        case '\'':
          name=sqstring(st,&st);
          break;
        case '"':
          name=dqstring(st,&st);
          break;
        default:
          name=word(st,&st);
          break;
      }
      if (!name)
        throw('JSON');
      st=StrFirstOcc(st,":");
      if (!st)
        throw('JSON');
      st++;
      g=CAlloc(sizeof CHashGeneric);
      g->str=name;
      g->type=HTT_WORD;
      g->user_data0=ParseJson(st,&st);
      HashAdd(g,ret->hash_table);
      st=StrFirstOcc(st,",}");
      if (!st)
        throw('JSON');
      if (*st==',')
        st++;
      st=SkipWhitespace(st);
    }
    st++;
  } else if (*st=='\'') {
    ret=CAlloc(sizeof CJson);
    ret->type=J_STR;
    ret->str=sqstring(st,&st);
  } else if (*st=='\"') {
    ret=CAlloc(sizeof CJson);
    ret->type=J_STR;
    ret->str=dqstring(st,&st);
  } else if (*st=='[') {
    st=SkipWhitespace(st+1);
    ret=CAlloc(sizeof CJson);
    ret->type=J_ARRAY;
    ret->hash_table=HashTableNew(0x10);
    while (*st!=']') {
      if (!*st) // Unterminated array
        throw('JSON');
      g=CAlloc(sizeof CHashGeneric);
      g->str=MStrPrint("%d",ret->cnt++);
      g->type=HTT_DICT_WORD;
      g->user_data0=ParseJson(st,&st);
      HashAdd(g,ret->hash_table);
      st=StrFirstOcc(st,",]");
      if (!st)
        throw('JSON');
      if (*st==',')
        st++;
      st=SkipWhitespace(st);
    }
    st++;
  } else {
    // Validate the number before allocating so a bad token leaks nothing
    name=st;
    num=Str2F64(st,&st);
    if (name==st)
      throw('JSON');
    ret=CAlloc(sizeof CJson);
    ret->type=J_NUM;
    ret->num=num;
  }
  if (en)
    *en=st;
  return ret;
}

// Recursively free a value returned by ParseJson().
U0 JsonDel(CJson *j)
{
  I64 bucket;
  CHashGeneric *g;
  switch (j->type) {
    case J_STR:
      Free(j->str);
      break;
    case J_NUM:
      break;
    case J_NODE:
    case J_ARRAY:
      // Free the child values first, then the table that references them
      for (bucket=0;bucket<=j->hash_table->mask;bucket++)
        for (g=j->hash_table->body[bucket];g;g=g->next)
          JsonDel(g->user_data0);
      // Presumably releases the table together with its entries -- confirm
      HashTableDel(j->hash_table);
  }
  Free(j);
}

// Print a value and, recursively, its children. Objects and arrays are both
// printed as a brace-delimited "key:value," list in hash-bucket order.
U0 DumpJson(CJson *j)
{
  I64 bucket;
  CHashGeneric *g;
  switch (j->type) {
    case J_STR:
      "\"%Q\"",j->str;
      break;
    case J_NUM:
      "%n",j->num;
      break;
    case J_NODE:
    case J_ARRAY:
      "{$$ID,2$$\n"; // Presumably a DolDoc indent command -- confirm
      for (bucket=0;bucket<=j->hash_table->mask;bucket++)
        for (g=j->hash_table->body[bucket];g;g=g->next) {
          "%s:",g->str;
          DumpJson(g->user_data0);
          ",\n";
        }
      "$$ID,-2$$\n}";
  }
}

// Top-level demo: parse a sample document and print it.
CJson *j=ParseJson("{a:1,b:'adsdsadsa',c:[1,2,3]}",NULL);
DumpJson(j);

// Walk the on-disk letter tree for str, starting at block ptr, creating
// missing nodes on the way, and return the block number of the node for the
// whole word.
//   max  - in/out: next unused block number; advanced for each new node
//   file - index file being read and written
//   str  - remaining letters; each must map to 0..25 via ToUpper(c)-'A'
//          (not checked here)
// Throws 'dsda' if an existing link is followed from a block number above
// *max.
I64 GetWordPtr(I64 *max,CFile *file,U8*str,I64 ptr=0)
{
  if(!*str)
    return ptr;
  I64 idx=ToUpper(*str)-'A';
  CIndexBlk dummy;
  FBlkRead(file,&dummy,ptr,1);
  if(!dummy.body[idx]) {
    // No child for this letter yet: link a new block and write it zeroed
    dummy.body[idx]=*max;
    FBlkWrite(file,&dummy,ptr,1);
    MemSet(&dummy,0,sizeof CIndexBlk);
    FBlkWrite(file,&dummy,ptr=(*max)++,1);
    return GetWordPtr(max,file,str+1,ptr);
  }
  if(ptr>*max)
    throw('dsda');
  return GetWordPtr(max,file,str+1,dummy.body[idx]);
}

U0 MakeIndex(U8 *outname,U8 *in_name) {
  CIndexBlk dummy;
  U8 *optr=FileRead(in_name),*fptr;
  CFile *file;
  CHeapCtrl *cc;
  CHashGeneric *g;
  CJson *j,*str;
  Del(outname);
  file=FOpen(outname,"w");
  MemSet(&dummy,0,sizeof CIndexBlk);
  FBlkWrite(file,&dummy);
  ClassRep(&dummy);
  fptr=optr;
  I64 ln=0,blk,sz=1;
  do {
    fptr=SkipWhitespace(fptr);
    if(!*fptr) break;
    j=ParseJson(fptr,&fptr);
    if(j&&j->type==J_NODE) {
      g=HashFind("word",j->hash_table,-1);
      str=g->user_data0;
      if(str&&str->type=J_STR) {
        //Ensure all charactors are alpha
        for(blk=0;blk!=StrLen(str->str);blk++) {
          if(!('A'<=ToUpper(str->str[blk])<='Z')) goto skip;
        }
        blk=GetWordPtr(&sz,file,str->str);
        FBlkRead(file,&dummy,blk,1);
        dummy.ln=ln;
        FBlkWrite(file,&dummy,blk,1);
skip:;
      }
    }
    JsonDel(j);
    ln++;
  } while(TRUE);
  file->de.size=sz<