//Returns TRUE if a ends with the suffix e (compared case-insensitively).
Bool EndsWith(U8 *a,U8 *e) {
  I64 alen=StrLen(a),elen=StrLen(e);
  if(elen>alen)
    return FALSE;
  return !StrICmp(a+alen-elen,e);
}

//Returns a newly allocated upper-case copy of str.
//Takes ownership of str: it is Free()d here, so callers pass a heap string
//(typically StrNew(x)) and must Free() the returned pointer instead.
U8 *Upperify(U8 *str) {
  U8 *res=StrNew(str),*p,c;
  Free(str);
  p=res;
  while(c=*p)
    *p++=ToUpper(c);
  return res;
}

//Returns a newly allocated copy of the who'th (1-based) part-of-speech
//record found in the dictionary data ACDDefsGet() returns for word, or NULL
//when there are fewer than who such records (or no data at all).
//The data is walked as a sequence of [type char][NUL-terminated text]
//records that ends at a zero type char.
U8 *WordForm0(U8 *word,I64 who=1) {
  U8 *defs=ACDDefsGet(word),*ptr=defs,*ret=NULL;
  while(ptr&&*ptr) {
    if(*ptr++==ACD_POS_CHAR) {
      if(!--who) {
        ret=StrNew(ptr);
        break;
      }
    }
    ptr+=StrLen(ptr)+1; //Skip this record's text and its terminator
  }
  Free(defs);
  return ret;
}

//Classifies word by its who'th dictionary part-of-speech entry.
//Returns one of 'Gerund','Pronoun','Adv','Prep','Adj','Verb','Noun', or 0
//when no (more) entries can be found.
//The word is tried as-is first, then with common suffixes removed
//(-ing,-ed,-s,-ies,-d). If copy_to is not NULL it receives the spelling
//that was finally looked up; when nothing matches it receives the last
//candidate stem tried (the word itself if no suffix rule applied).
//Entries whose label is not recognised are skipped by advancing who.
U64 WordForm1(U8 *word,U8 *copy_to=NULL,I64 who=1) {
  U8 *wf,buf[STR_LEN];
  U64 ret=0;
  if(EndsWith(word,"ing")) {
    StrCpy(buf,word);
    buf[StrLen(buf)-3]=0;
    if(wf=WordForm0(buf,who)) {
      Free(wf);
      if(copy_to) StrCpy(copy_to,buf);
      return 'Gerund';
    }
  }
  //Start from the full word again. If the gerund stem was left in buf,
  //words such as "KING" or "THING" would never be looked up as themselves.
  StrCpy(buf,word);
again:;
  if(wf=WordForm0(buf,who)) {
use_wf:;
    //Order matters: "n." is a substring of "pron." and "v." of "adv."
    if(StrIMatch("pron.",wf))
      ret='Pronoun';
    else if(StrIMatch("adv.",wf))
      ret='Adv';
    else if(StrIMatch("prep.",wf))
      ret='Prep'; //IsPrep() tests for 'Prep'
    else if(StrIMatch("a.",wf))
      ret='Adj';
    else if(StrIMatch("v.",wf))
      ret='Verb';
    else if(StrIMatch("n.",wf))
      ret='Noun';
    Free(wf);
    if(!ret) {
      ++who;
      goto again;
    }
    if(copy_to) StrCpy(copy_to,buf);
    return ret;
  }
  if(EndsWith(word,"ed")) {
    StrPrint(buf,"%s",word);
    buf[StrLen(buf)-2]=0;
    if(wf=WordForm0(buf,who))
      goto use_wf;
  }
  if(EndsWith(word,"s")) {
    StrPrint(buf,"%s",word);
    buf[StrLen(buf)-1]=0;
    if(wf=WordForm0(buf,who))
      goto use_wf;
  }
  if(EndsWith(word,"ies")) {
    StrPrint(buf,"%s",word);
    //good[ies]
    buf[StrLen(buf)-3]=0;
    if(wf=WordForm0(buf,who))
      goto use_wf;
    CatPrint(buf,"Y");
    //Stud[ies]->study
    //who must be passed here too. Always fetching entry 1 makes callers
    //that iterate over who loop forever.
    if(wf=WordForm0(buf,who))
      goto use_wf;
  }
  if(EndsWith(word,"d")) {
    StrPrint(buf,"%s",word);
    //use[d]
    buf[StrLen(buf)-1]=0;
    if(wf=WordForm0(buf,who))
      goto use_wf;
  }
  if(copy_to) StrCpy(copy_to,buf);
  return ret;
}

//Returns a newly allocated, upper-cased base form of str as reported by
//WordForm1(). str itself is neither modified nor freed.
U8 *BaseWordify(U8 *str) {
  U8 buf[STR_LEN];
  U8 *upper=Upperify(StrNew(str));
  WordForm1(upper,buf);
  Free(upper);
  return StrNew(buf);
}

//Returns the count stored for the relationship word->other, or 0 if none.
//Relationships are HTT_FRAME_PTR entries named "Relat.<word>" in the task's
//hash table, with user_data0 holding the other word and user_data1 the count.
I64 RelationshipsWithWord(U8 *word,U8 *other) {
  I64 who,cnt=0;
  CHashGeneric *gen;
  U8 buf[STR_LEN];
  StrPrint(buf,"Relat.%s",word);
  for(who=1;gen=HashSingleTableFind(buf,Fs->hash_table,HTT_FRAME_PTR,who);++who) {
    if(!StrICmp(gen->user_data0,other)) {
      cnt=gen->user_data1;
      break;
    }
  }
  return cnt;
}

//Adds amt to the relationship word->other, creating the entry if needed.
U0 AddRelationshipWithWord(U8 *word,U8 *other,I64 amt=1) {
  I64 who;
  Bool found=FALSE;
  CHashGeneric *gen;
  U8 buf[STR_LEN];
  StrPrint(buf,"Relat.%s",word);
  for(who=1;gen=HashSingleTableFind(buf,Fs->hash_table,HTT_FRAME_PTR,who);++who) {
    if(!StrICmp(gen->user_data0,other)) {
      gen->user_data1+=amt;
      found=TRUE;
    }
  }
  if(!found) {
    gen=CAlloc(sizeof CHashGeneric);
    gen->str=StrNew(buf);
    gen->type=HTT_FRAME_PTR;
    gen->user_data0=StrNew(other);
    gen->user_data1=amt;
    HashAdd(gen,Fs->hash_table);
  }
}

//Returns TRUE if word equals (ignoring case) one entry of lst, a list of
//NUL-separated strings terminated by an empty entry.
//LMF_EXACT keeps prefixes from counting: without it "HI" would match "HIM".
Bool WordInLst(U8 *word,U8 *lst) {
  return LstMatch(word,lst,LMF_IGNORE_CASE|LMF_EXACT)!=-1;
}

//Subject personal pronoun?
Bool PronSubj(U8 *word) {
  return WordInLst(word,"I\0YOU\0HE\0SHE\0IT\0WE\0THEY\0");
}

//Object personal pronoun?
Bool PronObj(U8 *word) {
  if(WordInLst(word,"ME\0YOU\0HIM\0HER\0IT\0US\0THEM\0"))
    return TRUE;
  //Reflexive's act as objects(?)
  return WordInLst(word,"MYSELF\0YOURSELF\0HIMSELF\0HERSELF\0ITSELF\0OURSELVES\0YOURSELVES\0THEMSELVES\0");
}

//Demonstrative pronoun?
Bool PronDemo(U8 *word) {
  return WordInLst(word,"THIS\0THAT\0THOSE\0THESE\0");
}

//Possessive determiner (my, your, ...)?
Bool DetPoss(U8 *word) {
  return WordInLst(word,"MY\0YOUR\0HER\0HIS\0ITS\0OUR\0THEIR\0");
}

//Possessive pronoun (mine, yours, ...)?
Bool PronPoss(U8 *word) {
  return WordInLst(word,"MINE\0YOURS\0HIS\0HERS\0ITS\0OURS\0THEIRS\0");
}

//Returns the grammatical person (1, 2 or 3) of a pronoun or possessive,
//or 0 if word is not in any of the lists.
I64 PronPerson(U8 *word) {
  if(WordInLst(word,"I\0ME\0MY\0MINE\0MYSELF\0"))
    return 1;
  if(WordInLst(word,"WE\0US\0OUR\0OURS\0OURSELVES\0"))
    return 1;

  if(WordInLst(word,"YOU\0YOUR\0YOURS\0YOURSELF\0"))
    return 2;
  if(WordInLst(word,"YOURSELVES\0"))
    return 2;

  if(WordInLst(word,"HE\0SHE\0IT\0HIM\0HER\0HIS\0ITS\0HERS\0HERSELF\0HIMSELF\0ITSELF\0"))
    return 3;
  if(WordInLst(word,"THEY\0THEM\0THEIR\0THEIRS\0THEMSELVES\0"))
    return 3;

  return 0;
}

//Returns TRUE for plural pronouns, possessives and demonstratives.
Bool PronPlural(U8 *word) {
  if(WordInLst(word,"WE\0US\0OUR\0OURS\0OURSELVES\0"))
    return TRUE;
  if(WordInLst(word,"YOURSELVES\0"))
    return TRUE;

  if(WordInLst(word,"THESE\0THOSE\0"))
    return TRUE;

  if(WordInLst(word,"THEY\0THEM\0THEIR\0THEIRS\0THEMSELVES\0"))
    return TRUE;

  return FALSE;
}

//Returns a gender code for word: 'y' for the first/second person forms
//listed below, 'm' for masculine, 'f' for feminine and 'i' for anything else.
U8 PronGender(U8 *word) {
  if(WordInLst(word,"I\0ME\0MINE\0MY\0MYSELF\0YOU\0YOURS\0YOURSELF\0WE\0US\0OUR\0OURS\0OURSELVES\0YOURSELVES\0"))
    return 'y';
  if(WordInLst(word,"HE\0HIS\0HIM\0HIMSELF\0"))
    return 'm';
  if(WordInLst(word,"HER\0HERS\0SHE\0HERSELF\0"))
    return 'f';
  return 'i';
}

//Builds a packed description of an article or pronoun.
//Returns 'D' for "the", 'I' for "a"/"an", 0 if word is neither an article
//nor one of the recognised pronoun/possessive/demonstrative forms.
//Otherwise returns up to 8 characters packed into a U64:
//  'A', person digit (if known), '+' plural or '=' singular,
//  'P' possessive determiner, 'O' possessive pronoun, 'D' demonstrative,
//  'o' object or 's' subject, then the PronGender() code.
U64 GetAntecedantCh(U8 *word) {
  if(!StrICmp("the",word))
    return 'D';
  if(!StrICmp("a",word)||!StrICmp("an",word))
    return 'I';
  I64 per=PronPerson(word);
  I64 own=DetPoss(word);
  I64 owned=PronPoss(word);
  I64 demo=PronDemo(word);
  U8 gender=PronGender(word);
  if(!per&&!own&&!owned&&!demo)
    return 0;
  U8 buf[STR_LEN];
  buf[0](U64)='A'; //Writes "A" plus 7 NULs, so buf is a valid string
  if(per) {
    CatPrint(buf,"%d",per);
  }
  if(PronPlural(word)) {
    CatPrint(buf,"+");
  } else
    CatPrint(buf,"=");

  if(own)
    CatPrint(buf,"P");
  if(owned)
    CatPrint(buf,"O");
  if(demo)
    CatPrint(buf,"D");
  if(PronObj(word))
    CatPrint(buf,"o");
  else if(PronSubj(word))
    CatPrint(buf,"s");
  CatPrint(buf,"%c",gender);
  return buf[0](U64); //First 8 bytes of the string as one integer
}

//Classifies word like WordForm1(), but reports articles as 'Art'/'DefArt'
//and the listed demonstratives, possessives and personal pronouns as 'Pron'.
//Returns 0 when WordForm1() finds no who'th entry, which is what ends the
//"for(who=1;WordForm(...);++who)" loops used by the callers.
U64 WordForm(U8 *word,U8 *copy_to=NULL,I64 who=1) {
  U64 base;
  if(copy_to) StrCpy(copy_to,word);
  if(!(base=WordForm1(word,copy_to,who)))
    return 0;
  if(WordInLst(word,"AN\0A\0"))
    return 'Art';
  if(!StrICmp(word,"the"))
    return 'DefArt';
  if(WordInLst(word,"THIS\0THAT\0THESE\0THOSE\0"))
    return 'Pron';
  if(WordInLst(word,"MY\0YOUR\0YOURS\0HIS\0HERS\0HER\0ITS\0THEIR\0THEIRS\0OUR\0OURS\0WHOSE\0"))
    return 'Pron';
  if(WordInLst(word,"I\0YOU\0HE\0SHE\0IT\0ME\0HER\0HIM\0WE\0THEY\0US\0THEM\0"))
    return 'Pron';
  if(WordInLst(word,"MYSELF\0YOURSELF\0HIMSELF\0HERSELF\0ITSELF\0OURSELVES\0THEMSELVES\0YOURSELVES\0"))
    return 'Pron';
  return base;
}

//Returns TRUE if any dictionary entry of word classifies as want.
Bool WordHasForm(U8 *word,U64 want) {
  I64 who=1;
  U64 type;
  for(;type=WordForm(word,NULL,who);++who) {
    if(type==want)
      return TRUE;
  }
  return FALSE;
}

Bool IsVerb(U8 *word) {
  return WordHasForm(word,'Verb');
}

Bool IsNoun(U8 *word) {
  return WordHasForm(word,'Noun');
}

Bool IsAdj(U8 *word) {
  return WordHasForm(word,'Adj');
}

//Returns TRUE if word has a noun entry and looks like a plural:
//it ends in "ES", or it ends in "S" while its looked-up base form does not.
Bool IsPluralNoun(U8 *word) {
  I64 who=1;
  U64 type;
  U8 buf[STR_LEN];
  for(;type=WordForm(word,buf,who);++who) {
    if(type=='Noun') {
      //Any "...ES" ending counts, which also covers "...IES" and "...ZES"
      if(EndsWith(word,"ES"))
        return TRUE;
      if(EndsWith(word,"S")&&!EndsWith(buf,"S"))
        return TRUE;
    }
  }
  return FALSE;
}

Bool IsPrep(U8 *word) {
  return WordHasForm(word,'Prep');
}

I64 total=0,total_relats=0,unique=0;

//Returns the "Freq.<base word>" counter for a (0 if it was never counted).
F64 WordOccurances(U8 *a) {
  U8 buf[STR_LEN];
  U8 *base=BaseWordify(a); //BaseWordify() copies a, so no StrNew() is needed
  I64 r=FramePtr(StrPrint(buf,"Freq.%s",base));
  Free(base);
  return r;
}

#define FIXED 25.

//Returns TRUE if there is a '\n' in the range [from,to).
Bool NewlineBetween(U8 *from,U8 *to) {
  while(from<to) {
    if(*from=='\n')
      return TRUE;
    ++from;
  }
  return FALSE;
}

//Capacity of the per-sentence word buffer used by ConceptRank()
#define CONCEPT_SENT_MAX 0x1000

//Records one finished sentence and then frees its words.
//For every ordered pair of non-stop-words (a word paired with itself
//included) a relationship is added, total_relats is bumped and the
//"Relats.<word>" counter is incremented. For the self pair the
//"Freq.<word>" counter is also incremented first, and unique is bumped the
//first time a word is seen.
U0 ConceptRankFlush(U8 **words,I64 cnt) {
  I64 idx,idx2;
  U8 buf[STR_LEN];
  CHashGeneric *gen;
  for(idx=0;idx!=cnt;++idx)
    for(idx2=0;idx2!=cnt;++idx2) {
      if(!IsStopWord(words[idx])&&!IsStopWord(words[idx2])) { //TODO stop words
        if(idx==idx2) {
          StrPrint(buf,"Freq.%s",words[idx]);
          if(gen=HashSingleTableFind(buf,Fs->hash_table,HTT_FRAME_PTR)) {
            ++gen->user_data0;
          } else {
            FramePtrAdd(buf,1);
            ++unique;
          }
        }
        AddRelationshipWithWord(words[idx],words[idx2]);
        ++total_relats;
        StrPrint(buf,"Relats.%s",words[idx]);
        if(!FramePtr(buf))
          FramePtrAdd(buf,1);
        else
          FramePtrSet(buf,1+FramePtr(buf));
      }
    }
  while(--cnt>=0)
    Free(words[cnt]);
}

//Reads the text in file word by word with GetWord() and accumulates word
//frequencies and co-occurrence relationships per sentence.
//A word starting with '.' ends the sentence. Whatever is left at the end of
//the text is treated as a final sentence. Updates total, total_relats and
//unique.
//NOTE(review): GetWord() is assumed to return a heap string the caller owns
//(the original already Free()d it for '.') - confirm against its definition.
U0 ConceptRank(U8 *file) {
  U8 *a=GetWord(&file),*b=GetWord(&file);
  U8 *sentence_words[CONCEPT_SENT_MAX];
  I64 sent_len=2;
  //The first two words are always taken as the start of the first sentence
  sentence_words[0]=BaseWordify(a);
  sentence_words[1]=BaseWordify(b);
  Free(a);
  Free(b);
  total=2;
  while(*file) {
    a=GetWord(&file);
    if(!a)
      break; //Nothing more to read. Flush what we have below.
    if(*a!='.') {
      if(sent_len==CONCEPT_SENT_MAX) {
        //Buffer full: close the sentence here rather than overrun it
        ConceptRankFlush(sentence_words,sent_len);
        sent_len=0;
      }
      DbgPrint("%s",a);
      sentence_words[sent_len++]=BaseWordify(a);
      DbgPrint(",%s\n",sentence_words[sent_len-1]);
      ++total;
      Free(a);
    } else {
      Free(a);
      ConceptRankFlush(sentence_words,sent_len);
      sent_len=0;
      if(!*file)
        return;
    }
  }
  //End of passage (paragraph)
  ConceptRankFlush(sentence_words,sent_len);
}