correct.c

   1 /*
   2  * Copyright (C) 2012, 2013
   3  *     Dale Weiler
   4  *     Wolfgang Bumiller
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy of
   7  * this software and associated documentation files (the "Software"), to deal in
   8  * the Software without restriction, including without limitation the rights to
   9  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10  * of the Software, and to permit persons to whom the Software is furnished to do
  11  * so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24 #include "gmqcc.h"
  25
  26 /*
  27  * This is a very clever method for correcting mistakes in QuakeC code
  28  * most notably when invalid identifiers are used or inproper assignments;
  29  * we can proprly lookup in multiple dictonaries (depening on the rules
  30  * of what the task is trying to acomplish) to find the best possible
  31  * match.
  32  *
  33  *
  34  * A little about how it works, and probability theory:
  35  *
  36  *  When given an identifier (which we will denote I), we're essentially
  37  *  just trying to choose the most likely correction for that identifier.
  38  *  (the actual "correction" can very well be the identifier itself).
  39  *  There is actually no way to know for sure that certian identifers
  40  *  such as "lates", need to be corrected to "late" or "latest" or any
  41  *  other permutations that look lexically the same.  This is why we
  42  *  must advocate the usage of probabilities.  This means that instead of
  43  *  just guessing, instead we're trying to find the correction for C,
  44  *  out of all possible corrections that maximizes the probability of C
  45  *  for the original identifer I.
  46  *
  47  *  Thankfully there exists some theroies for probalistic interpretations
  48  *  of data.  Since we're operating on two distictive intepretations, the
  49  *  transposition from I to C. We need something that can express how much
  50  *  degree of I should rationally change to become C.  this is called the
  51  *  Bayesian interpretation. You can read more about it from here:
  52  *  http://www.celiagreen.com/charlesmccreery/statistics/bayestutorial.pdf
  53  *  (which is probably the only good online documentation for bayes theroy
  54  *  no lie.  Everything else just sucks ..)
  55  *
  56  *  Bayes' Thereom suggests something like the following:
  57  *      AC P(I|C) P(C) / P(I)
  58  *
  59  *  However since P(I) is the same for every possibility of I, we can
  60  *  completley ignore it giving just:
  61  *      AC P(I|C) P(C)
  62  *
  63  *  This greatly helps visualize how the parts of the expression are performed
  64  *  there is essentially three, from right to left we perform the following:
  65  *
  66  *  1: P(C), the probability that a proposed correction C will stand on its
  67  *     own.  This is called the language model.
  68  *
  69  *  2: P(I|C), the probability that I would be used, when the programmer
  70  *     really meant C.  This is the error model.
  71  *
  72  *  3: AC, the control mechanisim, an enumerator if you will, one that
  73  *     enumerates all feasible values of C, to determine the one that
  74  *     gives the greatest probability score.
  75  *
  76  *  In reality the requirement for a more complex expression involving
  77  *  two seperate models is considerably a waste.  But one must recognize
  78  *  that P(C|I) is already conflating two factors.  It's just much simpler
  79  *  to seperate the two models and deal with them explicitaly.  To properly
  80  *  estimate P(C|I) you have to consider both the probability of C and
  81  *  probability of the transposition from C to I.  It's simply much more
  82  *  cleaner, and direct to seperate the two factors.
  83  *
  84  *  Research tells us that 80% to 95% of all spelling errors have an edit
  85  *  distance no greater than one.  Knowing this we can optimize for most
  86  *  cases of mistakes without taking a performance hit.  Which is what we
  87  *  base longer edit distances off of.  Opposed to the original method of
  88  *  I had concieved of checking everything.
  89  *
  90  * A little information on additional algorithms used:
  91  *
  92  *   Initially when I implemented this corrector, it was very slow.
  93  *   Need I remind you this is essentially a brute force attack on strings,
  94  *   and since every transformation requires dynamic memory allocations,
  95  *   you can easily imagine where most of the runtime conflated.  Yes
  96  *   It went right to malloc.  More than THREE MILLION malloc calls are
  97  *   performed for an identifier about 16 bytes long.  This was such a
  98  *   shock to me.  A forward allocator (or as some call it a bump-point
  99  *   allocator, or just a memory pool) was implemented. To combat this.
 100  *
 101  *   But of course even other factors were making it slow.  Initially
 102  *   this used a hashtable.  And hashtables have a good constant lookup
 103  *   time complexity.  But the problem wasn't in the hashtable, it was
 104  *   in the hashing (despite having one of the fastest hash functions
 105  *   known).  Remember those 3 million mallocs? Well for every malloc
 106  *   there is also a hash.  After 3 million hashes .. you start to get
 107  *   very slow.  To combat this I had suggested burst tries to Blub.
 108  *   The next day he had implemented them. Sure enough this brought
 109  *   down the runtime by a factor > 100%
 110  *
 111  *   The trie initially was designed to work on all strings, but later it
 112  *   became aparent that not only was this not a requirement. It was also
 113  *   slowing down get/sets' for the trie.  To fully understand, only
 114  *   correct_alpha needs to be understood by the trie system, knowing this
 115  *   We can combat the slowness using a very clever but evil optimization.
 116  *   By Setting a fixed sized amount of branches for the trie using a
 117  *   char-to-index map into the branches. We've complelty made the trie
 118  *   accesses entierly constant in lookup time.  No really, a lookup is
 119  *   literally trie[str[0]] [str[1]] [2] .... .value.
 120  *
 121  *
 122  * Future Work (If we really need it)
 123  *
 124  *   Currently we can only distinguish one source of error in the
 125  *   language model we use.  This could become an issue for identifiers
 126  *   that have close colliding rates, e.g colate->coat yields collate.
 127  *
 128  *   Currently the error model has been fairly trivial, the smaller the
 129  *   edit distance the smaller the error.  This usually causes some un-
 130  *   expected problems. e.g reciet->recite yields recipt.  For QuakeC
 131  *   this could become a problem when lots of identifiers are involved.
 132  *
 133  *   Our control mechanisim could use a limit, i.e limit the number of
 134  *   sets of edits for distance X.  This would also increase execution
 135  *   speed considerably.
 136  */
 137
 138
 139 #define CORRECT_POOL_SIZE (128*1024*1024)
 140 /*
 141  * A forward allcator for the corrector.  This corrector requires a lot
 142  * of allocations.  This forward allocator combats all those allocations
 143  * and speeds us up a little.  It also saves us space in a way since each
 144  * allocation isn't wasting a little header space for when NOTRACK isn't
 145  * defined.
 146  */
 147 static unsigned char **correct_pool_data = NULL;
 148 static unsigned char  *correct_pool_this = NULL;
 149 static size_t          correct_pool_addr = 0;
 150
 151 static GMQCC_INLINE void correct_pool_new(void) {
 152     correct_pool_addr = 0;
 153     correct_pool_this = (unsigned char *)mem_a(CORRECT_POOL_SIZE);
 154
 155     vec_push(correct_pool_data, correct_pool_this);
 156 }
 157
 158 static GMQCC_INLINE void *correct_pool_alloc(size_t bytes) {
 159     void *data;
 160     if (correct_pool_addr + bytes>= CORRECT_POOL_SIZE)
 161         correct_pool_new();
 162
 163     data               = (void*)correct_pool_this;
 164     correct_pool_this += bytes;
 165     correct_pool_addr += bytes;
 166     return data;
 167 }
 168
 169 static GMQCC_INLINE void correct_pool_delete(void) {
 170     size_t i;
 171     for (i = 0; i < vec_size(correct_pool_data); ++i)
 172         mem_d(correct_pool_data[i]);
 173
 174     correct_pool_data = NULL;
 175     correct_pool_this = NULL;
 176     correct_pool_addr = 0;
 177 }
 178
 179
 180 static GMQCC_INLINE char *correct_pool_claim(const char *data) {
 181     char *claim = util_strdup(data);
 182     return claim;
 183 }
 184
 185 /*
 186  * _ is valid in identifiers. I've yet to implement numerics however
 187  * because they're only valid after the first character is of a _, or
 188  * alpha character.
 189  */
 190 static const char correct_alpha[] = "abcdefghijklmnopqrstuvwxyz"
 191                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 192                                     "_"; /* TODO: Numbers ... */
 193
 194 static const size_t correct_alpha_index[0x80] = {
 195      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 196      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 197      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 198      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 199      0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 200     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  0,  0,  0,  0, 52,
 201      0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
 202     41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,  0,  0,  0,  0,  0
 203 };
 204
 205 /*
 206  * A fast space efficent trie for a dictionary of identifiers.  This is
 207  * faster than a hashtable for one reason.  A hashtable itself may have
 208  * fast constant lookup time, but the hash itself must be very fast. We
 209  * have one of the fastest hash functions for strings, but if you do a
 210  * lost of hashing (which we do, almost 3 million hashes per identifier)
 211  * a hashtable becomes slow.
 212  */
 213 correct_trie_t* correct_trie_new() {
 214     correct_trie_t *t = (correct_trie_t*)mem_a(sizeof(correct_trie_t));
 215     t->value   = NULL;
 216     t->entries = NULL;
 217     return t;
 218 }
 219
 220 void correct_trie_del_sub(correct_trie_t *t) {
 221     size_t i;
 222     if (!t->entries)
 223         return;
 224     for (i = 0; i < sizeof(correct_alpha)-1; ++i) {
 225         correct_trie_del_sub(&t->entries[i]);
 226     }
 227     mem_d(t->entries);
 228 }
 229
 230 void correct_trie_del(correct_trie_t *t) {
 231     size_t i;
 232     if (t->entries) {
 233         for (i = 0; i < sizeof(correct_alpha)-1; ++i)
 234             correct_trie_del_sub(&t->entries[i]);
 235         mem_d(t->entries);
 236     }
 237     mem_d(t);
 238 }
 239
 240 void* correct_trie_get(const correct_trie_t *t, const char *key) {
 241     const unsigned char *data = (const unsigned char*)key;
 242
 243     while (*data) {
 244         if (!t->entries)
 245             return NULL;
 246         t = t->entries + correct_alpha_index[*data];
 247         ++data;
 248     }
 249     return t->value;
 250 }
 251
 252 void correct_trie_set(correct_trie_t *t, const char *key, void * const value) {
 253     const unsigned char *data = (const unsigned char*)key;
 254     while (*data) {
 255         if (!t->entries) {
 256             t->entries = (correct_trie_t*)mem_a(sizeof(correct_trie_t)*(sizeof(correct_alpha)-1));
 257             memset(t->entries, 0, sizeof(correct_trie_t)*(sizeof(correct_alpha)-1));
 258         }
 259         t = t->entries + correct_alpha_index[*data];
 260         ++data;
 261     }
 262     t->value = value;
 263 }
 264
 265
 266 /*
 267  * Implementation of the corrector algorithm commences. A very efficent
 268  * brute-force attack (thanks to tries and mempool :-)).
 269  */
 270 static GMQCC_INLINE size_t *correct_find(correct_trie_t *table, const char *word) {
 271     return (size_t*)correct_trie_get(table, word);
 272 }
 273
 274 static GMQCC_INLINE bool correct_update(correct_trie_t* *table, const char *word) {
 275     size_t *data = correct_find(*table, word);
 276     if (!data)
 277         return false;
 278
 279     (*data)++;
 280     return true;
 281 }
 282
 283 void correct_add(correct_trie_t* table, size_t ***size, const char *ident) {
 284     size_t     *data = NULL;
 285     const char *add  = ident;
 286
 287     if (!correct_update(&table, add)) {
 288         data  = (size_t*)mem_a(sizeof(size_t));
 289         *data = 1;
 290
 291         vec_push((*size), data);
 292         correct_trie_set(table, add, data);
 293     }
 294 }
 295
 296 void correct_del(correct_trie_t* dictonary, size_t **data) {
 297     size_t       i;
 298     const size_t vs = vec_size(data);
 299
 300     for (i = 0; i < vs; i++)
 301         mem_d(data[i]);
 302
 303     vec_free(data);
 304     correct_trie_del(dictonary);
 305 }
 306
 307 /*
 308  * correcting logic for the following forms of transformations:
 309  *  1) deletion
 310  *  2) transposition
 311  *  3) alteration
 312  *  4) insertion
 313  *
 314  * These functions could take an additional size_t **size paramater
 315  * and store back the results of their new length in an array that
 316  * is the same as **array for the memcmp in correct_exists. I'm just
 317  * not able to figure out how to do that just yet.  As my brain is
 318  * not in the mood to figure out that logic.  This is a reminder to
 319  * do it, or for someone else to :-) correct_edit however would also
 320  * need to take a size_t ** to carry it along (would all the argument
 321  * overhead be worth it?)
 322  */
 323 static size_t correct_deletion(const char *ident, char **array) {
 324     size_t       itr = 0;
 325     const size_t len = strlen(ident);
 326
 327     for (; itr < len; itr++) {
 328         char *a = (char*)correct_pool_alloc(len+1);
 329         memcpy(a, ident, itr);
 330         memcpy(a + itr, ident + itr + 1, len - itr);
 331         array[itr] = a;
 332     }
 333
 334     return itr;
 335 }
 336
 337 static size_t correct_transposition(const char *ident, char **array) {
 338     size_t       itr = 0;
 339     const size_t len = strlen(ident);
 340
 341     for (; itr < len - 1; itr++) {
 342         char  tmp;
 343         char *a = (char*)correct_pool_alloc(len+1);
 344         memcpy(a, ident, len+1);
 345         tmp      = a[itr];
 346         a[itr  ] = a[itr+1];
 347         a[itr+1] = tmp;
 348         array[itr] = a;
 349     }
 350
 351     return itr;
 352 }
 353
 354 static size_t correct_alteration(const char *ident, char **array) {
 355     size_t       itr = 0;
 356     size_t       jtr = 0;
 357     size_t       ktr = 0;
 358     const size_t len = strlen(ident);
 359
 360     for (; itr < len; itr++) {
 361         for (jtr = 0; jtr < sizeof(correct_alpha)-1; jtr++, ktr++) {
 362             char *a = (char*)correct_pool_alloc(len+1);
 363             memcpy(a, ident, len+1);
 364             a[itr] = correct_alpha[jtr];
 365             array[ktr] = a;
 366         }
 367     }
 368
 369     return ktr;
 370 }
 371
 372 static size_t correct_insertion(const char *ident, char **array) {
 373     size_t       itr = 0;
 374     size_t       jtr = 0;
 375     const size_t len = strlen(ident);
 376
 377     for (; itr <= len; itr++) {
 378         for (jtr = 0; jtr < sizeof(correct_alpha)-1; jtr++) {
 379             char *a = (char*)correct_pool_alloc(len+2);
 380             memcpy(a, ident, itr);
 381             memcpy(a + itr + 1, ident + itr, len - itr + 1);
 382             a[itr] = correct_alpha[jtr];
 383             array[itr * (sizeof(correct_alpha)-1) + jtr] = a;
 384         }
 385     }
 386
 387     return (len+1)*(sizeof(correct_alpha)-1);
 388 }
 389
 390 static GMQCC_INLINE size_t correct_size(const char *ident) {
 391     /*
 392      * deletion      = len
 393      * transposition = len - 1
 394      * alteration    = len * sizeof(correct_alpha)
 395      * insertion     = (len + 1) * sizeof(correct_alpha)
 396      */
 397
 398     register size_t len = strlen(ident);
 399     return (len) + (len - 1) + (len * (sizeof(correct_alpha)-1)) + ((len + 1) * (sizeof(correct_alpha)-1));
 400 }
 401
 402 static char **correct_edit(const char *ident) {
 403     size_t next;
 404     char **find = (char**)correct_pool_alloc(correct_size(ident) * sizeof(char*));
 405
 406     if (!find)
 407         return NULL;
 408
 409     next  = correct_deletion     (ident, find);
 410     next += correct_transposition(ident, find+next);
 411     next += correct_alteration   (ident, find+next);
 412     /*****/ correct_insertion    (ident, find+next);
 413
 414     return find;
 415 }
 416
 417 /*
 418  * We could use a hashtable but the space complexity isn't worth it
 419  * since we're only going to determine the "did you mean?" identifier
 420  * on error.
 421  */
 422 static int correct_exist(char **array, size_t rows, char *ident) {
 423     size_t itr;
 424     /*
 425      * As an experiment I tried the following assembly for memcmp here:
 426      *
 427      * correct_cmp_loop:
 428      * incl %eax            ; eax =  LHS
 429      * incl %edx            ; edx =  LRS
 430      * cmpl %eax, %ebx      ; ebx = &LHS[END_POS]
 431      *
 432      * jbe correct_cmp_eq
 433      * movb (%edx), %cl     ; micro-optimized even on atoms :-)
 434      * cmpb %cl, (%eax)     ; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 435      * jg  correct_cmp_gt
 436      * jge correct_cmp_loop
 437      * ...
 438      *
 439      * Despite how much optimization went in to this, the speed was
 440      * being conflicted by the strlen(ident) used for &LHS[END_POS]
 441      * If we could eliminate the strlen with what I suggested on line
 442      * 311 ... we can accelerate this whole damn thing quite a bit.
 443      *
 444      * However there is still something we can do here that does give
 445      * us a little more speed.  Although one more branch, we know for
 446      * sure there is at least one byte to compare, if that one byte
 447      * simply isn't the same we can skip the full check. Which means
 448      * we skip a whole strlen call.
 449      */
 450     for (itr = 0; itr < rows; itr++) {
 451         if (!memcmp(array[itr], ident, strlen(ident)))
 452             return 1;
 453     }
 454
 455     return 0;
 456 }
 457
 458 static GMQCC_INLINE char **correct_known_resize(char **res, size_t *allocated, size_t size) {
 459     size_t oldallocated = *allocated;
 460     char **out;
 461     if (size < oldallocated)
 462         return res;
 463
 464     out = correct_pool_alloc(sizeof(*res) * oldallocated + 32);
 465     memcpy(out, res, sizeof(*res) * oldallocated);
 466
 467     *allocated += 32;
 468     return out;
 469 }
 470
 471 static char **correct_known(correction_t *corr, correct_trie_t* table, char **array, size_t rows, size_t *next) {
 472     size_t itr = 0;
 473     size_t jtr = 0;
 474     size_t len = 0;
 475     size_t row = 0;
 476     size_t nxt = 8;
 477     char **res = correct_pool_alloc(sizeof(char *) * nxt);
 478     char **end = NULL;
 479
 480     for (; itr < rows; itr++) {
 481         if (!array[itr][0])
 482             continue;
 483         if (vec_size(corr->edits) > itr+1)
 484             end = corr->edits[itr+1];
 485         else {
 486             end = correct_edit(array[itr]);
 487             vec_push(corr->edits, end);
 488         }
 489         row = correct_size(array[itr]);
 490
 491         for (jtr = 0; jtr < row; jtr++) {
 492             if (correct_find(table, end[jtr]) && !correct_exist(res, len, end[jtr])) {
 493                 res        = correct_known_resize(res, &nxt, len+1);
 494                 res[len++] = end[jtr];
 495             }
 496         }
 497     }
 498
 499     *next = len;
 500     return res;
 501 }
 502
 503 static char *correct_maximum(correct_trie_t* table, char **array, size_t rows) {
 504     char   *str = NULL;
 505     size_t *itm = NULL;
 506     size_t  itr = 0;
 507     size_t  top = 0;
 508
 509     for (; itr < rows; itr++) {
 510         if ((itm = correct_find(table, array[itr])) && (*itm > top)) {
 511             top = *itm;
 512             str = array[itr];
 513         }
 514     }
 515
 516     return str;
 517 }
 518
 519 /*
 520  * This is the exposed interface:
 521  * takes a table for the dictonary a vector of sizes (used for internal
 522  * probability calculation), and an identifier to "correct".
 523  */
 524 void correct_init(correction_t *c)
 525 {
 526     correct_pool_new();
 527     c->edits = NULL;
 528 }
 529
 530 void correct_free(correction_t *c)
 531 {
 532     vec_free(c->edits);
 533     correct_pool_delete();
 534 }
 535
 536 char *correct_str(correction_t *corr, correct_trie_t* table, const char *ident) {
 537     char **e1      = NULL;
 538     char **e2      = NULL;
 539     char  *e1ident = NULL;
 540     char  *e2ident = NULL;
 541     size_t e1rows  = 0;
 542     size_t e2rows  = 0;
 543
 544     /* needs to be allocated for free later */
 545     if (correct_find(table, ident))
 546         return correct_pool_claim(ident);
 547
 548     if ((e1rows = correct_size(ident))) {
 549         if (vec_size(corr->edits) > 0)
 550             e1 = corr->edits[0];
 551         else {
 552             e1 = correct_edit(ident);
 553             vec_push(corr->edits, e1);
 554         }
 555
 556         if ((e1ident = correct_maximum(table, e1, e1rows)))
 557             return correct_pool_claim(e1ident);
 558     }
 559
 560     e2 = correct_known(corr, table, e1, e1rows, &e2rows);
 561     if (e2rows && ((e2ident = correct_maximum(table, e2, e2rows))))
 562         return correct_pool_claim(e2ident);
 563
 564
 565     return util_strdup(ident);
 566 }