2 * Copyright (C) 2012, 2013
6 * Permission is hereby granted, free of charge, to any person obtaining a copy of
7 * this software and associated documentation files (the "Software"), to deal in
8 * the Software without restriction, including without limitation the rights to
9 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
10 * of the Software, and to permit persons to whom the Software is furnished to do
11 * so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 * This is a very clever method for correcting mistakes in QuakeC code
28 * most notably when invalid identifiers are used or improper assignments;
29 * we can properly look up in multiple dictionaries (depending on the rules
30 * of what the task is trying to accomplish) to find the best possible
34 * A little about how it works, and probability theory:
36 * When given an identifier (which we will denote I), we're essentially
37 * just trying to choose the most likely correction for that identifier.
38 * (the actual "correction" can very well be the identifier itself).
39 * There is actually no way to know for sure that certain identifiers
40 * such as "lates", need to be corrected to "late" or "latest" or any
41 * other permutations that look lexically the same. This is why we
42 * must advocate the usage of probabilities. This means that instead of
43 * just guessing, instead we're trying to find the correction for C,
44 * out of all possible corrections that maximizes the probability of C
45 * for the original identifier I.
47 * Thankfully there exist some theories for probabilistic interpretations
48 * of data. Since we're operating on two distinctive interpretations, the
49 * transposition from I to C. We need something that can express how much
50 * degree of I should rationally change to become C. this is called the
51 * Bayesian interpretation. You can read more about it from here:
52 * http://www.celiagreen.com/charlesmccreery/statistics/bayestutorial.pdf
53 * (which is probably the only good online documentation for Bayes theory
54 * no lie. Everything else just sucks ..)
56 * Bayes' Theorem suggests something like the following:
57 * AC P(I|C) P(C) / P(I)
59 * However since P(I) is the same for every possibility of I, we can
60 * completely ignore it giving just:
63 * This greatly helps visualize how the parts of the expression are performed
64 * there is essentially three, from right to left we perform the following:
66 * 1: P(C), the probability that a proposed correction C will stand on its
67 * own. This is called the language model.
69 * 2: P(I|C), the probability that I would be used, when the programmer
70 * really meant C. This is the error model.
72 * 3: AC, the control mechanism, an enumerator if you will, one that
73 * enumerates all feasible values of C, to determine the one that
74 * gives the greatest probability score.
76 * In reality the requirement for a more complex expression involving
77 * two separate models is considerably a waste. But one must recognize
78 * that P(C|I) is already conflating two factors. It's just much simpler
79 * to separate the two models and deal with them explicitly. To properly
80 * estimate P(C|I) you have to consider both the probability of C and
81 * probability of the transposition from C to I. It's simply much
82 * cleaner, and more direct, to separate the two factors.
84 * Research tells us that 80% to 95% of all spelling errors have an edit
85 * distance no greater than one. Knowing this we can optimize for most
86 * cases of mistakes without taking a performance hit. Which is what we
87 * base longer edit distances off of. As opposed to the original method
88 * I had conceived of checking everything.
90 * A little information on additional algorithms used:
92 * Initially when I implemented this corrector, it was very slow.
93 * Need I remind you this is essentially a brute force attack on strings,
94 * and since every transformation requires dynamic memory allocations,
95 * you can easily imagine where most of the runtime conflated. Yes
96 * It went right to malloc. More than THREE MILLION malloc calls are
97 * performed for an identifier about 16 bytes long. This was such a
98 * shock to me. A forward allocator (or as some call it a bump-point
99 * allocator, or just a memory pool) was implemented. To combat this.
101 * But of course even other factors were making it slow. Initially
102 * this used a hashtable. And hashtables have a good constant lookup
103 * time complexity. But the problem wasn't in the hashtable, it was
104 * in the hashing (despite having one of the fastest hash functions
105 * known). Remember those 3 million mallocs? Well for every malloc
106 * there is also a hash. After 3 million hashes .. you start to get
107 * very slow. To combat this I had suggested burst tries to Blub.
108 * The next day he had implemented them. Sure enough this brought
109 * down the runtime by a factor > 100%
111 * The trie initially was designed to work on all strings, but later it
112 * became apparent that not only was this not a requirement. It was also
113 * slowing down get/sets' for the trie. To fully understand, only
114 * correct_alpha needs to be understood by the trie system, knowing this
115 * We can combat the slowness using a very clever but evil optimization.
116 * By Setting a fixed sized amount of branches for the trie using a
117 * char-to-index map into the branches. We've completely made the trie
118 * accesses entirely constant in lookup time. No really, a lookup is
119 * literally trie[str[0]] [str[1]] [2] .... .value.
122 * Future Work (If we really need it)
124 * Currently we can only distinguish one source of error in the
125 * language model we use. This could become an issue for identifiers
126 * that have close colliding rates, e.g colate->coat yields collate.
128 * Currently the error model has been fairly trivial, the smaller the
129 * edit distance the smaller the error. This usually causes some un-
130 * expected problems. e.g reciet->recite yields recipt. For QuakeC
131 * this could become a problem when lots of identifiers are involved.
135 #define CORRECT_POOL_SIZE (128*1024*1024)
137 * A forward allocator for the corrector. This corrector requires a lot
138 * of allocations. This forward allocator combats all those allocations
139 * and speeds us up a little. It also saves us space in a way since each
140 * allocation isn't wasting a little header space for when NOTRACK isn't
/* Pool state: vector of slabs ever allocated, the current bump pointer,
 * and the byte offset into the active slab.
 * NOTE(review): this excerpt elides some lines (source numbering jumps). */
143 static unsigned char **correct_pool_data = NULL;
144 static unsigned char *correct_pool_this = NULL;
145 static size_t correct_pool_addr = 0;
/* Open a fresh slab: reset the offset, allocate CORRECT_POOL_SIZE bytes,
 * and record the slab so correct_pool_delete can free it later. */
147 static GMQCC_INLINE void correct_pool_new(void) {
148 correct_pool_addr = 0;
149 correct_pool_this = (unsigned char *)mem_a(CORRECT_POOL_SIZE);
151 vec_push(correct_pool_data, correct_pool_this);
/* Bump-pointer allocation: return the next `bytes` bytes of the active
 * slab, advancing both the pointer and the offset. The guard on the
 * offset presumably rolls over to a new slab — elided here; confirm. */
154 static GMQCC_INLINE void *correct_pool_alloc(size_t bytes) {
156 if (correct_pool_addr + bytes>= CORRECT_POOL_SIZE)
159 data = (void*)correct_pool_this;
160 correct_pool_this += bytes;
161 correct_pool_addr += bytes;
/* Free every slab the pool ever handed out and reset all pool state. */
165 static GMQCC_INLINE void correct_pool_delete(void) {
167 for (i = 0; i < vec_size(correct_pool_data); ++i)
168 mem_d(correct_pool_data[i]);
170 correct_pool_data = NULL;
171 correct_pool_this = NULL;
172 correct_pool_addr = 0;
/* Copy a pool-resident string onto the regular heap (util_strdup) so it
 * survives pool teardown; the caller owns and frees the returned copy. */
176 static GMQCC_INLINE char *correct_pool_claim(const char *data) {
177 char *claim = util_strdup(data);
182 * _ is valid in identifiers. I've yet to implement numerics however
183 * because they're only valid after the first character is of a _, or
/* The identifier alphabet the corrector mutates with: [a-z][A-Z]_.
 * sizeof(correct_alpha)-1 (excluding the NUL) is the branch count used
 * by the trie and by the edit generators below. */
186 static const char correct_alpha[] = "abcdefghijklmnopqrstuvwxyz"
187 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
188 "_"; /* TODO: Numbers ... */
/* ASCII byte -> trie branch index. Bytes outside the identifier
 * alphabet map to slot 0; '_' maps to 52.
 * NOTE(review): the table rows here look offset relative to the ASCII
 * codes of correct_alpha — some source lines are elided, so verify the
 * full 0x80-entry table against the original before relying on it. */
190 static const size_t correct_alpha_index[0x80] = {
191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
192 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
195 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
196 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 52,
197 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
198 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0
202 * A fast space efficient trie for a dictionary of identifiers. This is
203 * faster than a hashtable for one reason. A hashtable itself may have
204 * fast constant lookup time, but the hash itself must be very fast. We
205 * have one of the fastest hash functions for strings, but if you do a
206 * lot of hashing (which we do, almost 3 million hashes per identifier)
207 * a hashtable becomes slow.
/* Allocate an empty trie node (children are allocated lazily by
 * correct_trie_set when the first key needs them). */
209 correct_trie_t* correct_trie_new() {
210 correct_trie_t *t = (correct_trie_t*)mem_a(sizeof(correct_trie_t));
/* Recursively free the subtree rooted at every child of `t` — one slot
 * per character of correct_alpha (sizeof(correct_alpha)-1 branches). */
216 static GMQCC_INLINE void correct_trie_del_sub(correct_trie_t *t) {
220 for (i = 0; i < sizeof(correct_alpha)-1; ++i) {
221 correct_trie_del_sub(&t->entries[i]);
/* Free an entire trie, including the root node itself. */
226 static GMQCC_INLINE void correct_trie_del(correct_trie_t *t) {
229 for (i = 0; i < sizeof(correct_alpha)-1; ++i)
230 correct_trie_del_sub(&t->entries[i]);
/* Constant-per-character lookup: descend one branch per key byte via
 * correct_alpha_index; returns the stored value (or NULL — elided). */
236 static GMQCC_INLINE void* correct_trie_get(const correct_trie_t *t, const char *key) {
237 const unsigned char *data = (const unsigned char*)key;
242 t = t->entries + correct_alpha_index[*data];
/* Insert/overwrite `value` under `key`, allocating (and zeroing) the
 * fixed-size branch array of a node on first use. */
248 static GMQCC_INLINE void correct_trie_set(correct_trie_t *t, const char *key, void * const value) {
249 const unsigned char *data = (const unsigned char*)key;
252 t->entries = (correct_trie_t*)mem_a(sizeof(correct_trie_t)*(sizeof(correct_alpha)-1));
253 memset(t->entries, 0, sizeof(correct_trie_t)*(sizeof(correct_alpha)-1));
255 t = t->entries + correct_alpha_index[*data];
263 * Implementation of the corrector algorithm commences. A very efficient
264 * brute-force attack (thanks to tries and mempool :-)).
/* Look up `word` in the dictionary trie; the value is the word's
 * occurrence counter (a heap-allocated size_t), or NULL if absent. */
266 static GMQCC_INLINE size_t *correct_find(correct_trie_t *table, const char *word) {
267 return (size_t*)correct_trie_get(table, word);
/* If `word` is already in the dictionary, bump its counter and report
 * true; otherwise report false so the caller inserts it. */
270 static GMQCC_INLINE bool correct_update(correct_trie_t* *table, const char *word) {
271 size_t *data = correct_find(*table, word);
/* Add `ident` to the dictionary: update its counter if known, else
 * allocate a fresh counter, track it in *size for later cleanup, and
 * insert it into the trie. */
279 void correct_add(correct_trie_t* table, size_t ***size, const char *ident) {
281 const char *add = ident;
283 if (!correct_update(&table, add)) {
284 data = (size_t*)mem_a(sizeof(size_t));
287 vec_push((*size), data);
288 correct_trie_set(table, add, data);
/* Tear down the dictionary: free every tracked counter, then the trie. */
292 void correct_del(correct_trie_t* dictonary, size_t **data) {
294 const size_t vs = vec_size(data);
296 for (i = 0; i < vs; i++)
300 correct_trie_del(dictonary);
304 * correcting logic for the following forms of transformations:
310 * These functions could take an additional size_t **size parameter
311 * and store back the results of their new length in an array that
312 * is the same as **array for the memcmp in correct_exists. I'm just
313 * not able to figure out how to do that just yet. As my brain is
314 * not in the mood to figure out that logic. This is a reminder to
315 * do it, or for someone else to :-) correct_edit however would also
316 * need to take a size_t ** to carry it along (would all the argument
317 * overhead be worth it?)
/* Emit every string formed by deleting one character of `ident` into
 * `array` (each candidate pool-allocated); yields len variants. */
319 static GMQCC_INLINE size_t correct_deletion(const char *ident, char **array) {
321 const size_t len = strlen(ident);
323 for (; itr < len; itr++) {
324 char *a = (char*)correct_pool_alloc(len+1);
325 memcpy(a, ident, itr);
326 memcpy(a + itr, ident + itr + 1, len - itr);
/* Emit every string formed by swapping two adjacent characters of
 * `ident`; yields len-1 variants (the swap itself is elided here). */
333 static GMQCC_INLINE size_t correct_transposition(const char *ident, char **array) {
335 const size_t len = strlen(ident);
337 for (; itr < len - 1; itr++) {
339 char *a = (char*)correct_pool_alloc(len+1);
340 memcpy(a, ident, len+1);
/* Emit every string formed by replacing one character of `ident` with
 * each letter of correct_alpha; yields len * (alphabet size) variants. */
350 static GMQCC_INLINE size_t correct_alteration(const char *ident, char **array) {
354 const size_t len = strlen(ident);
356 for (; itr < len; itr++) {
357 for (jtr = 0; jtr < sizeof(correct_alpha)-1; jtr++, ktr++) {
358 char *a = (char*)correct_pool_alloc(len+1);
359 memcpy(a, ident, len+1);
360 a[itr] = correct_alpha[jtr];
/* Emit every string formed by inserting one alphabet character at each
 * position (including the end); yields (len+1) * (alphabet size). */
368 static GMQCC_INLINE size_t correct_insertion(const char *ident, char **array) {
371 const size_t len = strlen(ident);
373 for (; itr <= len; itr++) {
374 for (jtr = 0; jtr < sizeof(correct_alpha)-1; jtr++) {
375 char *a = (char*)correct_pool_alloc(len+2);
376 memcpy(a, ident, itr);
377 memcpy(a + itr + 1, ident + itr, len - itr + 1);
378 a[itr] = correct_alpha[jtr];
379 array[itr * (sizeof(correct_alpha)-1) + jtr] = a;
383 return (len+1)*(sizeof(correct_alpha)-1);
/* Total candidate count for one edit pass over `ident`:
 *   deletion      = len
389 * transposition = len - 1
390 * alteration = len * sizeof(correct_alpha)
391 * insertion = (len + 1) * sizeof(correct_alpha)
 * (sizeof here meaning the alphabet length, sizeof(correct_alpha)-1). */
386 static GMQCC_INLINE size_t correct_size(const char *ident) {
394 register size_t len = strlen(ident);
395 return (len) + (len - 1) + (len * (sizeof(correct_alpha)-1)) + ((len + 1) * (sizeof(correct_alpha)-1));
/* Generate all edit-distance-1 candidates of `ident` into one pool
 * array; also precomputes each candidate's length into *lens so later
 * comparisons can skip memcmp on length mismatch. */
398 static GMQCC_INLINE char **correct_edit(const char *ident, size_t **lens) {
400 size_t size = correct_size(ident);
401 char **find = (char**)correct_pool_alloc(size * sizeof(char*));
403 if (!find || !(*lens = (size_t*)correct_pool_alloc(size * sizeof(size_t))))
/* The four generators fill consecutive regions of `find`. */
406 next = correct_deletion (ident, find);
407 next += correct_transposition(ident, find+next);
408 next += correct_alteration (ident, find+next);
409 /*****/ correct_insertion (ident, find+next);
411 /* precompute lengths */
412 for (next = 0; next < size; next++)
413 (*lens)[next] = strlen(find[next]);
/* Linear membership test over `rows` candidates; compares lengths
 * first so memcmp only runs on same-length strings. */
418 static GMQCC_INLINE int correct_exist(char **array, register size_t *sizes, size_t rows, char *ident, register size_t len) {
420 for (itr = 0; itr < rows; itr++) {
422 * We can save tons of calls to memcmp if we simply ignore comparisons
423 * that we know cannot contain the same length.
425 if (sizes[itr] == len && !memcmp(array[itr], ident, len))
/* Grow the result vector from the pool when `size` would exceed the
 * current capacity (old storage is pool memory, so it is not freed). */
432 static GMQCC_INLINE char **correct_known_resize(char **res, size_t *allocated, size_t size) {
433 size_t oldallocated = *allocated;
435 if (size < oldallocated)
438 out = (char**)correct_pool_alloc(sizeof(*res) * oldallocated + 32);
439 memcpy(out, res, sizeof(*res) * oldallocated);
/* Edit-distance-2 pass: for each distance-1 candidate, generate (or
 * reuse from corr->edits/corr->lens caches) its distance-1 edits and
 * keep those found in the dictionary, deduplicated via correct_exist. */
445 static char **correct_known(correction_t *corr, correct_trie_t* table, char **array, size_t rows, size_t *next) {
451 char **res = (char**)correct_pool_alloc(sizeof(char *) * nxt);
455 for (; itr < rows; itr++) {
/* Reuse a previously computed edit set if this row is cached. */
458 if (vec_size(corr->edits) > itr+1) {
459 end = corr->edits[itr+1];
460 bit = corr->lens [itr+1];
462 end = correct_edit(array[itr], &bit);
463 vec_push(corr->edits, end);
464 vec_push(corr->lens, bit);
466 row = correct_size(array[itr]);
468 for (jtr = 0; jtr < row; jtr++) {
469 if (correct_find(table, end[jtr]) && !correct_exist(res, bit, len, end[jtr], bit[jtr])) {
470 res = correct_known_resize(res, &nxt, len+1);
471 res[len++] = end[jtr];
/* The language model P(C): pick the candidate with the highest
 * occurrence counter in the dictionary. */
480 static GMQCC_INLINE char *correct_maximum(correct_trie_t* table, char **array, size_t rows) {
486 for (; itr < rows; itr++) {
487 if ((itm = correct_find(table, array[itr])) && (*itm > top)) {
497 * This is the exposed interface:
498 * takes a table for the dictionary, a vector of sizes (used for internal
499 * probability calculation), and an identifier to "correct".
/* Initialize a correction_t (body elided in this excerpt). */
501 void correct_init(correction_t *c)
/* Release a correction_t's cached edit vectors and the memory pool. */
508 void correct_free(correction_t *c)
512 correct_pool_delete();
/* Correct `ident` against the dictionary. Returns a heap-allocated
 * string the CALLER must free:
 *   1. exact dictionary hit -> claimed copy of ident;
 *   2. best edit-distance-1 candidate (correct_maximum);
 *   3. best edit-distance-2 candidate (correct_known);
 *   4. otherwise a plain strdup of the original identifier. */
515 char *correct_str(correction_t *corr, correct_trie_t* table, const char *ident) {
518 char *e1ident = NULL;
519 char *e2ident = NULL;
524 /* needs to be allocated for free later */
525 if (correct_find(table, ident))
526 return correct_pool_claim(ident);
528 if ((e1rows = correct_size(ident))) {
529 if (vec_size(corr->edits) > 0)
532 e1 = correct_edit(ident, &bits);
533 vec_push(corr->edits, e1);
534 vec_push(corr->lens, bits);
537 if ((e1ident = correct_maximum(table, e1, e1rows)))
538 return correct_pool_claim(e1ident);
541 e2 = correct_known(corr, table, e1, e1rows, &e2rows);
542 if (e2rows && ((e2ident = correct_maximum(table, e2, e2rows))))
543 return correct_pool_claim(e2ident);
546 return util_strdup(ident);