correct.c

   1 /*
   2  * Copyright (C) 2012, 2013
   3  *     Dale Weiler
   4  *     Wolfgang Bumiller
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy of
   7  * this software and associated documentation files (the "Software"), to deal in
   8  * the Software without restriction, including without limitation the rights to
   9  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10  * of the Software, and to permit persons to whom the Software is furnished to do
  11  * so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24 #include "gmqcc.h"
  25
  26 /*
  27  * This is a very clever method for correcting mistakes in QuakeC code
  28  * most notably when invalid identifiers are used or inproper assignments;
  29  * we can proprly lookup in multiple dictonaries (depening on the rules
  30  * of what the task is trying to acomplish) to find the best possible
  31  * match.
  32  *
  33  *
  34  * A little about how it works, and probability theory:
  35  *
  36  *  When given an identifier (which we will denote I), we're essentially
  37  *  just trying to choose the most likely correction for that identifier.
  38  *  (the actual "correction" can very well be the identifier itself).
  39  *  There is actually no way to know for sure that certian identifers
  40  *  such as "lates", need to be corrected to "late" or "latest" or any
  41  *  other permutations that look lexically the same.  This is why we
  42  *  must advocate the usage of probabilities.  This means that instead of
  43  *  just guessing, instead we're trying to find the correction for C,
  44  *  out of all possible corrections that maximizes the probability of C
  45  *  for the original identifer I.
  46  *
  47  *  Thankfully there exists some theroies for probalistic interpretations
  48  *  of data.  Since we're operating on two distictive intepretations, the
  49  *  transposition from I to C. We need something that can express how much
  50  *  degree of I should rationally change to become C.  this is called the
  51  *  Bayesian interpretation. You can read more about it from here:
  52  *  http://www.celiagreen.com/charlesmccreery/statistics/bayestutorial.pdf
  53  *  (which is probably the only good online documentation for bayes theroy
  54  *  no lie.  Everything else just sucks ..)
  55  *
  56  *  Bayes' Thereom suggests something like the following:
  57  *      AC P(I|C) P(C) / P(I)
  58  *
  59  *  However since P(I) is the same for every possibility of I, we can
  60  *  completley ignore it giving just:
  61  *      AC P(I|C) P(C)
  62  *
  63  *  This greatly helps visualize how the parts of the expression are performed
  64  *  there is essentially three, from right to left we perform the following:
  65  *
  66  *  1: P(C), the probability that a proposed correction C will stand on its
  67  *     own.  This is called the language model.
  68  *
  69  *  2: P(I|C), the probability that I would be used, when the programmer
  70  *     really meant C.  This is the error model.
  71  *
  72  *  3: AC, the control mechanisim, an enumerator if you will, one that
  73  *     enumerates all feasible values of C, to determine the one that
  74  *     gives the greatest probability score.
  75  *
  76  *  In reality the requirement for a more complex expression involving
  77  *  two seperate models is considerably a waste.  But one must recognize
  78  *  that P(C|I) is already conflating two factors.  It's just much simpler
  79  *  to seperate the two models and deal with them explicitaly.  To properly
  80  *  estimate P(C|I) you have to consider both the probability of C and
  81  *  probability of the transposition from C to I.  It's simply much more
  82  *  cleaner, and direct to seperate the two factors.
  83  *
  84  *  Research tells us that 80% to 95% of all spelling errors have an edit
  85  *  distance no greater than one.  Knowing this we can optimize for most
  86  *  cases of mistakes without taking a performance hit.  Which is what we
  87  *  base longer edit distances off of.  Opposed to the original method of
  88  *  I had concieved of checking everything.
  89  *
  90  * A little information on additional algorithms used:
  91  *
  92  *   Initially when I implemented this corrector, it was very slow.
  93  *   Need I remind you this is essentially a brute force attack on strings,
  94  *   and since every transformation requires dynamic memory allocations,
  95  *   you can easily imagine where most of the runtime conflated.  Yes
  96  *   It went right to malloc.  More than THREE MILLION malloc calls are
  97  *   performed for an identifier about 16 bytes long.  This was such a
  98  *   shock to me.  A forward allocator (or as some call it a bump-point
  99  *   allocator, or just a memory pool) was implemented. To combat this.
 100  *
 101  *   But of course even other factors were making it slow.  Initially
 102  *   this used a hashtable.  And hashtables have a good constant lookup
 103  *   time complexity.  But the problem wasn't in the hashtable, it was
 104  *   in the hashing (despite having one of the fastest hash functions
 105  *   known).  Remember those 3 million mallocs? Well for every malloc
 106  *   there is also a hash.  After 3 million hashes .. you start to get
 107  *   very slow.  To combat this I had suggested burst tries to Blub.
 108  *   The next day he had implemented them. Sure enough this brought
 109  *   down the runtime by a factory > 100%
 110  *
 111  * Future Work (If we really need it)
 112  *
 113  *   Currently we can only distinguishes one source of error in the
 114  *   language model we use.  This could become an issue for identifiers
 115  *   that have close colliding rates, e.g colate->coat yields collate.
 116  *
 117  *   Currently the error model has been fairly trivial, the smaller the
 118  *   edit distance the smaller the error.  This usually causes some un-
 119  *   expected problems. e.g reciet->recite yields recipt.  For QuakeC
 120  *   this could become a problem when lots of identifiers are involved.
 121  *
 122  *   Our control mechanisim could use a limit, i.e limit the number of
 123  *   sets of edits for distance X.  This would also increase execution
 124  *   speed considerably.
 125  */
 126
 127
 128 #define CORRECT_POOL_SIZE (128*1024*1024)
 129 /*
 130  * A forward allcator for the corrector.  This corrector requires a lot
 131  * of allocations.  This forward allocator combats all those allocations
 132  * and speeds us up a little.  It also saves us space in a way since each
 133  * allocation isn't wasting a little header space for when NOTRACK isn't
 134  * defined.
 135  */
 136 static unsigned char **correct_pool_data = NULL;
 137 static unsigned char  *correct_pool_this = NULL;
 138 static size_t          correct_pool_addr = 0;
 139
 140 static GMQCC_INLINE void correct_pool_new(void) {
 141     correct_pool_addr = 0;
 142     correct_pool_this = (unsigned char *)mem_a(CORRECT_POOL_SIZE);
 143
 144     vec_push(correct_pool_data, correct_pool_this);
 145 }
 146
 147 static GMQCC_INLINE void *correct_pool_alloc(size_t bytes) {
 148     void *data;
 149     if (correct_pool_addr + bytes>= CORRECT_POOL_SIZE)
 150         correct_pool_new();
 151
 152     data               = (void*)correct_pool_this;
 153     correct_pool_this += bytes;
 154     correct_pool_addr += bytes;
 155     return data;
 156 }
 157
 158 static GMQCC_INLINE void correct_pool_delete(void) {
 159     size_t i;
 160     for (i = 0; i < vec_size(correct_pool_data); ++i)
 161         mem_d(correct_pool_data[i]);
 162
 163     correct_pool_data = NULL;
 164     correct_pool_this = NULL;
 165     correct_pool_addr = 0;
 166 }
 167
 168
 169 static GMQCC_INLINE char *correct_pool_claim(const char *data) {
 170     char *claim = util_strdup(data);
 171     correct_pool_delete();
 172     return claim;
 173 }
 174
 175 /*
 176  * _ is valid in identifiers. I've yet to implement numerics however
 177  * because they're only valid after the first character is of a _, or
 178  * alpha character.
 179  */
 180 static const char correct_alpha[] = "abcdefghijklmnopqrstuvwxyz"
 181                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 182                                     "_"; /* TODO: Numbers ... */
 183
 184 static const size_t correct_alpha_index[0x80] = {
 185      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 186      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 187      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 188      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 189      0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 190     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  0,  0,  0,  0, 52,
 191      0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
 192     41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,  0,  0,  0,  0,  0
 193 };
 194
 195 /*
 196  * A fast space efficent trie for a dictionary of identifiers.  This is
 197  * faster than a hashtable for one reason.  A hashtable itself may have
 198  * fast constant lookup time, but the hash itself must be very fast. We
 199  * have one of the fastest hash functions for strings, but if you do a
 200  * lost of hashing (which we do, almost 3 million hashes per identifier)
 201  * a hashtable becomes slow.
 202  */
 203 correct_trie_t* correct_trie_new() {
 204     correct_trie_t *t = (correct_trie_t*)mem_a(sizeof(correct_trie_t));
 205     t->value   = NULL;
 206     t->entries = NULL;
 207     return t;
 208 }
 209
 210 void correct_trie_del_sub(correct_trie_t *t) {
 211     size_t i;
 212     if (!t->entries)
 213         return;
 214     for (i = 0; i < sizeof(correct_alpha)-1; ++i) {
 215         correct_trie_del_sub(&t->entries[i]);
 216     }
 217     mem_d(t->entries);
 218 }
 219
 220 void correct_trie_del(correct_trie_t *t) {
 221     size_t i;
 222     if (t->entries) {
 223         for (i = 0; i < sizeof(correct_alpha)-1; ++i)
 224             correct_trie_del_sub(&t->entries[i]);
 225         mem_d(t->entries);
 226     }
 227     mem_d(t);
 228 }
 229
 230 void* correct_trie_get(const correct_trie_t *t, const char *key) {
 231     const unsigned char *data = (const unsigned char*)key;
 232
 233     while (*data) {
 234         if (!t->entries)
 235             return NULL;
 236         t = t->entries + correct_alpha_index[*data];
 237         ++data;
 238     }
 239     return t->value;
 240 }
 241
 242 void correct_trie_set(correct_trie_t *t, const char *key, void * const value) {
 243     const unsigned char *data = (const unsigned char*)key;
 244     while (*data) {
 245         if (!t->entries) {
 246             t->entries = (correct_trie_t*)mem_a(sizeof(correct_trie_t)*(sizeof(correct_alpha)-1));
 247             memset(t->entries, 0, sizeof(correct_trie_t)*(sizeof(correct_alpha)-1));
 248         }
 249         t = t->entries + correct_alpha_index[*data];
 250         ++data;
 251     }
 252     t->value = value;
 253 }
 254
 255
 256 /*
 257  * Implementation of the corrector algorithm commences. A very efficent
 258  * brute-force attack (thanks to tries and mempool :-)).
 259  */
 260 static GMQCC_INLINE size_t *correct_find(correct_trie_t *table, const char *word) {
 261     return (size_t*)correct_trie_get(table, word);
 262 }
 263
 264 static GMQCC_INLINE bool correct_update(correct_trie_t* *table, const char *word) {
 265     size_t *data = correct_find(*table, word);
 266     if (!data)
 267         return false;
 268
 269     (*data)++;
 270     return true;
 271 }
 272
 273 void correct_add(correct_trie_t* table, size_t ***size, const char *ident) {
 274     size_t     *data = NULL;
 275     const char *add  = ident;
 276
 277     if (!correct_update(&table, add)) {
 278         data  = (size_t*)mem_a(sizeof(size_t));
 279         *data = 1;
 280
 281         vec_push((*size), data);
 282         correct_trie_set(table, add, data);
 283     }
 284 }
 285
 286 void correct_del(correct_trie_t* dictonary, size_t **data) {
 287     size_t       i;
 288     const size_t vs = vec_size(data);
 289
 290     for (i = 0; i < vs; i++)
 291         mem_d(data[i]);
 292
 293     vec_free(data);
 294     correct_trie_del(dictonary);
 295 }
 296
 297 /*
 298  * correcting logic for the following forms of transformations:
 299  *  1) deletion
 300  *  2) transposition
 301  *  3) alteration
 302  *  4) insertion
 303  *
 304  * These functions could take an additional size_t **size paramater
 305  * and store back the results of their new length in an array that
 306  * is the same as **array for the memcmp in correct_exists. I'm just
 307  * not able to figure out how to do that just yet.  As my brain is
 308  * not in the mood to figure out that logic.  This is a reminder to
 309  * do it, or for someone else to :-) correct_edit however would also
 310  * need to take a size_t ** to carry it along (would all the argument
 311  * overhead be worth it?)
 312  */
 313 static size_t correct_deletion(const char *ident, char **array) {
 314     size_t       itr = 0;
 315     const size_t len = strlen(ident);
 316
 317     for (; itr < len; itr++) {
 318         char *a = (char*)correct_pool_alloc(len+1);
 319         memcpy(a, ident, itr);
 320         memcpy(a + itr, ident + itr + 1, len - itr);
 321         array[itr] = a;
 322     }
 323
 324     return itr;
 325 }
 326
 327 static size_t correct_transposition(const char *ident, char **array) {
 328     size_t       itr = 0;
 329     const size_t len = strlen(ident);
 330
 331     for (; itr < len - 1; itr++) {
 332         char  tmp;
 333         char *a = (char*)correct_pool_alloc(len+1);
 334         memcpy(a, ident, len+1);
 335         tmp      = a[itr];
 336         a[itr  ] = a[itr+1];
 337         a[itr+1] = tmp;
 338         array[itr] = a;
 339     }
 340
 341     return itr;
 342 }
 343
 344 static size_t correct_alteration(const char *ident, char **array) {
 345     size_t       itr = 0;
 346     size_t       jtr = 0;
 347     size_t       ktr = 0;
 348     const size_t len = strlen(ident);
 349
 350     for (; itr < len; itr++) {
 351         for (jtr = 0; jtr < sizeof(correct_alpha)-1; jtr++, ktr++) {
 352             char *a = (char*)correct_pool_alloc(len+1);
 353             memcpy(a, ident, len+1);
 354             a[itr] = correct_alpha[jtr];
 355             array[ktr] = a;
 356         }
 357     }
 358
 359     return ktr;
 360 }
 361
 362 static size_t correct_insertion(const char *ident, char **array) {
 363     size_t       itr = 0;
 364     size_t       jtr = 0;
 365     const size_t len = strlen(ident);
 366
 367     for (; itr <= len; itr++) {
 368         for (jtr = 0; jtr < sizeof(correct_alpha)-1; jtr++) {
 369             char *a = (char*)correct_pool_alloc(len+2);
 370             memcpy(a, ident, itr);
 371             memcpy(a + itr + 1, ident + itr, len - itr + 1);
 372             a[itr] = correct_alpha[jtr];
 373             array[itr * (sizeof(correct_alpha)-1) + jtr] = a;
 374         }
 375     }
 376
 377     return (len+1)*(sizeof(correct_alpha)-1);
 378 }
 379
 380 static GMQCC_INLINE size_t correct_size(const char *ident) {
 381     /*
 382      * deletion      = len
 383      * transposition = len - 1
 384      * alteration    = len * sizeof(correct_alpha)
 385      * insertion     = (len + 1) * sizeof(correct_alpha)
 386      */
 387
 388     register size_t len = strlen(ident);
 389     return (len) + (len - 1) + (len * (sizeof(correct_alpha)-1)) + ((len + 1) * (sizeof(correct_alpha)-1));
 390 }
 391
 392 static char **correct_edit(const char *ident) {
 393     size_t next;
 394     char **find = (char**)correct_pool_alloc(correct_size(ident) * sizeof(char*));
 395
 396     if (!find)
 397         return NULL;
 398
 399     next  = correct_deletion     (ident, find);
 400     next += correct_transposition(ident, find+next);
 401     next += correct_alteration   (ident, find+next);
 402     /*****/ correct_insertion    (ident, find+next);
 403
 404     return find;
 405 }
 406
 407 /*
 408  * We could use a hashtable but the space complexity isn't worth it
 409  * since we're only going to determine the "did you mean?" identifier
 410  * on error.
 411  */
 412 static int correct_exist(char **array, size_t rows, char *ident) {
 413     size_t itr;
 414     /*
 415      * As an experiment I tried the following assembly for memcmp here:
 416      *
 417      * correct_cmp_loop:
 418      * incl %eax            ; eax =  LHS
 419      * incl %edx            ; edx =  LRS
 420      * cmpl %eax, %ebx      ; ebx = &LHS[END_POS]
 421      *
 422      * jbe correct_cmp_eq
 423      * movb (%edx), %cl     ; micro-optimized even on atoms :-)
 424      * cmpb %cl, (%eax)     ; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 425      * jg  correct_cmp_gt
 426      * jge correct_cmp_loop
 427      * ...
 428      *
 429      * Despite how much optimization went in to this, the speed was the
 430      * being conflicted by the strlen(ident) used for &LHS[END_POS]
 431      * If we could eliminate the strlen with what I suggested on line
 432      * 311 ... we can accelerate this whole damn thing quite a bit.
 433      *
 434      * However there is still something we can do here that does give
 435      * us a little more speed.  Although one more branch, we know for
 436      * sure there is at least one byte to compare, if that one byte
 437      * simply isn't the same we can skip the full check. Which means
 438      * we skip a whole strlen call.
 439      */
 440     for (itr = 0; itr < rows; itr++) {
 441         if (!memcmp(array[itr], ident, strlen(ident)))
 442             return 1;
 443     }
 444
 445     return 0;
 446 }
 447
 448 static GMQCC_INLINE char **correct_known_resize(char **res, size_t *allocated, size_t size) {
 449     size_t oldallocated = *allocated;
 450     char **out;
 451     if (size < oldallocated)
 452         return res;
 453
 454     out = correct_pool_alloc(sizeof(*res) * oldallocated + 32);
 455     memcpy(out, res, sizeof(*res) * oldallocated);
 456
 457     *allocated += 32;
 458     return out;
 459 }
 460
 461 static char **correct_known(correct_trie_t* table, char **array, size_t rows, size_t *next) {
 462     size_t itr = 0;
 463     size_t jtr = 0;
 464     size_t len = 0;
 465     size_t row = 0;
 466     size_t nxt = 8;
 467     char **res = correct_pool_alloc(sizeof(char *) * nxt);
 468     char **end = NULL;
 469
 470     for (; itr < rows; itr++) {
 471         end = correct_edit(array[itr]);
 472         row = correct_size(array[itr]);
 473
 474         /* removing jtr=0 here speeds it up by 100ms O_o */
 475         for (jtr = 0; jtr < row; jtr++) {
 476             if (correct_find(table, end[jtr]) && !correct_exist(res, len, end[jtr])) {
 477                 res        = correct_known_resize(res, &nxt, len+1);
 478                 res[len++] = end[jtr];
 479             }
 480         }
 481     }
 482
 483     *next = len;
 484     return res;
 485 }
 486
 487 static char *correct_maximum(correct_trie_t* table, char **array, size_t rows) {
 488     char   *str = NULL;
 489     size_t *itm = NULL;
 490     size_t  itr = 0;
 491     size_t  top = 0;
 492
 493     for (; itr < rows; itr++) {
 494         if ((itm = correct_find(table, array[itr])) && (*itm > top)) {
 495             top = *itm;
 496             str = array[itr];
 497         }
 498     }
 499
 500     return str;
 501 }
 502
 503 /*
 504  * This is the exposed interface:
 505  * takes a table for the dictonary a vector of sizes (used for internal
 506  * probability calculation), and an identifier to "correct".
 507  */
 508 char *correct_str(correct_trie_t* table, const char *ident) {
 509     char **e1      = NULL;
 510     char **e2      = NULL;
 511     char  *e1ident = NULL;
 512     char  *e2ident = NULL;
 513     size_t e1rows  = 0;
 514     size_t e2rows  = 0;
 515
 516     correct_pool_new();
 517
 518     /* needs to be allocated for free later */
 519     if (correct_find(table, ident))
 520         return correct_pool_claim(ident);
 521
 522     if ((e1rows = correct_size(ident))) {
 523         e1      = correct_edit(ident);
 524
 525         if ((e1ident = correct_maximum(table, e1, e1rows)))
 526             return correct_pool_claim(e1ident);
 527     }
 528
 529     e2 = correct_known(table, e1, e1rows, &e2rows);
 530     if (e2rows && ((e2ident = correct_maximum(table, e2, e2rows))))
 531         return correct_pool_claim(e2ident);
 532
 533
 534     correct_pool_delete();
 535     return util_strdup(ident);
 536 }