correct.c

   1 /*
   2  * Copyright (C) 2012, 2013
   3  *     Dale Weiler
   4  *     Wolfgang Bumiller
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy of
   7  * this software and associated documentation files (the "Software"), to deal in
   8  * the Software without restriction, including without limitation the rights to
   9  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10  * of the Software, and to permit persons to whom the Software is furnished to do
  11  * so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24 #include <string.h>
  25 #include "gmqcc.h"
  26
  27 /*
  28  * This is a very clever method for correcting mistakes in QuakeC code
  29  * most notably when invalid identifiers are used or inproper assignments;
  30  * we can proprly lookup in multiple dictonaries (depening on the rules
  31  * of what the task is trying to acomplish) to find the best possible
  32  * match.
  33  *
  34  *
  35  * A little about how it works, and probability theory:
  36  *
  37  *  When given an identifier (which we will denote I), we're essentially
  38  *  just trying to choose the most likely correction for that identifier.
  39  *  (the actual "correction" can very well be the identifier itself).
  40  *  There is actually no way to know for sure that certian identifers
  41  *  such as "lates", need to be corrected to "late" or "latest" or any
  42  *  other permutations that look lexically the same.  This is why we
  43  *  must advocate the usage of probabilities.  This means that instead of
  44  *  just guessing, instead we're trying to find the correction for C,
  45  *  out of all possible corrections that maximizes the probability of C
  46  *  for the original identifer I.
  47  *
  48  *  Thankfully there exists some theroies for probalistic interpretations
  49  *  of data.  Since we're operating on two distictive intepretations, the
  50  *  transposition from I to C. We need something that can express how much
  51  *  degree of I should rationally change to become C.  this is called the
  52  *  Bayesian interpretation. You can read more about it from here:
  53  *  http://www.celiagreen.com/charlesmccreery/statistics/bayestutorial.pdf
  54  *  (which is probably the only good online documentation for bayes theroy
  55  *  no lie.  Everything else just sucks ..)
  56  *
  57  *  Bayes' Thereom suggests something like the following:
  58  *      AC P(I|C) P(C) / P(I)
  59  *
  60  *  However since P(I) is the same for every possibility of I, we can
  61  *  completley ignore it giving just:
  62  *      AC P(I|C) P(C)
  63  *
  64  *  This greatly helps visualize how the parts of the expression are performed
  65  *  there is essentially three, from right to left we perform the following:
  66  *
  67  *  1: P(C), the probability that a proposed correction C will stand on its
  68  *     own.  This is called the language model.
  69  *
  70  *  2: P(I|C), the probability that I would be used, when the programmer
  71  *     really meant C.  This is the error model.
  72  *
  73  *  3: AC, the control mechanisim, an enumerator if you will, one that
  74  *     enumerates all feasible values of C, to determine the one that
  75  *     gives the greatest probability score.
  76  *
  77  *  In reality the requirement for a more complex expression involving
  78  *  two seperate models is considerably a waste.  But one must recognize
  79  *  that P(C|I) is already conflating two factors.  It's just much simpler
  80  *  to seperate the two models and deal with them explicitaly.  To properly
  81  *  estimate P(C|I) you have to consider both the probability of C and
  82  *  probability of the transposition from C to I.  It's simply much more
  83  *  cleaner, and direct to seperate the two factors.
  84  *
  85  *  Research tells us that 80% to 95% of all spelling errors have an edit
  86  *  distance no greater than one.  Knowing this we can optimize for most
  87  *  cases of mistakes without taking a performance hit.  Which is what we
  88  *  base longer edit distances off of.  Opposed to the original method of
  89  *  I had concieved of checking everything.
  90  *
  91  * A little information on additional algorithms used:
  92  *
  93  *   Initially when I implemented this corrector, it was very slow.
  94  *   Need I remind you this is essentially a brute force attack on strings,
  95  *   and since every transformation requires dynamic memory allocations,
  96  *   you can easily imagine where most of the runtime conflated.  Yes
  97  *   It went right to malloc.  More than THREE MILLION malloc calls are
  98  *   performed for an identifier about 16 bytes long.  This was such a
  99  *   shock to me.  A forward allocator (or as some call it a bump-point
 100  *   allocator, or just a memory pool) was implemented. To combat this.
 101  *
 102  *   But of course even other factors were making it slow.  Initially
 103  *   this used a hashtable.  And hashtables have a good constant lookup
 104  *   time complexity.  But the problem wasn't in the hashtable, it was
 105  *   in the hashing (despite having one of the fastest hash functions
 106  *   known).  Remember those 3 million mallocs? Well for every malloc
 107  *   there is also a hash.  After 3 million hashes .. you start to get
 108  *   very slow.  To combat this I had suggested burst tries to Blub.
 109  *   The next day he had implemented them. Sure enough this brought
 110  *   down the runtime by a factor > 100%
 111  *
 112  *   The trie initially was designed to work on all strings, but later it
 113  *   became aparent that not only was this not a requirement. It was also
 114  *   slowing down get/sets' for the trie.  To fully understand, only
 115  *   correct_alpha needs to be understood by the trie system, knowing this
 116  *   We can combat the slowness using a very clever but evil optimization.
 117  *   By Setting a fixed sized amount of branches for the trie using a
 118  *   char-to-index map into the branches. We've complelty made the trie
 119  *   accesses entierly constant in lookup time.  No really, a lookup is
 120  *   literally trie[str[0]] [str[1]] [2] .... .value.
 121  *
 122  *
 123  * Future Work (If we really need it)
 124  *
 125  *   Currently we can only distinguish one source of error in the
 126  *   language model we use.  This could become an issue for identifiers
 127  *   that have close colliding rates, e.g colate->coat yields collate.
 128  *
 129  *   Currently the error model has been fairly trivial, the smaller the
 130  *   edit distance the smaller the error.  This usually causes some un-
 131  *   expected problems. e.g reciet->recite yields recipt.  For QuakeC
 132  *   this could become a problem when lots of identifiers are involved.
 133  */
 134
 135
 136 #define CORRECT_POOL_SIZE (128*1024*1024)
 137 /*
 138  * A forward allcator for the corrector.  This corrector requires a lot
 139  * of allocations.  This forward allocator combats all those allocations
 140  * and speeds us up a little.  It also saves us space in a way since each
 141  * allocation isn't wasting a little header space for when NOTRACK isn't
 142  * defined.
 143  */
 144 static unsigned char **correct_pool_data = NULL;
 145 static unsigned char  *correct_pool_this = NULL;
 146 static size_t          correct_pool_addr = 0;
 147
 148 static GMQCC_INLINE void correct_pool_new(void) {
 149     correct_pool_addr = 0;
 150     correct_pool_this = (unsigned char *)mem_a(CORRECT_POOL_SIZE);
 151
 152     vec_push(correct_pool_data, correct_pool_this);
 153 }
 154
 155 static GMQCC_INLINE void *correct_pool_alloc(size_t bytes) {
 156     void *data;
 157     if (correct_pool_addr + bytes>= CORRECT_POOL_SIZE)
 158         correct_pool_new();
 159
 160     data               = (void*)correct_pool_this;
 161     correct_pool_this += bytes;
 162     correct_pool_addr += bytes;
 163     return data;
 164 }
 165
 166 static GMQCC_INLINE void correct_pool_delete(void) {
 167     size_t i;
 168     for (i = 0; i < vec_size(correct_pool_data); ++i)
 169         mem_d(correct_pool_data[i]);
 170
 171     correct_pool_data = NULL;
 172     correct_pool_this = NULL;
 173     correct_pool_addr = 0;
 174 }
 175
 176
 177 static GMQCC_INLINE char *correct_pool_claim(const char *data) {
 178     char *claim = util_strdup(data);
 179     return claim;
 180 }
 181
 182 /*
 183  * _ is valid in identifiers. I've yet to implement numerics however
 184  * because they're only valid after the first character is of a _, or
 185  * alpha character.
 186  */
 187 static const char correct_alpha[] = "abcdefghijklmnopqrstuvwxyz"
 188                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 189                                     "_"; /* TODO: Numbers ... */
 190
 191 static const size_t correct_alpha_index[0x80] = {
 192      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 193      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 194      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 195      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 196      0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 197     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  0,  0,  0,  0, 52,
 198      0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
 199     41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,  0,  0,  0,  0,  0
 200 };
 201
 202 /*
 203  * A fast space efficent trie for a dictionary of identifiers.  This is
 204  * faster than a hashtable for one reason.  A hashtable itself may have
 205  * fast constant lookup time, but the hash itself must be very fast. We
 206  * have one of the fastest hash functions for strings, but if you do a
 207  * lost of hashing (which we do, almost 3 million hashes per identifier)
 208  * a hashtable becomes slow.
 209  */
 210 correct_trie_t* correct_trie_new() {
 211     correct_trie_t *t = (correct_trie_t*)mem_a(sizeof(correct_trie_t));
 212     t->value   = NULL;
 213     t->entries = NULL;
 214     return t;
 215 }
 216
 217 static GMQCC_INLINE void correct_trie_del_sub(correct_trie_t *t) {
 218     size_t i;
 219     if (!t->entries)
 220         return;
 221     for (i = 0; i < sizeof(correct_alpha)-1; ++i) {
 222         correct_trie_del_sub(&t->entries[i]);
 223     }
 224     mem_d(t->entries);
 225 }
 226
 227 static GMQCC_INLINE void correct_trie_del(correct_trie_t *t) {
 228     size_t i;
 229     if (t->entries) {
 230         for (i = 0; i < sizeof(correct_alpha)-1; ++i)
 231             correct_trie_del_sub(&t->entries[i]);
 232         mem_d(t->entries);
 233     }
 234     mem_d(t);
 235 }
 236
 237 static GMQCC_INLINE void* correct_trie_get(const correct_trie_t *t, const char *key) {
 238     const unsigned char *data = (const unsigned char*)key;
 239
 240     while (*data) {
 241         if (!t->entries)
 242             return NULL;
 243         t = t->entries + correct_alpha_index[*data];
 244         ++data;
 245     }
 246     return t->value;
 247 }
 248
 249 static GMQCC_INLINE void correct_trie_set(correct_trie_t *t, const char *key, void * const value) {
 250     const unsigned char *data = (const unsigned char*)key;
 251     while (*data) {
 252         if (!t->entries) {
 253             t->entries = (correct_trie_t*)mem_a(sizeof(correct_trie_t)*(sizeof(correct_alpha)-1));
 254             memset(t->entries, 0, sizeof(correct_trie_t)*(sizeof(correct_alpha)-1));
 255         }
 256         t = t->entries + correct_alpha_index[*data];
 257         ++data;
 258     }
 259     t->value = value;
 260 }
 261
 262
 263 /*
 264  * Implementation of the corrector algorithm commences. A very efficent
 265  * brute-force attack (thanks to tries and mempool :-)).
 266  */
 267 static GMQCC_INLINE size_t *correct_find(correct_trie_t *table, const char *word) {
 268     return (size_t*)correct_trie_get(table, word);
 269 }
 270
 271 static GMQCC_INLINE bool correct_update(correct_trie_t* *table, const char *word) {
 272     size_t *data = correct_find(*table, word);
 273     if (!data)
 274         return false;
 275
 276     (*data)++;
 277     return true;
 278 }
 279
 280 void correct_add(correct_trie_t* table, size_t ***size, const char *ident) {
 281     size_t     *data = NULL;
 282     const char *add  = ident;
 283
 284     if (!correct_update(&table, add)) {
 285         data  = (size_t*)mem_a(sizeof(size_t));
 286         *data = 1;
 287
 288         vec_push((*size), data);
 289         correct_trie_set(table, add, data);
 290     }
 291 }
 292
 293 void correct_del(correct_trie_t* dictonary, size_t **data) {
 294     size_t       i;
 295     const size_t vs = vec_size(data);
 296
 297     for (i = 0; i < vs; i++)
 298         mem_d(data[i]);
 299
 300     vec_free(data);
 301     correct_trie_del(dictonary);
 302 }
 303
 304 /*
 305  * correcting logic for the following forms of transformations:
 306  *  1) deletion
 307  *  2) transposition
 308  *  3) alteration
 309  *  4) insertion
 310  *
 311  * These functions could take an additional size_t **size paramater
 312  * and store back the results of their new length in an array that
 313  * is the same as **array for the memcmp in correct_exists. I'm just
 314  * not able to figure out how to do that just yet.  As my brain is
 315  * not in the mood to figure out that logic.  This is a reminder to
 316  * do it, or for someone else to :-) correct_edit however would also
 317  * need to take a size_t ** to carry it along (would all the argument
 318  * overhead be worth it?)
 319  */
 320 static GMQCC_INLINE size_t correct_deletion(const char *ident, char **array) {
 321     size_t       itr = 0;
 322     const size_t len = strlen(ident);
 323
 324     for (; itr < len; itr++) {
 325         char *a = (char*)correct_pool_alloc(len+1);
 326         memcpy(a, ident, itr);
 327         memcpy(a + itr, ident + itr + 1, len - itr);
 328         array[itr] = a;
 329     }
 330
 331     return itr;
 332 }
 333
 334 static GMQCC_INLINE size_t correct_transposition(const char *ident, char **array) {
 335     size_t       itr = 0;
 336     const size_t len = strlen(ident);
 337
 338     for (; itr < len - 1; itr++) {
 339         char  tmp;
 340         char *a = (char*)correct_pool_alloc(len+1);
 341         memcpy(a, ident, len+1);
 342         tmp      = a[itr];
 343         a[itr  ] = a[itr+1];
 344         a[itr+1] = tmp;
 345         array[itr] = a;
 346     }
 347
 348     return itr;
 349 }
 350
 351 static GMQCC_INLINE size_t correct_alteration(const char *ident, char **array) {
 352     size_t       itr = 0;
 353     size_t       jtr = 0;
 354     size_t       ktr = 0;
 355     const size_t len = strlen(ident);
 356
 357     for (; itr < len; itr++) {
 358         for (jtr = 0; jtr < sizeof(correct_alpha)-1; jtr++, ktr++) {
 359             char *a = (char*)correct_pool_alloc(len+1);
 360             memcpy(a, ident, len+1);
 361             a[itr] = correct_alpha[jtr];
 362             array[ktr] = a;
 363         }
 364     }
 365
 366     return ktr;
 367 }
 368
 369 static GMQCC_INLINE size_t correct_insertion(const char *ident, char **array) {
 370     size_t       itr = 0;
 371     size_t       jtr = 0;
 372     const size_t len = strlen(ident);
 373
 374     for (; itr <= len; itr++) {
 375         for (jtr = 0; jtr < sizeof(correct_alpha)-1; jtr++) {
 376             char *a = (char*)correct_pool_alloc(len+2);
 377             memcpy(a, ident, itr);
 378             memcpy(a + itr + 1, ident + itr, len - itr + 1);
 379             a[itr] = correct_alpha[jtr];
 380             array[itr * (sizeof(correct_alpha)-1) + jtr] = a;
 381         }
 382     }
 383
 384     return (len+1)*(sizeof(correct_alpha)-1);
 385 }
 386
 387 static GMQCC_INLINE size_t correct_size(const char *ident) {
 388     /*
 389      * deletion      = len
 390      * transposition = len - 1
 391      * alteration    = len * sizeof(correct_alpha)
 392      * insertion     = (len + 1) * sizeof(correct_alpha)
 393      */
 394
 395     register size_t len = strlen(ident);
 396     return (len) + (len - 1) + (len * (sizeof(correct_alpha)-1)) + ((len + 1) * (sizeof(correct_alpha)-1));
 397 }
 398
 399 static GMQCC_INLINE char **correct_edit(const char *ident, size_t **lens) {
 400     size_t next;
 401     size_t size = correct_size(ident);
 402     char **find = (char**)correct_pool_alloc(size * sizeof(char*));
 403
 404     if (!find || !(*lens = (size_t*)correct_pool_alloc(size * sizeof(size_t))))
 405         return NULL;
 406
 407     next  = correct_deletion     (ident, find);
 408     next += correct_transposition(ident, find+next);
 409     next += correct_alteration   (ident, find+next);
 410     /*****/ correct_insertion    (ident, find+next);
 411
 412     /* precompute lengths */
 413     for (next = 0; next < size; next++)
 414         (*lens)[next] = strlen(find[next]);
 415
 416     return find;
 417 }
 418
 419 static GMQCC_INLINE int correct_exist(char **array, register size_t *sizes, size_t rows, char *ident, register size_t len) {
 420     size_t itr;
 421     for (itr = 0; itr < rows; itr++) {
 422         /*
 423          * We can save tons of calls to memcmp if we simply ignore comparisions
 424          * that we know cannot contain the same length.
 425          */
 426         if (sizes[itr] == len && !memcmp(array[itr], ident, len))
 427             return 1;
 428     }
 429
 430     return 0;
 431 }
 432
 433 static GMQCC_INLINE char **correct_known_resize(char **res, size_t *allocated, size_t size) {
 434     size_t oldallocated = *allocated;
 435     char **out;
 436     if (size < oldallocated)
 437         return res;
 438
 439     out = (char**)correct_pool_alloc(sizeof(*res) * oldallocated + 32);
 440     memcpy(out, res, sizeof(*res) * oldallocated);
 441
 442     *allocated += 32;
 443     return out;
 444 }
 445
 446 static char **correct_known(correction_t *corr, correct_trie_t* table, char **array, size_t rows, size_t *next) {
 447     size_t itr = 0;
 448     size_t jtr = 0;
 449     size_t len = 0;
 450     size_t row = 0;
 451     size_t nxt = 8;
 452     char   **res = (char**)correct_pool_alloc(sizeof(char *) * nxt);
 453     char   **end = NULL;
 454     size_t  *bit = NULL;
 455
 456     for (; itr < rows; itr++) {
 457         if (!array[itr][0])
 458             continue;
 459         if (vec_size(corr->edits) > itr+1) {
 460             end = corr->edits[itr+1];
 461             bit = corr->lens [itr+1];
 462         } else {
 463             end = correct_edit(array[itr], &bit);
 464             vec_push(corr->edits, end);
 465             vec_push(corr->lens,  bit);
 466         }
 467         row = correct_size(array[itr]);
 468
 469         for (jtr = 0; jtr < row; jtr++) {
 470             if (correct_find(table, end[jtr]) && !correct_exist(res, bit, len, end[jtr], bit[jtr])) {
 471                 res        = correct_known_resize(res, &nxt, len+1);
 472                 res[len++] = end[jtr];
 473             }
 474         }
 475     }
 476
 477     *next = len;
 478     return res;
 479 }
 480
 481 static GMQCC_INLINE char *correct_maximum(correct_trie_t* table, char **array, size_t rows) {
 482     char   *str = NULL;
 483     size_t *itm = NULL;
 484     size_t  itr = 0;
 485     size_t  top = 0;
 486
 487     for (; itr < rows; itr++) {
 488         if ((itm = correct_find(table, array[itr])) && (*itm > top)) {
 489             top = *itm;
 490             str = array[itr];
 491         }
 492     }
 493
 494     return str;
 495 }
 496
 497 /*
 498  * This is the exposed interface:
 499  * takes a table for the dictonary a vector of sizes (used for internal
 500  * probability calculation), and an identifier to "correct".
 501  */
 502 void correct_init(correction_t *c)
 503 {
 504     correct_pool_new();
 505     c->edits = NULL;
 506     c->lens  = NULL;
 507 }
 508
 509 void correct_free(correction_t *c)
 510 {
 511     vec_free(c->edits);
 512     vec_free(c->lens);
 513     correct_pool_delete();
 514 }
 515
 516 char *correct_str(correction_t *corr, correct_trie_t* table, const char *ident) {
 517     char **e1      = NULL;
 518     char **e2      = NULL;
 519     char  *e1ident = NULL;
 520     char  *e2ident = NULL;
 521     size_t e1rows  = 0;
 522     size_t e2rows  = 0;
 523     size_t *bits   = NULL;
 524
 525     /* needs to be allocated for free later */
 526     if (correct_find(table, ident))
 527         return correct_pool_claim(ident);
 528
 529     if ((e1rows = correct_size(ident))) {
 530         if (vec_size(corr->edits) > 0)
 531             e1 = corr->edits[0];
 532         else {
 533             e1 = correct_edit(ident, &bits);
 534             vec_push(corr->edits, e1);
 535             vec_push(corr->lens,  bits);
 536         }
 537
 538         if ((e1ident = correct_maximum(table, e1, e1rows)))
 539             return correct_pool_claim(e1ident);
 540     }
 541
 542     e2 = correct_known(corr, table, e1, e1rows, &e2rows);
 543     if (e2rows && ((e2ident = correct_maximum(table, e2, e2rows))))
 544         return correct_pool_claim(e2ident);
 545
 546
 547     return util_strdup(ident);
 548 }