2 * Copyright (C) 2012, 2013
6 * Permission is hereby granted, free of charge, to any person obtaining a copy of
7 * this software and associated documentation files (the "Software"), to deal in
8 * the Software without restriction, including without limitation the rights to
9 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
10 * of the Software, and to permit persons to whom the Software is furnished to do
11 * so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 * This is a very clever method for correcting mistakes in QuakeC code
28 * most notably when invalid identifiers are used or improper assignments;
29 * we can properly look up in multiple dictionaries (depending on the rules
30 * of what the task is trying to accomplish) to find the best possible
34 * A little about how it works, and probability theory:
36 * When given an identifier (which we will denote I), we're essentially
37 * just trying to choose the most likely correction for that identifier.
38 * (the actual "correction" can very well be the identifier itself).
39 * There is actually no way to know for sure that certain identifiers
40 * such as "lates", need to be corrected to "late" or "latest" or any
41 * other permutations that look lexically the same. This is why we
42 * must advocate the usage of probabilities. This means that instead of
43 * just guessing, instead we're trying to find the correction for C,
44 * out of all possible corrections that maximizes the probability of C
45 * for the original identifier I.
47 * Thankfully there exist some theories for probabilistic interpretations
48 * of data. Since we're operating on two distinctive interpretations, the
49 * transposition from I to C. We need something that can express how much
50 * degree of I should rationally change to become C. this is called the
51 * Bayesian interpretation. You can read more about it from here:
52 * http://www.celiagreen.com/charlesmccreery/statistics/bayestutorial.pdf
53 * (which is probably the only good online documentation for Bayes theory
54 * no lie. Everything else just sucks ..)
56 * Bayes' Theorem suggests something like the following:
57 * AC P(I|C) P(C) / P(I)
59 * However since P(I) is the same for every possibility of I, we can
60 * completely ignore it giving just:
63 * This greatly helps visualize how the parts of the expression are performed
64 * there is essentially three, from right to left we perform the following:
66 * 1: P(C), the probability that a proposed correction C will stand on its
67 * own. This is called the language model.
69 * 2: P(I|C), the probability that I would be used, when the programmer
70 * really meant C. This is the error model.
72 * 3: AC, the control mechanism, an enumerator if you will, one that
73 * enumerates all feasible values of C, to determine the one that
74 * gives the greatest probability score.
76 * In reality the requirement for a more complex expression involving
77 * two separate models is considerably a waste. But one must recognize
78 * that P(C|I) is already conflating two factors. It's just much simpler
79 * to separate the two models and deal with them explicitly. To properly
80 * estimate P(C|I) you have to consider both the probability of C and
81 * probability of the transposition from C to I. It's simply much
82 * cleaner, and more direct, to separate the two factors.
84 * Research tells us that 80% to 95% of all spelling errors have an edit
85 * distance no greater than one. Knowing this we can optimize for most
86 * cases of mistakes without taking a performance hit. Which is what we
87 * base longer edit distances off of. As opposed to the original method
88 * I had conceived of checking everything.
90 * A little information on additional algorithms used:
92 * Initially when I implemented this corrector, it was very slow.
93 * Need I remind you this is essentially a brute force attack on strings,
94 * and since every transformation requires dynamic memory allocations,
95 * you can easily imagine where most of the runtime conflated. Yes
96 * It went right to malloc. More than THREE MILLION malloc calls are
97 * performed for an identifier about 16 bytes long. This was such a
98 * shock to me. A forward allocator (or as some call it a bump-point
99 * allocator, or just a memory pool) was implemented. To combat this.
101 * But of course even other factors were making it slow. Initially
102 * this used a hashtable. And hashtables have a good constant lookup
103 * time complexity. But the problem wasn't in the hashtable, it was
104 * in the hashing (despite having one of the fastest hash functions
105 * known). Remember those 3 million mallocs? Well for every malloc
106 * there is also a hash. After 3 million hashes .. you start to get
107 * very slow. To combat this I had suggested burst tries to Blub.
108 * The next day he had implemented them. Sure enough this brought
109 * down the runtime by a factor > 100%
111 * The trie initially was designed to work on all strings, but later it
112 * became apparent that not only was this not a requirement. It was also
113 * slowing down get/sets' for the trie. To fully understand, only
114 * correct_alpha needs to be understood by the trie system, knowing this
115 * We can combat the slowness using a very clever but evil optimization.
116 * By Setting a fixed sized amount of branches for the trie using a
117 * char-to-index map into the branches. We've completely made the trie
118 * accesses entirely constant in lookup time. No really, a lookup is
119 * literally trie[str[0]] [str[1]] [2] .... .value.
122 * Future Work (If we really need it)
124 * Currently we can only distinguish one source of error in the
125 * language model we use. This could become an issue for identifiers
126 * that have close colliding rates, e.g colate->coat yields collate.
128 * Currently the error model has been fairly trivial, the smaller the
129 * edit distance the smaller the error. This usually causes some un-
130 * expected problems. e.g reciet->recite yields recipt. For QuakeC
131 * this could become a problem when lots of identifiers are involved.
135 #define CORRECT_POOL_SIZE (128*1024*1024)
137 * A forward allocator for the corrector. This corrector requires a lot
138 * of allocations. This forward allocator combats all those allocations
139 * and speeds us up a little. It also saves us space in a way since each
140 * allocation isn't wasting a little header space for when NOTRACK isn't
/* Pool state: vector of slabs ever allocated, the current bump pointer,
 * and the byte offset into the active slab.
 * NOTE(review): this excerpt elides some lines (source numbering jumps). */
143 static unsigned char **correct_pool_data = NULL;
144 static unsigned char *correct_pool_this = NULL;
145 static size_t correct_pool_addr = 0;
/* Open a fresh slab: reset the offset, allocate CORRECT_POOL_SIZE bytes,
 * and record the slab so correct_pool_delete can free it later. */
147 static GMQCC_INLINE void correct_pool_new(void) {
148 correct_pool_addr = 0;
149 correct_pool_this = (unsigned char *)mem_a(CORRECT_POOL_SIZE);
151 vec_push(correct_pool_data, correct_pool_this);
/* Bump-pointer allocation: return the next `bytes` bytes of the active
 * slab, advancing both the pointer and the offset. The guard on the
 * offset presumably rolls over to a new slab — elided here; confirm. */
154 static GMQCC_INLINE void *correct_pool_alloc(size_t bytes) {
156 if (correct_pool_addr + bytes>= CORRECT_POOL_SIZE)
159 data = (void*)correct_pool_this;
160 correct_pool_this += bytes;
161 correct_pool_addr += bytes;
/* Free every slab the pool ever handed out and reset all pool state. */
165 static GMQCC_INLINE void correct_pool_delete(void) {
167 for (i = 0; i < vec_size(correct_pool_data); ++i)
168 mem_d(correct_pool_data[i]);
170 correct_pool_data = NULL;
171 correct_pool_this = NULL;
172 correct_pool_addr = 0;
/* Copy a pool-resident string onto the regular heap (util_strdup) so it
 * survives pool teardown; the caller owns and frees the returned copy. */
176 static GMQCC_INLINE char *correct_pool_claim(const char *data) {
177 char *claim = util_strdup(data);
182 * _ is valid in identifiers. I've yet to implement numerics however
183 * because they're only valid after the first character is of a _, or
/* The identifier alphabet the corrector mutates with: [a-z][A-Z]_.
 * sizeof(correct_alpha)-1 (excluding the NUL) is the branch count used
 * by the trie and by the edit generators below. */
186 static const char correct_alpha[] = "abcdefghijklmnopqrstuvwxyz"
187 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
188 "_"; /* TODO: Numbers ... */
/* ASCII byte -> trie branch index. Bytes outside the identifier
 * alphabet map to slot 0; '_' maps to 52.
 * NOTE(review): the table rows here look offset relative to the ASCII
 * codes of correct_alpha — some source lines are elided, so verify the
 * full 0x80-entry table against the original before relying on it. */
190 static const size_t correct_alpha_index[0x80] = {
191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
192 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
195 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
196 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 52,
197 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
198 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0
202 * A fast space efficient trie for a dictionary of identifiers. This is
203 * faster than a hashtable for one reason. A hashtable itself may have
204 * fast constant lookup time, but the hash itself must be very fast. We
205 * have one of the fastest hash functions for strings, but if you do a
206 * lot of hashing (which we do, almost 3 million hashes per identifier)
207 * a hashtable becomes slow.
/* Allocate an empty trie node (children are allocated lazily by
 * correct_trie_set when the first key needs them). */
209 correct_trie_t* correct_trie_new() {
210 correct_trie_t *t = (correct_trie_t*)mem_a(sizeof(correct_trie_t));
/* Recursively free the subtree rooted at every child of `t` — one slot
 * per character of correct_alpha (sizeof(correct_alpha)-1 branches). */
216 static GMQCC_INLINE void correct_trie_del_sub(correct_trie_t *t) {
220 for (i = 0; i < sizeof(correct_alpha)-1; ++i) {
221 correct_trie_del_sub(&t->entries[i]);
/* Free an entire trie, including the root node itself. */
226 static GMQCC_INLINE void correct_trie_del(correct_trie_t *t) {
229 for (i = 0; i < sizeof(correct_alpha)-1; ++i)
230 correct_trie_del_sub(&t->entries[i]);
/* Constant-per-character lookup: descend one branch per key byte via
 * correct_alpha_index; returns the stored value (or NULL — elided). */
236 static GMQCC_INLINE void* correct_trie_get(const correct_trie_t *t, const char *key) {
237 const unsigned char *data = (const unsigned char*)key;
242 t = t->entries + correct_alpha_index[*data];
/* Insert/overwrite `value` under `key`, allocating (and zeroing) the
 * fixed-size branch array of a node on first use. */
248 static GMQCC_INLINE void correct_trie_set(correct_trie_t *t, const char *key, void * const value) {
249 const unsigned char *data = (const unsigned char*)key;
252 t->entries = (correct_trie_t*)mem_a(sizeof(correct_trie_t)*(sizeof(correct_alpha)-1));
253 memset(t->entries, 0, sizeof(correct_trie_t)*(sizeof(correct_alpha)-1));
255 t = t->entries + correct_alpha_index[*data];
263 * Implementation of the corrector algorithm commences. A very efficient
264 * brute-force attack (thanks to tries and mempool :-)).
/* Look up `word` in the dictionary trie; the value is the word's
 * occurrence counter (a heap-allocated size_t), or NULL if absent. */
266 static GMQCC_INLINE size_t *correct_find(correct_trie_t *table, const char *word) {
267 return (size_t*)correct_trie_get(table, word);
/* If `word` is already in the dictionary, bump its counter and report
 * true; otherwise report false so the caller inserts it. */
270 static GMQCC_INLINE bool correct_update(correct_trie_t* *table, const char *word) {
271 size_t *data = correct_find(*table, word);
/* Add `ident` to the dictionary: update its counter if known, else
 * allocate a fresh counter, track it in *size for later cleanup, and
 * insert it into the trie. */
279 void correct_add(correct_trie_t* table, size_t ***size, const char *ident) {
281 const char *add = ident;
283 if (!correct_update(&table, add)) {
284 data = (size_t*)mem_a(sizeof(size_t));
287 vec_push((*size), data);
288 correct_trie_set(table, add, data);
/* Tear down the dictionary: free every tracked counter, then the trie. */
292 void correct_del(correct_trie_t* dictonary, size_t **data) {
294 const size_t vs = vec_size(data);
296 for (i = 0; i < vs; i++)
300 correct_trie_del(dictonary);
304 * correcting logic for the following forms of transformations:
310 * These functions could take an additional size_t **size parameter
311 * and store back the results of their new length in an array that
312 * is the same as **array for the memcmp in correct_exists. I'm just
313 * not able to figure out how to do that just yet. As my brain is
314 * not in the mood to figure out that logic. This is a reminder to
315 * do it, or for someone else to :-) correct_edit however would also
316 * need to take a size_t ** to carry it along (would all the argument
317 * overhead be worth it?)
/* Emit every string formed by deleting one character of `ident` into
 * `array` (each candidate pool-allocated); yields len variants. */
319 static GMQCC_INLINE size_t correct_deletion(const char *ident, char **array) {
321 const size_t len = strlen(ident);
323 for (; itr < len; itr++) {
324 char *a = (char*)correct_pool_alloc(len+1);
325 memcpy(a, ident, itr);
326 memcpy(a + itr, ident + itr + 1, len - itr);
/* Emit every string formed by swapping two adjacent characters of
 * `ident`; yields len-1 variants (the swap itself is elided here). */
333 static GMQCC_INLINE size_t correct_transposition(const char *ident, char **array) {
335 const size_t len = strlen(ident);
337 for (; itr < len - 1; itr++) {
339 char *a = (char*)correct_pool_alloc(len+1);
340 memcpy(a, ident, len+1);
/* Emit every string formed by replacing one character of `ident` with
 * each letter of correct_alpha; yields len * (alphabet size) variants. */
350 static GMQCC_INLINE size_t correct_alteration(const char *ident, char **array) {
354 const size_t len = strlen(ident);
356 for (; itr < len; itr++) {
357 for (jtr = 0; jtr < sizeof(correct_alpha)-1; jtr++, ktr++) {
358 char *a = (char*)correct_pool_alloc(len+1);
359 memcpy(a, ident, len+1);
360 a[itr] = correct_alpha[jtr];
/* Emit every string formed by inserting one alphabet character at each
 * position (including the end); yields (len+1) * (alphabet size). */
368 static GMQCC_INLINE size_t correct_insertion(const char *ident, char **array) {
371 const size_t len = strlen(ident);
373 for (; itr <= len; itr++) {
374 for (jtr = 0; jtr < sizeof(correct_alpha)-1; jtr++) {
375 char *a = (char*)correct_pool_alloc(len+2);
376 memcpy(a, ident, itr);
377 memcpy(a + itr + 1, ident + itr, len - itr + 1);
378 a[itr] = correct_alpha[jtr];
379 array[itr * (sizeof(correct_alpha)-1) + jtr] = a;
383 return (len+1)*(sizeof(correct_alpha)-1);
/* Total candidate count for one edit pass over `ident`:
 *   deletion      = len
389 * transposition = len - 1
390 * alteration = len * sizeof(correct_alpha)
391 * insertion = (len + 1) * sizeof(correct_alpha)
 * (sizeof here meaning the alphabet length, sizeof(correct_alpha)-1). */
386 static GMQCC_INLINE size_t correct_size(const char *ident) {
394 register size_t len = strlen(ident);
395 return (len) + (len - 1) + (len * (sizeof(correct_alpha)-1)) + ((len + 1) * (sizeof(correct_alpha)-1));
/* Generate all edit-distance-1 candidates of `ident` into one pool
 * array; also precomputes each candidate's length into *lens so later
 * comparisons can skip memcmp on length mismatch. */
398 static GMQCC_INLINE char **correct_edit(const char *ident, size_t **lens) {
400 size_t size = correct_size(ident);
401 char **find = (char**)correct_pool_alloc(size * sizeof(char*));
403 if (!find || !(*lens = (size_t*)correct_pool_alloc(size * sizeof(size_t))))
/* The four generators fill consecutive regions of `find`. */
406 next = correct_deletion (ident, find);
407 next += correct_transposition(ident, find+next);
408 next += correct_alteration (ident, find+next);
409 /*****/ correct_insertion (ident, find+next);
411 /* precompute lengths */
412 for (next = 0; next < size; next++)
413 (*lens)[next] = strlen(find[next]);
/* Linear membership test over `rows` candidates; compares lengths
 * first so memcmp only runs on same-length strings. */
418 static GMQCC_INLINE int correct_exist(char **array, register size_t *sizes, size_t rows, char *ident, register size_t len) {
420 for (itr = 0; itr < rows; itr++) {
422 * We can save tons of calls to memcmp if we simply ignore comparisons
423 * that we know cannot contain the same length.
425 if (sizes[itr] == len && !memcmp(array[itr], ident, len))
/* Grow the result vector from the pool when `size` would exceed the
 * current capacity (old storage is pool memory, so it is not freed). */
432 static GMQCC_INLINE char **correct_known_resize(char **res, size_t *allocated, size_t size) {
433 size_t oldallocated = *allocated;
435 if (size < oldallocated)
438 out = (char**)correct_pool_alloc(sizeof(*res) * oldallocated + 32);
439 memcpy(out, res, sizeof(*res) * oldallocated);
/* Edit-distance-2 pass: for each distance-1 candidate, generate (or
 * reuse from corr->edits/corr->lens caches) its distance-1 edits and
 * keep those found in the dictionary, deduplicated via correct_exist. */
445 static char **correct_known(correction_t *corr, correct_trie_t* table, char **array, size_t rows, size_t *next) {
451 char **res = (char**)correct_pool_alloc(sizeof(char *) * nxt);
455 for (; itr < rows; itr++) {
/* Reuse a previously computed edit set if this row is cached. */
458 if (vec_size(corr->edits) > itr+1) {
459 end = corr->edits[itr+1];
460 bit = corr->lens [itr+1];
462 end = correct_edit(array[itr], &bit);
463 vec_push(corr->edits, end);
464 vec_push(corr->lens, bit);
466 row = correct_size(array[itr]);
468 for (jtr = 0; jtr < row; jtr++) {
469 if (correct_find(table, end[jtr]) && !correct_exist(res, bit, len, end[jtr], bit[jtr])) {
470 res = correct_known_resize(res, &nxt, len+1);
471 res[len++] = end[jtr];
/* The language model P(C): pick the candidate with the highest
 * occurrence counter in the dictionary. */
480 static GMQCC_INLINE char *correct_maximum(correct_trie_t* table, char **array, size_t rows) {
486 for (; itr < rows; itr++) {
487 if ((itm = correct_find(table, array[itr])) && (*itm > top)) {
497 * This is the exposed interface:
498 * takes a table for the dictionary, a vector of sizes (used for internal
499 * probability calculation), and an identifier to "correct".
/* Initialize a correction_t (body elided in this excerpt). */
501 void correct_init(correction_t *c)
/* Release a correction_t's cached edit vectors and the memory pool. */
508 void correct_free(correction_t *c)
512 correct_pool_delete();
/* Correct `ident` against the dictionary. Returns a heap-allocated
 * string the CALLER must free:
 *   1. exact dictionary hit -> claimed copy of ident;
 *   2. best edit-distance-1 candidate (correct_maximum);
 *   3. best edit-distance-2 candidate (correct_known);
 *   4. otherwise a plain strdup of the original identifier. */
515 char *correct_str(correction_t *corr, correct_trie_t* table, const char *ident) {
518 char *e1ident = NULL;
519 char *e2ident = NULL;
524 /* needs to be allocated for free later */
525 if (correct_find(table, ident))
526 return correct_pool_claim(ident);
528 if ((e1rows = correct_size(ident))) {
529 if (vec_size(corr->edits) > 0)
532 e1 = correct_edit(ident, &bits);
533 vec_push(corr->edits, e1);
534 vec_push(corr->lens, bits);
537 if ((e1ident = correct_maximum(table, e1, e1rows)))
538 return correct_pool_claim(e1ident);
541 e2 = correct_known(corr, table, e1, e1rows, &e2rows);
542 if (e2rows && ((e2ident = correct_maximum(table, e2, e2rows))))
543 return correct_pool_claim(e2ident);
546 return util_strdup(ident);