2 * Copyright (C) 2012, 2013
6 * Permission is hereby granted, free of charge, to any person obtaining a copy of
7 * this software and associated documentation files (the "Software"), to deal in
8 * the Software without restriction, including without limitation the rights to
9 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
10 * of the Software, and to permit persons to whom the Software is furnished to do
11 * so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * This is a very clever method for correcting mistakes in QuakeC code:
 * most notably when invalid identifiers are used or improper assignments
 * are made; we can properly look things up in multiple dictionaries
 * (depending on the rules of what the task is trying to accomplish) to
 * find the best possible
35 * A little about how it works, and probability theory:
37 * When given an identifier (which we will denote I), we're essentially
38 * just trying to choose the most likely correction for that identifier.
39 * (the actual "correction" can very well be the identifier itself).
 * There is actually no way to know for sure that certain identifiers
41 * such as "lates", need to be corrected to "late" or "latest" or any
42 * other permutations that look lexically the same. This is why we
43 * must advocate the usage of probabilities. This means that instead of
44 * just guessing, instead we're trying to find the correction for C,
45 * out of all possible corrections that maximizes the probability of C
46 * for the original identifer I.
 * Thankfully there exist some theories for probabilistic interpretations
 * of data. Since we're operating on two distinctive interpretations, the
 * transposition from I to C, we need something that can express how much
 * of I should rationally change to become C. This is called the
52 * Bayesian interpretation. You can read more about it from here:
53 * http://www.celiagreen.com/charlesmccreery/statistics/bayestutorial.pdf
 * (which is probably the only good online documentation for Bayes theory,
55 * no lie. Everything else just sucks ..)
 * Bayes' Theorem suggests something like the following:
58 * AC P(I|C) P(C) / P(I)
60 * However since P(I) is the same for every possibility of I, we can
 * completely ignore it, giving just:
64 * This greatly helps visualize how the parts of the expression are performed
65 * there is essentially three, from right to left we perform the following:
67 * 1: P(C), the probability that a proposed correction C will stand on its
68 * own. This is called the language model.
70 * 2: P(I|C), the probability that I would be used, when the programmer
71 * really meant C. This is the error model.
 * 3: AC, the control mechanism, an enumerator if you will, one that
74 * enumerates all feasible values of C, to determine the one that
75 * gives the greatest probability score.
 * In reality the requirement for a more complex expression involving
 * two separate models is considerably a waste. But one must recognize
 * that P(C|I) is already conflating two factors. It's just much simpler
 * to separate the two models and deal with them explicitly. To properly
 * estimate P(C|I) you have to consider both the probability of C and
 * the probability of the transposition from C to I. It's simply much
 * cleaner, and more direct, to separate the two factors.
85 * Research tells us that 80% to 95% of all spelling errors have an edit
86 * distance no greater than one. Knowing this we can optimize for most
87 * cases of mistakes without taking a performance hit. Which is what we
 * base longer edit distances off of, as opposed to the original method
 * I had conceived of: checking everything.
91 * A little information on additional algorithms used:
93 * Initially when I implemented this corrector, it was very slow.
94 * Need I remind you this is essentially a brute force attack on strings,
95 * and since every transformation requires dynamic memory allocations,
96 * you can easily imagine where most of the runtime conflated. Yes
97 * It went right to malloc. More than THREE MILLION malloc calls are
98 * performed for an identifier about 16 bytes long. This was such a
99 * shock to me. A forward allocator (or as some call it a bump-point
100 * allocator, or just a memory pool) was implemented. To combat this.
102 * But of course even other factors were making it slow. Initially
103 * this used a hashtable. And hashtables have a good constant lookup
104 * time complexity. But the problem wasn't in the hashtable, it was
105 * in the hashing (despite having one of the fastest hash functions
106 * known). Remember those 3 million mallocs? Well for every malloc
107 * there is also a hash. After 3 million hashes .. you start to get
108 * very slow. To combat this I had suggested burst tries to Blub.
109 * The next day he had implemented them. Sure enough this brought
110 * down the runtime by a factor > 100%
112 * The trie initially was designed to work on all strings, but later it
 * became apparent that not only was this not a requirement, it was also
114 * slowing down get/sets' for the trie. To fully understand, only
115 * correct_alpha needs to be understood by the trie system, knowing this
116 * We can combat the slowness using a very clever but evil optimization.
117 * By Setting a fixed sized amount of branches for the trie using a
 * char-to-index map into the branches, we've completely made the trie
 * accesses entirely constant in lookup time. No really, a lookup is
120 * literally trie[str[0]] [str[1]] [2] .... .value.
123 * Future Work (If we really need it)
125 * Currently we can only distinguish one source of error in the
126 * language model we use. This could become an issue for identifiers
127 * that have close colliding rates, e.g colate->coat yields collate.
129 * Currently the error model has been fairly trivial, the smaller the
130 * edit distance the smaller the error. This usually causes some un-
131 * expected problems. e.g reciet->recite yields recipt. For QuakeC
132 * this could become a problem when lots of identifiers are involved.
136 #define CORRECT_POOL_SIZE (128*1024*1024)
 * A forward allocator for the corrector. This corrector requires a lot
 * of allocations. This forward allocator combats all those allocations
 * and speeds us up a little. It also saves us space in a way, since each
 * allocation isn't wasting a little header space for when NOTRACK isn't
/*
 * Forward-allocator ("bump pointer" pool) state:
 *   correct_pool_data - vector of every chunk ever allocated (so
 *                       correct_pool_delete() can free them all)
 *   correct_pool_this - chunk currently being bump-allocated from
 *   correct_pool_addr - bump offset into the current chunk
 */
static unsigned char **correct_pool_data = NULL;
static unsigned char *correct_pool_this = NULL;
static size_t correct_pool_addr = 0;
/*
 * Start a fresh pool chunk: reset the bump offset, allocate a new
 * CORRECT_POOL_SIZE chunk, and record it in correct_pool_data so the
 * whole pool can be freed at once later.
 */
static GMQCC_INLINE void correct_pool_new(void) {
    correct_pool_addr = 0;
    correct_pool_this = (unsigned char *)mem_a(CORRECT_POOL_SIZE);

    vec_push(correct_pool_data, correct_pool_this);
/*
 * Bump allocation: hand out the next `bytes` bytes of the current
 * chunk and advance the pointer.  There is no per-allocation free;
 * everything is released in one go by correct_pool_delete().
 */
static GMQCC_INLINE void *correct_pool_alloc(size_t bytes) {
    /* chunk exhausted? recovery presumably starts a new chunk --
     * NOTE(review): the overflow branch body is elided here, confirm */
    if (correct_pool_addr + bytes>= CORRECT_POOL_SIZE)

    data = (void*)correct_pool_this;
    /* advance the bump pointer past the claimed region */
    correct_pool_this += bytes;
    correct_pool_addr += bytes;
/*
 * Free every chunk the pool ever allocated and reset all pool state,
 * so a subsequent correct_pool_new() starts from scratch.
 */
static GMQCC_INLINE void correct_pool_delete(void) {
    for (i = 0; i < vec_size(correct_pool_data); ++i)
        mem_d(correct_pool_data[i]);

    correct_pool_data = NULL;
    correct_pool_this = NULL;
    correct_pool_addr = 0;
/*
 * Copy a pool-resident string onto the regular heap so the caller can
 * keep (and later free) it after the pool itself is destroyed.
 */
static GMQCC_INLINE char *correct_pool_claim(const char *data) {
    char *claim = util_strdup(data);
 * _ is valid in identifiers. I've yet to implement numerics, however,
 * because they're only valid after a first character that is a _, or
/*
 * Candidate alphabet used when generating edits: the characters legal
 * in QuakeC identifiers.  The trailing NUL is why every consumer
 * iterates with sizeof(correct_alpha)-1.
 */
static const char correct_alpha[] = "abcdefghijklmnopqrstuvwxyz"
                                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                    "_"; /* TODO: Numbers ... */
/*
 * ASCII -> trie-branch index map, so trie traversal never scans
 * correct_alpha.  Bytes outside the alphabet map to 0; '_' (0x5F)
 * maps to 52 and 'a'..'z' (0x61..0x7A) map to 26..51.
 * NOTE(review): 'A' (0x41) maps to 0, the same as non-alphabet
 * bytes, while 'B'..'Z' map to 1..25 -- looks off by one; verify
 * against the upstream table.
 */
static const size_t correct_alpha_index[0x80] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 52,
    0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0
 * A fast, space-efficient trie for a dictionary of identifiers. This is
 * faster than a hashtable for one reason: a hashtable itself may have
 * fast constant lookup time, but the hash itself must be very fast. We
 * have one of the fastest hash functions for strings, but if you do a
 * lot of hashing (which we do, almost 3 million hashes per identifier)
 * a hashtable becomes slow.
/*
 * Allocate a trie root node.  Child branch arrays are allocated
 * lazily by correct_trie_set() as keys are inserted.
 */
correct_trie_t* correct_trie_new() {
    correct_trie_t *t = (correct_trie_t*)mem_a(sizeof(correct_trie_t));
/*
 * Recursively free the branch arrays beneath a trie node (one branch
 * per character of correct_alpha; -1 skips the NUL terminator).
 */
static GMQCC_INLINE void correct_trie_del_sub(correct_trie_t *t) {
    for (i = 0; i < sizeof(correct_alpha)-1; ++i) {
        correct_trie_del_sub(&t->entries[i]);
/*
 * Destroy an entire trie: recurse through every branch with
 * correct_trie_del_sub(), then (presumably) free the root itself.
 */
static GMQCC_INLINE void correct_trie_del(correct_trie_t *t) {
    for (i = 0; i < sizeof(correct_alpha)-1; ++i)
        correct_trie_del_sub(&t->entries[i]);
/*
 * Look `key` up in the trie.  Each character is turned into a branch
 * index via correct_alpha_index, making each step O(1); returns the
 * stored value pointer (or NULL on a miss -- tail of body elided).
 */
static GMQCC_INLINE void* correct_trie_get(const correct_trie_t *t, const char *key) {
    const unsigned char *data = (const unsigned char*)key;
        /* descend one level per input character */
        t = t->entries + correct_alpha_index[*data];
/*
 * Insert/overwrite `value` for `key`.  Branch arrays (one slot per
 * correct_alpha character) are allocated and zeroed on first use, so
 * untouched subtrees cost nothing.
 */
static GMQCC_INLINE void correct_trie_set(correct_trie_t *t, const char *key, void * const value) {
    const unsigned char *data = (const unsigned char*)key;
            /* lazily create this node's branch array */
            t->entries = (correct_trie_t*)mem_a(sizeof(correct_trie_t)*(sizeof(correct_alpha)-1));
            memset(t->entries, 0, sizeof(correct_trie_t)*(sizeof(correct_alpha)-1));
        t = t->entries + correct_alpha_index[*data];
 * Implementation of the corrector algorithm commences. A very efficient
 * brute-force attack (thanks to tries and the mempool :-)).
267 static GMQCC_INLINE size_t *correct_find(correct_trie_t *table, const char *word) {
268 return (size_t*)correct_trie_get(table, word);
/*
 * Record another sighting of `word`: look up its counter in the
 * dictionary trie.  NOTE(review): the elided remainder presumably
 * increments the counter and returns whether the word existed --
 * confirm against the full source.
 */
static GMQCC_INLINE bool correct_update(correct_trie_t* *table, const char *word) {
    size_t *data = correct_find(*table, word);
/*
 * Add an identifier to the dictionary.  If the word is not already
 * present (correct_update() failed to bump an existing counter), a
 * fresh counter is heap-allocated, tracked in *size (a vector used
 * later for bulk free in correct_del), and inserted into the trie.
 */
void correct_add(correct_trie_t* table, size_t ***size, const char *ident) {
    const char *add = ident;

    if (!correct_update(&table, add)) {
        data = (size_t*)mem_a(sizeof(size_t));

        vec_push((*size), data);
        correct_trie_set(table, add, data);
/*
 * Tear down a dictionary: release every counter recorded by
 * correct_add() (loop body elided here), then destroy the trie.
 */
void correct_del(correct_trie_t* dictonary, size_t **data) {
    const size_t vs = vec_size(data);

    for (i = 0; i < vs; i++)

    correct_trie_del(dictonary);
 * correcting logic for the following forms of transformations:
 * These functions could take an additional size_t **size parameter
 * and store back the results of their new lengths in an array that
 * is the same as **array for the memcmp in correct_exists. I'm just
 * not able to figure out how to do that just yet. As my brain is
 * not in the mood to figure out that logic. This is a reminder to
 * do it, or for someone else to :-) correct_edit however would also
 * need to take a size_t ** to carry it along (would all the argument
 * overhead be worth it?)
/*
 * Generate every candidate formed by deleting one character of
 * `ident` (len variants).  Each candidate lives in the pool; they
 * are stored into `array` (store elided here).  Returns the count.
 */
static GMQCC_INLINE size_t correct_deletion(const char *ident, char **array) {
    const size_t len = strlen(ident);

    for (; itr < len; itr++) {
        char *a = (char*)correct_pool_alloc(len+1);
        /* head: chars before the deleted position */
        memcpy(a, ident, itr);
        /* tail: skips ident[itr]; also copies the NUL terminator */
        memcpy(a + itr, ident + itr + 1, len - itr);
/*
 * Generate every candidate formed by swapping two adjacent characters
 * of `ident` (len-1 variants).  Each candidate is a pool copy of the
 * whole string (swap itself is on elided lines).
 */
static GMQCC_INLINE size_t correct_transposition(const char *ident, char **array) {
    const size_t len = strlen(ident);

    for (; itr < len - 1; itr++) {

        char *a = (char*)correct_pool_alloc(len+1);
        /* copy including NUL, then the pair at itr/itr+1 is swapped */
        memcpy(a, ident, len+1);
/*
 * Generate every candidate formed by replacing one character of
 * `ident` with each character of correct_alpha
 * (len * (sizeof(correct_alpha)-1) variants).
 */
static GMQCC_INLINE size_t correct_alteration(const char *ident, char **array) {
    const size_t len = strlen(ident);

    for (; itr < len; itr++) {
        for (jtr = 0; jtr < sizeof(correct_alpha)-1; jtr++, ktr++) {
            char *a = (char*)correct_pool_alloc(len+1);
            /* full copy (with NUL), then overwrite one position */
            memcpy(a, ident, len+1);
            a[itr] = correct_alpha[jtr];
/*
 * Generate every candidate formed by inserting one correct_alpha
 * character at each position of `ident`, including after the last
 * character ((len+1) * (sizeof(correct_alpha)-1) variants).
 */
static GMQCC_INLINE size_t correct_insertion(const char *ident, char **array) {
    const size_t len = strlen(ident);

    for (; itr <= len; itr++) {
        for (jtr = 0; jtr < sizeof(correct_alpha)-1; jtr++) {
            /* +2: original length, inserted char, NUL */
            char *a = (char*)correct_pool_alloc(len+2);
            memcpy(a, ident, itr);
            /* tail shifted right one; copies the NUL too */
            memcpy(a + itr + 1, ident + itr, len - itr + 1);
            a[itr] = correct_alpha[jtr];
            array[itr * (sizeof(correct_alpha)-1) + jtr] = a;

    return (len+1)*(sizeof(correct_alpha)-1);
387 static GMQCC_INLINE size_t correct_size(const char *ident) {
390 * transposition = len - 1
391 * alteration = len * sizeof(correct_alpha)
392 * insertion = (len + 1) * sizeof(correct_alpha)
395 register size_t len = strlen(ident);
396 return (len) + (len - 1) + (len * (sizeof(correct_alpha)-1)) + ((len + 1) * (sizeof(correct_alpha)-1));
/*
 * Produce every distance-one edit of `ident` into one pool-allocated
 * array (deletions, then transpositions, alterations, insertions),
 * and precompute each candidate's length into *lens so callers can
 * cheaply reject memcmp candidates.  Returns the candidate array.
 */
static GMQCC_INLINE char **correct_edit(const char *ident, size_t **lens) {
    size_t size = correct_size(ident);
    char **find = (char**)correct_pool_alloc(size * sizeof(char*));

    /* bail if either the candidate or the length array failed */
    if (!find || !(*lens = (size_t*)correct_pool_alloc(size * sizeof(size_t))))

    /* each generator fills its slice and returns how many it wrote */
    next = correct_deletion (ident, find);
    next += correct_transposition(ident, find+next);
    next += correct_alteration (ident, find+next);
    /*****/ correct_insertion (ident, find+next);

    /* precompute lengths */
    for (next = 0; next < size; next++)
        (*lens)[next] = strlen(find[next]);
/*
 * Membership test: is `ident` (of length `len`) already present in
 * the first `rows` entries of `array`?  `sizes` holds the
 * precomputed lengths parallel to `array`.
 */
static GMQCC_INLINE int correct_exist(char **array, register size_t *sizes, size_t rows, char *ident, register size_t len) {
    for (itr = 0; itr < rows; itr++) {
        /*
         * We can save tons of calls to memcmp if we simply ignore comparisons
         * that we know cannot contain the same length.
         */
        if (sizes[itr] == len && !memcmp(array[itr], ident, len))
/*
 * Grow a pool-backed pointer array when `size` would exceed its
 * current capacity: allocate a larger block (+32 slack, amortizing
 * future growth) and copy the old contents over.  The old block is
 * abandoned to the pool -- pool allocations are never freed singly.
 */
static GMQCC_INLINE char **correct_known_resize(char **res, size_t *allocated, size_t size) {
    size_t oldallocated = *allocated;

    /* still fits: no work needed */
    if (size < oldallocated)

    out = (char**)correct_pool_alloc(sizeof(*res) * oldallocated + 32);
    memcpy(out, res, sizeof(*res) * oldallocated);
/*
 * Distance-two pass: for each distance-one candidate in `array`,
 * generate (or fetch from corr's cache) its own distance-one edits,
 * and collect those that appear in the dictionary `table` --
 * deduplicated via correct_exist().  The number of results is stored
 * through `next`; the returned array is pool-allocated.
 */
static char **correct_known(correction_t *corr, correct_trie_t* table, char **array, size_t rows, size_t *next) {
    char **res = (char**)correct_pool_alloc(sizeof(char *) * nxt);

    for (; itr < rows; itr++) {
        /* edits of this candidate were already computed on a prior call */
        if (vec_size(corr->edits) > itr+1) {
            end = corr->edits[itr+1];
            bit = corr->lens [itr+1];
            /* (else branch, elided) compute fresh and memoize */
            end = correct_edit(array[itr], &bit);
            vec_push(corr->edits, end);
            vec_push(corr->lens, bit);
        row = correct_size(array[itr]);

        for (jtr = 0; jtr < row; jtr++) {
            /* keep only dictionary words we haven't collected yet */
            if (correct_find(table, end[jtr]) && !correct_exist(res, bit, len, end[jtr], bit[jtr])) {
                res = correct_known_resize(res, &nxt, len+1);
                res[len++] = end[jtr];
/*
 * The language model P(C): scan the candidates and return the one
 * with the highest dictionary frequency count, or NULL if none of
 * them appear in the dictionary at all.
 */
static GMQCC_INLINE char *correct_maximum(correct_trie_t* table, char **array, size_t rows) {
    for (; itr < rows; itr++) {
        /* track the best-scoring candidate seen so far */
        if ((itm = correct_find(table, array[itr])) && (*itm > top)) {
 * This is the exposed interface:
 * takes a table for the dictionary, a vector of sizes (used for internal
 * probability calculation), and an identifier to "correct".
502 void correct_init(correction_t *c)
/*
 * Tear down a correction context: releases the entire memory pool
 * (and with it every candidate string and cache the corrector built).
 */
void correct_free(correction_t *c)
    correct_pool_delete();
/*
 * Public entry point: return the best correction for `ident` against
 * dictionary `table`.  Strategy, in order of preference:
 *   1. exact dictionary hit -> the identifier itself;
 *   2. best-scoring edit-distance-1 candidate (correct_maximum);
 *   3. best-scoring edit-distance-2 candidate (correct_known);
 *   4. give up and return the identifier unchanged.
 * The returned string is always heap-allocated (pool-claimed or
 * strdup'd) -- the caller owns and must free it.
 */
char *correct_str(correction_t *corr, correct_trie_t* table, const char *ident) {
    char *e1ident = NULL;
    char *e2ident = NULL;

    /* needs to be allocated for free later */
    if (correct_find(table, ident))
        return correct_pool_claim(ident);

    /* distance-1 candidates (cached in corr->edits across calls) */
    if ((e1rows = correct_size(ident))) {
        if (vec_size(corr->edits) > 0)

        e1 = correct_edit(ident, &bits);
        vec_push(corr->edits, e1);
        vec_push(corr->lens, bits);

        if ((e1ident = correct_maximum(table, e1, e1rows)))
            return correct_pool_claim(e1ident);

        /* widen to distance-2, built from the distance-1 set */
        e2 = correct_known(corr, table, e1, e1rows, &e2rows);
        if (e2rows && ((e2ident = correct_maximum(table, e2, e2rows))))
            return correct_pool_claim(e2ident);

    /* no plausible correction: hand back a copy of the original */
    return util_strdup(ident);