From e2e4907b606f121e69556c4b4d583172140775d7 Mon Sep 17 00:00:00 2001
From: Dale Weiler <killfieldengine@gmail.com>
Date: Sun, 6 Jan 2013 03:26:09 +0000
Subject: [PATCH] Nicer loops

---
 correct.c | 67 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 46 insertions(+), 21 deletions(-)

diff --git a/correct.c b/correct.c
index 8e5c759..7056b52 100644
--- a/correct.c
+++ b/correct.c
@@ -69,8 +69,14 @@
  *  probability of the transposition from C to I.  It's simply much more
  *  cleaner, and direct to seperate the two factors.
  *
+ *  Research tells us that 80% to 95% of all spelling errors have an edit
+ *  distance no greater than one.  Knowing this we can optimize for most
+ *  cases of mistakes without taking a performance hit.  Which is what we
+ *  base longer edit distances off of.  Opposed to the original method of
+ *  I had concieved of checking everything.     
+ *  
  * A little information on additional algorithms used:
- * 
+ *
  *   Initially when I implemented this corrector, it was very slow.
  *   Need I remind you this is essentially a brute force attack on strings,
  *   and since every transformation requires dynamic memory allocations,
@@ -89,6 +95,21 @@
  *   very slow.  To combat this I had suggested burst tries to Blub.
  *   The next day he had implemented them. Sure enough this brought
  *   down the runtime by a factory > 100%
+ *
+ * Future Work (If we really need it)
+ *
+ *   Currently we can only distinguishes one source of error in the
+ *   language model we use.  This could become an issue for identifiers
+ *   that have close colliding rates, e.g colate->coat yields collate.
+ *
+ *   Currently the error model has been fairly trivial, the smaller the
+ *   edit distance the smaller the error.  This usually causes some un-
+ *   expected problems. e.g reciet->recite yields recipt.  For QuakeC
+ *   this could become a problem when lots of identifiers are involved. 
+ *
+ *   Our control mechanisim could use a limit, i.e limit the number of
+ *   sets of edits for distance X.  This would also increase execution
+ *   speed considerably.  
  */
 
 
@@ -193,10 +214,11 @@ void* correct_trie_get(const correct_trie_t *t, const char *key) {
 void correct_trie_set(correct_trie_t *t, const char *key, void * const value) {
     const unsigned char *data = (const unsigned char*)key;
     while (*data) {
-        unsigned char ch = *data;
-        correct_trie_t       *entries = t->entries;
-        const size_t  vs = vec_size(t->entries);
-        size_t        i;
+        const size_t    vs      = vec_size(t->entries);
+        unsigned char   ch      = *data;
+        correct_trie_t *entries = t->entries;
+        size_t          i;
+
         for (i = 0; i < vs; ++i) {
             if (entries[i].ch == ch) {
                 t = &entries[i];
@@ -205,10 +227,11 @@ void correct_trie_set(correct_trie_t *t, const char *key, void * const value) {
         }
         if (i == vs) {
             correct_trie_t *elem  = (correct_trie_t*)vec_add(t->entries, 1);
+
             elem->ch      = ch;
             elem->value   = NULL;
             elem->entries = NULL;
-            t = elem;
+            t             = elem;
         }
         ++data;
     }
@@ -247,8 +270,9 @@ void correct_add(correct_trie_t* table, size_t ***size, const char *ident) {
 }
 
 void correct_del(correct_trie_t* dictonary, size_t **data) {
-    size_t i;
+    size_t       i;
     const size_t vs = vec_size(data);
+
     for (i = 0; i < vs; i++)
         mem_d(data[i]);
 
@@ -261,7 +285,9 @@ void correct_del(correct_trie_t* dictonary, size_t **data) {
  * because they're only valid after the first character is of a _, or
  * alpha character.
  */
-static const char correct_alpha[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
+static const char correct_alpha[] = "abcdefghijklmnopqrstuvwxyz"
+                                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                                    "_"; /* TODO: Numbers ... */
 
 /*
  * correcting logic for the following forms of transformations:
@@ -392,19 +418,19 @@ static GMQCC_INLINE char **correct_known_resize(char **res, size_t *allocated, s
 }
 
 static char **correct_known(correct_trie_t* table, char **array, size_t rows, size_t *next) {
-    size_t itr;
-    size_t jtr;
-    size_t len;
-    size_t row;
+    size_t itr = 0;
+    size_t jtr = 0;
+    size_t len = 0;
+    size_t row = 0;
     size_t nxt = 8;
     char **res = correct_pool_alloc(sizeof(char *) * nxt);
     char **end = NULL;
 
-    for (itr = 0, len = 0; itr < rows; itr++) {
+    for (; itr < rows; itr++) {
         end = correct_edit(array[itr]);
         row = correct_size(array[itr]);
 
-        for (jtr = 0; jtr < row; jtr++) {
+        for (; jtr < row; jtr++) {
             if (correct_find(table, end[jtr]) && !correct_exist(res, len, end[jtr])) {
                 res        = correct_known_resize(res, &nxt, len+1);
                 res[len++] = end[jtr];
@@ -417,12 +443,12 @@ static char **correct_known(correct_trie_t* table, char **array, size_t rows, si
 }
 
 static char *correct_maximum(correct_trie_t* table, char **array, size_t rows) {
-    char   *str  = NULL;
-    size_t *itm  = NULL;
-    size_t  itr;
-    size_t  top;
+    char   *str = NULL;
+    size_t *itm = NULL;
+    size_t  itr = 0;
+    size_t  top = 0;
 
-    for (itr = 0, top = 0; itr < rows; itr++) {
+    for (; itr < rows; itr++) {
         if ((itm = correct_find(table, array[itr])) && (*itm > top)) {
             top = *itm;
             str = array[itr];
@@ -439,8 +465,7 @@ static char *correct_maximum(correct_trie_t* table, char **array, size_t rows) {
  *
  * the add function works the same.  Except the identifier is used to
  * add to the dictonary.  
- */   
-
+ */
 char *correct_str(correct_trie_t* table, const char *ident) {
     char **e1;
     char **e2;
-- 
2.39.2