* there is also a hash. After 3 million hashes .. you start to get
* very slow. To combat this I had suggested burst tries to Blub.
* The next day he had implemented them. Sure enough this brought
- * down the runtime by a factory > 100%
+ * down the runtime by a factor > 100%
*
+ * The trie initially was designed to work on all strings, but later it
+ * became apparent that not only was this not a requirement, it was also
+ * slowing down gets/sets for the trie. Only correct_alpha needs to be
+ * understood by the trie system; knowing this, we can combat the
+ * slowness with a very clever but evil optimization. By setting a
+ * fixed-size number of branches for the trie, using a char-to-index
+ * map into the branches, we've made trie accesses entirely constant
+ * in lookup time. No really, a lookup is literally
+ * trie[str[0]] [str[1]] [2] .... .value.
+ *
+ *
* Future Work (If we really need it)
*
- * Currently we can only distinguishes one source of error in the
+ * Currently we can only distinguish one source of error in the
* language model we use. This could become an issue for identifiers
* that have close colliding rates, e.g. colate->coat yields collate.
*
/*
 * Duplicate `data` onto the heap so the caller can keep it after the
 * correction pool is gone; the caller owns (and must free) the result.
 * NOTE(review): the pool-teardown call below is removed by this diff —
 * pool lifetime is presumably now managed elsewhere; confirm no caller
 * relied on claim() also destroying the pool.
 */
static GMQCC_INLINE char *correct_pool_claim(const char *data) {
char *claim = util_strdup(data);
- correct_pool_delete();
return claim;
}
* jge correct_cmp_loop
* ...
*
- * Despite how much optimization went in to this, the speed was the
+ * Despite how much optimization went in to this, the speed was
* being conflicted by the strlen(ident) used for &LHS[END_POS]
* If we could eliminate the strlen with what I suggested on line
* 311 ... we can accelerate this whole damn thing quite a bit.
return out;
}
-static char **correct_known(correct_trie_t* table, char **array, size_t rows, size_t *next) {
+static char **correct_known(correction_t *corr, correct_trie_t* table, char **array, size_t rows, size_t *next) {
size_t itr = 0;
size_t jtr = 0;
size_t len = 0;
char **end = NULL;
for (; itr < rows; itr++) {
- end = correct_edit(array[itr]);
+ if (!array[itr][0])
+ continue;
+ if (vec_size(corr->edits) > itr+1)
+ end = corr->edits[itr+1];
+ else {
+ end = correct_edit(array[itr]);
+ vec_push(corr->edits, end);
+ }
row = correct_size(array[itr]);
- /* removing jtr=0 here speeds it up by 100ms O_o */
for (jtr = 0; jtr < row; jtr++) {
if (correct_find(table, end[jtr]) && !correct_exist(res, len, end[jtr])) {
res = correct_known_resize(res, &nxt, len+1);
* takes a table for the dictionary, a vector of sizes (used for internal
* probability calculation), and an identifier to "correct".
*/
-char *correct_str(correct_trie_t* table, const char *ident) {
+/*
+ * Initialize per-session correction state: create the string pool and
+ * start with an empty (NULL) vector of cached edit lists. Must be
+ * paired with correct_free().
+ */
+void correct_init(correction_t *c)
+{
+    correct_pool_new();
+    c->edits = NULL;
+}
+
+/*
+ * Tear down correction state: release the cached-edits vector, then
+ * destroy the string pool (previously done per-call inside correct_str).
+ * NOTE(review): vec_free releases only the vector storage — confirm the
+ * individual edit arrays pushed into c->edits are pool-allocated,
+ * otherwise they leak here.
+ */
+void correct_free(correction_t *c)
+{
+    vec_free(c->edits);
+    correct_pool_delete();
+}
+
+/*
+ * Attempt to spell-correct `ident` against the dictionary trie `table`.
+ * Strategy (Norvig-style): exact hit first, then best edit-distance-1
+ * candidate, then best edit-distance-2 candidate via correct_known().
+ * Edit lists are cached in corr->edits (slot 0 = edits of `ident`) so a
+ * later call can reuse them; pool setup/teardown moved out of this
+ * function into correct_init()/correct_free().
+ * Returns a heap-allocated string the caller must free.
+ * NOTE(review): `e2ident` is used below but not declared in this hunk —
+ * presumably declared in elided context lines; verify in the full file.
+ */
+char *correct_str(correction_t *corr, correct_trie_t* table, const char *ident) {
char **e1 = NULL;
char **e2 = NULL;
char *e1ident = NULL;
size_t e1rows = 0;
size_t e2rows = 0;
- correct_pool_new();
-
/* needs to be allocated for free later */
if (correct_find(table, ident))
return correct_pool_claim(ident);
if ((e1rows = correct_size(ident))) {
+ /* reuse cached edit-distance-1 list when present, else build and cache */
+ if (vec_size(corr->edits) > 0)
+ e1 = corr->edits[0];
+ else {
+ e1 = correct_edit(ident);
+ vec_push(corr->edits, e1);
+ }
if ((e1ident = correct_maximum(table, e1, e1rows)))
return correct_pool_claim(e1ident);
}
- e2 = correct_known(table, e1, e1rows, &e2rows);
+ e2 = correct_known(corr, table, e1, e1rows, &e2rows);
if (e2rows && ((e2ident = correct_maximum(table, e2, e2rows))))
return correct_pool_claim(e2ident);
- correct_pool_delete();
return util_strdup(ident);
}