javascript - What is the fastest levenshtein algorithm for high frequent use -


for client side search tool need find levenshtein distance of word millions of other words. user should able compare short text of twenty words book. user finding locations of characterizing words of text in book. 'finding locations'does not mean looking exact match match levenshtein. started available implementations needed more speed. ended this:

var rowa = new uint16array(1e6); var rowb = new uint16array(1e6); function levenshtein(s1, s2) {     var s1_len = s1.length, s2_len = s2.length, i1, i2 = 0, a, b, c, c2, = 0;     if (s1_len === 0)         return s2_len;     if (s2_len === 0)         return s1_len;     while (i < s1_len)         rowa[i] = ++i;     while (i2 < s2_len) {         c2 = s2[i2];         = i2;         ++i2;         b = i2;         (i1 = 0; i1 < s1_len; ++i1) {             c = + (s1[i1] !== c2 );             = rowa[i1];             b = b < ? (b < c ? b + 1 : c) : (a < c ? + 1 : c);             rowb[i1] = b;         }         if (i2 === s2_len)             return b;         c2 = s2[i2];         = i2;         ++i2;         b = i2;         (i1 = 0; i1 < s1_len; ++i1) {             c = + (s1[i1] !== c2 );             = rowb[i1];             b = b < ? (b < c ? b + 1 : c) : (a < c ? + 1 : c);             rowa[i1] = b;         }     }     return b; } 

as see used techniques placing objects out of function in order re use them. repeated myself bit linearizing loop somewhat. faster? curious of advice.

update: after tips bergi , more thinking came solution:

    var row = new uint16array(1e6);     function levenshtein(s1, s2) {         var s1_len = s1.length, s2_len = s2.length, i2 = 1, a, b = 0, c, c2, i1 = 0;         if (s1_len === 0)             return s2_len;         if (s2_len === 0)             return s1_len;         c2 = s2[0];         if (s1[0] === c2) {             while (i1 < s1_len) {                 row[i1] = i1++;             }             b = s1_len - 1;         } else {             row[0] = 1;             ++b;             if (s1_len > 1)                 (i1 = 1; i1 < s1_len; ++i1) {                     if (s1[i1] === c2) {                         row[i1] = b;                         (++i1; i1 < s1_len; ++i1) {                             row[i1] = ++b;                         }                     } else {                         row[i1] = ++b;                     }                 }         }         if (s2_len > 1)             while (i2 < s2_len) {                 c2 = s2[i2];                 c = i2 + (s1[0] !== c2);                 = row[0];                 ++i2;                 b = i2 < ? (i2 < c ? i2 + 1 : c) : (a < c ? + 1 : c);                 row[0] = b;                 if (s1_len > 1) {                     (i1 = 1; i1 < s1_len; ++i1) {                         c = + (s1[i1] !== c2);                         = row[i1];                         b = b < ? (b < c ? b + 1 : c) : (a < c ? + 1 : c);                         row[i1] = b;                     }                 }             }         return b;     } 

this again faster. cannot squeeze more out of it. keep looking other ideas , try more.

since comparing against same word on , over, can little performance improvement using partial application , caching there:

function levenshtein(s1) {     var row0 = [], row1 = [], s1_len = s1.length;     if (s1_len === 0)         return function(s2) {             return s2.length;         };     return function(s2) {         var s2_len = s2.length, s1_idx, s2_idx = 0, a, b, c, c2, = 0;         if (s2_len === 0)             return s1_len;         …         return b;     }; } 

i repeated myself bit linearizing loop somewhat.

not sure whether gets lot faster, can omit 1 of arrays - not need read/write them in alternating manner:

function levenshtein(s1) {     var s1_len = s1.length, row = new array(s1_len);     if (s1_len === 0)         return function(s2) {             return s2.length;         };     return function(s2) {         var s2_len = s2.length, s1_idx, s2_idx = 0, a, b, c, c2, = 0;         if (s2_len === 0)             return s1_len;         while (i < s1_len)            row[i] = ++i;         while (s2_idx < s2_len) {             c2 = s2[s2_idx];             = s2_idx;             ++s2_idx;             b = s2_idx;             (s1_idx = 0; s1_idx < s1_len; ++s1_idx) {                 c = + (s1[s1_idx] === c2 ? 0 : 1);                 = row[s1_idx];                 b = b < ? (b < c ? b + 1 : c) : (a < c ? + 1 : c);                 row[s1_idx] = b;             }         }         return b;     }; } 

i don't think further optimisations can made without putting millions of words in dedicated data structure (e.g. prefix trie).


Comments

Popular posts from this blog

java - activate/deactivate sonar maven plugin by profile? -

python - TypeError: can only concatenate tuple (not "float") to tuple -

java - What is the difference between String. and String.this. ? -