/*---------------------------------------------------------------------- File : wld.c Contents: compute weighted Levenshtein distance Author : Christian Borgelt History : 26.06.1992 file created 28.11.1995 main function added, length restriction removed 24.10.1996 weights made function arguments 10.11.1997 minor improvements 04.03.1998 weights changed to floating point 08.04.1998 memory requirement significantly reduced ----------------------------------------------------------------------*/ #include #include #include #include /*---------------------------------------------------------------------- Preprocessor Definitions ----------------------------------------------------------------------*/ #define up(c) ((((c) >= 'a') && ((c) <= 'z')) ? (c) +'A'-'a' : (c)) /* --- error codes --- */ #define OK 0 /* no error */ #define E_NONE 0 /* no error */ #define E_NOMEM (-1) /* not enough memory */ #define E_OPTION (-2) /* illegal option */ #define E_ARGCNT (-3) /* wrong number of arguments */ #define E_WEIGHT (-4) /* illegal weight */ #define E_UNKNOWN (-5) /* unknown error */ /*---------------------------------------------------------------------- Constants ----------------------------------------------------------------------*/ const char *errmsgs[] = { /* error messages */ /* E_NONE */ "no error\n", /* E_NOMEM -1 */ "not enough memory\n", /* E_OPTION -2 */ "unknown option -%c\n", /* E_ARGCNT -3 */ "wrong number of arguments\n", /* E_WEIGHT -4 */ "illegal weight(s) (must be >= 0)\n", /* E_UNKNOWN -5 */ "unknown error\n", }; /*---------------------------------------------------------------------- Global Variables ----------------------------------------------------------------------*/ const char *prgname; /* program name for error messages */ /*---------------------------------------------------------------------- Functions ----------------------------------------------------------------------*/ float wld (const char *s1, const char *s2, float w_case, float w_subst, float w_ins, float w_del) { /* --- weighted Levenshtein distance */ int i; /* loop variable */ int len1, len2; /* length of strings s1 and s2 */ const char *s; /* to traverse string s2 */ float *tab; /* distance table */ float *d; /* to traverse the distance table */ float t, w; /* cumulated weights */ len1 = (int)strlen(s1); /* get the lengths of the strings */ len2 = (int)strlen(s2); /* to compare and check them */ if (len1 <= 0) return len2 *w_ins; if (len2 <= 0) return len1 *w_del; if (len1 < len2) { /* if the first string is shorter */ s = s1; s1 = s2; s2 = s; i = len1; len1 = len2; len2 = i; w = w_ins; w_ins = w_del; w_del = w; } /* exchange the two strings */ tab = (float*)malloc((len2+1) *sizeof(float)); if (!tab) return -1; /* allocate distance table */ d = tab; w = 0; /* traverse distance table */ for (i = len2 +1; --i >= 0; ){/* and initialize the weights */ *d++ = w; w += w_ins; } /* as for an empty string s2 */ for (i = 0; *s1; s1++) { /* traverse rows */ d = tab; w = ++i *w_del; /* compute new first element */ for (s = s2; *s; s++) { /* traverse columns */ t = *d; *d++ = w; /* compute the cumulated weight */ if (*s1 != *s) /* for a substitution */ t += (up(*s1) == up(*s)) ? w_case : w_subst; w += w_ins; /* compute the cumulated */ if (t < w) w = t; /* weight for an insertion */ t = *d +w_del; /* and the cumulated weight */ if (t < w) w = t; /* for a deletion and select the */ } /* minimum of the three weights */ *d = w; /* for the next table element */ } free(tab); /* delete distance table */ return w; /* return computed distance */ } /* wld() */ /*---------------------------------------------------------------------- The above function computes the weighted Levenshtein distance for the two strings s1 and s2 using as little memory as possible. To understand it, let us neglect the special implementation for a moment and look at the underlying idea first. It rests on the fact that the weighted Levenshtein distance can be computed using the recursive formula d(a_1...a_n,b_1...b_m) / m * w_ins, if n = 0, = | n * w_del, if m = 0, \ min { d(a_1...a_n ,b_1...b_{m-1}) + w_ins, d(a_1...a_{n-1},b_1...b_m ) + w_del, d(a_1...a_{n-1},b_1...b_{m-1}) + w(a_n,b_m)}, otherwise, where a_1 to a_n are the letters of the first string and b_1 to b_m are the letters of the second string, w_ins the weight of an insertion, w_del the weight of a deletion and / 0, if a = b, w(a,b) = | w_case, if uppercase(a) = uppercase(b), \ w_subst, otherwise, with w_case the weight of a case change and w_subst the weight of a substitution. This formula takes care of the fact that in order to turn an empty string into a string m characters long, it takes m insertions, and to turn a string n characters long into the empty string, it takes n deletions. To transform a string a_1...a_n into a string b_1...b_m, one can transform a_1...a_n into b_1...b_{m-1} and then append the letter b_m (insertion), one can delete a_n and transform a_1...a_{n-1} into b_1...b_m (deletion) or one can transform a_1...a_{n-1} to b_1...b_{m-1} and then replace a_n by b_m (substitution). This formula can easily be computed using a table. For example, let s1 = "black" and s2 = "brake" and the weights w_case = 0, w_ins = 1, w_subst = 2, w_del = 3. (This is a reasonable choice of weights, if the first string is the one searched for and second stems from, say, a text in which it is to be searched. Since a user is more likely to leave out a letter --- e.g. by not hitting a key hard enough --- or to misspell a word, the weights for an insertion and a substitution are less than the weight of a deletion.) Then the computation is carried out on a table that is (len(s1) +1) * (len(s2) +1) = 6 * 6 elements large. The ith row (i starting with 0) corresponds to the string s1 up to and including the ith letter and the kth column (k also starting with 0) corresponds to the string s2 up to and including the kth letter. | b r a k e | b r a k e --+------------------- --+------------------- | 0 1 2 3 4 5 | 0 1 2 3 4 5 b | 3 * b | 3 0 1 2 3 4 l | 6 l | 6 3 2 3 4 5 a | 9 a | 9 6 5 2 3 5 c | 12 c | 12 9 8 5 4 5 k | 15 k | 15 12 11 8 5 6 The table is initialized as shown in the left table above, i.e. the first row is initialized to k *w_ins and the first column to k *w_del, where k is the number of the row or column, respectively, starting with 0. For the first row, these are the weights of the operations to transform the empty string (the string "black" up to the 0th letter) into the empty string (the string "brake" up to the 0th letter), the string "b" (the string "brake" up to and including the first letter), the string "br" (the string "brake" up to and including the second letter) and so on. Since it obviously takes k insertions (where k is the column, starting with 0), the weight is k *w_ins. The columns of the first row are determined analogously, only w_del is used instead of w_ins, since the letters of the string "black" have to be deleted in order to turn it into the empty string. (The initialization corresponds to the first two parts of the recursive formula.) Computation starts at the table element marked with a `*' and proceeds row by row (or column by column). The value of an element is computed as follows: 1. Get the element to the left of the current element and add w_ins. (For the `*'-marked element this is 3 +1 = 4.) 2. Get the element above the current element and add w_del. (For the `*'-marked element this is 1 +3 = 4.) 3. Get the element above and to the left of the current element and add w(x,y), where x is the letter of the current row and y is the letter of the current column. (For the `*'-marked element this is 0 +w(`b', `b') = 0 +0 = 0.) 4. Determine the minimum of the three computed values and store it in the current element. (For the `*'-marked element, the minimum is 0, so a 0 is stored in the `*'-marked element.) (These steps correspond to the third part of the recursive formula.) While the table is not complete, go to the next element and repeat the steps until the table is filled. The exact order of the computations is not important as long as only those elements are computed for which the values needed in the three steps are already available. This can easily be achieved by a row by row or a column by column scheme. When the table has been completed, simply retrieve the value of the element in the lower right corner. This is the weighted Levenshtein distance of the two words. In the example above it is 6. The above implementation computes the table row by row, but it exploits the fact that the contents of a row is needed only to compute the next row. In addition, if a table element has been used to compute the cumulated weight of the next row for a deletion (step 2) and a substitution (step 3), it is not needed anymore. Hence we can do all the computations with only one table row and one additional variable, simply overwriting the elements after they have been used in the computations. In addition the function first determines the shorter string and places it at the horizontal direction of the table. In this way the minimal amount of memory is used. Note that, if the first string is shorter, we also have to exchange the weights w_ins and w_del, since the weighted Levenshtein distance is asymmetric (i.e. in general d(s1,s2) != d(s2,s1)) unless w_del = w_ins. ----------------------------------------------------------------------*/ void error (int code, ...) { /* --- print an error message */ const char *msg; /* error message */ va_list args; /* list of variable arguments */ if ((code > 0) || (code < E_UNKNOWN)) code = E_UNKNOWN; /* check index */ msg = errmsgs[-code]; /* get error message */ fprintf(stderr, "%s: ", prgname); va_start(args, code); /* get variable arguments, */ vfprintf(stderr, msg, args); /* print error message and */ va_end(args); /* end argument evaluation */ exit(code); /* abort programm */ } /* error() */ /*--------------------------------------------------------------------*/ int main (int argc, char *argv[]) { /* --- main function */ int i, k = 0; /* loop variables, counter */ char *s; /* to traverse options */ char *s1 = NULL, *s2 = NULL; /* strings to compare */ float w_case = 0.0F; /* weight of case change */ float w_subst = 2.0F; /* weight of substitution */ float w_ins = 1.0F; /* weight of insertion */ float w_del = 3.0F; /* weight of deletion */ float dist; /* weighted Levenshtein distance */ prgname = argv[0]; /* get program name for error msgs. */ /* --- print usage message --- */ if (argc <= 1) { /* if no arguments given */ printf("usage: %s [-c#] [-i#] [-s#] [-d#] s1 s2\n", argv[0]); printf("compute weighted Levenshtein distance\n"); printf("-c# weight of case change (default: %g)\n", w_case); printf("-s# weight of substitution (default: %g)\n", w_subst); printf("-i# weight of insertion (default: %g)\n", w_ins); printf("-d# weight of deletion (default: %g)\n", w_del); printf("s1, s2 strings to compare\n"); return 0; /* print usage message */ } /* and abort program */ /* --- evaluate arguments --- */ for (i = 1; i < argc; i++) { /* traverse arguments */ if (argv[i][0] == '-') { /* if argument is an option */ s = argv[i]+1; /* get pointer */ while (*s) { /* traverse options */ switch (*s++) { /* evaluate options */ case 'c': w_case = strtod(s, &s); break; case 's': w_subst = strtod(s, &s); break; case 'i': w_ins = strtod(s, &s); break; case 'd': w_del = strtod(s, &s); break; default : error(E_OPTION, *(--s)); break; } } } else { /* if argument is no option */ switch (k++) { /* evaluate non-options */ case 0: s1 = argv[i]; break; case 1: s2 = argv[i]; break; default: error(E_ARGCNT); break; } /* note strings to compare */ } } if (k < 2) error(E_ARGCNT); /* check arguments */ if ((w_case < 0) || (w_subst < 0) || (w_del < 0) || (w_ins < 0)) error(E_WEIGHT); /* check weights */ /* --- compute distance --- */ dist = wld(s1, s2, w_case, w_subst, w_ins, w_del); if (dist < 0) error(E_NOMEM); /* compute weighted Levenshtein */ printf("%g\n", dist); /* distance and print it */ return 0; /* return 'ok' */ } /* main() */ /*---------------------------------------------------------------------- For an explanation of the main function, take a look at the skeleton for a command line program `skeleton.c'. ----------------------------------------------------------------------*/