/*----------------------------------------------------------------------
  File    : wld.c
  Contents: compute weighted Levenshtein distance
  Author  : Christian Borgelt
  History : 26.06.1992 file created
            28.11.1995 main function added, length restriction removed
            24.10.1996 weights made function arguments
            10.11.1997 minor improvements
            04.03.1998 weights changed to floating point
            08.04.1998 memory requirement significantly reduced
----------------------------------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>

/*----------------------------------------------------------------------
  Preprocessor Definitions
----------------------------------------------------------------------*/
#define up(c)      ((((c) >= 'a') && ((c) <= 'z')) ? (c) +'A'-'a' : (c))

/* --- error codes --- */
#define OK           0          /* no error */
#define E_NONE       0          /* no error */
#define E_NOMEM    (-1)         /* not enough memory */
#define E_OPTION   (-2)         /* illegal option */
#define E_ARGCNT   (-3)         /* wrong number of arguments */
#define E_WEIGHT   (-4)         /* illegal weight */
#define E_UNKNOWN  (-5)         /* unknown error */

/*----------------------------------------------------------------------
  Constants
----------------------------------------------------------------------*/
const char *errmsgs[] = {       /* error messages */
  /* E_NONE       */  "no error\n",
  /* E_NOMEM   -1 */  "not enough memory\n",
  /* E_OPTION  -2 */  "unknown option -%c\n",
  /* E_ARGCNT  -3 */  "wrong number of arguments\n",
  /* E_WEIGHT  -4 */  "illegal weight(s) (must be >= 0)\n",
  /* E_UNKNOWN -5 */  "unknown error\n",
};

/*----------------------------------------------------------------------
  Global Variables
----------------------------------------------------------------------*/
const char *prgname;            /* program name for error messages */

/*----------------------------------------------------------------------
  Functions
----------------------------------------------------------------------*/

float wld (const char *s1, const char *s2,
           float w_case, float w_subst, float w_ins, float w_del)
{                               /* --- weighted Levenshtein distance */
  int        i;                 /* loop variable */
  int        len1, len2;        /* length of strings s1 and s2 */
  const char *s;                /* to traverse string s2 */
  float      *tab;              /* distance table */
  float      *d;                /* to traverse the distance table */
  float      t, w;              /* cumulated weights */

  len1 = (int)strlen(s1);       /* get the lengths of the strings */
  len2 = (int)strlen(s2);       /* to compare and check them */
  if (len1 <= 0) return len2 *w_ins;
  if (len2 <= 0) return len1 *w_del;
  if (len1 < len2) {            /* if the first string is shorter */
    s = s1;    s1    = s2;    s2    = s;
    i = len1;  len1  = len2;  len2  = i;
    w = w_ins; w_ins = w_del; w_del = w;
  }                             /* exchange the two strings */
  tab = (float*)malloc((len2+1) *sizeof(float));
  if (!tab) return -1;          /* allocate distance table */
  d = tab; w = 0;               /* traverse distance table */
  for (i = len2 +1; --i >= 0; ){/* and initialize the weights */
    *d++ = w; w += w_ins; }     /* as for an empty string s2 */
  for (i = 0; *s1; s1++) {      /* traverse rows */
    d = tab; w = ++i *w_del;    /* compute new first element */
    for (s = s2; *s; s++) {     /* traverse columns */
      t = *d; *d++ = w;         /* compute the cumulated weight */
      if (*s1 != *s)            /* for a substitution */
        t += (up(*s1) == up(*s)) ? w_case : w_subst;
      w += w_ins;               /* compute the cumulated */
      if (t < w) w = t;         /* weight for an insertion */
      t = *d +w_del;            /* and the cumulated weight */
      if (t < w) w = t;         /* for a deletion and select the */
    }                           /* minimum of the three weights */
    *d = w;                     /* for the next table element */
  }
  free(tab);                    /* delete distance table */
  return w;                     /* return computed distance */
}  /* wld() */

/*----------------------------------------------------------------------
The above function computes the weighted Levenshtein distance for the
two strings s1 and s2 using as little memory as possible. To understand
it, let us neglect the special implementation for a moment and look at
the underlying idea first. It rests on the fact that the weighted
Levenshtein distance can be computed using the recursive formula
  d(a_1...a_n,b_1...b_m)
    / m * w_ins,                                          if n = 0,
  = | n * w_del,                                          if m = 0,
    \ min { d(a_1...a_n    ,b_1...b_{m-1}) + w_ins,
            d(a_1...a_{n-1},b_1...b_m    ) + w_del,
            d(a_1...a_{n-1},b_1...b_{m-1}) + w(a_n,b_m)}, otherwise,
where a_1 to a_n are the letters of the first string and b_1 to b_m
are the letters of the second string, w_ins the weight of an insertion,
w_del the weight of a deletion and
           / 0,       if a = b,
  w(a,b) = | w_case,  if uppercase(a) = uppercase(b),
           \ w_subst, otherwise,
with w_case the weight of a case change and w_subst the weight of a
substitution. This formula takes care of the fact that in order to turn
an empty string into a string m characters long, it takes m insertions,
and to turn a string n characters long into the empty string, it takes
n deletions. To transform a string a_1...a_n into a string b_1...b_m,
one can transform a_1...a_n into b_1...b_{m-1} and then append the
letter b_m (insertion), one can delete a_n and transform a_1...a_{n-1}
into b_1...b_m (deletion) or one can transform a_1...a_{n-1} to
b_1...b_{m-1} and then replace a_n by b_m (substitution).
  This formula can easily be computed using a table. For example, let
s1 = "black" and s2 = "brake" and the weights w_case = 0, w_ins = 1,
w_subst = 2, w_del = 3. (This is a reasonable choice of weights, if
the first string is the one searched for and second stems from, say,
a text in which it is to be searched. Since a user is more likely to
leave out a letter --- e.g. by not hitting a key hard enough --- or to
misspell a word, the weights for an insertion and a substitution are
less than the weight of a deletion.) Then the computation is carried out
on a table that is (len(s1) +1) * (len(s2) +1) = 6 * 6 elements large.
The ith row (i starting with 0) corresponds to the string s1 up to and
including the ith letter and the kth column (k also starting with 0)
corresponds to the string s2 up to and including the kth letter.

     |     b  r  a  k  e               |     b  r  a  k  e
   --+-------------------            --+-------------------
     |  0  1  2  3  4  5               |  0  1  2  3  4  5
   b |  3  *                         b |  3  0  1  2  3  4
   l |  6                            l |  6  3  2  3  4  5
   a |  9                            a |  9  6  5  2  3  5
   c | 12                            c | 12  9  8  5  4  5
   k | 15                            k | 15 12 11  8  5  6

The table is initialized as shown in the left table above, i.e. the
first row is initialized to k *w_ins and the first column to k *w_del,
where k is the number of the row or column, respectively, starting
with 0. For the first row, these are the weights of the operations to
transform the empty string (the string "black" up to the 0th letter)
into the empty string (the string "brake" up to the 0th letter), the
string "b" (the string "brake" up to and including the first letter),
the string "br" (the string "brake" up to and including the second
letter) and so on. Since it obviously takes k insertions (where k is
the column, starting with 0), the weight is k *w_ins. The columns of
the first row are determined analogously, only w_del is used instead
of w_ins, since the letters of the string "black" have to be deleted in
order to turn it into the empty string. (The initialization corresponds
to the first two parts of the recursive formula.)
  Computation starts at the table element marked with a `*' and proceeds
row by row (or column by column). The value of an element is computed as
follows:
1. Get the element to the left of the current element and add w_ins.
   (For the `*'-marked element this is 3 +1 = 4.)
2. Get the element above the current element and add w_del.
   (For the `*'-marked element this is 1 +3 = 4.)
3. Get the element above and to the left of the current element and
   add w(x,y), where x is the letter of the current row and y is
   the letter of the current column. (For the `*'-marked element this
   is 0 +w(`b', `b') = 0 +0 = 0.)
4. Determine the minimum of the three computed values and store it
   in the current element. (For the `*'-marked element, the minimum
   is 0, so a 0 is stored in the `*'-marked element.)
(These steps correspond to the third part of the recursive formula.)
While the table is not complete, go to the next element and repeat the
steps until the table is filled. The exact order of the computations is
not important as long as only those elements are computed for which the
values needed in the three steps are already available. This can easily
be achieved by a row by row or a column by column scheme.
  When the table has been completed, simply retrieve the value of the
element in the lower right corner. This is the weighted Levenshtein
distance of the two words. In the example above it is 6.
  The above implementation computes the table row by row, but it
exploits the fact that the contents of a row is needed only to compute
the next row. In addition, if a table element has been used to compute
the cumulated weight of the next row for a deletion (step 2) and a
substitution (step 3), it is not needed anymore. Hence we can do all
the computations with only one table row and one additional variable,
simply overwriting the elements after they have been used in the
computations. In addition the function first determines the shorter
string and places it at the horizontal direction of the table. In this
way the minimal amount of memory is used. Note that, if the first
string is shorter, we also have to exchange the weights w_ins and
w_del, since the weighted Levenshtein distance is asymmetric (i.e.
in general d(s1,s2) != d(s2,s1)) unless w_del = w_ins.
----------------------------------------------------------------------*/

void error (int code, ...)
{                               /* --- print an error message */
  const char *msg;              /* error message */
  va_list    args;              /* list of variable arguments */

  if ((code > 0) || (code < E_UNKNOWN))
    code = E_UNKNOWN;           /* check index */
  msg = errmsgs[-code];         /* get error message */
  fprintf(stderr, "%s: ", prgname);
  va_start(args, code);         /* get variable arguments, */
  vfprintf(stderr, msg, args);  /* print error message and */
  va_end(args);                 /* end argument evaluation */
  exit(code);                   /* abort programm */
}  /* error() */

/*--------------------------------------------------------------------*/

int main (int argc, char *argv[])
{                               /* --- main function */
  int   i, k = 0;               /* loop variables, counter */
  char  *s;                     /* to traverse options */
  char  *s1 = NULL, *s2 = NULL; /* strings to compare */
  float w_case  = 0.0F;         /* weight of case change */
  float w_subst = 2.0F;         /* weight of substitution */
  float w_ins   = 1.0F;         /* weight of insertion */
  float w_del   = 3.0F;         /* weight of deletion */
  float dist;                   /* weighted Levenshtein distance */

  prgname = argv[0];            /* get program name for error msgs. */

  /* --- print usage message --- */
  if (argc <= 1) {              /* if no arguments given */
    printf("usage: %s [-c#] [-i#] [-s#] [-d#] s1 s2\n", argv[0]);
    printf("compute weighted Levenshtein distance\n");
    printf("-c#     weight of case change  (default: %g)\n", w_case);
    printf("-s#     weight of substitution (default: %g)\n", w_subst);
    printf("-i#     weight of insertion    (default: %g)\n", w_ins);
    printf("-d#     weight of deletion     (default: %g)\n", w_del);
    printf("s1, s2  strings to compare\n");
    return 0;                   /* print usage message */
  }                             /* and abort program */

  /* --- evaluate arguments --- */
  for (i = 1; i < argc; i++) {  /* traverse arguments */
    if (argv[i][0] == '-') {    /* if argument is an option */
      s = argv[i]+1;            /* get pointer */
      while (*s) {              /* traverse options */
        switch (*s++) {         /* evaluate options */
          case 'c': w_case  = strtod(s, &s); break;
          case 's': w_subst = strtod(s, &s); break;
          case 'i': w_ins   = strtod(s, &s); break;
          case 'd': w_del   = strtod(s, &s); break;
          default : error(E_OPTION, *(--s)); break;
        }
      } }
    else {                      /* if argument is no option */
      switch (k++) {            /* evaluate non-options */
        case  0: s1 = argv[i];    break;
        case  1: s2 = argv[i];    break;
        default: error(E_ARGCNT); break;
      }                         /* note strings to compare */
    }
  }
  if (k < 2) error(E_ARGCNT);   /* check arguments */
  if ((w_case < 0) || (w_subst < 0)
  ||  (w_del  < 0) || (w_ins   < 0))
    error(E_WEIGHT);            /* check weights */

  /* --- compute distance --- */
  dist = wld(s1, s2, w_case, w_subst, w_ins, w_del);
  if (dist < 0) error(E_NOMEM); /* compute weighted Levenshtein */
  printf("%g\n", dist);         /* distance and print it */
  return 0;                     /* return 'ok' */
}  /* main() */

/*----------------------------------------------------------------------
For an explanation of the main function, take a look at the skeleton
for a command line program `skeleton.c'.
----------------------------------------------------------------------*/