#!/bin/sh

#-----------------------------------------------------------------------
# prepare datasets
#-----------------------------------------------------------------------

# Domain descriptions (<name>.dom), one per dataset.  Tool diagnostics on
# stderr are discarded.  For audio the description is written to stdout
# ("-") so that every line mentioning "identifier" can be filtered out
# before the file is stored.
dom -s audio.tab - 2> /dev/null | sed "/identifier/d" > audio.dom
for ds in horse soybean vote; do
  dom -s "$ds.tab" "$ds.dom" 2> /dev/null
done

# Split each table into <name>0.tab and <name>1.tab with tsplit, passing
# the dataset's class column via -c.  Entries are "dataset:class-column".
# vote is not listed: it needs no splitting.
for pair in audio:class horse:outcome soybean:disease; do
  ds=${pair%%:*}
  cls=${pair#*:}
  tsplit "-c$cls" -t2 "$ds.tab" -o "$ds%d.tab" 2> /dev/null
done

#-----------------------------------------------------------------------
# induce and evaluate classifiers
#-----------------------------------------------------------------------
rm -f .nc

# sel_field KEY
#   Filter for the output of an induction/pruning tool, read from stdin.
#   For every line whose text before the first '[' matches the regular
#   expression KEY, print the number following the '[' minus one as an
#   8-character cell terminated by '#'.
#   NOTE(review): the "- 1" presumably discounts the class attribute --
#   confirm against the tools' messages.
#   If no line matches (e.g. the tool failed), a "?" cell is printed
#   instead, so the '#'-separated record keeps all of its field positions.
sel_field() {
  gawk -F'[' -v key="$1" '
    $1 ~ key { printf("%2d      #", $2-1); found = 1 }
    END      { if (!found) printf("%2s      #", "?") }'
}

# err_field END
#   Filter for the output of an execution tool, read from stdin.
#   For every line whose text before the first '(' contains "error", print
#   "<count>/<rate>%" followed by END, where <count> is the leading number
#   of that text and <rate> the number after the second '('.
#   END is '#' between fields and '\n' for the last field of a record
#   (awk expands the escape sequence in the -v assignment).
#   If no line matches, a "?" cell followed by END is printed instead, so
#   the record stays aligned and is always terminated.
err_field() {
  gawk -F'(' -v end="$1" '
    $1 ~ /error/ { printf("%2d/%4.1f%%%s", $1, $3, end); found = 1 }
    END          { if (!found) printf("%8s%s", "?", end) }'
}

# One '#'-separated record per dataset is appended to .nc.
for t in audio horse soybean vote; do
  trn="${t}0.tab"                       # table the classifiers are induced from
  tst="${t}1.tab"                       # table used only for evaluation

  # dataset name, attribute count, tuple counts (train, test)
  printf '%s#' "$t"
  # words in the header line minus 2
  # NOTE(review): presumably discounts the class column and one more
  # non-attribute column -- confirm against the table format.
  head -n 1 "$trn" | wc | gawk '{ printf("%2d#", $2-2); }'
  # line count minus 1 (the header line, as read by "head" above)
  wc "$trn" | gawk '{ printf("%3d#", $1-1); }'
  wc "$tst" | gawk '{ printf("%3d#", $1-1); }'

  #---------------------------------------------------------------------
  # npi/npx, options -sa and -sr: each run overwrites $t.npc

  npi -sa "$t.dom" "$trn" "$t.npc" 2>&1 | sel_field 'writing'
  npx "$t.npc" "$trn" 2>&1 | err_field '#'
  npx "$t.npc" "$tst" 2>&1 | err_field '#'

  npi -sr "$t.dom" "$trn" "$t.npc" 2>&1 | sel_field 'writing'
  npx "$t.npc" "$trn" 2>&1 | err_field '#'
  npx "$t.npc" "$tst" 2>&1 | err_field '#'

  #---------------------------------------------------------------------
  # bci/bcx, options -tsa and -tsr: each run overwrites $t.nbc

  bci -tsa "$t.dom" "$trn" "$t.nbc" 2>&1 | sel_field 'writing'
  bcx "$t.nbc" "$trn" 2>&1 | err_field '#'
  bcx "$t.nbc" "$tst" 2>&1 | err_field '#'

  bci -tsr "$t.dom" "$trn" "$t.nbc" 2>&1 | sel_field 'writing'
  bcx "$t.nbc" "$trn" 2>&1 | err_field '#'
  bcx "$t.nbc" "$tst" 2>&1 | err_field '#'

  #---------------------------------------------------------------------
  # dti grows $t.dt, dtp prunes it into $t.pdt, dtx evaluates either one

  dti "$t.dom" "$trn" "$t.dt" 2>&1 | sel_field 'growing '
  dtx "$t.dt" "$trn" 2>&1 | err_field '#'
  dtx "$t.dt" "$tst" 2>&1 | err_field '#'

  dtp "$t.dt" "$t.pdt" "$trn" 2>&1 | sel_field 'pruning '
  dtx "$t.pdt" "$trn" 2>&1 | err_field '#'
  dtx "$t.pdt" "$tst" 2>&1 | err_field '\n'   # last field ends the record

done >> .nc

#-----------------------------------------------------------------------
# show results
#-----------------------------------------------------------------------

#  1: name of dataset
#  2: number of attributes
#  3: number of tuples (train)
#  4: number of tuples (test)
#  5: number of selected attributes (poss., add.)
#  6: errors (poss., add., train)
#  7: errors (poss., add., test)
#  8: number of selected attributes (poss., rem.)
#  9: errors (poss., rem., train)
# 10: errors (poss., rem., test)
# 11: number of selected attributes (Bayes, add.)
# 12: errors (Bayes, add., train)
# 13: errors (Bayes, add., test)
# 14: number of selected attributes (Bayes, rem.)
# 15: errors (Bayes, rem., train)
# 16: errors (Bayes, rem., test)
# 17: number of selected attributes (tree, unpruned)
# 18: errors (tree, unpruned, train)
# 19: errors (tree, unpruned, test)
# 20: number of selected attributes (tree, pruned)
# 21: errors (tree, pruned, train)
# 22: errors (tree, pruned, test)

# Print the result table: a two-line heading, then three lines per record
# of .nc (train row, test row, attribute row).
gawk -F'#' '
# Join fields first, first+3, first+6, ... (none beyond field 22) with two
# blanks between neighbours.
function cells(first,    i, sep, out) {
  for (i = first; i <= 22; i += 3) {
    out = out sep $i
    sep = "  "
  }
  return out
}
BEGIN {
  print "dataset              naive poss. class.  " \
        "naive Bayes class.  decision tree"
  print "             tuples  add.      rem.      " \
        "add.      rem.      unpruned  pruned"
}
{
  printf("%-7s  train  %s\n", $1, cells(3))
  printf("         test   %s\n", cells(4))
  printf("%s atts  sel         %s\n", $2, cells(5))
}
' .nc

#-----------------------------------------------------------------------
# clean up
#-----------------------------------------------------------------------

# Generated for every dataset: domain description, both classifier files
# (.nbc/.npc) and both decision trees.
for ds in audio horse soybean vote; do
  rm -f "$ds.dom" "$ds".n[bp]c "$ds.dt" "$ds.pdt"
done
# Split tables exist only for the datasets that were split above;
# vote0.tab / vote1.tab are not created by this script and are left alone.
rm -f audio[01].tab horse[01].tab soybean[01].tab
# Intermediate result records.
rm -f .nc

