#|
How to use this program:

(load "...-data")
(init-decision-tree)
(train-decision-tree)
(test-decision-tree ...)

(show-decision-tree)

No provision made for numerical range data.  Every attribute assumed to
have discrete values.
|#

(defstruct node
  coordinate				;number of coordinate to test on
  (value-node-alist nil)		;when nil, means a leaf node
  count					;nbr of samples here and below
  class					;only used for leaf nodes
  )

(defvar *decision-tree-data*)
(defvar *decision-tree*)

;;; User routines.

(defun init-decision-tree ()
  (convert-data-to-decision-tree-form))

(defun train-decision-tree ()
  (setq *decision-tree*
	(build-decision-tree
	  *decision-tree-data*
	  (reverse (countdown (length *input-ranges*)))))
  (format t "~%Decision tree built having")
  (show-decision-tree-stats *decision-tree*)
  (values))

(defun test-decision-tree (input-att-vec)
  (when (check-att-vector input-att-vec)
	 (tally-and-sort-scores
	  (get-class-from-tree input-att-vec *decision-tree*))))

(defun show-decision-tree (&optional (node *decision-tree*) (level 0))
  (format t "~%~a~a training example~:p"
		(indent level)
		(node-count node))
  (if (node-value-node-alist node)
      (progn				;internal node
	(format t "~%~aSplit on attribute number ~a:"
		(indent level)
		(node-coordinate node))
	(mapcar #'(lambda (val-node-pair)
		    (format t "~%  ~aIf value = ~a"
			    (indent level)
			    (first val-node-pair))
		    (show-decision-tree (second val-node-pair) (1+ level)))
		(node-value-node-alist node)))
    (format t "~%~aClass = ~a"		;leaf node
	    (indent level)
	    (node-class node)))
  (when (zerop level) (format t "~%") (show-decision-tree-stats node))
  (values))

;;; auxiliaries

(defun convert-data-to-decision-tree-form ()
  (if (> (length *output-ranges*) 1)
      (format t "~%Only using output attribute 0."))
  (setq *decision-tree-data*		;use first output attribute only
	(mapcar #'list			; (assumed discrete)
		(mapcar #'first *training-data*)
		(mapcar #'(lambda (pair) (list (first (second pair))))
			*training-data*)))
  'done)

(defun classs (pair)
  (first (second pair)))

(defun input-attribute (i pair)
  (nth i (first pair)))

(defun build-decision-tree (pairs att-nums)
  (let ((this-node (make-node)))
    ;; Fill in the count slot
    (setf (node-count this-node) (length pairs))
    ;; If all pairs have same class
    (cond ((every #'(lambda (p) (eql (classs p) (classs (first pairs))))
		  pairs)
	   ;; Store this value in node
	   (setf (node-class this-node) (classs (first pairs))))
	  ;; Else if no attributes to split on
	  ((null att-nums)
	   ;; Store a summary of all classes present in data at node
	   (setf (node-class this-node)
		 (tally-and-sort-items (mapcar #'classs pairs))))
	  ;; Else
	  (t
	   ;; Find the best coordinate to split on and group the
	   ;;  data according to the values along this attribute
	   (let* ((best-split-att (get-best-split-att pairs att-nums))
		  (vals (remove-duplicates
			 (mapcar #'(lambda (p)
				     (input-attribute best-split-att p))
				 pairs)))
		  (grouped-pairs
		   (mapcar
		    #'(lambda (v)
			(remove-if-not
			 #'(lambda (p)
			     (eql v (input-attribute best-split-att  p)))
			 pairs))
		    vals)))
	     ;; Store the split coordinate in node
	     (setf (node-coordinate this-node) best-split-att)
	     ;; Update remaining attribute numbers for future splits
	     (setf att-nums (remove best-split-att att-nums))
	     ;; Store a-list of value/pointer pairs in node
	     (setf (node-value-node-alist this-node)
		   (mapcar
		    #'(lambda (val g-pair)
			(list val (build-decision-tree g-pair att-nums)))
		    vals
		    grouped-pairs)))))
    ;; Return the node
    this-node
    ))

(defun get-best-split-att (pairs att-nums)
  (let* ((scores (mapcar #'(lambda (i) (avg-disorder i pairs))
			 att-nums))
	 (best-score (reduce #'min scores)))
    (nth (position best-score scores) att-nums)))

(defun avg-disorder (i pairs)
  (let* ((ith-vals (mapcar #'(lambda (p) (input-attribute i p)) pairs))
	 (classes (mapcar #'classs pairs))
	 (val-counts (tally-items ith-vals))
	 (val-class-counts
	  (tally-items (mapcar #'list ith-vals classes) :test #'equal)))
    (/ (reduce
	#'+
	(mapcar #'(lambda (vcc)
		    (let* ((val (first (first vcc)))
			   (val-count (float (second (assoc val val-counts))))
			   (val-class-count (float (second vcc))))
		      (* val-class-count
			 (log (/ val-count val-class-count) 2))))
		val-class-counts))
       (length pairs))))

(defun countdown (n)		 ;returns a list of the form (n-1 n-2 ... 1 0)
  (when (> n 0)
	(cons (1- n) (countdown (1- n)))))

(defun indent (depth) (make-string (* 2 depth) :initial-element #\space))

(defun show-decision-tree-stats (node)
  (let ((stats (dt-stats node)))
    (format t "~%~a internal node~:p, ~a leaf node~:p, and height ~a~%~%"
	    (first stats) (second stats) (third stats))))
   
(defun dt-stats (node) ;returns list (# internal nodes, # leaf nodes, height)
  (if (leaf-node-p node) '(0 1 0)
    (let* ((children (mapcar #'second (node-value-node-alist node)))
	   (lists (mapcar #'dt-stats children))
	   (i-node-counts (mapcar #'first lists))
	   (l-node-counts (mapcar #'second lists))
	   (heights (mapcar #'third lists)))
      (list (1+ (reduce #'+ i-node-counts))
	    (reduce #'+ l-node-counts)
	    (1+ (reduce #'max heights))))))

(defun leaf-node-p (node)
  (null (node-value-node-alist node)))

;;; Branches are built only for values actually present in data.
;;; This raises the question of how to handle a situation when a
;;; value occurs in a test input that requires a missing branch.
;;; This code handles it by collecting the decisions that would
;;; be made by going down all the branches present at that node

(defun get-class-from-tree (att-vec node)
  (if (node-value-node-alist node)	;internal node
      (progn
	(format t "~%Testing attribute ~a:  Value = ~a"
		(node-coordinate node)
		(nth (node-coordinate node) att-vec))
	(let ((next-node (second (assoc (nth (node-coordinate node) att-vec)
					(node-value-node-alist node)))))
	  (if (node-p next-node)
	      (get-class-from-tree att-vec next-node)
	    (progn
	      (format t "~%No branch for value ~a~
                         ~%   -- collecting results for all branches."
		      (nth (node-coordinate node) att-vec))
	      (reduce #'append
		      (mapcar #'(lambda (vn)
				  (get-class-from-tree att-vec (second vn)))
			      (node-value-node-alist node)))))))
    (progn				;leaf node
      (format t "~%Class = ~a" (node-class node))
      (if (listp (node-class node))
	  (node-class node)
	(list (list (node-class node) (node-count node)))))
    ))
