/*
 * mp_mul.c
 *
 * This code is in the public domain. I would appreciate bug reports and
 * enhancements.
 *
 * Duncan S Wong <swong@ieee.org>
 *
 * Dec 14, 2000 - Initial Version
 */
#include <PalmOS.h>
#include "mp.h"
#include "mp_priv.h"

// Set r to a * b.
//
// This is Algorithm 14.12 on p.595 of HAC: the row a * b[0] is written
// straight into r, then every further row a * b[i] is added into r one
// digit position higher than the row before it.
//
// Returns 1 on success, or 0 if MP_alloc() fails to provide room for the
// product (r is then left as MP_alloc() left it).
//
// Note: r must be different from a and b, because r->d is written while
//       a->d and b->d are still being read.
Int16 MP_mul(INT *r, INT *a, INT *b)
{
Int16 i, rlen, al, bl;
DIGIT *ap, *bp, *rp;

	al = a->top;
	bl = b->top;
	if ((al == 0) || (bl == 0)) {
		// A zero operand gives a zero product. Clear the sign too, so that
		// r does not keep a stale flag from whatever it held before.
		r->top = 0;
		r->neg = 0;
		return(1);
	}

	// An al-digit number times a bl-digit number needs at most al+bl digits.
	rlen = al + bl;
	if (MP_alloc(r, rlen*DIGIT_BITS) == NULL) return(0);
	r->top = rlen;
	r->neg = a->neg ^ b->neg;  // signs differ -> negative product
	ap = a->d;
	bp = b->d;
	rp = r->d;

	// First row overwrites r[0..al-1]; its carry becomes digit r[al].
	rp[al] = MP_mul_digit(rp, ap, al, *(bp++));
	rp++;

	// Each remaining row is accumulated one digit further up. rp[al] has not
	// been written yet at this point, so the row's carry is simply stored.
	for (i = 1; i < bl; i++) {
		rp[al] = MP_mul_add_digit(rp, ap, al, *(bp++));
		rp++;
	}

	// Like MP_fix_top, but only the most significant digit is checked.
	// NOTE(review): this relies on a and b having non-zero top digits,
	// so that at most one leading digit of the product is zero - confirm.
	if (r->d[rlen-1] == 0) r->top--;

	return(1);
}


// Set rp to ap * w where ap has num digits and w is only one digit.
// The original value pointed to by rp is overwritten.
//
// Returns the carry out of the most significant digit, or 0 without
// touching rp when num <= 0.
//
// e.g. ap = 95, num = 2 and w = 6 in radix 10;
//      rp = 70 and 5 is returned.
//
// The per-digit work is done by mul(), which is defined outside this file;
// as used here it consumes and updates the running carry.
DIGIT MP_mul_digit(DIGIT *rp, DIGIT *ap, Int16 num, DIGIT w)
{
DIGIT carry = 0;

	// Nothing to multiply. Without this guard the countdown below would step
	// past zero and keep reading and writing beyond the end of both arrays.
	if (num <= 0) return(0);

	// Why not a plain loop handling one digit per pass? Processing four
	// digits per pass is faster because it removes most of the loop
	// branching; the countdown after each digit lets the loop stop at any
	// length.
	for (;;) {
		mul(rp[0], ap[0], w, carry);
		if (--num == 0) break;
		mul(rp[1], ap[1], w, carry);
		if (--num == 0) break;
		mul(rp[2], ap[2], w, carry);
		if (--num == 0) break;
		mul(rp[3], ap[3], w, carry);
		if (--num == 0) break;
		ap += 4;
		rp += 4;
	}
	return(carry);
}


// Set rp to rp + (ap * w) where ap has num digits and w is only one digit.
// The original value pointed to by rp is overwritten with the sum.
//
// Returns the carry out of the most significant digit, or 0 without
// touching rp when num <= 0.
//
// e.g. rp = 99, ap = 95, num = 2 and w = 6 in radix 10;
//      rp = 69 and 6 is returned.
//
// The per-digit work is done by mul_add(), which is defined outside this
// file; as used here it consumes and updates the running carry.
DIGIT MP_mul_add_digit(DIGIT *rp, DIGIT *ap, Int16 num, DIGIT w)
{
DIGIT carry = 0;

	// Nothing to accumulate. Without this guard the countdown below would
	// step past zero and keep reading and writing beyond the end of both
	// arrays.
	if (num <= 0) return(0);

	// Four digits per pass, with a countdown after each digit so the loop
	// can stop at any length (same layout as MP_mul_digit).
	for (;;) {
		mul_add(rp[0], ap[0], w, carry);
		if (--num == 0) break;
		mul_add(rp[1], ap[1], w, carry);
		if (--num == 0) break;
		mul_add(rp[2], ap[2], w, carry);
		if (--num == 0) break;
		mul_add(rp[3], ap[3], w, carry);
		if (--num == 0) break;
		ap += 4;
		rp += 4;
	}

	return(carry);
}


// Set r to a*a.
//
// This is Algorithm 14.16 on p.597 of HAC. It should be almost twice
// the speed of MP_mul(r, a, a) when the machine supports TETRA_DIGIT.
// Currently only EIGHT_BIT and SIXTEEN_BIT support TETRA_DIGIT.
// Otherwise, this algorithm may be SLOWER than MP_mul(r, a, a)!
//
// Returns 1 on success, or 0 if MP_alloc() fails to provide room for the
// result.
//
// Note : - r must not be a.
//        - It gives 23% improvement over MP_mul(r, a, a) running on a Palm III.
//        - It takes 199msec to compute the square of a 1024-bit integer when
//          DIGIT is (unsigned char).
Int16 MP_sqr(INT *r, INT *a)
{
Int16 i, j, rlen, al;
DIGIT *ap, *rp;
DOUBLE_DIGIT carry;

	al = a->top;
	if (al == 0) {
		// 0 * 0 = 0. Clear the sign too, so that r does not keep a stale
		// flag from whatever it held before.
		r->top = 0;
		r->neg = 0;
		return(1);
	}

	// The square needs 2*al digits, plus one scratch digit because the loop
	// below stores to rp[i+al+1], i.e. up to index 2*al.
	// The parentheses matter: '+' binds tighter than '<<', so the
	// unparenthesised form "al << 1 + 1" means al << 2 (four times al).
	rlen = (al << 1) + 1;
	if (MP_alloc(r, rlen*DIGIT_BITS) == NULL) return(0);
	r->top = rlen;
	r->neg = 0;  // a square is never negative
	ap = a->d;
	rp = r->d;

	// The rows below accumulate into r, so it must start out as zero.
	// A plain loop is 'slightly' faster than memset here.
	for (i = 0; i < rlen; i++) rp[i] = 0;

	for (i = 0; i < al; i++) {
		// Diagonal term: rp[2i] + ap[i]^2. The old digit is fed in through
		// carry, which mul() consumes and replaces with the carry out.
		carry = (DOUBLE_DIGIT)rp[i<<1];
		mul(rp[i<<1], ap[i], ap[i], carry);

		// Off-diagonal terms; each product ap[i]*ap[j] occurs twice in the
		// square, which is why carry can outgrow a single digit.
		for (j = i+1; j < al; j++) {
			mul_add_double(rp[i+j], ap[i], ap[j], carry);
		}

		// rp[i+al] may already hold the high carry digit stored by the
		// previous row. Fold it into the wide carry BEFORE splitting, so an
		// overflow of that digit is propagated into rp[i+al+1] instead of
		// being dropped by a DIGIT-sized addition.
		// NOTE(review): assumes Ldigit()/Hdigit() yield the low/high DIGIT
		// of a DOUBLE_DIGIT - confirm against mp_priv.h.
		carry += rp[i+al];
		rp[i+al] = Ldigit(carry);
		rp[i+al+1] = Hdigit(carry);
	}

	// Drop the leading zero digit(s); the scratch digit always ends up zero.
	MP_fix_top(r);

	return(1);
}
