/*
 *   nnls.c  (c) 2003-2009 Turku PET Centre
 *   This file contains the routine NNLS (nonnegative least squares)
 *   and the subroutines required by it, except h12, which is in
 *   file 'lss_h12.c'.
 *   
 *  This routine is based on the text and fortran code in
 *  C.L. Lawson and R.J. Hanson, Solving Least Squares Problems,
 *  Prentice-Hall, Englewood Cliffs, New Jersey, 1974.
 *  Version:
 *  2002-08-19 Vesa Oikonen
 *  2003-05-08 Kaisa Sederholm & VO
 *  Included function nnlsWght().
 *  2003-05-12 KS
 *  Variable a_dim1 excluded
 *  
 *  Usage of the coefficient matrix altered so that it is
 *  given in a[][] instead of a[].
 *  2003-11-06 VO
 *  If n<2, then itmax is set to n*n instead of previous n*3.
 *  2004-09-17 VO
 *  Doxygen style comments.
 *  2006-24-04 Pauli Sundberg
 *  Added some debuging output, and made some comments more precise.
 *  2007-05-17 VO
 *  2009-04-16 VO
 *  Corrected a bug in nnls() which may have caused an infinite loop.
 *  009-04-27 VO
 *  Added function nnlsWghtSquared() for faster pixel-by-pixel calculations.
 *  Checking for exceeding iteration count is corrected in nnls().
 */ 
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <stdlib.h>
#include <math.h>
#define MAX(a,b) ((a) >= (b) ? (a) : (b))
#define ABS(x) ((x) >= 0 ? (x) : -(x))

int64_t h12( int64_t mode, int64_t lpivot, int64_t l1, int64_t m, double *u, int64_t u_dim1, double *up, double *cm, int64_t ice, int64_t icv, int64_t ncv) {
  double d1,  b, clinv, cl, sm;
  int64_t k, j;

  /* Check parameters */
  if (mode!=1 && mode!=2) 
    return(1);
  if (m<1 || u==NULL || u_dim1<1 || cm==NULL) 
    assert(0);
  //     return(1);
  if (lpivot<0 || lpivot>=l1 || l1>m) 
    //     assert(0);
    return(1);

  /* Function Body */
  cl = ABS( u[lpivot*u_dim1] );
  // cl= (d1 = u[lpivot*u_dim1], fabs(d1));

  if (mode==2) 
  { /* Apply transformation I+U*(U**T)/B to cm[] */
    if(cl<=0.) 
      //       assert(0);
      return(0);
  } 
  else 
  {   /* Construct the transformation */


    /* This is the way provided in the original pseudocode
       sm = 0;
       for (j = l1; j < m; j++)
       {
       d1 =  u[j * u_dim1];
       sm += d1*d1;
       }
       d1 = u[lpivot * u_dim1];
       sm += d1*d1;
       sm = sqrt(sm);

       if (u[lpivot*u_dim1] > 0) 
       sm=-sm;

       up[0] = u[lpivot*u_dim1] - sm; 
       u[lpivot*u_dim1]=sm;
       printf("Got sum: %f\n",sm);
       */

    /* and this trying to compensate overflow */
    for (j=l1; j<m; j++) 
    {  // Computing MAX 
      cl = MAX( ABS( u[j*u_dim1] ), cl );
    }
    // zero vector?   

    if (cl<=0.) 
      return(0);

    clinv=1.0/cl;

    // Computing 2nd power 
    d1=u[lpivot*u_dim1]*clinv; 
    sm=d1*d1;

    for (j=l1; j<m; j++) 
    {
      d1=u[j*u_dim1]*clinv; 
      sm+=d1*d1;
    }
    cl *= sqrt(sm); 

    if (u[lpivot*u_dim1] > 0.) 
      cl=-cl;
    up[0] = u[lpivot*u_dim1] - cl; 
    u[lpivot*u_dim1]=cl;
  }

  // no vectors where to apply? only change pivot vector! 
  b=up[0] * u[lpivot*u_dim1];

  /* b must be nonpositive here; if b>=0., then return */
  if (b == 0) 
    return(0);

  // ok, for all vectors we want to apply
  for (j =0; j < ncv; j++) {
    sm = cm[ lpivot * ice + j * icv ] * (up[0]);

    for (k=l1; k<m; k++) 
      sm += cm[ k * ice + j*icv ] * u[ k*u_dim1 ]; 

    if (sm != 0.0) {
      sm *= (1/b); 
      // cm[lpivot, j] = ..
      cm[ lpivot * ice + j*icv] += sm * (up[0]);
      for (k= l1; k<m; k++) 
      {
        cm[ k*ice + j*icv] += u[k * u_dim1]*sm;
      }
    }
  }

  return(0);
} 


void g1(double a, double b, double *cterm, double *sterm, double *sig)
{
  double d1, xr, yr;

  if( fabs(a) > fabs(b) ) {
    xr = b / a;
    d1 = xr;
    yr = sqrt(d1*d1 + 1.);
    d1 = 1./yr;
    *cterm=(a>=0.0 ? fabs(d1) : -fabs(d1));
    *sterm=(*cterm)*xr;
    *sig=fabs(a)*yr;
  } else if( b != 0.) {
    xr = a / b;
    d1 = xr;
    yr = sqrt(d1 * d1 + 1.);
    d1 = 1. / yr;
    *sterm=(b>=0.0 ? fabs(d1) : -fabs(d1));
    *cterm=(*sterm)*xr; *sig=fabs(b)*yr;
  } else {
    *sig=0.; *cterm=0.; *sterm=1.;
  }
} 


int64_t nnls_algorithm(double *a, int64_t m,int64_t n, double *b, double *x, double *rnorm) {
  int64_t pfeas;
  int ret=0;
  int64_t iz;
  int64_t jz;
  int64_t k, j=0, l, itmax, izmax=0, ii, jj=0, ip;
  double d1, d2, sm, up, ss; 
  double temp, wmax, t, alpha, asave, dummy, unorm, ztest, cc;


  /* Check the parameters and data */
  if(m <= 0 || n <= 0 || a == NULL || b == NULL || x == NULL) 
    return(2);

  /* Allocate memory for working space, if required */
  double *w = (double*)calloc(n, sizeof(double));
  double *zz = (double*)calloc(m, sizeof(double));
  int64_t *index = (int64_t*)calloc(n, sizeof(int64_t));
  if(w == NULL || zz == NULL || index == NULL) 
    return(2);

  /* Initialize the arrays INDEX[] and X[] */
  for(k=0; k<n; k++) {
    x[k]=0.;
    index[k]=k;
  }

  int64_t iz2 = n - 1;
  int64_t iz1 = 0;
  int64_t iter=0; 
  int64_t nsetp=0;
  int64_t npp1=0;

  /* Main loop; quit if all coeffs are already in the solution or */
  /* if M cols of A have been triangularized */
  if(n < 3) 
    itmax=n*3;
  else 
    itmax=n*n;
 

  while(iz1 <= iz2 && nsetp < m) {
    /* Compute components of the dual (negative gradient) vector W[] */
    for(iz=iz1; iz<=iz2; iz++) {
      j=index[iz];
      sm=0.;
      for(l=npp1; l<m; l++) 
        sm+=a[j*m + l]*b[l];
      w[j]=sm;
    }

    while(1) {
      /* Find largest positive W[j] */
      for(wmax=0., iz=iz1; iz<=iz2; iz++) {
        j=index[iz]; if(w[j]>wmax) {wmax=w[j]; izmax=iz;}}

      /* Terminate if wmax<=0.; */
      /* it indicates satisfaction of the Kuhn-Tucker conditions */
      if(wmax<=0.0)
        break;

      iz=izmax; 
      j=index[iz];

      /* The sign of W[j] is ok for j to be moved to set P. */
      /* Begin the transformation and check new diagonal element to avoid */
      /* near linear dependence. */
      asave=a[j*m + npp1];
      h12(1, npp1, npp1+1, m, &a[j*m +0], 1, &up, &dummy, 1, 1, 0);
      unorm=0.;
      if(nsetp!=0){
        for(l=0; l<nsetp; l++) {
          d1=a[j*m + l]; 
          unorm+=d1*d1;
        }
      }

      unorm=sqrt(unorm);
      d2=unorm+(d1=a[j*m + npp1], fabs(d1)) * 0.01;
      if((d2-unorm)>0.) {
        /* Col j is sufficiently independent. Copy B into ZZ, update ZZ */
        /* and solve for ztest ( = proposed new value for X[j] ) */
        for(l=0; l<m; l++) zz[l]=b[l];
        h12(2, npp1, npp1+1, m, &a[j*m + 0], 1, &up, zz, 1, 1, 1);
        ztest=zz[npp1]/a[j*m +npp1];
        /* See if ztest is positive */
        if(ztest>0.) break;
      }

      /* Reject j as a candidate to be moved from set Z to set P. Restore */
      /* A[npp1,j], set W[j]=0., and loop back to test dual coeffs again */
      a[j*m+ npp1]=asave; w[j]=0.;
    } /* while(1) */

    if(wmax<=0.0)
      break;

    /* Index j=INDEX[iz] has been selected to be moved from set Z to set P. */
    /* Update B and indices, apply householder transformations to cols in */
    /* new set Z, zero subdiagonal elts in col j, set W[j]=0. */
    for(l=0; l<m; ++l) 
      b[l]=zz[l];

    index[iz]=index[iz1];
    index[iz1]=j;
    iz1++;
    npp1++;
    nsetp=npp1;

    if(iz1<=iz2) {
     for(jz=iz1; jz<=iz2; jz++) {
        jj=index[jz];
        h12(2, nsetp-1, npp1, m, &a[j*m +0], 1, &up, &a[jj*m +0], 1, m, 1);
      }
    }

    if(nsetp!=m) {
      for(l=npp1; l<m; l++) 
        a[j*m +l]=0.;
    }

    w[j]=0.;

    /* Solve the triangular system; store the solution temporarily in Z[] */
    for(l=0; l<nsetp; l++) {
      ip=nsetp-(l+1);
      if(l!=0) for(ii=0; ii<=ip; ii++) zz[ii]-=a[jj*m + ii]*zz[ip+1];
      jj=index[ip]; zz[ip]/=a[jj*m +ip];
    }

    /* Secondary loop begins here */
    while(++iter < itmax) {
      /* See if all new constrained coeffs are feasible; if not, compute alpha */
      for(alpha = 2.0, ip = 0; ip < nsetp; ip++) {
        l=index[ip];
        if(zz[ip]<=0.) {
          t = -x[l]/(zz[ip]-x[l]);
          if(alpha > t) {
            alpha = t; 
            jj = ip - 1;
          }
        }
      }

      /* If all new constrained coeffs are feasible then still alpha==2. */
      /* If so, then exit from the secondary loop to main loop */
      if(alpha==2.0) 
        break;

      /* Use alpha (0.<alpha<1.) to interpolate between old X and new ZZ */
      for(ip=0; ip<nsetp; ip++) {
        l = index[ip];
        x[l] += alpha*(zz[ip]-x[l]);
      }

      /* Modify A and B and the INDEX arrays to move coefficient i */
      /* from set P to set Z. */
      k=index[jj+1]; pfeas=1;
      do {
        x[k]=0.;
        if(jj!=(nsetp-1)) {
          jj++;
          for(j=jj+1; j<nsetp; j++) {
            ii=index[j]; index[j-1]=ii;
            g1(a[ii*m + (j-1)], a[ii*m + j], &cc, &ss, &a[ii*m + j-1]);
            for(a[ii*m + j]=0., l=0; l<n; l++) if(l!=ii) {
              /* Apply procedure G2 (CC,SS,A(J-1,L),A(J,L)) */
              temp=a[l*m + j-1];
              a[l*m + j-1]=cc*temp+ss*a[l*m + j];
              a[l*m + j]=-ss*temp+cc*a[l*m + j];
            }
            /* Apply procedure G2 (CC,SS,B(J-1),B(J)) */
            temp=b[j-1]; b[j-1]=cc*temp+ss*b[j]; b[j]=-ss*temp+cc*b[j];
          }
        }
        npp1=nsetp-1; nsetp--; iz1--; index[iz1]=k;

        /* See if the remaining coeffs in set P are feasible; they should */
        /* be because of the way alpha was determined. If any are */
        /* infeasible it is due to round-off error. Any that are */
        /* nonpositive will be set to zero and moved from set P to set Z */
        for(jj=0, pfeas=1; jj<nsetp; jj++) {
          k=index[jj]; if(x[k]<=0.) {pfeas=0; break;}
        }
      } while(pfeas==0);

      /* Copy B[] into zz[], then solve again and loop back */
      for(k=0; k<m; k++) 
        zz[k]=b[k];
      for(l=0; l<nsetp; l++) {
        ip=nsetp-(l+1);
        if(l!=0) for(ii=0; ii<=ip; ii++) zz[ii]-=a[jj*m + ii]*zz[ip+1];
        jj=index[ip]; zz[ip]/=a[jj*m + ip];
      }
    } /* end of secondary loop */

    if(iter>=itmax) {
      ret = 1;
      break;
    }

    for(ip=0; ip<nsetp; ip++) {
      k=index[ip];
      x[k]=zz[ip];
      }
  } /* end of main loop */

  /* Compute the norm of the final residual vector */
  sm=0.;

  if (rnorm != NULL) {
    if (npp1<m) 
      for (k=npp1; k<m; k++) 
        sm+=(b[k] * b[k]);
    else 
      for (j=0; j<n; j++) 
        w[j]=0.;
    *rnorm=sqrt(sm);
  } 

  /* Free working space, if it was allocated here */
  free(w);
  free(zz);
  free(index);
  return(ret);
}
/* nnls_ */


double *nnls(double *a_matrix, double *b_matrix, int64_t height, int64_t width) {

  double *solution = (double*)calloc(height, sizeof(double));

  if(solution == NULL) {
    fprintf(stderr, "could not allocate enough memory for nnls\n");
    exit(EXIT_FAILURE);
  }

  int ret = nnls_algorithm(a_matrix, width, height, b_matrix, solution, NULL);
  if(ret == 1) {
    printf("NNLS has reached the maximum iterations\n");
  } else if(ret == 2) {
    fprintf(stderr, "NNLS could not allocate enough memory\n");
    exit(EXIT_FAILURE);
  }

  return solution;
}