/*
 *  hmm.c
 *
 *  Created by Mark Levy on 12/02/2006.
 *  Copyright 2006 Centre for Digital Music, Queen Mary, University of London.

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version.  See the file
    COPYING included with this distribution for more information.
 *
 */

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <float.h>
#include <time.h>				/* to seed random number generator */

#include <clapack.h>		/* LAPACK for matrix inversion */

#include "maths/nan-inf.h"

#ifdef ATLAS_ORDER
#define HAVE_ATLAS 1
#endif

#ifdef HAVE_ATLAS
// Using ATLAS C interface to LAPACK
#define dgetrf_(m, n, a, lda, ipiv, info) \
        clapack_dgetrf(CblasColMajor, *m, *n, a, *lda, ipiv)
#define dgetri_(n, a, lda, ipiv, work, lwork, info) \
        clapack_dgetri(CblasColMajor, *n, a, *lda, ipiv)
#endif

#ifdef _MAC_OS_X
#include <vecLib/cblas.h>
#else
#include <cblas.h>		/* BLAS for matrix multiplication */
#endif

#include "hmm.h"

model_t* hmm_init(double** x, int T, int L, int N)
{
	int i, j, d, e, t;
	double s, ss;

	model_t* model;
	model = (model_t*) malloc(sizeof(model_t));
	model->N = N;
	model->L = L;
	model->p0 = (double*) malloc(N*sizeof(double));
	model->a = (double**) malloc(N*sizeof(double*));
	model->mu = (double**) malloc(N*sizeof(double*));
	for (i = 0; i < N; i++)
	{
		model->a[i] = (double*) malloc(N*sizeof(double));
		model->mu[i] = (double*) malloc(L*sizeof(double));
	}
	model->cov = (double**) malloc(L*sizeof(double*));
	for (i = 0; i < L; i++)
		model->cov[i] = (double*) malloc(L*sizeof(double));

	srand(time(0));
	double* global_mean = (double*) malloc(L*sizeof(double));

	/* find global mean */
	for (d = 0; d < L; d++)
	{
		global_mean[d] = 0;
		for (t = 0; t < T; t++)
			global_mean[d] += x[t][d];
		global_mean[d] /= T;
	}

	/* calculate global diagonal covariance */
	for (d = 0; d < L; d++)
	{
		for (e = 0; e < L; e++)
			model->cov[d][e] = 0;
		for (t = 0; t < T; t++)
			model->cov[d][d] += (x[t][d] - global_mean[d]) * (x[t][d] - global_mean[d]);
		model->cov[d][d] /= T-1;
	}

	/* set all means close to global mean */
	for (i = 0; i < N; i++)
	{
		for (d = 0; d < L; d++)
		{
			/* add some random noise related to covariance */
			/* ideally the random number would be Gaussian(0,1), as a hack we make it uniform on [-0.25,0.25] */
			model->mu[i][d] = global_mean[d] + (0.5 * rand() / (double) RAND_MAX - 0.25) * sqrt(model->cov[d][d]);
		}
	}

	/* random intial and transition probs */
	s = 0;
	for (i = 0; i < N; i++)
	{
		model->p0[i] = 1 + rand() / (double) RAND_MAX;
		s += model->p0[i];
		ss = 0;
		for (j = 0; j < N; j++)
		{
			model->a[i][j] = 1 + rand() / (double) RAND_MAX;
			ss += model->a[i][j];
		}
		for (j = 0; j < N; j++)
		{
			model->a[i][j] /= ss;
		}
	}
	for (i = 0; i < N; i++)
		model->p0[i] /= s;

	free(global_mean);

	return model;
}

void hmm_close(model_t* model)
{
	int i;

	for (i = 0; i < model->N; i++)
	{
		free(model->a[i]);
		free(model->mu[i]);
	}
	free(model->a);
	free(model->mu);
	for (i = 0; i < model->L; i++)
		free(model->cov[i]);
	free(model->cov);
	free(model);
}

void hmm_train(double** x, int T, model_t* model)
{
	int i, t;
	double loglik;	/* overall log-likelihood at each iteration */

	int N = model->N;
	int L = model->L;
	double* p0 = model->p0;
	double** a = model->a;
	double** mu = model->mu;
	double** cov = model->cov;

	/* allocate memory */
	double** gamma = (double**) malloc(T*sizeof(double*));
	double*** xi = (double***) malloc(T*sizeof(double**));
	for (t = 0; t < T; t++)
	{
		gamma[t] = (double*) malloc(N*sizeof(double));
		xi[t] = (double**) malloc(N*sizeof(double*));
		for (i = 0; i < N; i++)
			xi[t][i] = (double*) malloc(N*sizeof(double));
	}

	/* temporary memory */
	double* gauss_y = (double*) malloc(L*sizeof(double));
	double* gauss_z = (double*) malloc(L*sizeof(double));

	/* obs probs P(j|{x}) */
	double** b = (double**) malloc(T*sizeof(double*));
	for (t = 0; t < T; t++)
		b[t] = (double*) malloc(N*sizeof(double));

	/* inverse covariance and its determinant */
	double** icov = (double**) malloc(L*sizeof(double*));
	for (i = 0; i < L; i++)
		icov[i] = (double*) malloc(L*sizeof(double));
	double detcov;

	double thresh = 0.0001;
	int niter = 50;
	int iter = 0;
	double loglik1, loglik2;
	int foundnan = 0;

	while (iter < niter && !foundnan && !(iter > 1 && (loglik - loglik1) < thresh * (loglik1 - loglik2)))
	{
		++iter;
/*
		fprintf(stderr, "calculating obsprobs...\n");
		fflush(stderr);
*/
		/* precalculate obs probs */
		invert(cov, L, icov, &detcov);

		for (t = 0; t < T; t++)
		{
			//int allzero = 1;
			for (i = 0; i < N; i++)
			{
				b[t][i] = exp(loggauss(x[t], L, mu[i], icov, detcov, gauss_y, gauss_z));

				//if (b[t][i] != 0)
				//	allzero = 0;
			}
			/*
			if (allzero)
			{
				printf("all the b[t][i] were zero for t = %d, correcting...\n", t);
				for (i = 0; i < N; i++)
				{
					b[t][i] = 0.00001;
				}
			}
			*/
		}
/*
		fprintf(stderr, "forwards-backwards...\n");
		fflush(stderr);
*/
		forward_backwards(xi, gamma, &loglik, &loglik1, &loglik2, iter, N, T, p0, a, b);
/*
		fprintf(stderr, "iteration %d: loglik = %f\n", iter, loglik);
		fprintf(stderr, "re-estimation...\n");
		fflush(stderr);
*/
		if (ISNAN(loglik)) {
		    foundnan = 1;
		    continue;
		}

		baum_welch(p0, a, mu, cov, N, T, L, x, xi, gamma);

		/*
		printf("a:\n");
		for (i = 0; i < model->N; i++)
		{
			for (j = 0; j < model->N; j++)
				printf("%f ", model->a[i][j]);
			printf("\n");
		}
		printf("\n\n");
		 */
		//hmm_print(model);
	}

	/* deallocate memory */
	for (t = 0; t < T; t++)
	{
		free(gamma[t]);
		free(b[t]);
		for (i = 0; i < N; i++)
			free(xi[t][i]);
		free(xi[t]);
	}
	free(gamma);
	free(xi);
	free(b);

	for (i = 0; i < L; i++)
		free(icov[i]);
	free(icov);

	free(gauss_y);
	free(gauss_z);
}

void mlss_reestimate(double* p0, double** a, double** mu, double** cov, int N, int T, int L, int* q, double** x)
{
	/* fit a single Gaussian to observations in each state */

	/* calculate the mean observation in each state */

	/* calculate the overall covariance */

	/* count transitions */

	/* estimate initial probs from transitions (???) */
}

void baum_welch(double* p0, double** a, double** mu, double** cov, int N, int T, int L, double** x, double*** xi, double** gamma)
{
	int i, j, t;

	double* sum_gamma = (double*) malloc(N*sizeof(double));

	/* temporary memory */
	double* u = (double*) malloc(L*L*sizeof(double));
	double* yy = (double*) malloc(T*L*sizeof(double));
	double* yy2 = (double*) malloc(T*L*sizeof(double));

	/* re-estimate transition probs */
	for (i = 0; i < N; i++)
	{
		sum_gamma[i] = 0;
		for (t = 0; t < T-1; t++)
			sum_gamma[i] += gamma[t][i];
	}

	for (i = 0; i < N; i++)
	{
		if (sum_gamma[i] == 0)
		{
/*			fprintf(stderr, "sum_gamma[%d] was zero...\n", i); */
		}
		//double s = 0;
		for (j = 0; j < N; j++)
		{
			a[i][j] = 0;
			if (sum_gamma[i] == 0.) continue;
			for (t = 0; t < T-1; t++)
				a[i][j] += xi[t][i][j];
			//s += a[i][j];
			a[i][j] /= sum_gamma[i];
		}
		/*
		 for (j = 0; j < N; j++)
		 {
			 a[i][j] /= s;
		 }
		 */
	}

	/* NB: now we need to sum gamma over all t */
	for (i = 0; i < N; i++)
		sum_gamma[i] += gamma[T-1][i];

	/* re-estimate initial probs */
	for (i = 0; i < N; i++)
		p0[i] = gamma[0][i];

	/* re-estimate covariance */
	int d, e;
	double sum_sum_gamma = 0;
	for (i = 0; i < N; i++)
		sum_sum_gamma += sum_gamma[i];

	/*
	 for (d = 0; d < L; d++)
	 {
		 for (e = d; e < L; e++)
		 {
			 cov[d][e] = 0;
			 for (t = 0; t < T; t++)
				 for (j = 0; j < N; j++)
					 cov[d][e] += gamma[t][j] * (x[t][d] - mu[j][d]) * (x[t][e] - mu[j][e]);

			 cov[d][e] /= sum_sum_gamma;

			 if (ISNAN(cov[d][e]))
			 {
				 printf("cov[%d][%d] was nan\n", d, e);
				 for (j = 0; j < N; j++)
					 for (i = 0; i < L; i++)
						 if (ISNAN(mu[j][i]))
							 printf("mu[%d][%d] was nan\n", j, i);
				 for (t = 0; t < T; t++)
					 for (j = 0; j < N; j++)
						 if (ISNAN(gamma[t][j]))
							 printf("gamma[%d][%d] was nan\n", t, j);
				 exit(-1);
			 }
		 }
	 }
	 for (d = 0; d < L; d++)
	 for (e = 0; e < d; e++)
	 cov[d][e] = cov[e][d];
	 */

	/* using BLAS */
	for (d = 0; d < L; d++)
		for (e = 0; e < L; e++)
			cov[d][e] = 0;

	for (j = 0; j < N; j++)
	{
		for (d = 0; d < L; d++)
			for (t = 0; t < T; t++)
			{
				yy[d*T+t] = x[t][d] - mu[j][d];
				yy2[d*T+t] = gamma[t][j] * (x[t][d] - mu[j][d]);
			}

				cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, L, L, T, 1.0, yy, T, yy2, T, 0, u, L);

		for (e = 0; e < L; e++)
			for (d = 0; d < L; d++)
				cov[d][e] += u[e*L+d];
	}

	for (d = 0; d < L; d++)
		for (e = 0; e < L; e++)
			cov[d][e] /= T; /* sum_sum_gamma; */

	//printf("sum_sum_gamma = %f\n", sum_sum_gamma); /* fine, = T IS THIS ALWAYS TRUE with pooled cov?? */

	/* re-estimate means */
	for (j = 0; j < N; j++)
	{
		for (d = 0; d < L; d++)
		{
			mu[j][d] = 0;
			for (t = 0; t < T; t++)
				mu[j][d] += gamma[t][j] * x[t][d];
			mu[j][d] /= sum_gamma[j];
		}
	}

	/* deallocate memory */
	free(sum_gamma);
	free(yy);
	free(yy2);
	free(u);
}

void forward_backwards(double*** xi, double** gamma, double* loglik, double* loglik1, double* loglik2, int iter, int N, int T, double* p0, double** a, double** b)
{
	/* forwards-backwards with scaling */
	int i, j, t;

	double** alpha = (double**) malloc(T*sizeof(double*));
	double** beta = (double**) malloc(T*sizeof(double*));
	for (t = 0; t < T; t++)
	{
		alpha[t] = (double*) malloc(N*sizeof(double));
		beta[t] = (double*) malloc(N*sizeof(double));
	}

	/* scaling coefficients */
	double* c = (double*) malloc(T*sizeof(double));

	/* calculate forward probs and scale coefficients */
	c[0] = 0;
	for (i = 0; i < N; i++)
	{
		alpha[0][i] = p0[i] * b[0][i];
		c[0] += alpha[0][i];

		//printf("p0[%d] = %f, b[0][%d] = %f\n", i, p0[i], i, b[0][i]);
	}
	c[0] = 1 / c[0];
	for (i = 0; i < N; i++)
	{
		alpha[0][i] *= c[0];

		//printf("alpha[0][%d] = %f\n", i, alpha[0][i]);	/* OK agrees with Matlab */
	}

	*loglik1 = *loglik;
	*loglik = -log(c[0]);
	if (iter == 2)
		*loglik2 = *loglik;

	for (t = 1; t < T; t++)
	{
		c[t] = 0;
		for (j = 0; j < N; j++)
		{
			alpha[t][j] = 0;
			for (i = 0; i < N; i++)
				alpha[t][j] += alpha[t-1][i] * a[i][j];
			alpha[t][j] *= b[t][j];

			c[t] += alpha[t][j];
		}

		/*
		 if (c[t] == 0)
		 {
			 printf("c[%d] = 0, going to blow up so exiting\n", t);
			 for (i = 0; i < N; i++)
				 if (b[t][i] == 0)
					 fprintf(stderr, "b[%d][%d] was zero\n", t, i);
			 fprintf(stderr, "x[t] was \n");
			 for (i = 0; i < L; i++)
				 fprintf(stderr, "%f ", x[t][i]);
			 fprintf(stderr, "\n\n");
			 exit(-1);
		 }
		 */

		c[t] = 1 / c[t];
		for (j = 0; j < N; j++)
			alpha[t][j] *= c[t];

		//printf("c[%d] = %e\n", t, c[t]);

		*loglik -= log(c[t]);
	}

	/* calculate backwards probs using same coefficients */
	for (i = 0; i < N; i++)
		beta[T-1][i] = 1;
	t = T - 1;
	while (1)
	{
		for (i = 0; i < N; i++)
			beta[t][i] *= c[t];

		if (t == 0)
			break;

		for (i = 0; i < N; i++)
		{
			beta[t-1][i] = 0;
			for (j = 0; j < N; j++)
				beta[t-1][i] += a[i][j] * b[t][j] * beta[t][j];
		}

		t--;
	}

	/*
	 printf("alpha:\n");
	 for (t = 0; t < T; t++)
	 {
		 for (i = 0; i < N; i++)
			 printf("%4.4e\t\t", alpha[t][i]);
		 printf("\n");
	 }
	 printf("\n\n");printf("beta:\n");
	 for (t = 0; t < T; t++)
	 {
		 for (i = 0; i < N; i++)
			 printf("%4.4e\t\t", beta[t][i]);
		 printf("\n");
	 }
	 printf("\n\n");
	 */

	/* calculate posterior probs */
	double tot;
	for (t = 0; t < T; t++)
	{
		tot = 0;
		for (i = 0; i < N; i++)
		{
			gamma[t][i] = alpha[t][i] * beta[t][i];
			tot += gamma[t][i];
		}
		for (i = 0; i < N; i++)
		{
			gamma[t][i] /= tot;

			//printf("gamma[%d][%d] = %f\n", t, i, gamma[t][i]);
		}
	}

	for (t = 0; t < T-1; t++)
	{
		tot = 0;
		for (i = 0; i < N; i++)
		{
			for (j = 0; j < N; j++)
			{
				xi[t][i][j] = alpha[t][i] * a[i][j] * b[t+1][j] * beta[t+1][j];
				tot += xi[t][i][j];
			}
		}
		for (i = 0; i < N; i++)
			for (j = 0; j < N; j++)
				xi[t][i][j] /= tot;
	}

	/*
	 // CHECK - fine
	 // gamma[t][i] = \sum_j{xi[t][i][j]}
	 tot = 0;
	 for (j = 0; j < N; j++)
	 tot += xi[3][1][j];
	 printf("gamma[3][1] = %f, sum_j(xi[3][1][j]) = %f\n", gamma[3][1], tot);
	 */

	for (t = 0; t < T; t++)
	{
		free(alpha[t]);
		free(beta[t]);
	}
	free(alpha);
	free(beta);
	free(c);
}

void viterbi_decode(double** x, int T, model_t* model, int* q)
{
	int i, j, t;
	double max;
	int havemax;

	int N = model->N;
	int L = model->L;
	double* p0 = model->p0;
	double** a = model->a;
	double** mu = model->mu;
	double** cov = model->cov;

	/* inverse covariance and its determinant */
	double** icov = (double**) malloc(L*sizeof(double*));
	for (i = 0; i < L; i++)
		icov[i] = (double*) malloc(L*sizeof(double));
	double detcov;

	double** logb = (double**) malloc(T*sizeof(double*));
	double** phi = (double**) malloc(T*sizeof(double*));
	int** psi = (int**) malloc(T*sizeof(int*));
	for (t = 0; t < T; t++)
	{
		logb[t] = (double*) malloc(N*sizeof(double));
		phi[t] = (double*) malloc(N*sizeof(double));
		psi[t] = (int*) malloc(N*sizeof(int));
	}

	/* temporary memory */
	double* gauss_y = (double*) malloc(L*sizeof(double));
	double* gauss_z = (double*) malloc(L*sizeof(double));

	/* calculate observation logprobs */
	invert(cov, L, icov, &detcov);
	for (t = 0; t < T; t++)
		for (i = 0; i < N; i++)
			logb[t][i] = loggauss(x[t], L, mu[i], icov, detcov, gauss_y, gauss_z);

	/* initialise */
	for (i = 0; i < N; i++)
	{
		phi[0][i] = log(p0[i]) + logb[0][i];
		psi[0][i] = 0;
	}

	for (t = 1; t < T; t++)
	{
		for (j = 0; j < N; j++)
		{
			max = -1000000;
			havemax = 0;

			psi[t][j] = 0;
			for (i = 0; i < N; i++)
			{
				if (phi[t-1][i] + log(a[i][j]) > max || !havemax)
				{
					max = phi[t-1][i] + log(a[i][j]);
					phi[t][j] = max + logb[t][j];
					psi[t][j] = i;
					havemax = 1;
				}
			}
		}
	}

	/* find maximising state at time T-1 */
	max = phi[T-1][0];
	q[T-1] = 0;
	for (i = 1; i < N; i++)
	{
		if (phi[T-1][i] > max)
		{
			max = phi[T-1][i];
			q[T-1] = i;
		}
	}


	/* track back */
	t = T - 2;
	while (t >= 0)
	{
		q[t] = psi[t+1][q[t+1]];
		t--;
	}

	/* de-allocate memory */
	for (i = 0; i < L; i++)
		free(icov[i]);
	free(icov);
	for (t = 0; t < T; t++)
	{
		free(logb[t]);
		free(phi[t]);
		free(psi[t]);
	}
	free(logb);
	free(phi);
	free(psi);

	free(gauss_y);
	free(gauss_z);
}

/* invert matrix and calculate determinant using LAPACK */
void invert(double** cov, int L, double** icov, double* detcov)
{
	/* copy square matrix into a vector in column-major order */
	double* a = (double*) malloc(L*L*sizeof(double));
	int i, j;
	for(j=0; j < L; j++)
		for (i=0; i < L; i++)
			a[j*L+i] = cov[i][j];

	int M = (int) L;
	int* ipiv = (int *) malloc(L*L*sizeof(int));
	int ret;

	/* LU decomposition */
	ret = dgetrf_(&M, &M, a, &M, ipiv, &ret);	/* ret should be zero, negative if cov is singular */
	if (ret < 0)
	{
		fprintf(stderr, "Covariance matrix was singular, couldn't invert\n");
		exit(-1);
	}

	/* find determinant */
	double det = 1;
	for(i = 0; i < L; i++)
		det *= a[i*L+i];
	// TODO: get this to work!!! If detcov < 0 then cov is bad anyway...
	/*
	int sign = 1;
	for (i = 0; i < L; i++)
		if (ipiv[i] != i)
			sign = -sign;
	det *= sign;
	 */
	if (det < 0)
		det = -det;
	*detcov = det;

	/* allocate required working storage */
#ifndef HAVE_ATLAS
	int lwork = -1;
	double lwbest = 0.0;
	dgetri_(&M, a, &M, ipiv, &lwbest, &lwork, &ret);
	lwork = (int) lwbest;
	double* work  = (double*) malloc(lwork*sizeof(double));
#endif

	/* find inverse */
	dgetri_(&M, a, &M, ipiv, work, &lwork, &ret);

	for(j=0; j < L; j++)
		for (i=0; i < L; i++)
			icov[i][j] = a[j*L+i];

#ifndef HAVE_ATLAS
	free(work);
#endif
	free(a);
}

/* probability of multivariate Gaussian given mean, inverse and determinant of covariance */
double gauss(double* x, int L, double* mu, double** icov, double detcov, double* y, double* z)
{
	int i, j;
	double s = 0;
	for (i = 0; i < L; i++)
		y[i] = x[i] - mu[i];
	for (i = 0; i < L; i++)
	{
		//z[i] = 0;
		//for (j = 0; j < L; j++)
		//	z[i] += icov[i][j] *  y[j];
		z[i] = cblas_ddot(L, &icov[i][0], 1, y, 1);
	}
	s = cblas_ddot(L, z, 1, y, 1);
	//for (i = 0; i < L; i++)
	//	s += z[i] * y[i];

	return exp(-s/2.0) / (pow(2*PI, L/2.0) * sqrt(detcov));
}

/* log probability of multivariate Gaussian given mean, inverse and determinant of covariance */
double loggauss(double* x, int L, double* mu, double** icov, double detcov, double* y, double* z)
{
	int i, j;
	double s = 0;
	double ret;
	for (i = 0; i < L; i++)
		y[i] = x[i] - mu[i];
	for (i = 0; i < L; i++)
	{
		//z[i] = 0;
		//for (j = 0; j < L; j++)
		//	z[i] += icov[i][j] *  y[j];
		z[i] = cblas_ddot(L, &icov[i][0], 1, y, 1);
	}
	s = cblas_ddot(L, z, 1, y, 1);
	//for (i = 0; i < L; i++)
	//	s += z[i] * y[i];

	ret = -0.5 * (s + L * log(2*PI) + log(detcov));

	/*
	// TEST
	if (ISINF(ret) > 0)
		printf("loggauss returning infinity\n");
	if (ISINF(ret) < 0)
		printf("loggauss returning -infinity\n");
	if (ISNAN(ret))
		printf("loggauss returning nan\n");
	*/

	return ret;
}

void hmm_print(model_t* model)
{
	int i, j;
	printf("p0:\n");
	for (i = 0; i < model->N; i++)
		printf("%f ", model->p0[i]);
	printf("\n\n");
	printf("a:\n");
	for (i = 0; i < model->N; i++)
	{
		for (j = 0; j < model->N; j++)
			printf("%f ", model->a[i][j]);
		printf("\n");
	}
	printf("\n\n");
	printf("mu:\n");
	for (i = 0; i < model->N; i++)
	{
		for (j = 0; j < model->L; j++)
			printf("%f ", model->mu[i][j]);
		printf("\n");
	}
	printf("\n\n");
	printf("cov:\n");
	for (i = 0; i < model->L; i++)
	{
		for (j = 0; j < model->L; j++)
			printf("%f ", model->cov[i][j]);
		printf("\n");
	}
	printf("\n\n");
}