##################################################################
# R macros for Lee, Quintana, Mueller & Trippa (2008)
# Implement posterior inference for the SSM(p,G0) model in Sec. 3
#
# DISCLAIMER: these macros are not meant for data analysis.
# Any realistic data analysis would require more efficient
# coding in some compiled language (C, C++, etc.).
# If any user translates these macros into C, we would
# love to hear about it.
#
# We distribute these macros since we believe
# they might help interested readers who wish to
# understand implementation details of the proposed SSM(p,G0)
# model. There is no claim whatsoever about the efficiency
# of the used algorithms. In fact, we would love to hear
# if anyone develops improved posterior simulation for
# these models.
##################################################################
## USAGE:
## MCMC: use 'gibbs(.)' to launch posterior MCMC.
##       See macro example(.) for a calling sequence for
##       the example in Section 3.2.
##       The data 'y0' is defined in the list of global vars below.
##       The plotting macros 'plt.xxx(.)' are defined at the end of
##       this file.
##

example <- function()
{ ## example from Section 3.2.
  ## execute line by line.. See below for global variables
  ## and fake data.
  ## first using the proposed SSM(p,G0)
  use.dp0 <<- F
  ssm <- gibbs(y=y0, n.iter=1000, n.batch=100, use.dp=F)
  ## same using a simple DP prior
  use.dp0 <<- T                  # WAS: use.dp0 <- T (local assignment only)
  dp <- gibbs(y=y0, n.iter=1000, n.batch=100, use.dp=T)

  ## plot posterior predictive dist's
  plt.pj(ssm$pjlist, c(0,.15))
  plt.pj(dp$pjlist, c(0,.15))
  ## summarize posterior random partition
  plt.sij(ssm$slist)
  plt.sij(dp$slist)
}

## global parameters (all names end in "0")
alpha0 <- 2.83    # total mass par of DP prior
use.dp0 <- F      # flag for using DP prior
y0 <- -4:4
yl0 <- c(0,0.14)
ygrid0 <- seq(from=-5, to=5, length=50)

## hyperprior parameters
as0 <- 4; bs0 <- 4
B0 <- 0.1
eps <- 0.001      # small threshold to ignore imputed RPMs
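## Note (our addition, not in the original file): the value alpha0=2.83
## above can be reproduced with k.SSM() below, which matches the DP
## total mass to the prior expected number of clusters under the SSM
## via the approximation E(k) = alpha*log(n). A minimal sketch
## (Monte Carlo based, so fix a seed for repeatability):
ex.alpha <- function()
{
  set.seed(1)
  k.SSM(n=length(y0))    # approx 2.83
}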
##################################################################
## PPF
##################################################################
## auxiliary macros used to evaluate complete conditionals
## in the gibbs(.) sampler below
##
## rSSM():   generate RPM's P[m] ~ p(P), m=1..M
## ps.SSM(): evaluate weights p(s | P) for RPMs generated
##           in rSSM() (and saved in LW (weights) and PI (indicators))
## ppfSSM(): evaluate PRIOR ppf p(s[n+1] | s)

a <- 1; b <- 10; sig <- 0.1; mxj <- 20; M <- 10   # debugging defaults

rSSM <- function(a=1, b=10, sig=0.1, mxj=20, M=100)
{ ## generate random weights (LW = log weights) and latent indicators (PI)
  ## for M random probability measures, one in each row of LW and PI
  nn <- 1:mxj
  wbar <- 1 - 1/(1+exp(b-a*nn))
  wbar <- wbar/sum(wbar)
  wbar <- ifelse(wbar>eps, wbar, eps)
  lwbar <- log(wbar)
  LW <- NULL
  PI <- NULL
  for(m in 1:M){
    lwj <- rnorm(mxj, lwbar, sig)
    lwj <- lwj - log(sum(exp(lwj)))    # normalize on the log scale
    wj <- exp(lwj)
    pi <- sample(nn, mxj, replace=F, prob=wj)
    LW <- rbind(LW, lwj)
    PI <- rbind(PI, pi)
  }
  return(list(LW=LW, PI=PI))
}

n <- 9   # debugging default

k.SSM <- function(n)
{ ## compute prior expected number of clusters;
  ## call rSSM first to simulate PI
  SSM <- rSSM()
  W <- exp(SSM$LW)
  kmx <- ncol(W)   # number of point masses
  x <- apply(W, 1, function(wj){sample(1:kmx, n, replace=T, prob=wj)})
  kk <- apply(x, 2, function(xj){length(unique(xj))})
  kbar <- mean(kk)        # E(k) under SSM
  alpha <- kbar/log(n)    # equivalent total mass under DP prior
  ## kbar = alpha*log(n) => alpha = kbar/log(n)
  return(alpha)           # alpha approx 2.83
}

ps.SSM <- function(s, LW, PI)
{ ## compute p(s | pi,P) for each row of weights W and indicators PI.
  ## This is the second factor in eq (14), last line,
  ## evaluated using (15).
  s <- as.numeric(as.factor(s))    # relabel clusters by arrival
  k <- length(unique(s))
  n <- length(s)
  ki <- unlist( lapply(1:n, function(i){max(s[1:i])}) )   # k_i
  kim <- c(0, ki[-n])   # for p(s[i] | s[1..i-1]) we need k_{i-1}
  ts <- table(s)
  nj <- c(ts)
  j <- as.numeric(names(ts))   # might include gaps!
  LPS <- NULL
  for(m in 1:nrow(LW)){
    pi <- PI[m,]
    lwj <- LW[m, pi[1:k]]        # m-th random G ~ SSM(p,.), for
                                 # the first k clusters
    pcum <- cumsum(exp(lwj))     # proportionality constants
    pcum <- c(pcum, 1)           # normalization constant for i > i_k is 1.0
    lps <- sum(lwj*nj)           # log prod_j w[pi[j]]^nj,
                                 # un-normalized probabilities
    lps <- lps - sum(log(pcum[kim+1]))   # proportionality constants
    LPS <- c(LPS, lps)           # log weight p(s | pi,w)
  }
  return(LPS)
}

ppfSSM <- function(s)
{ ## get PPF for s
  k <- length(unique(s))
  SSM <- rSSM()     # MC sample of M=100 w~p(F) and pi~p(pi | F)
  LPS <- ps.SSM(s, SSM$LW, SSM$PI)   # log p(s | pi,w) for each row of PI and LW
  LPS0 <- LPS - max(LPS)             # normalize
  PS <- exp(LPS0)
  PS <- PS/sum(PS)
  pj <- 0*(1:(k+1))                  # initialize
  M <- nrow(SSM$LW)
  idx <- which(PS > 0.01*1/M)        # more than 1% contribution...
  r <- sum(PS[idx])
  for(m in idx){
    wj <- exp(SSM$LW[m,])
    pi <- SSM$PI[m,]
    w0 <- 1 - sum(wj[pi[1:k]])       # prob of new cluster
    pj <- pj + PS[m]*c(wj[pi[1:k]], w0)
  }
  return(pj/r)
}
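## Example (our addition, not in the original file): compare the two
## PPFs on the small partition s=(1,1,2), i.e., cluster sizes (2,1).
## ppfSSM() is Monte Carlo based, so fix a seed for repeatability.
ex.ppf <- function()
{
  set.seed(2)
  s <- c(1,1,2)
  pj.ssm <- ppfSSM(s)       # SSM(p,G0) PPF; length k+1 = 3
  pj.dp  <- ppfDP(c(2,1))   # DP PPF: c(2,1,alpha0)/(3+alpha0)
  rbind(SSM=pj.ssm, DP=pj.dp)
}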
ppfDP <- function(nj0, alpha=alpha0)
{ ## PPF for DP(alpha,G0)
  n <- sum(nj0)
  pj <- c(nj0, alpha)/(alpha+n)
  return(pj)
}

##################################################################
# posterior inference
##################################################################
## Functions for posterior inference
## gibbs():      does MCMC for data y;
##               returns two matrices with one row per iteration:
##               pjlist = posterior pred simulation
##               slist  = cluster membership indicators
##
## ppred.grid(): evaluate posterior PPF p(y[n+1] | y,s)
##               on a grid over y
## sample.si():  complete conditional p(s[i] | y[i], y[-i], s[-i])
## ppred.mix():  POSTERIOR ppf p(y[n+1] | y,s);
##               can be used for p(s[i] | y[-i], s[-i]);
##               a mixture of t distributions sum pr[j]*t(nu[j]; m[j], v[j])
##               (WAS: described as a mixture of normals)

gibbs <- function(y, n.iter=100, n.batch=10, plt=T, yl=c(0,0.1),
                  ygrid=ygrid0, use.dp=use.dp0)
{
  n <- length(y)
  s <- 1:n
  pjlist <<- NULL
  slist <<- NULL
  for(iter in 1:n.iter){
    for(i in 1:n)
      s[i] <- sample.si(y[i], s[-i], y[-i], use.dp=use.dp)
      # WAS: sample.si(y[i],s[-i],y[-i]), which silently used the
      # global use.dp0 instead of the use.dp argument
    s <- as.numeric(as.factor(s))   # recode cluster numbers
    ## save summaries
    slist <<- rbind(slist, s)
    if (iter %% n.batch == 0){   # computation intensive -- do only in batches
      cat("iter=", iter, "\n")
      pj <- ppred.grid(s, y, ygrid, add=!is.null(pjlist), use.dp=use.dp)
      pjlist <<- rbind(pjlist, pj)
    }
  }
  return(list(pjlist=pjlist, slist=slist))
}

col <- 2

ppred.grid <- function(s, y, ygrid=ygrid0, plt=T, add=T, use.dp=use.dp0)
{ ## returns the posterior predictive on the grid 'ygrid'
  pp <- ppred.mix(s, y, use.dp=use.dp)   # WAS: use.dp=use.dp0 (ignored the argument)
  pj <- 0*ygrid
  pr <- pp$pr/sum(pp$pr)                 # standardize
  for(j in 1:length(pr)){
    z <- (ygrid - pp$mj[j])/pp$vj[j]     # standardize
    pj <- pj + pr[j]*dt(z, df=pp$nuj[j])/pp$vj[j]   # divide by Jacobian for scaling
                                         # WAS: dnorm(ygrid, m=pp$mj[j], ...)
  }
  if(plt){
    if (add)
      lines(ygrid, pj, type="l", col=col)
    else
      plot(ygrid, pj, type="l", xlab="Y", ylab="P", bty="l", ylim=yl0)
  }
  col <<- col+1
  return(pj)
}

## yi <- y[1]; sm <- s[-1]; ym <- y[-1]   # debugging..

sample.si <- function(yi, sm, ym, use.dp=use.dp0)
{ ## sample s[i] ~ p(s[i] | y[i], s[-i], y[-i])
  pp <- ppred.mix(sm, ym, use.dp=use.dp)
  zi <- (yi - pp$mj)/pp$vj
  llik <- dt(zi, df=pp$nuj, log=T) - log(pp$vj)   # subtract log Jacobian for scaling
  lik <- exp(llik - max(llik))
  po <- lik*pp$pr
  si <- sample(pp$jlist, 1, prob=po)
  return(si)
}

## sm <- s[-1]; ym <- y[-1]   # debugging..
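## Helper sketch (our addition, not in the original file): both
## ppred.grid() and sample.si() above evaluate a location-scale t
## density by standardizing and then dividing by the Jacobian of the
## scale transformation. Written out as a stand-alone function:
dst <- function(y, m, v, nu)
{ ## density of y ~ t_nu(m, v): standardize, then divide by Jacobian v
  dt((y-m)/v, df=nu)/v
}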
ppred.mix <- function(sm, ym, use.dp=use.dp0)
{ ## posterior predictive probs
  ## p(y[n+1] | s,y) = sum pr[j]*t(nu[j]; mj[j], vj[j]);
  ## returns (pr,mj,vj,nuj,jlist), vectors of length k+1,
  ## k = # clusters in sm.
  ## use.dp: indicator for using DP(alpha,.) model.
  ## NOTE: B0 = 1/c from the paper.
  ## hyperprior parameters: as0, bs0, B0 are set on top of file
  k <- length(unique(sm))
  ts <- table(sm)
  nj <- c(ts)
  tn <- table(nj)
  nj0 <- as.numeric(names(tn))
  nnj <- c(tn)
  jlist <- c(as.numeric(names(ts)), max(sm)+1)
  if (!use.dp){
    pr <- ppfSSM(sm)               # prior prob for different clusters
    for (h in 1:length(nj0)){      # smooth over equal cluster sizes..
      idx <- which(nj == nj0[h])
      pr[idx] <- mean(pr[idx])
    }
  } else
    pr <- ppfDP(nj)                # prior prob under DP prior
  llik <- 0*pr   # likelihood for s=j
  vj <- 0*pr     # scale par of prediction
  mj <- 0*pr     # location of prediction
  nuj <- 0*pr    # d.f. of prediction
  ## clusters 1...k
  for(j in 1:k){
    idx <- sm==jlist[j]
    yj <- ym[idx]
    mx <- mean(yj)           # average
    sx <- sum((yj-mx)^2)     # ss
    nuj[j] <- as0+nj[j]
    mj[j] <- nj[j]/(nj[j]+B0)*mx   # t location parameter
    K <- (nj[j]+B0+1)/(nj[j]+B0)
    vj[j] <- sqrt( K*(bs0+sx+ 1/(1/B0+1/nj[j])*mx^2)/(nj[j]+as0) )
    ## t-dist scale parameter
  }
  ## new cluster
  mj[k+1] <- 0; nuj[k+1] <- as0; K <- (B0+1)/B0
  vj[k+1] <- sqrt( K*bs0/as0 )
  return(list(mj=mj, vj=vj, nuj=nuj, pr=pr, jlist=jlist))
}

#######################################################
# inference summaries
#######################################################
## various functions to make plots for the paper

plt.pj <- function(pjlist, yl=NULL, hist=F, pts=T)
{ ## plot density estimate
  pjbar <- apply(pjlist, 2, mean)
  matplot(ygrid0, t(pjlist), type="l",
          bty="l", xlab="Y", ylab="P", ylim=yl)
  lines(ygrid0, pjbar, type="l", lwd=3)
  if (hist)
    hist(y0, add=T, prob=T)
  yy <- 0*y0
  if (pts)
    points(y0, yy, pch=17, lwd=2)
}

plt.sij <- function(slist)
{ ## grey-scale image of pairwise co-clustering probabilities
  ## p(s[i]=s[j] | y), estimated by MC frequencies over the saved s
  idx <- order(y0)
  M <- nrow(slist)
  n <- length(y0)
  psij <- matrix(1, nrow=n, ncol=n)
  for(i in 2:n){
    for(j in 1:(i-1)){
      pij <- sum(slist[, idx[i]]==slist[, idx[j]])/M
      psij[i,j] <- pij
      psij[j,i] <- pij
    }
  }
  ## mx <- max(psij)
  ## diag(psij) <- mx
  image(1:n, 1:n, psij, xlab="", ylab="", col=grey((1:10)/10))
}
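## Minimal end-to-end sketch (our addition, not in the original file):
## run a short chain and plot the summaries. The iteration counts here
## are far too small for real inference; they only exercise the code
## path. As in example() above, the global flag use.dp0 is set to match
## the use.dp argument.
ex.small <- function()
{
  set.seed(42)
  use.dp0 <<- T
  fit <- gibbs(y=y0, n.iter=100, n.batch=50, use.dp=T)
  plt.pj(fit$pjlist, yl=yl0)
  plt.sij(fit$slist)
  invisible(fit)
}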