##################################################################
# R macros for Lee, Quintana, Mueller & Trippa (2008)
# Implement posterior inference for the SSM(p,G0) model in Sec. 3
#
# DISCLAIMER: these macros are not meant for data analysis.
# Any realistic data analysis would require more efficient
# coding in some compiled language (C, C++, etc.).
# If any user translates these macros into C, we would
# love to hear about it.
#
# We distribute these macros since we believe
# they might help interested readers who wish to
# understand implementation details of the proposed SSM(p,G0)
# model. There is no claim whatsoever about the efficiency
# of the used algorithms. In fact, we would love to hear
# if anyone develops improved posterior simulation for
# these models.
##################################################################
## USAGE:
## MCMC: use 'gibbs(.)' to launch posterior MCMC.
##       See macro example(.) for a calling sequence for
##       the example in Section 3.2.
##       The data 'y0' is defined in the list of global vars below.
##       The plotting macros 'plt.xxx(.)' are defined at the end of
##       this file.
##

example <- function()
{ ## example from Section 3.2.
  ## execute line by line.. See below for global variables
  ## and fake data.
  ## first using the proposed SSM(p,G0)
  use.dp0 <<- F
  ssm <- gibbs(y=y0, n.iter=1000, n.batch=100, use.dp=F)
  ## same using a simple DP prior
  use.dp0 <<- T                  # WAS: use.dp0 <- T (local assignment only)
  dp <- gibbs(y=y0, n.iter=1000, n.batch=100, use.dp=T)

  ## plot posterior predictive dist's
  plt.pj(ssm$pjlist, c(0,.15))
  plt.pj(dp$pjlist, c(0,.15))
  ## summarize posterior random partition
  plt.sij(ssm$slist)
  plt.sij(dp$slist)
}

## global parameters (all names end in "0")
alpha0 <- 2.83    # total mass par of DP prior
use.dp0 <- F      # flag for using DP prior
y0 <- -4:4
yl0 <- c(0,0.14)
ygrid0 <- seq(from=-5, to=5, length=50)

## hyperprior parameters
as0 <- 4; bs0 <- 4
B0 <- 0.1
eps <- 0.001      # small threshold to ignore imputed RPMs
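## Note (our addition, not in the original file): the value alpha0=2.83
## above can be reproduced with k.SSM() below, which matches the DP
## total mass to the prior expected number of clusters under the SSM
## via the approximation E(k) = alpha*log(n). A minimal sketch
## (Monte Carlo based, so fix a seed for repeatability):
ex.alpha <- function()
{
  set.seed(1)
  k.SSM(n=length(y0))    # approx 2.83
}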
##################################################################
## PPF
##################################################################
## auxiliary macros used to evaluate complete conditionals
## in the gibbs(.) sampler below
##
## rSSM():   generate RPM's P[m] ~ p(P), m=1..M
## ps.SSM(): evaluate weights p(s | P) for RPMs generated
##           in rSSM() (and saved in LW (weights) and PI (indicators))
## ppfSSM(): evaluate PRIOR ppf p(s[n+1] | s)

a <- 1; b <- 10; sig <- 0.1; mxj <- 20; M <- 10   # debugging defaults

rSSM <- function(a=1, b=10, sig=0.1, mxj=20, M=100)
{ ## generate random weights (LW = log weights) and latent indicators (PI)
  ## for M random probability measures, one in each row of LW and PI
  nn <- 1:mxj
  wbar <- 1 - 1/(1+exp(b-a*nn))
  wbar <- wbar/sum(wbar)
  wbar <- ifelse(wbar>eps, wbar, eps)
  lwbar <- log(wbar)
  LW <- NULL
  PI <- NULL
  for(m in 1:M){
    lwj <- rnorm(mxj, lwbar, sig)
    lwj <- lwj - log(sum(exp(lwj)))    # normalize on the log scale
    wj <- exp(lwj)
    pi <- sample(nn, mxj, replace=F, prob=wj)
    LW <- rbind(LW, lwj)
    PI <- rbind(PI, pi)
  }
  return(list(LW=LW, PI=PI))
}

n <- 9   # debugging default

k.SSM <- function(n)
{ ## compute prior expected number of clusters;
  ## call rSSM first to simulate PI
  SSM <- rSSM()
  W <- exp(SSM$LW)
  kmx <- ncol(W)   # number of point masses
  x <- apply(W, 1, function(wj){sample(1:kmx, n, replace=T, prob=wj)})
  kk <- apply(x, 2, function(xj){length(unique(xj))})
  kbar <- mean(kk)        # E(k) under SSM
  alpha <- kbar/log(n)    # equivalent total mass under DP prior
  ## kbar = alpha*log(n) => alpha = kbar/log(n)
  return(alpha)           # alpha approx 2.83
}

ps.SSM <- function(s, LW, PI)
{ ## compute p(s | pi,P) for each row of weights W and indicators PI.
  ## This is the second factor in eq (14), last line,
  ## evaluated using (15).
  s <- as.numeric(as.factor(s))    # relabel clusters by arrival
  k <- length(unique(s))
  n <- length(s)
  ki <- unlist( lapply(1:n, function(i){max(s[1:i])}) )   # k_i
  kim <- c(0, ki[-n])   # for p(s[i] | s[1..i-1]) we need k_{i-1}
  ts <- table(s)
  nj <- c(ts)
  j <- as.numeric(names(ts))   # might include gaps!
  LPS <- NULL
  for(m in 1:nrow(LW)){
    pi <- PI[m,]
    lwj <- LW[m, pi[1:k]]        # m-th random G ~ SSM(p,.), for
                                 # the first k clusters
    pcum <- cumsum(exp(lwj))     # proportionality constants
    pcum <- c(pcum, 1)           # normalization constant for i > i_k is 1.0
    lps <- sum(lwj*nj)           # log prod_j w[pi[j]]^nj,
                                 # un-normalized probabilities
    lps <- lps - sum(log(pcum[kim+1]))   # proportionality constants
    LPS <- c(LPS, lps)           # log weight p(s | pi,w)
  }
  return(LPS)
}

ppfSSM <- function(s)
{ ## get PPF for s
  k <- length(unique(s))
  SSM <- rSSM()     # MC sample of M=100 w~p(F) and pi~p(pi | F)
  LPS <- ps.SSM(s, SSM$LW, SSM$PI)   # log p(s | pi,w) for each row of PI and LW
  LPS0 <- LPS - max(LPS)             # normalize
  PS <- exp(LPS0)
  PS <- PS/sum(PS)
  pj <- 0*(1:(k+1))                  # initialize
  M <- nrow(SSM$LW)
  idx <- which(PS > 0.01*1/M)        # more than 1% contribution...
  r <- sum(PS[idx])
  for(m in idx){
    wj <- exp(SSM$LW[m,])
    pi <- SSM$PI[m,]
    w0 <- 1 - sum(wj[pi[1:k]])       # prob of new cluster
    pj <- pj + PS[m]*c(wj[pi[1:k]], w0)
  }
  return(pj/r)
}
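## Example (our addition, not in the original file): compare the two
## PPFs on the small partition s=(1,1,2), i.e., cluster sizes (2,1).
## ppfSSM() is Monte Carlo based, so fix a seed for repeatability.
ex.ppf <- function()
{
  set.seed(2)
  s <- c(1,1,2)
  pj.ssm <- ppfSSM(s)       # SSM(p,G0) PPF; length k+1 = 3
  pj.dp  <- ppfDP(c(2,1))   # DP PPF: c(2,1,alpha0)/(3+alpha0)
  rbind(SSM=pj.ssm, DP=pj.dp)
}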
ppfDP <- function(nj0, alpha=alpha0)
{ ## PPF for DP(alpha,G0)
  n <- sum(nj0)
  pj <- c(nj0, alpha)/(alpha+n)
  return(pj)
}

##################################################################
# posterior inference
##################################################################
## Functions for posterior inference
## gibbs():      does MCMC for data y;
##               returns two matrices with one row per iteration:
##               pjlist = posterior pred simulation
##               slist  = cluster membership indicators
##
## ppred.grid(): evaluate posterior PPF p(y[n+1] | y,s)
##               on a grid over y
## sample.si():  complete conditional p(s[i] | y[i], y[-i], s[-i])
## ppred.mix():  POSTERIOR ppf p(y[n+1] | y,s);
##               can be used for p(s[i] | y[-i], s[-i]);
##               a mixture of t distributions sum pr[j]*t(nu[j]; m[j], v[j])
##               (WAS: described as a mixture of normals)

gibbs <- function(y, n.iter=100, n.batch=10, plt=T, yl=c(0,0.1),
                  ygrid=ygrid0, use.dp=use.dp0)
{
  n <- length(y)
  s <- 1:n
  pjlist <<- NULL
  slist <<- NULL
  for(iter in 1:n.iter){
    for(i in 1:n)
      s[i] <- sample.si(y[i], s[-i], y[-i], use.dp=use.dp)
      # WAS: sample.si(y[i],s[-i],y[-i]), which silently used the
      # global use.dp0 instead of the use.dp argument
    s <- as.numeric(as.factor(s))   # recode cluster numbers
    ## save summaries
    slist <<- rbind(slist, s)
    if (iter %% n.batch == 0){   # computation intensive -- do only in batches
      cat("iter=", iter, "\n")
      pj <- ppred.grid(s, y, ygrid, add=!is.null(pjlist), use.dp=use.dp)
      pjlist <<- rbind(pjlist, pj)
    }
  }
  return(list(pjlist=pjlist, slist=slist))
}

col <- 2

ppred.grid <- function(s, y, ygrid=ygrid0, plt=T, add=T, use.dp=use.dp0)
{ ## returns the posterior predictive on the grid 'ygrid'
  pp <- ppred.mix(s, y, use.dp=use.dp)   # WAS: use.dp=use.dp0 (ignored the argument)
  pj <- 0*ygrid
  pr <- pp$pr/sum(pp$pr)                 # standardize
  for(j in 1:length(pr)){
    z <- (ygrid - pp$mj[j])/pp$vj[j]     # standardize
    pj <- pj + pr[j]*dt(z, df=pp$nuj[j])/pp$vj[j]   # divide by Jacobian for scaling
                                         # WAS: dnorm(ygrid, m=pp$mj[j], ...)
  }
  if(plt){
    if (add)
      lines(ygrid, pj, type="l", col=col)
    else
      plot(ygrid, pj, type="l", xlab="Y", ylab="P", bty="l", ylim=yl0)
  }
  col <<- col+1
  return(pj)
}

## yi <- y[1]; sm <- s[-1]; ym <- y[-1]   # debugging..

sample.si <- function(yi, sm, ym, use.dp=use.dp0)
{ ## sample s[i] ~ p(s[i] | y[i], s[-i], y[-i])
  pp <- ppred.mix(sm, ym, use.dp=use.dp)
  zi <- (yi - pp$mj)/pp$vj
  llik <- dt(zi, df=pp$nuj, log=T) - log(pp$vj)   # subtract log Jacobian for scaling
  lik <- exp(llik - max(llik))
  po <- lik*pp$pr
  si <- sample(pp$jlist, 1, prob=po)
  return(si)
}

## sm <- s[-1]; ym <- y[-1]   # debugging..
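## Helper sketch (our addition, not in the original file): both
## ppred.grid() and sample.si() above evaluate a location-scale t
## density by standardizing and then dividing by the Jacobian of the
## scale transformation. Written out as a stand-alone function:
dst <- function(y, m, v, nu)
{ ## density of y ~ t_nu(m, v): standardize, then divide by Jacobian v
  dt((y-m)/v, df=nu)/v
}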
ppred.mix <- function(sm, ym, use.dp=use.dp0)
{ ## posterior predictive probs
  ## p(y[n+1] | s,y) = sum pr[j]*t(nu[j]; mj[j], vj[j]);
  ## returns (pr,mj,vj,nuj,jlist), vectors of length k+1,
  ## k = # clusters in sm.
  ## use.dp: indicator for using DP(alpha,.) model.
  ## NOTE: B0 = 1/c from the paper.
  ## hyperprior parameters: as0, bs0, B0 are set on top of file
  k <- length(unique(sm))
  ts <- table(sm)
  nj <- c(ts)
  tn <- table(nj)
  nj0 <- as.numeric(names(tn))
  nnj <- c(tn)
  jlist <- c(as.numeric(names(ts)), max(sm)+1)
  if (!use.dp){
    pr <- ppfSSM(sm)               # prior prob for different clusters
    for (h in 1:length(nj0)){      # smooth over equal cluster sizes..
      idx <- which(nj == nj0[h])
      pr[idx] <- mean(pr[idx])
    }
  } else
    pr <- ppfDP(nj)                # prior prob under DP prior
  llik <- 0*pr   # likelihood for s=j
  vj <- 0*pr     # scale par of prediction
  mj <- 0*pr     # location of prediction
  nuj <- 0*pr    # d.f. of prediction
  ## clusters 1...k
  for(j in 1:k){
    idx <- sm==jlist[j]
    yj <- ym[idx]
    mx <- mean(yj)           # average
    sx <- sum((yj-mx)^2)     # ss
    nuj[j] <- as0+nj[j]
    mj[j] <- nj[j]/(nj[j]+B0)*mx   # t location parameter
    K <- (nj[j]+B0+1)/(nj[j]+B0)
    vj[j] <- sqrt( K*(bs0+sx+ 1/(1/B0+1/nj[j])*mx^2)/(nj[j]+as0) )
    ## t-dist scale parameter
  }
  ## new cluster
  mj[k+1] <- 0; nuj[k+1] <- as0; K <- (B0+1)/B0
  vj[k+1] <- sqrt( K*bs0/as0 )
  return(list(mj=mj, vj=vj, nuj=nuj, pr=pr, jlist=jlist))
}

#######################################################
# inference summaries
#######################################################
## various functions to make plots for the paper

plt.pj <- function(pjlist, yl=NULL, hist=F, pts=T)
{ ## plot density estimate
  pjbar <- apply(pjlist, 2, mean)
  matplot(ygrid0, t(pjlist), type="l",
          bty="l", xlab="Y", ylab="P", ylim=yl)
  lines(ygrid0, pjbar, type="l", lwd=3)
  if (hist)
    hist(y0, add=T, prob=T)
  yy <- 0*y0
  if (pts)
    points(y0, yy, pch=17, lwd=2)
}

plt.sij <- function(slist)
{ ## grey-scale image of pairwise co-clustering probabilities
  ## p(s[i]=s[j] | y), estimated by MC frequencies over the saved s
  idx <- order(y0)
  M <- nrow(slist)
  n <- length(y0)
  psij <- matrix(1, nrow=n, ncol=n)
  for(i in 2:n){
    for(j in 1:(i-1)){
      pij <- sum(slist[, idx[i]]==slist[, idx[j]])/M
      psij[i,j] <- pij
      psij[j,i] <- pij
    }
  }
  ## mx <- max(psij)
  ## diag(psij) <- mx
  image(1:n, 1:n, psij, xlab="", ylab="", col=grey((1:10)/10))
}
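## Minimal end-to-end sketch (our addition, not in the original file):
## run a short chain and plot the summaries. The iteration counts here
## are far too small for real inference; they only exercise the code
## path. As in example() above, the global flag use.dp0 is set to match
## the use.dp argument.
ex.small <- function()
{
  set.seed(42)
  use.dp0 <<- T
  fit <- gibbs(y=y0, n.iter=100, n.batch=50, use.dp=T)
  plt.pj(fit$pjlist, yl=yl0)
  plt.sij(fit$slist)
  invisible(fit)
}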