Rel 0.0.1 - Updated runtests.jl

goedman · goedman · commit 3cc6353807c1 · 2021-01-23T12:27:47.000-07:00
diff --git a/old.appveyor.yml b/old.appveyor.yml
diff --git a/old.travis.yml b/old.travis.yml
diff --git a/rest/cvit.jl b/rest/cvit.jl
@@ -0,0 +1,50 @@
+# CVIT - Create itr and itst indices for k-fold-cv
+#
+#    Description
+#     ITR, ITST = cvit(N, K) returns length-K vectors ITR and ITST holding
+#      cross-validation indices for train and test sets respectively.
+#      K-fold division is balanced with all sets having floor(N/K) or
+#      ceil(N/K) elements.
+#
+#     ITR, ITST = cvit(N, K, RS) with RS=true also applies a random
+#      permutation to the indices. Note: the permutation is drawn from a
+#      freshly created, unseeded MersenneTwister, so repeated calls give
+#      different permutations (RS only switches shuffling on or off) and
+#      the global random stream is not used.
+#
+
+
+# Copyright (c) 2010 Aki Vehtari
+
+# This software is distributed under the GNU General Public
+# License (version 2 or later); please refer to the file
+# License.txt, included with the software, for details.
+    
+function cvit(n, k=10, rsubstream=false)
+    # Returns (itr, itst): k train-index vectors and k test-index vectors over 1:n.
+    a = k - rem(n, k)   # number of folds that hold b elements
+    b = div(n, k)       # smaller fold size; the remaining k - a folds hold b + 1
+    itst = Any[]
+    itr = Any[]
+
+    for cvi in 1:a
+        push!(itst, collect(1:b) .+ (cvi - 1) * b)
+        push!(itr, setdiff(1:n, itst[cvi]))
+    end
+    for cvi in (a+1):k
+        # Scalar/vector arithmetic must be broadcast (.+); plain + raises a MethodError.
+        push!(itst, (a * b) .+ collect(1:(b + 1)) .+ (cvi - a - 1) * (b + 1))
+        push!(itr, setdiff(1:n, itst[cvi]))
+    end
+    if rsubstream
+        # Unseeded private RNG; NOTE(review): needs Random loaded by the including file.
+        rng = MersenneTwister()
+        rii = randperm(rng, n)
+        for cvi in 1:k
+            itst[cvi] = rii[itst[cvi]]
+            itr[cvi] = rii[itr[cvi]]
+        end
+    end
+    return itr, itst
+end
+
diff --git a/test/arsenic_logistic.stan b/test/arsenic_logistic.stan
@@ -0,0 +1,40 @@
+// Logistic regression on standardized predictors; stores pointwise log-likelihood in log_lik.
+data {
+  int<lower=0> p;               // number of predictor columns
+  int<lower=0> N;               // number of observations
+  int<lower=0,upper=1> y[N];    // binary outcome; NOTE(review): legacy array syntax, check Stan version
+  matrix[N,p] x;                // raw predictors
+}
+
+transformed data {
+  matrix[N,p] z;                // x with each column centered and scaled by its sd
+  vector[p] mean_x;
+  vector[p] sd_x;
+  for (j in 1:p) {
+    mean_x[j] = mean(col(x,j));
+    sd_x[j] = sd(col(x,j));
+    for (i in 1:N)
+      z[i,j] = (x[i,j] - mean_x[j]) / sd_x[j];
+  }
+}
+
+parameters {
+  real beta0;                   // intercept
+  vector[p] beta;               // slopes on the standardized predictors
+  real<lower=0> phi;            // prior scale shared by all slopes
+}
+
+model {
+  vector[N] eta;                // linear predictor on the logit scale
+  eta = beta0 + z*beta;
+  beta ~ normal(0, phi);
+  phi ~ double_exponential(0, 10);
+  y ~ bernoulli_logit(eta);
+}
+
+generated quantities {
+  vector[N] log_lik;            // log-likelihood of each observation
+  vector[N] eta;
+  eta = beta0 + z*beta;
+  for (i in 1:N) log_lik[i] = bernoulli_logit_lpmf(y[i] | eta[i]);
+}
diff --git a/test/arsenic_logistic_t.stan b/test/arsenic_logistic_t.stan
@@ -0,0 +1,49 @@
+// Logistic regression fit on (x, y); held-out rows (xt, yt) are scored in log_likt.
+data {
+  int<lower=0> p;               // number of predictor columns
+  int<lower=0> N;               // number of training observations
+  int<lower=0,upper=1> y[N];    // training outcome; NOTE(review): legacy array syntax, check Stan version
+  matrix[N,p] x;                // training predictors
+  int<lower=0> Nt;              // number of held-out observations
+  int<lower=0,upper=1> yt[Nt];  // held-out outcome
+  matrix[Nt,p] xt;              // held-out predictors
+}
+transformed data {
+  matrix[N,p] z;
+  matrix[Nt,p] zt;
+  vector[p] mean_x;
+  vector[p] sd_x;
+  for (j in 1:p) {
+    mean_x[j] = mean(col(x,j));
+    sd_x[j] = sd(col(x,j));
+    for (i in 1:N)
+      z[i,j] = (x[i,j] - mean_x[j]) / sd_x[j];
+    // Held-out rows are standardized with the training means and sds.
+    for (i in 1:Nt)
+      zt[i,j] = (xt[i,j] - mean_x[j]) / sd_x[j];
+  }
+}
+parameters {
+  real beta0;                   // intercept
+  vector[p] beta;               // slopes on the standardized predictors
+  real<lower=0> phi;            // prior scale shared by all slopes
+}
+model {
+  vector[N] eta;                // linear predictor on the logit scale
+  eta = beta0 + z*beta;
+  beta ~ normal(0, phi);
+  phi ~ double_exponential(0, 10);
+  y ~ bernoulli_logit(eta);
+}
+
+generated quantities {
+  vector[N] log_lik;            // log-likelihood of each training observation
+  vector[Nt] log_likt;          // log-likelihood of each held-out observation
+  vector[N] eta;
+  vector[Nt] etat;
+  eta = beta0 + z*beta;
+  etat = beta0 + zt*beta;
+  for (i in 1:N) log_lik[i] = bernoulli_logit_lpmf(y[i] | eta[i]);
+  for (i in 1:Nt) log_likt[i] = bernoulli_logit_lpmf(yt[i] | etat[i]);
+}
+
diff --git a/test/cvit.jl b/test/cvit.jl
@@ -0,0 +1,50 @@
+# CVIT - Create itr and itst indices for k-fold-cv
+#
+#    Description
+#     ITR, ITST = cvit(N, K) returns length-K vectors ITR and ITST holding
+#      cross-validation indices for train and test sets respectively.
+#      K-fold division is balanced with all sets having floor(N/K) or
+#      ceil(N/K) elements.
+#
+#     ITR, ITST = cvit(N, K, RS) with RS=true also applies a random
+#      permutation to the indices. Note: the permutation is drawn from a
+#      freshly created, unseeded MersenneTwister, so repeated calls give
+#      different permutations (RS only switches shuffling on or off) and
+#      the global random stream is not used.
+#
+
+
+# Copyright (c) 2010 Aki Vehtari
+
+# This software is distributed under the GNU General Public
+# License (version 2 or later); please refer to the file
+# License.txt, included with the software, for details.
+    
+function cvit(n, k=10, rsubstream=false)
+    # Returns (itr, itst): k train-index vectors and k test-index vectors over 1:n.
+    a = k - rem(n, k)   # number of folds that hold b elements
+    b = div(n, k)       # smaller fold size; the remaining k - a folds hold b + 1
+    itst = Any[]
+    itr = Any[]
+
+    for cvi in 1:a
+        push!(itst, collect(1:b) .+ (cvi - 1) * b)
+        push!(itr, setdiff(1:n, itst[cvi]))
+    end
+    for cvi in (a+1):k
+        # Scalar/vector arithmetic must be broadcast (.+); plain + raises a MethodError.
+        push!(itst, (a * b) .+ collect(1:(b + 1)) .+ (cvi - a - 1) * (b + 1))
+        push!(itr, setdiff(1:n, itst[cvi]))
+    end
+    if rsubstream
+        # Unseeded private RNG; NOTE(review): needs Random loaded by the including file.
+        rng = MersenneTwister()
+        rii = randperm(rng, n)
+        for cvi in 1:k
+            itst[cvi] = rii[itst[cvi]]
+            itr[cvi] = rii[itr[cvi]]
+        end
+    end
+    return itr, itst
+end
+
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,5 +1,11 @@
-using PSIS
-using Base.Test
+using PSIS, StanSample
+using Test
 
-# write your own tests here
-@test 1 == 2
+# Run the Stan-backed demo only when JULIA_CMDSTAN_HOME is set; otherwise report the skip.
+if !haskey(ENV, "JULIA_CMDSTAN_HOME")
+    println("\nJULIA_CMDSTAN_HOME not set. Skipping tests")
+else
+    # ProjDir is assigned at top level; the included script assigns its own ProjDir too.
+    ProjDir = @__DIR__
+    include(joinpath(ProjDir, "test_demo_wells.jl"))
+end
diff --git a/test/test_demo_wells.jl b/test/test_demo_wells.jl
@@ -0,0 +1,106 @@
+using StatisticalRethinking
+using JSON
+using StanSample
+using PSIS
+#using Statistics
+using Printf
+#using StatsPlots
+
+ProjDir = @__DIR__
+
+include(joinpath(ProjDir, "cvit.jl"))
+
+# Data: wells dataset; y is the "switched" outcome, x holds the arsenic and dist columns
+data = JSON.parsefile(joinpath(ProjDir, "wells.data.json"))
+y = Float64.(data["switched"])
+x = Float64[data["arsenic"]  data["dist"]]
+n, m = size(x)   # n rows (passed to Stan as N), m columns (passed as p)
+
+# Model (read the Stan source by path so that no file handle is left open)
+model_str = read(joinpath(ProjDir, "arsenic_logistic.stan"), String)
+tmpdir = joinpath(ProjDir, "tmp")   # not referenced again in this script
+sm1 = SampleModel("arsenic_logistic", model_str)
+
+data1 = (p = m, N = n, y = Int.(y), x = x)
+# Fit the model in Stan
+rc1 = stan_sample(sm1; data=data1)
+if success(rc1)
+    nt1 = read_samples(sm1)
+    # NOTE(review): std is used below but `using Statistics` is commented out above; verify.
+    # Compute LOO and standard error
+    log_lik = nt1.log_lik'
+    loo, loos, pk = psisloo(log_lik)
+    elpd_loo = sum(loos)
+    se_elpd_loo = std(loos) * sqrt(n)
+    @printf(">> elpd_loo = %.1f, SE(elpd_loo) = %.1f\n", elpd_loo, se_elpd_loo)
+
+    # Check the shape parameter k of the generalized Pareto distribution
+    if all(pk .< 0.5)
+        println("All Pareto k estimates OK (k < 0.5)")
+    else
+        pkn1 = sum((pk .>= 0.5) .& (pk .< 1))   # elementwise AND needs the dotted .&
+        pkn2 = sum(pk .>= 1)
+        @printf(">> %d (%.0f%%) PSIS Pareto k estimates between 0.5 and 1\n", pkn1, pkn1/n*100)
+        @printf(">> %d (%.0f%%) PSIS Pareto k estimates greater than 1\n", pkn2, pkn2/n*100)
+    end
+end
+
+# Fit a second model, using log(arsenic) instead of arsenic
+x2 = Float64[log.(data["arsenic"])  data["dist"]]
+
+# Same SampleModel (sm1) as the first fit; only the predictor matrix changes
+data2 = (p = m, N = n, y = Int.(y), x = x2)
+# Fit the model in Stan
+rc2 = stan_sample(sm1; data=data2)
+
+if success(rc2)
+    nt2 = read_samples(sm1)
+    # Compute LOO and standard error
+    log_lik = nt2.log_lik'
+    loo2, loos2, pk2 = psisloo(log_lik)
+    elpd_loo = sum(loos2)
+    se_elpd_loo = std(loos2) * sqrt(n)
+    @printf(">> elpd_loo = %.1f, SE(elpd_loo) = %.1f\n", elpd_loo, se_elpd_loo)
+
+    # Check the shape parameter k of this fit (pk2, not the first model's pk)
+    if all(pk2 .< 0.5)
+        println("All Pareto k estimates OK (k < 0.5)")
+    else
+        pkn1 = sum((pk2 .>= 0.5) .& (pk2 .< 1))   # elementwise AND needs the dotted .&
+        pkn2 = sum(pk2 .>= 1)
+        @printf(">> %d (%.0f%%) PSIS Pareto k estimates between 0.5 and 1\n", pkn1, pkn1/n*100)
+        @printf(">> %d (%.0f%%) PSIS Pareto k estimates greater than 1\n", pkn2, pkn2/n*100)
+    end
+end
+
+if success(rc1) && success(rc2)
+    ## Compare the models: pointwise elpd differences (model 1 minus model 2)
+    loodiff = loos - loos2
+    @printf("elpd_diff = %.1f, SE(elpd_diff) = %.1f\n",sum(loodiff), std(loodiff) * sqrt(n))
+end
+
+## k-fold-CV
+# k-fold-CV should be used if several khats>0.5
+# in this case it is not needed, but provided as an example
+
+model_str = read(joinpath(ProjDir, "arsenic_logistic_t.stan"), String)   # no open handle left behind
+sm3 = SampleModel("arsenic_logistic_t", model_str);
+
+cvitr, cvitst = cvit(n, 10, true)
+kfcvs = similar(loos)   # uninitialized; needs loos from the first fit, filled per fold below
+for cvi in 1:3          # only the first 3 of the 10 folds are fitted here
+    @printf("%d\n", cvi)
+    # Train on this fold's training rows; its held-out rows are passed as xt / yt.
+    standatacv = (p = m, N = length(cvitr[cvi]), Nt = length(cvitst[cvi]),
+        x = x[cvitr[cvi],:], y = Int.(y[cvitr[cvi]]),
+        xt = x[cvitst[cvi],:], yt = Int.(y[cvitst[cvi]]))
+
+    # Fit the model in Stan
+    rc3 = stan_sample(sm3; data=standatacv)
+    if success(rc3)
+        nt3 = read_samples(sm3)
+        # Held-out log predictive density for this fold; TODO confirm PSIS.logsumexp's reduction axis
+        log_likt = nt3.log_likt'
+        kfcvs[cvitst[cvi]] = PSIS.logsumexp(log_likt) .- log(size(log_likt, 1))
+    end
+end
diff --git a/test/wells.data.json b/test/wells.data.json