rft: make gamma_inc, gamma_inc_inv GPU-compatible (#514)

haakon-e · web-flow · commit d71eddb39ffd · 2026-02-11T21:47:03.000+01:00
- replace `Vector`s by statically sized `Tuple`s - update type restriction in `chepolsum` - use Base.@nif to statically unroll findfirst into if/elseif/else chain at parse time - replace interpolated strings in errors with LazyString - manually inline single recursion step in auxgam; GPU compilers cannot statically prove termination - add tests that checks these functions are inferrable and do not allocation memory
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "SpecialFunctions"
 uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
-version = "2.7.0"
+version = "2.7.1"
 
 [deps]
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
diff --git a/src/bessel.jl b/src/bessel.jl
@@ -7,7 +7,7 @@ struct AmosException <: Exception
 end
 
 function Base.showerror(io::IO, ex::AmosException)
-    print(io, "AmosException with id $(ex.id): ")
+    print(io, LazyString("AmosException with id ", ex.id, ": "))
     if ex.id == 0
         print(io, "normal return, computation complete.")
     elseif ex.id == 1
diff --git a/src/expint.jl b/src/expint.jl
@@ -52,7 +52,7 @@ macro E₁_cf64(x, n::Integer)
 end
 
 function E₁_taylor_coefficients(::Type{T}, n::Integer) where {T<:Number}
-    n < 0 && throw(ArgumentError("$n ≥ 0 is required"))
+    n < 0 && throw(ArgumentError(LazyString(n, " ≥ 0 is required")))
     n == 0 && return T[]
     n == 1 && return T[-eulergamma]
     # iteratively compute the terms in the series, starting with k=1
diff --git a/src/gamma_inc.jl b/src/gamma_inc.jl
@@ -1,25 +1,25 @@
 using Base.MPFR: ROUNDING_MODE
 #useful constants
-const acc0 = [5.0e-15, 5.0e-7, 5.0e-4] #accuracy options
-const big1 = [25.0, 14.0, 10.0]
-const e0 = [0.25e-3, 0.25e-1, 0.14]
-const x0 = [31.0, 17.0, 9.7]
+const acc0 = (5.0e-15, 5.0e-7, 5.0e-4) #accuracy options
+const big1 = (25.0, 14.0, 10.0)
+const e0 = (0.25e-3, 0.25e-1, 0.14)
+const x0 = (31.0, 17.0, 9.7)
 const alog10 = log(10)
 const rt2pin = Float64(invsqrt2π)
 const rtpi = Float64(sqrtπ)
-const stirling_coef = [1.996379051590076518221, -0.17971032528832887213e-2, 0.131292857963846713e-4, -0.2340875228178749e-6, 0.72291210671127e-8, -0.3280997607821e-9, 0.198750709010e-10, -0.15092141830e-11, 0.1375340084e-12, -0.145728923e-13, 0.17532367e-14, -0.2351465e-15, 0.346551e-16, -0.55471e-17, 0.9548e-18, -0.1748e-18, 0.332e-19, -0.58e-20]
-const auxgam_coef = [-1.013609258009865776949, 0.784903531024782283535e-1, 0.67588668743258315530e-2, -0.12790434869623468120e-2, 0.462939838642739585e-4, 0.43381681744740352e-5, -0.5326872422618006e-6, 0.172233457410539e-7, 0.8300542107118e-9, -0.10553994239968e-9, 0.39415842851e-11, 0.362068537e-13, -0.107440229e-13, 0.5000413e-15, -0.62452e-17, -0.5185e-18, 0.347e-19, -0.9e-21]
+const stirling_coef = (1.996379051590076518221, -0.17971032528832887213e-2, 0.131292857963846713e-4, -0.2340875228178749e-6, 0.72291210671127e-8, -0.3280997607821e-9, 0.198750709010e-10, -0.15092141830e-11, 0.1375340084e-12, -0.145728923e-13, 0.17532367e-14, -0.2351465e-15, 0.346551e-16, -0.55471e-17, 0.9548e-18, -0.1748e-18, 0.332e-19, -0.58e-20)
+const auxgam_coef = (-1.013609258009865776949, 0.784903531024782283535e-1, 0.67588668743258315530e-2, -0.12790434869623468120e-2, 0.462939838642739585e-4, 0.43381681744740352e-5, -0.5326872422618006e-6, 0.172233457410539e-7, 0.8300542107118e-9, -0.10553994239968e-9, 0.39415842851e-11, 0.362068537e-13, -0.107440229e-13, 0.5000413e-15, -0.62452e-17, -0.5185e-18, 0.347e-19, -0.9e-21)
 
 #----------------COEFFICIENTS FOR TEMME EXPANSION------------------
 
 const d00 = -.333333333333333E+00
-const d0  = [.833333333333333E-01, -.148148148148148E-01, .115740740740741E-02, .352733686067019E-03, -.178755144032922E-03, .391926317852244E-04]
+const d0  = (.833333333333333E-01, -.148148148148148E-01, .115740740740741E-02, .352733686067019E-03, -.178755144032922E-03, .391926317852244E-04)
 const d10 = -.185185185185185E-02
-const d1  = [-.347222222222222E-02, .264550264550265E-02, -.990226337448560E-03, .205761316872428E-03]
+const d1  = (-.347222222222222E-02, .264550264550265E-02, -.990226337448560E-03, .205761316872428E-03)
 const d20 = .413359788359788E-02
-const d2  = [-.268132716049383E-02, .771604938271605E-03]
+const d2  = (-.268132716049383E-02, .771604938271605E-03)
 const d30 = .649434156378601E-03
-const d3  =[.229472093621399E-03, -.469189494395256E-03]
+const d3  = (.229472093621399E-03, -.469189494395256E-03)
 const d40 = -.861888290916712E-03
 const d4  = .784039221720067E-03
 const d50 = -.336798553366358E-03
@@ -88,11 +88,13 @@ Compute function `g` in ``1/\Gamma(x+1) = 1 + x (x-1) g(x)``, `-1 <= x <= 1`.
 """
 function auxgam(x::Float64)
     @assert -1 <= x <= 1
-    if x < 0
-        return -(1.0 + (1.0 + x)*(1.0 + x)*auxgam(1.0 + x))/(1.0 - x)
+    xp = ifelse(x < 0, 1 + x, x)
+    t = 2xp - 1
+    cheb = chepolsum(t, auxgam_coef)
+    return if x < 0
+        - (1 + xp * xp * cheb) / (1 - x)
     else
-        t = 2*x - 1.0
-        return chepolsum(t, auxgam_coef)
+        cheb
     end
 end
 
@@ -111,7 +113,7 @@ end
 
 Computes a series of Chebyshev Polynomials given by: `a[1]/2 + a[2]T1(x) + .... + a[n]T{n-1}(X)`.
 """
-function chepolsum(x::Float64, a::Array{Float64,1})
+function chepolsum(x::Float64, a::Tuple{Float64,Vararg{Float64}})
     n = length(a)
     if n == 1
         return a[1]/2.0
@@ -471,7 +473,8 @@ function gamma_inc_asym(a::Float64, x::Float64, ind::Integer)
     ts = cumprod(ntuple(i -> (a - i) / x, Val(21)))
 
     # sum the smaller terms directly
-    first_small_t = something(findfirst(x -> abs(x) < 1.0e-3, ts), 21)
+    # Unrolled findfirst: finds the first index i in 1:21 where abs(ts[i]) < 1e-3, defaulting to 21
+    first_small_t = Base.@nif 21 (i -> abs(ts[i]) < 1e-3) (i -> i) (i -> 21)
     sm = t = ts[first_small_t]
     amn = a - first_small_t
     while abs(t) ≥ acc
@@ -998,7 +1001,7 @@ end
 # floating point numbers of the same type
 function _gamma_inc_inv(a::T, p::T, q::T) where {T<:Real}
     if p + q != 1
-        throw(ArgumentError("p + q must equal one but is $(p + q)"))
+        throw(ArgumentError(LazyString("p + q must equal one but is ", p + q)))
     end
 
     if iszero(p)
diff --git a/test/gamma_inc.jl b/test/gamma_inc.jl
@@ -277,3 +277,53 @@ end
     @test loggamma(7, -300.2) ≅ log(gamma(7, -300.2))
     @test_throws DomainError loggamma(6, -3.2)
 end
+
+@testset "GPU compatibility ($FT)" for FT in (Float64, Float32, Float16)
+    # Note: This test is a proxy for GPU compatibility by checking that the functions
+    # are type stable and do not allocate memory. It does not launch any GPU kernels.
+    @testset "gamma_inc type stability" begin
+        @test @inferred(gamma_inc(FT(30.0), FT(29.99999), 0)) isa Tuple{FT,FT}
+    end
+    @testset "gamma_inc_inv type stability" begin
+        @test @inferred(gamma_inc_inv(FT(1.0), FT(0.01), FT(0.99))) isa FT
+    end
+
+    @testset "gamma_inc allocations" begin
+        # `@allocated` checks for allocations for specific code paths
+        ## a >= 1
+        ### gamma_inc_temme_1: simplified Temme expansion
+        @test iszero((FT -> @allocated(gamma_inc(FT(30.0), FT(29.99999), 0)))(FT))
+        ### gamma_inc_minimax: minimax approximation
+        @test iszero((FT -> @allocated(gamma_inc(FT(100.0), FT(80.0), 0)))(FT))
+        ### gamma_inc_temme: Temme expansion
+        @test iszero((FT -> @allocated(gamma_inc(FT(100.0), FT(80.0), 1)))(FT))
+        ### gamma_inc_cf: Continued fraction
+        @test iszero((FT -> @allocated(gamma_inc(FT(1.7), FT(2.5))))(FT))
+        ### gamma_inc_taylor: Taylor series
+        @test iszero((FT -> @allocated(gamma_inc(FT(11.1), FT(0.001))))(FT))
+        ### gamma_inc_asym: Asymptotic expansion
+        @test iszero((FT -> @allocated(gamma_inc(FT(10.0), FT(35.0))))(FT))
+        ### gamma_inc_fsum: Finite sums
+        @test iszero((FT -> @allocated(gamma_inc(FT(24.0), FT(25))))(FT))
+        ## a==0.5
+        ### erfc
+        @test iszero((FT -> @allocated(gamma_inc(FT(0.5), FT(0.5))))(FT))
+        ## x < 1.1
+        ### gamma_inc_taylor_x
+        @test iszero((FT -> @allocated(gamma_inc(FT(0.9), FT(0.8))))(FT))
+        ## else
+        ### gamma_inc_cf
+        @test iszero((FT -> @allocated(gamma_inc(FT(0.7), FT(2.5))))(FT))
+    end
+
+    @testset "gamma_inc_inv allocations" begin
+        # `@allocated` checks for allocations for specific code paths
+        ## x0 approximation paths
+        ### gamma_inc_inv_psmall
+        @test iszero((FT -> @allocated(gamma_inc_inv(FT(1.0), FT(0.01), FT(0.99))))(FT))
+        ### gamma_inc_inv_qsmall
+        @test iszero((FT -> @allocated(gamma_inc_inv(FT(5.0), FT(0.99), FT(0.01))))(FT))
+        ### gamma_inc_inv_alarge
+        @test iszero((FT -> @allocated(gamma_inc_inv(FT(50.0), FT(0.3), FT(0.7))))(FT))
+    end
+end
diff --git a/test/qa.jl b/test/qa.jl
@@ -45,6 +45,7 @@ end
             :ROUNDING_MODE, # Base.MPFR
             :_fact_table64, # Base
             :version, # Base.MPFR
+            Symbol("@nif"), # Base
             (VERSION < v"1.11" ? (:depwarn,) : ())..., # Base
         ),
     ) === nothing