GPU support #52

Vilin97 · 2023-05-09T20:57:32Z

What would it take to make hcubature GPU-friendly? In the sense that there is no scalar indexing. Is that even possible?

MWE:

using HCubature, CUDA
CUDA.allowscalar(false)
hcubature(x -> sum(abs2, x), CuArray(fill(-3, 3)), CuArray(fill(3, 3))) # ERROR: Scalar indexing is disallowed.

The text was updated successfully, but these errors were encountered:

stevengj · 2023-05-10T23:45:07Z

Why do you need to store the integrand bounds as CuArrays? In costly multidimensional integration, isn't the cost of evaluating the integrand usually the dominant factor (in which case you could evaluate the integrand on the GPU).

Vilin97 · 2023-05-11T16:00:32Z

@stevengj , how do I tell Julia to evaluate the integrand on the GPU though? I thought that by passing CuArray for the bounds it would dispatch to the GPU. Could you use my MWE to demonstrate how to do the integration on the GPU?

Thank you!

HyperSphereStudio · 2024-10-14T23:10:07Z

Here is my variant of HCubature.jl with GPU and Unitful support (large numeric simulation library). Note that it only implements the cubature rule

using QuadGK, Combinatorics, Adapt, StaticArrays, Unitful

abstract type AbstractBoxedIntegral{N} end

struct GaussKronrod{T <: Real, VT <: AbstractVector{T}} <: AbstractBoxedIntegral{1}
    x::VT
    w::VT
    wg::VT
end
GaussKronrod(::Type{T}) where {T<:Real} = GaussKronrod(QuadGK.cachedrule(T, 7)...)
countevals(g::GaussKronrod) = 15
Adapt.adapt(to, gk::GaussKronrod) = GaussKronrod(adapt(to, gk.x), adapt(to, gk.w), adapt(to, gk.wg))

function (g::GaussKronrod{T})(f::F, a_, b_, norm=norm) where {F, T}
    a = a_[Int32(1)]
    b = b_[Int32(1)]
    c = (a+b)*T(0.5)
    Δ = (b-a)*T(0.5)

    fx⁰ = f(SVector(c))                # f(0)
    I = fx⁰ * g.w[end]
    I′ = fx⁰ * g.wg[end]
    @inbounds for i = Int32(1):(Int32(length(g.x)) - Int32(1))
        Δx = Δ * g.x[i]
        fx = f(SVector(c + Δx)) + f(SVector(c - Δx))
        I += fx * g.w[i]
        if iseven(i)
            I′ += fx * g.wg[i>>1]
        end
    end
    I *= Δ
    I′ *= Δ
    return I, norm(I - I′), 1
end

# trivial rule for 0-dimensional integrals
struct Trivial <: AbstractBoxedIntegral{0}; end
function (::Trivial)(f, a::SVector{0}, b::SVector{0}, norm)
    I = f(a)
    return I, norm(I - I), 1
end

countevals(::Trivial) = 1

cubrule(v::Val{N}, ::Type{T}) where {N, T<:Real} = GenzMalik(v, T)
cubrule(::Val{1}, ::Type{T}) where {T<:Real} = GaussKronrod(T)
cubrule(::Val{0}, ::Type{T}) where {T<:Real} = Trivial()

# implementation of the n-dimensional cubature rule (n ≥ 2) from
# A. C. Genz and A. A. Malik, "An adaptive algorithm for numeric
# integration over an N-dimensional rectangular region," J. Comput. Appl. Math.,
# vol. 6 (no. 4), 295-302 (1980).

"""
    combos(k, λ, Val{n}())

Return an array of SVector{n} of all n-component vectors
with k components equal to λ and other components equal to zero.
"""
function combos(k::Integer, λ::T, ::Val{n}) where {n, T}
    combos = Combinatorics.combinations(1:n, k)
    p = Vector{SVector{n,T}}(undef, length(combos))
    v = similar(SVector{n,T})
    for (i,c) in enumerate(combos)
        v .= zero(T)
        v[c] .= λ
        p[i] = v
    end
    return p
end

"""
    signcombos(k, λ, Val{n}())

Return an array of SVector{n} of all n-component vectors
with k components equal to ±λ and other components equal to zero
(with all possible signs).
"""
function signcombos(k::Integer, λ::T, ::Val{n}) where {n, T<:Number}
    combos = Combinatorics.combinations(1:n, k)
    twoᵏ = 1 << k
    p = Vector{SVector{n,T}}(undef, length(combos) * twoᵏ)
    v = similar(SVector{n,T})
    for (i,c) in enumerate(combos)
        j = (i-1)*twoᵏ + 1
        v .= zero(T)
        v[c] .= λ
        p[j] = v
        # use a gray code to flip one sign at a time
        graycode = 0
        for s = 1:twoᵏ-1
            graycode′ = s ⊻ (s >> 1)
            graycomp = c[trailing_zeros(graycode ⊻ graycode′) + 1]
            graycode = graycode′
            v[graycomp] = -v[graycomp]
            p[j+s] = v
        end
    end
    return p
end

"""
`GenzMalik{n,T}` holds the points and weights corresponding
to an `n`-dimensional Genz-Malik cubature rule over coordinates
of type `T`.
"""
struct GenzMalik{N, T <: Real, XVT} <: AbstractBoxedIntegral{N}
    p::NTuple{4, XVT}  # points for the last 4 G-M weights
    w::NTuple{5, T}    # weights for the 5 terms in the G-M rule
    w′::NTuple{4, T}   # weights for the embedded lower-degree rule

    GenzMalik(::Val{N}, p::NTuple{4, XVT}, w::NTuple{5, T}, w′::NTuple{4, T}) where {N, T<:Real, XVT} = new{N, T, XVT}(p, w, w′)
end

Adapt.adapt(to, gm::GenzMalik{N}) where N = GenzMalik(Val(N), adapt(to, gm.p), adapt(to, gm.w), adapt(to, gm.w′))

# internal code to construct n-dimensional Genz-Malik rule for coordinates of type `T`.
function _GenzMalik(v::Val{n}, ::Type{T}) where {n, T}
    n < 2 && throw(ArgumentError("invalid dimension $n: GenzMalik rule requires dimension > 2"))

    λ₄ = T(sqrt(9/10))
    λ₂ = T(sqrt(9/70))
    λ₃ = λ₄
    λ₅ = T(sqrt(9/19))

    twoⁿ = 1 << n
    w₁ = twoⁿ * T((12824 - 9120n + 400n^2) / 19683)
    w₂ = twoⁿ * T(980 / 6561)
    w₃ = twoⁿ * T((1820 - 400n) / 19683)
    w₄ = twoⁿ * T(200 / 19683)
    w₅ = T(6859/19683)
    w₄′ = twoⁿ * T(25/729)
    w₃′ = twoⁿ * T((265 - 100n)/1458)
    w₂′ = twoⁿ * T(245/486)
    w₁′ = twoⁿ * T((729 - 950n + 50n^2)/729)

    p₂ = combos(1, λ₂, v)
    p₃ = combos(1, λ₃, v)
    p₄ = signcombos(2, λ₄, v)
    p₅ = signcombos(n, λ₅, v)

    return GenzMalik(v, (p₂,p₃,p₄,p₅), (w₁,w₂,w₃,w₄,w₅), (w₁′,w₂′,w₃′,w₄′))
end

"""
    GenzMalik(Val{n}(), T=Float64)

Construct an n-dimensional Genz-Malik rule for coordinates of type `T`.
"""
GenzMalik(v::Val{n}, ::Type{T}) where {n, T <: Real} = _GenzMalik(v, T)

countevals(::Type{<:GenzMalik{n}}) where {n} = 1 + 4n + 2*n*(n-1) + (1<<n)

"""
    genzmalik(f, a, b, norm=norm)

Evaluate `genzmalik::GenzMalik` for the box with min/max corners `a` and `b`
for an integrand `f`.  Returns the estimated integral `I`, the estimated
error `E` (via the given `norm`), and the suggested coordinate `k` ∈ `1:n`
to subdivide next.
"""
function (g::GenzMalik{n, T})(f::F, a, b, norm=norm) where {F, n, T}
    c = T(.5).*(a.+b)
    Δ = T(.5).*(b.-a)        #Multiplied by dimensionless to get x -> units of x

    #Hardcoded for ease of type stability
    V = Δ[Int32(1)]    #Multiplied by the weighting -> x (units of this is x)
    for i in Int32(2):Int32(n) V *= ustrip(Δ[i]) end

    f₁ = f(c)
    f₂ = zero(f₁)
    f₃ = zero(f₁)

    maxdivdiff = zero(norm(f₁))
    divdiff = similar(SVector{n, typeof(maxdivdiff)})
    for i = Int32(1):Int32(n)
        p₂ = Δ .* g.p[Int32(1)][i]

        f₂ᵢ = f(c .+ p₂) + f(c .- p₂)
        p₃ = Δ .* g.p[Int32(2)][i]
        f₃ᵢ = f(c .+ p₃) + f(c .- p₃)
        f₂ += f₂ᵢ
        f₃ += f₃ᵢ
        # fourth divided difference: f₃ᵢ-2f₁ - 7*(f₂ᵢ-2f₁),
        # where 7 = (λ₃/λ₂)^2 [see van Dooren and de Ridder]
        divdiff[i] = norm(f₃ᵢ + 12f₁ - 7*f₂ᵢ)
    end

    f₄ = zero(f₁)
    for p in g.p[Int32(3)]
        f₄ += f(c .+ Δ .* p)
    end

    f₅ = zero(f₁)
    for p in g.p[Int32(4)]
        f₅ += f(c .+ Δ .* p)
    end

    I = V * (g.w[Int32(1)]*f₁ + g.w[Int32(2)]*f₂ + g.w[Int32(3)]*f₃ + g.w[Int32(4)]*f₄ + g.w[Int32(5)]*f₅)
    I′ = V * (g.w′[Int32(1)]*f₁ + g.w′[Int32(2)]*f₂ + g.w′[Int32(3)]*f₃ + g.w′[Int32(4)]*f₄)
    E = norm(I - I′)

    # choose axis
    kdivide = Int32(1)
    δf = E / (Int32(10)^n * V)
    for i = Int32(1):Int32(n)
        if (δ = divdiff[i] - maxdivdiff) > δf
            kdivide = i
            maxdivdiff = divdiff[i]
        elseif abs(δ) <= δf && abs(Δ[i]) > abs(Δ[kdivide])
            kdivide = i
        end
    end

    return I, E, kdivide
end


end

On the CPU

ST = Float32
intgr_cpu = cubrule(Val(3), ST)
cpu = map(i -> intgr_cpu(x -> i*Vec3(x[1]^2, x[2]^2, x[3]^2), (0,0,0).*u"V", (1,1,1).*u"V"), 1:5)

Which outputs on the cpu

5-element Vector{Tuple{Vec3{Quantity{Float32, 𝐋 ^6 𝐌 ^3 𝐈 ^-3 𝐓 ^-9, Unitful.FreeUnits{(V^3,), 𝐋 ^6 𝐌 ^3 𝐈 ^-3 𝐓 ^-9, nothing}}}, Quantity{Float32, 𝐋 ^6 𝐌 ^3 𝐈 ^-3 𝐓 ^-9, Unitful.FreeUnits{(V^3,), 𝐋 ^6 𝐌 ^3 𝐈 ^-3 𝐓 ^-9, nothing}}, Int32}}:
 ([0.33333334f0 V^3, 0.33333337f0 V^3, 0.33333337f0 V^3], 8.940697f-8 V^3, 1)
 ([0.6666667f0 V^3, 0.66666675f0 V^3, 0.66666675f0 V^3], 1.7881393f-7 V^3, 1)
 ([1.0f0 V^3, 1.0f0 V^3, 1.0f0 V^3], 0.0f0 V^3, 1)
 ([1.3333334f0 V^3, 1.3333335f0 V^3, 1.3333335f0 V^3], 3.5762787f-7 V^3, 1)
 ([1.6666667f0 V^3, 1.6666666f0 V^3, 1.6666667f0 V^3], 1.6858739f-7 V^3, 1)

Some Benchmarking

julia> @benchmark map(i -> intg(x -> i*Vec3(x[1]^2, x[2]^2, x[3]^2), (0,0,0).*u"V", (1,1,1).*u"V"), vec) setup=(vec=1:10000; intg=cubrule(Val(3), Float32))
BenchmarkTools.Trial: 4761 samples with 1 evaluation.
 Range (min … max):  740.500 μs …   6.673 ms  ┊ GC (min … max): 0.00% … 86.45%
 Time  (median):     892.800 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):     1.042 ms ± 495.179 μs  ┊ GC (mean ± σ):  1.91% ±  5.40%

   ▃▁▄█▅▄▂▂▁     ▁
  ▅████████████▇▇█▇▇▇▇▇▇▇▆▆▅▆▆▅▅▆▅▄▅▄▅▅▅▄▅▄▅▄▅▄▄▆▄▄▄▅▅▄▄▅▇█▆▄▅▅ █
  740 μs        Histogram: log(frequency) by time       2.85 ms <

 Memory estimate: 195.36 KiB, allocs estimate: 2.

On the GPU

ST = Float32
intgr_cpu = cubrule(Val(3), ST)
intgr_gpu = adapt(CuArray, intgr_cpu)
gpu = map(i -> intgr_gpu(x -> i*Vec3(x[1]^2, x[2]^2, x[3]^2), (0,0,0).*u"V", (1,1,1).*u"V"), CuArray(1:5))

5-element CuArray{Tuple{Vec3{Quantity{Float32, 𝐋 ^6 𝐌 ^3 𝐈 ^-3 𝐓 ^-9, Unitful.FreeUnits{(V^3,), 𝐋 ^6 𝐌 ^3 𝐈 ^-3 𝐓 ^-9, nothing}}}, Quantity{Float32, 𝐋 ^6 𝐌 ^3 𝐈 ^-3 𝐓 ^-9, Unitful.FreeUnits{(V^3,), 𝐋 ^6 𝐌 ^3 𝐈 ^-3 𝐓 ^-9, nothing}}, Int32}, 1, CUDA.DeviceMemory}:
 ([0.33333334f0 V^3, 0.33333334f0 V^3, 0.33333334f0 V^3], 5.1619136f-8 V^3, 1)
 ([0.6666667f0 V^3, 0.6666667f0 V^3, 0.6666667f0 V^3], 1.0323827f-7 V^3, 1)
 ([1.0f0 V^3, 1.0f0 V^3, 1.0f0 V^3], 4.1295309f-7 V^3, 1)
 ([1.3333334f0 V^3, 1.3333334f0 V^3, 1.3333334f0 V^3], 2.0647654f-7 V^3, 1)
 ([1.6666667f0 V^3, 1.6666666f0 V^3, 1.6666667f0 V^3], 1.6858739f-7 V^3, 1)

Some Benchmarking

julia> @benchmark map(i -> int(x -> i*Vec3(x[1]^2, x[2]^2, x[3]^2), (0,0,0).*u"V", (1,1,1).*u"V"), vec) setup=(vec=CuArray(1:10000); int=adapt(CuArray, cubrule(Val(3), Float32)))
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  11.100 μs … 431.700 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     14.100 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   19.996 μs ±  18.620 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▅█▇▆▄▃▂▂▁      ▁▁  ▁▂▃▃▃▁▁▁                                  ▂
  ████████████████████████████▇▇▇▆▆▆▆▆▅▅▅▆▃▅▂▃▃▄▅▅▄▃▄▂▅▄▄▄▂▄▄▅ █
  11.1 μs       Histogram: log(frequency) by time      88.1 μs <

 Memory estimate: 2.47 KiB, allocs estimate: 105.

GPU shows decent speedup!

Feel free to use as needed!

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

GPU support #52

GPU support #52

Vilin97 commented May 9, 2023

stevengj commented May 10, 2023

Vilin97 commented May 11, 2023

HyperSphereStudio commented Oct 14, 2024 •

edited

Loading

GPU support #52

GPU support #52

Comments

Vilin97 commented May 9, 2023

stevengj commented May 10, 2023

Vilin97 commented May 11, 2023

HyperSphereStudio commented Oct 14, 2024 • edited Loading

On the CPU

On the GPU

HyperSphereStudio commented Oct 14, 2024 •

edited

Loading