-
Notifications
You must be signed in to change notification settings - Fork 25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GPU support #52
Comments
Why do you need to store the integrand bounds as |
@stevengj , how do I tell Julia to evaluate the integrand on the GPU though? I thought that by passing CuArray for the bounds it would dispatch to the GPU. Could you use my MWE to demonstrate how to do the integration on the GPU? Thank you! |
Here is my variant of HCubature.jl with GPU and Unitful support (large numeric simulation library). Note that it only implements the cubature rule using QuadGK, Combinatorics, Adapt, StaticArrays, Unitful
abstract type AbstractBoxedIntegral{N} end
struct GaussKronrod{T <: Real, VT <: AbstractVector{T}} <: AbstractBoxedIntegral{1}
x::VT
w::VT
wg::VT
end
GaussKronrod(::Type{T}) where {T<:Real} = GaussKronrod(QuadGK.cachedrule(T, 7)...)
countevals(g::GaussKronrod) = 15
Adapt.adapt(to, gk::GaussKronrod) = GaussKronrod(adapt(to, gk.x), adapt(to, gk.w), adapt(to, gk.wg))
function (g::GaussKronrod{T})(f::F, a_, b_, norm=norm) where {F, T}
a = a_[Int32(1)]
b = b_[Int32(1)]
c = (a+b)*T(0.5)
Δ = (b-a)*T(0.5)
fx⁰ = f(SVector(c)) # f(0)
I = fx⁰ * g.w[end]
I′ = fx⁰ * g.wg[end]
@inbounds for i = Int32(1):(Int32(length(g.x)) - Int32(1))
Δx = Δ * g.x[i]
fx = f(SVector(c + Δx)) + f(SVector(c - Δx))
I += fx * g.w[i]
if iseven(i)
I′ += fx * g.wg[i>>1]
end
end
I *= Δ
I′ *= Δ
return I, norm(I - I′), 1
end
# trivial rule for 0-dimensional integrals
struct Trivial <: AbstractBoxedIntegral{0}; end
function (::Trivial)(f, a::SVector{0}, b::SVector{0}, norm)
I = f(a)
return I, norm(I - I), 1
end
countevals(::Trivial) = 1
cubrule(v::Val{N}, ::Type{T}) where {N, T<:Real} = GenzMalik(v, T)
cubrule(::Val{1}, ::Type{T}) where {T<:Real} = GaussKronrod(T)
cubrule(::Val{0}, ::Type{T}) where {T<:Real} = Trivial()
# implementation of the n-dimensional cubature rule (n ≥ 2) from
# A. C. Genz and A. A. Malik, "An adaptive algorithm for numeric
# integration over an N-dimensional rectangular region," J. Comput. Appl. Math.,
# vol. 6 (no. 4), 295-302 (1980).
"""
combos(k, λ, Val{n}())
Return an array of SVector{n} of all n-component vectors
with k components equal to λ and other components equal to zero.
"""
function combos(k::Integer, λ::T, ::Val{n}) where {n, T}
combos = Combinatorics.combinations(1:n, k)
p = Vector{SVector{n,T}}(undef, length(combos))
v = similar(SVector{n,T})
for (i,c) in enumerate(combos)
v .= zero(T)
v[c] .= λ
p[i] = v
end
return p
end
"""
signcombos(k, λ, Val{n}())
Return an array of SVector{n} of all n-component vectors
with k components equal to ±λ and other components equal to zero
(with all possible signs).
"""
function signcombos(k::Integer, λ::T, ::Val{n}) where {n, T<:Number}
combos = Combinatorics.combinations(1:n, k)
twoᵏ = 1 << k
p = Vector{SVector{n,T}}(undef, length(combos) * twoᵏ)
v = similar(SVector{n,T})
for (i,c) in enumerate(combos)
j = (i-1)*twoᵏ + 1
v .= zero(T)
v[c] .= λ
p[j] = v
# use a gray code to flip one sign at a time
graycode = 0
for s = 1:twoᵏ-1
graycode′ = s ⊻ (s >> 1)
graycomp = c[trailing_zeros(graycode ⊻ graycode′) + 1]
graycode = graycode′
v[graycomp] = -v[graycomp]
p[j+s] = v
end
end
return p
end
"""
`GenzMalik{n,T}` holds the points and weights corresponding
to an `n`-dimensional Genz-Malik cubature rule over coordinates
of type `T`.
"""
struct GenzMalik{N, T <: Real, XVT} <: AbstractBoxedIntegral{N}
p::NTuple{4, XVT} # points for the last 4 G-M weights
w::NTuple{5, T} # weights for the 5 terms in the G-M rule
w′::NTuple{4, T} # weights for the embedded lower-degree rule
GenzMalik(::Val{N}, p::NTuple{4, XVT}, w::NTuple{5, T}, w′::NTuple{4, T}) where {N, T<:Real, XVT} = new{N, T, XVT}(p, w, w′)
end
Adapt.adapt(to, gm::GenzMalik{N}) where N = GenzMalik(Val(N), adapt(to, gm.p), adapt(to, gm.w), adapt(to, gm.w′))
# internal code to construct n-dimensional Genz-Malik rule for coordinates of type `T`.
function _GenzMalik(v::Val{n}, ::Type{T}) where {n, T}
n < 2 && throw(ArgumentError("invalid dimension $n: GenzMalik rule requires dimension > 2"))
λ₄ = T(sqrt(9/10))
λ₂ = T(sqrt(9/70))
λ₃ = λ₄
λ₅ = T(sqrt(9/19))
twoⁿ = 1 << n
w₁ = twoⁿ * T((12824 - 9120n + 400n^2) / 19683)
w₂ = twoⁿ * T(980 / 6561)
w₃ = twoⁿ * T((1820 - 400n) / 19683)
w₄ = twoⁿ * T(200 / 19683)
w₅ = T(6859/19683)
w₄′ = twoⁿ * T(25/729)
w₃′ = twoⁿ * T((265 - 100n)/1458)
w₂′ = twoⁿ * T(245/486)
w₁′ = twoⁿ * T((729 - 950n + 50n^2)/729)
p₂ = combos(1, λ₂, v)
p₃ = combos(1, λ₃, v)
p₄ = signcombos(2, λ₄, v)
p₅ = signcombos(n, λ₅, v)
return GenzMalik(v, (p₂,p₃,p₄,p₅), (w₁,w₂,w₃,w₄,w₅), (w₁′,w₂′,w₃′,w₄′))
end
"""
GenzMalik(Val{n}(), T=Float64)
Construct an n-dimensional Genz-Malik rule for coordinates of type `T`.
"""
GenzMalik(v::Val{n}, ::Type{T}) where {n, T <: Real} = _GenzMalik(v, T)
countevals(::Type{<:GenzMalik{n}}) where {n} = 1 + 4n + 2*n*(n-1) + (1<<n)
"""
genzmalik(f, a, b, norm=norm)
Evaluate `genzmalik::GenzMalik` for the box with min/max corners `a` and `b`
for an integrand `f`. Returns the estimated integral `I`, the estimated
error `E` (via the given `norm`), and the suggested coordinate `k` ∈ `1:n`
to subdivide next.
"""
function (g::GenzMalik{n, T})(f::F, a, b, norm=norm) where {F, n, T}
c = T(.5).*(a.+b)
Δ = T(.5).*(b.-a) #Multiplied by dimensionless to get x -> units of x
#Hardcoded for ease of type stability
V = Δ[Int32(1)] #Multiplied by the weighting -> x (units of this is x)
for i in Int32(2):Int32(n) V *= ustrip(Δ[i]) end
f₁ = f(c)
f₂ = zero(f₁)
f₃ = zero(f₁)
maxdivdiff = zero(norm(f₁))
divdiff = similar(SVector{n, typeof(maxdivdiff)})
for i = Int32(1):Int32(n)
p₂ = Δ .* g.p[Int32(1)][i]
f₂ᵢ = f(c .+ p₂) + f(c .- p₂)
p₃ = Δ .* g.p[Int32(2)][i]
f₃ᵢ = f(c .+ p₃) + f(c .- p₃)
f₂ += f₂ᵢ
f₃ += f₃ᵢ
# fourth divided difference: f₃ᵢ-2f₁ - 7*(f₂ᵢ-2f₁),
# where 7 = (λ₃/λ₂)^2 [see van Dooren and de Ridder]
divdiff[i] = norm(f₃ᵢ + 12f₁ - 7*f₂ᵢ)
end
f₄ = zero(f₁)
for p in g.p[Int32(3)]
f₄ += f(c .+ Δ .* p)
end
f₅ = zero(f₁)
for p in g.p[Int32(4)]
f₅ += f(c .+ Δ .* p)
end
I = V * (g.w[Int32(1)]*f₁ + g.w[Int32(2)]*f₂ + g.w[Int32(3)]*f₃ + g.w[Int32(4)]*f₄ + g.w[Int32(5)]*f₅)
I′ = V * (g.w′[Int32(1)]*f₁ + g.w′[Int32(2)]*f₂ + g.w′[Int32(3)]*f₃ + g.w′[Int32(4)]*f₄)
E = norm(I - I′)
# choose axis
kdivide = Int32(1)
δf = E / (Int32(10)^n * V)
for i = Int32(1):Int32(n)
if (δ = divdiff[i] - maxdivdiff) > δf
kdivide = i
maxdivdiff = divdiff[i]
elseif abs(δ) <= δf && abs(Δ[i]) > abs(Δ[kdivide])
kdivide = i
end
end
return I, E, kdivide
end
end
On the CPUST = Float32
intgr_cpu = cubrule(Val(3), ST)
cpu = map(i -> intgr_cpu(x -> i*Vec3(x[1]^2, x[2]^2, x[3]^2), (0,0,0).*u"V", (1,1,1).*u"V"), 1:5) Which outputs on the cpu
Some Benchmarking julia> @benchmark map(i -> intg(x -> i*Vec3(x[1]^2, x[2]^2, x[3]^2), (0,0,0).*u"V", (1,1,1).*u"V"), vec) setup=(vec=1:10000; intg=cubrule(Val(3), Float32))
BenchmarkTools.Trial: 4761 samples with 1 evaluation.
Range (min … max): 740.500 μs … 6.673 ms ┊ GC (min … max): 0.00% … 86.45%
Time (median): 892.800 μs ┊ GC (median): 0.00%
Time (mean ± σ): 1.042 ms ± 495.179 μs ┊ GC (mean ± σ): 1.91% ± 5.40%
▃▁▄█▅▄▂▂▁ ▁
▅████████████▇▇█▇▇▇▇▇▇▇▆▆▅▆▆▅▅▆▅▄▅▄▅▅▅▄▅▄▅▄▅▄▄▆▄▄▄▅▅▄▄▅▇█▆▄▅▅ █
740 μs Histogram: log(frequency) by time 2.85 ms <
Memory estimate: 195.36 KiB, allocs estimate: 2. On the GPUST = Float32
intgr_cpu = cubrule(Val(3), ST)
intgr_gpu = adapt(CuArray, intgr_cpu)
gpu = map(i -> intgr_gpu(x -> i*Vec3(x[1]^2, x[2]^2, x[3]^2), (0,0,0).*u"V", (1,1,1).*u"V"), CuArray(1:5))
Some Benchmarking julia> @benchmark map(i -> int(x -> i*Vec3(x[1]^2, x[2]^2, x[3]^2), (0,0,0).*u"V", (1,1,1).*u"V"), vec) setup=(vec=CuArray(1:10000); int=adapt(CuArray, cubrule(Val(3), Float32)))
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 11.100 μs … 431.700 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 14.100 μs ┊ GC (median): 0.00%
Time (mean ± σ): 19.996 μs ± 18.620 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▅█▇▆▄▃▂▂▁ ▁▁ ▁▂▃▃▃▁▁▁ ▂
████████████████████████████▇▇▇▆▆▆▆▆▅▅▅▆▃▅▂▃▃▄▅▅▄▃▄▂▅▄▄▄▂▄▄▅ █
11.1 μs Histogram: log(frequency) by time 88.1 μs <
Memory estimate: 2.47 KiB, allocs estimate: 105. GPU shows decent speedup! Feel free to use as needed! |
What would it take to make
hcubature
GPU-friendly? In the sense that there is no scalar indexing. Is that even possible?MWE:
The text was updated successfully, but these errors were encountered: