diff --git a/Project.toml b/Project.toml index da1c32b..c88ea0e 100644 --- a/Project.toml +++ b/Project.toml @@ -3,6 +3,7 @@ uuid = "8646bddf-ab1c-4fa7-9c51-ba187d647618" version = "0.2.8" [deps] +DataFlowTasks = "d1549cb6-e9f4-42f8-98cc-ffc8d067ff5b" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" diff --git a/src/hmatrix.jl b/src/hmatrix.jl index a47cfd3..52fd7af 100644 --- a/src/hmatrix.jl +++ b/src/hmatrix.jl @@ -1,3 +1,5 @@ +using DataFlowTasks + """ mutable struct HMatrix{R,T} <: AbstractMatrix{T} @@ -156,6 +158,7 @@ function _show(io, hmat, allow_empty = false) @printf(io, "\n\t min number of elements per leaf: %i", minimum(points_per_leaf)) @printf(io, "\n\t max number of elements per leaf: %i", maximum(points_per_leaf)) depth_per_leaf = map(depth, leaves_) + # TODO: the depth of tree is always zero because all leafs are roots @printf(io, "\n\t depth of tree: %i", maximum(depth_per_leaf)) @printf(io, "\n\t compression ratio: %f\n", compression_ratio(hmat)) return io @@ -467,6 +470,54 @@ function compress!(H::HMatrix, comp) return H end +""" + ancestors(H::HMatrix) + +Return all ancestors of `H`. +""" +function ancestors(H::HMatrix) + ancestors = [] + W = H + while W !== parentnode(W) + W = parentnode(W) + push!(ancestors, W) + end + return ancestors +end + +""" + issubmatrix(A::HMatrix, B::HMatrix) + +Return `true` if B is a submatrix of A. +""" +function issubmatrix(A::HMatrix, B::HMatrix) + for ancB in ancestors(B) + if A === ancB + return true + end + end + return false +end + +function DataFlowTasks.memory_overlap(A::HMatrix, B::HMatrix) + # TODO: compare leaves in more efficient way. + if A === B + return true + elseif issubmatrix(A, B) || issubmatrix(B, A) + return true + end + chdA = leaves(A) + chdB = leaves(B) + for i in eachindex(chdA) + for j in eachindex(chdB) + if data(chdA[i]) === data(chdB[j]) + return true + end + end + end + return false +end + ############################################################################################ # Recipes ############################################################################################ diff --git a/src/lu.jl b/src/lu.jl index 80802d0..57131ba 100644 --- a/src/lu.jl +++ b/src/lu.jl @@ -1,3 +1,5 @@ +using DataFlowTasks + const NOPIVOT = VERSION >= v"1.7" ? NoPivot : Val{false} const HLU = LU{<:Any,<:HMatrix} @@ -30,9 +32,16 @@ function LinearAlgebra.lu!(M::HMatrix, compressor; threads = use_threads()) nt = Threads.nthreads() chn = Channel{ACABuffer{T}}(nt) foreach(i -> put!(chn, ACABuffer(T)), 1:nt) - _lu!(M, compressor, threads, chn) - # wrap the result in the LU structure - return LU(M, Int[], 0) + if (threads) + _lu_threads!(M, compressor, chn) + # wrap the result in the LU structure + d = @dspawn LU(@R(M), Int[], 0) label = "result" + return fetch(d) + else + _lu!(M, compressor, chn) + # wrap the result in the LU structure + return LU(M, Int[], 0) + end end """ @@ -59,7 +68,7 @@ Hierarchical LU factorization. See [`lu!`](@ref) for the available options. """ LinearAlgebra.lu(M::HMatrix, args...; kwargs...) = lu!(deepcopy(M), args...; kwargs...) -function _lu!(M::HMatrix, compressor, threads, bufs = nothing) +function _lu!(M::HMatrix, compressor, bufs = nothing) if isleaf(M) d = data(M) @assert d isa Matrix @@ -69,7 +78,7 @@ function _lu!(M::HMatrix, compressor, threads, bufs = nothing) chdM = children(M) m, n = size(chdM) for i in 1:m - _lu!(chdM[i, i], compressor, threads, bufs) + _lu!(chdM[i, i], compressor, bufs) for j in (i+1):n ldiv!(UnitLowerTriangular(chdM[i, i]), chdM[i, j], compressor, bufs) rdiv!(chdM[j, i], UpperTriangular(chdM[i, i]), compressor, bufs) @@ -84,6 +93,52 @@ function _lu!(M::HMatrix, compressor, threads, bufs = nothing) return M end +function _lu_threads!(M::HMatrix, compressor, bufs = nothing, level = 0, parent = (0, 0)) + if isleaf(M) + @dspawn begin + @RW(M) + d = data(M) + @assert d isa Matrix + lu!(d, NOPIVOT()) + end label = "lu($(parent[1]),$(parent[2]))\nlevel=$(level)" + else + @assert !hasdata(M) + chdM = children(M) + m, n = size(chdM) + for i in 1:m + _lu_threads!(chdM[i, i], compressor, bufs, level + 1, (i, i)) + for j in (i+1):n + @dspawn ldiv!( + UnitLowerTriangular(@R(chdM[i, i])), + @RW(chdM[i, j]), + compressor, + bufs, + ) label = "ldiv($i,$j)\nlevel=$(level+1)" + @dspawn rdiv!( + @RW(chdM[j, i]), + UpperTriangular(@R(chdM[i, i])), + compressor, + bufs, + ) label = "rdiv($j,$i)\nlevel=$(level+1)" + end + for j in (i+1):m + for k in (i+1):n + @dspawn hmul!( + @RW(chdM[j, k]), + @R(chdM[j, i]), + @R(chdM[i, k]), + -1, + 1, + compressor, + bufs, + ) label = "hmul($j,$k)\nlevel=$(level+1)" + end + end + end + end + return M +end + function LinearAlgebra.ldiv!(A::HLU, y::AbstractVector; global_index = true) p = A.factors # underlying data ctree = coltree(p) diff --git a/src/utils.jl b/src/utils.jl index 1173693..04e8d0e 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -64,7 +64,7 @@ function leaves(tree) end """ - leaves(tree) + nodes(tree) Return a vector containing all the nodes of `tree`. """ diff --git a/test/Project.toml b/test/Project.toml index a2203d0..fa37e66 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,6 +1,9 @@ [deps] +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3" +DataFlowTasks = "d1549cb6-e9f4-42f8-98cc-ffc8d067ff5b" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/test/hmatrix_test.jl b/test/hmatrix_test.jl index a8e0cf3..fa3ba0b 100644 --- a/test/hmatrix_test.jl +++ b/test/hmatrix_test.jl @@ -3,6 +3,7 @@ using StaticArrays using HMatrices using LinearAlgebra using SparseArrays +using DataFlowTasks include(joinpath(HMatrices.PROJECT_ROOT, "test", "testutils.jl")) @@ -58,3 +59,32 @@ end Hnew = axpy!(true, S, deepcopy(H)) @test Matrix(Hnew) == (H_full + Matrix(S)) end + +@testset "Memory overlap" begin + m = 1000 + T = Float64 + + X = points_on_sphere(m) + Y = X + K = laplace_matrix(X, X) + + X1 = points_on_sphere(m) + Y1 = X1 + K1 = laplace_matrix(X1, X1) + + splitter = CardinalitySplitter(; nmax = 50) + Xclt = ClusterTree(X, splitter) + Yclt = ClusterTree(Y, splitter) + X1clt = ClusterTree(X1, splitter) + Y1clt = ClusterTree(Y1, splitter) + adm = StrongAdmissibilityStd(3) + comp = PartialACA(; atol = 1e-10) + + H = assemble_hmatrix(K, Xclt, Yclt; adm, comp, threads = false, distributed = false) + H1 = assemble_hmatrix(K1, X1clt, Y1clt; adm, comp, threads = false, distributed = false) + + # Test memory overlap function + @test DataFlowTasks.memory_overlap(H, H) == true + @test DataFlowTasks.memory_overlap(H, H1) == false + @test DataFlowTasks.memory_overlap(H, H.children[1]) == true +end diff --git a/test/lu_plots/create_lu_plots.bat b/test/lu_plots/create_lu_plots.bat new file mode 100644 index 0000000..6dff56a --- /dev/null +++ b/test/lu_plots/create_lu_plots.bat @@ -0,0 +1,37 @@ +@echo off + +:: First argument: path to HMatrices project +:: Second argument: path where to store plots + +if "%1"=="" ( + set JULIA_PROJECT_PATH=%~dp0\..\.. +) else ( + set JULIA_PROJECT_PATH=%1 +) + +if "%2"=="" ( + set PLOTS_PATH=%~dp0 +) else ( + set PLOTS_PATH=%2 +) + +:: Path to the Julia project environment +set JULIA_ENVIROMENT_PATH=%JULIA_PROJECT_PATH%\test + +:: Paths to the scripts +set RUN_LU_PATH=%JULIA_PROJECT_PATH%\test\lu_plots\run_lu.jl +set CREATE_PLOTS_PATH=%JULIA_PROJECT_PATH%\test\lu_plots\create_plots.jl + +:: Space-separated list of thread counts to use +set THREAD_COUNTS=1 2 4 6 8 + +:: Loop over each thread count +for %%t in (%THREAD_COUNTS%) do ( + echo Running LU with %%t threads... + set JULIA_NUM_THREADS=%%t + julia --project=%JULIA_ENVIROMENT_PATH% %RUN_LU_PATH% %PLOTS_PATH% +) + +:: Create the plots +echo Creating plots... +julia --project=%JULIA_ENVIROMENT_PATH% %CREATE_PLOTS_PATH% %PLOTS_PATH% diff --git a/test/lu_plots/create_lu_plots.sh b/test/lu_plots/create_lu_plots.sh new file mode 100644 index 0000000..eeed2ec --- /dev/null +++ b/test/lu_plots/create_lu_plots.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# First argument: path to HMatrices project +# Second argument: path where to store plots + +if [ -z "$1" ]; then + JULIA_PROJECT_PATH="$(dirname "$(realpath "$0")")/../.." +else + JULIA_PROJECT_PATH="$1" +fi + +if [ -z "$2" ]; then + PLOTS_PATH="$(dirname "$(realpath "$0")")" +else + PLOTS_PATH="$2" +fi + +# Path to the Julia project environment +JULIA_ENVIRONMENT_PATH="$JULIA_PROJECT_PATH/test" + +# Paths to the scripts +RUN_LU_PATH="$JULIA_PROJECT_PATH/test/lu_plots/run_lu.jl" +CREATE_PLOTS_PATH="$JULIA_PROJECT_PATH/test/lu_plots/create_plots.jl" + +# Space-separated list of thread counts to use +THREAD_COUNTS="1 2 4 6 8" + +# Loop over each thread count +for t in $THREAD_COUNTS; do + echo "Running LU with $t threads..." + export JULIA_NUM_THREADS=$t + julia --project="$JULIA_ENVIRONMENT_PATH" "$RUN_LU_PATH" "$PLOTS_PATH" +done + +# Create the plots +echo "Creating plots..." +julia --project="$JULIA_ENVIRONMENT_PATH" "$CREATE_PLOTS_PATH" "$PLOTS_PATH" diff --git a/test/lu_plots/create_plots.jl b/test/lu_plots/create_plots.jl new file mode 100644 index 0000000..0527dd0 --- /dev/null +++ b/test/lu_plots/create_plots.jl @@ -0,0 +1,58 @@ +using HMatrices +using LinearAlgebra +using StaticArrays +using JLD2 + +using CairoMakie + +const FILE_PATH = joinpath(ARGS[1], "lu_results.jld2") +const PLOT_PATH = joinpath(ARGS[1], "nthreads_speedup_plot.svg") + +results = load(FILE_PATH)["results"] +signequal_result = results["1"] +info = Dict{Int,Dict{Int,Float64}}() + +for (nthreads, result) in results + for (size, time) in result + if !haskey(info, size) + info[size] = Dict{Int,Float64}() + end + speedup = signequal_result[size] / time + info[size][parse(Int64, nthreads)] = speedup + end +end + +f = Figure() +ax = Axis( + f[1, 1]; + title = "Speedup of HLU depending on number of threads", + xlabel = "Number of threads", + ylabel = "Speedup", +) + +for (size, speedup_info) in info + sorted_info = sort(collect(pairs(speedup_info))) + nthreads = [x[1] for x in sorted_info] + speedups = [x[2] for x in sorted_info] + lines!(ax, nthreads, speedups; label = "$size") +end + +perfect_speedup = collect(keys(info[minimum(keys(info))])) # Assuming the smallest size has all thread counts +lines!( + ax, + perfect_speedup, + perfect_speedup; + linestyle = :dash, + linewidth = 2, + color = :red, + label = "Perfect speedup", +) + +ax.yticks = (minimum(perfect_speedup):1:maximum(perfect_speedup)) +ax.xticks = (minimum(perfect_speedup):1:maximum(perfect_speedup)) + +f[1, 2] = Legend(f, ax, "Size"; framevisible = false) + +save(PLOT_PATH, f) + +f diff --git a/test/lu_plots/run_lu.jl b/test/lu_plots/run_lu.jl new file mode 100644 index 0000000..5c0f230 --- /dev/null +++ b/test/lu_plots/run_lu.jl @@ -0,0 +1,52 @@ +using Test +using HMatrices +using LinearAlgebra +using Random +using StaticArrays +using BenchmarkTools +using JLD2 + +using HMatrices: RkMatrix + +include(joinpath(HMatrices.PROJECT_ROOT, "test", "testutils.jl")) + +const FILE_PATH = joinpath(ARGS[1], "lu_results.jld2") + +Random.seed!(1) + +# 3D Laplace on sphere +problem_size = (100, 500, 1000) +threads = Threads.nthreads() > 1 + +results = Dict{String,Dict{Int,Float64}}() +if isfile(FILE_PATH) + results = load(FILE_PATH)["results"] +end + +thread_key = string(Threads.nthreads()) +if !haskey(results, thread_key) + results[thread_key] = Dict{Int,Float64}() +end + +for size in problem_size + println("Problem size=$size") + m = size + T = Float64 + X = points_on_sphere(m) + Y = X + + K = laplace_matrix(X, X) + + splitter = CardinalitySplitter(; nmax = 50) + Xclt = ClusterTree(X, splitter) + Yclt = ClusterTree(Y, splitter) + adm = StrongAdmissibilityStd(3) + comp = PartialACA(; atol = 1e-10) + + H = assemble_hmatrix(K, Xclt, Yclt; adm, comp, threads, distributed = false) + time = @belapsed lu($H; atol = 1e-10, threads = $threads) + results[thread_key][size] = time + println("Elapsed time=$time\n") +end + +JLD2.@save FILE_PATH results diff --git a/test/lu_test.jl b/test/lu_test.jl index 9ee8b2b..58a798f 100644 --- a/test/lu_test.jl +++ b/test/lu_test.jl @@ -24,7 +24,7 @@ adm = StrongAdmissibilityStd(3) comp = PartialACA(; atol = 1e-10) for threads in (false, true) H = assemble_hmatrix(K, Xclt, Yclt; adm, comp, threads, distributed = false) - hlu = lu(H; atol = 1e-10) + hlu = lu(H; atol = 1e-10, threads = threads) y = rand(m) M = Matrix(K) exact = M \ y