AMD: Clarify diagnostic messages; free up disk space for CI build
* Add build job for rocm

* Add rocm build script

* Copy shared obj file into output_dir

* upload build artifacts and enable wheels build

* Remove cuda build temporarily

* Add ROCm version to .so filename

* Add rocm_version to wheels build

* Revert "Remove cuda build temporarily"

This reverts commit 1413c5f.

* Add rocm_version env var

* Remove thrust header files

* Print node info

* print cuda node info

* Revert "print cuda node info"

This reverts commit cdb209a.

* Revert "Print node info"

This reverts commit 7e9a65c.

* Add rocm arch to compile command

* Rename .so files to rocm

* Update default gpu arch

* Skip CPU-based igemmlt int tests on ROCm

* Update Documentation

* Update upstream repo name

* Update docs

* Update string format

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Remove pre-release option for torch install

* Update pytorch install path

Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>

* Add messages for Heuristics error

* Remove toolcache for disk space

* print disk usage

* Clean disk space for linux

* Fix for ubuntu

* Add sudo for apt clean

* Update clean up disk list

* remove disk usage print

* Add BNB_BACKEND variable

* Update diagnostic functions for ROCm

* Fix tuple error

* Fix library detection bug for recursive and symlink cases

* fix pre-commit errors

* Remove recursive path lib search

* Create function for runtime lib patterns

* Update logger format

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Remove commented code

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

* Create hip diagnostics functions

* Fix Typo

* Fix pre-commit checks

---------

Co-authored-by: Aarni Koskela <akx@iki.fi>
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
3 people authored Sep 16, 2024
1 parent 2784653 commit 45b7d14
Showing 5 changed files with 137 additions and 41 deletions.
21 changes: 17 additions & 4 deletions .github/workflows/python-package.yml
@@ -116,10 +116,23 @@ jobs:
         uses: docker/setup-qemu-action@v2
       - name: Clean up disk space
         run: |
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf "/usr/local/share/boost"
-          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+          sudo rm -rf \
+            /usr/share/dotnet \
+            /opt/ghc \
+            "/usr/local/share/boost" \
+            "$AGENT_TOOLSDIRECTORY" \
+            /opt/hostedtoolcache \
+            /opt/google/chrome \
+            /opt/microsoft/msedge \
+            /opt/microsoft/powershell \
+            /opt/pipx \
+            /usr/lib/mono \
+            /usr/local/julia* \
+            /usr/local/lib/android \
+            /usr/local/lib/node_modules \
+            /usr/local/share/chromium \
+            /usr/local/share/powershell \
+            /usr/share/swift
       - name: Build C++
         run: bash .github/scripts/build-rocm.sh
         env:
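Note: the expanded removal list above frees tens of gigabytes on a hosted Ubuntu runner before the ROCm compile starts. As a rough way to confirm the effect (not part of this commit; the root path is an assumption about the runner layout), a minimal Python check comparable to `df -h /`:

```python
import shutil

def report_free_space(path: str = "/") -> float:
    """Print and return free space, in GiB, for the filesystem holding `path`."""
    usage = shutil.disk_usage(path)
    free_gib = usage.free / 2**30
    print(f"{path}: {free_gib:.1f} GiB free of {usage.total / 2**30:.1f} GiB")
    return free_gib

if __name__ == "__main__":
    report_free_space()
```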
11 changes: 7 additions & 4 deletions bitsandbytes/cextension.py
@@ -99,7 +99,7 @@ def get_native_library() -> BNBNativeLibrary:
     if cuda_binary_path.exists():
         binary_path = cuda_binary_path
     else:
-        logger.warning("Could not find the bitsandbytes CUDA binary at %r", cuda_binary_path)
+        logger.warning("Could not find the bitsandbytes %s binary at %r", BNB_BACKEND, cuda_binary_path)
     logger.debug(f"Loading bitsandbytes native library from: {binary_path}")
     dll = ct.cdll.LoadLibrary(str(binary_path))
 
@@ -116,21 +116,24 @@ def get_native_library() -> BNBNativeLibrary:
         hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
         HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
         BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"
+        BNB_BACKEND = "ROCm"
     else:
         HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
         BNB_HIP_VERSION_SHORT = ""
+        BNB_BACKEND = "CUDA"
 
     lib = get_native_library()
 except Exception as e:
     lib = None
     logger.error(f"Could not load bitsandbytes native library: {e}", exc_info=True)
     if torch.cuda.is_available():
         logger.warning(
-            """
-CUDA Setup failed despite CUDA being available. Please run the following command to get more information:
+            f"""
+{BNB_BACKEND} Setup failed despite {BNB_BACKEND} being available. Please run the following command to get more information:
 python -m bitsandbytes
-Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
+Inspect the output of the command and see if you can locate {BNB_BACKEND} libraries. You might need to add them
 to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
 and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues
 """,
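Note: the hunk above encodes the HIP version as `hip_major * 100 + hip_minor` and picks the `BNB_BACKEND` label once at import time. A standalone sketch of the same scheme, assuming only that `torch.version.hip` is a `"major.minor.patch"` string on ROCm builds and `None` otherwise (the function name here is ours, not the module's):

```python
# Illustrative re-creation of the backend selection in cextension.py.
# `hip_version_string` stands in for torch.version.hip.
def detect_backend(hip_version_string):
    if hip_version_string:
        hip_major, hip_minor = map(int, hip_version_string.split(".")[0:2])
        # "6.1.2" -> 601, so version gates become simple integer comparisons.
        return "ROCm", hip_major * 100 + hip_minor
    return "CUDA", 0

assert detect_backend("6.1.2") == ("ROCm", 601)
assert detect_backend(None) == ("CUDA", 0)
```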
89 changes: 77 additions & 12 deletions bitsandbytes/diagnostics/cuda.py
@@ -5,7 +5,7 @@

 import torch
 
-from bitsandbytes.cextension import get_cuda_bnb_library_path
+from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
 from bitsandbytes.consts import NONPYTORCH_DOC_URL
 from bitsandbytes.cuda_specs import CUDASpecs
 from bitsandbytes.diagnostics.utils import print_dedented
@@ -32,15 +32,20 @@
"_", # current Python interpreter
}

CUDA_RUNTIME_LIB_PATTERNS = (
"cudart64*.dll", # Windows
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
"nvcuda*.dll", # Windows
)

logger = logging.getLogger(__name__)


def get_runtime_lib_patterns() -> tuple:
if HIP_ENVIRONMENT:
return ("libamdhip64.so*",)
else:
return (
"cudart64*.dll", # Windows
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
"nvcuda*.dll", # Windows
)


def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path]:
for dir_string in paths_list_candidate.split(os.pathsep):
if not dir_string:
@@ -55,9 +60,9 @@ def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path]:
                     continue
             except OSError:  # Assume an esoteric error trying to poke at the directory
                 pass
-            for lib_pattern in CUDA_RUNTIME_LIB_PATTERNS:
+            for lib_pattern in get_runtime_lib_patterns():
                 for pth in dir.glob(lib_pattern):
-                    if pth.is_file():
+                    if pth.is_file() and not pth.is_symlink():
                         yield pth
         except (OSError, PermissionError):
             pass
@@ -104,7 +109,7 @@ def find_cudart_libraries() -> Iterator[Path]:
         yield from find_cuda_libraries_in_path_list(value)
 
 
-def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
+def _print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
     print(
         f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, "
         f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.",
@@ -149,10 +154,40 @@ def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
     # (2) Multiple CUDA versions installed
 
 
-def print_cuda_runtime_diagnostics() -> None:
+def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None:
+    print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")
+
+    binary_path = get_cuda_bnb_library_path(cuda_specs)
+    if not binary_path.exists():
+        print_dedented(
+            f"""
+            Library not found: {binary_path}.
+            Maybe you need to compile it from source? If you compiled from source, check that ROCM_VERSION
+            in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version
+            and rebuild bitsandbytes.
+            """,
+        )
+
+    hip_major, hip_minor = cuda_specs.cuda_version_tuple
+    if (hip_major, hip_minor) < (6, 1):
+        print_dedented(
+            """
+            WARNING: bitsandbytes is fully supported only from ROCm 6.1.
+            """,
+        )
+
+
+def print_diagnostics(cuda_specs: CUDASpecs) -> None:
+    if HIP_ENVIRONMENT:
+        _print_hip_diagnostics(cuda_specs)
+    else:
+        _print_cuda_diagnostics(cuda_specs)
+
+
+def _print_cuda_runtime_diagnostics() -> None:
     cudart_paths = list(find_cudart_libraries())
     if not cudart_paths:
-        print("CUDA SETUP: WARNING! CUDA runtime files not found in any environmental path.")
+        print("WARNING! CUDA runtime files not found in any environmental path.")
     elif len(cudart_paths) > 1:
         print_dedented(
             f"""
@@ -174,3 +209,33 @@ def print_cuda_runtime_diagnostics() -> None:
         )
     for pth in cudart_paths:
         print(f"* Found CUDA runtime at: {pth}")
+
+
+def _print_hip_runtime_diagnostics() -> None:
+    cudart_paths = list(find_cudart_libraries())
+    if not cudart_paths:
+        print("WARNING! ROCm runtime files not found in any environmental path.")
+    elif len(cudart_paths) > 1:
+        print_dedented(
+            f"""
+            Found duplicate ROCm runtime files (see below).
+            We select the PyTorch default ROCm runtime, which is {torch.version.hip},
+            but this might mismatch with the ROCm version that is needed for bitsandbytes.
+            To resolve it, install PyTorch built for the ROCm version you want to use
+            and set LD_LIBRARY_PATH to your ROCm install path, e.g.
+            export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm-6.1.2/lib,
+            """,
+        )
+
+    for pth in cudart_paths:
+        print(f"* Found ROCm runtime at: {pth}")
+
+
+def print_runtime_diagnostics() -> None:
+    if HIP_ENVIRONMENT:
+        _print_hip_runtime_diagnostics()
+    else:
+        _print_cuda_runtime_diagnostics()
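Note: `get_runtime_lib_patterns()` swaps the glob patterns per backend, and the new `is_symlink()` filter keeps chains like `libcudart.so -> libcudart.so.12` from being reported twice. A self-contained sketch of that search loop under the same assumptions (function and parameter names here are illustrative):

```python
from pathlib import Path
from typing import Iterator

# Sketch of the search loop from diagnostics/cuda.py: glob one directory with
# backend-specific patterns and skip symlinks, so each runtime library is
# reported once, as the real file.
def find_runtime_libs(directory: str, hip_environment: bool) -> Iterator[Path]:
    patterns = (
        ("libamdhip64.so*",)
        if hip_environment
        else ("cudart64*.dll", "libcudart*.so*", "nvcuda*.dll")
    )
    root = Path(directory)
    if not root.is_dir():
        return
    for pattern in patterns:
        for pth in root.glob(pattern):
            if pth.is_file() and not pth.is_symlink():
                yield pth

# Example: list real (non-symlink) ROCm runtime libraries, if any.
for lib in find_runtime_libs("/opt/rocm/lib", hip_environment=True):
    print(lib)
```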
31 changes: 19 additions & 12 deletions bitsandbytes/diagnostics/main.py
@@ -3,11 +3,12 @@

 import torch
 
+from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT
 from bitsandbytes.consts import PACKAGE_GITHUB_URL
 from bitsandbytes.cuda_specs import get_cuda_specs
 from bitsandbytes.diagnostics.cuda import (
-    print_cuda_diagnostics,
-    print_cuda_runtime_diagnostics,
+    print_diagnostics,
+    print_runtime_diagnostics,
 )
 from bitsandbytes.diagnostics.utils import print_dedented, print_header
 
@@ -16,12 +17,13 @@ def sanity_check():
     from bitsandbytes.cextension import lib
 
     if lib is None:
+        compute_backend = "cuda" if not HIP_ENVIRONMENT else "hip"
         print_dedented(
-            """
+            f"""
             Couldn't load the bitsandbytes library, likely due to missing binaries.
             Please ensure bitsandbytes is properly installed.
-            For source installations, compile the binaries with `cmake -DCOMPUTE_BACKEND=cuda -S .`.
+            For source installations, compile the binaries with `cmake -DCOMPUTE_BACKEND={compute_backend} -S .`.
             See the documentation for more details if needed.
             Trying a simple check anyway, but this will likely fail...
@@ -49,19 +51,24 @@ def main():

print_header("OTHER")
cuda_specs = get_cuda_specs()
print("CUDA specs:", cuda_specs)
if HIP_ENVIRONMENT:
rocm_specs = f" rocm_version_string='{cuda_specs.cuda_version_string}',"
rocm_specs += f" rocm_version_tuple={cuda_specs.cuda_version_tuple}"
print(f"{BNB_BACKEND} specs:{rocm_specs}")
else:
print(f"{BNB_BACKEND} specs:{cuda_specs}")
if not torch.cuda.is_available():
print("Torch says CUDA is not available. Possible reasons:")
print("1. CUDA driver not installed")
print("2. CUDA not installed")
print("3. You have multiple conflicting CUDA libraries")
print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
print(f"1. {BNB_BACKEND} driver not installed")
print(f"2. {BNB_BACKEND} not installed")
print(f"3. You have multiple conflicting {BNB_BACKEND} libraries")
if cuda_specs:
print_cuda_diagnostics(cuda_specs)
print_cuda_runtime_diagnostics()
print_diagnostics(cuda_specs)
print_runtime_diagnostics()
print_header("")
print_header("DEBUG INFO END")
print_header("")
print("Checking that the library is importable and CUDA is callable...")
print(f"Checking that the library is importable and {BNB_BACKEND} is callable...")
try:
sanity_check()
print("SUCCESS!")
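Note: on ROCm builds the `cuda_*` fields of `CUDASpecs` actually carry HIP values, which is why `main()` relabels them before printing. A minimal sketch of that relabeling, assuming the HIP version string is the short `"{major}{minor}"` form (the `Specs` stand-in and function name are ours):

```python
from typing import NamedTuple

# Minimal stand-in for bitsandbytes.cuda_specs.CUDASpecs.
class Specs(NamedTuple):
    cuda_version_string: str
    cuda_version_tuple: tuple

def format_specs(specs: Specs, hip_environment: bool) -> str:
    if hip_environment:
        # Re-present the reused cuda_* fields under ROCm names, as main() does.
        rocm_specs = f" rocm_version_string='{specs.cuda_version_string}',"
        rocm_specs += f" rocm_version_tuple={specs.cuda_version_tuple}"
        return f"ROCm specs:{rocm_specs}"
    return f"CUDA specs: {specs}"

print(format_specs(Specs("61", (6, 1)), hip_environment=True))
# -> ROCm specs: rocm_version_string='61', rocm_version_tuple=(6, 1)
```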
26 changes: 17 additions & 9 deletions csrc/ops.hip
@@ -576,6 +576,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
         if (returnedAlgoCount == 0)
         {
             has_error = 1;
+            fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
         }
         else
         {
@@ -614,18 +615,25 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
                                                        heuristicResult,
                                                        &returnedAlgoCount));
 
-        if(!SCALE_ROWS)
+        if (returnedAlgoCount == 0)
         {
-            float alpha = 1.0f, beta = 0.0f;
-
-            has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
+            has_error = 1;
+            fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
         }
         else
         {
-            //has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, hipblasLt_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
-            float beta = 0.0f;
-
-            has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
+            if(!SCALE_ROWS)
+            {
+                float alpha = 1.0f, beta = 0.0f;
+
+                has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
+            }
+            else
+            {
+                float beta = 0.0f;
+
+                has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
+            }
         }
     }
 
@@ -635,7 +643,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
     if (Adesc) has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(Adesc));
     if (matmulDesc) has_error |= checkHipblasStatus(hipblasLtMatmulDescDestroy(matmulDesc));
     if(has_error == 1)
-        printf("error detected");
+        fprintf(stderr, "error detected\n");
 
     return has_error;
 #endif // NO_HIPBLASLT
Expand Down
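Note: both hipBLASLt call sites now check `returnedAlgoCount` before reading `heuristicResult[0].algo`, and the wrapping in the second site moves the `SCALE_ROWS` branch inside the guard. The guard pattern itself, sketched in Python for consistency with the examples above (all names hypothetical, not the HIP API):

```python
import sys

# Hypothetical stand-in for the heuristic query: may return an empty list
# when no algorithm fits the problem shape.
def query_heuristic_algos(shape):
    return []

def matmul_with_heuristic(shape) -> int:
    has_error = 0
    algos = query_heuristic_algos(shape)
    # Mirrors the new guard: report the failure instead of indexing algos[0]
    # (previously heuristicResult[0].algo was read unconditionally).
    if len(algos) == 0:
        has_error = 1
        print("Error: Matmul Algo Heuristic didn't return algorithms", file=sys.stderr)
    else:
        print(f"dispatching matmul with {algos[0]}")
    return has_error

print(matmul_with_heuristic((4096, 4096, 4096)))  # takes the error path, returns 1
```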
