# build with the C++ extension,
# which enables hooking native functions
bash scripts/build.sh
# build without the C++ extension:
# only profile CPU kernels and analyze the log
bash scripts/build_regular.sh
# build with the C++ extension for CUDA devices
export CUDA_DEV=true
bash scripts/build.sh
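After installation, a quick import check can confirm the build (a minimal sketch; treating the presence of the `Hook` extension as an indicator of a C++-extension build is an assumption):
import module_logging

print(module_logging.__file__)
# Hook is the compiled extension used for function hooking below;
# assumption: it is only present when the package was built with the C++ extension
print(hasattr(module_logging, "Hook"))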
# enable profiling before running the examples below
export ENABLE_PROFILING=True
import module_logging as ml
m = model()
with ml.combined_context():
    m()
import module_logging as ml
m = model()
with ml.PerformanceLogger(m):
    m()
from module_logging import PerformanceLogger as PL
pl = PL()
m = model()
pl.config(model=m)
pl.__enter__()
for i in range(100):
    m()
pl.__exit__()
# by default, print the total time table
python -m module_logging --path 7.log
# print summary table
python -m module_logging --path 7.log --summary
# print the detail table
python -m module_logging --path 7.log --detail
# print all three kinds of tables
python -m module_logging --path 7.log --all
# write table to csv: /tmp/total.csv
python -m module_logging --path 7.log --csv
# compare mode; the logs must be generated by profiling with Mode 2
python -m module_logging --compare --lhs_path 0.log --rhs_path 1.log
# compare mode and write to csv: /tmp/compare.csv
# the logs must be generated by profiling with Mode 2
python -m module_logging --compare --lhs_path 0.log --rhs_path 1.log --csv
# analyze the op distribution
python -m module_logging --dist --path 7.log
# compare two nn.Modules' inputs/outputs/parameters, or torch.Tensor(s)
python -m module_logging --percision --lhs_path 0.h5f --rhs_path 1.h5f
# enable C-level hook tracing
export ENABLE_HOOK_TRACE=true
# in your script (e.g. test.py), install the hooks:
import module_logging
module_logging.Hook.install_hook()
# then run the script:
python test.py
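A minimal sketch of what `test.py` could look like (the tensor workload is a placeholder; only `install_hook()` comes from the library):
# test.py
import torch
import module_logging

# install the C-level hooks before running any workload
module_logging.Hook.install_hook()

x = torch.randn(4, 4)
y = x * x  # kernels launched by this op are traced once the hooks are installed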
# also print native backtraces
export ENABLE_HOOK_TRACE=true
export PRINT_BACKTRACE=true
# in your script (e.g. test.py), install the hooks:
import module_logging
module_logging.Hook.install_hook()
# then run the script:
python test.py
Example backtrace output:
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/module_logging/Hook.cpython-38-x86_64-linux-gnu.so(_ZN5trace6Tracer5traceEv+0x39) [0x7fb56afa46d9]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/module_logging/Hook.cpython-38-x86_64-linux-gnu.so(_ZN5trace6TracerC1ENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE+0x92) [0x7fb56afa4942]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/module_logging/Hook.cpython-38-x86_64-linux-gnu.so(_ZN14CpuHookWrapper20local_launch_arg_setEPKvmm+0x99) [0x7fb56afa2b69]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/torch_xmlir/libxdnn_pytorch.so(_ZN14xpukernel_xpu310calc_basicILi2EfEEvPKT0_S3_PS1_x+0x46) [0x7fb69f724076]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/torch_xmlir/libxdnn_pytorch.so(+0x3c44692) [0x7fb6a23d4692]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/torch_xmlir/libxdnn_pytorch.so(_ZN8xpytorch3xpu3api13broadcast_mulIfEEiPNS1_7ContextEPKT_S7_PS5_RKSt6vectorIlSaIlEESD_+0x4b) [0x7fb6a23d26db]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/torch_xmlir/libxdnn_pytorch.so(+0x1a139ca) [0x7fb6a01a39ca]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/torch_xmlir/libxdnn_pytorch.so(_ZN12xdnn_pytorch10mul_tensorEPN8xpytorch3xpu3api7ContextERKNS_6TensorES7_RS5_+0x1f5) [0x7fb6a01a0685]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/torch_xmlir/_XMLIRC.cpython-38-x86_64-linux-gnu.so(+0xc5a1d4) [0x7fb6ed9761d4]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/torch_xmlir/_XMLIRC.cpython-38-x86_64-linux-gnu.so(+0xe4ae6e) [0x7fb6edb66e6e]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so(_ZN2at4_ops10mul_Tensor10redispatchEN3c1014DispatchKeySetERKNS_6TensorES6_+0x8a) [0x7fb7ce23204a]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so(+0x3d09390) [0x7fb7cffeb390]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so(+0x3d09e9b) [0x7fb7cffebe9b]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so(_ZN2at4_ops10mul_Tensor4callERKNS_6TensorES4_+0x175) [0x7fb7ce29b715]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so(+0x526184b) [0x7fb7d154384b]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so(_ZN5torch8autograd9generated12PowBackward05applyEOSt6vectorIN2at6TensorESaIS5_EE+0x144) [0x7fb7cfee50c4]
/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so(+0x48d9d8b) [0x7fb7d0bbbd8b]
import module_logging as ml
with ml.trace.Tracer(model=m, path="/tmp/profiling.log", print_module_info=False, ranks=[0, 1, 2]):
    m()
- model: optional; the nn.Module(s) to profile, either a single nn.Module or a list of nn.Module (see the sketch after this list)
- path: optional; a file path to save the profiling result
- print_module_info: optional; if True, record the profiling info and write it to /tmp/logs/
- ranks: the ranks to trace and profile; default is None, which means all ranks
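For example, a sketch that traces only part of a model (`encoder` and `decoder` are hypothetical sub-module names; `ranks` is omitted, so all ranks are traced):
import module_logging as ml

# passing a list of sub-modules uses the list form of the `model` argument
with ml.trace.Tracer(model=[m.encoder, m.decoder], path="/tmp/profiling.log"):
    m()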
Open the generated JSON file with chrome://tracing/.
# print summary table
python -m module_logging --path 7.log --summary
# print the detail table
python -m module_logging --path 7.log --detail
# print all three kinds of tables
python -m module_logging --path 7.log --all
from module_logging import percision_debugger
m = model()
percision_debugger.config(m, path="/tmp/", steps=[0, 1], ranks=[0])
percision_debugger.__enter__()
for step in range(100):
    inputs = []  # placeholder inputs
    m(inputs)
    # ...
    optimizer.step()
    percision_debugger.update_step()
percision_debugger.__exit__()
from module_logging import percision_debugger
m = model()
percision_debugger.config(m, path="/tmp/", steps=[0, 1], ranks=[0])
with percision_debugger:
    for step in range(100):
        inputs = []  # placeholder inputs
        m(inputs)
        # ...
        optimizer.step()
        percision_debugger.update_step()
# compare two nn.Modules' inputs/outputs/parameters, or torch.Tensor(s)
python -m module_logging --percision --lhs_path 0.h5f --rhs_path 1.h5f
During training, a faulty kernel implementation may write data out of bounds. This kind of corruption is silent and difficult to debug, so it is necessary to trace a Tensor and record every operation that modifies its underlying data.
import torch
from module_logging import tensor_tracer
tensor1 = torch.tensor([1, 2, 3], device='cpu').float()
tensor2 = torch.tensor([4, 5, 6], device='cpu').float()
tensor_tracer.__enter__()
# begin to trace the tensor
tensor_tracer.trace("tensor1", tensor1)
# tensor1 will be modified in place by add_
tensor1.add_(tensor2)
tensor_tracer.__exit__()
[aten op name]: aten.add_.Tensor
| Tensor | Status | Max | Min | Mean | Std |
|----------|--------|-------|-------|------|------|
| tensor1 | old | 3.0 | 1.0 | 2.0 | 1.0 |
| tensor1 | new | 9.0 | 5.0 | 7.0 | 2.0 |
The traced tensor will not be released until the end of the program.