-
Notifications
You must be signed in to change notification settings - Fork 114
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Fix] Replace Invalid OpenCL Representation of Infinity Constants for Float.POSITIVE_INFINITY
and Float.NEGATIVE_INFINITY
#592
base: develop
Are you sure you want to change the base?
Conversation
Float.POSITIVE_INFINITY
and Float.NEGATIVE_INFINITY
Float.POSITIVE_INFINITY
and Float.NEGATIVE_INFINITY
Thanks Michali, Is this only for OpenCL? is it possible to add it also for SPIR-V and PTX? |
@jjfumero Now, this support also PTX and SPIRV |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Thanks. This test seems to fail time to time. Example running with the OpenCL backend:
|
It also fails with the PTX backend. SPIR-V runs fine. |
@jjfumero rnado-test --printKernel -V uk.ac.manchester.tornado.unittests.compute.MMwithBytes
/home/michalis/TornadoVM/bin/sdk/bin/tornado --jvm "-Xmx6g -Dtornado.recover.bailout=False -Dtornado.unittests.verbose=True -Dtornado.print.kernel=True " -m tornado.unittests/uk.ac.manchester.tornado.unittests.tools.TornadoTestRunner --params "uk.ac.manchester.tornado.unittests.compute.MMwithBytes"
WARNING: Using incubator modules: jdk.incubator.vector
.version 7.6
.target sm_86
.address_size 64
.visible .entry s0_t0_matmultornado_uk_ac_manchester_tornado_api_kernelcontext_1543043602_arrays_bytearray_arrays_floatarray_arrays_floatarray_128(.param .u64 .ptr .global .align 8 kernel_context, .param .u64 .ptr .global .align 8 context_unused, .param .u64 .ptr .global .align 8 thisx, .param .u64 .ptr .global .align 8 that, .param .u64 .ptr .global .align 8 out, .param .align 8 .u64 dim1) {
.reg .pred rpb<10>;
.reg .s8 rsb<4>;
.reg .s64 rsd<8>;
.reg .s16 rsh<2>;
.reg .s32 rsi<36>;
.reg .u32 rui<10>;
.reg .f32 rfi<20>;
.reg .u64 rud<10>;
BLOCK_0:
ld.param.u64 rud0, [kernel_context];
ld.param.u64 rud1, [thisx];
ld.param.u64 rud2, [that];
ld.param.u64 rud3, [out];
mov.u32 rui0, %tid.x;
mov.u32 rui1, %ntid.x;
mov.u32 rui2, %ctaid.x;
mad.lo.s32 rsi0, rui2, rui1, rui0;
shl.b32 rsi1, rsi0, 7;
BLOCK_1:
mov.f32 rfi0, 0F00000000;
mov.s32 rsi2, 0;
LOOP_COND_1:
setp.lt.s32 rpb0, rsi2, 128;
@!rpb0 bra BLOCK_21;
BLOCK_2:
add.s32 rsi3, rsi1, rsi2;
shr.b32 rsi4, rsi3, 31;
shr.b32 rsi5, rsi4, 27;
add.s32 rsi6, rsi5, rsi3;
shr.b32 rsi7, rsi6, 5;
shl.b32 rsi8, rsi7, 1;
and.b32 rsi9, rsi6, -32;
add.s32 rsi10, rsi8, rsi9;
add.s32 rsi11, rsi10, 24;
cvt.s64.s32 rsd0, rsi11;
add.u64 rud4, rud1, rsd0;
ld.global.s8 rsb0, [rud4];
add.s32 rsi12, rsi10, 25;
cvt.s64.s32 rsd1, rsi12;
add.u64 rud5, rud1, rsd1;
ld.global.s8 rsb1, [rud5];
cvt.u32.s8 rui3, rsb1;
and.b32 rui4, rui3, 255;
shl.b32 rui5, rui4, 8;
cvt.u32.s8 rui6, rsb0;
and.b32 rui7, rui6, 255;
or.b32 rui8, rui5, rui7;
cvt.s16.u32 rsh0, rui8;
cvt.s32.s16 rsi13, rsh0;
and.b32 rsi14, rsi13, 32768;
and.b32 rsi15, rsi13, 31744;
shr.b32 rsi16, rsi15, 10;
setp.eq.s32 rpb1, rsi16, 31;
@!rpb1 bra BLOCK_6;
BLOCK_3:
and.b32 rsi17, rsi14, -32768;
setp.eq.s32 rpb2, rsi17, 0;
@!rpb2 bra BLOCK_5;
BLOCK_4:
div.full.f32 rfi1, 0F3F800000, 0F00000000;
mov.f32 rfi2, rfi1;
bra.uni BLOCK_20;
BLOCK_5:
div.full.f32 rfi3, 0FBF800000, 0F00000000;
mov.f32 rfi2, rfi3;
bra.uni BLOCK_20;
BLOCK_6:
and.b32 rsi18, rsi13, 1023;
cvt.rn.f32.s32 rfi4, rsi18;
and.b32 rsi19, rsi15, -1024;
setp.eq.s32 rpb3, rsi19, 0;
@!rpb3 bra BLOCK_14;
BLOCK_7:
and.b32 rsi20, rsi13, 1023;
setp.eq.s32 rpb4, rsi20, 0;
@!rpb4 bra BLOCK_11;
BLOCK_8:
and.b32 rsi21, rsi14, -32768;
setp.eq.s32 rpb5, rsi21, 0;
@!rpb5 bra BLOCK_10;
BLOCK_9:
mov.f32 rfi2, 0F00000000;
bra.uni BLOCK_20;
BLOCK_10:
mov.f32 rfi2, 0F80000000;
bra.uni BLOCK_20;
BLOCK_11:
mul.rn.f32 rfi5, rfi4, 0F33800000;
and.b32 rsi22, rsi14, -32768;
setp.eq.s32 rpb6, rsi22, 0;
@!rpb6 bra BLOCK_13;
BLOCK_12:
mov.f32 rfi2, rfi5;
bra.uni BLOCK_20;
BLOCK_13:
neg.f32 rfi6, rfi5;
mov.f32 rfi2, rfi6;
bra.uni BLOCK_20;
BLOCK_14:
add.s32 rsi23, rsi16, -15;
setp.lt.s32 rpb7, rsi16, 15;
@!rpb7 bra BLOCK_16;
BLOCK_15:
neg.s32 rsi24, rsi23;
shl.b32 rsi25, 1, rsi24;
cvt.rn.f32.s32 rfi7, rsi25;
div.full.f32 rfi8, 0F3F800000, rfi7;
mov.f32 rfi9, rfi8;
bra.uni BLOCK_17;
BLOCK_16:
shl.b32 rsi26, 1, rsi23;
cvt.rn.f32.s32 rfi10, rsi26;
mov.f32 rfi9, rfi10;
bra.uni BLOCK_17;
BLOCK_17:
div.full.f32 rfi11, rfi4, 0F44800000;
add.rn.f32 rfi12, rfi11, 0F3F800000;
mul.rn.f32 rfi13, rfi12, rfi9;
and.b32 rsi27, rsi14, -32768;
setp.eq.s32 rpb8, rsi27, 0;
@!rpb8 bra BLOCK_19;
BLOCK_18:
mov.f32 rfi2, rfi13;
bra.uni BLOCK_20;
BLOCK_19:
neg.f32 rfi14, rfi13;
mov.f32 rfi2, rfi14;
bra.uni BLOCK_20;
BLOCK_20:
rem.s32 rsi28, rsi3, 32;
add.s32 rsi29, rsi28, rsi10;
add.s32 rsi30, rsi29, 26;
cvt.s64.s32 rsd2, rsi30;
add.u64 rud6, rud1, rsd2;
ld.global.s8 rsb2, [rud6];
add.s32 rsi31, rsi2, 6;
cvt.s64.s32 rsd3, rsi31;
shl.b64 rsd4, rsd3, 2;
add.u64 rud7, rud2, rsd4;
ld.global.f32 rfi15, [rud7];
cvt.s32.s8 rsi32, rsb2;
cvt.rn.f32.s32 rfi16, rsi32;
mul.rn.f32 rfi17, rfi2, rfi16;
mad.rn.f32 rfi18, rfi17, rfi15, rfi0;
add.s32 rsi33, rsi2, 1;
mov.f32 rfi0, rfi18;
mov.s32 rsi2, rsi33;
bra.uni LOOP_COND_1;
BLOCK_21:
add.s32 rsi34, rsi0, 6;
cvt.s64.s32 rsd5, rsi34;
shl.b64 rsd6, rsd5, 2;
add.u64 rud8, rud3, rsd6;
st.global.f32 [rud8], rfi0;
ret;
}
Test: class uk.ac.manchester.tornado.unittests.compute.MMwithBytes
Running test: testMatrixMultiplicationWithBytes ................ [PASS]
Test ran: 1, Failed: 0, Unsupported: 0 |
@jjfumero this is now fixed. The issue it was with the use of Random generating value that they were overflown the byte casting. |
It still fails:
|
I am using NVIDIA Driver: |
the issue is with input data, I added 3 unit-tests to isolate the functionality of These seem to be consistent. Test: class uk.ac.manchester.tornado.unittests.compute.MMwithBytes
Running test: testMatrixMultiplicationWithBytes ................ [PASS]
Running test: testNegativeInfinityAssignment ................ [PASS]
Running test: testPositiveInfinity ................ [PASS]
Running test: testNegativeInfinity ................ [PASS]
Test ran: 4, Failed: 0, Unsupported: 0 |
Description
There was a bug when generating code for the constant values
Float.POSITIVE_INFINITY
andFloat.NEGATIVE_INFINITY
from the Float class. Using these values caused the compiler to generate OpenCL code with Infinity and -Infinity constants, which are not valid OpenCL intrinsics.This fix replaces Float.POSITIVE_INFINITY and Float.NEGATIVE_INFINITY with
1.0f / 0.0f
and-1.0f / 0.0f
respectively, which are the proper representations of positive and negative infinity in OpenCL.Backend/s tested
Mark the backends affected by this PR.
OS tested
Mark the OS where this PR is tested.
Did you check on FPGAs?
If it is applicable, check your changes on FPGAs.
How to test the new patch?