[Fix] Replace Invalid OpenCL Representation of Infinity Constants for `Float.POSITIVE_INFINITY` and `Float.NEGATIVE_INFINITY` #592

mikepapadim · 2024-11-21T13:57:06Z

Description

There was a bug when generating code for the constant values Float.POSITIVE_INFINITY and Float.NEGATIVE_INFINITY from the Float class. Using these values caused the compiler to generate OpenCL code with Infinity and -Infinity constants, which are not valid OpenCL intrinsics.

This fix replaces Float.POSITIVE_INFINITY and Float.NEGATIVE_INFINITY with 1.0f / 0.0f and -1.0f / 0.0f respectively, which are the proper representations of positive and negative infinity in OpenCL.

Backend/s tested

Mark the backends affected by this PR.

OpenCL
PTX
SPIRV

OS tested

Mark the OS where this PR is tested.

Linux
OSx
Windows

Did you check on FPGAs?

If it is applicable, check your changes on FPGAs.

Yes
No

How to test the new patch?

make jdk21 
tornado-test  -V uk.ac.manchester.tornado.unittests.compute.MMwithBytes

jjfumero · 2024-11-21T14:06:10Z

Thanks Michali, Is this only for OpenCL? is it possible to add it also for SPIR-V and PTX?

…into fix/infinity_intr

mikepapadim · 2024-11-22T10:13:27Z

@jjfumero Now, this support also PTX and SPIRV

mairooni

LGTM

jjfumero · 2024-11-25T08:35:55Z

Thanks. This test seems to fail time to time. Example running with the OpenCL backend:

Test: class uk.ac.manchester.tornado.unittests.compute.MMwithBytes
	Running test: testMatrixMultiplicationWithBytes ................  [FAILED] 
		\_[REASON] Output contains NaN at index 262
Test ran: 1, Failed: 1, Unsupported: 0

jjfumero · 2024-11-25T08:38:27Z

It also fails with the PTX backend. SPIR-V runs fine.

mikepapadim · 2024-11-25T13:14:08Z

@jjfumero
I cant reproduce it with the 4090:

rnado-test --printKernel  -V uk.ac.manchester.tornado.unittests.compute.MMwithBytes
/home/michalis/TornadoVM/bin/sdk/bin/tornado --jvm "-Xmx6g -Dtornado.recover.bailout=False -Dtornado.unittests.verbose=True -Dtornado.print.kernel=True "  -m  tornado.unittests/uk.ac.manchester.tornado.unittests.tools.TornadoTestRunner  --params "uk.ac.manchester.tornado.unittests.compute.MMwithBytes"
WARNING: Using incubator modules: jdk.incubator.vector

.version 7.6 
.target sm_86 
.address_size 64 

.visible .entry s0_t0_matmultornado_uk_ac_manchester_tornado_api_kernelcontext_1543043602_arrays_bytearray_arrays_floatarray_arrays_floatarray_128(.param .u64 .ptr .global .align 8 kernel_context, .param .u64 .ptr .global .align 8 context_unused, .param .u64 .ptr .global .align 8 thisx, .param .u64 .ptr .global .align 8 that, .param .u64 .ptr .global .align 8 out, .param .align 8 .u64 dim1) {
	.reg .pred rpb<10>;
	.reg .s8 rsb<4>;
	.reg .s64 rsd<8>;
	.reg .s16 rsh<2>;
	.reg .s32 rsi<36>;
	.reg .u32 rui<10>;
	.reg .f32 rfi<20>;
	.reg .u64 rud<10>;

BLOCK_0:
	ld.param.u64	rud0, [kernel_context];
	ld.param.u64	rud1, [thisx];
	ld.param.u64	rud2, [that];
	ld.param.u64	rud3, [out];
	mov.u32	rui0, %tid.x;
	mov.u32	rui1, %ntid.x;
	mov.u32	rui2, %ctaid.x;
	mad.lo.s32	rsi0, rui2, rui1, rui0;
	shl.b32	rsi1, rsi0, 7;

BLOCK_1:
	mov.f32	rfi0, 0F00000000;
	mov.s32	rsi2, 0;
LOOP_COND_1:
	setp.lt.s32	rpb0, rsi2, 128;
	@!rpb0 bra	BLOCK_21;

BLOCK_2:
	add.s32	rsi3, rsi1, rsi2;
	shr.b32	rsi4, rsi3, 31;
	shr.b32	rsi5, rsi4, 27;
	add.s32	rsi6, rsi5, rsi3;
	shr.b32	rsi7, rsi6, 5;
	shl.b32	rsi8, rsi7, 1;
	and.b32	rsi9, rsi6, -32;
	add.s32	rsi10, rsi8, rsi9;
	add.s32	rsi11, rsi10, 24;
	cvt.s64.s32	rsd0, rsi11;
	add.u64	rud4, rud1, rsd0;
	ld.global.s8	rsb0, [rud4];
	add.s32	rsi12, rsi10, 25;
	cvt.s64.s32	rsd1, rsi12;
	add.u64	rud5, rud1, rsd1;
	ld.global.s8	rsb1, [rud5];
	cvt.u32.s8	rui3, rsb1;
	and.b32	rui4, rui3, 255;
	shl.b32	rui5, rui4, 8;
	cvt.u32.s8	rui6, rsb0;
	and.b32	rui7, rui6, 255;
	or.b32	rui8, rui5, rui7;
	cvt.s16.u32	rsh0, rui8;
	cvt.s32.s16	rsi13, rsh0;
	and.b32	rsi14, rsi13, 32768;
	and.b32	rsi15, rsi13, 31744;
	shr.b32	rsi16, rsi15, 10;
	setp.eq.s32	rpb1, rsi16, 31;
	@!rpb1 bra	BLOCK_6;

BLOCK_3:
	and.b32	rsi17, rsi14, -32768;
	setp.eq.s32	rpb2, rsi17, 0;
	@!rpb2 bra	BLOCK_5;

BLOCK_4:
	div.full.f32	rfi1, 0F3F800000, 0F00000000;
	mov.f32	rfi2, rfi1;
	bra.uni	BLOCK_20;

BLOCK_5:
	div.full.f32	rfi3, 0FBF800000, 0F00000000;
	mov.f32	rfi2, rfi3;
	bra.uni	BLOCK_20;

BLOCK_6:
	and.b32	rsi18, rsi13, 1023;
	cvt.rn.f32.s32	rfi4, rsi18;
	and.b32	rsi19, rsi15, -1024;
	setp.eq.s32	rpb3, rsi19, 0;
	@!rpb3 bra	BLOCK_14;

BLOCK_7:
	and.b32	rsi20, rsi13, 1023;
	setp.eq.s32	rpb4, rsi20, 0;
	@!rpb4 bra	BLOCK_11;

BLOCK_8:
	and.b32	rsi21, rsi14, -32768;
	setp.eq.s32	rpb5, rsi21, 0;
	@!rpb5 bra	BLOCK_10;

BLOCK_9:
	mov.f32	rfi2, 0F00000000;
	bra.uni	BLOCK_20;

BLOCK_10:
	mov.f32	rfi2, 0F80000000;
	bra.uni	BLOCK_20;

BLOCK_11:
	mul.rn.f32	rfi5, rfi4, 0F33800000;
	and.b32	rsi22, rsi14, -32768;
	setp.eq.s32	rpb6, rsi22, 0;
	@!rpb6 bra	BLOCK_13;

BLOCK_12:
	mov.f32	rfi2, rfi5;
	bra.uni	BLOCK_20;

BLOCK_13:
	neg.f32	rfi6, rfi5;
	mov.f32	rfi2, rfi6;
	bra.uni	BLOCK_20;

BLOCK_14:
	add.s32	rsi23, rsi16, -15;
	setp.lt.s32	rpb7, rsi16, 15;
	@!rpb7 bra	BLOCK_16;

BLOCK_15:
	neg.s32	rsi24, rsi23;
	shl.b32	rsi25, 1, rsi24;
	cvt.rn.f32.s32	rfi7, rsi25;
	div.full.f32	rfi8, 0F3F800000, rfi7;
	mov.f32	rfi9, rfi8;
	bra.uni	BLOCK_17;

BLOCK_16:
	shl.b32	rsi26, 1, rsi23;
	cvt.rn.f32.s32	rfi10, rsi26;
	mov.f32	rfi9, rfi10;
	bra.uni	BLOCK_17;

BLOCK_17:
	div.full.f32	rfi11, rfi4, 0F44800000;
	add.rn.f32	rfi12, rfi11, 0F3F800000;
	mul.rn.f32	rfi13, rfi12, rfi9;
	and.b32	rsi27, rsi14, -32768;
	setp.eq.s32	rpb8, rsi27, 0;
	@!rpb8 bra	BLOCK_19;

BLOCK_18:
	mov.f32	rfi2, rfi13;
	bra.uni	BLOCK_20;

BLOCK_19:
	neg.f32	rfi14, rfi13;
	mov.f32	rfi2, rfi14;
	bra.uni	BLOCK_20;

BLOCK_20:
	rem.s32	rsi28, rsi3, 32;
	add.s32	rsi29, rsi28, rsi10;
	add.s32	rsi30, rsi29, 26;
	cvt.s64.s32	rsd2, rsi30;
	add.u64	rud6, rud1, rsd2;
	ld.global.s8	rsb2, [rud6];
	add.s32	rsi31, rsi2, 6;
	cvt.s64.s32	rsd3, rsi31;
	shl.b64	rsd4, rsd3, 2;
	add.u64	rud7, rud2, rsd4;
	ld.global.f32	rfi15, [rud7];
	cvt.s32.s8	rsi32, rsb2;
	cvt.rn.f32.s32	rfi16, rsi32;
	mul.rn.f32	rfi17, rfi2, rfi16;
	mad.rn.f32	rfi18, rfi17, rfi15, rfi0;
	add.s32	rsi33, rsi2, 1;
	mov.f32	rfi0, rfi18;
	mov.s32	rsi2, rsi33;
	bra.uni	LOOP_COND_1;

BLOCK_21:
	add.s32	rsi34, rsi0, 6;
	cvt.s64.s32	rsd5, rsi34;
	shl.b64	rsd6, rsd5, 2;
	add.u64	rud8, rud3, rsd6;
	st.global.f32	[rud8], rfi0;
	ret;
}

Test: class uk.ac.manchester.tornado.unittests.compute.MMwithBytes
	Running test: testMatrixMultiplicationWithBytes ................  [PASS] 
Test ran: 1, Failed: 0, Unsupported: 0

mikepapadim · 2024-11-26T10:02:38Z

@jjfumero this is now fixed. The issue it was with the use of Random generating value that they were overflown the byte casting.

jjfumero · 2024-11-26T10:18:08Z

It still fails:

 juan   fix/infinity_intr … 5  ~  tornadovm  TornadoVM  tornado-test  -V uk.ac.manchester.tornado.unittests.compute.MMwithBytes
/home/juan/tornadovm/TornadoVM/bin/sdk/bin/tornado --jvm "-Xmx6g -Dtornado.recover.bailout=False -Dtornado.unittests.verbose=True "  -m  tornado.unittests/uk.ac.manchester.tornado.unittests.tools.TornadoTestRunner  --params "uk.ac.manchester.tornado.unittests.compute.MMwithBytes"
WARNING: Using incubator modules: jdk.incubator.vector

Test: class uk.ac.manchester.tornado.unittests.compute.MMwithBytes
	Running test: testMatrixMultiplicationWithBytes ................  [PASS] 
Test ran: 1, Failed: 0, Unsupported: 0


Total Time (s): 0.8244667053222656
 juan   fix/infinity_intr … 5  ~  tornadovm  TornadoVM  tornado-test  -V uk.ac.manchester.tornado.unittests.compute.MMwithBytes
/home/juan/tornadovm/TornadoVM/bin/sdk/bin/tornado --jvm "-Xmx6g -Dtornado.recover.bailout=False -Dtornado.unittests.verbose=True "  -m  tornado.unittests/uk.ac.manchester.tornado.unittests.tools.TornadoTestRunner  --params "uk.ac.manchester.tornado.unittests.compute.MMwithBytes"
WARNING: Using incubator modules: jdk.incubator.vector

Test: class uk.ac.manchester.tornado.unittests.compute.MMwithBytes
	Running test: testMatrixMultiplicationWithBytes ................  [FAILED] 
		\_[REASON] Output contains NaN at index 263
Test ran: 1, Failed: 1, Unsupported: 0


Total Time (s): 0.8283684253692627

jjfumero · 2024-11-26T10:18:53Z

I am using NVIDIA Driver: 550.107.02 and CUDA 12.3

…EGATIVE_INFINITY

mikepapadim · 2024-11-26T12:04:59Z

the issue is with input data, I added 3 unit-tests to isolate the functionality of Float.POSITIVE_INFINITY and Float.NEGATIVE_INFINITY

These seem to be consistent.

Test: class uk.ac.manchester.tornado.unittests.compute.MMwithBytes
	Running test: testMatrixMultiplicationWithBytes ................  [PASS] 
	Running test: testNegativeInfinityAssignment ................  [PASS] 
	Running test: testPositiveInfinity       ................  [PASS] 
	Running test: testNegativeInfinity       ................  [PASS] 
Test ran: 4, Failed: 0, Unsupported: 0

mikepapadim added 6 commits November 21, 2024 12:56

Add support for infinity calculation in OpenCL backend

5cff4e4

Add licenses

8b9a534

Simplify infinity replacement

8de52cb

Add Javadocs

8e34056

Add run instructions

fd024f5

Add the test for infinity code gen in test suite

0b5bad5

mikepapadim added the bug Something isn't working label Nov 21, 2024

mikepapadim requested review from jjfumero, mairooni and stratika November 21, 2024 13:57

mikepapadim self-assigned this Nov 21, 2024

mikepapadim changed the title ~~Fix: Replace Invalid OpenCL Representation of Infinity Constants for Float.POSITIVE_INFINITY and Float.NEGATIVE_INFINITY~~ [Fix] Replace Invalid OpenCL Representation of Infinity Constants for Float.POSITIVE_INFINITY and Float.NEGATIVE_INFINITY Nov 21, 2024

mikepapadim added 4 commits November 22, 2024 09:58

Add fix for infinity intrinsics in PTX

eed8041

Add correct package for phase implementation

4ec62c2

Add fix for infinity intrinsics in SPIRV

bd897f5

Merge branch 'fix/infinity_intr' of github.com:mikepapadim/TornadoVM …

644edfb

…into fix/infinity_intr

mairooni approved these changes Nov 22, 2024

View reviewed changes

Replace random generation of input

3da9ffa

Add uni-tests to isolate usage of Float.POSITIVE_INFINITY and Float.N…

740252f

…EGATIVE_INFINITY

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Fix] Replace Invalid OpenCL Representation of Infinity Constants for `Float.POSITIVE_INFINITY` and `Float.NEGATIVE_INFINITY` #592

[Fix] Replace Invalid OpenCL Representation of Infinity Constants for `Float.POSITIVE_INFINITY` and `Float.NEGATIVE_INFINITY` #592

mikepapadim commented Nov 21, 2024 •

edited

Loading

jjfumero commented Nov 21, 2024

mikepapadim commented Nov 22, 2024

mairooni left a comment

jjfumero commented Nov 25, 2024

jjfumero commented Nov 25, 2024

mikepapadim commented Nov 25, 2024

mikepapadim commented Nov 26, 2024

jjfumero commented Nov 26, 2024

jjfumero commented Nov 26, 2024

mikepapadim commented Nov 26, 2024

[Fix] Replace Invalid OpenCL Representation of Infinity Constants for Float.POSITIVE_INFINITY and Float.NEGATIVE_INFINITY #592

Are you sure you want to change the base?

[Fix] Replace Invalid OpenCL Representation of Infinity Constants for Float.POSITIVE_INFINITY and Float.NEGATIVE_INFINITY #592

Conversation

mikepapadim commented Nov 21, 2024 • edited Loading

Description

Backend/s tested

OS tested

Did you check on FPGAs?

How to test the new patch?

jjfumero commented Nov 21, 2024

mikepapadim commented Nov 22, 2024

mairooni left a comment

Choose a reason for hiding this comment

jjfumero commented Nov 25, 2024

jjfumero commented Nov 25, 2024

mikepapadim commented Nov 25, 2024

mikepapadim commented Nov 26, 2024

jjfumero commented Nov 26, 2024

jjfumero commented Nov 26, 2024

mikepapadim commented Nov 26, 2024

[Fix] Replace Invalid OpenCL Representation of Infinity Constants for `Float.POSITIVE_INFINITY` and `Float.NEGATIVE_INFINITY` #592

[Fix] Replace Invalid OpenCL Representation of Infinity Constants for `Float.POSITIVE_INFINITY` and `Float.NEGATIVE_INFINITY` #592

mikepapadim commented Nov 21, 2024 •

edited

Loading