[llvm] [AMDGPU] Fix bf16 inv2pi inline constant hadling (PR #82283)

Mon Feb 19 14:38:16 PST 2024

https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/82283

>From 8cd9e71f55e4df5eaddfdea5fedcf091fbf9eea9 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 19 Feb 2024 14:25:34 -0800
Subject: [PATCH 1/3] [AMDGPU] Fix bf16 inv2pi inline constant hadling

Inline constant 1/(2*pi) has the truncated value 0x3e22. According
to the spec it is not rounded. A bf16 value in a nutshall is a fp32
value with cleared 16 bites of mantissa. The value 0x3e22 converted
to fp32 is 0.158203125 and the next representable value 0x3e23 means
0.1591796875. The fp32 value of 1/(2*pi) = 0.15915494 cannot be
represented in bf16. Although since bf16 values are essentailly
truncated fp32 values we can use 0.15915494 as an idiomatic
representation of 1/(2*pi) inline constant. This is also consistent
with sp3 behaviour. The patch fixes the problem that value we are
printing for inv2pi inline constant is not parsed as inv2pi by the
asm parser and gets rounded.
---
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      | 25 +++++++++++++------
 llvm/test/MC/AMDGPU/bf16_imm.s                |  6 ++++-
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 883b30562e911b..85bd33e4efbd0f 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -2230,6 +2230,24 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
       // in predicate methods (isLiteralImm())
       llvm_unreachable("fp literal in 64-bit integer instruction.");
 
+    case AMDGPU::OPERAND_REG_IMM_BF16:
+    case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED:
+    case AMDGPU::OPERAND_REG_INLINE_C_BF16:
+    case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
+    case AMDGPU::OPERAND_REG_INLINE_AC_BF16:
+    case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16:
+    case AMDGPU::OPERAND_REG_IMM_V2BF16:
+      if (AsmParser->hasInv2PiInlineImm() && Literal == 0x3fc45f306725feed) {
+        // This is the 1/(2*pi) which is going to be truncated to bf16 with the
+        // loss of precision. The constant represents ideomatic fp32 value of
+        // 1/(2*pi) = 0.15915494 since bf16 is in fact fp32 with cleared low 16
+        // bits. Prevent rounding below.
+        Inst.addOperand(MCOperand::createImm(0x3e22));
+        setImmKindLiteral();
+        return;
+      }
+      [[fallthrough]];
+
     case AMDGPU::OPERAND_REG_IMM_INT32:
     case AMDGPU::OPERAND_REG_IMM_FP32:
     case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
@@ -2238,24 +2256,17 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
     case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
     case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
     case AMDGPU::OPERAND_REG_IMM_INT16:
-    case AMDGPU::OPERAND_REG_IMM_BF16:
     case AMDGPU::OPERAND_REG_IMM_FP16:
-    case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED:
     case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
-    case AMDGPU::OPERAND_REG_INLINE_C_BF16:
     case AMDGPU::OPERAND_REG_INLINE_C_FP16:
     case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
-    case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
     case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
     case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
-    case AMDGPU::OPERAND_REG_INLINE_AC_BF16:
     case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
     case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
-    case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16:
     case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
     case AMDGPU::OPERAND_REG_IMM_V2INT16:
-    case AMDGPU::OPERAND_REG_IMM_V2BF16:
     case AMDGPU::OPERAND_REG_IMM_V2FP16:
     case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
     case AMDGPU::OPERAND_REG_IMM_V2FP32:
diff --git a/llvm/test/MC/AMDGPU/bf16_imm.s b/llvm/test/MC/AMDGPU/bf16_imm.s
index c27cbcc4f294a2..f04e347e5afb3b 100644
--- a/llvm/test/MC/AMDGPU/bf16_imm.s
+++ b/llvm/test/MC/AMDGPU/bf16_imm.s
@@ -34,10 +34,14 @@ v_dot2_bf16_bf16 v2, v0, 4.0, v2
 v_dot2_bf16_bf16 v2, v0, -4.0, v2
 // CHECK: v_dot2_bf16_bf16 v2, v0, -4.0, v2       ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xef,0x09,0x04]
 
-// FIXME: pi/2 rounded value is incorrect in the inst printer.
+// Check 1/(2*pi) rounded value and ideomatic fp32 0.15915494 value
+// which cannot be accurately represented in bf16.
 
 v_dot2_bf16_bf16 v2, v0, 0.158203125, v2
 // CHECK: v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04]
 
+v_dot2_bf16_bf16 v2, v0, v2, 0.15915494
+// CHECK: v_dot2_bf16_bf16 v2, v0, v2, 0.15915494 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0x05,0xe2,0x03]
+
 v_dot2_bf16_bf16 v2, v0, 0x3e22, v2
 // CHECK: v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04]

>From 991ddb67722086a53a1e939b495b744cc65d9b00 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 19 Feb 2024 14:35:35 -0800
Subject: [PATCH 2/3] Added one more test for scalar bf16 value

---
 llvm/test/MC/AMDGPU/bf16_imm.s | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/test/MC/AMDGPU/bf16_imm.s b/llvm/test/MC/AMDGPU/bf16_imm.s
index f04e347e5afb3b..25bf0d6cfcb25a 100644
--- a/llvm/test/MC/AMDGPU/bf16_imm.s
+++ b/llvm/test/MC/AMDGPU/bf16_imm.s
@@ -45,3 +45,6 @@ v_dot2_bf16_bf16 v2, v0, v2, 0.15915494
 
 v_dot2_bf16_bf16 v2, v0, 0x3e22, v2
 // CHECK: v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04]
+
+v_dot2_bf16_bf16 v2, v0, v2, 0.15915494
+// CHECK: v_dot2_bf16_bf16 v2, v0, v2, 0.15915494 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0x05,0xe2,0x03]

>From 227bf867fec18387344987fa7175e5a66e0f45fe Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 19 Feb 2024 14:38:00 -0800
Subject: [PATCH 3/3] Test fix

---
 llvm/test/MC/AMDGPU/bf16_imm.s | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/MC/AMDGPU/bf16_imm.s b/llvm/test/MC/AMDGPU/bf16_imm.s
index 25bf0d6cfcb25a..271bd19aa49690 100644
--- a/llvm/test/MC/AMDGPU/bf16_imm.s
+++ b/llvm/test/MC/AMDGPU/bf16_imm.s
@@ -40,8 +40,8 @@ v_dot2_bf16_bf16 v2, v0, -4.0, v2
 v_dot2_bf16_bf16 v2, v0, 0.158203125, v2
 // CHECK: v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04]
 
-v_dot2_bf16_bf16 v2, v0, v2, 0.15915494
-// CHECK: v_dot2_bf16_bf16 v2, v0, v2, 0.15915494 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0x05,0xe2,0x03]
+v_dot2_bf16_bf16 v2, v0, 0.15915494, v2
+// CHECK: v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04]
 
 v_dot2_bf16_bf16 v2, v0, 0x3e22, v2
 // CHECK: v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04]