[llvm] [AMDGPU] Fix wrong operand value when floating-point value is used as operand of type i16 (PR #84106)

Tue Mar 5 20:16:48 PST 2024

https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/84106

>From 267f7c0914b107265a72ea2ac4ab8c9c45f4d8b2 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Tue, 5 Mar 2024 23:16:30 -0500
Subject: [PATCH] [AMDGPU] Fix wrong operand value when floating-point value is
 used as operand of type i16

Based on the section "OPF_INV2PI_16" of the spec, when a floating-point value is
used as operand of type `i16`, the 32-bit representation of the constant
truncated to the 16 LSBs should be used. Currently we directly use the FP16
representation, which doesn't conform with the spec.

For example, when `0.5` is used, for now we take it as `0x3800` because that is
the encoding of `<half 0.5>`. Instead, it should be `0x3f000000` truncated to 16
LSB, which is `0x0000`.
---
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      | 33 ++++++++++++++++---
 llvm/test/MC/AMDGPU/gfx10_asm_vop1.s          | 16 ++++-----
 2 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 10999d846e3bb2..5050aec2613815 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1926,6 +1926,11 @@ static const fltSemantics *getFltSemantics(MVT VT) {
 
 static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
   switch (OperandType) {
+  // When floating-point immediate is used as operand of type i16, the 32-bit
+  // representation of the constant truncated to the 16 LSBs should be used.
+  case AMDGPU::OPERAND_REG_IMM_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
   case AMDGPU::OPERAND_REG_IMM_INT32:
   case AMDGPU::OPERAND_REG_IMM_FP32:
   case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
@@ -1949,13 +1954,10 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
     return &APFloat::IEEEdouble();
-  case AMDGPU::OPERAND_REG_IMM_INT16:
   case AMDGPU::OPERAND_REG_IMM_FP16:
   case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
-  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
-  case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
   case AMDGPU::OPERAND_REG_IMM_V2FP16:
@@ -2045,9 +2047,26 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const {
       return false;
 
     if (type.getScalarSizeInBits() == 16) {
+      bool Lost;
+      switch (type.getScalarType().SimpleTy) {
+      default:
+        llvm_unreachable("unknown 16-bit type");
+      case MVT::bf16:
+        FPLiteral.convert(APFloatBase::BFloat(), APFloat::rmNearestTiesToEven,
+                          &Lost);
+        break;
+      case MVT::f16:
+        FPLiteral.convert(APFloatBase::IEEEhalf(), APFloat::rmNearestTiesToEven,
+                          &Lost);
+        break;
+      case MVT::i16:
+        FPLiteral.convert(APFloatBase::IEEEsingle(),
+                          APFloat::rmNearestTiesToEven, &Lost);
+        break;
+      }
       return isInlineableLiteralOp16(
-        static_cast<int16_t>(FPLiteral.bitcastToAPInt().getZExtValue()),
-        type, AsmParser->hasInv2PiInlineImm());
+          static_cast<uint16_t>(FPLiteral.bitcastToAPInt().getZExtValue()), type,
+          AsmParser->hasInv2PiInlineImm());
     }
 
     // Check if single precision literal is inlinable
@@ -2315,6 +2334,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
       // checked earlier in isLiteralImm()
 
       uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue();
+      if (OpTy == AMDGPU::OPERAND_REG_IMM_INT16 ||
+          OpTy == AMDGPU::OPERAND_REG_INLINE_C_INT16 ||
+          OpTy == AMDGPU::OPERAND_REG_INLINE_AC_INT16)
+        ImmVal = ImmVal & 0xffff;
       Inst.addOperand(MCOperand::createImm(ImmVal));
       if (OpTy == AMDGPU::OPERAND_KIMM32 || OpTy == AMDGPU::OPERAND_KIMM16) {
         setImmKindMandatoryLiteral();
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop1.s
index 1cfafebe2c3cd4..8b685805fb3ed9 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop1.s
@@ -11189,10 +11189,10 @@ v_cvt_f16_u16_e32 v5, -1
 // GFX10: encoding: [0xc1,0xa0,0x0a,0x7e]
 
 v_cvt_f16_u16_e32 v5, 0.5
-// GFX10: encoding: [0xff,0xa0,0x0a,0x7e,0x00,0x38,0x00,0x00]
+// GFX10: encoding: [0x80,0xa0,0x0a,0x7e]
 
 v_cvt_f16_u16_e32 v5, -4.0
-// GFX10: encoding: [0xff,0xa0,0x0a,0x7e,0x00,0xc4,0x00,0x00]
+// GFX10: encoding: [0x80,0xa0,0x0a,0x7e]
 
 v_cvt_f16_u16_e32 v5, 0xfe0b
 // GFX10: encoding: [0xff,0xa0,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
@@ -11237,10 +11237,10 @@ v_cvt_f16_u16_e64 v5, -1
 // GFX10: encoding: [0x05,0x00,0xd0,0xd5,0xc1,0x00,0x00,0x00]
 
 v_cvt_f16_u16_e64 v5, 0.5
-// GFX10: encoding: [0x05,0x00,0xd0,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
+// GFX10: encoding: [0x05,0x00,0xd0,0xd5,0x80,0x00,0x00,0x00]
 
 v_cvt_f16_u16_e64 v5, -4.0
-// GFX10: encoding: [0x05,0x00,0xd0,0xd5,0xff,0x00,0x00,0x00,0x00,0xc4,0x00,0x00]
+// GFX10: encoding: [0x05,0x00,0xd0,0xd5,0x80,0x00,0x00,0x00]
 
 v_cvt_f16_u16_e64 v5, v1 clamp
 // GFX10: encoding: [0x05,0x80,0xd0,0xd5,0x01,0x01,0x00,0x00]
@@ -11435,10 +11435,10 @@ v_cvt_f16_i16_e32 v5, -1
 // GFX10: encoding: [0xc1,0xa2,0x0a,0x7e]
 
 v_cvt_f16_i16_e32 v5, 0.5
-// GFX10: encoding: [0xff,0xa2,0x0a,0x7e,0x00,0x38,0x00,0x00]
+// GFX10: encoding: [0x80,0xa2,0x0a,0x7e]
 
 v_cvt_f16_i16_e32 v5, -4.0
-// GFX10: encoding: [0xff,0xa2,0x0a,0x7e,0x00,0xc4,0x00,0x00]
+// GFX10: encoding: [0x80,0xa2,0x0a,0x7e]
 
 v_cvt_f16_i16_e32 v5, 0xfe0b
 // GFX10: encoding: [0xff,0xa2,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
@@ -11483,10 +11483,10 @@ v_cvt_f16_i16_e64 v5, -1
 // GFX10: encoding: [0x05,0x00,0xd1,0xd5,0xc1,0x00,0x00,0x00]
 
 v_cvt_f16_i16_e64 v5, 0.5
-// GFX10: encoding: [0x05,0x00,0xd1,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
+// GFX10: encoding: [0x05,0x00,0xd1,0xd5,0x80,0x00,0x00,0x00]
 
 v_cvt_f16_i16_e64 v5, -4.0
-// GFX10: encoding: [0x05,0x00,0xd1,0xd5,0xff,0x00,0x00,0x00,0x00,0xc4,0x00,0x00]
+// GFX10: encoding: [0x05,0x00,0xd1,0xd5,0x80,0x00,0x00,0x00]
 
 v_cvt_f16_i16_e64 v5, v1 clamp
 // GFX10: encoding: [0x05,0x80,0xd1,0xd5,0x01,0x01,0x00,0x00]