[llvm] [AMDGPU] Fix canonicalization of truncated values. (PR #83054)

Harald van Dijk via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 26 16:22:51 PST 2024


https://github.com/hvdijk updated https://github.com/llvm/llvm-project/pull/83054

>From 6c814cc14829401c22695637cd1c2af247fd3384 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald at gigawatt.nl>
Date: Tue, 27 Feb 2024 00:22:28 +0000
Subject: [PATCH] [AMDGPU] Fix canonicalization of truncated values.

We were relying on roundings to implicitly canonicalize, which is
generally safe, except with roundings that may be optimized away.

Fixes #82937.
---
 .../SelectionDAG/LegalizeFloatTypes.cpp       |   1 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  33 ++-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |   5 +
 llvm/test/CodeGen/AMDGPU/bf16.ll              |   4 +
 llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 270 ++++++++++++++----
 5 files changed, 244 insertions(+), 69 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index f0a04589fbfdc2..315053ce1ac0f0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2398,6 +2398,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::FCOPYSIGN:  R = PromoteFloatRes_FCOPYSIGN(N); break;
 
     // Unary FP Operations
+    case ISD::FREEZE:
     case ISD::FABS:
     case ISD::FCBRT:
     case ISD::FCEIL:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 84ef9679ab9563..4e99282984868f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12416,7 +12416,7 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,
 }
 
 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
-                                       unsigned MaxDepth) const {
+                                       bool &Trunc, unsigned MaxDepth) const {
   unsigned Opcode = Op.getOpcode();
   if (Opcode == ISD::FCANONICALIZE)
     return true;
@@ -12450,7 +12450,6 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
   case ISD::FSQRT:
   case ISD::FDIV:
   case ISD::FREM:
-  case ISD::FP_ROUND:
   case ISD::FP_EXTEND:
   case ISD::FLDEXP:
   case AMDGPUISD::FMUL_LEGACY:
@@ -12473,12 +12472,17 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
   case AMDGPUISD::CVT_F32_UBYTE3:
     return true;
 
+  case ISD::FP_ROUND:
+    if (Op.getConstantOperandVal(1))
+      Trunc = true;
+    return true;
+
   // It can/will be lowered or combined as a bit operation.
   // Need to check their input recursively to handle.
   case ISD::FNEG:
   case ISD::FABS:
   case ISD::FCOPYSIGN:
-    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+    return isCanonicalized(DAG, Op.getOperand(0), Trunc, MaxDepth - 1);
 
   case ISD::FSIN:
   case ISD::FCOS:
@@ -12513,20 +12517,20 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
 
     // FIXME: Does this apply with clamp? It's implemented with max.
     for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
-      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
+      if (!isCanonicalized(DAG, Op.getOperand(I), Trunc, MaxDepth - 1))
         return false;
     }
 
     return true;
   }
   case ISD::SELECT: {
-    return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
-           isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
+    return isCanonicalized(DAG, Op.getOperand(1), Trunc, MaxDepth - 1) &&
+           isCanonicalized(DAG, Op.getOperand(2), Trunc, MaxDepth - 1);
   }
   case ISD::BUILD_VECTOR: {
     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
       SDValue SrcOp = Op.getOperand(i);
-      if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
+      if (!isCanonicalized(DAG, SrcOp, Trunc, MaxDepth - 1))
         return false;
     }
 
@@ -12534,18 +12538,18 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
   }
   case ISD::EXTRACT_VECTOR_ELT:
   case ISD::EXTRACT_SUBVECTOR: {
-    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+    return isCanonicalized(DAG, Op.getOperand(0), Trunc, MaxDepth - 1);
   }
   case ISD::INSERT_VECTOR_ELT: {
-    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
-           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
+    return isCanonicalized(DAG, Op.getOperand(0), Trunc, MaxDepth - 1) &&
+           isCanonicalized(DAG, Op.getOperand(1), Trunc, MaxDepth - 1);
   }
   case ISD::UNDEF:
     // Could be anything.
     return false;
 
   case ISD::BITCAST:
-    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+    return isCanonicalized(DAG, Op.getOperand(0), Trunc, MaxDepth - 1);
   case ISD::TRUNCATE: {
     // Hack round the mess we make when legalizing extract_vector_elt
     if (Op.getValueType() == MVT::i16) {
@@ -12553,7 +12557,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
       if (TruncSrc.getValueType() == MVT::i32 &&
           TruncSrc.getOpcode() == ISD::BITCAST &&
           TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
-        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
+        return isCanonicalized(DAG, TruncSrc.getOperand(0), Trunc, MaxDepth - 1);
       }
     }
     return false;
@@ -12831,7 +12835,10 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
     }
   }
 
-  return isCanonicalized(DAG, N0) ? N0 : SDValue();
+  bool Trunc = false;
+  return isCanonicalized(DAG, N0, Trunc)
+             ? Trunc ? DAG.getNode(ISD::FREEZE, SDLoc(N), VT, N0) : N0
+             : SDValue();
 }
 
 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index f6e1d198f40aec..f89a4d805d59b4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -516,6 +516,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                            Register N1) const override;
 
   bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
+                       unsigned MaxDepth = 5) const {
+    bool Trunc;
+    return isCanonicalized(DAG, Op, Trunc, MaxDepth);
+  }
+  bool isCanonicalized(SelectionDAG &DAG, SDValue Op, bool &Trunc,
                        unsigned MaxDepth = 5) const;
   bool isCanonicalized(Register Reg, MachineFunction &MF,
                        unsigned MaxDepth = 5) const;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index c773742a459507..14c5a8c46f7ba5 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -26818,11 +26818,15 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
 ; GCN-LABEL: v_canonicalize_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_canonicalize_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_canonicalize_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 274621307f540d..05a14470ff65e0 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -170,6 +170,35 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
   ret void
 }
 
+define half @s_test_canonicalize_arg(half %x) #1 {
+; VI-LABEL: s_test_canonicalize_arg:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_test_canonicalize_arg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: s_test_canonicalize_arg:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_test_canonicalize_arg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %canonicalized = call half @llvm.canonicalize.f16(half %x)
+  ret half %canonicalized
+}
+
 define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
 ; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
 ; VI:       ; %bb.0:
@@ -3456,24 +3485,54 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; CI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v4
 ; CI-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v7
 ; CI-NEXT:    v_cvt_f16_f32_e32 v7, v9
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; CI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v6
 ; CI-NEXT:    v_cvt_f16_f32_e32 v6, v10
 ; CI-NEXT:    v_cvt_f16_f32_e32 v9, v13
 ; CI-NEXT:    v_cvt_f16_f32_e32 v10, v18
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v4, v8
 ; CI-NEXT:    v_cvt_f16_f32_e32 v8, v14
 ; CI-NEXT:    v_cvt_f16_f32_e32 v13, v21
 ; CI-NEXT:    v_cvt_f16_f32_e32 v14, v26
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; CI-NEXT:    v_or_b32_e32 v4, v5, v4
 ; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
@@ -3481,16 +3540,32 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_or_b32_e32 v5, v7, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v7, v11
 ; CI-NEXT:    v_cvt_f16_f32_e32 v11, v17
-; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; CI-NEXT:    v_cvt_f16_f32_e32 v12, v22
+; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; CI-NEXT:    v_or_b32_e32 v6, v7, v6
 ; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
 ; CI-NEXT:    v_cvt_f16_f32_e32 v8, v16
 ; CI-NEXT:    v_or_b32_e32 v7, v9, v7
 ; CI-NEXT:    v_cvt_f16_f32_e32 v9, v15
 ; CI-NEXT:    v_cvt_f16_f32_e32 v15, v25
-; CI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; CI-NEXT:    v_cvt_f16_f32_e32 v25, v29
+; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; CI-NEXT:    v_or_b32_e32 v8, v9, v8
 ; CI-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
 ; CI-NEXT:    v_cvt_f16_f32_e32 v10, v20
@@ -3500,6 +3575,10 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32
 ; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:8
+; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; CI-NEXT:    v_or_b32_e32 v10, v11, v10
 ; CI-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
@@ -3510,8 +3589,14 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:16
 ; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:28
 ; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:24
-; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
 ; CI-NEXT:    v_cvt_f16_f32_e32 v24, v30
+; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; CI-NEXT:    v_or_b32_e32 v12, v13, v12
 ; CI-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
 ; CI-NEXT:    v_or_b32_e32 v13, v15, v13
@@ -3521,6 +3606,10 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:32
 ; CI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:44
 ; CI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:40
+; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; CI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; CI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; CI-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
@@ -3533,6 +3622,14 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
 ; CI-NEXT:    s_waitcnt vmcnt(8)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; CI-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; CI-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; CI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
 ; CI-NEXT:    v_or_b32_e32 v16, v17, v16
 ; CI-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
@@ -3545,6 +3642,14 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_cvt_f16_f32_e32 v20, v22
 ; CI-NEXT:    s_waitcnt vmcnt(4)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v21, v23
+; CI-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; CI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; CI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; CI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; CI-NEXT:    v_or_b32_e32 v18, v19, v18
 ; CI-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
@@ -3557,13 +3662,21 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v28
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v29
+; CI-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; CI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
 ; CI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; CI-NEXT:    v_or_b32_e32 v20, v21, v20
 ; CI-NEXT:    v_lshlrev_b32_e32 v21, 16, v26
 ; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
 ; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:48
-; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:60
-; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
+; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
+; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:56
 ; CI-NEXT:    v_or_b32_e32 v21, v27, v21
 ; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:132
 ; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:128
@@ -3572,40 +3685,70 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    s_waitcnt vmcnt(4)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; CI-NEXT:    s_waitcnt vmcnt(3)
-; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT:    s_waitcnt vmcnt(2)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT:    s_waitcnt vmcnt(2)
+; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; CI-NEXT:    v_or_b32_e32 v24, v25, v24
+; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT:    v_cvt_f32_f16_e32 v28, v22
+; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; CI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
 ; CI-NEXT:    v_or_b32_e32 v26, v27, v26
 ; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x7c, v0
 ; CI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
 ; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:124
 ; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:120
-; CI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; CI-NEXT:    v_or_b32_e32 v22, v22, v23
-; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:88
-; CI-NEXT:    s_waitcnt vmcnt(2)
+; CI-NEXT:    v_lshlrev_b32_e32 v22, 16, v24
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v28
+; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT:    v_or_b32_e32 v22, v25, v22
+; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:92
+; CI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; CI-NEXT:    v_or_b32_e32 v23, v23, v24
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:88
+; CI-NEXT:    s_waitcnt vmcnt(3)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT:    s_waitcnt vmcnt(2)
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT:    v_cvt_f32_f16_e32 v25, v25
 ; CI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
 ; CI-NEXT:    v_or_b32_e32 v26, v27, v26
 ; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x78, v0
 ; CI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
 ; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:116
 ; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:112
-; CI-NEXT:    s_waitcnt vmcnt(3)
-; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; CI-NEXT:    v_or_b32_e32 v24, v24, v25
+; CI-NEXT:    v_add_i32_e32 v25, vcc, 0x68, v0
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
 ; CI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
 ; CI-NEXT:    v_or_b32_e32 v26, v27, v26
 ; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x74, v0
@@ -3613,61 +3756,76 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:108
 ; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:104
 ; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v26
+; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v26, v27
-; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:92
-; CI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT:    v_or_b32_e32 v25, v26, v25
-; CI-NEXT:    v_add_i32_e32 v26, vcc, 0x70, v0
-; CI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
-; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:100
-; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:96
-; CI-NEXT:    s_waitcnt vmcnt(3)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; CI-NEXT:    v_or_b32_e32 v23, v23, v27
+; CI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; CI-NEXT:    v_or_b32_e32 v26, v27, v26
+; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x70, v0
+; CI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:100
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:96
 ; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x68, v0
-; CI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT:    v_or_b32_e32 v25, v26, v25
-; CI-NEXT:    v_add_i32_e32 v26, vcc, 0x6c, v0
-; CI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
-; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:68
-; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:64
-; CI-NEXT:    buffer_store_dword v23, v27, s[0:3], 0 offen
-; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:76
-; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:72
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; CI-NEXT:    v_or_b32_e32 v26, v27, v26
+; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x6c, v0
+; CI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:68
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:64
+; CI-NEXT:    buffer_store_dword v24, v25, s[0:3], 0 offen
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:76
+; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:72
 ; CI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:84
 ; CI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:80
 ; CI-NEXT:    s_waitcnt vmcnt(3)
-; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; CI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT:    v_or_b32_e32 v25, v26, v25
 ; CI-NEXT:    s_waitcnt vmcnt(2)
-; CI-NEXT:    v_cvt_f16_f32_e32 v26, v27
-; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v27, v29
-; CI-NEXT:    v_or_b32_e32 v23, v26, v23
-; CI-NEXT:    v_cvt_f16_f32_e32 v26, v28
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; CI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; CI-NEXT:    v_or_b32_e32 v24, v25, v24
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v28
 ; CI-NEXT:    v_or_b32_e32 v26, v27, v26
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v29
+; CI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; CI-NEXT:    v_or_b32_e32 v25, v27, v25
 ; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x64, v0
-; CI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v26, vcc, 0x60, v0
-; CI-NEXT:    buffer_store_dword v23, v26, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v23, vcc, 0x5c, v0
-; CI-NEXT:    buffer_store_dword v25, v23, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v23, vcc, 0x58, v0
+; CI-NEXT:    buffer_store_dword v25, v27, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v25, vcc, 0x60, v0
+; CI-NEXT:    buffer_store_dword v24, v25, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v24, vcc, 0x5c, v0
+; CI-NEXT:    buffer_store_dword v26, v24, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v24, vcc, 0x58, v0
+; CI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v23, vcc, 0x54, v0
 ; CI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v22, vcc, 0x54, v0
-; CI-NEXT:    buffer_store_dword v24, v22, s[0:3], 0 offen
 ; CI-NEXT:    v_add_i32_e32 v22, vcc, 0x50, v0
 ; CI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
 ; CI-NEXT:    v_add_i32_e32 v21, vcc, 0x4c, v0



More information about the llvm-commits mailing list