[llvm] 675c942 - [AMDGPU] Disable NSA for BVH instructions when appropriate
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 2 04:10:06 PDT 2021
Author: Carl Ritson
Date: 2021-08-02T20:09:26+09:00
New Revision: 675c9423736d4117e918f734c995dc3bb069c986
URL: https://github.com/llvm/llvm-project/commit/675c9423736d4117e918f734c995dc3bb069c986
DIFF: https://github.com/llvm/llvm-project/commit/675c9423736d4117e918f734c995dc3bb069c986.diff
LOG: [AMDGPU] Disable NSA for BVH instructions when appropriate
Check maximum NSA size when selecting NSA or non-NSA BVH instructions.
Differential Revision: https://reviews.llvm.org/D103230
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index a357c86ff0019..ddd8eac9adf9d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4770,12 +4770,20 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
return false;
}
- bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
- bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
- unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
- : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
- : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
- : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+ const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
+ const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
+ const unsigned NumVAddrs = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
+ const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
+ const unsigned Opcodes[2][2][2] = {
+ {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa,
+ AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa},
+ {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa,
+ AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa}},
+ {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa,
+ AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa},
+ {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa,
+ AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa}}};
+ const unsigned Opcode = Opcodes[UseNSA][IsA16][Is64];
SmallVector<Register, 12> Ops;
if (Is64) {
@@ -4812,6 +4820,14 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
packLanes(RayInvDir);
}
+ if (!UseNSA) {
+ // Build a single vector containing all the operands so far prepared.
+ LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
+ Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
+ Ops.clear();
+ Ops.push_back(MergedOps);
+ }
+
auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
.addDef(DstReg)
.addImm(Opcode);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5e95ebc80ec56..56560ae28b35c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4255,8 +4255,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned N = MI.getNumExplicitOperands() - 2;
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
- for (unsigned I = 2; I < N; ++I)
- OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ if (N == 3) {
+ // Sequential form: all operands combined into VGPR256/VGPR512
+ unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ if (Size > 256)
+ Size = 512;
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ } else {
+ // NSA form
+ for (unsigned I = 2; I < N; ++I)
+ OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ }
break;
}
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d98acfc6c5327..3479f56e71b52 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7341,7 +7341,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
- SDLoc DL(Op);
MemSDNode *M = cast<MemSDNode>(Op);
SDValue NodePtr = M->getOperand(2);
SDValue RayExtent = M->getOperand(3);
@@ -7360,12 +7359,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return SDValue();
}
- bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
- bool Is64 = NodePtr.getValueType() == MVT::i64;
- unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
- : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
- : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
- : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+ const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
+ const bool Is64 = NodePtr.getValueType() == MVT::i64;
+ const unsigned NumVAddrs = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
+ const bool UseNSA =
+ Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize();
+ const unsigned Opcodes[2][2][2] = {
+ {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa,
+ AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa},
+ {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa,
+ AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa}},
+ {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa,
+ AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa},
+ {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa,
+ AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa}}};
+ const unsigned Opcode = Opcodes[UseNSA][IsA16][Is64];
SmallVector<SDValue, 16> Ops;
@@ -7405,6 +7413,20 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
packLanes(RayOrigin, true);
packLanes(RayDir, true);
packLanes(RayInvDir, false);
+
+ if (!UseNSA) {
+ // Build a single vector containing all the operands so far prepared.
+ if (NumVAddrs > 8) {
+ SDValue Undef = DAG.getUNDEF(MVT::i32);
+ Ops.append(16 - Ops.size(), Undef);
+ }
+ assert(Ops.size() == 8 || Ops.size() == 16);
+ SDValue MergedOps = DAG.getBuildVector(
+ Ops.size() == 16 ? MVT::v16i32 : MVT::v8i32, DL, Ops);
+ Ops.clear();
+ Ops.push_back(MergedOps);
+ }
+
Ops.push_back(TDescr);
if (IsA16)
Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1));
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index a0bb2190a65f8..0c8010c55d530 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -15,11 +15,23 @@ declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4
declare i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
-; GCN-LABEL: image_bvh_intersect_ray:
-; GCN: ; %bb.0:
-; GCN-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ; return to shader part epilog
+; GFX1030-LABEL: image_bvh_intersect_ray:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3]
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh_intersect_ray:
+; GFX1013: ; %bb.0:
+; GFX1013-NEXT: v_mov_b32_e32 v5, v6
+; GFX1013-NEXT: v_mov_b32_e32 v6, v7
+; GFX1013-NEXT: v_mov_b32_e32 v7, v8
+; GFX1013-NEXT: v_mov_b32_e32 v8, v10
+; GFX1013-NEXT: v_mov_b32_e32 v9, v11
+; GFX1013-NEXT: v_mov_b32_e32 v10, v12
+; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
+; GFX1013-NEXT: s_waitcnt vmcnt(0)
+; GFX1013-NEXT: ; return to shader part epilog
; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
@@ -68,11 +80,23 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %
}
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
-; GCN-LABEL: image_bvh64_intersect_ray:
-; GCN: ; %bb.0:
-; GCN-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ; return to shader part epilog
+; GFX1030-LABEL: image_bvh64_intersect_ray:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3]
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh64_intersect_ray:
+; GFX1013: ; %bb.0:
+; GFX1013-NEXT: v_mov_b32_e32 v6, v7
+; GFX1013-NEXT: v_mov_b32_e32 v7, v8
+; GFX1013-NEXT: v_mov_b32_e32 v8, v9
+; GFX1013-NEXT: v_mov_b32_e32 v9, v11
+; GFX1013-NEXT: v_mov_b32_e32 v10, v12
+; GFX1013-NEXT: v_mov_b32_e32 v11, v13
+; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GFX1013-NEXT: s_waitcnt vmcnt(0)
+; GFX1013-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
@@ -121,136 +145,265 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float
}
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
-; GCN-LABEL: image_bvh_intersect_ray_vgpr_descr:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b32 s1, exec_lo
-; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: v_readfirstlane_b32 s4, v14
-; GCN-NEXT: v_readfirstlane_b32 s5, v15
-; GCN-NEXT: v_readfirstlane_b32 s6, v16
-; GCN-NEXT: v_readfirstlane_b32 s7, v17
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
-; GCN-NEXT: image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7]
-; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
-; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
-; GCN-NEXT: s_and_saveexec_b32 s0, s0
-; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GCN-NEXT: s_cbranch_execnz BB6_1
-; GCN-NEXT: ; %bb.2:
-; GCN-NEXT: s_mov_b32 exec_lo, s1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, v18
-; GCN-NEXT: v_mov_b32_e32 v1, v19
-; GCN-NEXT: v_mov_b32_e32 v2, v20
-; GCN-NEXT: v_mov_b32_e32 v3, v21
-; GCN-NEXT: ; return to shader part epilog
+; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_mov_b32 s1, exec_lo
+; GFX1030-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1030-NEXT: v_readfirstlane_b32 s4, v14
+; GFX1030-NEXT: v_readfirstlane_b32 s5, v15
+; GFX1030-NEXT: v_readfirstlane_b32 s6, v16
+; GFX1030-NEXT: v_readfirstlane_b32 s7, v17
+; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
+; GFX1030-NEXT: image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7]
+; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
+; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo
+; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1030-NEXT: s_cbranch_execnz BB6_1
+; GFX1030-NEXT: ; %bb.2:
+; GFX1030-NEXT: s_mov_b32 exec_lo, s1
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: v_mov_b32_e32 v0, v18
+; GFX1030-NEXT: v_mov_b32_e32 v1, v19
+; GFX1030-NEXT: v_mov_b32_e32 v2, v20
+; GFX1030-NEXT: v_mov_b32_e32 v3, v21
+; GFX1030-NEXT: ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr:
+; GFX1013: ; %bb.0:
+; GFX1013-NEXT: v_mov_b32_e32 v5, v6
+; GFX1013-NEXT: v_mov_b32_e32 v6, v7
+; GFX1013-NEXT: v_mov_b32_e32 v7, v8
+; GFX1013-NEXT: v_mov_b32_e32 v8, v10
+; GFX1013-NEXT: v_mov_b32_e32 v9, v11
+; GFX1013-NEXT: v_mov_b32_e32 v10, v12
+; GFX1013-NEXT: s_mov_b32 s1, exec_lo
+; GFX1013-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1013-NEXT: v_readfirstlane_b32 s4, v14
+; GFX1013-NEXT: v_readfirstlane_b32 s5, v15
+; GFX1013-NEXT: v_readfirstlane_b32 s6, v16
+; GFX1013-NEXT: v_readfirstlane_b32 s7, v17
+; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
+; GFX1013-NEXT: image_bvh_intersect_ray v[18:21], v[0:15], s[4:7]
+; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
+; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo
+; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1013-NEXT: s_cbranch_execnz BB6_1
+; GFX1013-NEXT: ; %bb.2:
+; GFX1013-NEXT: s_mov_b32 exec_lo, s1
+; GFX1013-NEXT: s_waitcnt vmcnt(0)
+; GFX1013-NEXT: v_mov_b32_e32 v0, v18
+; GFX1013-NEXT: v_mov_b32_e32 v1, v19
+; GFX1013-NEXT: v_mov_b32_e32 v2, v20
+; GFX1013-NEXT: v_mov_b32_e32 v3, v21
+; GFX1013-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
-; GCN-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b32 s0, 0xffff
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; GCN-NEXT: v_and_b32_e32 v14, s0, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_and_b32_e32 v15, s0, v9
-; GCN-NEXT: s_mov_b32 s1, exec_lo
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshl_or_b32 v15, v15, 16, v8
-; GCN-NEXT: v_and_or_b32 v9, v6, s0, v5
-; GCN-NEXT: v_and_or_b32 v14, v7, s0, v14
-; GCN-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: v_readfirstlane_b32 s4, v10
-; GCN-NEXT: v_readfirstlane_b32 s5, v11
-; GCN-NEXT: v_readfirstlane_b32 s6, v12
-; GCN-NEXT: v_readfirstlane_b32 s7, v13
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
-; GCN-NEXT: image_bvh_intersect_ray v[5:8], [v0, v1, v2, v3, v4, v9, v14, v15], s[4:7] a16
-; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
-; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
-; GCN-NEXT: s_and_saveexec_b32 s0, s0
-; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GCN-NEXT: s_cbranch_execnz BB7_1
-; GCN-NEXT: ; %bb.2:
-; GCN-NEXT: s_mov_b32 exec_lo, s1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, v5
-; GCN-NEXT: v_mov_b32_e32 v1, v6
-; GCN-NEXT: v_mov_b32_e32 v2, v7
-; GCN-NEXT: v_mov_b32_e32 v3, v8
-; GCN-NEXT: ; return to shader part epilog
+; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_mov_b32 s0, 0xffff
+; GFX1030-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX1030-NEXT: v_and_b32_e32 v14, s0, v8
+; GFX1030-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX1030-NEXT: v_and_b32_e32 v15, s0, v9
+; GFX1030-NEXT: s_mov_b32 s1, exec_lo
+; GFX1030-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX1030-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX1030-NEXT: v_lshl_or_b32 v15, v15, 16, v8
+; GFX1030-NEXT: v_and_or_b32 v9, v6, s0, v5
+; GFX1030-NEXT: v_and_or_b32 v14, v7, s0, v14
+; GFX1030-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1030-NEXT: v_readfirstlane_b32 s4, v10
+; GFX1030-NEXT: v_readfirstlane_b32 s5, v11
+; GFX1030-NEXT: v_readfirstlane_b32 s6, v12
+; GFX1030-NEXT: v_readfirstlane_b32 s7, v13
+; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
+; GFX1030-NEXT: image_bvh_intersect_ray v[5:8], [v0, v1, v2, v3, v4, v9, v14, v15], s[4:7] a16
+; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
+; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo
+; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1030-NEXT: s_cbranch_execnz BB7_1
+; GFX1030-NEXT: ; %bb.2:
+; GFX1030-NEXT: s_mov_b32 exec_lo, s1
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: v_mov_b32_e32 v0, v5
+; GFX1030-NEXT: v_mov_b32_e32 v1, v6
+; GFX1030-NEXT: v_mov_b32_e32 v2, v7
+; GFX1030-NEXT: v_mov_b32_e32 v3, v8
+; GFX1030-NEXT: ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
+; GFX1013: ; %bb.0:
+; GFX1013-NEXT: s_mov_b32 s0, 0xffff
+; GFX1013-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX1013-NEXT: v_and_b32_e32 v14, s0, v8
+; GFX1013-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9
+; GFX1013-NEXT: s_mov_b32 s1, exec_lo
+; GFX1013-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX1013-NEXT: v_and_or_b32 v5, v6, s0, v5
+; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v14
+; GFX1013-NEXT: v_lshl_or_b32 v7, v9, 16, v8
+; GFX1013-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1013-NEXT: v_readfirstlane_b32 s4, v10
+; GFX1013-NEXT: v_readfirstlane_b32 s5, v11
+; GFX1013-NEXT: v_readfirstlane_b32 s6, v12
+; GFX1013-NEXT: v_readfirstlane_b32 s7, v13
+; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
+; GFX1013-NEXT: image_bvh_intersect_ray v[14:17], v[0:7], s[4:7] a16
+; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
+; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo
+; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1013-NEXT: s_cbranch_execnz BB7_1
+; GFX1013-NEXT: ; %bb.2:
+; GFX1013-NEXT: s_mov_b32 exec_lo, s1
+; GFX1013-NEXT: s_waitcnt vmcnt(0)
+; GFX1013-NEXT: v_mov_b32_e32 v0, v14
+; GFX1013-NEXT: v_mov_b32_e32 v1, v15
+; GFX1013-NEXT: v_mov_b32_e32 v2, v16
+; GFX1013-NEXT: v_mov_b32_e32 v3, v17
+; GFX1013-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
-; GCN-LABEL: image_bvh64_intersect_ray_vgpr_descr:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b32 s1, exec_lo
-; GCN-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: v_readfirstlane_b32 s4, v15
-; GCN-NEXT: v_readfirstlane_b32 s5, v16
-; GCN-NEXT: v_readfirstlane_b32 s6, v17
-; GCN-NEXT: v_readfirstlane_b32 s7, v18
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
-; GCN-NEXT: image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7]
-; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
-; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
-; GCN-NEXT: s_and_saveexec_b32 s0, s0
-; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GCN-NEXT: s_cbranch_execnz BB8_1
-; GCN-NEXT: ; %bb.2:
-; GCN-NEXT: s_mov_b32 exec_lo, s1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, v19
-; GCN-NEXT: v_mov_b32_e32 v1, v20
-; GCN-NEXT: v_mov_b32_e32 v2, v21
-; GCN-NEXT: v_mov_b32_e32 v3, v22
-; GCN-NEXT: ; return to shader part epilog
+; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_mov_b32 s1, exec_lo
+; GFX1030-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1030-NEXT: v_readfirstlane_b32 s4, v15
+; GFX1030-NEXT: v_readfirstlane_b32 s5, v16
+; GFX1030-NEXT: v_readfirstlane_b32 s6, v17
+; GFX1030-NEXT: v_readfirstlane_b32 s7, v18
+; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
+; GFX1030-NEXT: image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7]
+; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
+; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo
+; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1030-NEXT: s_cbranch_execnz BB8_1
+; GFX1030-NEXT: ; %bb.2:
+; GFX1030-NEXT: s_mov_b32 exec_lo, s1
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: v_mov_b32_e32 v0, v19
+; GFX1030-NEXT: v_mov_b32_e32 v1, v20
+; GFX1030-NEXT: v_mov_b32_e32 v2, v21
+; GFX1030-NEXT: v_mov_b32_e32 v3, v22
+; GFX1030-NEXT: ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr:
+; GFX1013: ; %bb.0:
+; GFX1013-NEXT: v_mov_b32_e32 v6, v7
+; GFX1013-NEXT: v_mov_b32_e32 v7, v8
+; GFX1013-NEXT: v_mov_b32_e32 v8, v9
+; GFX1013-NEXT: v_mov_b32_e32 v9, v11
+; GFX1013-NEXT: v_mov_b32_e32 v10, v12
+; GFX1013-NEXT: v_mov_b32_e32 v11, v13
+; GFX1013-NEXT: s_mov_b32 s1, exec_lo
+; GFX1013-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1013-NEXT: v_readfirstlane_b32 s4, v15
+; GFX1013-NEXT: v_readfirstlane_b32 s5, v16
+; GFX1013-NEXT: v_readfirstlane_b32 s6, v17
+; GFX1013-NEXT: v_readfirstlane_b32 s7, v18
+; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
+; GFX1013-NEXT: image_bvh64_intersect_ray v[19:22], v[0:15], s[4:7]
+; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
+; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo
+; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1013-NEXT: s_cbranch_execnz BB8_1
+; GFX1013-NEXT: ; %bb.2:
+; GFX1013-NEXT: s_mov_b32 exec_lo, s1
+; GFX1013-NEXT: s_waitcnt vmcnt(0)
+; GFX1013-NEXT: v_mov_b32_e32 v0, v19
+; GFX1013-NEXT: v_mov_b32_e32 v1, v20
+; GFX1013-NEXT: v_mov_b32_e32 v2, v21
+; GFX1013-NEXT: v_mov_b32_e32 v3, v22
+; GFX1013-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
-; GCN-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b32 s0, 0xffff
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GCN-NEXT: v_and_b32_e32 v15, s0, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_and_b32_e32 v16, s0, v10
-; GCN-NEXT: s_mov_b32 s1, exec_lo
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_lshl_or_b32 v16, v16, 16, v9
-; GCN-NEXT: v_and_or_b32 v10, v7, s0, v6
-; GCN-NEXT: v_and_or_b32 v15, v8, s0, v15
-; GCN-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: v_readfirstlane_b32 s4, v11
-; GCN-NEXT: v_readfirstlane_b32 s5, v12
-; GCN-NEXT: v_readfirstlane_b32 s6, v13
-; GCN-NEXT: v_readfirstlane_b32 s7, v14
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
-; GCN-NEXT: image_bvh64_intersect_ray v[6:9], [v0, v1, v2, v3, v4, v5, v10, v15, v16], s[4:7] a16
-; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
-; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
-; GCN-NEXT: s_and_saveexec_b32 s0, s0
-; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GCN-NEXT: s_cbranch_execnz BB9_1
-; GCN-NEXT: ; %bb.2:
-; GCN-NEXT: s_mov_b32 exec_lo, s1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, v6
-; GCN-NEXT: v_mov_b32_e32 v1, v7
-; GCN-NEXT: v_mov_b32_e32 v2, v8
-; GCN-NEXT: v_mov_b32_e32 v3, v9
-; GCN-NEXT: ; return to shader part epilog
+; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_mov_b32 s0, 0xffff
+; GFX1030-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX1030-NEXT: v_and_b32_e32 v15, s0, v9
+; GFX1030-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX1030-NEXT: v_and_b32_e32 v16, s0, v10
+; GFX1030-NEXT: s_mov_b32 s1, exec_lo
+; GFX1030-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX1030-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX1030-NEXT: v_lshl_or_b32 v16, v16, 16, v9
+; GFX1030-NEXT: v_and_or_b32 v10, v7, s0, v6
+; GFX1030-NEXT: v_and_or_b32 v15, v8, s0, v15
+; GFX1030-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1030-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1030-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1030-NEXT: v_readfirstlane_b32 s6, v13
+; GFX1030-NEXT: v_readfirstlane_b32 s7, v14
+; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
+; GFX1030-NEXT: image_bvh64_intersect_ray v[6:9], [v0, v1, v2, v3, v4, v5, v10, v15, v16], s[4:7] a16
+; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
+; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo
+; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1030-NEXT: s_cbranch_execnz BB9_1
+; GFX1030-NEXT: ; %bb.2:
+; GFX1030-NEXT: s_mov_b32 exec_lo, s1
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: v_mov_b32_e32 v0, v6
+; GFX1030-NEXT: v_mov_b32_e32 v1, v7
+; GFX1030-NEXT: v_mov_b32_e32 v2, v8
+; GFX1030-NEXT: v_mov_b32_e32 v3, v9
+; GFX1030-NEXT: ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
+; GFX1013: ; %bb.0:
+; GFX1013-NEXT: s_mov_b32 s0, 0xffff
+; GFX1013-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX1013-NEXT: v_and_b32_e32 v15, s0, v9
+; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX1013-NEXT: v_and_b32_e32 v10, s0, v10
+; GFX1013-NEXT: s_mov_b32 s1, exec_lo
+; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v6
+; GFX1013-NEXT: v_and_or_b32 v7, v8, s0, v15
+; GFX1013-NEXT: v_lshl_or_b32 v8, v10, 16, v9
+; GFX1013-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1013-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1013-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1013-NEXT: v_readfirstlane_b32 s6, v13
+; GFX1013-NEXT: v_readfirstlane_b32 s7, v14
+; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
+; GFX1013-NEXT: s_waitcnt vmcnt(0)
+; GFX1013-NEXT: image_bvh64_intersect_ray v[15:18], v[0:15], s[4:7] a16
+; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
+; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo
+; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1013-NEXT: s_cbranch_execnz BB9_1
+; GFX1013-NEXT: ; %bb.2:
+; GFX1013-NEXT: s_mov_b32 exec_lo, s1
+; GFX1013-NEXT: s_waitcnt vmcnt(0)
+; GFX1013-NEXT: v_mov_b32_e32 v0, v15
+; GFX1013-NEXT: v_mov_b32_e32 v1, v16
+; GFX1013-NEXT: v_mov_b32_e32 v2, v17
+; GFX1013-NEXT: v_mov_b32_e32 v3, v18
+; GFX1013-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
@@ -294,29 +447,29 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
; GFX1013-NEXT: s_clause 0x1
; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1013-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX1013-NEXT: v_lshlrev_b32_e32 v4, 2, v0
-; GFX1013-NEXT: v_mov_b32_e32 v6, 0
-; GFX1013-NEXT: v_mov_b32_e32 v7, 1.0
-; GFX1013-NEXT: v_mov_b32_e32 v8, 2.0
-; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40400000
-; GFX1013-NEXT: v_mov_b32_e32 v10, 4.0
-; GFX1013-NEXT: v_mov_b32_e32 v11, 0x40a00000
-; GFX1013-NEXT: v_mov_b32_e32 v12, 0x40c00000
-; GFX1013-NEXT: v_mov_b32_e32 v13, 0x40e00000
-; GFX1013-NEXT: v_mov_b32_e32 v14, 0x41000000
+; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
+; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000
+; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000
+; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
; GFX1013-NEXT: v_mov_b32_e32 v0, s4
; GFX1013-NEXT: v_mov_b32_e32 v1, s5
; GFX1013-NEXT: v_mov_b32_e32 v2, s6
; GFX1013-NEXT: v_mov_b32_e32 v3, s7
-; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
+; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
+; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0
; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX1013-NEXT: flat_load_dword v4, v[0:1]
-; GFX1013-NEXT: flat_load_dword v5, v[2:3]
+; GFX1013-NEXT: flat_load_dword v0, v[4:5]
+; GFX1013-NEXT: flat_load_dword v1, v[2:3]
+; GFX1013-NEXT: v_mov_b32_e32 v2, 0
+; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0
+; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
+; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[4:19], s[8:11]
+; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[8:11]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
@@ -392,7 +545,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
; GFX1013-NEXT: s_clause 0x1
; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1013-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX1013-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; GFX1013-NEXT: s_movk_i32 s1, 0x4400
; GFX1013-NEXT: s_movk_i32 s2, 0x4200
; GFX1013-NEXT: s_bfe_u32 s1, s1, 0x100000
@@ -404,32 +557,32 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
; GFX1013-NEXT: s_or_b32 s1, s2, s1
; GFX1013-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX1013-NEXT: s_lshl_b32 s3, s3, 16
-; GFX1013-NEXT: v_mov_b32_e32 v6, 0
-; GFX1013-NEXT: v_mov_b32_e32 v7, 1.0
-; GFX1013-NEXT: v_mov_b32_e32 v8, 2.0
-; GFX1013-NEXT: v_mov_b32_e32 v9, s1
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
; GFX1013-NEXT: v_mov_b32_e32 v0, s4
; GFX1013-NEXT: v_mov_b32_e32 v1, s5
; GFX1013-NEXT: v_mov_b32_e32 v2, s6
; GFX1013-NEXT: v_mov_b32_e32 v3, s7
; GFX1013-NEXT: s_movk_i32 s5, 0x4600
-; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
; GFX1013-NEXT: s_movk_i32 s4, 0x4700
-; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
; GFX1013-NEXT: s_bfe_u32 s2, s5, 0x100000
; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX1013-NEXT: s_lshl_b32 s2, s2, 16
; GFX1013-NEXT: s_bfe_u32 s4, s4, 0x100000
; GFX1013-NEXT: s_or_b32 s0, s0, s2
-; GFX1013-NEXT: flat_load_dword v4, v[0:1]
-; GFX1013-NEXT: flat_load_dword v5, v[2:3]
+; GFX1013-NEXT: flat_load_dword v0, v[4:5]
+; GFX1013-NEXT: flat_load_dword v1, v[2:3]
; GFX1013-NEXT: s_or_b32 s2, s4, s3
-; GFX1013-NEXT: v_mov_b32_e32 v10, s0
-; GFX1013-NEXT: v_mov_b32_e32 v11, s2
+; GFX1013-NEXT: v_mov_b32_e32 v2, 0
+; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0
+; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
+; GFX1013-NEXT: v_mov_b32_e32 v5, s1
+; GFX1013-NEXT: v_mov_b32_e32 v6, s0
+; GFX1013-NEXT: v_mov_b32_e32 v7, s2
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[4:11], s[8:11] a16
+; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
More information about the llvm-commits mailing list