[llvm] [DAG] Fold (vt trunc (extload (vt x))) -> (vt load x) (PR #75229)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 13 02:19:03 PST 2023
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/75229
>From e7b2e5c9478b0e9cbe9ed118282ee91fa63ddd88 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 12 Dec 2023 18:00:37 +0000
Subject: [PATCH] [DAG] Fold (vt trunc (extload (vt x))) -> (vt load x)
We were only folding cases which remained extloads, but DAG.getExtLoad can also handle the cases where we don't need to extend at all.
reduceLoadWidth can handle this for scalar loads, but not for vectors.
Noticed while triaging D152928
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 15 +-
llvm/test/CodeGen/AMDGPU/ctpop16.ll | 42 ++-
llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 12 +-
llvm/test/CodeGen/AMDGPU/kernel-args.ll | 296 ++++--------------
4 files changed, 98 insertions(+), 267 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f35f663d6ba1b4..86b91390666ad0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14818,15 +14818,14 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
if (SDValue Reduced = reduceLoadWidth(N))
return Reduced;
- // Handle the case where the load remains an extending load even
- // after truncation.
+ // Handle the case where the truncated result is at least as wide as the
+ // loaded type.
if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
- LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- if (LN0->isSimple() && LN0->getMemoryVT().bitsLT(VT)) {
- SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0),
- VT, LN0->getChain(), LN0->getBasePtr(),
- LN0->getMemoryVT(),
- LN0->getMemOperand());
+ auto *LN0 = cast<LoadSDNode>(N0);
+ if (LN0->isSimple() && LN0->getMemoryVT().bitsLE(VT)) {
+ SDValue NewLoad = DAG.getExtLoad(
+ LN0->getExtensionType(), SDLoc(LN0), VT, LN0->getChain(),
+ LN0->getBasePtr(), LN0->getMemoryVT(), LN0->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1));
return NewLoad;
}
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 78c657049fcb2a..c7396f25fba652 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1553,50 +1553,48 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @20, KC0[], KC1[]
; EG-NEXT: TEX 0 @14
-; EG-NEXT: ALU_PUSH_BEFORE 6, @21, KC0[], KC1[]
+; EG-NEXT: ALU_PUSH_BEFORE 4, @21, KC0[], KC1[]
; EG-NEXT: JUMP @7 POP:1
-; EG-NEXT: ALU 0, @28, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @16
-; EG-NEXT: ALU_POP_AFTER 1, @29, KC0[], KC1[]
-; EG-NEXT: ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU_POP_AFTER 1, @27, KC0[], KC1[]
+; EG-NEXT: ALU_PUSH_BEFORE 2, @29, KC0[CB0:0-32], KC1[]
; EG-NEXT: JUMP @11 POP:1
; EG-NEXT: TEX 0 @18
-; EG-NEXT: ALU_POP_AFTER 0, @34, KC0[], KC1[]
-; EG-NEXT: ALU 11, @35, KC0[], KC1[]
+; EG-NEXT: ALU_POP_AFTER 0, @32, KC0[], KC1[]
+; EG-NEXT: ALU 11, @33, KC0[], KC1[]
; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 14:
-; EG-NEXT: VTX_READ_16 T1.X, T0.X, 46, #3
+; EG-NEXT: VTX_READ_16 T2.X, T1.X, 46, #3
; EG-NEXT: Fetch clause starting at 16:
-; EG-NEXT: VTX_READ_16 T1.X, T1.X, 2, #1
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1
; EG-NEXT: Fetch clause starting at 18:
-; EG-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
+; EG-NEXT: VTX_READ_16 T0.X, T1.X, 44, #3
; EG-NEXT: ALU clause starting at 20:
-; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: MOV * T1.X, 0.0,
; EG-NEXT: ALU clause starting at 21:
-; EG-NEXT: AND_INT * T0.W, T1.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MOV T1.X, literal.x,
+; EG-NEXT: MOV T0.X, literal.x,
; EG-NEXT: MOV T1.W, literal.y,
-; EG-NEXT: SETNE_INT * T0.W, PV.W, 0.0,
+; EG-NEXT: SETNE_INT * T0.W, T2.X, 0.0,
; EG-NEXT: 0(0.000000e+00), 1(1.401298e-45)
; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
-; EG-NEXT: ALU clause starting at 28:
-; EG-NEXT: MOV * T1.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 29:
+; EG-NEXT: ALU clause starting at 26:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 27:
; EG-NEXT: MOV * T1.W, literal.x,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 31:
+; EG-NEXT: ALU clause starting at 29:
; EG-NEXT: MOV T0.W, KC0[2].Y,
; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0,
; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
-; EG-NEXT: ALU clause starting at 34:
-; EG-NEXT: BCNT_INT * T1.X, T0.X,
-; EG-NEXT: ALU clause starting at 35:
+; EG-NEXT: ALU clause starting at 32:
+; EG-NEXT: BCNT_INT * T0.X, T0.X,
+; EG-NEXT: ALU clause starting at 33:
; EG-NEXT: LSHL * T1.W, T0.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T2.W, T1.X, literal.y,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.y,
; EG-NEXT: 24(3.363116e-44), 65535(9.183409e-41)
; EG-NEXT: LSHL T1.X, PS, PV.W,
; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 855b5fff11fe55..7e8c28fa447509 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -331,7 +331,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -340,9 +340,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: FFBL_INT T0.W, PV.W,
+; EG-NEXT: FFBL_INT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -402,7 +400,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -411,9 +409,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: FFBL_INT T0.W, PV.W,
+; EG-NEXT: FFBL_INT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index d37819ac69412c..5d6972dcaea138 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -602,64 +602,54 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) {
;
; EG-LABEL: v2i8_arg:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 0, @10, KC0[], KC1[]
-; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 15, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT MSKOR T4.XW, T5.X
+; EG-NEXT: ALU 0, @8, KC0[], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3
-; EG-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T4.X, 0.0,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: LSHL T0.W, T5.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T4.X, literal.y,
-; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
-; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.y,
+; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PS, literal.x,
-; EG-NEXT: LSHL * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
-; EG-NEXT: LSHL T4.X, PV.W, PS,
-; EG-NEXT: LSHL * T4.W, literal.x, PS,
+; EG-NEXT: LSHL T0.X, T1.W, PV.W,
+; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MOV T4.Y, 0.0,
-; EG-NEXT: MOV * T4.Z, 0.0,
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v2i8_arg:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 0, @10, KC0[], KC1[]
-; CM-NEXT: TEX 1 @6
-; CM-NEXT: ALU 15, @11, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT MSKOR T4.XW, T5.X
+; CM-NEXT: ALU 0, @8, KC0[], KC1[]
+; CM-NEXT: TEX 0 @6
+; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
-; CM-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3
-; CM-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3
-; CM-NEXT: ALU clause starting at 10:
-; CM-NEXT: MOV * T4.X, 0.0,
-; CM-NEXT: ALU clause starting at 11:
-; CM-NEXT: LSHL T0.Z, T5.X, literal.x,
-; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43)
-; CM-NEXT: AND_INT T1.Z, KC0[2].Y, literal.x,
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
+; CM-NEXT: ALU clause starting at 8:
+; CM-NEXT: MOV * T0.X, 0.0,
+; CM-NEXT: ALU clause starting at 9:
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
-; CM-NEXT: LSHL * T0.W, PV.Z, literal.y,
+; CM-NEXT: AND_INT T0.Z, T0.X, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
-; CM-NEXT: LSHL T4.X, PV.Z, PV.W,
-; CM-NEXT: LSHL * T4.W, literal.x, PV.W,
+; CM-NEXT: LSHL T0.X, PV.Z, PV.W,
+; CM-NEXT: LSHL * T0.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: MOV T4.Y, 0.0,
-; CM-NEXT: MOV * T4.Z, 0.0,
-; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; CM-NEXT: MOV T0.Y, 0.0,
+; CM-NEXT: MOV * T0.Z, 0.0,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <2 x i8> %in, ptr addrspace(1) %out
@@ -701,44 +691,24 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) {
;
; EG-LABEL: v2i16_arg:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 0, @10, KC0[], KC1[]
-; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
-; EG-NEXT: VTX_READ_16 T4.X, T4.X, 40, #3
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T4.X, 0.0,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: LSHL T0.W, T5.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T4.X, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT T4.X, PV.W, PS,
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: MOV T0.X, KC0[2].Z,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v2i16_arg:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 0, @10, KC0[], KC1[]
-; CM-NEXT: TEX 1 @6
-; CM-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T5.X
+; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
-; CM-NEXT: Fetch clause starting at 6:
-; CM-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
-; CM-NEXT: VTX_READ_16 T4.X, T4.X, 40, #3
-; CM-NEXT: ALU clause starting at 10:
-; CM-NEXT: MOV * T4.X, 0.0,
-; CM-NEXT: ALU clause starting at 11:
-; CM-NEXT: LSHL T0.Z, T5.X, literal.x,
-; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T4.X, PV.Z, PV.W,
-; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; CM-NEXT: ALU clause starting at 4:
+; CM-NEXT: MOV * T0.X, KC0[2].Z,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <2 x i16> %in, ptr addrspace(1) %out
@@ -1322,68 +1292,24 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) {
;
; EG-LABEL: v4i8_arg:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 0, @14, KC0[], KC1[]
-; EG-NEXT: TEX 3 @6
-; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_8 T5.X, T4.X, 42, #3
-; EG-NEXT: VTX_READ_8 T6.X, T4.X, 40, #3
-; EG-NEXT: VTX_READ_8 T7.X, T4.X, 43, #3
-; EG-NEXT: VTX_READ_8 T4.X, T4.X, 41, #3
-; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: MOV * T4.X, 0.0,
-; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: AND_INT * T0.W, T5.X, literal.x,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.Z, T4.X, literal.x,
-; EG-NEXT: LSHL T0.W, PV.W, literal.y,
-; EG-NEXT: LSHL * T1.W, T7.X, literal.z,
-; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT T0.W, PS, PV.W,
-; EG-NEXT: LSHL * T1.W, PV.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.x,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: OR_INT T4.X, PV.W, PS,
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: MOV T0.X, KC0[2].Z,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v4i8_arg:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 0, @14, KC0[], KC1[]
-; CM-NEXT: TEX 3 @6
-; CM-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T5.X
+; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
-; CM-NEXT: Fetch clause starting at 6:
-; CM-NEXT: VTX_READ_8 T5.X, T4.X, 42, #3
-; CM-NEXT: VTX_READ_8 T6.X, T4.X, 40, #3
-; CM-NEXT: VTX_READ_8 T7.X, T4.X, 43, #3
-; CM-NEXT: VTX_READ_8 T4.X, T4.X, 41, #3
-; CM-NEXT: ALU clause starting at 14:
-; CM-NEXT: MOV * T4.X, 0.0,
-; CM-NEXT: ALU clause starting at 15:
-; CM-NEXT: AND_INT * T0.W, T5.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Y, T4.X, literal.x,
-; CM-NEXT: LSHL T0.Z, PV.W, literal.y,
-; CM-NEXT: LSHL * T0.W, T7.X, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44)
-; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z,
-; CM-NEXT: LSHL * T0.W, PV.Y, literal.x,
-; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W,
-; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: OR_INT * T4.X, PV.Z, PV.W,
-; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; CM-NEXT: ALU clause starting at 4:
+; CM-NEXT: MOV * T0.X, KC0[2].Z,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <4 x i8> %in, ptr addrspace(1) %out
@@ -1427,115 +1353,27 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
;
; EG-LABEL: v4i16_arg:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 1, @20, KC0[], KC1[]
-; EG-NEXT: TEX 0 @12
-; EG-NEXT: ALU 5, @22, KC0[], KC1[]
-; EG-NEXT: TEX 0 @14
-; EG-NEXT: ALU 5, @28, KC0[], KC1[]
-; EG-NEXT: TEX 0 @16
-; EG-NEXT: ALU 5, @34, KC0[], KC1[]
-; EG-NEXT: TEX 0 @18
-; EG-NEXT: ALU 7, @40, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3
-; EG-NEXT: Fetch clause starting at 14:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3
-; EG-NEXT: Fetch clause starting at 16:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3
-; EG-NEXT: Fetch clause starting at 18:
-; EG-NEXT: VTX_READ_16 T5.X, T5.X, 44, #3
-; EG-NEXT: ALU clause starting at 20:
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: MOV * T5.X, 0.0,
-; EG-NEXT: ALU clause starting at 22:
-; EG-NEXT: LSHL T0.W, T6.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 28:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 34:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T6.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 40:
-; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
-; EG-NEXT: AND_INT * T1.W, T5.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), -65536(nan)
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T5.X, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.X,
-; EG-NEXT: MOV * T5.Y, T3.X,
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: MOV * T0.Y, KC0[3].X,
+; EG-NEXT: MOV T0.X, KC0[2].W,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v4i16_arg:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 1, @20, KC0[], KC1[]
-; CM-NEXT: TEX 0 @12
-; CM-NEXT: ALU 5, @22, KC0[], KC1[]
-; CM-NEXT: TEX 0 @14
-; CM-NEXT: ALU 5, @28, KC0[], KC1[]
-; CM-NEXT: TEX 0 @16
-; CM-NEXT: ALU 5, @34, KC0[], KC1[]
-; CM-NEXT: TEX 0 @18
-; CM-NEXT: ALU 7, @40, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
+; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
-; CM-NEXT: Fetch clause starting at 12:
-; CM-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3
-; CM-NEXT: Fetch clause starting at 14:
-; CM-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3
-; CM-NEXT: Fetch clause starting at 16:
-; CM-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3
-; CM-NEXT: Fetch clause starting at 18:
-; CM-NEXT: VTX_READ_16 T5.X, T5.X, 44, #3
-; CM-NEXT: ALU clause starting at 20:
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: MOV * T5.X, 0.0,
-; CM-NEXT: ALU clause starting at 22:
-; CM-NEXT: LSHL T0.Z, T6.X, literal.x,
-; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
-; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
-; CM-NEXT: MOV * T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, PV.X,
-; CM-NEXT: ALU clause starting at 28:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T6.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 34:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T6.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV * T2.X, PV.W,
-; CM-NEXT: MOV * T0.Y, PV.X,
-; CM-NEXT: ALU clause starting at 40:
-; CM-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y,
-; CM-NEXT: AND_INT * T0.W, T5.X, literal.z,
-; CM-NEXT: 2(2.802597e-45), -65536(nan)
-; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: OR_INT * T5.X, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.X,
-; CM-NEXT: MOV * T5.Y, T3.X,
+; CM-NEXT: ALU clause starting at 4:
+; CM-NEXT: MOV * T0.Y, KC0[3].X,
+; CM-NEXT: MOV * T0.X, KC0[2].W,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <4 x i16> %in, ptr addrspace(1) %out
ret void
More information about the llvm-commits
mailing list