[llvm] 142efd6 - [AMDGPU] Add ISD::FSHR Handling to AMDGPUISD::PERM matching
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 24 05:49:54 PDT 2023
Author: Simon Pilgrim
Date: 2023-09-24T13:40:07+01:00
New Revision: 142efd6d612965897cf0b9d560348bf40c15ebaa
URL: https://github.com/llvm/llvm-project/commit/142efd6d612965897cf0b9d560348bf40c15ebaa
DIFF: https://github.com/llvm/llvm-project/commit/142efd6d612965897cf0b9d560348bf40c15ebaa.diff
LOG: [AMDGPU] Add ISD::FSHR Handling to AMDGPUISD::PERM matching
Pulled out of D159533, which encourages (zext (trunc x)) -> x folds. Those folds lead to more ISD::FSHR nodes, which were breaking some existing AMDGPUISD::PERM tests.
Differential Revision: https://reviews.llvm.org/D159533
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/permute_i8.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 591775dbf45e396..60b2f9ee49fcf8a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -789,6 +789,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::AND,
ISD::OR,
ISD::XOR,
+ ISD::FSHR,
ISD::SINT_TO_FP,
ISD::UINT_TO_FP,
ISD::FCANONICALIZE,
@@ -10773,6 +10774,30 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
}
+ case ISD::FSHR: {
+ // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+ auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
+ if (!ShiftOp || Op.getValueType().isVector())
+ return std::nullopt;
+
+ uint64_t BitsProvided = Op.getValueSizeInBits();
+ if (BitsProvided % 8 != 0)
+ return std::nullopt;
+
+ uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
+ if (BitShift % 8)
+ return std::nullopt;
+
+ uint64_t ConcatSizeInBytes = BitsProvided / 4;
+ uint64_t ByteShift = BitShift / 8;
+
+ uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
+ uint64_t BytesProvided = BitsProvided / 8;
+ SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
+ NewIndex %= BytesProvided;
+ return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
+ }
+
case ISD::SRA:
case ISD::SRL: {
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
@@ -11053,6 +11078,12 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
: *PermNodes[FirstSrc].Src;
+ // Check that we haven't just recreated the same FSHR node.
+ if (N->getOpcode() == ISD::FSHR &&
+ (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
+ (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
+ return SDValue();
+
// Check that we are not just extracting the bytes in order from an op
if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
int Low16 = PermMask & 0xffff;
@@ -13061,6 +13092,14 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performAndCombine(N, DCI);
case ISD::OR:
return performOrCombine(N, DCI);
+ case ISD::FSHR: {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
+ TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
+ return matchPERM(N, DCI);
+ }
+ break;
+ }
case ISD::XOR:
return performXorCombine(N, DCI);
case ISD::ZERO_EXTEND:
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 1d139f1fb40c281..6f6452c69fb214c 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -1234,13 +1234,12 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_alignbit_b32 v0, v0, v10, 16
-; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: global_store_dword v[5:6], v1, off
-; GFX10-NEXT: global_store_dword v[7:8], v0, off
+; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_perm_b32 v1, v10, v9, 0x2000706
+; GFX10-NEXT: global_store_dword v[5:6], v0, off
+; GFX10-NEXT: global_store_dword v[7:8], v1, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: ive_store_div:
@@ -1256,18 +1255,18 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-NEXT: global_load_dword v10, v[2:3], off
; GFX9-NEXT: s_movk_i32 s4, 0xff
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; GFX9-NEXT: s_mov_b32 s5, 0x2000706
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v9
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v2, v10, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX9-NEXT: v_alignbit_b32 v2, v1, v10, 16
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v3, v10, v9, s5
; GFX9-NEXT: global_store_dword v[5:6], v0, off
-; GFX9-NEXT: global_store_dword v[7:8], v2, off
+; GFX9-NEXT: global_store_dword v[7:8], v3, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.workitem.id.x()
More information about the llvm-commits mailing list