[llvm] r334559 - [AMDGPU] DAG combine to produce V_PERM_B32
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 12 16:50:38 PDT 2018
Author: rampitec
Date: Tue Jun 12 16:50:37 2018
New Revision: 334559
URL: http://llvm.org/viewvc/llvm-project?rev=334559&view=rev
Log:
[AMDGPU] DAG combine to produce V_PERM_B32
Differential Revision: https://reviews.llvm.org/D48099
Added:
llvm/trunk/test/CodeGen/AMDGPU/permute.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=334559&r1=334558&r2=334559&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Tue Jun 12 16:50:37 2018
@@ -4119,6 +4119,7 @@ const char* AMDGPUTargetLowering::getTar
NODE_NAME_CASE(MAD_I24)
NODE_NAME_CASE(MAD_I64_I32)
NODE_NAME_CASE(MAD_U64_U32)
+ NODE_NAME_CASE(PERM)
NODE_NAME_CASE(TEXTURE_FETCH)
NODE_NAME_CASE(EXPORT)
NODE_NAME_CASE(EXPORT_DONE)
@@ -4374,6 +4375,34 @@ void AMDGPUTargetLowering::computeKnownB
Known.Zero.setHighBits(32 - MaxValBits);
break;
}
+ case AMDGPUISD::PERM: {
+ ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ if (!CMask)
+ return;
+
+ KnownBits LHSKnown, RHSKnown;
+ DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
+ DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
+ unsigned Sel = CMask->getZExtValue();
+
+ for (unsigned I = 0; I < 32; I += 8) {
+ unsigned ByteMask = 0xff << I;
+ unsigned SelBits = Sel & 0xff;
+ if (SelBits < 4) {
+ Known.One |= RHSKnown.One & ByteMask;
+ Known.Zero |= RHSKnown.Zero & ByteMask;
+ } else if (SelBits < 7) {
+ Known.One |= LHSKnown.One & ByteMask;
+ Known.Zero |= LHSKnown.Zero & ByteMask;
+ } else if (SelBits == 0x0c) {
+ Known.Zero |= ByteMask;
+ } else if (SelBits > 0x0c) {
+ Known.One |= ByteMask;
+ }
+ Sel >>= 8;
+ }
+ break;
+ }
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IID) {
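The new AMDGPUISD::PERM case above lets computeKnownBits see through the byte selector: a selector byte in the 0-3 range copies the known bits of the corresponding byte of the second source, 4-6 copies a byte of the first source, 0x0c makes the result byte known zero, and anything above 0x0c makes it known all-ones. As a worked example (exercised by the known_ffff0500 test added below): for a perm with selector 0xffff0500 the two high result bytes are known to be 0xff; if the source byte picked by the 0x05 selector is known to have its top bit set (from an "or ..., 0x8000") and the byte picked by the 0x00 selector is known to have bit 2 set (from an "or ..., 4"), a following "and" with 0xffff8004 folds to that constant.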
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=334559&r1=334558&r2=334559&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Tue Jun 12 16:50:37 2018
@@ -402,6 +402,7 @@ enum NodeType : unsigned {
MAD_I64_I32,
MUL_LOHI_I24,
MUL_LOHI_U24,
+ PERM,
TEXTURE_FETCH,
EXPORT, // exp on SI+
EXPORT_DONE, // exp on SI+ with done bit set
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td?rev=334559&r1=334558&r2=334559&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td Tue Jun 12 16:50:37 2018
@@ -339,6 +339,8 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UME
def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
+
def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
[SDNPHasChain, SDNPInGlue]>;
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=334559&r1=334558&r2=334559&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Tue Jun 12 16:50:37 2018
@@ -6135,6 +6135,71 @@ static bool isBoolSGPR(SDValue V) {
return false;
}
+// If each byte of a constant is either all zeroes or all ones, return the
+// constant. Otherwise return 0.
+static uint32_t getConstantPermuteMask(uint32_t C) {
+ // 0xff for any zero byte in the mask
+ uint32_t ZeroByteMask = 0;
+ if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
+ if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
+ if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
+ if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
+ uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
+ if ((NonZeroByteMask & C) != NonZeroByteMask)
+ return 0; // Partial bytes selected.
+ return C;
+}
+
+// Check if a node selects whole bytes from its operand 0 starting at a byte
+// boundary while masking the rest. Returns the select mask as used by
+// v_perm_b32, or all ones (~0) if the node does not match.
+// Note byte select encoding:
+// value 0-3 selects corresponding source byte;
+// value 0xc selects zero;
+// value 0xff selects 0xff.
+static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
+ assert(V.getValueSizeInBits() == 32);
+
+ if (V.getNumOperands() != 2)
+ return ~0;
+
+ ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ if (!N1)
+ return ~0;
+
+ uint32_t C = N1->getZExtValue();
+
+ switch (V.getOpcode()) {
+ default:
+ break;
+ case ISD::AND:
+ if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+ return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
+ }
+ break;
+
+ case ISD::OR:
+ if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+ return (0x03020100 & ~ConstMask) | ConstMask;
+ }
+ break;
+
+ case ISD::SHL:
+ if (C % 8)
+ return ~0;
+
+ return uint32_t((0x030201000c0c0c0cull << C) >> 32);
+
+ case ISD::SRL:
+ if (C % 8)
+ return ~0;
+
+ return uint32_t(0x0c0c0c0c03020100ull >> C);
+ }
+
+ return ~0;
+}
+
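The two helpers above encode byte-aligned and/or/shl/srl operations as v_perm_b32 select masks: each selector byte in the 0-3 range picks a source byte, 0x0c produces zero, 0xff produces 0xff, and getPermuteMask returns all ones when a node does not fit this form. The following stand-alone sketch (not part of the patch; the names are illustrative) models that encoding and checks a few of the mappings the helpers produce:

  #include <cassert>
  #include <cstdint>

  // Model of the v_perm_b32 byte-select encoding used by the combine:
  // a selector byte of 0-3 picks a byte of Src1, 4-7 picks a byte of Src0,
  // 0x0c produces 0x00 and values above 0x0c produce 0xff. Selectors in the
  // 8-0xb range (sign replication) are not produced by the combine and are
  // not modeled here.
  static uint32_t permB32(uint32_t Src0, uint32_t Src1, uint32_t Sel) {
    uint64_t Bytes = ((uint64_t)Src0 << 32) | Src1; // Src1 is bytes 0-3.
    uint32_t Result = 0;
    for (unsigned I = 0; I < 4; ++I) {
      unsigned S = (Sel >> (8 * I)) & 0xff;
      uint32_t Byte = S < 8 ? (Bytes >> (8 * S)) & 0xff
                            : (S == 0x0c ? 0x00 : 0xff);
      Result |= Byte << (8 * I);
    }
    return Result;
  }

  int main() {
    // getPermuteMask maps "shl x, 8" to selector 0x0201000c: the low result
    // byte is zero, the rest are the low three source bytes.
    assert(permB32(0, 0x44332211, 0x0201000c) == 0x33221100u);
    // "and x, 0x00ff00ff" maps to selector 0x0c020c00: bytes 1 and 3 zeroed.
    assert(permB32(0, 0x44332211, 0x0c020c00) == 0x00330011u);
    // "or x, 0xffffff00" maps to selector 0xffffff00: bytes 1-3 become 0xff.
    assert(permB32(0, 0x44332211, 0xffffff00) == 0xffffff11u);
    return 0;
  }

For instance, getConstantPermuteMask(0x00ff00ff) returns the constant itself because every byte is all zeroes or all ones, and the ISD::AND case then builds (0x03020100 & 0x00ff00ff) | (0x0c0c0c0c & ~0x00ff00ff) = 0x0c020c00, the selector used in the second assertion.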
SDValue SITargetLowering::performAndCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (DCI.isBeforeLegalize())
@@ -6181,6 +6246,20 @@ SDValue SITargetLowering::performAndComb
}
}
}
+
+ // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
+ if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
+ isa<ConstantSDNode>(LHS.getOperand(2))) {
+ uint32_t Sel = getConstantPermuteMask(Mask);
+ if (!Sel)
+ return SDValue();
+
+ // Select 0xc for all zero bytes
+ Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
+ SDLoc DL(N);
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
+ LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
+ }
}
// (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
@@ -6233,6 +6312,54 @@ SDValue SITargetLowering::performAndComb
LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
}
+ // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
+ N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+ uint32_t LHSMask = getPermuteMask(DAG, LHS);
+ uint32_t RHSMask = getPermuteMask(DAG, RHS);
+ if (LHSMask != ~0u && RHSMask != ~0u) {
+ // Canonicalize the expression in an attempt to have fewer unique masks
+ // and therefore fewer registers used to hold the masks.
+ if (LHSMask > RHSMask) {
+ std::swap(LHSMask, RHSMask);
+ std::swap(LHS, RHS);
+ }
+
+ // Mark with 0xc each byte that selects an actual lane from a source
+ // operand. Bytes producing zero already have 0xc set, bytes producing
+ // 0xff have 0xff, and actual lane selectors are in the 0-3 range.
+ uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+ uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+
+ // Check if we need to combine values from two sources within a byte.
+ if (!(LHSUsedLanes & RHSUsedLanes) &&
+ // If we select high and lower word keep it for SDWA.
+ // TODO: teach SDWA to work with v_perm_b32 and remove the check.
+ !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
+ // Each byte of a mask is either a lane selector in the 0-3 range, 0x0c
+ // for zero, or 0xff for 0xff. If either mask has 0x0c in a byte the
+ // result must be 0x0c; otherwise the mask that is not 0xff wins. ANDing
+ // the two masks gives the correct result except that a byte which met
+ // 0x0c may need to be forced back to exactly 0x0c.
+ uint32_t Mask = LHSMask & RHSMask;
+ for (unsigned I = 0; I < 32; I += 8) {
+ uint32_t ByteSel = 0xff << I;
+ if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
+ Mask &= (0x0c << I) & 0xffffffff;
+ }
+
+ // Add 4 to each active LHS lane. It will not affect any existing 0xff
+ // or 0x0c.
+ uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
+ SDLoc DL(N);
+
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
+ LHS.getOperand(0), RHS.getOperand(0),
+ DAG.getConstant(Sel, DL, MVT::i32));
+ }
+ }
+ }
+
return SDValue();
}
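To see how the two masks merge in the AND case, take the or_and_or pattern from the new test: (or x, 0x00ffff00) & (or y, 0xff0000ff). getPermuteMask gives 0x03ffff00 for one operand and 0xff0201ff for the other, so the lanes used are bytes 3 and 0 of x and bytes 2 and 1 of y, which do not overlap. ANDing the masks yields 0x03020100, no byte needs the 0x0c correction, and adding 4 to the lanes taken from the first source gives the final selector 0x07020104, the mask checked in the test.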
@@ -6268,6 +6395,60 @@ SDValue SITargetLowering::performOrCombi
return SDValue();
}
+ // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
+ if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
+ LHS.getOpcode() == AMDGPUISD::PERM &&
+ isa<ConstantSDNode>(LHS.getOperand(2))) {
+ uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
+ if (!Sel)
+ return SDValue();
+
+ Sel |= LHS.getConstantOperandVal(2);
+ SDLoc DL(N);
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
+ LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
+ }
+
+ // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
+ N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+ uint32_t LHSMask = getPermuteMask(DAG, LHS);
+ uint32_t RHSMask = getPermuteMask(DAG, RHS);
+ if (LHSMask != ~0u && RHSMask != ~0u) {
+ // Canonicalize the expression in an attempt to have fewer unique masks
+ // and therefore fewer registers used to hold the masks.
+ if (LHSMask > RHSMask) {
+ std::swap(LHSMask, RHSMask);
+ std::swap(LHS, RHS);
+ }
+
+ // Mark with 0xc each byte that selects an actual lane from a source
+ // operand. Bytes producing zero already have 0xc set, bytes producing
+ // 0xff have 0xff, and actual lane selectors are in the 0-3 range.
+ uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+ uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+
+ // Check if we need to combine values from two sources within a byte.
+ if (!(LHSUsedLanes & RHSUsedLanes) &&
+ // If we select high and lower word keep it for SDWA.
+ // TODO: teach SDWA to work with v_perm_b32 and remove the check.
+ !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
+ // Kill zero bytes selected by other mask. Zero value is 0xc.
+ LHSMask &= ~RHSUsedLanes;
+ RHSMask &= ~LHSUsedLanes;
+ // Add 4 to each active LHS lane
+ LHSMask |= LHSUsedLanes & 0x04040404;
+ // Combine masks
+ uint32_t Sel = LHSMask | RHSMask;
+ SDLoc DL(N);
+
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
+ LHS.getOperand(0), RHS.getOperand(0),
+ DAG.getConstant(Sel, DL, MVT::i32));
+ }
+ }
+ }
+
if (VT != MVT::i64)
return SDValue();
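The OR variant merges masks the same way. For the and_or_and pattern from the test, (and x, 0xff00ff00) | (and y, 0x00ff00ff), the operand masks are 0x030c010c and 0x0c020c00: x supplies bytes 3 and 1, y supplies bytes 2 and 0, and the used lanes are disjoint. Clearing the 0x0c bytes that the other operand supplies, adding 4 to the lanes taken from the first source and ORing the two adjusted masks gives the selector 0x07020500, i.e. the 0x7020500 mask checked in the test.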
Modified: llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td?rev=334559&r1=334558&r2=334559&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td Tue Jun 12 16:50:37 2018
@@ -449,7 +449,7 @@ def V_INTERP_P1_F32_e64 : VOP3Interp <"
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
-def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
} // End SubtargetPredicate = isVI
let Predicates = [Has16BitInsts] in {
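Attaching AMDGPUperm as the optional node argument of VOP3Inst hooks the new node into instruction selection: the generic VOP3 pattern generated from the profile matches AMDGPUISD::PERM directly to V_PERM_B32, so no separate selection pattern is needed.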
Added: llvm/trunk/test/CodeGen/AMDGPU/permute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/permute.ll?rev=334559&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/permute.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/permute.ll Tue Jun 12 16:50:37 2018
@@ -0,0 +1,199 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}lsh8_or_and:
+; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x6050400
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+define amdgpu_kernel void @lsh8_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+ %tmp = load i32, i32 addrspace(1)* %gep, align 4
+ %tmp2 = shl i32 %tmp, 8
+ %tmp3 = and i32 %arg1, 255
+ %tmp4 = or i32 %tmp2, %tmp3
+ store i32 %tmp4, i32 addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}lsr24_or_and:
+; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7060503
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+define amdgpu_kernel void @lsr24_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+ %tmp = load i32, i32 addrspace(1)* %gep, align 4
+ %tmp2 = lshr i32 %tmp, 24
+ %tmp3 = and i32 %arg1, 4294967040 ; 0xffffff00
+ %tmp4 = or i32 %tmp2, %tmp3
+ store i32 %tmp4, i32 addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}and_or_lsr24:
+; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7060503
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+define amdgpu_kernel void @and_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+ %tmp = load i32, i32 addrspace(1)* %gep, align 4
+ %tmp2 = and i32 %tmp, 4294967040 ; 0xffffff00
+ %tmp3 = lshr i32 %arg1, 24
+ %tmp4 = or i32 %tmp2, %tmp3
+ %tmp5 = xor i32 %tmp4, -2147483648
+ store i32 %tmp5, i32 addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}and_or_and:
+; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020500
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+define amdgpu_kernel void @and_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+ %tmp = load i32, i32 addrspace(1)* %gep, align 4
+ %tmp2 = and i32 %tmp, -16711936
+ %tmp3 = and i32 %arg1, 16711935
+ %tmp4 = or i32 %tmp2, %tmp3
+ store i32 %tmp4, i32 addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}lsh8_or_lsr24:
+; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x6050403
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+define amdgpu_kernel void @lsh8_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+ %tmp = load i32, i32 addrspace(1)* %gep, align 4
+ %tmp2 = shl i32 %tmp, 8
+ %tmp3 = lshr i32 %arg1, 24
+ %tmp4 = or i32 %tmp2, %tmp3
+ store i32 %tmp4, i32 addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}lsh16_or_lsr24:
+; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x5040c03
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+define amdgpu_kernel void @lsh16_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+ %tmp = load i32, i32 addrspace(1)* %gep, align 4
+ %tmp2 = shl i32 %tmp, 16
+ %tmp3 = lshr i32 %arg1, 24
+ %tmp4 = or i32 %tmp2, %tmp3
+ store i32 %tmp4, i32 addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}and_xor_and:
+; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020104
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+define amdgpu_kernel void @and_xor_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+ %tmp = load i32, i32 addrspace(1)* %gep, align 4
+ %tmp2 = and i32 %tmp, -16776961
+ %tmp3 = and i32 %arg1, 16776960
+ %tmp4 = xor i32 %tmp2, %tmp3
+ store i32 %tmp4, i32 addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}and_or_or_and:
+; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+define amdgpu_kernel void @and_or_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+ %tmp = load i32, i32 addrspace(1)* %gep, align 4
+ %and = and i32 %tmp, 16711935 ; 0x00ff00ff
+ %tmp1 = and i32 %arg1, 4294967040 ; 0xffffff00
+ %tmp2 = or i32 %tmp1, -65536
+ %tmp3 = or i32 %tmp2, %and
+ store i32 %tmp3, i32 addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}and_or_and_shl:
+; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x50c0c00
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+define amdgpu_kernel void @and_or_and_shl(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+ %tmp = load i32, i32 addrspace(1)* %gep, align 4
+ %tmp2 = shl i32 %tmp, 16
+ %tmp3 = and i32 %arg1, 65535
+ %tmp4 = or i32 %tmp2, %tmp3
+ %and = and i32 %tmp4, 4278190335
+ store i32 %and, i32 addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}or_and_or:
+; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020104
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+define amdgpu_kernel void @or_and_or(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+ %tmp = load i32, i32 addrspace(1)* %gep, align 4
+ %or1 = or i32 %tmp, 16776960 ; 0x00ffff00
+ %or2 = or i32 %arg1, 4278190335 ; 0xff0000ff
+ %and = and i32 %or1, %or2
+ store i32 %and, i32 addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}known_ffff0500:
+; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
+; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
+define amdgpu_kernel void @known_ffff0500(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+ %load = load i32, i32 addrspace(1)* %gep, align 4
+ %mask1 = or i32 %arg1, 32768 ; 0x8000
+ %mask2 = or i32 %load, 4
+ %and = and i32 %mask2, 16711935 ; 0x00ff00ff
+ %tmp1 = and i32 %mask1, 4294967040 ; 0xffffff00
+ %tmp2 = or i32 %tmp1, 4294901760 ; 0xffff0000
+ %tmp3 = or i32 %tmp2, %and
+ store i32 %tmp3, i32 addrspace(1)* %gep, align 4
+ %v = and i32 %tmp3, 4294934532 ; 0xffff8004
+ store i32 %v, i32 addrspace(1)* %arg, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}known_050c0c00:
+; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x50c0c00
+; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 4{{$}}
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
+define amdgpu_kernel void @known_050c0c00(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+ %tmp = load i32, i32 addrspace(1)* %gep, align 4
+ %tmp2 = shl i32 %tmp, 16
+ %mask = or i32 %arg1, 4
+ %tmp3 = and i32 %mask, 65535
+ %tmp4 = or i32 %tmp2, %tmp3
+ %and = and i32 %tmp4, 4278190335
+ store i32 %and, i32 addrspace(1)* %gep, align 4
+ %v = and i32 %and, 16776964
+ store i32 %v, i32 addrspace(1)* %arg, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()