[llvm] r281488 - AMDGPU: Improve splitting 64-bit bit ops by constants
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 14 08:19:03 PDT 2016
Author: arsenm
Date: Wed Sep 14 10:19:03 2016
New Revision: 281488
URL: http://llvm.org/viewvc/llvm-project?rev=281488&view=rev
Log:
AMDGPU: Improve splitting 64-bit bit ops by constants
This addresses a TODO to handle operations besides and. This
also starts eliminating no-op operations with a constant that
can emerge later.
Added:
llvm/trunk/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
llvm/trunk/test/CodeGen/AMDGPU/and.ll
llvm/trunk/test/CodeGen/AMDGPU/bitreverse.ll
llvm/trunk/test/CodeGen/AMDGPU/bswap.ll
llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll
llvm/trunk/test/CodeGen/AMDGPU/or.ll
llvm/trunk/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
llvm/trunk/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
llvm/trunk/test/CodeGen/AMDGPU/xor.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=281488&r1=281487&r2=281488&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Wed Sep 14 10:19:03 2016
@@ -462,7 +462,6 @@ AMDGPUTargetLowering::AMDGPUTargetLoweri
MaxStoresPerMemset = 4096;
setTargetDAGCombine(ISD::BITCAST);
- setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
@@ -2093,38 +2092,21 @@ SDValue AMDGPUTargetLowering::performSto
SN->getBasePtr(), SN->getMemOperand());
}
-// TODO: Should repeat for other bit ops.
-SDValue AMDGPUTargetLowering::performAndCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
- if (N->getValueType(0) != MVT::i64)
- return SDValue();
-
- // Break up 64-bit and of a constant into two 32-bit ands. This will typically
- // happen anyway for a VALU 64-bit and. This exposes other 32-bit integer
- // combine opportunities since most 64-bit operations are decomposed this way.
- // TODO: We won't want this for SALU especially if it is an inline immediate.
- const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!RHS)
- return SDValue();
-
- uint64_t Val = RHS->getZExtValue();
- if (Lo_32(Val) != 0 && Hi_32(Val) != 0 && !RHS->hasOneUse()) {
- // If either half of the constant is 0, this is really a 32-bit and, so
- // split it. If we can re-use the full materialized constant, keep it.
- return SDValue();
- }
-
- SDLoc SL(N);
+/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
+/// binary operation \p Opc to it with the corresponding constant operands.
+SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
+ DAGCombinerInfo &DCI, const SDLoc &SL,
+ unsigned Opc, SDValue LHS,
+ uint32_t ValLo, uint32_t ValHi) const {
SelectionDAG &DAG = DCI.DAG;
-
SDValue Lo, Hi;
- std::tie(Lo, Hi) = split64BitValue(N->getOperand(0), DAG);
+ std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
- SDValue LoRHS = DAG.getConstant(Lo_32(Val), SL, MVT::i32);
- SDValue HiRHS = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+ SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
+ SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
- SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, LoRHS);
- SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, HiRHS);
+ SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
+ SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
// Re-visit the ands. It's possible we eliminated one of them and it could
// simplify the vector.
@@ -2518,12 +2500,6 @@ SDValue AMDGPUTargetLowering::PerformDAG
return performSraCombine(N, DCI);
}
- case ISD::AND: {
- if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
- break;
-
- return performAndCombine(N, DCI);
- }
case ISD::MUL:
return performMulCombine(N, DCI);
case ISD::MULHS:
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=281488&r1=281487&r2=281488&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Wed Sep 14 10:19:03 2016
@@ -62,7 +62,10 @@ protected:
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
+ unsigned Opc, SDValue LHS,
+ uint32_t ValLo, uint32_t ValHi) const;
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Modified: llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp?rev=281488&r1=281487&r2=281488&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp Wed Sep 14 10:19:03 2016
@@ -304,6 +304,126 @@ static void foldOperand(MachineOperand &
return;
}
+static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
+ int32_t LHS, int32_t RHS) {
+ switch (Opcode) {
+ case AMDGPU::V_AND_B32_e64:
+ case AMDGPU::S_AND_B32:
+ Result = LHS & RHS;
+ return true;
+ case AMDGPU::V_OR_B32_e64:
+ case AMDGPU::S_OR_B32:
+ Result = LHS | RHS;
+ return true;
+ case AMDGPU::V_XOR_B32_e64:
+ case AMDGPU::S_XOR_B32:
+ Result = LHS ^ RHS;
+ return true;
+ default:
+ return false;
+ }
+}
+
+static unsigned getMovOpc(bool IsScalar) {
+ return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+}
+
+// Try to simplify operations with a constant that may appear after instruction
+// selection.
+static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
+ const SIInstrInfo *TII,
+ MachineInstr *MI) {
+ unsigned Opc = MI->getOpcode();
+
+ if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
+ Opc == AMDGPU::S_NOT_B32) {
+ MachineOperand &Src0 = MI->getOperand(1);
+ if (Src0.isImm()) {
+ Src0.setImm(~Src0.getImm());
+ MI->setDesc(TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
+ return true;
+ }
+
+ return false;
+ }
+
+ if (!MI->isCommutable())
+ return false;
+
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+
+ MachineOperand *Src0 = &MI->getOperand(Src0Idx);
+ MachineOperand *Src1 = &MI->getOperand(Src1Idx);
+ if (!Src0->isImm() && !Src1->isImm())
+ return false;
+
+ // and k0, k1 -> v_mov_b32 (k0 & k1)
+ // or k0, k1 -> v_mov_b32 (k0 | k1)
+ // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
+ if (Src0->isImm() && Src1->isImm()) {
+ int32_t NewImm;
+ if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
+ return false;
+
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
+
+ Src0->setImm(NewImm);
+ MI->RemoveOperand(Src1Idx);
+ MI->setDesc(TII->get(getMovOpc(IsSGPR)));
+ return true;
+ }
+
+ if (Src0->isImm() && !Src1->isImm()) {
+ std::swap(Src0, Src1);
+ std::swap(Src0Idx, Src1Idx);
+ }
+
+ int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
+ if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) {
+ if (Src1Val == 0) {
+ // y = or x, 0 => y = copy x
+ MI->RemoveOperand(Src1Idx);
+ MI->setDesc(TII->get(AMDGPU::COPY));
+ } else if (Src1Val == -1) {
+ // y = or x, -1 => y = v_mov_b32 -1
+ MI->RemoveOperand(Src1Idx);
+ MI->setDesc(TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
+ } else
+ return false;
+
+ return true;
+ }
+
+ if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
+ MI->getOpcode() == AMDGPU::S_AND_B32) {
+ if (Src1Val == 0) {
+ // y = and x, 0 => y = v_mov_b32 0
+ MI->RemoveOperand(Src0Idx);
+ MI->setDesc(TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
+ } else if (Src1Val == -1) {
+ // y = and x, -1 => y = copy x
+ MI->RemoveOperand(Src1Idx);
+ MI->setDesc(TII->get(AMDGPU::COPY));
+ } else
+ return false;
+
+ return true;
+ }
+
+ if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
+ MI->getOpcode() == AMDGPU::S_XOR_B32) {
+ if (Src1Val == 0) {
+ // y = xor x, 0 => y = copy x
+ MI->RemoveOperand(Src1Idx);
+ MI->setDesc(TII->get(AMDGPU::COPY));
+ }
+ }
+
+ return false;
+}
+
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))
return false;
@@ -389,6 +509,12 @@ bool SIFoldOperands::runOnMachineFunctio
}
DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
+
+ // Folding the immediate may reveal operations that can be constant
+ // folded or replaced with a copy. This can happen for example after
+ // frame indices are lowered to constants or from splitting 64-bit
+ // constants.
+ tryConstantFoldOp(MRI, TII, Fold.UseMI);
}
}
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=281488&r1=281487&r2=281488&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Wed Sep 14 10:19:03 2016
@@ -227,6 +227,7 @@ SITargetLowering::SITargetLowering(const
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::FCANONICALIZE);
@@ -2899,23 +2900,62 @@ SDValue SITargetLowering::performSHLPtrC
return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
}
+static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
+ return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
+ (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
+ (Opc == ISD::XOR && Val == 0);
+}
+
+// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
+// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
+// integer combine opportunities since most 64-bit operations are decomposed
+// this way. TODO: We won't want this for SALU especially if it is an inline
+// immediate.
+SDValue SITargetLowering::splitBinaryBitConstantOp(
+ DAGCombinerInfo &DCI,
+ const SDLoc &SL,
+ unsigned Opc, SDValue LHS,
+ const ConstantSDNode *CRHS) const {
+ uint64_t Val = CRHS->getZExtValue();
+ uint32_t ValLo = Lo_32(Val);
+ uint32_t ValHi = Hi_32(Val);
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+ if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
+ bitOpWithConstantIsReducible(Opc, ValHi)) ||
+ (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
+ // If we need to materialize a 64-bit immediate, it will be split up later
+ // anyway. Avoid creating the harder to understand 64-bit immediate
+ // materialization.
+ return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
+ }
+
+ return SDValue();
+}
+
SDValue SITargetLowering::performAndCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (DCI.isBeforeLegalize())
return SDValue();
- if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI))
- return Base;
-
SelectionDAG &DAG = DCI.DAG;
-
- // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
- // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
+ EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
- if (LHS.getOpcode() == ISD::SETCC &&
- RHS.getOpcode() == ISD::SETCC) {
+
+ if (VT == MVT::i64) {
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (CRHS) {
+ if (SDValue Split
+ = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
+ return Split;
+ }
+ }
+
+ // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
+ // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
+ if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
@@ -2963,54 +3003,85 @@ SDValue SITargetLowering::performOrCombi
SDValue RHS = N->getOperand(1);
EVT VT = N->getValueType(0);
- if (VT == MVT::i64) {
- // TODO: This could be a generic combine with a predicate for extracting the
- // high half of an integer being free.
+ if (VT == MVT::i1) {
+ // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
+ if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
+ RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
+ SDValue Src = LHS.getOperand(0);
+ if (Src != RHS.getOperand(0))
+ return SDValue();
- // (or i64:x, (zero_extend i32:y)) ->
- // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
- if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
- RHS.getOpcode() != ISD::ZERO_EXTEND)
- std::swap(LHS, RHS);
-
- if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
- SDValue ExtSrc = RHS.getOperand(0);
- EVT SrcVT = ExtSrc.getValueType();
- if (SrcVT == MVT::i32) {
- SDLoc SL(N);
- SDValue LowLHS, HiBits;
- std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
- SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
-
- DCI.AddToWorklist(LowOr.getNode());
- DCI.AddToWorklist(HiBits.getNode());
-
- SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
- LowOr, HiBits);
- return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
- }
+ const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+ if (!CLHS || !CRHS)
+ return SDValue();
+
+ // Only 10 bits are used.
+ static const uint32_t MaxMask = 0x3ff;
+
+ uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
+ SDLoc DL(N);
+ return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
+ Src, DAG.getConstant(NewMask, DL, MVT::i32));
}
+
+ return SDValue();
}
- // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
- if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
- RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
- SDValue Src = LHS.getOperand(0);
- if (Src != RHS.getOperand(0))
- return SDValue();
+ if (VT != MVT::i64)
+ return SDValue();
- const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
- const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
- if (!CLHS || !CRHS)
- return SDValue();
+ // TODO: This could be a generic combine with a predicate for extracting the
+ // high half of an integer being free.
+
+ // (or i64:x, (zero_extend i32:y)) ->
+ // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
+ if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getOpcode() != ISD::ZERO_EXTEND)
+ std::swap(LHS, RHS);
+
+ if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
+ SDValue ExtSrc = RHS.getOperand(0);
+ EVT SrcVT = ExtSrc.getValueType();
+ if (SrcVT == MVT::i32) {
+ SDLoc SL(N);
+ SDValue LowLHS, HiBits;
+ std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
+ SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
+
+ DCI.AddToWorklist(LowOr.getNode());
+ DCI.AddToWorklist(HiBits.getNode());
+
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+ LowOr, HiBits);
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+ }
+ }
+
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (CRHS) {
+ if (SDValue Split
+ = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
+ return Split;
+ }
+
+ return SDValue();
+}
- // Only 10 bits are used.
- static const uint32_t MaxMask = 0x3ff;
+SDValue SITargetLowering::performXorCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i64)
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
- uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
- SDLoc DL(N);
- return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
- Src, DAG.getConstant(NewMask, DL, MVT::i32));
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (CRHS) {
+ if (SDValue Split
+ = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
+ return Split;
}
return SDValue();
@@ -3427,6 +3498,8 @@ SDValue SITargetLowering::PerformDAGComb
return performAndCombine(N, DCI);
case ISD::OR:
return performOrCombine(N, DCI);
+ case ISD::XOR:
+ return performXorCombine(N, DCI);
case AMDGPUISD::FP_CLASS:
return performClassCombine(N, DCI);
case ISD::FCANONICALIZE:
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=281488&r1=281487&r2=281488&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Wed Sep 14 10:19:03 2016
@@ -58,8 +58,14 @@ class SITargetLowering final : public AM
SDValue performSHLPtrCombine(SDNode *N,
unsigned AS,
DAGCombinerInfo &DCI) const;
+
+ SDValue splitBinaryBitConstantOp(DAGCombinerInfo &DCI, const SDLoc &SL,
+ unsigned Opc, SDValue LHS,
+ const ConstantSDNode *CRHS) const;
+
SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=281488&r1=281487&r2=281488&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Wed Sep 14 10:19:03 2016
@@ -1512,7 +1512,7 @@ def : Pat <
def : Pat <
(fabs f32:$src),
- (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff))
+ (V_AND_B32_e64 $src, (V_MOV_B32_e32 0x7fffffff))
>;
def : Pat <
@@ -1525,7 +1525,7 @@ def : Pat <
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
sub0,
- (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
+ (V_AND_B32_e64 (EXTRACT_SUBREG f64:$src, sub1),
(V_MOV_B32_e32 0x7fffffff)), // Set sign bit.
sub1)
>;
Modified: llvm/trunk/test/CodeGen/AMDGPU/and.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/and.ll?rev=281488&r1=281487&r2=281488&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/and.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/and.ll Wed Sep 14 10:19:03 2016
@@ -324,6 +324,20 @@ define void @v_and_inline_imm_i64(i64 ad
ret void
}
+; FIXME: Should be able to reduce load width
+; FUNC-LABEL: {{^}}v_and_inline_neg_imm_i64:
+; SI: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI-NOT: and
+; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]]
+; SI-NOT: and
+; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
+define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %and = and i64 %a, -8
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64
; SI: s_load_dword
; SI-NOT: and
Modified: llvm/trunk/test/CodeGen/AMDGPU/bitreverse.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/bitreverse.ll?rev=281488&r1=281487&r2=281488&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/bitreverse.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/bitreverse.ll Wed Sep 14 10:19:03 2016
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare i16 @llvm.bitreverse.i16(i16) #1
@@ -79,6 +79,7 @@ define void @s_brev_i64(i64 addrspace(1)
}
; FUNC-LABEL: {{^}}v_brev_i64:
+; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
%val = load i64, i64 addrspace(1)* %valptr
%brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
Modified: llvm/trunk/test/CodeGen/AMDGPU/bswap.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/bswap.ll?rev=281488&r1=281487&r2=281488&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/bswap.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/bswap.ll Wed Sep 14 10:19:03 2016
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare i32 @llvm.bswap.i32(i32) nounwind readnone
@@ -93,6 +93,8 @@ define void @test_bswap_v8i32(<8 x i32>
ret void
}
+; FUNC-LABEL: {{^}}test_bswap_i64:
+; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
%val = load i64, i64 addrspace(1)* %in, align 8
%bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
Added: llvm/trunk/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll?rev=281488&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll Wed Sep 14 10:19:03 2016
@@ -0,0 +1,144 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}fold_mi_v_and_0:
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
+ %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %size = call i32 @llvm.amdgcn.groupstaticsize()
+ %and = and i32 %size, %x
+ store i32 %and, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fold_mi_s_and_0:
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
+ %size = call i32 @llvm.amdgcn.groupstaticsize()
+ %and = and i32 %size, %x
+ store i32 %and, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fold_mi_v_or_0:
+; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
+ %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %size = call i32 @llvm.amdgcn.groupstaticsize()
+ %or = or i32 %size, %x
+ store i32 %or, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fold_mi_s_or_0:
+; GCN: s_load_dword [[SVAL:s[0-9]+]]
+; GCN-NOT: [[SVAL]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
+; GCN-NOT: [[VVAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
+ %size = call i32 @llvm.amdgcn.groupstaticsize()
+ %or = or i32 %size, %x
+ store i32 %or, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fold_mi_v_xor_0:
+; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
+ %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %size = call i32 @llvm.amdgcn.groupstaticsize()
+ %xor = xor i32 %size, %x
+ store i32 %xor, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fold_mi_s_xor_0:
+; GCN: s_load_dword [[SVAL:s[0-9]+]]
+; GCN-NOT: [[SVAL]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
+; GCN-NOT: [[VVAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
+ %size = call i32 @llvm.amdgcn.groupstaticsize()
+ %xor = xor i32 %size, %x
+ store i32 %xor, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fold_mi_s_not_0:
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], -1{{$}}
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
+ %size = call i32 @llvm.amdgcn.groupstaticsize()
+ %xor = xor i32 %size, -1
+ store i32 %xor, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fold_mi_v_not_0:
+; GCN: v_bcnt_u32_b32_e64 v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
+; GCN: v_bcnt_u32_b32_e{{[0-9]+}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
+; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]]
+; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}}
+; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
+define void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
+ %vreg = load volatile i64, i64 addrspace(1)* undef
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg)
+ %xor = xor i64 %ctpop, -1
+ store i64 %xor, i64 addrspace(1)* %out
+ ret void
+}
+
+; The neg1 appears after folding the not 0
+; GCN-LABEL: {{^}}fold_mi_or_neg1:
+; GCN: buffer_load_dwordx2
+; GCN: buffer_load_dwordx2 v{{\[}}[[VREG1_LO:[0-9]+]]:[[VREG1_HI:[0-9]+]]{{\]}}
+
+; GCN: v_bcnt_u32_b32_e64 v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
+; GCN: v_bcnt_u32_b32_e{{[0-9]+}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
+; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]]
+; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
+; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
+define void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
+ %vreg0 = load volatile i64, i64 addrspace(1)* undef
+ %vreg1 = load volatile i64, i64 addrspace(1)* undef
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0)
+ %xor = xor i64 %ctpop, -1
+ %or = or i64 %xor, %vreg1
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fold_mi_and_neg1:
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_not_b32
+; GCN: v_and_b32
+; GCN-NOT: v_and_b32
+define void @fold_mi_and_neg1(i64 addrspace(1)* %out) {
+ %vreg0 = load volatile i64, i64 addrspace(1)* undef
+ %vreg1 = load volatile i64, i64 addrspace(1)* undef
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0)
+ %xor = xor i64 %ctpop, -1
+ %and = and i64 %xor, %vreg1
+ store i64 %and, i64 addrspace(1)* %out
+ ret void
+}
+
+declare i64 @llvm.ctpop.i64(i64) #1
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare i32 @llvm.amdgcn.groupstaticsize() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll?rev=281488&r1=281487&r2=281488&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll Wed Sep 14 10:19:03 2016
@@ -39,14 +39,13 @@ define void @v_ctpop_i64(i32 addrspace(1
ret void
}
-; FIXME: or 0 should be replaxed with copy
; FUNC-LABEL: {{^}}v_ctpop_i64_user:
; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; GCN-DAG: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]]
-; GCN-DAG: v_or_b32_e64 v[[RESULT_HI:[0-9]+]], 0, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
; GCN: s_endpgm
define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
Modified: llvm/trunk/test/CodeGen/AMDGPU/or.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/or.ll?rev=281488&r1=281487&r2=281488&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/or.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/or.ll Wed Sep 14 10:19:03 2016
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -62,6 +62,75 @@ define void @scalar_or_literal_i32(i32 a
ret void
}
+; FUNC-LABEL: {{^}}scalar_or_literal_i64:
+; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI-DAG: s_or_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b
+; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
+define void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) {
+ %or = or i64 %a, 4261135838621753
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_or_literal_multi_use_i64:
+; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b
+; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039
+; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+
+; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
+; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
+define void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %or = or i64 %a, 4261135838621753
+ store i64 %or, i64 addrspace(1)* %out
+
+ %foo = add i64 %b, 4261135838621753
+ store volatile i64 %foo, i64 addrspace(1)* undef
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_or_inline_imm_i64:
+; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI-NOT: or_b32
+; SI: s_or_b32 s[[VAL_LO]], s[[VAL_LO]], 63
+; SI-NOT: or_b32
+; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL_LO]]
+; SI-NOT: or_b32
+; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
+; SI-NOT: or_b32
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+ %or = or i64 %a, 63
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_or_inline_imm_multi_use_i64:
+; SI-NOT: or_b32
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 63
+; SI-NOT: or_b32
+define void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %or = or i64 %a, 63
+ store i64 %or, i64 addrspace(1)* %out
+ %foo = add i64 %b, 63
+ store volatile i64 %foo, i64 addrspace(1)* undef
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_or_neg_inline_imm_i64:
+; SI-DAG: s_load_dword [[VAL:s[0-9]+]]
+; SI-DAG: s_or_b32 [[VAL]], [[VAL]], -8
+; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}}
+; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]]
+; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+ %or = or i64 %a, -8
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}vector_or_literal_i32:
; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
@@ -127,8 +196,9 @@ define void @vector_or_i64_loadimm(i64 a
; FIXME: The or 0 should really be removed.
; FUNC-LABEL: {{^}}vector_or_i64_imm:
; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
-; SI: v_or_b32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]]
-; SI: v_or_b32_e32 {{v[0-9]+}}, 0, {{.*}}
+; SI: v_or_b32_e32 v[[LO_RESULT:[0-9]+]], 8, v[[LO_VREG]]
+; SI-NOT: v_or_b32_e32 {{v[0-9]+}}, 0
+; SI: buffer_store_dwordx2 v{{\[}}[[LO_RESULT]]:[[HI_VREG]]{{\]}}
; SI: s_endpgm
define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
@@ -136,6 +206,32 @@ define void @vector_or_i64_imm(i64 addrs
store i64 %or, i64 addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: {{^}}vector_or_i64_neg_inline_imm:
+; SI-DAG: buffer_load_dword v[[LO_VREG:[0-9]+]]
+; SI-DAG: v_or_b32_e32 v[[RES_LO:[0-9]+]], -8, v[[LO_VREG]]
+; SI-DAG: v_mov_b32_e32 v[[RES_HI:[0-9]+]], -1{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[RES_LO]]:[[RES_HI]]{{\]}}
+; SI: s_endpgm
+define void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 8
+ %or = or i64 %loada, -8
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_or_i64_neg_literal:
+; SI-DAG: buffer_load_dword v[[LO_VREG:[0-9]+]]
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, -1{{$}}
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffffff38, v[[LO_VREG]]
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 8
+ %or = or i64 %loada, -200
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
; FUNC-LABEL: {{^}}trunc_i64_or_to_i32:
; SI: s_load_dword s[[SREG0:[0-9]+]]
Modified: llvm/trunk/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll?rev=281488&r1=281487&r2=281488&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll Wed Sep 14 10:19:03 2016
@@ -97,7 +97,6 @@ define void @v_uextract_bit_127_i128(i12
; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]]
-; GCN-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], 0, v[[SHLHI]]{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: s_endpgm
Modified: llvm/trunk/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sint_to_fp.i64.ll?rev=281488&r1=281487&r2=281488&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sint_to_fp.i64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sint_to_fp.i64.ll Wed Sep 14 10:19:03 2016
@@ -38,6 +38,7 @@ define void @v_sint_to_fp_i64_to_f32(flo
}
; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64:
+; GCN-NOT: v_and_b32_e32 v{{[0-9]+}}, -1,
define void @s_sint_to_fp_v2i64(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
%result = sitofp <2 x i64> %in to <2 x float>
store <2 x float> %result, <2 x float> addrspace(1)* %out
Modified: llvm/trunk/test/CodeGen/AMDGPU/xor.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/xor.ll?rev=281488&r1=281487&r2=281488&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/xor.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/xor.ll Wed Sep 14 10:19:03 2016
@@ -171,3 +171,81 @@ endif:
store i64 %3, i64 addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: {{^}}scalar_xor_literal_i64:
+; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI-DAG: s_xor_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b
+; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
+define void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) {
+ %or = xor i64 %a, 4261135838621753
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_xor_literal_multi_use_i64:
+; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b
+; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039
+; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+
+; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
+; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
+define void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %or = xor i64 %a, 4261135838621753
+ store i64 %or, i64 addrspace(1)* %out
+
+ %foo = add i64 %b, 4261135838621753
+ store volatile i64 %foo, i64 addrspace(1)* undef
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_xor_inline_imm_i64:
+; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI-NOT: xor_b32
+; SI: s_xor_b32 s[[VAL_LO]], s[[VAL_LO]], 63
+; SI-NOT: xor_b32
+; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL_LO]]
+; SI-NOT: xor_b32
+; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
+; SI-NOT: xor_b32
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+ %or = xor i64 %a, 63
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI: s_xor_b64 [[VAL]], [[VAL]], -8
+define void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+ %or = xor i64 %a, -8
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_xor_i64_neg_inline_imm:
+; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI: v_xor_b32_e32 {{v[0-9]+}}, -8, v[[LO_VREG]]
+; SI: v_xor_b32_e32 {{v[0-9]+}}, -1, {{.*}}
+; SI: s_endpgm
+define void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 8
+ %or = xor i64 %loada, -8
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_xor_literal_i64:
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
+; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
+; SI: s_endpgm
+define void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 8
+ %or = xor i64 %loada, 22470723082367
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
More information about the llvm-commits
mailing list