[llvm] r355731 - AMDGPU: Move d16 load matching to preprocess step
Mikael Holmén via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 13 02:38:22 PDT 2019
Hi,
Starting with this commit, the following warning pops up (at least with
gcc 7.4):
../lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp:2219:6: warning: 'bool
{anonymous}::AMDGPUDAGToDAGISel::SelectHi16Elt(llvm::SDValue,
llvm::SDValue&) const' defined but not used [-Wunused-function]
bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
^~~~~~~~~~~~~~~~~~
Should AMDGPUDAGToDAGISel::SelectHi16Elt perhaps be removed now, or is
this just temporary?
Regards,
Mikael
On 3/8/19 9:58 PM, Matt Arsenault via llvm-commits wrote:
> Author: arsenm
> Date: Fri Mar 8 12:58:11 2019
> New Revision: 355731
>
> URL: http://llvm.org/viewvc/llvm-project?rev=355731&view=rev
> Log:
> AMDGPU: Move d16 load matching to preprocess step
>
> When matching half of the build_vector to a load, there could still be
> a hidden dependency on the other half of the build_vector that the
> pattern wouldn't detect. If there were an additional chain dependency
> on the other value, a cycle could be introduced.
>
> I don't think a tablegen pattern is capable of matching the necessary
> conditions, so move this into PreprocessISelDAG. Check isPredecessorOf
> for the other value to avoid a cycle. This has a warning that it's
> expensive, so this should probably be moved into an MI pass eventually
> that will have more freedom to reorder instructions to help match
> this. That is currently complicated by the lack of a
> computeKnownBits-type mechanism for the selected function.
>
> Modified:
> llvm/trunk/lib/Target/AMDGPU/AMDGPU.td
> llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
> llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
> llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
> llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td
> llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
> llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td
> llvm/trunk/lib/Target/AMDGPU/DSInstructions.td
> llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td
> llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
> llvm/trunk/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
> llvm/trunk/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
> llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll
>
> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPU.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPU.td?rev=355731&r1=355730&r2=355731&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/AMDGPU.td (original)
> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.td Fri Mar 8 12:58:11 2019
> @@ -691,7 +691,7 @@ def HasPackedD16VMem : Predicate<"!Subta
> AssemblerPredicate<"!FeatureUnpackedD16VMem">;
>
> def D16PreservesUnusedBits :
> - Predicate<"Subtarget->hasD16LoadStore() && !Subtarget->isSRAMECCEnabled()">,
> + Predicate<"Subtarget->d16PreservesUnusedBits()">,
> AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">;
>
> def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
>
> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp?rev=355731&r1=355730&r2=355731&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (original)
> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp Fri Mar 8 12:58:11 2019
> @@ -51,6 +51,8 @@
> #include <new>
> #include <vector>
>
> +#define DEBUG_TYPE "isel"
> +
> using namespace llvm;
>
> namespace llvm {
> @@ -88,7 +90,10 @@ public:
> SelectionDAGISel::getAnalysisUsage(AU);
> }
>
> + bool matchLoadD16FromBuildVector(SDNode *N) const;
> +
> bool runOnMachineFunction(MachineFunction &MF) override;
> + void PreprocessISelDAG() override;
> void Select(SDNode *N) override;
> StringRef getPassName() const override;
> void PostprocessISelDAG() override;
> @@ -193,6 +198,7 @@ private:
> bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
> bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
>
> + SDValue getHi16Elt(SDValue In) const;
> bool SelectHi16Elt(SDValue In, SDValue &Src) const;
>
> void SelectADD_SUB_I64(SDNode *N);
> @@ -236,11 +242,49 @@ public:
> SDValue &Offset) override;
>
> bool runOnMachineFunction(MachineFunction &MF) override;
> +
> + void PreprocessISelDAG() override {}
> +
> protected:
> // Include the pieces autogenerated from the target description.
> #include "R600GenDAGISel.inc"
> };
>
> +static SDValue stripBitcast(SDValue Val) {
> + return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
> +}
> +
> +// Figure out if this is really an extract of the high 16-bits of a dword.
> +static bool isExtractHiElt(SDValue In, SDValue &Out) {
> + In = stripBitcast(In);
> + if (In.getOpcode() != ISD::TRUNCATE)
> + return false;
> +
> + SDValue Srl = In.getOperand(0);
> + if (Srl.getOpcode() == ISD::SRL) {
> + if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
> + if (ShiftAmt->getZExtValue() == 16) {
> + Out = stripBitcast(Srl.getOperand(0));
> + return true;
> + }
> + }
> + }
> +
> + return false;
> +}
> +
> +// Look through operations that obscure just looking at the low 16-bits of the
> +// same register.
> +static SDValue stripExtractLoElt(SDValue In) {
> + if (In.getOpcode() == ISD::TRUNCATE) {
> + SDValue Src = In.getOperand(0);
> + if (Src.getValueType().getSizeInBits() == 32)
> + return stripBitcast(Src);
> + }
> +
> + return In;
> +}
> +
> } // end anonymous namespace
>
> INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
> @@ -270,6 +314,114 @@ bool AMDGPUDAGToDAGISel::runOnMachineFun
> return SelectionDAGISel::runOnMachineFunction(MF);
> }
>
> +bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
> + assert(Subtarget->d16PreservesUnusedBits());
> + MVT VT = N->getValueType(0).getSimpleVT();
> + if (VT != MVT::v2i16 && VT != MVT::v2f16)
> + return false;
> +
> + SDValue Lo = N->getOperand(0);
> + SDValue Hi = N->getOperand(1);
> +
> + LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
> +
> + // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
> + // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
> + // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
> +
> + // Need to check for possible indirect dependencies on the other half of the
> + // vector to avoid introducing a cycle.
> + if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
> + SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
> +
> + SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
> + SDValue Ops[] = {
> + LdHi->getChain(), LdHi->getBasePtr(), TiedIn
> + };
> +
> + unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
> + if (LdHi->getMemoryVT() == MVT::i8) {
> + LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
> + AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
> + } else {
> + assert(LdHi->getMemoryVT() == MVT::i16);
> + }
> +
> + SDValue NewLoadHi =
> + CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
> + Ops, LdHi->getMemoryVT(),
> + LdHi->getMemOperand());
> +
> + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
> + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
> + return true;
> + }
> +
> + // build_vector (load ptr), hi -> load_d16_lo ptr, hi
> + // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
> + // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
> + LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
> + if (LdLo && Lo.hasOneUse()) {
> + SDValue TiedIn = getHi16Elt(Hi);
> + if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
> + return false;
> +
> + SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
> + unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
> + if (LdLo->getMemoryVT() == MVT::i8) {
> + LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
> + AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
> + } else {
> + assert(LdLo->getMemoryVT() == MVT::i16);
> + }
> +
> + TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
> +
> + SDValue Ops[] = {
> + LdLo->getChain(), LdLo->getBasePtr(), TiedIn
> + };
> +
> + SDValue NewLoadLo =
> + CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
> + Ops, LdLo->getMemoryVT(),
> + LdLo->getMemOperand());
> +
> + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
> + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
> + return true;
> + }
> +
> + return false;
> +}
> +
> +void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
> + if (!Subtarget->d16PreservesUnusedBits())
> + return;
> +
> + SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
> +
> + bool MadeChange = false;
> + while (Position != CurDAG->allnodes_begin()) {
> + SDNode *N = &*--Position;
> + if (N->use_empty())
> + continue;
> +
> + switch (N->getOpcode()) {
> + case ISD::BUILD_VECTOR:
> + MadeChange |= matchLoadD16FromBuildVector(N);
> + break;
> + default:
> + break;
> + }
> + }
> +
> + if (MadeChange) {
> + CurDAG->RemoveDeadNodes();
> + LLVM_DEBUG(dbgs() << "After PreProcess:\n";
> + CurDAG->dump(););
> + }
> +}
> +
> bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
> if (TM.Options.NoNaNsFPMath)
> return true;
> @@ -1889,41 +2041,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods
> return true;
> }
>
> -static SDValue stripBitcast(SDValue Val) {
> - return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
> -}
> -
> -// Figure out if this is really an extract of the high 16-bits of a dword.
> -static bool isExtractHiElt(SDValue In, SDValue &Out) {
> - In = stripBitcast(In);
> - if (In.getOpcode() != ISD::TRUNCATE)
> - return false;
> -
> - SDValue Srl = In.getOperand(0);
> - if (Srl.getOpcode() == ISD::SRL) {
> - if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
> - if (ShiftAmt->getZExtValue() == 16) {
> - Out = stripBitcast(Srl.getOperand(0));
> - return true;
> - }
> - }
> - }
> -
> - return false;
> -}
> -
> -// Look through operations that obscure just looking at the low 16-bits of the
> -// same register.
> -static SDValue stripExtractLoElt(SDValue In) {
> - if (In.getOpcode() == ISD::TRUNCATE) {
> - SDValue Src = In.getOperand(0);
> - if (Src.getValueType().getSizeInBits() == 32)
> - return stripBitcast(Src);
> - }
> -
> - return In;
> -}
> -
> bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
> SDValue &SrcMods) const {
> unsigned Mods = 0;
> @@ -2076,6 +2193,28 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadM
> return true;
> }
>
> +SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
> + if (In.isUndef())
> + return CurDAG->getUNDEF(MVT::i32);
> +
> + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
> + SDLoc SL(In);
> + return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
> + }
> +
> + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
> + SDLoc SL(In);
> + return CurDAG->getConstant(
> + C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
> + }
> +
> + SDValue Src;
> + if (isExtractHiElt(In, Src))
> + return Src;
> +
> + return SDValue();
> +}
> +
> // TODO: Can we identify things like v_mad_mixhi_f16?
> bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
> if (In.isUndef()) {
>
> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=355731&r1=355730&r2=355731&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Fri Mar 8 12:58:11 2019
> @@ -4186,6 +4186,12 @@ const char* AMDGPUTargetLowering::getTar
> NODE_NAME_CASE(INTERP_P1LL_F16)
> NODE_NAME_CASE(INTERP_P1LV_F16)
> NODE_NAME_CASE(INTERP_P2_F16)
> + NODE_NAME_CASE(LOAD_D16_HI)
> + NODE_NAME_CASE(LOAD_D16_LO)
> + NODE_NAME_CASE(LOAD_D16_HI_I8)
> + NODE_NAME_CASE(LOAD_D16_HI_U8)
> + NODE_NAME_CASE(LOAD_D16_LO_I8)
> + NODE_NAME_CASE(LOAD_D16_LO_U8)
> NODE_NAME_CASE(STORE_MSKOR)
> NODE_NAME_CASE(LOAD_CONSTANT)
> NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
>
> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=355731&r1=355730&r2=355731&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Fri Mar 8 12:58:11 2019
> @@ -469,6 +469,13 @@ enum NodeType : unsigned {
> KILL,
> DUMMY_CHAIN,
> FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
> + LOAD_D16_HI,
> + LOAD_D16_LO,
> + LOAD_D16_HI_I8,
> + LOAD_D16_HI_U8,
> + LOAD_D16_LO_I8,
> + LOAD_D16_LO_U8,
> +
> STORE_MSKOR,
> LOAD_CONSTANT,
> TBUFFER_STORE_FORMAT,
>
> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td?rev=355731&r1=355730&r2=355731&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td (original)
> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td Fri Mar 8 12:58:11 2019
> @@ -802,7 +802,7 @@ multiclass IntMed3Pat<Instruction med3In
> SDPatternOperator max_oneuse,
> ValueType vt = i32> {
>
> - // This matches 16 permutations of
> + // This matches 16 permutations of
> // min(max(a, b), max(min(a, b), c))
> def : AMDGPUPat <
> (min (max_oneuse vt:$src0, vt:$src1),
> @@ -810,7 +810,7 @@ multiclass IntMed3Pat<Instruction med3In
> (med3Inst vt:$src0, vt:$src1, vt:$src2)
> >;
>
> - // This matches 16 permutations of
> + // This matches 16 permutations of
> // max(min(x, y), min(max(x, y), z))
> def : AMDGPUPat <
> (max (min_oneuse vt:$src0, vt:$src1),
> @@ -818,7 +818,7 @@ multiclass IntMed3Pat<Instruction med3In
> (med3Inst $src0, $src1, $src2)
> >;
> }
> -
> +
> // Special conversion patterns
>
> def cvt_rpi_i32_f32 : PatFrag <
>
> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h?rev=355731&r1=355730&r2=355731&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h (original)
> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h Fri Mar 8 12:58:11 2019
> @@ -614,6 +614,10 @@ public:
> return getGeneration() >= GFX9;
> }
>
> + bool d16PreservesUnusedBits() const {
> + return hasD16LoadStore() && !isSRAMECCEnabled();
> + }
> +
> /// Return if most LDS instructions have an m0 use that require m0 to be
> /// iniitalized.
> bool ldsRequiresM0Init() const {
>
> Modified: llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td?rev=355731&r1=355730&r2=355731&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td (original)
> +++ llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td Fri Mar 8 12:58:11 2019
> @@ -1376,60 +1376,17 @@ multiclass MUBUFScratchLoadPat <MUBUF_Ps
> }
>
> // XXX - Is it possible to have a complex pattern in a PatFrag?
> -multiclass MUBUFScratchLoadPat_Hi16 <MUBUF_Pseudo InstrOffen,
> +multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen,
> MUBUF_Pseudo InstrOffset,
> - ValueType vt, PatFrag ld> {
> + ValueType vt, PatFrag ld_frag> {
> def : GCNPat <
> - (build_vector vt:$lo, (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
> - i32:$soffset, u16imm:$offset)))),
> - (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
> + (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in),
> + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $in)
> >;
>
> def : GCNPat <
> - (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
> - i32:$soffset, u16imm:$offset)))))),
> - (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
> - >;
> -
> -
> - def : GCNPat <
> - (build_vector vt:$lo, (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))),
> - (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
> - >;
> -
> - def : GCNPat <
> - (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))))),
> - (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
> - >;
> -}
> -
> -multiclass MUBUFScratchLoadPat_Lo16 <MUBUF_Pseudo InstrOffen,
> - MUBUF_Pseudo InstrOffset,
> - ValueType vt, PatFrag ld> {
> - def : GCNPat <
> - (build_vector (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
> - i32:$soffset, u16imm:$offset))),
> - (vt (Hi16Elt vt:$hi))),
> - (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
> - >;
> -
> - def : GCNPat <
> - (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
> - i32:$soffset, u16imm:$offset))))),
> - (f16 (Hi16Elt f16:$hi))),
> - (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
> - >;
> -
> - def : GCNPat <
> - (build_vector (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
> - (vt (Hi16Elt vt:$hi))),
> - (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi))
> - >;
> -
> - def : GCNPat <
> - (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))))),
> - (f16 (Hi16Elt f16:$hi))),
> - (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi))
> + (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in),
> + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $in)
> >;
> }
>
> @@ -1445,13 +1402,19 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_
> defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
>
> let OtherPredicates = [D16PreservesUnusedBits] in {
> -defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
> -defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
> -defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
> -
> -defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, i16, load_private>;
> -defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>;
> -defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>;
> +defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2i16, load_d16_hi_private>;
> +defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2i16, az_extloadi8_d16_hi_private>;
> +defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2i16, sextloadi8_d16_hi_private>;
> +defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2f16, load_d16_hi_private>;
> +defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2f16, az_extloadi8_d16_hi_private>;
> +defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2f16, sextloadi8_d16_hi_private>;
> +
> +defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2i16, load_d16_lo_private>;
> +defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2i16, az_extloadi8_d16_lo_private>;
> +defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2i16, sextloadi8_d16_lo_private>;
> +defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2f16, load_d16_lo_private>;
> +defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2f16, az_extloadi8_d16_lo_private>;
> +defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2f16, sextloadi8_d16_lo_private>;
> }
> multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
> ValueType vt, PatFrag atomic_st> {
>
> Modified: llvm/trunk/lib/Target/AMDGPU/DSInstructions.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/DSInstructions.td?rev=355731&r1=355730&r2=355731&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/DSInstructions.td (original)
> +++ llvm/trunk/lib/Target/AMDGPU/DSInstructions.td Fri Mar 8 12:58:11 2019
> @@ -611,30 +611,10 @@ multiclass DSReadPat_mc<DS_Pseudo inst,
> }
> }
>
> -
> -multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
> - def : GCNPat <
> - (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))),
> - (v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
> - >;
> -
> - def : GCNPat <
> - (build_vector f16:$lo, (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))))),
> - (v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
> - >;
> -}
> -
> -multiclass DSReadPat_Lo16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
> - def : GCNPat <
> - (build_vector (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), (vt (Hi16Elt vt:$hi))),
> - (v2i16 (inst $ptr, (as_i16imm $offset), 0, $hi))
> - >;
> -
> - def : GCNPat <
> - (build_vector (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))))), (f16 (Hi16Elt f16:$hi))),
> - (v2f16 (inst $ptr, (as_i16imm $offset), 0, $hi))
> - >;
> -}
> +class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat <
> + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in),
> + (inst $ptr, (as_i16imm $offset), (i1 0), $in)
> +>;
>
> defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
> defm : DSReadPat_mc <DS_READ_U8, i32, "az_extloadi8_local">;
> @@ -656,16 +636,19 @@ defm : DSReadPat_mc <DS_READ_B128, v4i32
> } // End AddedComplexity = 100
>
> let OtherPredicates = [D16PreservesUnusedBits] in {
> -let AddedComplexity = 100 in {
> -defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
> -defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
> -defm : DSReadPat_Hi16<DS_READ_I8_D16_HI, sextloadi8_local>;
> -
> -defm : DSReadPat_Lo16<DS_READ_U16_D16, load_local>;
> -defm : DSReadPat_Lo16<DS_READ_U8_D16, az_extloadi8_local>;
> -defm : DSReadPat_Lo16<DS_READ_I8_D16, sextloadi8_local>;
> -
> -}
> +def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2i16>;
> +def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2f16>;
> +def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2i16>;
> +def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2f16>;
> +def : DSReadPat_D16<DS_READ_I8_D16_HI, sextloadi8_d16_hi_local, v2i16>;
> +def : DSReadPat_D16<DS_READ_I8_D16_HI, sextloadi8_d16_hi_local, v2f16>;
> +
> +def : DSReadPat_D16<DS_READ_U16_D16, load_d16_lo_local, v2i16>;
> +def : DSReadPat_D16<DS_READ_U16_D16, load_d16_lo_local, v2f16>;
> +def : DSReadPat_D16<DS_READ_U8_D16, az_extloadi8_d16_lo_local, v2i16>;
> +def : DSReadPat_D16<DS_READ_U8_D16, az_extloadi8_d16_lo_local, v2f16>;
> +def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2i16>;
> +def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>;
> }
>
> class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
>
> Modified: llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td?rev=355731&r1=355730&r2=355731&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td (original)
> +++ llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td Fri Mar 8 12:58:11 2019
> @@ -663,53 +663,15 @@ class FlatLoadPat <FLAT_Pseudo inst, SDP
> (inst $vaddr, $offset, 0, $slc)
> >;
>
> -multiclass FlatLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
> - def : GCNPat <
> - (build_vector vt:$elt0, (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))),
> - (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
> - >;
> -
> - def : GCNPat <
> - (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))))),
> - (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
> - >;
> -}
> -
> -multiclass FlatSignedLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
> - def : GCNPat <
> - (build_vector vt:$elt0, (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))),
> - (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
> - >;
> -
> - def : GCNPat <
> - (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))))),
> - (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
> - >;
> -}
> -
> -multiclass FlatLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
> - def : GCNPat <
> - (build_vector (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))),
> - (v2i16 (inst $vaddr, $offset, 0, $slc, $hi))
> - >;
> -
> - def : GCNPat <
> - (build_vector (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))),
> - (v2f16 (inst $vaddr, $offset, 0, $slc, $hi))
> - >;
> -}
> +class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
> + (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc), vt:$in),
> + (inst $vaddr, $offset, 0, $slc, $in)
> +>;
>
> -multiclass FlatSignedLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
> - def : GCNPat <
> - (build_vector (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))),
> - (v2i16 (inst $vaddr, $offset, 0, $slc, $hi))
> - >;
> -
> - def : GCNPat <
> - (build_vector (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))),
> - (v2f16 (inst $vaddr, $offset, 0, $slc, $hi))
> - >;
> -}
> +class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
> + (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc), vt:$in),
> + (inst $vaddr, $offset, 0, $slc, $in)
> +>;
>
> class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
> (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))),
> @@ -817,17 +779,19 @@ let OtherPredicates = [D16PreservesUnuse
> def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
> def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
>
> -let AddedComplexity = 3 in {
> -defm : FlatLoadPat_Hi16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_flat>;
> -defm : FlatLoadPat_Hi16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_flat>;
> -defm : FlatLoadPat_Hi16 <FLAT_LOAD_SHORT_D16_HI, load_flat>;
> -}
> -
> -let AddedComplexity = 9 in {
> -defm : FlatLoadPat_Lo16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_flat>;
> -defm : FlatLoadPat_Lo16 <FLAT_LOAD_SBYTE_D16, sextloadi8_flat>;
> -defm : FlatLoadPat_Lo16 <FLAT_LOAD_SHORT_D16, load_flat>;
> -}
> +def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
> +def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
> +def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
> +def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
> +def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
> +def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
> +
> +def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
> +def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
> +def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
> +def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
> +def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
> +def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
> }
>
> } // End OtherPredicates = [HasFlatAddressSpace]
> @@ -861,14 +825,19 @@ let OtherPredicates = [D16PreservesUnuse
> def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
> def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
>
> -defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_global>;
> -defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_global>;
> -defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SHORT_D16_HI, load_global>;
> -
> -defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_global>;
> -defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_global>;
> -defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SHORT_D16, load_global>;
> -
> +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2i16>;
> +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2f16>;
> +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2i16>;
> +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2f16>;
> +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2i16>;
> +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2f16>;
> +
> +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2i16>;
> +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2f16>;
> +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2i16>;
> +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2f16>;
> +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2i16>;
> +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
> }
>
> def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>;
> @@ -902,7 +871,7 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC
> def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
> def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
>
> -} // End OtherPredicates = [HasFlatGlobalInsts]
> +} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
>
>
> //===----------------------------------------------------------------------===//
>
> Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td?rev=355731&r1=355730&r2=355731&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td (original)
> +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td Fri Mar 8 12:58:11 2019
> @@ -69,6 +69,13 @@ def SIatomic_fmax : SDNode<"AMDGPUISD::A
> [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
> >;
>
> +// load_d16_{lo|hi} ptr, tied_input
> +def SIload_d16 : SDTypeProfile<1, 2, [
> + SDTCisPtrTy<1>,
> + SDTCisSameAs<0, 2>
> +]>;
> +
> +
> def SDTtbuffer_load : SDTypeProfile<1, 8,
> [ // vdata
> SDTCisVT<1, v4i32>, // rsrc
> @@ -187,6 +194,36 @@ def SIpc_add_rel_offset : SDNode<"AMDGPU
> SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
> >;
>
> +def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO",
> + SIload_d16,
> + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
> +>;
> +
> +def SIload_d16_lo_u8 : SDNode<"AMDGPUISD::LOAD_D16_LO_U8",
> + SIload_d16,
> + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
> +>;
> +
> +def SIload_d16_lo_i8 : SDNode<"AMDGPUISD::LOAD_D16_LO_I8",
> + SIload_d16,
> + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
> +>;
> +
> +def SIload_d16_hi : SDNode<"AMDGPUISD::LOAD_D16_HI",
> + SIload_d16,
> + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
> +>;
> +
> +def SIload_d16_hi_u8 : SDNode<"AMDGPUISD::LOAD_D16_HI_U8",
> + SIload_d16,
> + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
> +>;
> +
> +def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8",
> + SIload_d16,
> + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
> +>;
> +
> //===----------------------------------------------------------------------===//
> // ValueType helpers
> //===----------------------------------------------------------------------===//
> @@ -384,6 +421,51 @@ def si_setcc_uniform : PatFrag <
> return true;
> }]>;
>
> +//===----------------------------------------------------------------------===//
> +// SDNodes PatFrags for d16 loads
> +//===----------------------------------------------------------------------===//
> +
> +class LoadD16Frag <SDPatternOperator op> : PatFrag<(ops node:$ptr, node:$tied_in), (op node:$ptr, node:$tied_in)>;
> +class LocalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, LocalAddress;
> +class GlobalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, GlobalLoadAddress;
> +class PrivateLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, PrivateAddress;
> +class FlatLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, FlatLoadAddress;
> +
> +def load_d16_hi_local : LocalLoadD16 <SIload_d16_hi>;
> +def az_extloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_u8>;
> +def sextloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_i8>;
> +
> +def load_d16_hi_global : GlobalLoadD16 <SIload_d16_hi>;
> +def az_extloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_u8>;
> +def sextloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_i8>;
> +
> +def load_d16_hi_private : PrivateLoadD16 <SIload_d16_hi>;
> +def az_extloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_u8>;
> +def sextloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_i8>;
> +
> +def load_d16_hi_flat : FlatLoadD16 <SIload_d16_hi>;
> +def az_extloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_u8>;
> +def sextloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_i8>;
> +
> +
> +def load_d16_lo_local : LocalLoadD16 <SIload_d16_lo>;
> +def az_extloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_u8>;
> +def sextloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_i8>;
> +
> +def load_d16_lo_global : GlobalLoadD16 <SIload_d16_lo>;
> +def az_extloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_u8>;
> +def sextloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_i8>;
> +
> +def load_d16_lo_private : PrivateLoadD16 <SIload_d16_lo>;
> +def az_extloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_u8>;
> +def sextloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_i8>;
> +
> +def load_d16_lo_flat : FlatLoadD16 <SIload_d16_lo>;
> +def az_extloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_u8>;
> +def sextloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_i8>;
> +
> +
> +
> def lshr_rev : PatFrag <
> (ops node:$src1, node:$src0),
> (srl $src0, $src1)
>
> Modified: llvm/trunk/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll?rev=355731&r1=355730&r2=355731&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll (original)
> +++ llvm/trunk/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll Fri Mar 8 12:58:11 2019
> @@ -4,9 +4,8 @@
> ; combine and a generic insert_vector_elt combine.
>
> ; GCN-LABEL: {{^}}combine_loop:
> -; GCN: flat_load_ushort
> +; GCN: flat_load_short_d16_hi
> ; GCN: flat_store_short
> -; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
> define amdgpu_kernel void @combine_loop(i16* %arg) #0 {
> bb:
> br label %bb1
>
> Modified: llvm/trunk/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/chain-hi-to-lo.ll?rev=355731&r1=355730&r2=355731&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (original)
> +++ llvm/trunk/test/CodeGen/AMDGPU/chain-hi-to-lo.ll Fri Mar 8 12:58:11 2019
> @@ -1,4 +1,4 @@
> -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
> +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
>
> ; GCN-LABEL: {{^}}chain_hi_to_lo_private:
> ; GCN: buffer_load_ushort [[DST:v[0-9]+]], off, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offset:2
> @@ -175,3 +175,128 @@ entry:
> %loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
> ret void
> }
> +
> +; There is another instruction between the misordered instruction and
> +; the value dependent load, so a simple operand check is insufficient.
> +; GCN-LABEL: {{^}}chain_hi_to_lo_group_other_dep:
> +; GFX900: ds_read_u16_d16_hi v1, v0
> +; GFX900-NEXT: s_waitcnt lgkmcnt(0)
> +; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
> +; GFX900-NEXT: ds_read_u16_d16 v1, v0 offset:2
> +; GFX900-NEXT: s_waitcnt lgkmcnt(0)
> +; GFX900-NEXT: v_mov_b32_e32 v0, v1
> +; GFX900-NEXT: s_setpc_b64
> +define <2 x i16> @chain_hi_to_lo_group_other_dep(i16 addrspace(3)* %ptr) {
> +bb:
> + %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
> + %load_lo = load i16, i16 addrspace(3)* %gep_lo
> + %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
> + %load_hi = load i16, i16 addrspace(3)* %gep_hi
> + %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
> + %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
> + %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
> + ret <2 x i16> %result
> +}
> +
> +; The volatile operations aren't put on the same chain
> +; GCN-LABEL: {{^}}chain_hi_to_lo_group_other_dep_multi_chain:
> +; GFX900: ds_read_u16 v1, v0 offset:2
> +; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
> +; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
> +; GFX900-NEXT: s_waitcnt lgkmcnt(0)
> +; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
> +; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v1, v0
> +; GFX900-NEXT: s_setpc_b64
> +define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
> +bb:
> + %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
> + %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo
> + %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
> + %load_hi = load volatile i16, i16 addrspace(3)* %gep_hi
> + %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
> + %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
> + %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
> + ret <2 x i16> %result
> +}
> +
> +; GCN-LABEL: {{^}}chain_hi_to_lo_private_other_dep:
> +; GFX900: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen
> +; GFX900-NEXT: s_waitcnt vmcnt(0)
> +; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
> +; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:2
> +; GFX900-NEXT: s_waitcnt vmcnt(0)
> +; GFX900-NEXT: v_mov_b32_e32 v0, v1
> +; GFX900-NEXT: s_setpc_b64
> +define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
> +bb:
> + %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1
> + %load_lo = load i16, i16 addrspace(5)* %gep_lo
> + %gep_hi = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 0
> + %load_hi = load i16, i16 addrspace(5)* %gep_hi
> + %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
> + %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
> + %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
> + ret <2 x i16> %result
> +}
> +
> +; GCN-LABEL: {{^}}chain_hi_to_lo_global_other_dep:
> +; GFX900: global_load_ushort v2, v[0:1], off offset:2
> +; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off
> +; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
> +; GFX900-NEXT: s_waitcnt vmcnt(0)
> +; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
> +; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v2, v0
> +; GFX900-NEXT: s_setpc_b64
> +define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
> +bb:
> + %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1
> + %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo
> + %gep_hi = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 0
> + %load_hi = load volatile i16, i16 addrspace(1)* %gep_hi
> + %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
> + %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
> + %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
> + ret <2 x i16> %result
> +}
> +
> +; GCN-LABEL: {{^}}chain_hi_to_lo_flat_other_dep:
> +; GFX900: flat_load_ushort v2, v[0:1] offset:2
> +; GFX900-NEXT: flat_load_short_d16_hi v0, v[0:1]
> +; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
> +; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
> +; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
> +; GFX900-NEXT: v_bfi_b32 v0, v1, v2, v0
> +; GFX900-NEXT: s_setpc_b64
> +define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
> +bb:
> + %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1
> + %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo
> + %gep_hi = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 0
> + %load_hi = load volatile i16, i16 addrspace(0)* %gep_hi
> + %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
> + %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
> + %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
> + ret <2 x i16> %result
> +}
> +
> +; GCN-LABEL: {{^}}chain_hi_to_lo_group_may_alias_store:
> +; GFX900: v_mov_b32_e32 [[K:v[0-9]+]], 0x7b
> +; GFX900-NEXT: ds_read_u16 v3, v0
> +; GFX900-NEXT: ds_write_b16 v1, [[K]]
> +; GFX900-NEXT: ds_read_u16 v0, v0 offset:2
> +; GFX900-NEXT: s_waitcnt lgkmcnt(0)
> +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
> +; GFX900-NEXT: v_lshl_or_b32 v0, v3, 16, v0
> +; GFX900-NEXT: s_setpc_b64
> +define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) {
> +bb:
> + %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
> + %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
> + %load_hi = load i16, i16 addrspace(3)* %gep_hi
> + store i16 123, i16 addrspace(3)* %may.alias
> + %load_lo = load i16, i16 addrspace(3)* %gep_lo
> +
> + %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
> + %result = insertelement <2 x i16> %to.hi, i16 %load_lo, i32 0
> + ret <2 x i16> %result
> +}
>
> Modified: llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll?rev=355731&r1=355730&r2=355731&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll (original)
> +++ llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll Fri Mar 8 12:58:11 2019
> @@ -880,6 +880,21 @@ entry:
> ret <2 x i16> %build1
> }
>
> +; FIXME: Remove and
> +; GCN-LABEL: {{^}}load_local_v2i16_broadcast:
> +; GCN: ds_read_u16 [[LOAD:v[0-9]+]]
> +; GCN-NOT: ds_read
> +; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
> +; GFX9: v_lshl_or_b32 v0, [[LOAD]], 16, [[AND]]
> +define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
> +entry:
> + %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
> + %load0 = load i16, i16 addrspace(3)* %in
> + %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
> + %build1 = insertelement <2 x i16> %build0, i16 %load0, i32 1
> + ret <2 x i16> %build1
> +}
> +
> ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_side_effect:
> ; GFX900: ds_read_u16 [[LOAD0:v[0-9]+]], v0
> ; GFX900: ds_write_b16
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>
More information about the llvm-commits
mailing list