[llvm] e65e6d0 - AMDGPU/GlobalISel: Legalize TFE image result loads

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 5 09:40:33 PST 2020


Author: Matt Arsenault
Date: 2020-02-05T12:40:20-05:00
New Revision: e65e6d052ed438843c1e8bc00524c766505966e3

URL: https://github.com/llvm/llvm-project/commit/e65e6d052ed438843c1e8bc00524c766505966e3
DIFF: https://github.com/llvm/llvm-project/commit/e65e6d052ed438843c1e8bc00524c766505966e3.diff

LOG: AMDGPU/GlobalISel: Legalize TFE image result loads

Rewrite the result register pair into the expected single register
format in the legalizer.

I'm also operating under the assumption that TFE doesn't apply to
stores or atomics, but I don't know whether that is actually the case.
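
For reference, the IR-level contract being legalized here: with TFE
enabled, the image load intrinsics return a two-element struct whose
second i32 member is the texture-fail status word. A minimal sketch,
drawn from the new tests below (the surrounding value names are
illustrative):

  %res = call { float, i32 } @llvm.amdgcn.image.load.2d.sl_f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
  %tex = extractvalue { float, i32 } %res, 0
  %tfe = extractvalue { float, i32 } %res, 1

The legalizer rewrites the pair of result registers into one wide
register holding the loaded data plus one extra status dword, and
recovers the two values with G_EXTRACT afterwards.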

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 3f4dce23ce43..155e385fee3e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2914,26 +2914,50 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
   return true;
 }
 
+// FIXME: Just vector trunc should be sufficient, but legalization currently
+// broken.
+static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
+                                  Register WideDstReg) {
+  const LLT S32 = LLT::scalar(32);
+  const LLT S16 = LLT::scalar(16);
+
+  auto Unmerge = B.buildUnmerge(S32, WideDstReg);
+
+  int NumOps = Unmerge->getNumOperands() - 1;
+  SmallVector<Register, 4> RemergeParts(NumOps);
+  for (int I = 0; I != NumOps; ++I)
+    RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
+
+  B.buildBuildVector(DstReg, RemergeParts);
+}
+
 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
     MachineInstr &MI, MachineIRBuilder &B,
     GISelChangeObserver &Observer,
     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
+  bool IsTFE = MI.getNumExplicitDefs() == 2;
+
   // We are only processing the operands of d16 image operations on subtargets
-  // that use the unpacked register layout.
-  if (!ST.hasUnpackedD16VMem())
+  // that use the unpacked register layout, or need to repack the TFE result.
+
+  // TODO: Need to handle a16 images too
+  // TODO: Do we need to guard against already legalized intrinsics?
+  if (!IsTFE && !ST.hasUnpackedD16VMem())
     return true;
 
   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
 
-  if (BaseOpcode->Atomic) // No d16 atomics
+  if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
     return true;
 
+  B.setInstr(MI);
+
   MachineRegisterInfo *MRI = B.getMRI();
   const LLT S32 = LLT::scalar(32);
   const LLT S16 = LLT::scalar(16);
 
-  if (BaseOpcode->Store) {
+  if (BaseOpcode->Store) { // No TFE for stores?
     Register VData = MI.getOperand(1).getReg();
     LLT Ty = MRI->getType(VData);
     if (!Ty.isVector() || Ty.getElementType() != S16)
@@ -2947,9 +2971,66 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
     return true;
   }
 
-  // Must be an image load.
   Register DstReg = MI.getOperand(0).getReg();
   LLT Ty = MRI->getType(DstReg);
+  const bool IsD16 = Ty.getScalarType() == S16;
+  const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
+
+  if (IsTFE) {
+    // In the IR, TFE is supposed to be used with a 2-element struct return
+    // type. The instruction really returns these two values in one contiguous
+    // register, with one additional dword beyond the loaded data. Rewrite the
+    // return type to use a single register result.
+    Register Dst1Reg = MI.getOperand(1).getReg();
+    if (MRI->getType(Dst1Reg) != S32)
+      return false;
+
+    // TODO: Make sure the TFE operand bit is set.
+
+    // The raw dword aligned data component of the load. The only legal cases
+    // where this matters should be when using the packed D16 format, for
+    // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
+    LLT RoundedTy;
+    LLT TFETy;
+
+    if (IsD16 && ST.hasUnpackedD16VMem()) {
+      RoundedTy = LLT::scalarOrVector(NumElts, 32);
+      TFETy = LLT::vector(NumElts + 1, 32);
+    } else {
+      unsigned EltSize = Ty.getScalarSizeInBits();
+      unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
+      unsigned RoundedSize = 32 * RoundedElts;
+      RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
+      TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
+    }
+
+    Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
+    Observer.changingInstr(MI);
+
+    MI.getOperand(0).setReg(TFEReg);
+    MI.RemoveOperand(1);
+
+    Observer.changedInstr(MI);
+
+    // Insert after the instruction.
+    B.setInsertPt(*MI.getParent(), ++MI.getIterator());
+
+    // TODO: Should probably unmerge to s32 pieces and repack instead of using
+    // extracts.
+    if (RoundedTy == Ty) {
+      B.buildExtract(DstReg, TFEReg, 0);
+    } else {
+      // If we had to round the data type (i.e. this was a <3 x s16>), do the
+      // weird extract separately.
+      auto DataPart = B.buildExtract(RoundedTy, TFEReg, 0);
+      B.buildExtract(DstReg, DataPart, 0);
+    }
+
+    B.buildExtract(Dst1Reg, TFEReg, RoundedTy.getSizeInBits());
+    return true;
+  }
+
+  // Must be an image load.
   if (!Ty.isVector() || Ty.getElementType() != S16)
     return true;
 
@@ -2962,16 +3043,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
   MI.getOperand(0).setReg(WideDstReg);
   Observer.changedInstr(MI);
 
-  // FIXME: Just vector trunc should be sufficent, but legalization currently
-  // broken.
-  auto Unmerge = B.buildUnmerge(S32, WideDstReg);
-
-  int NumOps = Unmerge->getNumOperands() - 1;
-  SmallVector<Register, 4> RemergeParts(NumOps);
-  for (int I = 0; I != NumOps; ++I)
-    RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
-
-  B.buildBuildVector(DstReg, RemergeParts);
+  repackUnpackedD16Load(B, DstReg, WideDstReg);
   return true;
 }
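
In MIR terms, the net effect for an s32 TFE load is roughly the
following after legalization (virtual register names are illustrative;
the exact output is what the new test below checks):

  %load:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, %s, %t, %rsrc, 1, 0
  %data:_(s32) = G_EXTRACT %load(<2 x s32>), 0
  %status:_(s32) = G_EXTRACT %load(<2 x s32>), 32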
 

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
index f1c66c5e10b5..349a9e8bce8f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
@@ -218,9 +218,234 @@ define amdgpu_ps <4 x half> @image_load_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32
   ret <4 x half> %tex
 }
 
+define amdgpu_ps half @image_load_tfe_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+  ; UNPACKED-LABEL: name: image_load_tfe_f16
+  ; UNPACKED: bb.1 (%ir-block.0):
+  ; UNPACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; UNPACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; UNPACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; UNPACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; UNPACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; UNPACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; UNPACKED:   [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
+  ; UNPACKED:   [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[INT]](<2 x s32>), 0
+  ; UNPACKED:   [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[INT]](<2 x s32>), 32
+  ; UNPACKED:   G_STORE [[EXTRACT1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; UNPACKED:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[EXTRACT]](s32)
+  ; UNPACKED:   $vgpr0 = COPY [[COPY10]](s32)
+  ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
+  ; PACKED-LABEL: name: image_load_tfe_f16
+  ; PACKED: bb.1 (%ir-block.0):
+  ; PACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; PACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; PACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; PACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; PACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; PACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; PACKED:   [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
+  ; PACKED:   [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INT]](<2 x s32>), 0
+  ; PACKED:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[EXTRACT]](<2 x s16>)
+  ; PACKED:   [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[INT]](<2 x s32>), 32
+  ; PACKED:   G_STORE [[EXTRACT1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; PACKED:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
+  ; PACKED:   $vgpr0 = COPY [[COPY10]](s32)
+  ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
+  %res = call { half, i32 } @llvm.amdgcn.image.load.2d.sl_f16i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
+  %tex = extractvalue { half, i32 } %res, 0
+  %tfe = extractvalue { half, i32 } %res, 1
+  store i32 %tfe, i32 addrspace(1)* undef
+  ret half %tex
+}
+
+define amdgpu_ps <2 x half> @image_load_tfe_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+  ; UNPACKED-LABEL: name: image_load_tfe_v2f16
+  ; UNPACKED: bb.1 (%ir-block.0):
+  ; UNPACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; UNPACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; UNPACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; UNPACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; UNPACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; UNPACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; UNPACKED:   [[INT:%[0-9]+]]:_(<3 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 3, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
+  ; UNPACKED:   [[EXTRACT:%[0-9]+]]:_(<2 x s32>) = G_EXTRACT [[INT]](<3 x s32>), 0
+  ; UNPACKED:   [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[EXTRACT]](<2 x s32>), 0
+  ; UNPACKED:   [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[INT]](<3 x s32>), 64
+  ; UNPACKED:   G_STORE [[EXTRACT2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; UNPACKED:   $vgpr0 = COPY [[EXTRACT1]](<2 x s16>)
+  ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
+  ; PACKED-LABEL: name: image_load_tfe_v2f16
+  ; PACKED: bb.1 (%ir-block.0):
+  ; PACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; PACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; PACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; PACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; PACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; PACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; PACKED:   [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 3, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
+  ; PACKED:   [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INT]](<2 x s32>), 0
+  ; PACKED:   [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[INT]](<2 x s32>), 32
+  ; PACKED:   G_STORE [[EXTRACT1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; PACKED:   $vgpr0 = COPY [[EXTRACT]](<2 x s16>)
+  ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
+  %res = call { <2 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f16i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
+  %tex = extractvalue { <2 x half>, i32 } %res, 0
+  %tfe = extractvalue { <2 x half>, i32 } %res, 1
+  store i32 %tfe, i32 addrspace(1)* undef
+  ret <2 x half> %tex
+}
+
+define amdgpu_ps <3 x half> @image_load_tfe_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+  ; UNPACKED-LABEL: name: image_load_tfe_v3f16
+  ; UNPACKED: bb.1 (%ir-block.0):
+  ; UNPACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; UNPACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; UNPACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; UNPACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; UNPACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; UNPACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; UNPACKED:   [[INT:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8)
+  ; UNPACKED:   [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[INT]](<4 x s32>), 0
+  ; UNPACKED:   [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[EXTRACT]](<3 x s32>), 0
+  ; UNPACKED:   [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[INT]](<4 x s32>), 96
+  ; UNPACKED:   G_STORE [[EXTRACT2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; UNPACKED:   [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+  ; UNPACKED:   [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+  ; UNPACKED:   [[EXTRACT3:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0
+  ; UNPACKED:   [[EXTRACT4:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32
+  ; UNPACKED:   $vgpr0 = COPY [[EXTRACT3]](<2 x s16>)
+  ; UNPACKED:   $vgpr1 = COPY [[EXTRACT4]](<2 x s16>)
+  ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+  ; PACKED-LABEL: name: image_load_tfe_v3f16
+  ; PACKED: bb.1 (%ir-block.0):
+  ; PACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; PACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; PACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; PACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; PACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; PACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; PACKED:   [[INT:%[0-9]+]]:_(<3 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8)
+  ; PACKED:   [[EXTRACT:%[0-9]+]]:_(<4 x s16>) = G_EXTRACT [[INT]](<3 x s32>), 0
+  ; PACKED:   [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[EXTRACT]](<4 x s16>), 0
+  ; PACKED:   [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[INT]](<3 x s32>), 64
+  ; PACKED:   G_STORE [[EXTRACT2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; PACKED:   [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+  ; PACKED:   [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+  ; PACKED:   [[EXTRACT3:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0
+  ; PACKED:   [[EXTRACT4:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32
+  ; PACKED:   $vgpr0 = COPY [[EXTRACT3]](<2 x s16>)
+  ; PACKED:   $vgpr1 = COPY [[EXTRACT4]](<2 x s16>)
+  ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+  %res = call { <3 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f16i32s.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
+  %tex = extractvalue { <3 x half>, i32 } %res, 0
+  %tfe = extractvalue { <3 x half>, i32 } %res, 1
+  store i32 %tfe, i32 addrspace(1)* undef
+  ret <3 x half> %tex
+}
+
+define amdgpu_ps <4 x half> @image_load_tfe_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+  ; UNPACKED-LABEL: name: image_load_tfe_v4f16
+  ; UNPACKED: bb.1 (%ir-block.0):
+  ; UNPACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; UNPACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; UNPACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; UNPACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; UNPACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; UNPACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; UNPACKED:   [[INT:%[0-9]+]]:_(<5 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 15, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
+  ; UNPACKED:   [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[INT]](<5 x s32>), 0
+  ; UNPACKED:   [[EXTRACT1:%[0-9]+]]:_(<4 x s16>) = G_EXTRACT [[EXTRACT]](<4 x s32>), 0
+  ; UNPACKED:   [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[INT]](<5 x s32>), 128
+  ; UNPACKED:   G_STORE [[EXTRACT2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; UNPACKED:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[EXTRACT1]](<4 x s16>)
+  ; UNPACKED:   $vgpr0 = COPY [[UV]](<2 x s16>)
+  ; UNPACKED:   $vgpr1 = COPY [[UV1]](<2 x s16>)
+  ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+  ; PACKED-LABEL: name: image_load_tfe_v4f16
+  ; PACKED: bb.1 (%ir-block.0):
+  ; PACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; PACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; PACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; PACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; PACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; PACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; PACKED:   [[INT:%[0-9]+]]:_(<3 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 15, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
+  ; PACKED:   [[EXTRACT:%[0-9]+]]:_(<4 x s16>) = G_EXTRACT [[INT]](<3 x s32>), 0
+  ; PACKED:   [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[INT]](<3 x s32>), 64
+  ; PACKED:   G_STORE [[EXTRACT1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; PACKED:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[EXTRACT]](<4 x s16>)
+  ; PACKED:   $vgpr0 = COPY [[UV]](<2 x s16>)
+  ; PACKED:   $vgpr1 = COPY [[UV1]](<2 x s16>)
+  ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+  %res = call { <4 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f16i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
+  %tex = extractvalue { <4 x half>, i32 } %res, 0
+  %tfe = extractvalue { <4 x half>, i32 } %res, 1
+  store i32 %tfe, i32 addrspace(1)* undef
+  ret <4 x half> %tex
+}
+
 declare half @llvm.amdgcn.image.load.2d.f16.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
 declare <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
 declare <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
 declare <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare { half, i32 } @llvm.amdgcn.image.load.2d.sl_f16i32s.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare { <2 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f16i32s.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare { <3 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f16i32s.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare { <4 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f16i32s.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
 
 attributes #0 = { nounwind readonly }

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
new file mode 100644
index 000000000000..638d428bca63
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
@@ -0,0 +1,235 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -o - %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_ps float @image_load_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+  ; GCN-LABEL: name: image_load_f32
+  ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; GCN:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GCN:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GCN:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GCN:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GCN:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GCN:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
+  ; GCN:   $vgpr0 = COPY [[INT]](s32)
+  ; GCN:   SI_RETURN_TO_EPILOG implicit $vgpr0
+  %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret float %tex
+}
+
+define amdgpu_ps <2 x float> @image_load_v2f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+  ; GCN-LABEL: name: image_load_v2f32
+  ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; GCN:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GCN:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GCN:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GCN:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GCN:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GCN:   [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 3, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
+  ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>)
+  ; GCN:   $vgpr0 = COPY [[UV]](s32)
+  ; GCN:   $vgpr1 = COPY [[UV1]](s32)
+  ; GCN:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+  %tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <2 x float> %tex
+}
+
+define amdgpu_ps <3 x float> @image_load_v3f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+  ; GCN-LABEL: name: image_load_v3f32
+  ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; GCN:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GCN:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GCN:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GCN:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GCN:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GCN:   [[INT:%[0-9]+]]:_(<3 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
+  ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<3 x s32>)
+  ; GCN:   $vgpr0 = COPY [[UV]](s32)
+  ; GCN:   $vgpr1 = COPY [[UV1]](s32)
+  ; GCN:   $vgpr2 = COPY [[UV2]](s32)
+  ; GCN:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  %tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <3 x float> %tex
+}
+
+define amdgpu_ps <4 x float> @image_load_v4f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+  ; GCN-LABEL: name: image_load_v4f32
+  ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; GCN:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GCN:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GCN:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GCN:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GCN:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GCN:   [[INT:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 15, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<4 x s32>)
+  ; GCN:   $vgpr0 = COPY [[UV]](s32)
+  ; GCN:   $vgpr1 = COPY [[UV1]](s32)
+  ; GCN:   $vgpr2 = COPY [[UV2]](s32)
+  ; GCN:   $vgpr3 = COPY [[UV3]](s32)
+  ; GCN:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %tex
+}
+
+define amdgpu_ps float @image_load_tfe_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+  ; GCN-LABEL: name: image_load_tfe_f32
+  ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; GCN:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GCN:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GCN:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GCN:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GCN:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GCN:   [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
+  ; GCN:   [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[INT]](<2 x s32>), 0
+  ; GCN:   [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[INT]](<2 x s32>), 32
+  ; GCN:   G_STORE [[EXTRACT1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; GCN:   $vgpr0 = COPY [[EXTRACT]](s32)
+  ; GCN:   SI_RETURN_TO_EPILOG implicit $vgpr0
+  %res = call { float, i32 } @llvm.amdgcn.image.load.2d.sl_f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
+  %tex = extractvalue { float, i32 } %res, 0
+  %tfe = extractvalue { float, i32 } %res, 1
+  store i32 %tfe, i32 addrspace(1)* undef
+  ret float %tex
+}
+
+define amdgpu_ps <2 x float> @image_load_tfe_v2f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+  ; GCN-LABEL: name: image_load_tfe_v2f32
+  ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; GCN:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GCN:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GCN:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GCN:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GCN:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GCN:   [[INT:%[0-9]+]]:_(<3 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 3, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
+  ; GCN:   [[EXTRACT:%[0-9]+]]:_(<2 x s32>) = G_EXTRACT [[INT]](<3 x s32>), 0
+  ; GCN:   [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[INT]](<3 x s32>), 64
+  ; GCN:   G_STORE [[EXTRACT1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[EXTRACT]](<2 x s32>)
+  ; GCN:   $vgpr0 = COPY [[UV]](s32)
+  ; GCN:   $vgpr1 = COPY [[UV1]](s32)
+  ; GCN:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+  %res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
+  %tex = extractvalue { <2 x float>, i32 } %res, 0
+  %tfe = extractvalue { <2 x float>, i32 } %res, 1
+  store i32 %tfe, i32 addrspace(1)* undef
+  ret <2 x float> %tex
+}
+
+define amdgpu_ps <3 x float> @image_load_tfe_v3f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+  ; GCN-LABEL: name: image_load_tfe_v3f32
+  ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; GCN:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GCN:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GCN:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GCN:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GCN:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GCN:   [[INT:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
+  ; GCN:   [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[INT]](<4 x s32>), 0
+  ; GCN:   [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[INT]](<4 x s32>), 96
+  ; GCN:   G_STORE [[EXTRACT1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>)
+  ; GCN:   $vgpr0 = COPY [[UV]](s32)
+  ; GCN:   $vgpr1 = COPY [[UV1]](s32)
+  ; GCN:   $vgpr2 = COPY [[UV2]](s32)
+  ; GCN:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  %res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
+  %tex = extractvalue { <3 x float>, i32 } %res, 0
+  %tfe = extractvalue { <3 x float>, i32 } %res, 1
+  store i32 %tfe, i32 addrspace(1)* undef
+  ret <3 x float> %tex
+}
+
+define amdgpu_ps <4 x float> @image_load_tfe_v4f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+  ; GCN-LABEL: name: image_load_tfe_v4f32
+  ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
+  ; GCN:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GCN:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GCN:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GCN:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GCN:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GCN:   [[INT:%[0-9]+]]:_(<5 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 15, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GCN:   [[EXTRACT:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT [[INT]](<5 x s32>), 0
+  ; GCN:   [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[INT]](<5 x s32>), 128
+  ; GCN:   G_STORE [[EXTRACT1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[EXTRACT]](<4 x s32>)
+  ; GCN:   $vgpr0 = COPY [[UV]](s32)
+  ; GCN:   $vgpr1 = COPY [[UV1]](s32)
+  ; GCN:   $vgpr2 = COPY [[UV2]](s32)
+  ; GCN:   $vgpr3 = COPY [[UV3]](s32)
+  ; GCN:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
+  %tex = extractvalue { <4 x float>, i32 } %res, 0
+  %tfe = extractvalue { <4 x float>, i32 } %res, 1
+  store i32 %tfe, i32 addrspace(1)* undef
+  ret <4 x float> %tex
+}
+
+declare float @llvm.amdgcn.image.load.2d.f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare { float, i32 } @llvm.amdgcn.image.load.2d.sl_f32i32s.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+
+attributes #0 = { nounwind readonly }




More information about the llvm-commits mailing list