[PATCH] Add target hook for whether it is profitable to reduce load widths

Tom Stellard tom at stellard.net
Thu Dec 11 14:00:33 PST 2014


On Tue, Nov 25, 2014 at 02:10:36AM +0000, Matt Arsenault wrote:
> Add an option to disable optimization to shrink truncated larger type
> loads to smaller type loads. On SI this prevents using scalar load
> instructions in some cases, since there are no scalar extloads.
> 
> http://reviews.llvm.org/D6398
> 

LGTM.
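
One note for readers of the archive: the new hook gives a target a veto over DAG combines that shrink a load to a narrower memory type (e.g. folding a truncate into the load that feeds it). As a minimal sketch of how a target opts out, assuming a hypothetical XYZTargetLowering (illustrative only; the real SI logic is in the AMDGPU portion of the patch below):

  // Illustrative override -- XYZTargetLowering is not part of this patch.
  bool XYZTargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                                ISD::LoadExtType ExtTy,
                                                EVT NewVT) const {
    // Narrowing to 32 bits is fine; narrowing a 32-bit-or-wider load below
    // that would introduce an extending load, which this target wants to avoid.
    if (NewVT.getStoreSizeInBits() == 32)
      return true;
    return Load->getValueType(0).getStoreSizeInBits() < 32;
  }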

> Files:
>   include/llvm/Target/TargetLowering.h
>   lib/CodeGen/SelectionDAG/DAGCombiner.cpp
>   lib/Target/R600/AMDGPUISelLowering.cpp
>   lib/Target/R600/AMDGPUISelLowering.h
>   test/CodeGen/R600/no-shrink-extloads.ll
>   test/CodeGen/R600/store.ll

> Index: include/llvm/Target/TargetLowering.h
> ===================================================================
> --- include/llvm/Target/TargetLowering.h
> +++ include/llvm/Target/TargetLowering.h
> @@ -753,6 +753,16 @@
>    /// reduce runtime.
>    virtual bool ShouldShrinkFPConstant(EVT) const { return true; }
>  
> +  /// Return true if it is profitable to reduce the given load node to a
> +  /// smaller type.
> +  ///
> +  /// e.g. (i16 (trunc (i32 (load x)))) -> (i16 load x) should be performed
> +  virtual bool shouldReduceLoadWidth(SDNode *Load,
> +                                     ISD::LoadExtType ExtTy,
> +                                     EVT NewVT) const {
> +    return true;
> +  }
> +
>    /// When splitting a value of the specified type into parts, does the Lo
>    /// or Hi part come first?  This usually follows the endianness, except
>    /// for ppcf128, where the Hi part always comes first.
> Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
> ===================================================================
> --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
> +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
> @@ -6035,6 +6035,9 @@
>        LN0->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
>      return SDValue();
>  
> +  if (!TLI.shouldReduceLoadWidth(LN0, ExtType, ExtVT))
> +    return SDValue();
> +
>    EVT PtrType = N0.getOperand(1).getValueType();
>  
>    if (PtrType == MVT::Untyped || PtrType.isExtended())
> Index: lib/Target/R600/AMDGPUISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/AMDGPUISelLowering.cpp
> +++ lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -429,6 +429,29 @@
>    return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
>  }
>  
> +bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
> +                                                 ISD::LoadExtType,
> +                                                 EVT NewVT) const {
> +
> +  unsigned NewSize = NewVT.getStoreSizeInBits();
> +
> +  // If we are reducing to a 32-bit load, this is always better.
> +  if (NewSize == 32)
> +    return true;
> +
> +  EVT OldVT = N->getValueType(0);
> +  unsigned OldSize = OldVT.getStoreSizeInBits();
> +
> +  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
> +  // extloads, so doing one requires using a buffer_load. In cases where we
> +  // still couldn't use a scalar load, using the wider load shouldn't really
> +  // hurt anything.
> +
> +  // If the old load was already narrower than 32 bits, it already had to be
> +  // an extload, so there is no harm in reducing the width further.
> +  return (OldSize < 32);
> +}
> +
>  bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
>                                                     EVT CastTy) const {
>    if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
> Index: lib/Target/R600/AMDGPUISelLowering.h
> ===================================================================
> --- lib/Target/R600/AMDGPUISelLowering.h
> +++ lib/Target/R600/AMDGPUISelLowering.h
> @@ -124,6 +124,9 @@
>  
>    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
>    bool ShouldShrinkFPConstant(EVT VT) const override;
> +  bool shouldReduceLoadWidth(SDNode *Load,
> +                             ISD::LoadExtType ExtType,
> +                             EVT ExtVT) const override;
>  
>    bool isLoadBitCastBeneficial(EVT, EVT) const override;
>    SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
> Index: test/CodeGen/R600/no-shrink-extloads.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/no-shrink-extloads.ll
> @@ -0,0 +1,191 @@
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> +
> +declare i32 @llvm.r600.read.tidig.x() nounwind readnone
> +
> +; Make sure we don't turn the 32-bit argument load into a 16-bit
> +; load. There aren't extending scalar loads, so that would require
> +; using a buffer_load instruction.
> +
> +; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:
> +; SI: s_load_dword s
> +; SI: buffer_store_short v
> +define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
> +  %trunc = trunc i32 %arg to i16
> +  store i16 %trunc, i16 addrspace(1)* %out
> +  ret void
> +}
> +
> +; It should be OK (and probably performance neutral) to reduce this,
> +; but we don't know if the load is uniform yet.
> +
> +; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16:
> +; SI: buffer_load_dword v
> +; SI: buffer_store_short v
> +define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
> +  %gep.out = getelementptr i16 addrspace(1)* %out, i32 %tid
> +  %load = load i32 addrspace(1)* %gep.in
> +  %trunc = trunc i32 %load to i16
> +  store i16 %trunc, i16 addrspace(1)* %gep.out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8:
> +; SI: s_load_dword s
> +; SI: buffer_store_byte v
> +define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
> +  %trunc = trunc i32 %arg to i8
> +  store i8 %trunc, i8 addrspace(1)* %out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8:
> +; SI: buffer_load_dword v
> +; SI: buffer_store_byte v
> +define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
> +  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
> +  %load = load i32 addrspace(1)* %gep.in
> +  %trunc = trunc i32 %load to i8
> +  store i8 %trunc, i8 addrspace(1)* %gep.out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1:
> +; SI: s_load_dword s
> +; SI: buffer_store_byte v
> +define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
> +  %trunc = trunc i32 %arg to i1
> +  store i1 %trunc, i1 addrspace(1)* %out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1:
> +; SI: buffer_load_dword v
> +; SI: buffer_store_byte v
> +define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
> +  %gep.out = getelementptr i1 addrspace(1)* %out, i32 %tid
> +  %load = load i32 addrspace(1)* %gep.in
> +  %trunc = trunc i32 %load to i1
> +  store i1 %trunc, i1 addrspace(1)* %gep.out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
> +; SI: s_load_dword s
> +; SI: buffer_store_dword v
> +define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
> +  %trunc = trunc i64 %arg to i32
> +  store i32 %trunc, i32 addrspace(1)* %out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32:
> +; SI: buffer_load_dword v
> +; SI: buffer_store_dword v
> +define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
> +  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
> +  %load = load i64 addrspace(1)* %gep.in
> +  %trunc = trunc i64 %load to i32
> +  store i32 %trunc, i32 addrspace(1)* %gep.out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
> +; SI: s_load_dword s
> +; SI: buffer_store_dword v
> +define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
> +  %srl = lshr i64 %arg, 32
> +  %trunc = trunc i64 %srl to i32
> +  store i32 %trunc, i32 addrspace(1)* %out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32:
> +; SI: buffer_load_dword v
> +; SI: buffer_store_dword v
> +define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
> +  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
> +  %load = load i64 addrspace(1)* %gep.in
> +  %srl = lshr i64 %load, 32
> +  %trunc = trunc i64 %srl to i32
> +  store i32 %trunc, i32 addrspace(1)* %gep.out
> +  ret void
> +}
> +
> +; Might as well reduce to 8-bit loads.
> +; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8:
> +; SI: s_load_dword s
> +; SI: buffer_store_byte v
> +define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
> +  %trunc = trunc i16 %arg to i8
> +  store i8 %trunc, i8 addrspace(1)* %out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8:
> +; SI: buffer_load_ubyte v
> +; SI: buffer_store_byte v
> +define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %gep.in = getelementptr i16 addrspace(1)* %in, i32 %tid
> +  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
> +  %load = load i16 addrspace(1)* %gep.in
> +  %trunc = trunc i16 %load to i8
> +  store i8 %trunc, i8 addrspace(1)* %gep.out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
> +; SI: s_load_dword s
> +; SI: buffer_store_byte v
> +define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
> +  %srl = lshr i64 %arg, 32
> +  %trunc = trunc i64 %srl to i8
> +  store i8 %trunc, i8 addrspace(1)* %out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8:
> +; SI: buffer_load_dword v
> +; SI: buffer_store_byte v
> +define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
> +  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
> +  %load = load i64 addrspace(1)* %gep.in
> +  %srl = lshr i64 %load, 32
> +  %trunc = trunc i64 %srl to i8
> +  store i8 %trunc, i8 addrspace(1)* %gep.out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
> +; SI: s_load_dword s
> +; SI: buffer_store_byte v
> +define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
> +  %trunc = trunc i64 %arg to i8
> +  store i8 %trunc, i8 addrspace(1)* %out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8:
> +; SI: buffer_load_dword v
> +; SI: buffer_store_byte v
> +define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
> +  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
> +  %load = load i64 addrspace(1)* %gep.in
> +  %trunc = trunc i64 %load to i8
> +  store i8 %trunc, i8 addrspace(1)* %gep.out
> +  ret void
> +}
> Index: test/CodeGen/R600/store.ll
> ===================================================================
> --- test/CodeGen/R600/store.ll
> +++ test/CodeGen/R600/store.ll
> @@ -1,6 +1,6 @@
> -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK --check-prefix=FUNC %s
> -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=CM-CHECK --check-prefix=FUNC %s
> -; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=FUNC %s
> +; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI-CHECK -check-prefix=FUNC %s
> +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG-CHECK -check-prefix=FUNC %s
> +; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM-CHECK -check-prefix=FUNC %s
>  
>  ;===------------------------------------------------------------------------===;
>  ; Global Address Space
> @@ -17,16 +17,18 @@
>  ; i8 store
>  ; EG-CHECK-LABEL: {{^}}store_i8:
>  ; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
> -; EG-CHECK: VTX_READ_8 [[VAL:T[0-9]\.X]], [[VAL]]
> +
>  ; IG 0: Get the byte index and truncate the value
> -; EG-CHECK: AND_INT T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
> -; EG-CHECK-NEXT: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.y
> +; EG-CHECK: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
> +; EG-CHECK: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
> +; EG-CHECK: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
>  ; EG-CHECK-NEXT: 3(4.203895e-45), 255(3.573311e-43)
> +
> +
>  ; IG 1: Truncate the calculated shift amount for the mask
> -; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
> -; EG-CHECK-NEXT: 3
> +
>  ; IG 2: Shift the value and the mask
> -; EG-CHECK: LSHL T[[RW_GPR]].X, T{{[0-9]}}.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]]
> +; EG-CHECK: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
>  ; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
>  ; EG-CHECK-NEXT: 255
>  ; IG 3: Initialize the Y and Z channels to zero
> @@ -46,16 +48,21 @@
>  ; i16 store
>  ; EG-CHECK-LABEL: {{^}}store_i16:
>  ; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
> -; EG-CHECK: VTX_READ_16 [[VAL:T[0-9]\.X]], [[VAL]]
> +
>  ; IG 0: Get the byte index and truncate the value
> -; EG-CHECK: AND_INT T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
> -; EG-CHECK: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.y
> +
> +
> +; EG-CHECK: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
> +; EG-CHECK-NEXT: 3(4.203895e-45),
> +
> +; EG-CHECK: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
> +; EG-CHECK: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
> +
>  ; EG-CHECK-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
>  ; IG 1: Truncate the calculated shift amount for the mask
> -; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
> -; EG-CHECK: 3
> +
>  ; IG 2: Shift the value and the mask
> -; EG-CHECK: LSHL T[[RW_GPR]].X, T{{[0-9]}}.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]]
> +; EG-CHECK: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
>  ; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
>  ; EG-CHECK-NEXT: 65535
>  ; IG 3: Initialize the Y and Z channels to zero
