[llvm] r356399 - [AMDGPU] Asm/disasm clamp modifier on vop3 int arithmetic

Tue Mar 19 08:26:42 PDT 2019

Hi Tim,

On 2019-03-18 8:35 p.m., Tim Renouf via llvm-commits wrote:
> Author: tpr
> Date: Mon Mar 18 12:35:44 2019
> New Revision: 356399
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=356399&view=rev
> Log:
> [AMDGPU] Asm/disasm clamp modifier on vop3 int arithmetic

This broke a bunch of piglit tests with radeonsi on Bonaire; see the example
assertion failure output and backtrace below. The corresponding IR is
attached.

shader_runner: ../include/llvm/CodeGen/MachineOperand.h:526: int64_t llvm::MachineOperand::getImm() const: Assertion `isImm() && "Wrong MachineOperand accessor"' failed.

Thread 8 "shader_run:sh4" received signal SIGABRT, Aborted.
[Switching to Thread 0x7fffe2ffd700 (LWP 6300)]
__GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
50	../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
(gdb) bt
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1  0x00007ffff7268535 in __GI_abort () at abort.c:79
#2  0x00007ffff726840f in __assert_fail_base (fmt=0x7ffff73caee0 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x7ffff2999573 "isImm() && \"Wrong MachineOperand accessor\"", file=0x7ffff225a02a "../include/llvm/CodeGen/MachineOperand.h", line=526, 
    function=<optimized out>) at assert.c:92
#3  0x00007ffff72760f2 in __GI___assert_fail (assertion=0x7ffff2999573 "isImm() && \"Wrong MachineOperand accessor\"", file=0x7ffff225a02a "../include/llvm/CodeGen/MachineOperand.h", line=526, 
    function=0x7ffff21f8381 "int64_t llvm::MachineOperand::getImm() const") at assert.c:101
#4  0x00007ffff4892445 in llvm::MachineOperand::getImm (this=<optimized out>) at ../include/llvm/CodeGen/MachineOperand.h:526
#5  llvm::SIInstrInfo::hasModifiersSet (this=<optimized out>, MI=..., OpName=55) at ../lib/Target/AMDGPU/SIInstrInfo.cpp:2581
#6  llvm::SIInstrInfo::canShrink (this=<optimized out>, MI=..., MRI=...) at ../lib/Target/AMDGPU/SIInstrInfo.cpp:2620
#7  0x00007ffff48fd9a8 in (anonymous namespace)::SIShrinkInstructions::runOnMachineFunction (this=<optimized out>, MF=...) at ../lib/Target/AMDGPU/SIShrinkInstructions.cpp:603
#8  0x00007ffff368358a in llvm::MachineFunctionPass::runOnFunction (this=0x555555768ce0, F=...) at ../lib/CodeGen/MachineFunctionPass.cpp:73
#9  0x00007ffff34ad883 in llvm::FPPassManager::runOnFunction (this=<optimized out>, F=...) at ../lib/IR/LegacyPassManager.cpp:1643
#10 0x00007ffff4227125 in (anonymous namespace)::CGPassManager::RunPassOnSCC (this=<optimized out>, P=0x55555575e4a0, CG=..., CurSCC=..., CallGraphUpToDate=<optimized out>, DevirtualizedCall=<optimized out>) at ../lib/Analysis/CallGraphSCCPass.cpp:177
#11 (anonymous namespace)::CGPassManager::RunAllPassesOnSCC (this=<optimized out>, CG=..., CurSCC=..., DevirtualizedCall=<optimized out>) at ../lib/Analysis/CallGraphSCCPass.cpp:441
#12 (anonymous namespace)::CGPassManager::runOnModule (this=<optimized out>, M=...) at ../lib/Analysis/CallGraphSCCPass.cpp:497
#13 0x00007ffff34ae041 in (anonymous namespace)::MPPassManager::runOnModule (this=<optimized out>, M=...) at ../lib/IR/LegacyPassManager.cpp:1743
#14 llvm::legacy::PassManagerImpl::run (this=0x555555749230, M=...) at ../lib/IR/LegacyPassManager.cpp:1856
#15 0x00007ffff534baf6 in ac_compile_module_to_binary (p=p@entry=0x555555712ff0, module=module@entry=0x7fffd0006300, binary=binary@entry=0x7fffd0000be0) at /home/daenzer/src/llvm-git/llvm/include/llvm/IR/Module.h:914
#16 0x00007ffff5307e72 in si_llvm_compile (M=M@entry=0x7fffd0006300, binary=binary@entry=0x7fffd0000be0, compiler=compiler@entry=0x5555556353d8, debug=debug@entry=0x555555bdc1e0, less_optimized=less_optimized@entry=false)
    at ../src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c:103
#17 0x00007ffff52f91ac in si_compile_llvm (sscreen=sscreen@entry=0x555555634c00, binary=binary@entry=0x7fffd0000be0, conf=conf@entry=0x7fffd0000c30, compiler=compiler@entry=0x5555556353d8, mod=0x7fffd0006300, debug=debug@entry=0x555555bdc1e0, processor=2, 
    name=0x7ffff59308fe "Geometry Shader", less_optimized=false) at ../src/gallium/drivers/radeonsi/si_shader.c:5561
#18 0x00007ffff52fab5a in si_compile_tgsi_shader (sscreen=sscreen@entry=0x555555634c00, compiler=compiler@entry=0x5555556353d8, shader=shader@entry=0x7fffd0000b20, debug=debug@entry=0x555555bdc1e0) at ../src/gallium/drivers/radeonsi/si_shader.c:6698
#19 0x00007ffff52b4c5f in si_init_shader_selector_async (job=job@entry=0x555555bdc1c0, thread_index=thread_index@entry=4) at ../src/gallium/drivers/radeonsi/si_state_shaders.c:2086
#20 0x00007ffff570ee4a in util_queue_thread_func (input=input@entry=0x555555631c90) at ../src/util/u_queue.c:286
#21 0x00007ffff570e8d8 in impl_thrd_routine (p=<optimized out>) at ../include/c11/threads_posix.h:87
#22 0x00007ffff67bdfa3 in start_thread (arg=<optimized out>) at pthread_create.c:486
#23 0x00007ffff733f82f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

-- 
Earthling Michel Dänzer               |              https://www.amd.com
Libre software enthusiast             |             Mesa and X developer
-------------- next part --------------
; ModuleID = 'mesa-shader'
source_filename = "mesa-shader"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
target triple = "amdgcn--"

; Declarations of the llvm.amdgcn.* intrinsics used by this module.
; Note: @llvm.amdgcn.kill is declared here but is not called anywhere in the
; body of @wrapper.

; Function Attrs: nounwind readnone
declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #0

; Function Attrs: nounwind readonly
declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1 immarg, i1 immarg) #1

; Function Attrs: nounwind
declare void @llvm.amdgcn.kill(i1) #2

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg, i1 immarg) #3

; Function Attrs: nounwind
declare void @llvm.amdgcn.s.sendmsg(i32 immarg, i32) #2

; Reproducer attached to the report above: a single amdgpu_gs function. Per the
; report, this is the IR corresponding to the MachineOperand::getImm()
; assertion failure and backtrace.
;
; The function takes 14 unnamed arguments (%0-%13). The body only uses
; %0, %2, %4, %5, %6, %7 and %9; the remaining arguments are unused.
define amdgpu_gs void @wrapper([0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x float] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #4 {
main_body:
  ; --- Set up the three <4 x i32> operands passed to the buffer intrinsics ---
  ; %16: element 1 of the array pointed to by %0 (source of the buffer.load calls).
  ; %22: element 2 of the same array, with lane 1 OR'ed with a constant and
  ;      lanes 2 and 3 replaced by the constants 64 and 11173804
  ;      (destination of the tbuffer.store calls).
  ; %23: constant vector with lane 0 set to the pointer %2 truncated to i32
  ;      (source of the s.buffer.load calls).
  %14 = ptrtoint [0 x float] addrspace(6)* %2 to i32
  %15 = getelementptr inbounds [0 x <4 x i32>], [0 x <4 x i32>] addrspace(6)* %0, i32 0, i32 1, !amdgpu.uniform !0
  %16 = load <4 x i32>, <4 x i32> addrspace(6)* %15, align 16, !invariant.load !0, !alias.scope !1, !noalias !4
  %17 = getelementptr inbounds [0 x <4 x i32>], [0 x <4 x i32>] addrspace(6)* %0, i32 0, i32 2, !amdgpu.uniform !0
  %18 = load <4 x i32>, <4 x i32> addrspace(6)* %17, align 16, !invariant.load !0, !alias.scope !1, !noalias !4
  %19 = extractelement <4 x i32> %18, i32 1
  %20 = or i32 %19, -2141192192
  %21 = shufflevector <4 x i32> %18, <4 x i32> <i32 undef, i32 undef, i32 64, i32 11173804>, <4 x i32> <i32 0, i32 undef, i32 6, i32 7>
  %22 = insertelement <4 x i32> %21, i32 %20, i32 1
  %23 = insertelement <4 x i32> <i32 undef, i32 0, i32 80, i32 163756>, i32 %14, i32 0
  ; --- 64-bit values assembled from pairs of 32-bit s.buffer.load results ---
  ; Each i64 below is built from two loads at consecutive 4-byte offsets,
  ; packed into a <2 x i32> and bitcast to i64.
  ; %30: offsets 0 and 4.
  %24 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %23, i32 0, i32 0) #0
  %25 = bitcast float %24 to i32
  %26 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %23, i32 4, i32 0) #0
  %27 = bitcast float %26 to i32
  %28 = insertelement <2 x i32> undef, i32 %25, i32 0
  %29 = insertelement <2 x i32> %28, i32 %27, i32 1
  %30 = bitcast <2 x i32> %29 to i64
  ; %37: offsets 32 and 36; used as the divisor of all three srem operations.
  %31 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %23, i32 32, i32 0) #0
  %32 = bitcast float %31 to i32
  %33 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %23, i32 36, i32 0) #0
  %34 = bitcast float %33 to i32
  %35 = insertelement <2 x i32> undef, i32 %32, i32 0
  %36 = insertelement <2 x i32> %35, i32 %34, i32 1
  %37 = bitcast <2 x i32> %36 to i64
  ; First 64-bit signed remainder: %30 srem %37.
  %38 = srem i64 %30, %37
  ; %45: offsets 8 and 12.
  %39 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %23, i32 8, i32 0) #0
  %40 = bitcast float %39 to i32
  %41 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %23, i32 12, i32 0) #0
  %42 = bitcast float %41 to i32
  %43 = insertelement <2 x i32> undef, i32 %40, i32 0
  %44 = insertelement <2 x i32> %43, i32 %42, i32 1
  %45 = bitcast <2 x i32> %44 to i64
  ; Second 64-bit signed remainder: %45 srem %37.
  %46 = srem i64 %45, %37
  ; %53: offsets 16 and 20.
  %47 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %23, i32 16, i32 0) #0
  %48 = bitcast float %47 to i32
  %49 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %23, i32 20, i32 0) #0
  %50 = bitcast float %49 to i32
  %51 = insertelement <2 x i32> undef, i32 %48, i32 0
  %52 = insertelement <2 x i32> %51, i32 %50, i32 1
  %53 = bitcast <2 x i32> %52 to i64
  ; Third 64-bit signed remainder: %53 srem %37.
  %54 = srem i64 %53, %37
  ; --- Compare each remainder against a 64-bit value loaded from the buffer ---
  ; %61: offsets 48 and 52; compared with the first remainder.
  %55 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %23, i32 48, i32 0) #0
  %56 = bitcast float %55 to i32
  %57 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %23, i32 52, i32 0) #0
  %58 = bitcast float %57 to i32
  %59 = insertelement <2 x i32> undef, i32 %56, i32 0
  %60 = insertelement <2 x i32> %59, i32 %58, i32 1
  %61 = bitcast <2 x i32> %60 to i64
  %62 = icmp eq i64 %38, %61
  ; %69: offsets 56 and 60; compared with the second remainder.
  %63 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %23, i32 56, i32 0) #0
  %64 = bitcast float %63 to i32
  %65 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %23, i32 60, i32 0) #0
  %66 = bitcast float %65 to i32
  %67 = insertelement <2 x i32> undef, i32 %64, i32 0
  %68 = insertelement <2 x i32> %67, i32 %66, i32 1
  %69 = bitcast <2 x i32> %68 to i64
  %70 = icmp eq i64 %46, %69
  ; %77: offsets 64 and 68; compared with the third remainder.
  %71 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %23, i32 64, i32 0) #0
  %72 = bitcast float %71 to i32
  %73 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %23, i32 68, i32 0) #0
  %74 = bitcast float %73 to i32
  %75 = insertelement <2 x i32> undef, i32 %72, i32 0
  %76 = insertelement <2 x i32> %75, i32 %74, i32 1
  %77 = bitcast <2 x i32> %76 to i64
  %78 = icmp eq i64 %54, %77
  ; %80 is true only when all three comparisons are true.
  %79 = and i1 %70, %78
  %80 = and i1 %62, %79
  ; --- First group: indexed by argument %6 ---
  ; Four buffer.load calls from %16 at (%6 << 2) + 1024/1280/1536/1792.
  %81 = shl i32 %6, 2
  %82 = add i32 %81, 1024
  %83 = call nsz float @llvm.amdgcn.buffer.load.f32(<4 x i32> %16, i32 0, i32 %82, i1 true, i1 false) #0
  %84 = add i32 %81, 1280
  %85 = call nsz float @llvm.amdgcn.buffer.load.f32(<4 x i32> %16, i32 0, i32 %84, i1 true, i1 false) #0
  %86 = add i32 %81, 1536
  %87 = call nsz float @llvm.amdgcn.buffer.load.f32(<4 x i32> %16, i32 0, i32 %86, i1 true, i1 false) #0
  %88 = add i32 %81, 1792
  %89 = call nsz float @llvm.amdgcn.buffer.load.f32(<4 x i32> %16, i32 0, i32 %88, i1 true, i1 false) #0
  ; Store the four loaded values to %22; the fourth operand steps 0, 12, 24, 36.
  %90 = bitcast float %83 to i32
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %90, <4 x i32> %22, i32 0, i32 0, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  %91 = bitcast float %85 to i32
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %91, <4 x i32> %22, i32 0, i32 12, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  %92 = bitcast float %87 to i32
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %92, <4 x i32> %22, i32 0, i32 24, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  %93 = bitcast float %89 to i32
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %93, <4 x i32> %22, i32 0, i32 36, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  ; Then store four more values derived from the comparison result %80:
  ; %94 (0 if %80 else 1065353216), %95 (the opposite selection), the constant 0
  ; and the constant 1065353216. 1065353216 is 0x3F800000, the IEEE-754 bit
  ; pattern of 1.0f. %94 and %95 are reused by the later groups.
  %94 = select i1 %80, i32 0, i32 1065353216
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %94, <4 x i32> %22, i32 0, i32 48, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  %95 = select i1 %80, i32 1065353216, i32 0
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %95, <4 x i32> %22, i32 0, i32 60, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  call void @llvm.amdgcn.tbuffer.store.i32(i32 0, <4 x i32> %22, i32 0, i32 72, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  call void @llvm.amdgcn.tbuffer.store.i32(i32 1065353216, <4 x i32> %22, i32 0, i32 84, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  ; NOTE(review): message 34 is presumably the geometry-shader "emit" message;
  ; confirm against the AMDGPU sendmsg encoding.
  call void @llvm.amdgcn.s.sendmsg(i32 34, i32 %5) #2, !noalias !1
  ; --- Second group: indexed by argument %7 ---
  ; Same load/store pattern as the first group, with every store's fourth
  ; operand 4 higher (4, 16, 28, 40, 52, 64, 76, 88).
  %96 = shl i32 %7, 2
  %97 = add i32 %96, 1024
  %98 = call nsz float @llvm.amdgcn.buffer.load.f32(<4 x i32> %16, i32 0, i32 %97, i1 true, i1 false) #0
  %99 = add i32 %96, 1280
  %100 = call nsz float @llvm.amdgcn.buffer.load.f32(<4 x i32> %16, i32 0, i32 %99, i1 true, i1 false) #0
  %101 = add i32 %96, 1536
  %102 = call nsz float @llvm.amdgcn.buffer.load.f32(<4 x i32> %16, i32 0, i32 %101, i1 true, i1 false) #0
  %103 = add i32 %96, 1792
  %104 = call nsz float @llvm.amdgcn.buffer.load.f32(<4 x i32> %16, i32 0, i32 %103, i1 true, i1 false) #0
  %105 = bitcast float %98 to i32
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %105, <4 x i32> %22, i32 0, i32 4, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  %106 = bitcast float %100 to i32
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %106, <4 x i32> %22, i32 0, i32 16, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  %107 = bitcast float %102 to i32
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %107, <4 x i32> %22, i32 0, i32 28, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  %108 = bitcast float %104 to i32
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %108, <4 x i32> %22, i32 0, i32 40, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %94, <4 x i32> %22, i32 0, i32 52, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %95, <4 x i32> %22, i32 0, i32 64, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  call void @llvm.amdgcn.tbuffer.store.i32(i32 0, <4 x i32> %22, i32 0, i32 76, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  call void @llvm.amdgcn.tbuffer.store.i32(i32 1065353216, <4 x i32> %22, i32 0, i32 88, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  call void @llvm.amdgcn.s.sendmsg(i32 34, i32 %5) #2, !noalias !1
  ; --- Third group: indexed by argument %9 (note: %9, not %8) ---
  ; Same pattern again, with the stores' fourth operand at
  ; 8, 20, 32, 44, 56, 68, 80, 92.
  %109 = shl i32 %9, 2
  %110 = add i32 %109, 1024
  %111 = call nsz float @llvm.amdgcn.buffer.load.f32(<4 x i32> %16, i32 0, i32 %110, i1 true, i1 false) #0
  %112 = add i32 %109, 1280
  %113 = call nsz float @llvm.amdgcn.buffer.load.f32(<4 x i32> %16, i32 0, i32 %112, i1 true, i1 false) #0
  %114 = add i32 %109, 1536
  %115 = call nsz float @llvm.amdgcn.buffer.load.f32(<4 x i32> %16, i32 0, i32 %114, i1 true, i1 false) #0
  %116 = add i32 %109, 1792
  %117 = call nsz float @llvm.amdgcn.buffer.load.f32(<4 x i32> %16, i32 0, i32 %116, i1 true, i1 false) #0
  %118 = bitcast float %111 to i32
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %118, <4 x i32> %22, i32 0, i32 8, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  %119 = bitcast float %113 to i32
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %119, <4 x i32> %22, i32 0, i32 20, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  %120 = bitcast float %115 to i32
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %120, <4 x i32> %22, i32 0, i32 32, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  %121 = bitcast float %117 to i32
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %121, <4 x i32> %22, i32 0, i32 44, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %94, <4 x i32> %22, i32 0, i32 56, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %95, <4 x i32> %22, i32 0, i32 68, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  call void @llvm.amdgcn.tbuffer.store.i32(i32 0, <4 x i32> %22, i32 0, i32 80, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  call void @llvm.amdgcn.tbuffer.store.i32(i32 1065353216, <4 x i32> %22, i32 0, i32 92, i32 %4, i32 0, i32 4, i32 4, i1 true, i1 true) #6, !noalias !1
  call void @llvm.amdgcn.s.sendmsg(i32 34, i32 %5) #2, !noalias !1
  ; NOTE(review): message 3 is presumably the geometry-shader "done" message;
  ; confirm against the AMDGPU sendmsg encoding.
  call void @llvm.amdgcn.s.sendmsg(i32 3, i32 %5) #2, !noalias !1
  ret void
}

; The two lifetime intrinsics below are declared but never called in the body
; of @wrapper.

; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.start.p5i8(i64 immarg, i8 addrspace(5)* nocapture) #5

; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture) #5

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind }
attributes #3 = { nounwind writeonly }
attributes #4 = { "amdgpu-max-work-group-size"="0x40" "no-signed-zeros-fp-math"="true" }
attributes #5 = { argmemonly nounwind }
attributes #6 = { inaccessiblememonly nounwind }

!0 = !{}
!1 = !{!2}
!2 = distinct !{!2, !3, !"main: argument 0"}
!3 = distinct !{!3, !"main"}
!4 = !{!5}
!5 = distinct !{!5, !3, !"main: argument 1"}