[LLVMbugs] [Bug 17822] New: min/max vector instructions not selected for shorts (16-bit integers)
bugzilla-daemon at llvm.org
bugzilla-daemon at llvm.org
Tue Nov 5 10:50:46 PST 2013
http://llvm.org/bugs/show_bug.cgi?id=17822
Bug ID: 17822
Summary: min/max vector instructions not selected for shorts
(16-bit integers)
Product: libraries
Version: trunk
Hardware: PC
OS: All
Status: NEW
Severity: normal
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: kkhoo at perfwizard.com
CC: llvmbugs at cs.uiuc.edu
Classification: Unclassified
I'm not sure if this is a problem with the loop vectorizer, instruction
selection with the x86-64 target, or something else:
$ cat max.c
#define TYPE unsigned short
void max(TYPE * restrict x, TYPE * restrict y) {
for (int i=0; i<1024; i++) {
x[i] = (x[i] > y[i]) ? x[i] : y[i];
}
}
Converted to LLVM IR with clang built from r194083:
$ ./clang -O3 -fomit-frame-pointer -march=corei7-avx -S max.c -emit-llvm -o -
; ModuleID = 'max.c'
target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.0"
; Function Attrs: nounwind ssp uwtable
define void @max(i16* noalias nocapture %x, i16* noalias nocapture readonly %y)
#0 {
entry:
br label %vector.body
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i16* %x, i64 %index
%1 = bitcast i16* %0 to <4 x i16>*
%wide.load = load <4 x i16>* %1, align 2
%.sum28 = or i64 %index, 4
%2 = getelementptr i16* %x, i64 %.sum28
%3 = bitcast i16* %2 to <4 x i16>*
%wide.load25 = load <4 x i16>* %3, align 2
%4 = getelementptr inbounds i16* %y, i64 %index
%5 = bitcast i16* %4 to <4 x i16>*
%wide.load26 = load <4 x i16>* %5, align 2
%.sum29 = or i64 %index, 4
%6 = getelementptr i16* %y, i64 %.sum29
%7 = bitcast i16* %6 to <4 x i16>*
%wide.load27 = load <4 x i16>* %7, align 2
%8 = icmp ugt <4 x i16> %wide.load, %wide.load26
%9 = icmp ugt <4 x i16> %wide.load25, %wide.load27
%10 = select <4 x i1> %8, <4 x i16> %wide.load, <4 x i16> %wide.load26
%11 = select <4 x i1> %9, <4 x i16> %wide.load25, <4 x i16> %wide.load27
%12 = bitcast i16* %0 to <4 x i16>*
store <4 x i16> %10, <4 x i16>* %12, align 2
%13 = bitcast i16* %2 to <4 x i16>*
store <4 x i16> %11, <4 x i16>* %13, align 2
%index.next = add i64 %index, 8
%14 = icmp eq i64 %index.next, 1024
br i1 %14, label %for.end, label %vector.body, !llvm.loop !1
for.end: ; preds = %vector.body
ret void
}
attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false"
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false"
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
"unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = metadata !{metadata !"clang version 3.4 (trunk 194083)"}
!1 = metadata !{metadata !1, metadata !2, metadata !3}
!2 = metadata !{metadata !"llvm.vectorizer.width", i32 1}
!3 = metadata !{metadata !"llvm.vectorizer.unroll", i32 1}
And then compiled to x86-64 asm via llc:
$ ./llc -O3 -mcpu=corei7-avx max.ll -o -
.section __TEXT,__text,regular,pure_instructions
.section __TEXT,__const
.align 4
LCPI0_0:
.long 65535 ## 0xffff
.long 65535 ## 0xffff
.long 65535 ## 0xffff
.long 65535 ## 0xffff
LCPI0_1:
.long 2147483648 ## 0x80000000
.long 2147483648 ## 0x80000000
.long 2147483648 ## 0x80000000
.long 2147483648 ## 0x80000000
LCPI0_2:
.byte 0 ## 0x0
.byte 1 ## 0x1
.byte 4 ## 0x4
.byte 5 ## 0x5
.byte 8 ## 0x8
.byte 9 ## 0x9
.byte 12 ## 0xc
.byte 13 ## 0xd
.byte 254 ## 0xfe
.byte 255 ## 0xff
.byte 254 ## 0xfe
.byte 255 ## 0xff
.byte 254 ## 0xfe
.byte 255 ## 0xff
.byte 254 ## 0xfe
.byte 255 ## 0xff
.section __TEXT,__text,regular,pure_instructions
.globl _max
.align 4, 0x90
_max: ## @max
.cfi_startproc
## BB#0: ## %entry
xorl %eax, %eax
vmovdqa LCPI0_0(%rip), %xmm9
vmovdqa LCPI0_1(%rip), %xmm10
vmovdqa LCPI0_2(%rip), %xmm8
.align 4, 0x90
LBB0_1: ## %vector.body
## =>This Inner Loop Header: Depth=1
vpmovzxwd (%rdi,%rax,2), %xmm3
vpand %xmm9, %xmm3, %xmm4
vpmovzxwd 8(%rdi,%rax,2), %xmm5
vpand %xmm9, %xmm5, %xmm6
vpmovzxwd (%rsi,%rax,2), %xmm7
vpand %xmm9, %xmm7, %xmm2
vpmovzxwd 8(%rsi,%rax,2), %xmm0
vpand %xmm9, %xmm0, %xmm1
vpxor %xmm10, %xmm2, %xmm2
vpxor %xmm10, %xmm4, %xmm4
vpcmpgtd %xmm2, %xmm4, %xmm2
vpslld $31, %xmm2, %xmm2
vpxor %xmm10, %xmm1, %xmm1
vpxor %xmm10, %xmm6, %xmm4
vpcmpgtd %xmm1, %xmm4, %xmm1
vpslld $31, %xmm1, %xmm1
vblendvps %xmm2, %xmm3, %xmm7, %xmm2
vblendvps %xmm1, %xmm5, %xmm0, %xmm0
vpshufb %xmm8, %xmm2, %xmm1
vmovq %xmm1, (%rdi,%rax,2)
vpshufb %xmm8, %xmm0, %xmm0
vmovq %xmm0, 8(%rdi,%rax,2)
addq $8, %rax
cmpq $1024, %rax ## imm = 0x400
jne LBB0_1
## BB#2: ## %for.end
ret
I was expecting this code to generate "vpmaxuw" rather than compare/shift/xor.
I get the expected codegen with 8-bit and 32-bit integers, but not with 16-bit.
Signed/unsigned does not appear to make any difference.
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20131105/dda2a251/attachment.html>
More information about the llvm-bugs
mailing list