[LLVMbugs] [Bug 17822] New: min/max vector instructions not selected for shorts (16-bit integers)

Tue Nov 5 10:50:46 PST 2013

http://llvm.org/bugs/show_bug.cgi?id=17822

            Bug ID: 17822
           Summary: min/max vector instructions not selected for shorts
                    (16-bit integers)
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: normal
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: kkhoo at perfwizard.com
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified

I'm not sure if this is a problem with the loop vectorizer, instruction
selection with the x86-64 target, or something else:

$ cat max.c
#define TYPE unsigned short 
void max(TYPE * restrict x, TYPE * restrict y) {
    for (int i=0; i<1024; i++) {
        x[i] = (x[i] > y[i]) ? x[i] : y[i];
    }
}

Converted to LLVM IR with clang built from r194083:

$ ./clang -O3 -fomit-frame-pointer -march=corei7-avx -S max.c -emit-llvm -o -
; ModuleID = 'max.c'
target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.0"

; Function Attrs: nounwind ssp uwtable
define void @max(i16* noalias nocapture %x, i16* noalias nocapture readonly %y)
#0 {
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16* %x, i64 %index
  %1 = bitcast i16* %0 to <4 x i16>*
  %wide.load = load <4 x i16>* %1, align 2
  %.sum28 = or i64 %index, 4
  %2 = getelementptr i16* %x, i64 %.sum28
  %3 = bitcast i16* %2 to <4 x i16>*
  %wide.load25 = load <4 x i16>* %3, align 2
  %4 = getelementptr inbounds i16* %y, i64 %index
  %5 = bitcast i16* %4 to <4 x i16>*
  %wide.load26 = load <4 x i16>* %5, align 2
  %.sum29 = or i64 %index, 4
  %6 = getelementptr i16* %y, i64 %.sum29
  %7 = bitcast i16* %6 to <4 x i16>*
  %wide.load27 = load <4 x i16>* %7, align 2
  %8 = icmp ugt <4 x i16> %wide.load, %wide.load26
  %9 = icmp ugt <4 x i16> %wide.load25, %wide.load27
  %10 = select <4 x i1> %8, <4 x i16> %wide.load, <4 x i16> %wide.load26
  %11 = select <4 x i1> %9, <4 x i16> %wide.load25, <4 x i16> %wide.load27
  %12 = bitcast i16* %0 to <4 x i16>*
  store <4 x i16> %10, <4 x i16>* %12, align 2
  %13 = bitcast i16* %2 to <4 x i16>*
  store <4 x i16> %11, <4 x i16>* %13, align 2
  %index.next = add i64 %index, 8
  %14 = icmp eq i64 %index.next, 1024
  br i1 %14, label %for.end, label %vector.body, !llvm.loop !1

for.end:                                          ; preds = %vector.body
  ret void
}

attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false"
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false"
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
"unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.ident = !{!0}

!0 = metadata !{metadata !"clang version 3.4 (trunk 194083)"}
!1 = metadata !{metadata !1, metadata !2, metadata !3}
!2 = metadata !{metadata !"llvm.vectorizer.width", i32 1}
!3 = metadata !{metadata !"llvm.vectorizer.unroll", i32 1}

And then compiled to x86-64 asm via llc:

$ ./llc -O3 -mcpu=corei7-avx max.ll -o -
    .section    __TEXT,__text,regular,pure_instructions
    .section    __TEXT,__const
    .align    4
LCPI0_0:
    .long    65535                   ## 0xffff
    .long    65535                   ## 0xffff
    .long    65535                   ## 0xffff
    .long    65535                   ## 0xffff
LCPI0_1:
    .long    2147483648              ## 0x80000000
    .long    2147483648              ## 0x80000000
    .long    2147483648              ## 0x80000000
    .long    2147483648              ## 0x80000000
LCPI0_2:
    .byte    0                       ## 0x0
    .byte    1                       ## 0x1
    .byte    4                       ## 0x4
    .byte    5                       ## 0x5
    .byte    8                       ## 0x8
    .byte    9                       ## 0x9
    .byte    12                      ## 0xc
    .byte    13                      ## 0xd
    .byte    254                     ## 0xfe
    .byte    255                     ## 0xff
    .byte    254                     ## 0xfe
    .byte    255                     ## 0xff
    .byte    254                     ## 0xfe
    .byte    255                     ## 0xff
    .byte    254                     ## 0xfe
    .byte    255                     ## 0xff
    .section    __TEXT,__text,regular,pure_instructions
    .globl    _max
    .align    4, 0x90
_max:                                   ## @max
    .cfi_startproc
## BB#0:                                ## %entry
    xorl    %eax, %eax
    vmovdqa    LCPI0_0(%rip), %xmm9
    vmovdqa    LCPI0_1(%rip), %xmm10
    vmovdqa    LCPI0_2(%rip), %xmm8
    .align    4, 0x90
LBB0_1:                                 ## %vector.body
                                        ## =>This Inner Loop Header: Depth=1
    vpmovzxwd    (%rdi,%rax,2), %xmm3
    vpand    %xmm9, %xmm3, %xmm4
    vpmovzxwd    8(%rdi,%rax,2), %xmm5
    vpand    %xmm9, %xmm5, %xmm6
    vpmovzxwd    (%rsi,%rax,2), %xmm7
    vpand    %xmm9, %xmm7, %xmm2
    vpmovzxwd    8(%rsi,%rax,2), %xmm0
    vpand    %xmm9, %xmm0, %xmm1
    vpxor    %xmm10, %xmm2, %xmm2
    vpxor    %xmm10, %xmm4, %xmm4
    vpcmpgtd    %xmm2, %xmm4, %xmm2
    vpslld    $31, %xmm2, %xmm2
    vpxor    %xmm10, %xmm1, %xmm1
    vpxor    %xmm10, %xmm6, %xmm4
    vpcmpgtd    %xmm1, %xmm4, %xmm1
    vpslld    $31, %xmm1, %xmm1
    vblendvps    %xmm2, %xmm3, %xmm7, %xmm2
    vblendvps    %xmm1, %xmm5, %xmm0, %xmm0
    vpshufb    %xmm8, %xmm2, %xmm1
    vmovq    %xmm1, (%rdi,%rax,2)
    vpshufb    %xmm8, %xmm0, %xmm0
    vmovq    %xmm0, 8(%rdi,%rax,2)
    addq    $8, %rax
    cmpq    $1024, %rax             ## imm = 0x400
    jne    LBB0_1
## BB#2:                                ## %for.end
    ret

I was expecting this code to generate "vpmaxuw" rather than the
zero-extend/xor/compare/shift/blend/shuffle sequence shown above.
I get the expected codegen with 8-bit and 32-bit integers, but not with 16-bit.
Whether TYPE is signed or unsigned does not appear to make any difference.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20131105/dda2a251/attachment.html>