[llvm] r225165 - Select lower sub,abs pattern to sabd on AArch64

KARTHIK VENKATESH BHAT kv.bhat at samsung.com
Tue Jan 6 22:43:02 PST 2015


Hi Owen,
Thanks for checking this for me. The patch has been reverted in r225341.
I read that signed integer overflow is undefined behavior, but that is not the case for floating point or unsigned integers, for which the pattern was being generated as well.
It is strange that gcc produces this instruction in such cases; maybe we should raise a gcc bug as well.
Thanks
Karthik Bhat

------- Original Message -------
Sender : Owen Anderson <resistor at mac.com>
Date : Jan 07, 2015 10:58 (GMT+09:00)
Title : Re: [llvm] r225165 - Select lower sub,abs pattern to sabd on AArch64

Karthik,

The logical operation of absolute difference is (X > Y) ? (X - Y) : (Y - X).

Consider the case of X == -2, Y == 127 with 8-bit values.  The absolute difference of these two is 129.  However, abs(X - Y) == abs(0xFE - 0x7F) == abs(0xFE + 0x81) == abs(0x7F) == 0x7F == 127.

I wrote a small C program to find all the mismatches.
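
A minimal sketch of such a program (a reconstruction, not the original
source) follows. It assumes "unfused" means abs() applied to the wrapped
8-bit subtraction and "fused" means the true absolute difference that
sabd computes; an exhaustive scan also reports pairs beyond the list
below.

#include <stdio.h>
#include <stdint.h>

int main(void) {
  for (int x = -128; x <= 127; ++x) {
    for (int y = -128; y <= 127; ++y) {
      /* The 8-bit subtraction wraps modulo 256, losing any overflow. */
      int8_t wrapped = (int8_t)(x - y);
      int unfused = wrapped < 0 ? -wrapped : wrapped;
      /* The true absolute difference, which sabd computes lane-wise. */
      int fused = x > y ? x - y : y - x;
      if (unfused != fused)
        printf("X=%d, Y=%d, unfused=%d, fused=%d\n", x, y, unfused, fused);
    }
  }
  return 0;
}

Its output includes: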

X=-128, Y=1, unfused=127, fused=129
X=-128, Y=2, unfused=126, fused=130
X=-128, Y=3, unfused=125, fused=131
X=-128, Y=4, unfused=124, fused=132
X=-128, Y=5, unfused=123, fused=133
X=-128, Y=6, unfused=122, fused=134
X=-128, Y=7, unfused=121, fused=135
X=-128, Y=8, unfused=120, fused=136
X=-128, Y=9, unfused=119, fused=137
X=-128, Y=10, unfused=118, fused=138
X=-128, Y=11, unfused=117, fused=139
X=-128, Y=12, unfused=116, fused=140
X=-128, Y=13, unfused=115, fused=141
X=-128, Y=14, unfused=114, fused=142
X=-128, Y=15, unfused=113, fused=143
X=-128, Y=16, unfused=112, fused=144
X=-128, Y=17, unfused=111, fused=145
X=-128, Y=18, unfused=110, fused=146
X=-128, Y=19, unfused=109, fused=147
X=-128, Y=20, unfused=108, fused=148
X=-128, Y=21, unfused=107, fused=149
X=-128, Y=22, unfused=106, fused=150
X=-128, Y=23, unfused=105, fused=151
X=-128, Y=24, unfused=104, fused=152
X=-128, Y=25, unfused=103, fused=153
X=-128, Y=26, unfused=102, fused=154
X=-128, Y=27, unfused=101, fused=155
X=-128, Y=28, unfused=100, fused=156
X=-128, Y=29, unfused=99, fused=157
X=-128, Y=30, unfused=98, fused=158
X=-128, Y=31, unfused=97, fused=159
X=-128, Y=32, unfused=96, fused=160
X=-128, Y=33, unfused=95, fused=161
X=-128, Y=34, unfused=94, fused=162
X=-128, Y=35, unfused=93, fused=163
X=-128, Y=36, unfused=92, fused=164
X=-128, Y=37, unfused=91, fused=165
X=-128, Y=38, unfused=90, fused=166
X=-128, Y=39, unfused=89, fused=167
X=-128, Y=40, unfused=88, fused=168
X=-128, Y=41, unfused=87, fused=169
X=-128, Y=42, unfused=86, fused=170
X=-128, Y=43, unfused=85, fused=171
X=-128, Y=44, unfused=84, fused=172
X=-128, Y=45, unfused=83, fused=173
X=-128, Y=46, unfused=82, fused=174
X=-128, Y=47, unfused=81, fused=175
X=-128, Y=48, unfused=80, fused=176
X=-128, Y=49, unfused=79, fused=177
X=-128, Y=50, unfused=78, fused=178
X=-128, Y=51, unfused=77, fused=179
X=-128, Y=52, unfused=76, fused=180
X=-128, Y=53, unfused=75, fused=181
X=-128, Y=54, unfused=74, fused=182
X=-128, Y=55, unfused=73, fused=183
X=-128, Y=56, unfused=72, fused=184
X=-128, Y=57, unfused=71, fused=185
X=-128, Y=58, unfused=70, fused=186
X=-128, Y=59, unfused=69, fused=187
X=-128, Y=60, unfused=68, fused=188
X=-128, Y=61, unfused=67, fused=189
X=-128, Y=62, unfused=66, fused=190
X=-128, Y=63, unfused=65, fused=191
X=-128, Y=64, unfused=64, fused=192
X=-128, Y=65, unfused=63, fused=193
X=-128, Y=66, unfused=62, fused=194
X=-128, Y=67, unfused=61, fused=195
X=-128, Y=68, unfused=60, fused=196
X=-128, Y=69, unfused=59, fused=197
X=-128, Y=70, unfused=58, fused=198
X=-128, Y=71, unfused=57, fused=199
X=-128, Y=72, unfused=56, fused=200
X=-128, Y=73, unfused=55, fused=201
X=-128, Y=74, unfused=54, fused=202
X=-128, Y=75, unfused=53, fused=203
X=-128, Y=76, unfused=52, fused=204
X=-128, Y=77, unfused=51, fused=205
X=-128, Y=78, unfused=50, fused=206
X=-128, Y=79, unfused=49, fused=207
X=-128, Y=80, unfused=48, fused=208
X=-128, Y=81, unfused=47, fused=209
X=-128, Y=82, unfused=46, fused=210
X=-128, Y=83, unfused=45, fused=211
X=-128, Y=84, unfused=44, fused=212
X=-128, Y=85, unfused=43, fused=213
X=-128, Y=86, unfused=42, fused=214
X=-128, Y=87, unfused=41, fused=215
X=-128, Y=88, unfused=40, fused=216
X=-128, Y=89, unfused=39, fused=217
X=-128, Y=90, unfused=38, fused=218
X=-128, Y=91, unfused=37, fused=219
X=-128, Y=92, unfused=36, fused=220
X=-128, Y=93, unfused=35, fused=221
X=-128, Y=94, unfused=34, fused=222
X=-128, Y=95, unfused=33, fused=223
X=-128, Y=96, unfused=32, fused=224
X=-128, Y=97, unfused=31, fused=225
X=-128, Y=98, unfused=30, fused=226
X=-128, Y=99, unfused=29, fused=227
X=-128, Y=100, unfused=28, fused=228
X=-128, Y=101, unfused=27, fused=229
X=-128, Y=102, unfused=26, fused=230
X=-128, Y=103, unfused=25, fused=231
X=-128, Y=104, unfused=24, fused=232
X=-128, Y=105, unfused=23, fused=233
X=-128, Y=106, unfused=22, fused=234
X=-128, Y=107, unfused=21, fused=235
X=-128, Y=108, unfused=20, fused=236
X=-128, Y=109, unfused=19, fused=237
X=-128, Y=110, unfused=18, fused=238
X=-128, Y=111, unfused=17, fused=239
X=-128, Y=112, unfused=16, fused=240
X=-128, Y=113, unfused=15, fused=241
X=-128, Y=114, unfused=14, fused=242
X=-128, Y=115, unfused=13, fused=243
X=-128, Y=116, unfused=12, fused=244
X=-128, Y=117, unfused=11, fused=245
X=-128, Y=118, unfused=10, fused=246
X=-128, Y=119, unfused=9, fused=247
X=-128, Y=120, unfused=8, fused=248
X=-128, Y=121, unfused=7, fused=249
X=-128, Y=122, unfused=6, fused=250
X=-128, Y=123, unfused=5, fused=251
X=-128, Y=124, unfused=4, fused=252
X=-128, Y=125, unfused=3, fused=253
X=-128, Y=126, unfused=2, fused=254
X=-128, Y=127, unfused=1, fused=255
X=-127, Y=127, unfused=2, fused=254
X=-126, Y=127, unfused=3, fused=253
X=-125, Y=127, unfused=4, fused=252
X=-124, Y=127, unfused=5, fused=251
X=-123, Y=127, unfused=6, fused=250
X=-122, Y=127, unfused=7, fused=249
X=-121, Y=127, unfused=8, fused=248
X=-120, Y=127, unfused=9, fused=247
X=-119, Y=127, unfused=10, fused=246
X=-118, Y=127, unfused=11, fused=245
X=-117, Y=127, unfused=12, fused=244
X=-116, Y=127, unfused=13, fused=243
X=-115, Y=127, unfused=14, fused=242
X=-114, Y=127, unfused=15, fused=241
X=-113, Y=127, unfused=16, fused=240
X=-112, Y=127, unfused=17, fused=239
X=-111, Y=127, unfused=18, fused=238
X=-110, Y=127, unfused=19, fused=237
X=-109, Y=127, unfused=20, fused=236
X=-108, Y=127, unfused=21, fused=235
X=-107, Y=127, unfused=22, fused=234
X=-106, Y=127, unfused=23, fused=233
X=-105, Y=127, unfused=24, fused=232
X=-104, Y=127, unfused=25, fused=231
X=-103, Y=127, unfused=26, fused=230
X=-102, Y=127, unfused=27, fused=229
X=-101, Y=127, unfused=28, fused=228
X=-100, Y=127, unfused=29, fused=227
X=-99, Y=127, unfused=30, fused=226
X=-98, Y=127, unfused=31, fused=225
X=-97, Y=127, unfused=32, fused=224
X=-96, Y=127, unfused=33, fused=223
X=-95, Y=127, unfused=34, fused=222
X=-94, Y=127, unfused=35, fused=221
X=-93, Y=127, unfused=36, fused=220
X=-92, Y=127, unfused=37, fused=219
X=-91, Y=127, unfused=38, fused=218
X=-90, Y=127, unfused=39, fused=217
X=-89, Y=127, unfused=40, fused=216
X=-88, Y=127, unfused=41, fused=215
X=-87, Y=127, unfused=42, fused=214
X=-86, Y=127, unfused=43, fused=213
X=-85, Y=127, unfused=44, fused=212
X=-84, Y=127, unfused=45, fused=211
X=-83, Y=127, unfused=46, fused=210
X=-82, Y=127, unfused=47, fused=209
X=-81, Y=127, unfused=48, fused=208
X=-80, Y=127, unfused=49, fused=207
X=-79, Y=127, unfused=50, fused=206
X=-78, Y=127, unfused=51, fused=205
X=-77, Y=127, unfused=52, fused=204
X=-76, Y=127, unfused=53, fused=203
X=-75, Y=127, unfused=54, fused=202
X=-74, Y=127, unfused=55, fused=201
X=-73, Y=127, unfused=56, fused=200
X=-72, Y=127, unfused=57, fused=199
X=-71, Y=127, unfused=58, fused=198
X=-70, Y=127, unfused=59, fused=197
X=-69, Y=127, unfused=60, fused=196
X=-68, Y=127, unfused=61, fused=195
X=-67, Y=127, unfused=62, fused=194
X=-66, Y=127, unfused=63, fused=193
X=-65, Y=127, unfused=64, fused=192
X=-64, Y=127, unfused=65, fused=191
X=-63, Y=127, unfused=66, fused=190
X=-62, Y=127, unfused=67, fused=189
X=-61, Y=127, unfused=68, fused=188
X=-60, Y=127, unfused=69, fused=187
X=-59, Y=127, unfused=70, fused=186
X=-58, Y=127, unfused=71, fused=185
X=-57, Y=127, unfused=72, fused=184
X=-56, Y=127, unfused=73, fused=183
X=-55, Y=127, unfused=74, fused=182
X=-54, Y=127, unfused=75, fused=181
X=-53, Y=127, unfused=76, fused=180
X=-52, Y=127, unfused=77, fused=179
X=-51, Y=127, unfused=78, fused=178
X=-50, Y=127, unfused=79, fused=177
X=-49, Y=127, unfused=80, fused=176
X=-48, Y=127, unfused=81, fused=175
X=-47, Y=127, unfused=82, fused=174
X=-46, Y=127, unfused=83, fused=173
X=-45, Y=127, unfused=84, fused=172
X=-44, Y=127, unfused=85, fused=171
X=-43, Y=127, unfused=86, fused=170
X=-42, Y=127, unfused=87, fused=169
X=-41, Y=127, unfused=88, fused=168
X=-40, Y=127, unfused=89, fused=167
X=-39, Y=127, unfused=90, fused=166
X=-38, Y=127, unfused=91, fused=165
X=-37, Y=127, unfused=92, fused=164
X=-36, Y=127, unfused=93, fused=163
X=-35, Y=127, unfused=94, fused=162
X=-34, Y=127, unfused=95, fused=161
X=-33, Y=127, unfused=96, fused=160
X=-32, Y=127, unfused=97, fused=159
X=-31, Y=127, unfused=98, fused=158
X=-30, Y=127, unfused=99, fused=157
X=-29, Y=127, unfused=100, fused=156
X=-28, Y=127, unfused=101, fused=155
X=-27, Y=127, unfused=102, fused=154
X=-26, Y=127, unfused=103, fused=153
X=-25, Y=127, unfused=104, fused=152
X=-24, Y=127, unfused=105, fused=151
X=-23, Y=127, unfused=106, fused=150
X=-22, Y=127, unfused=107, fused=149
X=-21, Y=127, unfused=108, fused=148
X=-20, Y=127, unfused=109, fused=147
X=-19, Y=127, unfused=110, fused=146
X=-18, Y=127, unfused=111, fused=145
X=-17, Y=127, unfused=112, fused=144
X=-16, Y=127, unfused=113, fused=143
X=-15, Y=127, unfused=114, fused=142
X=-14, Y=127, unfused=115, fused=141
X=-13, Y=127, unfused=116, fused=140
X=-12, Y=127, unfused=117, fused=139
X=-11, Y=127, unfused=118, fused=138
X=-10, Y=127, unfused=119, fused=137
X=-9, Y=127, unfused=120, fused=136
X=-8, Y=127, unfused=121, fused=135
X=-7, Y=127, unfused=122, fused=134
X=-6, Y=127, unfused=123, fused=133
X=-5, Y=127, unfused=124, fused=132
X=-4, Y=127, unfused=125, fused=131
X=-3, Y=127, unfused=126, fused=130
X=-2, Y=127, unfused=127, fused=129


—Owen

> On Jan 6, 2015, at 1:51 AM, KARTHIK VENKATESH BHAT wrote:
> 
> Hi Owen,
> Thanks for the input. Unfortunately I'm unable to check this, as I don't have an AArch64 target.
> My reference was gcc, which generated similar assembly (e.g. for the sample test cases submitted in the patch).
> I checked the AArch64 instruction manual; it doesn't seem to specify anything about the overflow case for the sabd instruction.
> Maybe we need to check the output with an overflowing input to confirm the behavior on an AArch64 target?
> 
> Thanks and Regards
> Karthik Bhat
> 
> 
> ------- Original Message -------
> Sender : Owen Anderson
> Date : Jan 06, 2015 15:56 (GMT+09:00)
> Title : Re: [llvm] r225165 - Select lower sub,abs pattern to sabd on AArch64
> 
> I’m pretty sure these two patterns aren’t equivalent in the case where the original subtraction could have caused an overflow.
> 
> —Owen
> 
>> On Jan 5, 2015, at 7:11 AM, Karthik Bhat wrote:
>> 
>> Author: karthik
>> Date: Mon Jan  5 07:11:07 2015
>> New Revision: 225165
>> 
>> URL: http://llvm.org/viewvc/llvm-project?rev=225165&view=rev
>> Log:
>> Select lower sub,abs pattern to sabd on AArch64
>> 
>> This patch lowers patterns such as:
>> sub v0.4s, v0.4s, v1.4s
>> abs v0.4s, v0.4s
>> to
>> sabd v0.4s, v0.4s, v1.4s
>> on AArch64.
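>> 
>> A loop like the following, once vectorized, is the kind of C source that
>> produces this sub+abs sequence (an illustrative sketch, not taken from
>> the patch):
>> 
>>   #include <stdlib.h>
>>   void f(int *restrict a, const int *restrict b, const int *restrict c) {
>>     for (int i = 0; i < 4; ++i)
>>       a[i] = abs(b[i] - c[i]);
>>   }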
>> 
>> Review: http://reviews.llvm.org/D6781
>> 
>> 
>> Added:
>>   llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-vabs.ll
>> Modified:
>>   llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
>> 
>> Modified: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td?rev=225165&r1=225164&r2=225165&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td (original)
>> +++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td Mon Jan  5 07:11:07 2015
>> @@ -2733,6 +2733,33 @@ defm ORN : SIMDLogicalThreeVector<0, 0b1
>>                                  BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
>> defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
>> 
>> +// SABD Vd.<T>, Vn.<T>, Vm.<T>: subtracts the elements of Vm from the corresponding
>> +// elements of Vn, and places the absolute values of the results in the elements of Vd.
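>> +// Each pattern below matches the expanded form of abs(Vn - Vm): with
>> +// D = Vn - Vm and S = D >>s (lanebits - 1), abs(D) == (D + S) ^ S.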
>> +def : Pat<(xor (v8i8 (AArch64vashr (v8i8(sub V64:$Rn, V64:$Rm)), (i32 7))),
>> +               (v8i8 (add (v8i8(sub V64:$Rn, V64:$Rm)),
>> +               (AArch64vashr (v8i8(sub V64:$Rn, V64:$Rm)), (i32 7))))),
>> +          (SABDv8i8 V64:$Rn, V64:$Rm)>;
>> +def : Pat<(xor (v4i16 (AArch64vashr (v4i16(sub V64:$Rn, V64:$Rm)), (i32 15))),
>> +               (v4i16 (add (v4i16(sub V64:$Rn, V64:$Rm)),
>> +               (AArch64vashr (v4i16(sub V64:$Rn, V64:$Rm)), (i32 15))))),
>> +          (SABDv4i16 V64:$Rn, V64:$Rm)>;
>> +def : Pat<(xor (v2i32 (AArch64vashr (v2i32(sub V64:$Rn, V64:$Rm)), (i32 31))),
>> +               (v2i32 (add (v2i32(sub V64:$Rn, V64:$Rm)),
>> +               (AArch64vashr (v2i32(sub V64:$Rn, V64:$Rm)), (i32 31))))),
>> +          (SABDv2i32 V64:$Rn, V64:$Rm)>;
>> +def : Pat<(xor (v16i8 (AArch64vashr (v16i8(sub V128:$Rn, V128:$Rm)), (i32 7))),
>> +               (v16i8 (add (v16i8(sub V128:$Rn, V128:$Rm)),
>> +               (AArch64vashr (v16i8(sub V128:$Rn, V128:$Rm)), (i32 7))))),
>> +          (SABDv16i8 V128:$Rn, V128:$Rm)>;
>> +def : Pat<(xor (v8i16 (AArch64vashr (v8i16(sub V128:$Rn, V128:$Rm)), (i32 15))),
>> +               (v8i16 (add (v8i16(sub V128:$Rn, V128:$Rm)),
>> +               (AArch64vashr (v8i16(sub V128:$Rn, V128:$Rm)), (i32 15))))),
>> +          (SABDv8i16 V128:$Rn, V128:$Rm)>;
>> +def : Pat<(xor (v4i32 (AArch64vashr (v4i32(sub V128:$Rn, V128:$Rm)), (i32 31))),
>> +               (v4i32 (add (v4i32(sub V128:$Rn, V128:$Rm)),
>> +               (AArch64vashr (v4i32(sub V128:$Rn, V128:$Rm)), (i32 31))))),
>> +          (SABDv4i32 V128:$Rn, V128:$Rm)>;
>> +
>> def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
>>          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
>> def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
>> 
>> Added: llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-vabs.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-vabs.ll?rev=225165&view=auto
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-vabs.ll (added)
>> +++ llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-vabs.ll Mon Jan  5 07:11:07 2015
>> @@ -0,0 +1,101 @@
>> +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
>> +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
>> +target triple = "aarch64--linux-gnu"
>> +
>> +; CHECK: testv4i32
>> +; CHECK: sabd v0.4s, v0.4s, v1.4s
>> +define void @testv4i32(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){
>> +  %1 = bitcast i32* %b to <4 x i32>*
>> +  %2 = load <4 x i32>* %1, align 4
>> +  %3 = bitcast i32* %c to <4 x i32>*
>> +  %4 = load <4 x i32>* %3, align 4
>> +  %5 = sub nsw <4 x i32> %2, %4
>> +  %6 = icmp sgt <4 x i32> %5, <i32 -1, i32 -1, i32 -1, i32 -1>
>> +  %7 = sub <4 x i32> zeroinitializer, %5
>> +  %8 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> %7
>> +  %9 = bitcast i32* %a to <4 x i32>*
>> +  store <4 x i32> %8, <4 x i32>* %9, align 4
>> +  ret void
>> +}
>> +
>> +; CHECK: testv2i32
>> +; CHECK: sabd v0.2s, v0.2s, v1.2s
>> +define void @testv2i32(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){
>> +  %1 = bitcast i32* %b to <2 x i32>*
>> +  %2 = load <2 x i32>* %1, align 4
>> +  %3 = bitcast i32* %c to <2 x i32>*
>> +  %4 = load <2 x i32>* %3, align 4
>> +  %5 = sub nsw <2 x i32> %2, %4
>> +  %6 = icmp sgt <2 x i32> %5, <i32 -1, i32 -1>
>> +  %7 = sub <2 x i32> zeroinitializer, %5
>> +  %8 = select <2 x i1> %6, <2 x i32> %5, <2 x i32> %7
>> +  %9 = bitcast i32* %a to <2 x i32>*
>> +  store <2 x i32> %8, <2 x i32>* %9, align 4
>> +  ret void
>> +}
>> +
>> +; CHECK: testv8i16
>> +; CHECK: sabd v0.8h, v0.8h, v1.8h
>> +define void @testv8i16(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c){
>> +  %1 = bitcast i16* %b to <8 x i16>*
>> +  %2 = load <8 x i16>* %1, align 4
>> +  %3 = bitcast i16* %c to <8 x i16>*
>> +  %4 = load <8 x i16>* %3, align 4
>> +  %5 = sub nsw <8 x i16> %2, %4
>> +  %6 = icmp sgt <8 x i16> %5, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
>> +  %7 = sub <8 x i16> zeroinitializer, %5
>> +  %8 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> %7
>> +  %9 = bitcast i16* %a to <8 x i16>*
>> +  store <8 x i16> %8, <8 x i16>* %9, align 4
>> +  ret void
>> +}
>> +
>> +; CHECK: testv4i16
>> +; CHECK: sabd v0.4h, v0.4h, v1.4h
>> +define void @testv4i16(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c){
>> +  %1 = bitcast i16* %b to <4 x i16>*
>> +  %2 = load <4 x i16>* %1, align 4
>> +  %3 = bitcast i16* %c to <4 x i16>*
>> +  %4 = load <4 x i16>* %3, align 4
>> +  %5 = sub nsw <4 x i16> %2, %4
>> +  %6 = icmp sgt <4 x i16> %5, <i16 -1, i16 -1, i16 -1, i16 -1>
>> +  %7 = sub <4 x i16> zeroinitializer, %5
>> +  %8 = select <4 x i1> %6, <4 x i16> %5, <4 x i16> %7
>> +  %9 = bitcast i16* %a to <4 x i16>*
>> +  store <4 x i16> %8, <4 x i16>* %9, align 4
>> +  ret void
>> +}
>> +
>> +
>> +; CHECK: testv16i8
>> +; CHECK: sabd v0.16b, v0.16b, v1.16b
>> +define void @testv16i8(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c){
>> +  %1 = bitcast i8* %b to <16 x i8>*
>> +  %2 = load <16 x i8>* %1, align 4
>> +  %3 = bitcast i8* %c to <16 x i8>*
>> +  %4 = load <16 x i8>* %3, align 4
>> +  %5 = sub nsw <16 x i8> %2, %4
>> +  %6 = icmp sgt <16 x i8> %5, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
>> +  %7 = sub <16 x i8> zeroinitializer, %5
>> +  %8 = select <16 x i1> %6, <16 x i8> %5, <16 x i8> %7
>> +  %9 = bitcast i8* %a to <16 x i8>*
>> +  store <16 x i8> %8, <16 x i8>* %9, align 4
>> +  ret void
>> +}
>> +
>> +; CHECK: testv8i8
>> +; CHECK: sabd v0.8b, v0.8b, v1.8b
>> +define void @testv8i8(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c){
>> +  %1 = bitcast i8* %b to <8 x i8>*
>> +  %2 = load <8 x i8>* %1, align 4
>> +  %3 = bitcast i8* %c to <8 x i8>*
>> +  %4 = load <8 x i8>* %3, align 4
>> +  %5 = sub nsw <8 x i8> %2, %4
>> +  %6 = icmp sgt <8 x i8> %5, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
>> +  %7 = sub <8 x i8> zeroinitializer, %5
>> +  %8 = select <8 x i1> %6, <8 x i8> %5, <8 x i8> %7
>> +  %9 = bitcast i8* %a to <8 x i8>*
>> +  store <8 x i8> %8, <8 x i8>* %9, align 4
>> +  ret void
>> +}
>> +
>> 
>> 
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits



