[llvm] r225165 - Select lower sub,abs pattern to sabd on AArch64
Owen Anderson
resistor at mac.com
Wed Jan 7 07:58:31 PST 2015
I don't think you needed to revert the FABD portion of the patch. I'm not 100% confident, but I am not able to come up with a counter-example for that version.
Intuitively, I believe it works because floating point arithmetic saturates as Infinities rather than wrapping.
-Owen
> On Jan 7, 2015, at 12:43 AM, KARTHIK VENKATESH BHAT <kv.bhat at samsung.com> wrote:
>
> Hi Owen,
> Thanks for checking it up for me. The patch is reverted in r225341.
> I read somewhere that signed integer overflow is actually undefined behavior, but that is not the case for floating-point values or unsigned integers, for which the pattern was being generated as well.
> Strange that GCC produces this instruction in such cases. Maybe we can raise a GCC bug as well.
> Thanks
> Karthik Bhat
>
> ------- Original Message -------
> Sender : Owen Anderson<resistor at mac.com>
> Date : Jan 07, 2015 10:58 (GMT+09:00)
> Title : Re: [llvm] r225165 - Select lower sub,abs pattern to sabd on AArch64
>
> Karthik,
>
> The logical operation of absolute difference is (X > Y) ? (X - Y) : (Y - X).
>
> Consider the case of X == -2, Y == 127 in 8 bit values. The absolute difference of these two is 129. However, abs(X - Y) == abs(0xFE - 0x7F) == abs(0xFE + 0x81) == abs(0x7F) == 0x7F == 127.
>
> I wrote a small C program to find all the mismatches:
>
> X=-128, Y=1, unfused=127, fused=129
> X=-128, Y=2, unfused=126, fused=130
> X=-128, Y=3, unfused=125, fused=131
> X=-128, Y=4, unfused=124, fused=132
> X=-128, Y=5, unfused=123, fused=133
> X=-128, Y=6, unfused=122, fused=134
> X=-128, Y=7, unfused=121, fused=135
> X=-128, Y=8, unfused=120, fused=136
> X=-128, Y=9, unfused=119, fused=137
> X=-128, Y=10, unfused=118, fused=138
> X=-128, Y=11, unfused=117, fused=139
> X=-128, Y=12, unfused=116, fused=140
> X=-128, Y=13, unfused=115, fused=141
> X=-128, Y=14, unfused=114, fused=142
> X=-128, Y=15, unfused=113, fused=143
> X=-128, Y=16, unfused=112, fused=144
> X=-128, Y=17, unfused=111, fused=145
> X=-128, Y=18, unfused=110, fused=146
> X=-128, Y=19, unfused=109, fused=147
> X=-128, Y=20, unfused=108, fused=148
> X=-128, Y=21, unfused=107, fused=149
> X=-128, Y=22, unfused=106, fused=150
> X=-128, Y=23, unfused=105, fused=151
> X=-128, Y=24, unfused=104, fused=152
> X=-128, Y=25, unfused=103, fused=153
> X=-128, Y=26, unfused=102, fused=154
> X=-128, Y=27, unfused=101, fused=155
> X=-128, Y=28, unfused=100, fused=156
> X=-128, Y=29, unfused=99, fused=157
> X=-128, Y=30, unfused=98, fused=158
> X=-128, Y=31, unfused=97, fused=159
> X=-128, Y=32, unfused=96, fused=160
> X=-128, Y=33, unfused=95, fused=161
> X=-128, Y=34, unfused=94, fused=162
> X=-128, Y=35, unfused=93, fused=163
> X=-128, Y=36, unfused=92, fused=164
> X=-128, Y=37, unfused=91, fused=165
> X=-128, Y=38, unfused=90, fused=166
> X=-128, Y=39, unfused=89, fused=167
> X=-128, Y=40, unfused=88, fused=168
> X=-128, Y=41, unfused=87, fused=169
> X=-128, Y=42, unfused=86, fused=170
> X=-128, Y=43, unfused=85, fused=171
> X=-128, Y=44, unfused=84, fused=172
> X=-128, Y=45, unfused=83, fused=173
> X=-128, Y=46, unfused=82, fused=174
> X=-128, Y=47, unfused=81, fused=175
> X=-128, Y=48, unfused=80, fused=176
> X=-128, Y=49, unfused=79, fused=177
> X=-128, Y=50, unfused=78, fused=178
> X=-128, Y=51, unfused=77, fused=179
> X=-128, Y=52, unfused=76, fused=180
> X=-128, Y=53, unfused=75, fused=181
> X=-128, Y=54, unfused=74, fused=182
> X=-128, Y=55, unfused=73, fused=183
> X=-128, Y=56, unfused=72, fused=184
> X=-128, Y=57, unfused=71, fused=185
> X=-128, Y=58, unfused=70, fused=186
> X=-128, Y=59, unfused=69, fused=187
> X=-128, Y=60, unfused=68, fused=188
> X=-128, Y=61, unfused=67, fused=189
> X=-128, Y=62, unfused=66, fused=190
> X=-128, Y=63, unfused=65, fused=191
> X=-128, Y=64, unfused=64, fused=192
> X=-128, Y=65, unfused=63, fused=193
> X=-128, Y=66, unfused=62, fused=194
> X=-128, Y=67, unfused=61, fused=195
> X=-128, Y=68, unfused=60, fused=196
> X=-128, Y=69, unfused=59, fused=197
> X=-128, Y=70, unfused=58, fused=198
> X=-128, Y=71, unfused=57, fused=199
> X=-128, Y=72, unfused=56, fused=200
> X=-128, Y=73, unfused=55, fused=201
> X=-128, Y=74, unfused=54, fused=202
> X=-128, Y=75, unfused=53, fused=203
> X=-128, Y=76, unfused=52, fused=204
> X=-128, Y=77, unfused=51, fused=205
> X=-128, Y=78, unfused=50, fused=206
> X=-128, Y=79, unfused=49, fused=207
> X=-128, Y=80, unfused=48, fused=208
> X=-128, Y=81, unfused=47, fused=209
> X=-128, Y=82, unfused=46, fused=210
> X=-128, Y=83, unfused=45, fused=211
> X=-128, Y=84, unfused=44, fused=212
> X=-128, Y=85, unfused=43, fused=213
> X=-128, Y=86, unfused=42, fused=214
> X=-128, Y=87, unfused=41, fused=215
> X=-128, Y=88, unfused=40, fused=216
> X=-128, Y=89, unfused=39, fused=217
> X=-128, Y=90, unfused=38, fused=218
> X=-128, Y=91, unfused=37, fused=219
> X=-128, Y=92, unfused=36, fused=220
> X=-128, Y=93, unfused=35, fused=221
> X=-128, Y=94, unfused=34, fused=222
> X=-128, Y=95, unfused=33, fused=223
> X=-128, Y=96, unfused=32, fused=224
> X=-128, Y=97, unfused=31, fused=225
> X=-128, Y=98, unfused=30, fused=226
> X=-128, Y=99, unfused=29, fused=227
> X=-128, Y=100, unfused=28, fused=228
> X=-128, Y=101, unfused=27, fused=229
> X=-128, Y=102, unfused=26, fused=230
> X=-128, Y=103, unfused=25, fused=231
> X=-128, Y=104, unfused=24, fused=232
> X=-128, Y=105, unfused=23, fused=233
> X=-128, Y=106, unfused=22, fused=234
> X=-128, Y=107, unfused=21, fused=235
> X=-128, Y=108, unfused=20, fused=236
> X=-128, Y=109, unfused=19, fused=237
> X=-128, Y=110, unfused=18, fused=238
> X=-128, Y=111, unfused=17, fused=239
> X=-128, Y=112, unfused=16, fused=240
> X=-128, Y=113, unfused=15, fused=241
> X=-128, Y=114, unfused=14, fused=242
> X=-128, Y=115, unfused=13, fused=243
> X=-128, Y=116, unfused=12, fused=244
> X=-128, Y=117, unfused=11, fused=245
> X=-128, Y=118, unfused=10, fused=246
> X=-128, Y=119, unfused=9, fused=247
> X=-128, Y=120, unfused=8, fused=248
> X=-128, Y=121, unfused=7, fused=249
> X=-128, Y=122, unfused=6, fused=250
> X=-128, Y=123, unfused=5, fused=251
> X=-128, Y=124, unfused=4, fused=252
> X=-128, Y=125, unfused=3, fused=253
> X=-128, Y=126, unfused=2, fused=254
> X=-128, Y=127, unfused=1, fused=255
> X=-127, Y=127, unfused=2, fused=254
> X=-126, Y=127, unfused=3, fused=253
> X=-125, Y=127, unfused=4, fused=252
> X=-124, Y=127, unfused=5, fused=251
> X=-123, Y=127, unfused=6, fused=250
> X=-122, Y=127, unfused=7, fused=249
> X=-121, Y=127, unfused=8, fused=248
> X=-120, Y=127, unfused=9, fused=247
> X=-119, Y=127, unfused=10, fused=246
> X=-118, Y=127, unfused=11, fused=245
> X=-117, Y=127, unfused=12, fused=244
> X=-116, Y=127, unfused=13, fused=243
> X=-115, Y=127, unfused=14, fused=242
> X=-114, Y=127, unfused=15, fused=241
> X=-113, Y=127, unfused=16, fused=240
> X=-112, Y=127, unfused=17, fused=239
> X=-111, Y=127, unfused=18, fused=238
> X=-110, Y=127, unfused=19, fused=237
> X=-109, Y=127, unfused=20, fused=236
> X=-108, Y=127, unfused=21, fused=235
> X=-107, Y=127, unfused=22, fused=234
> X=-106, Y=127, unfused=23, fused=233
> X=-105, Y=127, unfused=24, fused=232
> X=-104, Y=127, unfused=25, fused=231
> X=-103, Y=127, unfused=26, fused=230
> X=-102, Y=127, unfused=27, fused=229
> X=-101, Y=127, unfused=28, fused=228
> X=-100, Y=127, unfused=29, fused=227
> X=-99, Y=127, unfused=30, fused=226
> X=-98, Y=127, unfused=31, fused=225
> X=-97, Y=127, unfused=32, fused=224
> X=-96, Y=127, unfused=33, fused=223
> X=-95, Y=127, unfused=34, fused=222
> X=-94, Y=127, unfused=35, fused=221
> X=-93, Y=127, unfused=36, fused=220
> X=-92, Y=127, unfused=37, fused=219
> X=-91, Y=127, unfused=38, fused=218
> X=-90, Y=127, unfused=39, fused=217
> X=-89, Y=127, unfused=40, fused=216
> X=-88, Y=127, unfused=41, fused=215
> X=-87, Y=127, unfused=42, fused=214
> X=-86, Y=127, unfused=43, fused=213
> X=-85, Y=127, unfused=44, fused=212
> X=-84, Y=127, unfused=45, fused=211
> X=-83, Y=127, unfused=46, fused=210
> X=-82, Y=127, unfused=47, fused=209
> X=-81, Y=127, unfused=48, fused=208
> X=-80, Y=127, unfused=49, fused=207
> X=-79, Y=127, unfused=50, fused=206
> X=-78, Y=127, unfused=51, fused=205
> X=-77, Y=127, unfused=52, fused=204
> X=-76, Y=127, unfused=53, fused=203
> X=-75, Y=127, unfused=54, fused=202
> X=-74, Y=127, unfused=55, fused=201
> X=-73, Y=127, unfused=56, fused=200
> X=-72, Y=127, unfused=57, fused=199
> X=-71, Y=127, unfused=58, fused=198
> X=-70, Y=127, unfused=59, fused=197
> X=-69, Y=127, unfused=60, fused=196
> X=-68, Y=127, unfused=61, fused=195
> X=-67, Y=127, unfused=62, fused=194
> X=-66, Y=127, unfused=63, fused=193
> X=-65, Y=127, unfused=64, fused=192
> X=-64, Y=127, unfused=65, fused=191
> X=-63, Y=127, unfused=66, fused=190
> X=-62, Y=127, unfused=67, fused=189
> X=-61, Y=127, unfused=68, fused=188
> X=-60, Y=127, unfused=69, fused=187
> X=-59, Y=127, unfused=70, fused=186
> X=-58, Y=127, unfused=71, fused=185
> X=-57, Y=127, unfused=72, fused=184
> X=-56, Y=127, unfused=73, fused=183
> X=-55, Y=127, unfused=74, fused=182
> X=-54, Y=127, unfused=75, fused=181
> X=-53, Y=127, unfused=76, fused=180
> X=-52, Y=127, unfused=77, fused=179
> X=-51, Y=127, unfused=78, fused=178
> X=-50, Y=127, unfused=79, fused=177
> X=-49, Y=127, unfused=80, fused=176
> X=-48, Y=127, unfused=81, fused=175
> X=-47, Y=127, unfused=82, fused=174
> X=-46, Y=127, unfused=83, fused=173
> X=-45, Y=127, unfused=84, fused=172
> X=-44, Y=127, unfused=85, fused=171
> X=-43, Y=127, unfused=86, fused=170
> X=-42, Y=127, unfused=87, fused=169
> X=-41, Y=127, unfused=88, fused=168
> X=-40, Y=127, unfused=89, fused=167
> X=-39, Y=127, unfused=90, fused=166
> X=-38, Y=127, unfused=91, fused=165
> X=-37, Y=127, unfused=92, fused=164
> X=-36, Y=127, unfused=93, fused=163
> X=-35, Y=127, unfused=94, fused=162
> X=-34, Y=127, unfused=95, fused=161
> X=-33, Y=127, unfused=96, fused=160
> X=-32, Y=127, unfused=97, fused=159
> X=-31, Y=127, unfused=98, fused=158
> X=-30, Y=127, unfused=99, fused=157
> X=-29, Y=127, unfused=100, fused=156
> X=-28, Y=127, unfused=101, fused=155
> X=-27, Y=127, unfused=102, fused=154
> X=-26, Y=127, unfused=103, fused=153
> X=-25, Y=127, unfused=104, fused=152
> X=-24, Y=127, unfused=105, fused=151
> X=-23, Y=127, unfused=106, fused=150
> X=-22, Y=127, unfused=107, fused=149
> X=-21, Y=127, unfused=108, fused=148
> X=-20, Y=127, unfused=109, fused=147
> X=-19, Y=127, unfused=110, fused=146
> X=-18, Y=127, unfused=111, fused=145
> X=-17, Y=127, unfused=112, fused=144
> X=-16, Y=127, unfused=113, fused=143
> X=-15, Y=127, unfused=114, fused=142
> X=-14, Y=127, unfused=115, fused=141
> X=-13, Y=127, unfused=116, fused=140
> X=-12, Y=127, unfused=117, fused=139
> X=-11, Y=127, unfused=118, fused=138
> X=-10, Y=127, unfused=119, fused=137
> X=-9, Y=127, unfused=120, fused=136
> X=-8, Y=127, unfused=121, fused=135
> X=-7, Y=127, unfused=122, fused=134
> X=-6, Y=127, unfused=123, fused=133
> X=-5, Y=127, unfused=124, fused=132
> X=-4, Y=127, unfused=125, fused=131
> X=-3, Y=127, unfused=126, fused=130
> X=-2, Y=127, unfused=127, fused=129
>
>
> —Owen
>
>> On Jan 6, 2015, at 1:51 AM, KARTHIK VENKATESH BHAT wrote:
>>
>> Hi Owen,
>> Thanks for the input. Unfortunately, I'm unable to check this as I don't have an AArch64 target.
>> But my reference was GCC, which generated similar assembly (e.g. for the sample test cases submitted in the patch).
>> I checked the AArch64 instruction manual; it doesn't seem to specify anything about the overflow case for the sabd instruction.
>> Maybe we need to check the output with an overflowing input case to confirm the behavior on an AArch64 target?
>>
>> Thanks and Regards
>> Karthik Bhat
>>
>>
>> ------- Original Message -------
>> Sender : Owen Anderson
>> Date : Jan 06, 2015 15:56 (GMT+09:00)
>> Title : Re: [llvm] r225165 - Select lower sub,abs pattern to sabd on AArch64
>>
>> I’m pretty sure these two patterns aren’t equivalent in the case where the original subtraction could have caused an overflow.
>>
>> —Owen
>>
>>> On Jan 5, 2015, at 7:11 AM, Karthik Bhat wrote:
>>>
>>> Author: karthik
>>> Date: Mon Jan 5 07:11:07 2015
>>> New Revision: 225165
>>>
>>> URL: http://llvm.org/viewvc/llvm-project?rev=225165&view=rev
>>> Log:
>>> Select lower sub,abs pattern to sabd on AArch64
>>>
>>> This patch lowers patterns such as-
>>> sub v0.4s, v0.4s, v1.4s
>>> abs v0.4s, v0.4s
>>> to
>>> sabd v0.4s, v0.4s, v1.4s
>>> on AArch64.
>>>
>>> Review: http://reviews.llvm.org/D6781
>>>
>>>
>>> Added:
>>> llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-vabs.ll
>>> Modified:
>>> llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
>>>
>>> Modified: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
>>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td?rev=225165&r1=225164&r2=225165&view=diff
>>> ==============================================================================
>>> --- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td (original)
>>> +++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td Mon Jan 5 07:11:07 2015
>>> @@ -2733,6 +2733,33 @@ defm ORN : SIMDLogicalThreeVector<0, 0b1
>>> BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
>>> defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
>>>
>>> +// SABD Vd.<T>, Vn.<T>, Vm.<T> Subtracts the elements of Vm from the corresponding
>>> +// elements of Vn, and places the absolute values of the results in the elements of Vd.
>>> +def : Pat<(xor (v8i8 (AArch64vashr (v8i8(sub V64:$Rn, V64:$Rm)), (i32 7))),
>>> + (v8i8 (add (v8i8(sub V64:$Rn, V64:$Rm)),
>>> + (AArch64vashr (v8i8(sub V64:$Rn, V64:$Rm)), (i32 7))))),
>>> + (SABDv8i8 V64:$Rn, V64:$Rm)>;
>>> +def : Pat<(xor (v4i16 (AArch64vashr (v4i16(sub V64:$Rn, V64:$Rm)), (i32 15))),
>>> + (v4i16 (add (v4i16(sub V64:$Rn, V64:$Rm)),
>>> + (AArch64vashr (v4i16(sub V64:$Rn, V64:$Rm)), (i32 15))))),
>>> + (SABDv4i16 V64:$Rn, V64:$Rm)>;
>>> +def : Pat<(xor (v2i32 (AArch64vashr (v2i32(sub V64:$Rn, V64:$Rm)), (i32 31))),
>>> + (v2i32 (add (v2i32(sub V64:$Rn, V64:$Rm)),
>>> + (AArch64vashr (v2i32(sub V64:$Rn, V64:$Rm)), (i32 31))))),
>>> + (SABDv2i32 V64:$Rn, V64:$Rm)>;
>>> +def : Pat<(xor (v16i8 (AArch64vashr (v16i8(sub V128:$Rn, V128:$Rm)), (i32 7))),
>>> + (v16i8 (add (v16i8(sub V128:$Rn, V128:$Rm)),
>>> + (AArch64vashr (v16i8(sub V128:$Rn, V128:$Rm)), (i32 7))))),
>>> + (SABDv16i8 V128:$Rn, V128:$Rm)>;
>>> +def : Pat<(xor (v8i16 (AArch64vashr (v8i16(sub V128:$Rn, V128:$Rm)), (i32 15))),
>>> + (v8i16 (add (v8i16(sub V128:$Rn, V128:$Rm)),
>>> + (AArch64vashr (v8i16(sub V128:$Rn, V128:$Rm)), (i32 15))))),
>>> + (SABDv8i16 V128:$Rn, V128:$Rm)>;
>>> +def : Pat<(xor (v4i32 (AArch64vashr (v4i32(sub V128:$Rn, V128:$Rm)), (i32 31))),
>>> + (v4i32 (add (v4i32(sub V128:$Rn, V128:$Rm)),
>>> + (AArch64vashr (v4i32(sub V128:$Rn, V128:$Rm)), (i32 31))))),
>>> + (SABDv4i32 V128:$Rn, V128:$Rm)>;
>>> +
>>> def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
>>> (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
>>> def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
>>>
>>> Added: llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-vabs.ll
>>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-vabs.ll?rev=225165&view=auto
>>> ==============================================================================
>>> --- llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-vabs.ll (added)
>>> +++ llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-vabs.ll Mon Jan 5 07:11:07 2015
>>> @@ -0,0 +1,101 @@
>>> +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
>>> +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
>>> +target triple = "aarch64--linux-gnu"
>>> +
>>> +; CHECK: testv4i32
>>> +; CHECK: sabd v0.4s, v0.4s, v1.4s
>>> +define void @testv4i32(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){
>>> + %1 = bitcast i32* %b to <4 x i32>*
>>> + %2 = load <4 x i32>* %1, align 4
>>> + %3 = bitcast i32* %c to <4 x i32>*
>>> + %4 = load <4 x i32>* %3, align 4
>>> + %5 = sub nsw <4 x i32> %2, %4
>>> + %6 = icmp sgt <4 x i32> %5, <i32 -1, i32 -1, i32 -1, i32 -1>
>>> + %7 = sub <4 x i32> zeroinitializer, %5
>>> + %8 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> %7
>>> + %9 = bitcast i32* %a to <4 x i32>*
>>> + store <4 x i32> %8, <4 x i32>* %9, align 4
>>> + ret void
>>> +}
>>> +
>>> +; CHECK: testv2i32
>>> +; CHECK: sabd v0.2s, v0.2s, v1.2s
>>> +define void @testv2i32(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){
>>> + %1 = bitcast i32* %b to <2 x i32>*
>>> + %2 = load <2 x i32>* %1, align 4
>>> + %3 = bitcast i32* %c to <2 x i32>*
>>> + %4 = load <2 x i32>* %3, align 4
>>> + %5 = sub nsw <2 x i32> %2, %4
>>> + %6 = icmp sgt <2 x i32> %5, <i32 -1, i32 -1>
>>> + %7 = sub <2 x i32> zeroinitializer, %5
>>> + %8 = select <2 x i1> %6, <2 x i32> %5, <2 x i32> %7
>>> + %9 = bitcast i32* %a to <2 x i32>*
>>> + store <2 x i32> %8, <2 x i32>* %9, align 4
>>> + ret void
>>> +}
>>> +
>>> +; CHECK: testv8i16
>>> +; CHECK: sabd v0.8h, v0.8h, v1.8h
>>> +define void @testv8i16(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c){
>>> + %1 = bitcast i16* %b to <8 x i16>*
>>> + %2 = load <8 x i16>* %1, align 4
>>> + %3 = bitcast i16* %c to <8 x i16>*
>>> + %4 = load <8 x i16>* %3, align 4
>>> + %5 = sub nsw <8 x i16> %2, %4
>>> + %6 = icmp sgt <8 x i16> %5, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
>>> + %7 = sub <8 x i16> zeroinitializer, %5
>>> + %8 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> %7
>>> + %9 = bitcast i16* %a to <8 x i16>*
>>> + store <8 x i16> %8, <8 x i16>* %9, align 4
>>> + ret void
>>> +}
>>> +
>>> +; CHECK: testv4i16
>>> +; CHECK: sabd v0.4h, v0.4h, v1.4h
>>> +define void @testv4i16(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c){
>>> + %1 = bitcast i16* %b to <4 x i16>*
>>> + %2 = load <4 x i16>* %1, align 4
>>> + %3 = bitcast i16* %c to <4 x i16>*
>>> + %4 = load <4 x i16>* %3, align 4
>>> + %5 = sub nsw <4 x i16> %2, %4
>>> + %6 = icmp sgt <4 x i16> %5, <i16 -1, i16 -1, i16 -1, i16 -1>
>>> + %7 = sub <4 x i16> zeroinitializer, %5
>>> + %8 = select <4 x i1> %6, <4 x i16> %5, <4 x i16> %7
>>> + %9 = bitcast i16* %a to <4 x i16>*
>>> + store <4 x i16> %8, <4 x i16>* %9, align 4
>>> + ret void
>>> +}
>>> +
>>> +
>>> +; CHECK: testv16i8
>>> +; CHECK: sabd v0.16b, v0.16b, v1.16b
>>> +define void @testv16i8(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c){
>>> + %1 = bitcast i8* %b to <16 x i8>*
>>> + %2 = load <16 x i8>* %1, align 4
>>> + %3 = bitcast i8* %c to <16 x i8>*
>>> + %4 = load <16 x i8>* %3, align 4
>>> + %5 = sub nsw <16 x i8> %2, %4
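>>> + %6 = icmp sgt <16 x i8> %5, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>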
>>> + %7 = sub <16 x i8> zeroinitializer, %5
>>> + %8 = select <16 x i1> %6, <16 x i8> %5, <16 x i8> %7
>>> + %9 = bitcast i8* %a to <16 x i8>*
>>> + store <16 x i8> %8, <16 x i8>* %9, align 4
>>> + ret void
>>> +}
>>> +
>>> +; CHECK: testv8i8
>>> +; CHECK: sabd v0.8b, v0.8b, v1.8b
>>> +define void @testv8i8(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c){
>>> + %1 = bitcast i8* %b to <8 x i8>*
>>> + %2 = load <8 x i8>* %1, align 4
>>> + %3 = bitcast i8* %c to <8 x i8>*
>>> + %4 = load <8 x i8>* %3, align 4
>>> + %5 = sub nsw <8 x i8> %2, %4
>>> + %6 = icmp sgt <8 x i8> %5, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
>>> + %7 = sub <8 x i8> zeroinitializer, %5
>>> + %8 = select <8 x i1> %6, <8 x i8> %5, <8 x i8> %7
>>> + %9 = bitcast i8* %a to <8 x i8>*
>>> + store <8 x i8> %8, <8 x i8>* %9, align 4
>>> + ret void
>>> +}
>>> +
>>>
>>>
>>> _______________________________________________
>>> llvm-commits mailing list
>>> llvm-commits at cs.uiuc.edu
>>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
More information about the llvm-commits
mailing list