llvm-project/llvm/test/Bitcode/aarch64-bf16-upgrade.ll
Mikhail Maltsev ae1396c7d4 [ARM][BFloat16] Change types of some Arm and AArch64 bf16 intrinsics
This patch adjusts the following ARM/AArch64 LLVM IR intrinsics:
- neon_bfmmla
- neon_bfmlalb
- neon_bfmlalt
so that they take and return bf16 and float types. Previously these
intrinsics used <8 x i8> and <4 x i8> vectors (a holdover from the
implementation that predates the bf16 IR type).

The neon_vbfdot[q] intrinsics are adjusted similarly. This change
required some additional selection patterns for vbfdot itself and
also for vector shuffles (in a previous patch) because of SelectionDAG
transformations kicking in and mangling the original code.

This patch makes the generated IR cleaner (fewer useless bitcasts are
produced), but it does not affect the final assembly.

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D86146
2020-08-27 18:43:16 +01:00

76 lines
4.3 KiB
LLVM

; RUN: llvm-dis < %s.bc | FileCheck %s
; Bitcode was generated from file below
; bfdot, 64-bit form (<2 x float> accumulator, <4 x bfloat> operands).
; The source IR below hands the two bfloat operands to
; @llvm.aarch64.neon.bfdot.v2f32.v8i8 as <8 x i8> vectors. The FileCheck
; directives describe what the disassembled bitcode must contain instead:
; the <8 x i8> values are bitcast back to <4 x bfloat> (as %2 and %3) and the
; call targets the .v2f32.v4bf16 variant with bfloat-vector operands.
define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: @test_vbfdot_f32
entry:
; Reinterpret both bfloat inputs as byte vectors, matching the i8-based
; signature of the intrinsic called in this source.
%0 = bitcast <4 x bfloat> %a to <8 x i8>
%1 = bitcast <4 x bfloat> %b to <8 x i8>
; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat>
; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat>
; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3)
; Call as written in the source, using the <8 x i8> operand signature.
%vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
ret <2 x float> %vbfdot1.i
}
; bfdot, 128-bit form (<4 x float> accumulator, <8 x bfloat> operands).
; The source IR calls @llvm.aarch64.neon.bfdot.v4f32.v16i8 with <16 x i8>
; operands. The FileCheck directives require the disassembled bitcode to
; bitcast those <16 x i8> values back to <8 x bfloat> (as %2 and %3) and to
; call the .v4f32.v8bf16 variant instead.
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: @test_vbfdotq_f32
entry:
; Reinterpret both bfloat inputs as byte vectors for the i8-based signature.
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %b to <16 x i8>
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
; Call as written in the source, using the <16 x i8> operand signature.
%vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfdot1.i
}
; bfmmla (<4 x float> accumulator, <8 x bfloat> operands).
; The source IR calls @llvm.aarch64.neon.bfmmla.v4f32.v16i8 with <16 x i8>
; operands. The FileCheck directives require the disassembled bitcode to
; bitcast the operands back to <8 x bfloat> (as %2 and %3) and to call the
; un-suffixed @llvm.aarch64.neon.bfmmla with bfloat-vector operands.
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: @test_vbfmmlaq_f32
entry:
; Reinterpret both bfloat inputs as byte vectors for the i8-based signature.
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %b to <16 x i8>
; Call as written in the source; the directives that follow give the form
; expected in the llvm-dis output.
%vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
ret <4 x float> %vbfmmla1.i
}
; bfmlalb with a lane-broadcast second operand.
; The source IR calls @llvm.aarch64.neon.bfmlalb.v4f32.v16i8 with <16 x i8>
; operands. The FileCheck directives require the disassembled bitcode to
; bitcast the operands back to <8 x bfloat> (as %2 and %3) and to call the
; un-suffixed @llvm.aarch64.neon.bfmlalb with bfloat-vector operands.
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: @test_vbfmlalbq_laneq_f32
entry:
; Every shuffle-mask index is 3, so all eight result lanes are element 3 of %b.
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; Reinterpret %a and the broadcast vector as byte vectors for the i8-based
; signature.
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
; Call as written in the source; the directives that follow give the form
; expected in the llvm-dis output.
%vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
ret <4 x float> %vbfmlalb1.i
}
; bfmlalt with a lane-broadcast second operand.
; The source IR calls @llvm.aarch64.neon.bfmlalt.v4f32.v16i8 with <16 x i8>
; operands. The FileCheck directives require the disassembled bitcode to
; bitcast the operands back to <8 x bfloat> (as %2 and %3) and to call the
; un-suffixed @llvm.aarch64.neon.bfmlalt with bfloat-vector operands.
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: @test_vbfmlaltq_laneq_f32
entry:
; Every shuffle-mask index is 3, so all eight result lanes are element 3 of %b.
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; Reinterpret %a and the broadcast vector as byte vectors for the i8-based
; signature.
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
; Call as written in the source; the directives that follow give the form
; expected in the llvm-dis output.
%vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
ret <4 x float> %vbfmlalt1.i
}
; Intrinsic declarations. Each `declare` line is the i8-vector form used by
; the source IR above; the FileCheck directive right after it gives the
; declaration expected in the llvm-dis output, with bfloat-vector operands.
; The directives are matched in the order listed here.

; bfdot: the expected name keeps an overload suffix, changing from .v8i8 /
; .v16i8 to .v4bf16 / .v8bf16.
declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
; CHECK: declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
; bfmmla / bfmlalb / bfmlalt: the expected name drops the .v4f32.v16i8 suffix.
declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)