[RISCV][NFC] Add test case for SLP reduction vectorization failure

Horizontal reductions are still vectorized on RISC-V, even though TTI reports
a maximum SLP VF of 1 in order to disable SLP.
This can cause the cost model to think it can vectorize a gather into
smaller, widened loads, when it will actually fail to do so.
This should ultimately be fixed when SLP is re-enabled for RISC-V.

Reviewed By: reames

Differential Revision: https://reviews.llvm.org/D146529
This commit is contained in:
Luke Lau 2023-03-21 14:46:15 +00:00
parent 67852bff58
commit e69f8bac42

View File

@ -821,4 +821,96 @@ entry:
ret i64 %add.15
}
; Declaration of the i32 absolute-value intrinsic called by the scalar test
; bodies; the callers visible in this file pass `i1 true` as the second operand.
declare i32 @llvm.abs.i32(i32, i1)
; FIXME: This horizontal reduction occurs because the cost model thinks it can
; vectorize the loads here. However, because -riscv-v-slp-max-vf is set to 1 by
; default, tryToVectorizeList fails and we end up with this very expensive
; scalarized load.
;
; This is the code the cost model thinks it's going to generate, which you can
; get by passing -riscv-v-slp-max-vf=0
;
; define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) #0 {
; %p.2 = getelementptr inbounds i32, ptr %p, i64 %stride
; %q.2 = getelementptr inbounds i32, ptr %q, i64 %stride
; %p.3 = getelementptr inbounds i32, ptr %p.2, i64 1
; %q.3 = getelementptr inbounds i32, ptr %q.2, i64 1
; %1 = load <2 x i32>, ptr %p, align 4
; %2 = load <2 x i32>, ptr %q, align 4
; %x.2 = load i32, ptr %p.2, align 4
; %y.2 = load i32, ptr %q.2, align 4
; %x.3 = load i32, ptr %p.3, align 4
; %y.3 = load i32, ptr %q.3, align 4
; %3 = shufflevector <2 x i32> %1, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; %4 = insertelement <4 x i32> %3, i32 %x.2, i32 2
; %5 = insertelement <4 x i32> %4, i32 %x.3, i32 3
; %6 = shufflevector <2 x i32> %2, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; %7 = insertelement <4 x i32> %6, i32 %y.2, i32 2
; %8 = insertelement <4 x i32> %7, i32 %y.3, i32 3
; %9 = sub <4 x i32> %5, %8
; %10 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %9, i1 true)
; %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %10)
; ret i32 %11
; }
define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) {
; CHECK-LABEL: @stride_sum_abs_diff(
; CHECK-NEXT: [[P_1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
; CHECK-NEXT: [[Q_1:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 1
; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[STRIDE:%.*]]
; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[STRIDE]]
; CHECK-NEXT: [[P_3:%.*]] = getelementptr inbounds i32, ptr [[P_2]], i64 1
; CHECK-NEXT: [[Q_3:%.*]] = getelementptr inbounds i32, ptr [[Q_2]], i64 1
; CHECK-NEXT: [[X_0:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NEXT: [[Y_0:%.*]] = load i32, ptr [[Q]], align 4
; CHECK-NEXT: [[X_1:%.*]] = load i32, ptr [[P_1]], align 4
; CHECK-NEXT: [[Y_1:%.*]] = load i32, ptr [[Q_1]], align 4
; CHECK-NEXT: [[X_2:%.*]] = load i32, ptr [[P_2]], align 4
; CHECK-NEXT: [[Y_2:%.*]] = load i32, ptr [[Q_2]], align 4
; CHECK-NEXT: [[X_3:%.*]] = load i32, ptr [[P_3]], align 4
; CHECK-NEXT: [[Y_3:%.*]] = load i32, ptr [[Q_3]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X_0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X_1]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X_2]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X_3]], i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[Y_0]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[Y_1]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[Y_2]], i32 2
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[Y_3]], i32 3
; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT: ret i32 [[TMP11]]
;
; Scalar input: sums the absolute differences of four i32 pairs read from %p
; and %q at element indices 0, 1, %stride and %stride + 1.
;
; The expected output above keeps all eight loads scalar and assembles them
; into two <4 x i32> vectors with insertelement chains before the vector
; sub / abs / reduce.add.

; Pair 0: elements at %p and %q themselves.
%x.0 = load i32, ptr %p
%y.0 = load i32, ptr %q
%sub.0 = sub i32 %x.0, %y.0
%abs.0 = tail call i32 @llvm.abs.i32(i32 %sub.0, i1 true)
; Pair 1: the elements immediately following pair 0 (index 1).
%p.1 = getelementptr inbounds i32, ptr %p, i64 1
%x.1 = load i32, ptr %p.1
%q.1 = getelementptr inbounds i32, ptr %q, i64 1
%y.1 = load i32, ptr %q.1
%sub.1 = sub i32 %x.1, %y.1
%abs.1 = tail call i32 @llvm.abs.i32(i32 %sub.1, i1 true)
%sum.0 = add i32 %abs.0, %abs.1
; Pair 2: elements at index %stride, so their distance from pairs 0 and 1 is
; not a compile-time constant.
%p.2 = getelementptr inbounds i32, ptr %p, i64 %stride
%q.2 = getelementptr inbounds i32, ptr %q, i64 %stride
%x.2 = load i32, ptr %p.2
%y.2 = load i32, ptr %q.2
%sub.2 = sub i32 %x.2, %y.2
%abs.2 = tail call i32 @llvm.abs.i32(i32 %sub.2, i1 true)
%sum.1 = add i32 %sum.0, %abs.2
; Pair 3: the elements immediately following pair 2 (index %stride + 1).
%p.3 = getelementptr inbounds i32, ptr %p.2, i64 1
%x.3 = load i32, ptr %p.3
%q.3 = getelementptr inbounds i32, ptr %q.2, i64 1
%y.3 = load i32, ptr %q.3
%sub.3 = sub i32 %x.3, %y.3
%abs.3 = tail call i32 @llvm.abs.i32(i32 %sub.3, i1 true)
; Final link of the scalar add chain; its result is the return value.
%sum.2 = add i32 %sum.1, %abs.3
ret i32 %sum.2
}