[LoongArch] Optimize inserting extracted elements#146018
Conversation
|
@llvm/pr-subscribers-backend-loongarch Author: ZhaoQi (zhaoqi5) ChangesFull diff: https://github.com/llvm/llvm-project/pull/146018.diff 5 Files Affected:
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index ff7b0f2ae3f25..915dc803bdbd7 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1593,11 +1593,14 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
(XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
(XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;
-
-def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
- (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
-def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
- (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
+def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2),
+ (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>;
+def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2),
+ (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>;
+def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm),
+ (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm),
+ (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
// scalar_to_vector
def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)),
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index d73d78083ddcd..34c6ffc6727f1 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1791,7 +1791,10 @@ def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm),
(VINSGR2VR_W v4i32:$vd, GRLenVT:$rj, uimm2:$imm)>;
def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm),
(VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>;
-
+def : Pat<(vector_insert v4f32:$vd, (f32 (vector_extract v4f32:$vj, uimm2:$imm1)), uimm2:$imm2),
+ (VINSGR2VR_W $vd, (VPICKVE2GR_W v4f32:$vj, uimm2:$imm1), uimm2:$imm2)>;
+def : Pat<(vector_insert v2f64:$vd, (f64 (vector_extract v2f64:$vj, uimm1:$imm1)), uimm1:$imm2),
+ (VINSGR2VR_D $vd, (VPICKVE2GR_D v2f64:$vj, uimm1:$imm1), uimm1:$imm2)>;
def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm),
(VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>;
def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm),
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
index f3bec11810e9b..f154dd3b8eb3c 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
@@ -7,20 +7,12 @@ define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: shufflevector_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT: movgr2fr.d $fa2, $a0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 2
-; CHECK-NEXT: movgr2fr.d $fa3, $a0
-; CHECK-NEXT: movfr2gr.d $a0, $fa2
; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa3
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 2
; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 1
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT: movgr2fr.d $fa0, $a0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 3
-; CHECK-NEXT: movgr2fr.d $fa1, $a0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 2
-; CHECK-NEXT: movfr2gr.d $a0, $fa1
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 3
; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 3
; CHECK-NEXT: xvori.b $xr0, $xr2, 0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
index a5d3a0d395b3c..ddbc159ca94ba 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
@@ -5,8 +5,6 @@ define <8 x float> @insert_extract_v8f32(<8 x float> %a) nounwind {
; CHECK-LABEL: insert_extract_v8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7
-; CHECK-NEXT: movgr2fr.w $fa1, $a0
-; CHECK-NEXT: movfr2gr.s $a0, $fa1
; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1
; CHECK-NEXT: ret
entry:
@@ -19,8 +17,6 @@ define <4 x double> @insert_extract_v4f64(<4 x double> %a) nounwind {
; CHECK-LABEL: insert_extract_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT: movgr2fr.d $fa1, $a0
-; CHECK-NEXT: movfr2gr.d $a0, $fa1
; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
index dcf23f0240712..4c34e0f49b8c8 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
@@ -4,8 +4,7 @@
define <4 x float> @insert_extract_v4f32(<4 x float> %a) nounwind {
; CHECK-LABEL: insert_extract_v4f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 3
-; CHECK-NEXT: movfr2gr.s $a0, $fa1
+; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 3
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -17,8 +16,7 @@ entry:
define <2 x double> @insert_extract_v2f64(<2 x double> %a) nounwind {
; CHECK-LABEL: insert_extract_v2f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT: movfr2gr.d $a0, $fa1
+; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
|
|
We could use |
Great, using one |
85cb5e9 to
aab3fee
Compare
We could also support |
|
|
Use this patch to support extract i8/i16 type element from hi128 part of 256bit vector. |
aab3fee to
00a0512
Compare
Done. Thanks for your efforts on this. |
|
This change improves vector_insert for lasx --> |
The purpose of this pr is mainly to optimize inserting extracted elements from 128 or 256 bits vector. I think we can do this optimization for |
OK |
No description provided.