Skip to content

Commit 320a28f

Browse files
committed
Use VU0_VECTORs and ASM code
1 parent d67a4b8 commit 320a28f

File tree

13 files changed

+2456
-920
lines changed

13 files changed

+2456
-920
lines changed

.github/workflows/compilation.yml

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,4 +19,12 @@ jobs:
1919
2020
- name: Compile project
2121
run: |
22-
make clean all install
22+
make -j $(getconf _NPROCESSORS_ONLN) clean
23+
make -j $(getconf _NPROCESSORS_ONLN) all
24+
make -j $(getconf _NPROCESSORS_ONLN) install
25+
26+
- name: Compile tests
27+
run: |
28+
cd tests
29+
make -j $(getconf _NPROCESSORS_ONLN) clean
30+
make -j $(getconf _NPROCESSORS_ONLN) all

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,3 +3,4 @@ objs_*
33
prebuilddone
44
*.o
55
*.a
6+
*.elf

Makefile

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -11,9 +11,6 @@ endif
1111
# Disabling warnings
1212
WARNING_FLAGS = -Wno-strict-aliasing -Wno-conversion-null
1313

14-
# VU0 code is broken so disable for now
15-
EE_CFLAGS += $(WARNING_FLAGS) -DNO_VU0_VECTORS -DNO_ASM
16-
EE_CXXFLAGS += $(WARNING_FLAGS) -DNO_VU0_VECTORS -DNO_ASM
1714

1815
EE_OBJS = \
1916
src/core.o \

Makefile.builds

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,6 @@ perf_INCDIRS := $(incdirs)
4141

4242
### native, no vu0 vectors ###
4343

44-
defines := NO_VU0_VECTORS
4544
optflags := -ffast-math -O2
4645

4746
# debug_no_vu0
@@ -69,7 +68,7 @@ release_no_vu0_INCDIRS := $(incdirs)
6968
incdirs := $(PS2STUFF)/linux/kernel_module
7069
libdirs := /usr/lib
7170
debug_flags := -D_DEBUG -g
72-
defines := PS2_LINUX NO_VU0_VECTORS
71+
defines := PS2_LINUX
7372
optflags := -ffast-math -O2
7473

7574
# linux
@@ -104,7 +103,7 @@ linux_release_PLATFORM := linux
104103
# cross_linux
105104
BUILDNAMES += cross_linux
106105
cross_linux_INCDIRS := $(incdirs)
107-
cross_linux_DEFINES := PS2_LINUX NO_VU0_VECTORS
106+
cross_linux_DEFINES := PS2_LINUX
108107
cross_linux_DEBUGFLAGS := -D_DEBUG -g
109108
cross_linux_OPTFLAGS := -ffast-math -O2
110109
cross_linux_PLATFORM := linux_cross

include/ps2s/cpu_vector.h

Lines changed: 16 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -70,8 +70,8 @@ class cpu_vec_3 {
7070
"mtc1 _temp0, _z # z = value.z \n"
7171

7272
".endif \n"
73-
: "=&r,f _x"(x), "=&r,f _y"(y), "=&r,f _z"(z), "=&r,&r _temp0"(temp0)
74-
: "r,r _vec"(vec));
73+
: "=&r,&f"(x), "=&r,&f"(y), "=&r,&f"(z), "=&r"(temp0)
74+
: "r"(vec));
7575
}
7676

7777
explicit inline cpu_vec_3(const vec_3 vec) { set(vec.vec128); }
@@ -215,8 +215,8 @@ class cpu_vec_4 {
215215
"mtc1 _temp0, _w # w = value.w \n"
216216

217217
".endif \n"
218-
: "=&r,f _x"(x), "=&r,f _y"(y), "=&r,f _z"(z), "=&r,f _w"(w), "=&r,&r _temp0"(temp0)
219-
: "r,r _vec"(vec));
218+
: "=&r,&f"(x), "=&r,&f"(y), "=&r,&f"(z), "=&r,&f"(w), "=&r"(temp0)
219+
: "r"(vec));
220220
}
221221

222222
explicit inline cpu_vec_4(const vec_4 vec) { set(vec.vec128); }
@@ -335,12 +335,12 @@ cpu_vec_3::operator+(const cpu_vec_3& vec)
335335
#else
336336

337337
asm(" ### cpu_vec_3 + cpu_vec_3 ### \n"
338-
"add.s rx, v0x, v1x \n"
339-
"add.s ry, v0y, v1y \n"
340-
"add.s rz, v0z, v1z \n"
341-
: "=&f rx"(result.x), "=&f ry"(result.y), "=&f rz"(result.z)
342-
: "f v0x"(x), "f v0y"(y), "f v0z"(z),
343-
"f v1x"(vec.x), "f v1y"(vec.y), "f v1z"(vec.z));
338+
"add.s %[rx], %[v0x], %[v1x] \n"
339+
"add.s %[ry], %[v0y], %[v1y] \n"
340+
"add.s %[rz], %[v0z], %[v1z] \n"
341+
: [rx] "=&f"(result.x), [ry] "=&f"(result.y), [rz] "=&f"(result.z)
342+
: [v0x] "f"(x), [v0y] "f"(y), [v0z] "f"(z),
343+
[v1x] "f"(vec.x), [v1y] "f"(vec.y), [v1z] "f"(vec.z));
344344

345345
#endif
346346
return result;
@@ -359,12 +359,12 @@ cpu_vec_3::operator-(const cpu_vec_3& vec)
359359
#else
360360

361361
asm(" ### cpu_vec_3 - cpu_vec_3 ### \n"
362-
"sub.s rx, v0x, v1x \n"
363-
"sub.s ry, v0y, v1y \n"
364-
"sub.s rz, v0z, v1z \n"
365-
: "=&f rx"(result.x), "=&f ry"(result.y), "=&f rz"(result.z)
366-
: "f v0x"(x), "f v0y"(y), "f v0z"(z),
367-
"f v1x"(vec.x), "f v1y"(vec.y), "f v1z"(vec.z));
362+
"sub.s %[rx], %[v0x], %[v1x] \n"
363+
"sub.s %[ry], %[v0y], %[v1y] \n"
364+
"sub.s %[rz], %[v0z], %[v1z] \n"
365+
: [rx] "=&f"(result.x), [ry] "=&f"(result.y), [rz] "=&f"(result.z)
366+
: [v0x] "f"(x), [v0y] "f"(y), [v0z] "f"(z),
367+
[v1x] "f"(vec.x), [v1y] "f"(vec.y), [v1z] "f"(vec.z));
368368

369369
#endif
370370
return result;

include/ps2s/matrix.h

Lines changed: 121 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -51,24 +51,39 @@ class mat_33 : public mat_x3_template<vec_3> {
5151

5252
void set_identity()
5353
{
54-
asm(" ### mat_33::set_identity ### \n"
55-
"vsub %[col0], %[col0], %[col0] \n"
56-
"vsub %[col1], %[col1], %[col1] \n"
57-
"vmr32 %[col2], vf00 \n"
58-
"vaddw.x %[col0], vf00, vf00 \n"
59-
"vaddw.y %[col1], vf00, vf00 \n"
60-
: [col0] "=j"(col0),
61-
[col1] "=j"(col1),
62-
[col2] "=j"(col2));
54+
asm __volatile__(
55+
"vaddw.x $vf1, $vf0, $vf0 \n"
56+
"vaddw.y $vf2, $vf0, $vf0 \n"
57+
"vmr32 $vf3, $vf0 \n"
58+
"qmfc2 %[col0], $vf1 \n"
59+
"qmfc2 %[col1], $vf2 \n"
60+
"qmfc2 %[col2], $vf3 \n"
61+
: [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128)
62+
:
63+
: "memory");
6364
}
6465

6566
void
6667
set_scale(vec_3 scale)
6768
{
6869
set_zero();
69-
col0 = (vec_x)scale;
70-
col1 = (vec_y)scale;
71-
col2 = (vec_z)scale;
70+
// Extract components and set diagonal elements using VU0
71+
float sx = (float)vec_x(scale);
72+
float sy = (float)vec_y(scale);
73+
float sz = (float)vec_z(scale);
74+
asm __volatile__(
75+
"ctc2 %[sx], $vi21 \n"
76+
"vaddi.x $vf1, $vf0, $I \n"
77+
"ctc2 %[sy], $vi21 \n"
78+
"vaddi.y $vf2, $vf0, $I \n"
79+
"ctc2 %[sz], $vi21 \n"
80+
"vaddi.z $vf3, $vf0, $I \n"
81+
"qmfc2 %[col0], $vf1 \n"
82+
"qmfc2 %[col1], $vf2 \n"
83+
"qmfc2 %[col2], $vf3 \n"
84+
: [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128)
85+
: [sx] "r"(sx), [sy] "r"(sy), [sz] "r"(sz)
86+
: "memory");
7287
}
7388

7489
void
@@ -219,25 +234,26 @@ class mat_33 : public mat_x3_template<vec_3> {
219234
vec128_t result, temp0, temp1, temp2, ones;
220235

221236
asm("### mat_33 trans_mult vec_3 ### \n"
222-
"vmul %[temp0], col0], %[vec] \n"
223-
"vmaxw %[ones], vf00, vf00 \n"
224-
"vmul %[temp1], %[col1], %[vec] \n"
225-
"vmul %[temp2], %[col2], %[vec] \n"
226-
"vadday.x ACC, %[temp0], %[temp0] \n"
227-
"vmaddz.x %[result], %[ones], %[temp0] \n"
228-
"vaddax.y ACC, %[temp1], %[temp1] \n"
229-
"vmaddz.y %[result], %[ones], %[temp1] \n"
230-
"vaddax.z ACC, %[temp2], %[temp2] \n"
231-
"vmaddy.z %[result], %[ones], %[temp2] \n"
232-
: [result] "=&j"(result),
233-
[temp0] "=&j"(temp0),
234-
[temp1] "=&j"(temp1),
235-
[temp2] "=&j"(temp2),
236-
[ones] "=&j"(ones), "=r"(vu0_ACC)
237-
: "j col0"(col0),
238-
"j col1"(col1),
239-
"j col2"(col2),
240-
"j vec"(vec));
237+
"qmtc2 %[col0], $vf10 \n"
238+
"qmtc2 %[col1], $vf11 \n"
239+
"qmtc2 %[col2], $vf12 \n"
240+
"qmtc2 %[vec], $vf13 \n"
241+
"vmul $vf14, $vf10, $vf13 \n"
242+
"vmaxw $vf15, $vf0, $vf0 \n"
243+
"vmul $vf16, $vf11, $vf13 \n"
244+
"vmul $vf17, $vf12, $vf13 \n"
245+
"vadday.x $ACC, $vf14, $vf14 \n"
246+
"vmaddz.x $vf14, $vf15, $vf14 \n"
247+
"vaddax.y $ACC, $vf16, $vf16 \n"
248+
"vmaddz.y $vf14, $vf15, $vf16 \n"
249+
"vaddax.z $ACC, $vf17, $vf17 \n"
250+
"vmaddy.z $vf14, $vf15, $vf17 \n"
251+
"qmfc2 %[result], $vf14 \n"
252+
: [result] "=&r"(result), "=r"(vu0_ACC)
253+
: [col0] "r"(col0.vec128),
254+
[col1] "r"(col1.vec128),
255+
[col2] "r"(col2.vec128),
256+
[vec] "r"(vec.vec128));
241257

242258
return vec_3(result);
243259
}
@@ -422,7 +438,7 @@ class mat_43 : public mat_x3_template<vec_4> {
422438
"vadday.x ACC, %[temp0], %[temp0] \n"
423439
"vmaddz.x %[result], %[ones], %[temp0] \n"
424440
"vaddax.y ACC, %[temp1], %[temp1] \n"
425-
"vmaddz.y %[result, %[ones], %[temp1] \n"
441+
"vmaddz.y %[result], %[ones], %[temp1] \n"
426442
"vaddax.z ACC, %[temp2], %[temp2] \n"
427443
"vmaddy.z %[result], %[ones], %[temp2] \n"
428444
: [result] "=&j"(result),
@@ -742,43 +758,89 @@ class mat_44 : public mat_x4_template<vec_4> {
742758

743759
void set_identity()
744760
{
745-
asm(" ### mat_44::set_identity ### \n"
746-
"vsub %[col0], %[col0], %[col0] \n"
747-
"vsub %[col1], %[col1], %[col1] \n"
748-
"vmr32 %[col2], vf00 \n"
749-
"vmove %[col3], vf00 \n"
750-
"vaddw.x %[col0], vf00, vf00 \n"
751-
"vaddw.y %[col1], vf00, vf00 \n"
752-
: [col0] "=j"(col0),
753-
[col1] "=j"(col1),
754-
[col2] "=j"(col2),
755-
[col3] "=j"(col3));
761+
asm __volatile__(
762+
"vaddw.x $vf1, $vf0, $vf0 \n"
763+
"vaddw.y $vf2, $vf0, $vf0 \n"
764+
"vmr32 $vf3, $vf0 \n"
765+
"vmove $vf4, $vf0 \n"
766+
"qmfc2 %[col0], $vf1 \n"
767+
"qmfc2 %[col1], $vf2 \n"
768+
"qmfc2 %[col2], $vf3 \n"
769+
"qmfc2 %[col3], $vf4 \n"
770+
: [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128), [col3] "=r"(col3.vec128)
771+
:
772+
: "memory");
756773
}
757774

758775
void
759776
set_scale(vec_3 scale)
760777
{
761-
set_identity();
762-
col0 = (vec_x)scale;
763-
col1 = (vec_y)scale;
764-
col2 = (vec_z)scale;
778+
set_zero();
779+
// Extract components and set diagonal elements using VU0
780+
float sx = (float)vec_x(scale);
781+
float sy = (float)vec_y(scale);
782+
float sz = (float)vec_z(scale);
783+
asm __volatile__(
784+
"ctc2 %[sx], $vi21 \n"
785+
"vaddi.x $vf1, $vf0, $I \n"
786+
"ctc2 %[sy], $vi21 \n"
787+
"vaddi.y $vf2, $vf0, $I \n"
788+
"ctc2 %[sz], $vi21 \n"
789+
"vaddi.z $vf3, $vf0, $I \n"
790+
"vmove $vf4, $vf0 \n"
791+
"qmfc2 %[col0], $vf1 \n"
792+
"qmfc2 %[col1], $vf2 \n"
793+
"qmfc2 %[col2], $vf3 \n"
794+
"qmfc2 %[col3], $vf4 \n"
795+
: [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128), [col3] "=r"(col3.vec128)
796+
: [sx] "r"(sx), [sy] "r"(sy), [sz] "r"(sz)
797+
: "memory");
765798
}
766799

767800
void
768801
set_scale(vec_4 scale)
769802
{
770803
set_identity();
771-
col0 = (vec_x)scale;
772-
col1 = (vec_y)scale;
773-
col2 = (vec_z)scale;
774-
col3 = (vec_w)scale;
804+
// Extract components and set diagonal elements using VU0
805+
float sx = (float)vec_x(scale);
806+
float sy = (float)vec_y(scale);
807+
float sz = (float)vec_z(scale);
808+
float sw = (float)vec_w(scale);
809+
asm __volatile__(
810+
"vsub $vf1, $vf0, $vf0 \n"
811+
"vsub $vf2, $vf0, $vf0 \n"
812+
"vsub $vf3, $vf0, $vf0 \n"
813+
"vsub $vf4, $vf0, $vf0 \n"
814+
"ctc2 %[sx], $vi21 \n"
815+
"vaddi.x $vf1, $vf0, $I \n"
816+
"ctc2 %[sy], $vi21 \n"
817+
"vaddi.y $vf2, $vf0, $I \n"
818+
"ctc2 %[sz], $vi21 \n"
819+
"vaddi.z $vf3, $vf0, $I \n"
820+
"ctc2 %[sw], $vi21 \n"
821+
"vmuli.w $vf4, $vf0, $I \n"
822+
"qmfc2 %[col0], $vf1 \n"
823+
"qmfc2 %[col1], $vf2 \n"
824+
"qmfc2 %[col2], $vf3 \n"
825+
"qmfc2 %[col3], $vf4 \n"
826+
: [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128), [col3] "=r"(col3.vec128)
827+
: [sx] "r"(sx), [sy] "r"(sy), [sz] "r"(sz), [sw] "r"(sw)
828+
: "memory");
775829
}
776830

777831
void
778832
set_translate(vec_3 xlate_amount)
779833
{
780834
set_identity();
781-
col3 = xlate_amount;
835+
// Set translation in col3, preserving w=1 from set_identity
836+
float tx = (float)vec_x(xlate_amount);
837+
float ty = (float)vec_y(xlate_amount);
838+
float tz = (float)vec_z(xlate_amount);
839+
col3.set_x(tx);
840+
col3.set_y(ty);
841+
col3.set_z(tz);
842+
// w should already be 1.0 from set_identity, but ensure it
843+
col3.set_w(1.0f);
782844
}
783845

784846
void
@@ -1377,7 +1439,7 @@ class transform_t {
13771439
" ### transform_t * vector_t ### \n"
13781440
"vmulax ACC, %[col0], %[vec] \n"
13791441
"vmadday ACC, %[col1], %[vec] \n"
1380-
"vmaddz %[result], %[col2, %[vec] \n"
1442+
"vmaddz %[result], %[col2], %[vec] \n"
13811443
: [result] "=&j"(result), "=r"(vu0_ACC)
13821444
: [vec] "j"(vec),
13831445
[col0] "j"(col0), [col1] "j"(col1), [col2] "j"(col2));
@@ -1761,7 +1823,7 @@ mat_33::inverse() const
17611823
"vaddx.y %[temp], vf00], %[inv2] # Do an in-place transpose, produces determinant(R)*Rinv \n"
17621824
"vadd.xz %[temp], vf00], %[inv1] \n"
17631825
"vaddy.x %[inv1], vf00], %[inv0] \n"
1764-
"vdiv Q, vf00w, %[determinantx] # Q = 1/determinant(R) \n"
1826+
"vdiv Q, vf00w, %[determinant]x # Q = 1/determinant(R) \n"
17651827
"vaddy.z %[inv1], vf00, %[inv2] \n"
17661828
"vaddz.x %[inv2], vf00, %[inv0] \n"
17671829
"vaddy.z %[inv0], vf00, %[temp] \n"
@@ -1844,10 +1906,10 @@ mat_33::mult_tilde(vec_3 vec) const
18441906
mat_33 result;
18451907
asm("### mat_33 mult_tilde vec_3 ### \n"
18461908
"vmulaz ACC, %[col1], %[vec] \n"
1847-
"vmsuby %[res0], %[col2, %[vec] \n"
1848-
"vmulax ACC, %[col2, %[vec] \n"
1909+
"vmsuby %[res0], %[col2], %[vec] \n"
1910+
"vmulax ACC, %[col2], %[vec] \n"
18491911
"vmsubz %[res1], %[col0], %[vec] \n"
1850-
"vmulay ACC, %[col0, %[vec] \n"
1912+
"vmulay ACC, %[col0], %[vec] \n"
18511913
"vmsubx %[res2], %[col1], %[vec] \n"
18521914
: [res0] "=&j"(result.col0), [res1] "=&j"(result.col1), [res2] "=&j"(result.col2), "=r"(vu0_ACC)
18531915
: [col0] "j"(col0), [col1] "j"(col1), [col2] "j"(col2), [vec] "j"(vec));
@@ -2648,7 +2710,7 @@ transform_t::inverse() const
26482710
[inv1] "=&j"(result.col1),
26492711
[inv2] "=&j"(result.col2),
26502712
[temp] "=&j"(temp),
2651-
[determinant] "=&j"(determinant), "=r"(vu0_ACC), "=j"(vu0_Q)
2713+
[determinant] "=&j"(determinant), "=r"(vu0_ACC)
26522714
: [col0] "j"(col0),
26532715
[col1] "j"(col1),
26542716
[col2] "j"(col2));
@@ -2669,7 +2731,7 @@ transform_t::inverse() const
26692731
[inv2] "+j"(result.col2),
26702732
[inv3] "=&j"(result.col3),
26712733
"=r"(vu0_ACC)
2672-
: [col3] "j"(col3), "j"(vu0_Q));
2734+
: [col3] "j"(col3));
26732735
return result;
26742736
}
26752737

0 commit comments

Comments
 (0)