@@ -51,24 +51,39 @@ class mat_33 : public mat_x3_template<vec_3> {
5151
5252 void set_identity ()
5353 {
54- asm (" ### mat_33::set_identity ### \n "
55- " vsub %[col0], %[col0], %[col0] \n "
56- " vsub %[col1], %[col1], %[col1] \n "
57- " vmr32 %[col2], vf00 \n "
58- " vaddw.x %[col0], vf00, vf00 \n "
59- " vaddw.y %[col1], vf00, vf00 \n "
60- : [col0] " =j" (col0),
61- [col1] " =j" (col1),
62- [col2] " =j" (col2));
54+ asm __volatile__ (
55+ " vaddw.x $vf1, $vf0, $vf0 \n "
56+ " vaddw.y $vf2, $vf0, $vf0 \n "
57+ " vmr32 $vf3, $vf0 \n "
58+ " qmfc2 %[col0], $vf1 \n "
59+ " qmfc2 %[col1], $vf2 \n "
60+ " qmfc2 %[col2], $vf3 \n "
61+ : [col0] " =r" (col0.vec128 ), [col1] " =r" (col1.vec128 ), [col2] " =r" (col2.vec128 )
62+ :
63+ : " memory" );
6364 }
6465
6566 void
6667 set_scale (vec_3 scale)
6768 {
6869 set_zero ();
69- col0 = (vec_x)scale;
70- col1 = (vec_y)scale;
71- col2 = (vec_z)scale;
70+ // Extract components and set diagonal elements using VU0
71+ float sx = (float )vec_x (scale);
72+ float sy = (float )vec_y (scale);
73+ float sz = (float )vec_z (scale);
74+ asm __volatile__ (
75+ " ctc2 %[sx], $vi21 \n "
76+ " vaddi.x $vf1, $vf0, $I \n "
77+ " ctc2 %[sy], $vi21 \n "
78+ " vaddi.y $vf2, $vf0, $I \n "
79+ " ctc2 %[sz], $vi21 \n "
80+ " vaddi.z $vf3, $vf0, $I \n "
81+ " qmfc2 %[col0], $vf1 \n "
82+ " qmfc2 %[col1], $vf2 \n "
83+ " qmfc2 %[col2], $vf3 \n "
84+ : [col0] " =r" (col0.vec128 ), [col1] " =r" (col1.vec128 ), [col2] " =r" (col2.vec128 )
85+ : [sx] " r" (sx), [sy] " r" (sy), [sz] " r" (sz)
86+ : " memory" );
7287 }
7388
7489 void
@@ -219,25 +234,26 @@ class mat_33 : public mat_x3_template<vec_3> {
219234 vec128_t result, temp0, temp1, temp2, ones;
220235
221236 asm (" ### mat_33 trans_mult vec_3 ### \n "
222- " vmul %[temp0], col0], %[vec] \n "
223- " vmaxw %[ones], vf00, vf00 \n "
224- " vmul %[temp1], %[col1], %[vec] \n "
225- " vmul %[temp2], %[col2], %[vec] \n "
226- " vadday.x ACC, %[temp0], %[temp0] \n "
227- " vmaddz.x %[result], %[ones], %[temp0] \n "
228- " vaddax.y ACC, %[temp1], %[temp1] \n "
229- " vmaddz.y %[result], %[ones], %[temp1] \n "
230- " vaddax.z ACC, %[temp2], %[temp2] \n "
231- " vmaddy.z %[result], %[ones], %[temp2] \n "
232- : [result] " =&j" (result),
233- [temp0] " =&j" (temp0),
234- [temp1] " =&j" (temp1),
235- [temp2] " =&j" (temp2),
236- [ones] " =&j" (ones), " =r" (vu0_ACC)
237- : " j col0" (col0),
238- " j col1" (col1),
239- " j col2" (col2),
240- " j vec" (vec));
237+ " qmtc2 %[col0], $vf10 \n "
238+ " qmtc2 %[col1], $vf11 \n "
239+ " qmtc2 %[col2], $vf12 \n "
240+ " qmtc2 %[vec], $vf13 \n "
241+ " vmul $vf14, $vf10, $vf13 \n "
242+ " vmaxw $vf15, $vf0, $vf0 \n "
243+ " vmul $vf16, $vf11, $vf13 \n "
244+ " vmul $vf17, $vf12, $vf13 \n "
245+ " vadday.x $ACC, $vf14, $vf14 \n "
246+ " vmaddz.x $vf14, $vf15, $vf14 \n "
247+ " vaddax.y $ACC, $vf16, $vf16 \n "
248+ " vmaddz.y $vf14, $vf15, $vf16 \n "
249+ " vaddax.z $ACC, $vf17, $vf17 \n "
250+ " vmaddy.z $vf14, $vf15, $vf17 \n "
251+ " qmfc2 %[result], $vf14 \n "
252+ : [result] " =&r" (result), " =r" (vu0_ACC)
253+ : [col0] " r" (col0.vec128 ),
254+ [col1] " r" (col1.vec128 ),
255+ [col2] " r" (col2.vec128 ),
256+ [vec] " r" (vec.vec128 ));
241257
242258 return vec_3 (result);
243259 }
@@ -422,7 +438,7 @@ class mat_43 : public mat_x3_template<vec_4> {
422438 " vadday.x ACC, %[temp0], %[temp0] \n "
423439 " vmaddz.x %[result], %[ones], %[temp0] \n "
424440 " vaddax.y ACC, %[temp1], %[temp1] \n "
425- " vmaddz.y %[result, %[ones], %[temp1] \n "
441+ " vmaddz.y %[result] , %[ones], %[temp1] \n "
426442 " vaddax.z ACC, %[temp2], %[temp2] \n "
427443 " vmaddy.z %[result], %[ones], %[temp2] \n "
428444 : [result] " =&j" (result),
@@ -742,43 +758,89 @@ class mat_44 : public mat_x4_template<vec_4> {
742758
743759 void set_identity ()
744760 {
745- asm (" ### mat_44::set_identity ### \n "
746- " vsub %[col0], %[col0], %[col0] \n "
747- " vsub %[col1], %[col1], %[col1] \n "
748- " vmr32 %[col2], vf00 \n "
749- " vmove %[col3], vf00 \n "
750- " vaddw.x %[col0], vf00, vf00 \n "
751- " vaddw.y %[col1], vf00, vf00 \n "
752- : [col0] " =j" (col0),
753- [col1] " =j" (col1),
754- [col2] " =j" (col2),
755- [col3] " =j" (col3));
761+ asm __volatile__ (
762+ " vaddw.x $vf1, $vf0, $vf0 \n "
763+ " vaddw.y $vf2, $vf0, $vf0 \n "
764+ " vmr32 $vf3, $vf0 \n "
765+ " vmove $vf4, $vf0 \n "
766+ " qmfc2 %[col0], $vf1 \n "
767+ " qmfc2 %[col1], $vf2 \n "
768+ " qmfc2 %[col2], $vf3 \n "
769+ " qmfc2 %[col3], $vf4 \n "
770+ : [col0] " =r" (col0.vec128 ), [col1] " =r" (col1.vec128 ), [col2] " =r" (col2.vec128 ), [col3] " =r" (col3.vec128 )
771+ :
772+ : " memory" );
756773 }
757774
758775 void
759776 set_scale (vec_3 scale)
760777 {
761- set_identity ();
762- col0 = (vec_x)scale;
763- col1 = (vec_y)scale;
764- col2 = (vec_z)scale;
778+ set_zero ();
779+ // Extract components and set diagonal elements using VU0
780+ float sx = (float )vec_x (scale);
781+ float sy = (float )vec_y (scale);
782+ float sz = (float )vec_z (scale);
783+ asm __volatile__ (
784+ " ctc2 %[sx], $vi21 \n "
785+ " vaddi.x $vf1, $vf0, $I \n "
786+ " ctc2 %[sy], $vi21 \n "
787+ " vaddi.y $vf2, $vf0, $I \n "
788+ " ctc2 %[sz], $vi21 \n "
789+ " vaddi.z $vf3, $vf0, $I \n "
790+ " vmove $vf4, $vf0 \n "
791+ " qmfc2 %[col0], $vf1 \n "
792+ " qmfc2 %[col1], $vf2 \n "
793+ " qmfc2 %[col2], $vf3 \n "
794+ " qmfc2 %[col3], $vf4 \n "
795+ : [col0] " =r" (col0.vec128 ), [col1] " =r" (col1.vec128 ), [col2] " =r" (col2.vec128 ), [col3] " =r" (col3.vec128 )
796+ : [sx] " r" (sx), [sy] " r" (sy), [sz] " r" (sz)
797+ : " memory" );
765798 }
766799
767800 void
768801 set_scale (vec_4 scale)
769802 {
770803 set_identity ();
771- col0 = (vec_x)scale;
772- col1 = (vec_y)scale;
773- col2 = (vec_z)scale;
774- col3 = (vec_w)scale;
804+ // Extract components and set diagonal elements using VU0
805+ float sx = (float )vec_x (scale);
806+ float sy = (float )vec_y (scale);
807+ float sz = (float )vec_z (scale);
808+ float sw = (float )vec_w (scale);
809+ asm __volatile__ (
810+ " vsub $vf1, $vf0, $vf0 \n "
811+ " vsub $vf2, $vf0, $vf0 \n "
812+ " vsub $vf3, $vf0, $vf0 \n "
813+ " vsub $vf4, $vf0, $vf0 \n "
814+ " ctc2 %[sx], $vi21 \n "
815+ " vaddi.x $vf1, $vf0, $I \n "
816+ " ctc2 %[sy], $vi21 \n "
817+ " vaddi.y $vf2, $vf0, $I \n "
818+ " ctc2 %[sz], $vi21 \n "
819+ " vaddi.z $vf3, $vf0, $I \n "
820+ " ctc2 %[sw], $vi21 \n "
821+ " vmuli.w $vf4, $vf0, $I \n "
822+ " qmfc2 %[col0], $vf1 \n "
823+ " qmfc2 %[col1], $vf2 \n "
824+ " qmfc2 %[col2], $vf3 \n "
825+ " qmfc2 %[col3], $vf4 \n "
826+ : [col0] " =r" (col0.vec128 ), [col1] " =r" (col1.vec128 ), [col2] " =r" (col2.vec128 ), [col3] " =r" (col3.vec128 )
827+ : [sx] " r" (sx), [sy] " r" (sy), [sz] " r" (sz), [sw] " r" (sw)
828+ : " memory" );
775829 }
776830
777831 void
778832 set_translate (vec_3 xlate_amount)
779833 {
780834 set_identity ();
781- col3 = xlate_amount;
835+ // Set translation in col3, preserving w=1 from set_identity
836+ float tx = (float )vec_x (xlate_amount);
837+ float ty = (float )vec_y (xlate_amount);
838+ float tz = (float )vec_z (xlate_amount);
839+ col3.set_x (tx);
840+ col3.set_y (ty);
841+ col3.set_z (tz);
842+ // w should already be 1.0 from set_identity, but ensure it
843+ col3.set_w (1 .0f );
782844 }
783845
784846 void
@@ -1377,7 +1439,7 @@ class transform_t {
13771439 " ### transform_t * vector_t ### \n "
13781440 " vmulax ACC, %[col0], %[vec] \n "
13791441 " vmadday ACC, %[col1], %[vec] \n "
1380- " vmaddz %[result], %[col2, %[vec] \n "
1442+ " vmaddz %[result], %[col2] , %[vec] \n "
13811443 : [result] " =&j" (result), " =r" (vu0_ACC)
13821444 : [vec] " j" (vec),
13831445 [col0] " j" (col0), [col1] " j" (col1), [col2] " j" (col2));
@@ -1761,7 +1823,7 @@ mat_33::inverse() const
17611823 " vaddx.y %[temp], vf00], %[inv2] # Do an in-place transpose, produces determinant(R)*Rinv \n "
17621824 " vadd.xz %[temp], vf00], %[inv1] \n "
17631825 " vaddy.x %[inv1], vf00], %[inv0] \n "
1764- " vdiv Q, vf00w, %[determinantx] # Q = 1/determinant(R) \n "
1826+ " vdiv Q, vf00w, %[determinant]x # Q = 1/determinant(R) \n "
17651827 " vaddy.z %[inv1], vf00, %[inv2] \n "
17661828 " vaddz.x %[inv2], vf00, %[inv0] \n "
17671829 " vaddy.z %[inv0], vf00, %[temp] \n "
@@ -1844,10 +1906,10 @@ mat_33::mult_tilde(vec_3 vec) const
18441906 mat_33 result;
18451907 asm (" ### mat_33 mult_tilde vec_3 ### \n "
18461908 " vmulaz ACC, %[col1], %[vec] \n "
1847- " vmsuby %[res0], %[col2, %[vec] \n "
1848- " vmulax ACC, %[col2, %[vec] \n "
1909+ " vmsuby %[res0], %[col2] , %[vec] \n "
1910+ " vmulax ACC, %[col2] , %[vec] \n "
18491911 " vmsubz %[res1], %[col0], %[vec] \n "
1850- " vmulay ACC, %[col0, %[vec] \n "
1912+ " vmulay ACC, %[col0] , %[vec] \n "
18511913 " vmsubx %[res2], %[col1], %[vec] \n "
18521914 : [res0] " =&j" (result.col0 ), [res1] " =&j" (result.col1 ), [res2] " =&j" (result.col2 ), " =r" (vu0_ACC)
18531915 : [col0] " j" (col0), [col1] " j" (col1), [col2] " j" (col2), [vec] " j" (vec));
@@ -2648,7 +2710,7 @@ transform_t::inverse() const
26482710 [inv1] " =&j" (result.col1 ),
26492711 [inv2] " =&j" (result.col2 ),
26502712 [temp] " =&j" (temp),
2651- [determinant] " =&j" (determinant), " =r" (vu0_ACC), " =j " (vu0_Q)
2713+ [determinant] " =&j" (determinant), " =r" (vu0_ACC)
26522714 : [col0] " j" (col0),
26532715 [col1] " j" (col1),
26542716 [col2] " j" (col2));
@@ -2669,7 +2731,7 @@ transform_t::inverse() const
26692731 [inv2] " +j" (result.col2 ),
26702732 [inv3] " =&j" (result.col3 ),
26712733 " =r" (vu0_ACC)
2672- : [col3] " j" (col3), " j " (vu0_Q) );
2734+ : [col3] " j" (col3));
26732735 return result;
26742736}
26752737
0 commit comments