Merge pull request #10750 from libretro/vitagl

VitaGL and Math-neon update
2025-03-29 22:20:21 +00:00 · 2020-05-31 17:26:43 +02:00 · 2020-05-31 17:26:43 +02:00 · a10abdd929
commit a10abdd929
parent ec336442bd ae9bdba06f
10 changed files with 844 additions and 131 deletions
--- a/deps/math-neon/README
+++ b/deps/math-neon/README
@ -36,134 +36,133 @@ Email: 	lachlan.ts@gmail.com

 PSVITA performances test results:

-RUNFAST: Enabled 
+RUNFAST: Disabled 
 ------------------------------------------------------------------------------------------------------
 MATRIX FUNCTION TESTS 
 ------------------------------------------------------------------------------------------------------
 matmul2_c = 
-			|-7.16, 9.42|
-			|17.86, -10.70|
+			|-14.56, 5.96|
+			|-15.35, 10.50|
 matmul2_neon = 
-			|-7.16, 9.42|
-			|17.86, -10.70|
-matmul2: c=183985 	 neon=87480 	 rate=2.10 
-matvec2_c = |-7.16, 17.86|
-matvec2_neon = |-7.16, 17.86|
-matvec2: c=98178 	 neon=66040 	 rate=1.49 
+			|-14.56, 5.96|
+			|-15.35, 10.50|
+matmul2: c=174924 	 neon=64490 	 rate=2.71 
+matvec2_c = |-14.56, -15.35|
+matvec2_neon = |-14.56, -15.35|
+matvec2: c=88957 	 neon=58337 	 rate=1.52 
 matmul3_c =
-			|11.14, -0.78, -3.98|
-			|16.56, 17.96, 23.58|
-			|8.73, -0.18, 1.57|
+			|-21.39, -4.68, -1.74|
+			|-8.66, -8.97, 1.83|
+			|15.88, 0.30, -2.23|
 matmul3_neon =
-			|11.14, -0.78, -3.98|
-			|16.56, 17.96, 23.58|
-			|8.73, -0.18, 1.57|
-matmul3: c=551838 	 neon=340292 	 rate=1.62 
-matvec3_c = |11.14, 16.56, 8.73|
-matvec3_neon = |11.14, 16.56, 8.73|
-matvec3: c=98178 	 neon=66040 	 rate=1.49 
+			|-21.39, -4.68, -1.74|
+			|-8.66, -8.97, 1.83|
+			|15.88, 0.30, -2.23|
+matmul3: c=552486 	 neon=297268 	 rate=1.86 
+matvec3_c = |-21.39, -8.66, 15.88|
+matvec3_neon = |-21.39, -8.66, 15.88|
+matvec3: c=184104 	 neon=128780 	 rate=1.43 
 matmul4_c =
-			|17.91, -23.96, 1.86, 16.53|
-			|4.10, -18.16, 4.17, 29.06|
-			|6.92, -1.60, 3.12, 27.81|
-			|-15.13, -7.46, -17.91, 22.49|
+			|-13.65, -1.80, -12.92, 6.56|
+			|-10.21, 9.47, 2.73, 14.79|
+			|0.97, 11.69, -0.64, -12.87|
+			|20.06, 6.77, 35.61, -0.02|
 matmul4_neon =
-			|17.91, -23.96, 1.86, 16.53|
-			|4.10, -18.16, 4.17, 29.06|
-			|6.92, -1.60, 3.12, 27.81|
-			|-15.13, -7.46, -17.91, 22.49|
-matmul4: c=1316131 	 neon=315444 	 rate=4.17 
-matvec4_c = |17.91, 4.10, 6.92, -15.126419|
-matvec4_neon = |17.91, 4.10, 6.92, -15.126419|
-matvec4: c=98178 	 neon=66040 	 rate=1.49 
+			|-13.65, -1.80, -12.92, 6.56|
+			|-10.21, 9.47, 2.73, 14.79|
+			|0.97, 11.69, -0.64, -12.87|
+			|20.06, 6.77, 35.61, -0.02|
+matmul4: c=1315568 	 neon=254227 	 rate=5.17 
+matvec4_c = |-13.65, -10.21, 0.97, 20.058556|
+matvec4_neon = |-13.65, -10.21, 0.97, 20.058556|
+matvec4: c=331712 	 neon=147196 	 rate=2.25 

-dot2_c = 5.804099
-dot2_neon = 5.804099
-dot2: c=291526 	 neon=307025 	 rate=0.95 
-normalize2_c = [0.97, 0.24]
-normalize2_neon = [0.97, 0.24]
-normalize2: c=1058588 	 neon=965696 	 rate=1.10 
+dot2_c = -10.903330
+dot2_neon = -10.903330
+dot2: c=230295 	 neon=168799 	 rate=1.36 
+normalize2_c = [-0.74, 0.67]
+normalize2_neon = [-0.74, 0.67]
+normalize2: c=950716 	 neon=965780 	 rate=0.98 

-dot3_c = -0.817487
-dot3_neon = -0.817487
-dot3: c=322094 	 neon=444834 	 rate=0.72 
-normalize3_c = [0.50, 0.12, -0.86]
-normalize3_neon = [0.50, 0.12, -0.86]
-normalize3: c=1257201 	 neon=1134375 	 rate=1.11 
-cross3_c = [-13.16, -17.29, -10.19]
-cross3_neon = [-13.16, -17.29, -10.19]
-cross3: c=705298 	 neon=766477 	 rate=0.92 
+dot3_c = -4.226746
+dot3_neon = -4.226746
+dot3: c=306957 	 neon=337316 	 rate=0.91 
+normalize3_c = [-0.69, 0.62, -0.38]
+normalize3_neon = [-0.69, 0.62, -0.38]
+normalize3: c=1180950 	 neon=1134557 	 rate=1.04 
+cross3_c = [-9.67, -19.39, -14.24]
+cross3_neon = [-9.67, -19.39, -14.24]
+cross3: c=659558 	 neon=766896 	 rate=0.86 

-dot4_c = -7.880241
-dot4_neon = -7.880241
-dot4: c=414431 	 neon=506460 	 rate=0.82 
-normalize4_c = [0.45, 0.11, -0.77, -0.44]
-normalize4_neon = [0.45, 0.11, -0.77, -0.44]
-normalize4: c=1410727 	 neon=1102802 	 rate=1.28 
+dot4_c = 2.782796
+dot4_neon = 2.782796
+dot4: c=414233 	 neon=276068 	 rate=1.50 
+normalize4_c = [-0.59, 0.53, -0.32, -0.52]
+normalize4_neon = [-0.59, 0.53, -0.32, -0.52]
+normalize4: c=1364294 	 neon=1103327 	 rate=1.24 

 ------------------------------------------------------------------------------------------------------
 CMATH FUNCTION TESTS 
 ------------------------------------------------------------------------------------------------------
 Function	Range		Number	ABS Max Error	REL Max Error	RMS Error	Time	Rate
 ------------------------------------------------------------------------------------------------------
-sinf       	[-3.14, 3.14]	500000	0.00e+00	0.00e+00%	0.00e+00	647042739	x1.00	
-sinf_c     	[-3.14, 3.14]	500000	7.75e-07	1.00e+02%	4.09e-07	646276691	x1.00	
-sinf_neon  	[-3.14, 3.14]	500000	1.00e+00	1.00e+02%	7.07e-01	645546381	x1.00	
-cosf       	[-3.14, 3.14]	500000	0.00e+00	0.00e+00%	0.00e+00	644742077	x1.00	
-cosf_c     	[-3.14, 3.14]	500000	7.75e-07	6.74e-01%	4.15e-07	643957358	x1.00	
-cosf_neon  	[-3.14, 3.14]	500000	1.00e+00	1.00e+02%	7.06e-01	643211256	x1.00	
-tanf       	[-0.79, 0.79]	500000	0.00e+00	0.00e+00%	0.00e+00	642444112	x1.00	
-tanf_c     	[-0.79, 0.79]	500000	2.98e-06	7.94e-04%	1.31e-06	641628507	x1.00	
-tanf_neon  	[-0.79, 0.79]	500000	1.00e+00	1.00e+02%	nan	640740514	x1.00	
-asinf      	[-1.00, 1.00]	500000	0.00e+00	0.00e+00%	0.00e+00	639560380	x1.00	
-asinf_c    	[-1.00, 1.00]	500000	5.54e-05	1.06e-02%	nan	638453383	x1.00	
-asinf_neon 	[-1.00, 1.00]	500000	1.57e+00	1.00e+02%	6.84e-01	637349653	x1.00	
-acosf      	[-1.00, 1.00]	500000	0.00e+00	0.00e+00%	0.00e+00	636078992	x1.00	
-acosf_c    	[-1.00, 1.00]	500000	5.56e-05	6.46e-03%	nan	634934201	x1.00	
-acosf_neon 	[-1.00, 1.00]	500000	1.57e+00	1.02e+05%	6.84e-01	633793585	x1.00	
-atanf      	[-1.00, 1.00]	500000	0.00e+00	0.00e+00%	0.00e+00	632835241	x1.00	
-atanf_c    	[-1.00, 1.00]	500000	1.67e-04	2.12e-02%	7.40e-05	632142823	x1.00	
-atanf_neon 	[-1.00, 1.00]	500000	7.85e-01	0.00e+00%	nan	631387330	x1.00	
-sinhf       	[-3.14, 3.14]	500000	0.00e+00	0.00e+00%	0.00e+00	630142014	x1.00	
-sinhf_c     	[-3.14, 3.14]	500000	1.91e-06	1.52e-01%	1.85e-07	628992714	x1.00	
-sinhf_neon  	[-3.14, 3.14]	500000	1.15e+01	1.00e+02%	4.55e+00	627998454	x1.00	
-coshf       	[-3.14, 3.14]	500000	0.00e+00	0.00e+00%	0.00e+00	626869866	x1.00	
-coshf_c     	[-3.14, 3.14]	500000	9.54e-07	2.38e-05%	1.64e-07	625829657	x1.00	
-coshf_neon  	[-3.14, 3.14]	500000	1.06e+01	9.14e+01%	3.92e+00	624873969	x1.00	
-tanhf       	[-3.14, 3.14]	500000	0.00e+00	0.00e+00%	0.00e+00	623689093	x1.00	
-tanhf_c     	[-3.14, 3.14]	500000	1.20e-05	2.48e-01%	5.48e-06	622547097	x1.00	
-tanhf_neon  	[-3.14, 3.14]	500000	9.96e-01	1.00e+02%	8.26e-01	621506812	x1.00	
-expf       	[0.00, 10.00]	500000	0.00e+00	0.00e+00%	0.00e+00	620497304	x1.00	
-expf_c     	[0.00, 10.00]	500000	9.77e-03	6.15e-05%	1.64e-03	619569554	x1.00	
-expf_neon  	[0.00, 10.00]	500000	2.20e+04	1.00e+02%	4.92e+03	618761400	x1.00	
-logf       	[1.00, 1000.00]	500000	0.00e+00	0.00e+00%	0.00e+00	617882765	x1.00	
-logf_c     	[1.00, 1000.00]	500000	6.20e-06	1.62e-02%	9.83e-07	617087810	x1.00	
-logf_neon  	[1.00, 1000.00]	500000	9.49e+01	inf%	9.39e+01	616388420	x1.00	
-log10f       	[1.00, 1000.00]	500000	0.00e+00	0.00e+00%	0.00e+00	615405364	x1.00	
-log10f_c     	[1.00, 1000.00]	500000	2.86e-06	6.68e-03%	4.79e-07	614442585	x1.00	
-log10f_neon  	[1.00, 1000.00]	500000	4.12e+01	inf%	4.07e+01	613671782	x1.00	
-floorf     	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	611113689	x1.00	
-floorf_c   	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	608159325	x1.00	
-floorf_neon	[1.00, 1000.00]	5000000	2.00e+00	2.00e+02%	1.42e-02	604769008	x1.01	
-ceilf     	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	601342443	x1.00	
-ceilf_c   	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	598387998	x1.00	
-ceilf_neon	[1.00, 1000.00]	5000000	2.00e+00	1.00e+02%	1.02e+00	594959710	x1.01	
-fabsf     	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	592068236	x1.00	
-fabsf_c   	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	589808748	x1.00	
-fabsf_neon	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	587712180	x1.01	
-sqrtf      	[1.00, 1000.00]	500000	0.00e+00	0.00e+00%	0.00e+00	586496654	x1.00	
-sqrtf_c    	[1.00, 1000.00]	500000	2.33e-04	1.06e-03%	8.69e-05	585470866	x1.00	
-sqrtf_neon 	[1.00, 1000.00]	500000	0.00e+00	0.00e+00%	nan	584594551	x1.00	
-invsqrtf      	[1.00, 1000.00]	500000	0.00e+00	0.00e+00%	0.00e+00	583492213	x1.00	
-invsqrtf_c    	[1.00, 1000.00]	500000	4.35e-06	4.78e-04%	2.00e-07	582448164	x1.00	
-invsqrtf_neon 	[1.00, 1000.00]	500000	0.00e+00	0.00e+00%	nan	581642365	x1.00	
-atan2f       	[0.10, 10.00]	10000	0.00e+00	0.00e+00%	0.00e+00	83594269	x1.00	
-atan2f_c     	[0.10, 10.00]	10000	1.73e-04	2.23e-02%	0.00e+00	85383651	x0.98	
-atan2f_neon  	[0.10, 10.00]	10000	0.00e+00	0.00e+00%	0.00e+00	87387055	x0.96	
-powf       	[1.00, 10.00]	10000	0.00e+00	0.00e+00%	0.00e+00	93430489	x1.00	
-powf_c     	[1.00, 10.00]	10000	1.08e+05	4.37e-03%	0.00e+00	96726976	x0.97	
-powf_neon  	[1.00, 10.00]	10000	9.97e+09	1.00e+02%	0.00e+00	100185753	x0.93	
-fmodf       	[1.00, 10.00]	10000	0.00e+00	0.00e+00%	0.00e+00	101653673	x1.00	
-fmodf_c     	[1.00, 10.00]	10000	9.90e+00	8.06e-02%	0.00e+00	103177551	x0.99	
-fmodf_neon  	[1.00, 10.00]	10000	9.99e+00	1.00e+02%	0.00e+00	104771240	x0.97	
-
+sinf       	[-3.14, 3.14]	500000	0.00e+00	0.00e+00%	0.00e+00	1394459996	x1.00	
+sinf_c     	[-3.14, 3.14]	500000	7.75e-07	1.00e+02%	4.09e-07	1395128226	x1.00	
+sinf_neon  	[-3.14, 3.14]	500000	8.34e-07	1.00e+02%	4.09e-07	1395853554	x1.00	
+cosf       	[-3.14, 3.14]	500000	0.00e+00	0.00e+00%	0.00e+00	1396644271	x1.00	
+cosf_c     	[-3.14, 3.14]	500000	7.75e-07	6.74e-01%	4.15e-07	1397360321	x1.00	
+cosf_neon  	[-3.14, 3.14]	500000	8.34e-07	6.74e-01%	4.16e-07	1398126872	x1.00	
+tanf       	[-0.79, 0.79]	500000	0.00e+00	0.00e+00%	0.00e+00	1398889596	x1.00	
+tanf_c     	[-0.79, 0.79]	500000	2.98e-06	7.94e-04%	1.31e-06	1399704712	x1.00	
+tanf_neon  	[-0.79, 0.79]	500000	1.91e-06	3.62e-04%	6.66e-07	1400612899	x1.00	
+asinf      	[-1.00, 1.00]	500000	0.00e+00	0.00e+00%	0.00e+00	1401838993	x1.00	
+asinf_c    	[-1.00, 1.00]	500000	5.54e-05	1.06e-02%	nan	1402745512	x1.00	
+asinf_neon 	[-1.00, 1.00]	500000	4.66e-05	8.90e-03%	nan	1403967661	x1.00	
+acosf      	[-1.00, 1.00]	500000	0.00e+00	0.00e+00%	0.00e+00	1405317842	x1.00	
+acosf_c    	[-1.00, 1.00]	500000	5.56e-05	6.46e-03%	nan	1406294753	x1.00	
+acosf_neon 	[-1.00, 1.00]	500000	4.67e-05	6.35e-03%	nan	1407598039	x1.00	
+atanf      	[-1.00, 1.00]	500000	0.00e+00	0.00e+00%	0.00e+00	1408314869	x1.00	
+atanf_c    	[-1.00, 1.00]	500000	1.67e-04	2.12e-02%	7.40e-05	1408872421	x1.00	
+atanf_neon 	[-1.00, 1.00]	500000	1.67e-04	2.12e-02%	7.40e-05	1409736652	x1.00	
+sinhf       	[-3.14, 3.14]	500000	0.00e+00	0.00e+00%	0.00e+00	1411101066	x1.00	
+sinhf_c     	[-3.14, 3.14]	500000	1.91e-06	1.52e-01%	1.85e-07	1412173492	x1.00	
+sinhf_neon  	[-3.14, 3.14]	500000	1.91e-06	1.52e-01%	1.90e-07	1413205410	x1.00	
+coshf       	[-3.14, 3.14]	500000	0.00e+00	0.00e+00%	0.00e+00	1414417802	x1.00	
+coshf_c     	[-3.14, 3.14]	500000	9.54e-07	2.38e-05%	1.64e-07	1415426083	x1.00	
+coshf_neon  	[-3.14, 3.14]	500000	1.91e-06	2.22e-05%	1.68e-07	1416412636	x1.00	
+tanhf       	[-3.14, 3.14]	500000	0.00e+00	0.00e+00%	0.00e+00	1417684273	x1.00	
+tanhf_c     	[-3.14, 3.14]	500000	1.20e-05	2.48e-01%	5.48e-06	1418659628	x1.00	
+tanhf_neon  	[-3.14, 3.14]	500000	2.38e-07	2.47e-01%	5.40e-08	1419650721	x1.00	
+expf       	[0.00, 10.00]	500000	0.00e+00	0.00e+00%	0.00e+00	1420706074	x1.00	
+expf_c     	[0.00, 10.00]	500000	9.77e-03	6.15e-05%	1.64e-03	1421444150	x1.00	
+expf_neon  	[0.00, 10.00]	500000	9.77e-03	6.58e-05%	1.64e-03	1422203499	x1.00	
+logf       	[1.00, 1000.00]	500000	0.00e+00	0.00e+00%	0.00e+00	1423106698	x1.00	
+logf_c     	[1.00, 1000.00]	500000	6.20e-06	1.62e-02%	9.83e-07	1423735174	x1.00	
+logf_neon  	[1.00, 1000.00]	500000	7.63e-06	1.03e-02%	1.07e-06	1424434406	x1.00	
+log10f       	[1.00, 1000.00]	500000	0.00e+00	0.00e+00%	0.00e+00	1425516892	x1.00	
+log10f_c     	[1.00, 1000.00]	500000	2.86e-06	6.68e-03%	4.79e-07	1426200368	x1.00	
+log10f_neon  	[1.00, 1000.00]	500000	3.34e-06	6.68e-03%	4.84e-07	1426966844	x1.00	
+floorf     	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	1429081993	x1.00	
+floorf_c   	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	1430839273	x1.00	
+floorf_neon	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	1433474766	x1.00	
+ceilf     	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	1435602956	x1.00	
+ceilf_c   	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	1437403711	x1.00	
+ceilf_neon	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	1440044970	x1.00	
+fabsf     	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	1441265630	x1.00	
+fabsf_c   	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	1442491716	x1.00	
+fabsf_neon	[1.00, 1000.00]	5000000	0.00e+00	0.00e+00%	0.00e+00	1443680744	x1.00	
+sqrtf      	[1.00, 1000.00]	500000	0.00e+00	0.00e+00%	0.00e+00	1444844144	x1.00	
+sqrtf_c    	[1.00, 1000.00]	500000	2.33e-04	1.06e-03%	8.69e-05	1445710342	x1.00	
+sqrtf_neon 	[1.00, 1000.00]	500000	7.63e-06	2.91e-05%	1.60e-06	1446544637	x1.00	
+invsqrtf      	[1.00, 1000.00]	500000	0.00e+00	0.00e+00%	0.00e+00	1446995307	x1.00	
+invsqrtf_c    	[1.00, 1000.00]	500000	4.35e-06	4.78e-04%	2.00e-07	1447471977	x1.00	
+invsqrtf_neon 	[1.00, 1000.00]	500000	1.19e-07	2.12e-05%	4.81e-09	1447987675	x1.00	
+atan2f       	[0.10, 10.00]	10000	0.00e+00	0.00e+00%	0.00e+00	1449713108	x1.00	
+atan2f_c     	[0.10, 10.00]	10000	1.73e-04	2.23e-02%	0.00e+00	1451276575	x1.00	
+atan2f_neon  	[0.10, 10.00]	10000	1.67e-04	2.12e-02%	0.00e+00	1453093260	x1.00	
+powf       	[1.00, 10.00]	10000	0.00e+00	0.00e+00%	0.00e+00	1458606663	x1.00	
+powf_c     	[1.00, 10.00]	10000	1.08e+05	4.37e-03%	0.00e+00	1461584933	x1.00	
+powf_neon  	[1.00, 10.00]	10000	1.36e+05	5.88e-03%	0.00e+00	1464702743	x1.00	
+fmodf       	[1.00, 10.00]	10000	0.00e+00	0.00e+00%	0.00e+00	1466022029	x1.00	
+fmodf_c     	[1.00, 10.00]	10000	9.90e+00	8.06e-02%	0.00e+00	1467403015	x1.00	
+fmodf_neon  	[1.00, 10.00]	10000	9.97e+00	8.06e-02%	0.00e+00	1468767755	x1.00	
--- a/deps/math-neon/math_debug.c
+++ b/deps/math-neon/math_debug.c
@ -0,0 +1,689 @@
+/*
+Math-NEON:  Neon Optimised Math Library based on cmath
+Contact:    lachlan.ts@gmail.com
+Copyright (C) 2009  Lachlan Tychsen - Smith aka Adventus
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+
+#include <math_neon.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#ifdef WIN32
+#include <time.h>
+#else
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
+#define randf()	(rand() / (RAND_MAX + 1.0f))
+
+/*
+ * Appends a printf-style formatted message to the benchmark log file
+ * "ux0:/data/mathneon.log".
+ *
+ * format: printf-style format string; the remaining arguments supply its values.
+ *
+ * The message is formatted into a fixed 512-byte buffer. Output longer than
+ * the buffer is truncated instead of overflowing it. If formatting fails or
+ * the log file cannot be opened, the message is dropped silently.
+ */
+void LOG(const char *format, ...) {
+	char msg[512];
+	va_list arg;
+	int len;
+
+	va_start(arg, format);
+	// Bounded formatting: never writes past msg and always NUL-terminates.
+	len = vsnprintf(msg, sizeof(msg), format, arg);
+	va_end(arg);
+	if (len < 0)
+		return;	// encoding error: msg contents are not usable
+
+	FILE *log = fopen("ux0:/data/mathneon.log", "a+");
+	if (log != NULL) {
+		fwrite(msg, 1, strlen(msg), log);
+		fclose(log);
+	}
+}
+
+// Descriptor and result record for one single-argument math function test.
+// The first six fields are set by the test1[] table; the error and timing
+// fields are filled in by test_mathfunc1().
+struct	test1_s {
+	const char*	name;				//label printed in the results table
+	float 		(*func)(float);	//the function
+	float 		(*bench)(float);	//the function to benchmark against.
+	float 		rng0, rng1;			//input range swept: from rng0 up to (not including) rng1
+	int			num;				//number of steps the range is divided into
+	float 		emaxabs;			//largest absolute difference |func(x) - bench(x)| seen
+	float 		xmaxabs;			//x at which emaxabs was seen
+	float 		emaxrel;			//largest relative difference seen, in percent of bench(x)
+	float 		xmaxrel;			//x at which emaxrel was seen
+	float 		erms;				//root-mean-square of the absolute differences
+	int			time;				//time to execute num functions;
+};
+
+// Descriptor and result record for one two-argument math function test.
+// The first six fields are set by the test2[] table; the error and timing
+// fields are filled in by test_mathfunc2().
+struct	test2_s {
+	const char*	name;					//label printed in the results table
+	float 		(*func)(float, float);	//the function
+	float 		(*bench)(float, float);	//the function to benchmark against.
+	float 		rng0, rng1;				//range swept by both arguments
+	int			num;					//sample count used to derive the sweep step
+	float 		emaxabs;				//largest absolute difference seen
+	float 		xmaxabs;				//x (first argument) at which emaxabs was seen
+	float 		emaxrel;				//largest relative difference seen, in percent
+	float 		xmaxrel;				//x (first argument) at which emaxrel was seen
+	float 		erms;					//printed by main(); test_mathfunc2() never writes it
+	int			time;				//time to execute num functions;
+};
+
+
+// Reference inverse square root, 1 / sqrtf(x), built on the C library sqrtf().
+// Used in the test1[] table as the benchmark for invsqrtf_c / invsqrtf_neon.
+float invsqrtf(float x){
+	return (1.0f / sqrtf(x));
+}
+
+typedef struct test1_s test1_t;
+typedef struct test2_s test2_t;
+
+test1_t test1[51] = 
+{
+	{"sinf       ", 	sinf, 		sinf, 	-M_PI, 		M_PI, 	500000},
+	{"sinf_c     ", 	sinf_c, 	sinf, 	-M_PI, 		M_PI, 	500000},
+	{"sinf_neon  ", 	sinf_neon, 	sinf, 	-M_PI, 		M_PI, 	500000},
+	
+	{"cosf       ", 	cosf, 		cosf, 	-M_PI, 		M_PI, 	500000},
+	{"cosf_c     ", 	cosf_c, 	cosf, 	-M_PI, 		M_PI, 	500000},
+	{"cosf_neon  ", 	cosf_neon, 	cosf, 	-M_PI, 		M_PI, 	500000},
+
+	{"tanf       ", 	tanf, 		tanf, 	-M_PI_4, 	M_PI_4, 500000, 0, 0, 0},
+	{"tanf_c     ", 	tanf_c, 	tanf, 	-M_PI_4, 	M_PI_4, 500000, 0, 0, 0},
+	{"tanf_neon  ", 	tanf_neon, 	tanf, 	-M_PI_4, 	M_PI_4, 500000, 0, 0, 0},
+
+	{"asinf      ", 	asinf, 		asinf, 	-1, 		1, 		500000, 0, 0, 0},
+	{"asinf_c    ", 	asinf_c, 	asinf, 	-1, 		1,	 	500000, 0, 0, 0},
+	{"asinf_neon ",		asinf_neon,	asinf, 	-1, 		1, 		500000, 0, 0, 0},
+	
+	{"acosf      ", 	acosf, 		acosf, 	-1, 		1, 		500000, 0, 0, 0},
+	{"acosf_c    ", 	acosf_c, 	acosf, 	-1, 		1,	 	500000, 0, 0, 0},
+	{"acosf_neon ",		acosf_neon,	acosf, 	-1, 		1, 		500000, 0, 0, 0},
+	
+	{"atanf      ", 	atanf, 		atanf, 	-1, 		1, 		500000, 0, 0, 0},
+	{"atanf_c    ", 	atanf_c, 	atanf, 	-1, 		1,	 	500000, 0, 0, 0},
+	{"atanf_neon ",		atanf_neon,	atanf, 	-1, 		1, 		500000, 0, 0, 0},
+
+	{"sinhf       ", 	sinhf, 		sinhf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
+	{"sinhf_c     ", 	sinhf_c, 	sinhf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
+	{"sinhf_neon  ", 	sinhf_neon, sinhf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
+	
+	{"coshf       ", 	coshf, 		coshf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
+	{"coshf_c     ", 	coshf_c, 	coshf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
+	{"coshf_neon  ", 	coshf_neon, coshf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
+
+	{"tanhf       ", 	tanhf, 		tanhf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
+	{"tanhf_c     ", 	tanhf_c, 	tanhf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
+	{"tanhf_neon  ", 	tanhf_neon, tanhf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
+
+	{"expf       ", 	expf, 		expf, 	0, 			10, 	500000, 0, 0, 0},
+	{"expf_c     ", 	expf_c, 	expf, 	0, 			10, 	500000, 0, 0, 0},
+	{"expf_neon  ",		expf_neon, 	expf, 	0, 			10, 	500000, 0, 0, 0},
+	
+	{"logf       ", 	logf, 		logf, 	1, 			1000, 	500000, 0, 0, 0},
+	{"logf_c     ", 	logf_c, 	logf, 	1, 			1000, 	500000, 0, 0, 0},
+	{"logf_neon  ",		logf_neon, 	logf, 	1, 			1000, 	500000, 0, 0, 0},
+
+	{"log10f       ", 	log10f, 	log10f, 1, 			1000, 	500000, 0, 0, 0},
+	{"log10f_c     ", 	log10f_c, 	log10f, 1, 			1000, 	500000, 0, 0, 0},
+	{"log10f_neon  ",	log10f_neon,log10f, 1, 			1000, 	500000, 0, 0, 0},
+
+	{"floorf     ", 	floorf, 	floorf, 1, 			1000, 	5000000, 0, 0, 0},
+	{"floorf_c   ", 	floorf_c, 	floorf, 1, 			1000, 	5000000, 0, 0, 0},
+	{"floorf_neon",		floorf_neon,floorf, 1, 			1000, 	5000000, 0, 0, 0},
+
+	{"ceilf     ", 		ceilf, 		ceilf, 	1, 			1000, 	5000000, 0, 0, 0},
+	{"ceilf_c   ", 		ceilf_c, 	ceilf, 	1, 			1000, 	5000000, 0, 0, 0},
+	{"ceilf_neon",		ceilf_neon,	ceilf, 	1, 			1000, 	5000000, 0, 0, 0},
+
+	{"fabsf     ", 		fabsf, 		fabsf, 	1, 			1000, 	5000000, 0, 0, 0},
+	{"fabsf_c   ", 		fabsf_c, 	fabsf, 	1, 			1000, 	5000000, 0, 0, 0},
+	{"fabsf_neon",		fabsf_neon,	fabsf, 	1, 			1000, 	5000000, 0, 0, 0},
+
+	{"sqrtf      ", 	sqrtf, 		sqrtf, 	1, 			1000, 	500000, 0, 0, 0},
+	{"sqrtf_c    ", 	sqrtf_c, 	sqrtf, 	1, 			1000, 	500000, 0, 0, 0},
+	{"sqrtf_neon ",		sqrtf_neon,	sqrtf, 	1, 			1000, 	500000, 0, 0, 0},
+
+	{"invsqrtf      ", 	invsqrtf, 		invsqrtf, 	1, 	1000, 	500000, 0, 0, 0},
+	{"invsqrtf_c    ", 	invsqrtf_c, 	invsqrtf, 	1, 	1000, 	500000, 0, 0, 0},
+	{"invsqrtf_neon ",	invsqrtf_neon,	invsqrtf, 	1, 	1000, 	500000, 0, 0, 0},
+};
+
+test2_t test2[9] = 
+{
+	{"atan2f       ", 	atan2f, 	atan2f, 0.1, 		10, 	10000, 0, 0, 0},
+	{"atan2f_c     ", 	atan2f_c, 	atan2f, 0.1, 		10, 	10000, 0, 0, 0},
+	{"atan2f_neon  ", 	atan2f_neon,atan2f, 0.1, 		10, 	10000, 0, 0, 0},
+	
+	{"powf       ", 	powf, 		powf, 	1, 			10, 	10000, 0, 0, 0},
+	{"powf_c     ", 	powf_c, 	powf, 	1, 			10, 	10000, 0, 0, 0},
+	{"powf_neon  ", 	powf_neon, 	powf, 	1, 			10, 	10000, 0, 0, 0},
+
+	{"fmodf       ", 	fmodf, 		fmodf, 	1, 			10, 	10000, 0, 0, 0},
+	{"fmodf_c     ", 	fmodf_c, 	fmodf, 	1, 			10, 	10000, 0, 0, 0},
+	{"fmodf_neon  ", 	fmodf_neon, fmodf, 	1, 			10, 	10000, 0, 0, 0},
+
+};
+
+
+/*
+ * Runs one single-argument function test: an accuracy sweep followed by a
+ * timed sweep.
+ *
+ * tst: test descriptor. name/func/bench/rng0/rng1/num are read; emaxabs,
+ *      xmaxabs, emaxrel, xmaxrel, erms and time are overwritten.
+ *
+ * Accuracy sweep: x steps from rng0 towards rng1 in increments of
+ * (rng1 - rng0) / num, comparing func(x) with bench(x) and recording the
+ * largest absolute error, the largest relative error (percent of bench(x))
+ * and the RMS of the absolute error.
+ *
+ * Timed sweep: func is called again over the same range and the elapsed
+ * time is stored in tst->time.
+ *
+ * NOTE(review): the relative error is divided by the signed bench value, so
+ * samples where bench(x) is negative can never raise emaxrel, and samples
+ * where bench(x) is 0 produce inf/NaN. test_mathfunc2() guards against small
+ * bench values; this function does not.
+ */
+void 
+test_mathfunc1(test1_t *tst)
+{
+
+	float x;
+	float dx = (tst->rng1 - tst->rng0) / ((float)tst->num);
+
+	tst->emaxabs = tst->xmaxabs = 0;
+	tst->emaxrel = tst->xmaxrel = 0;
+	tst->erms = 0;
+	for(x = tst->rng0; x < tst->rng1 ; x += dx){	
+		float r = (tst->func)((float)x);
+		float rr = (tst->bench)((float)x);
+		float dr = fabs(r - rr);
+		float drr = dr * (100.0f / rr);
+		tst->erms += dr*dr;
+		if (dr > tst->emaxabs){
+			tst->emaxabs = dr;
+			tst->xmaxabs = x;
+		}
+		if (drr > tst->emaxrel){
+			tst->emaxrel = drr;
+			tst->xmaxrel = x;
+		}
+	}
+	tst->erms = sqrt(tst->erms / ((float) tst->num));
+	
+	// Record the start timestamp; it is turned into an elapsed time below.
+#ifdef WIN32
+	tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000);
+#else
+	tst->time = sceKernelGetSystemTimeWide();
+#endif
+
+	for(x = tst->rng0; x < tst->rng1 ; x += dx){	
+		(tst->func)((float)x);
+	}
+
+	// Elapsed time = end - start. The non-WIN32 branch used to store the raw
+	// end timestamp, which made every reported time an absolute clock value
+	// and every rate in the results table come out as x1.00.
+#ifdef WIN32
+	tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) - tst->time;
+#else
+	tst->time = sceKernelGetSystemTimeWide() - tst->time;
+#endif
+
+}
+
+/*
+ * Runs one two-argument function test: an accuracy sweep followed by a
+ * timed sweep.
+ *
+ * tst: test descriptor. name/func/bench/rng0/rng1/num are read; emaxabs,
+ *      xmaxabs, emaxrel, xmaxrel and time are overwritten. erms is not
+ *      touched by this function.
+ *
+ * Both arguments sweep from rng0 towards rng1 with step
+ * (rng1 - rng0)^2 / num. The largest absolute error and the largest
+ * relative error (percent of bench(x, y), only counted where
+ * |bench(x, y)| > 0.0001) are recorded together with the x value at which
+ * they occurred; the y value is not recorded.
+ *
+ * The same grid is then evaluated again with func alone and the elapsed
+ * time is stored in tst->time.
+ */
+void
+test_mathfunc2(test2_t *tst)
+{
+	float x, y;
+	float rng = tst->rng1 - tst->rng0;
+	float d = (rng * rng) / ((float) tst->num);
+
+	tst->emaxabs = tst->xmaxabs = 0;
+	tst->emaxrel = tst->xmaxrel = 0;
+	for(y = (tst->rng0); y < (tst->rng1) ; y += d){	
+		for(x = (tst->rng0); x < (tst->rng1); x += d){	
+			float r = (tst->func)((float)x, y);
+			float rr = (tst->bench)((float)x, y);
+			float dr = fabs(r - rr);
+			float drr = dr * (100.0f / rr);
+			if (dr > tst->emaxabs){
+				tst->emaxabs = dr;
+				tst->xmaxabs = x;
+			}
+			if (drr > tst->emaxrel && fabsf(rr) > 0.0001){
+				tst->emaxrel = drr;
+				tst->xmaxrel = x;
+			}
+		}
+	}
+	
+	// Record the start timestamp; it is turned into an elapsed time below.
+#ifdef WIN32
+	tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) ;
+#else
+	tst->time = sceKernelGetSystemTimeWide();
+#endif
+
+	for(y = tst->rng0; y < tst->rng1 ; y += d){	
+		for(x = tst->rng0; x < tst->rng1 ; x += d){	
+			(tst->func)((float)x, (float)y);
+		}
+	}
+
+	// Elapsed time = end - start. The non-WIN32 branch used to store the raw
+	// end timestamp instead of the difference.
+#ifdef WIN32
+	tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) - tst->time;
+#else
+	tst->time = sceKernelGetSystemTimeWide() - tst->time;
+#endif
+
+}
+
+// Benchmarks the vector routines (dot, normalize, cross) for 2-, 3- and
+// 4-element vectors, comparing each *_c implementation with its *_neon
+// counterpart.
+//
+// For every routine the same pattern is used:
+//   1. take a timestamp, run the *_c version testnum times,
+//   2. take a timestamp, run the *_neon version testnum times,
+//   3. take a final timestamp,
+//   4. call each version once more and LOG its result so the outputs can be
+//      compared by eye,
+//   5. LOG both elapsed times and their ratio (c time / neon time).
+//
+// Inputs are random values in [-5, 5) produced with randf().
+void test_vectorfunc()
+{
+	float v0[4], v1[4], d[4];
+	
+	for(int i=0;i<4;i++)
+	{
+		v0[i] = 10*randf() - 5;
+		v1[i] = 10*randf() - 5;
+		d[i] = 10*randf() - 5;		
+	}
+	
+	int testnum = 5000000;
+	struct rusage ru;	// not used anywhere in this function
+	// Timestamps per vector size: [0] start, [1] after the C loop, [2] after the NEON loop.
+	int v2t[3], v3t[3], v4t[3];
+	float r;
+	
+	LOG("\n");
+	
+	//dot 2
+	v2t[0] = sceKernelGetSystemTimeWide();
+	for(int i=0;i < testnum; i++)
+	{
+		r = dot2_c(v0, v1);
+	};
+	v2t[1] = sceKernelGetSystemTimeWide();
+	for(int i=0;i < testnum; i++)
+	{
+		r = dot2_neon(v0, v1);
+	};
+	v2t[2] = sceKernelGetSystemTimeWide();
+
+	r = dot2_c(v0, v1);
+	LOG("dot2_c = %f\n", r);
+	r = dot2_neon(v0, v1);
+	LOG("dot2_neon = %f\n", r);
+	
+	LOG("dot2: c=%i \t neon=%i \t rate=%.2f \n", v2t[1] - v2t[0], v2t[2] - v2t[1], 
+	(float)(v2t[1] - v2t[0]) / (float)(v2t[2] - v2t[1]));
+
+	//normalize 2
+	v2t[0] = sceKernelGetSystemTimeWide();
+	for(int i=0;i < testnum; i++)
+	{
+		normalize2_c(v0, d);
+	};
+	v2t[1] = sceKernelGetSystemTimeWide();
+	for(int i=0;i < testnum; i++)
+	{
+		normalize2_neon(v0, d);
+	};
+	v2t[2] = sceKernelGetSystemTimeWide();
+
+
+	normalize2_c(v0, d);
+	LOG("normalize2_c = [%.2f, %.2f]\n", d[0], d[1]);
+	normalize2_neon(v0, d);
+	LOG("normalize2_neon = [%.2f, %.2f]\n", d[0], d[1]);
+	
+	LOG("normalize2: c=%i \t neon=%i \t rate=%.2f \n", v2t[1] - v2t[0], v2t[2] - v2t[1], 
+	(float)(v2t[1] - v2t[0]) / (float)(v2t[2] - v2t[1]));
+	LOG("\n");
+
+	
+	//dot 3
+	v3t[0] = sceKernelGetSystemTimeWide();
+	for(int i=0;i < testnum; i++)
+	{
+		r = dot3_c(v0, v1);
+	};	
+	v3t[1] = sceKernelGetSystemTimeWide();
+	for(int i=0;i < testnum; i++)
+	{
+		r = dot3_neon(v0, v1);
+	};
+	v3t[2] = sceKernelGetSystemTimeWide();
+
+	r = dot3_c(v0, v1);
+	LOG("dot3_c = %f\n", r);
+	r = dot3_neon(v0, v1);
+	LOG("dot3_neon = %f\n", r);
+	
+	LOG("dot3: c=%i \t neon=%i \t rate=%.2f \n", v3t[1] - v3t[0], v3t[2] - v3t[1], 
+	(float)(v3t[1] - v3t[0]) / (float)(v3t[2] - v3t[1]));
+
+	//normalize 3
+	v3t[0] = sceKernelGetSystemTimeWide();
+	for(int i=0;i < testnum; i++)
+	{
+		normalize3_c(v0, d);
+	};	
+	v3t[1] = sceKernelGetSystemTimeWide();
+	for(int i=0;i < testnum; i++)
+	{
+		normalize3_neon(v0, d);
+	};	
+	v3t[2] = sceKernelGetSystemTimeWide();
+
+
+	normalize3_c(v0, d);
+	LOG("normalize3_c = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);
+	normalize3_neon(v0, d);
+	LOG("normalize3_neon = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);
+	
+	LOG("normalize3: c=%i \t neon=%i \t rate=%.2f \n", v3t[1] - v3t[0], v3t[2] - v3t[1], 
+	(float)(v3t[1] - v3t[0]) / (float)(v3t[2] - v3t[1]));
+
+	//cross 3	
+	v3t[0] = sceKernelGetSystemTimeWide();
+	for(int i=0;i < testnum; i++)
+	{
+		cross3_c(v0, v1, d);
+	};
+	v3t[1] = sceKernelGetSystemTimeWide();
+	for(int i=0;i < testnum; i++)
+	{
+		cross3_neon(v0, v1, d);
+	};
+	v3t[2] = sceKernelGetSystemTimeWide();
+
+
+	cross3_c(v0, v1, d);
+	LOG("cross3_c = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);
+	cross3_neon(v0, v1, d);
+	LOG("cross3_neon = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);
+	
+	LOG("cross3: c=%i \t neon=%i \t rate=%.2f \n", v3t[1] - v3t[0], v3t[2] - v3t[1], 
+	(float)(v3t[1] - v3t[0]) / (float)(v3t[2] - v3t[1]));
+	LOG("\n");
+
+
+	//dot 4
+	v4t[0] = sceKernelGetSystemTimeWide();
+	for(int i=0;i < testnum; i++)
+	{
+		r = dot4_c(v0, v1);
+	};
+	v4t[1] = sceKernelGetSystemTimeWide();
+	for(int i=0;i < testnum; i++)
+	{
+		r = dot4_neon(v0, v1);
+	};
+	v4t[2] = sceKernelGetSystemTimeWide();
+
+	r = dot4_c(v0, v1);
+	LOG("dot4_c = %f\n", r);
+	r = dot4_neon(v0, v1);
+	LOG("dot4_neon = %f\n", r);
+	
+	LOG("dot4: c=%i \t neon=%i \t rate=%.2f \n", v4t[1] - v4t[0], v4t[2] - v4t[1], 
+	(float)(v4t[1] - v4t[0]) / (float)(v4t[2] - v4t[1]));
+	
+	//normalize 4
+	v4t[0] = sceKernelGetSystemTimeWide();
+	for(int i=0;i < testnum; i++)
+	{
+		normalize4_c(v0, d);
+	};	
+	v4t[1] = sceKernelGetSystemTimeWide();
+	for(int i=0;i < testnum; i++)
+	{
+		normalize4_neon(v0, d);
+	};	
+	v4t[2] = sceKernelGetSystemTimeWide();
+
+
+	normalize4_c(v0, d);
+	LOG("normalize4_c = [%.2f, %.2f, %.2f, %.2f]\n", d[0], d[1], d[2], d[3]);
+	normalize4_neon(v0, d);
+	LOG("normalize4_neon = [%.2f, %.2f, %.2f, %.2f]\n", d[0], d[1], d[2], d[3]);
+	
+	LOG("normalize4: c=%i \t neon=%i \t rate=%.2f \n", v4t[1] - v4t[0], v4t[2] - v4t[1], 
+	(float)(v4t[1] - v4t[0]) / (float)(v4t[2] - v4t[1]));
+	LOG("\n");
+
+
+}
+
+
+
+// Benchmarks the matrix routines (matmulN and matvecN for N = 2, 3, 4),
+// comparing each *_c implementation with its *_neon counterpart.
+//
+// For every routine: timestamp, testnum calls of the *_c version, timestamp,
+// testnum calls of the *_neon version, timestamp. Each version is then
+// called once more and its output is LOGged, followed by both elapsed times
+// and their ratio (c time / neon time).
+//
+// m0 and m1 are the inputs and m2 receives the result; all three start out
+// filled with random values in [-5, 5).
+// NOTE(review): the print order (m2[0], m2[2] on the first 2x2 row, etc.)
+// suggests column-major storage — confirm against math_neon.h.
+void test_matrixfunc()
+{
+	float m0[16], m1[16], m2[16];
+	// Timestamps per matrix size: [0] start, [1] after the C loop, [2] after the NEON loop.
+	int m2t[3], m3t[3], m4t[3];
+	
+	int i;
+	int testnum = 1000000;
+	struct rusage ru;	// not used anywhere in this function
+	
+	// This loop declares its own i, shadowing the function-level i above.
+	for(int i=0;i<16;i++)
+	{
+		m0[i] = 10.0f * randf() - 5.0f; 
+		m1[i] = 10.0f * randf() - 5.0f; 
+		m2[i] = 10.0f * randf() - 5.0f; 
+	}
+
+
+	//matmul2 
+	m2t[0] = sceKernelGetSystemTimeWide();
+	for(i = 0; i < testnum; i++){
+		matmul2_c(m0, m1, m2);	
+	}
+	m2t[1] = sceKernelGetSystemTimeWide();
+	for(i = 0; i < testnum; i++){
+		matmul2_neon(m0, m1, m2);
+	}
+	m2t[2] = sceKernelGetSystemTimeWide();
+
+	matmul2_c(m0, m1, m2);	
+	LOG("matmul2_c = \n");
+	LOG("\t\t\t|%.2f, %.2f|\n", m2[0], m2[2]);
+	LOG("\t\t\t|%.2f, %.2f|\n", m2[1], m2[3]);
+
+	matmul2_neon(m0, m1, m2);	
+	LOG("matmul2_neon = \n");
+	LOG("\t\t\t|%.2f, %.2f|\n", m2[0], m2[2]);
+	LOG("\t\t\t|%.2f, %.2f|\n", m2[1], m2[3]);
+	
+	LOG("matmul2: c=%i \t neon=%i \t rate=%.2f \n", m2t[1] - m2t[0], m2t[2] - m2t[1], 
+		(float)(m2t[1] - m2t[0]) / (float)(m2t[2] - m2t[1]));
+
+
+	//matvec2 
+	m2t[0] = sceKernelGetSystemTimeWide();
+	for(i = 0; i < testnum; i++){
+		matvec2_c(m0, m1, m2);	
+	}
+	m2t[1] = sceKernelGetSystemTimeWide();
+	for(i = 0; i < testnum; i++){
+		matvec2_neon(m0, m1, m2);
+	}
+	m2t[2] = sceKernelGetSystemTimeWide();
+
+	// The output is cleared before each verification call so the printed
+	// values can only come from that call.
+	memset(m2, 0, 4*sizeof(float));
+	matvec2_c(m0, m1, m2);	
+	LOG("matvec2_c = |%.2f, %.2f|\n", m2[0], m2[1]);
+	
+	memset(m2, 0, 4*sizeof(float));
+	matvec2_neon(m0, m1, m2);	
+	LOG("matvec2_neon = |%.2f, %.2f|\n", m2[0], m2[1]);
+
+	LOG("matvec2: c=%i \t neon=%i \t rate=%.2f \n", m2t[1] - m2t[0], m2t[2] - m2t[1], 
+		(float)(m2t[1] - m2t[0]) / (float)(m2t[2] - m2t[1]));
+
+	//MAT3
+	m3t[0] = sceKernelGetSystemTimeWide();
+	for(i = 0; i < testnum; i++){
+		matmul3_c(m0, m1, m2);	
+	}
+	m3t[1] = sceKernelGetSystemTimeWide();
+	for(i = 0; i < testnum; i++){
+		matmul3_neon(m0, m1, m2);
+	}
+	m3t[2] = sceKernelGetSystemTimeWide();
+
+	memset(m2, 0, 9*sizeof(float));
+	matmul3_c(m0, m1, m2);	
+	LOG("matmul3_c =\n");
+	LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[0], m2[3], m2[6]);
+	LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[1], m2[4], m2[7]);
+	LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[2], m2[5], m2[8]);
+	
+	memset(m2, 0, 9*sizeof(float));
+	matmul3_neon(m0, m1, m2);	
+	LOG("matmul3_neon =\n");
+	LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[0], m2[3], m2[6]);
+	LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[1], m2[4], m2[7]);
+	LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[2], m2[5], m2[8]);
+	
+	LOG("matmul3: c=%i \t neon=%i \t rate=%.2f \n", m3t[1] - m3t[0], m3t[2] - m3t[1], 
+		(float)(m3t[1] - m3t[0]) / (float)(m3t[2] - m3t[1]));
+
+	//matvec3
+	m3t[0] = sceKernelGetSystemTimeWide();
+	for(i = 0; i < testnum; i++){
+		matvec3_c(m0, m1, m2);	
+	}
+	m3t[1] = sceKernelGetSystemTimeWide();
+	for(i = 0; i < testnum; i++){
+		matvec3_neon(m0, m1, m2);
+	}
+	m3t[2] = sceKernelGetSystemTimeWide();
+
+	memset(m2, 0, 4*sizeof(float));
+	matvec3_c(m0, m1, m2);	
+	LOG("matvec3_c = |%.2f, %.2f, %.2f|\n", m2[0], m2[1], m2[2]);
+
+	memset(m2, 0, 4*sizeof(float));
+	matvec3_neon(m0, m1, m2);	
+	LOG("matvec3_neon = |%.2f, %.2f, %.2f|\n", m2[0], m2[1], m2[2]);
+	
+	LOG("matvec3: c=%i \t neon=%i \t rate=%.2f \n", m3t[1] - m3t[0], m3t[2] - m3t[1], 
+		(float)(m3t[1] - m3t[0]) / (float)(m3t[2] - m3t[1]));
+
+	//MAT4
+	m4t[0] = sceKernelGetSystemTimeWide();
+	for(i = 0; i < testnum; i++){
+		matmul4_c(m0, m1, m2);	
+	}
+	m4t[1] = sceKernelGetSystemTimeWide();
+	for(i = 0; i < testnum; i++){
+		matmul4_neon(m0, m1, m2);
+	}
+	m4t[2] = sceKernelGetSystemTimeWide();
+
+	memset(m2, 0, 16*sizeof(float));
+	matmul4_c(m0, m1, m2);	
+	LOG("matmul4_c =\n");
+	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[0], m2[4], m2[8], m2[12]);
+	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[1], m2[5], m2[9], m2[13]);
+	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[2], m2[6], m2[10], m2[14]);
+	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[3], m2[7], m2[11], m2[15]);
+	
+	memset(m2, 0, 16*sizeof(float));
+	matmul4_neon(m0, m1, m2);	
+	LOG("matmul4_neon =\n");
+	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[0], m2[4], m2[8], m2[12]);
+	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[1], m2[5], m2[9], m2[13]);
+	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[2], m2[6], m2[10], m2[14]);
+	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[3], m2[7], m2[11], m2[15]);
+	
+	LOG("matmul4: c=%i \t neon=%i \t rate=%.2f \n", m4t[1] - m4t[0], m4t[2] - m4t[1], 
+		(float)(m4t[1] - m4t[0]) / (float)(m4t[2] - m4t[1]));
+
+	//matvec4
+	m4t[0] = sceKernelGetSystemTimeWide();
+	for(i = 0; i < testnum; i++){
+		matvec4_c(m0, m1, m2);	
+	}
+	m4t[1] = sceKernelGetSystemTimeWide();
+	for(i = 0; i < testnum; i++){
+		matvec4_neon(m0, m1, m2);
+	}
+	m4t[2] = sceKernelGetSystemTimeWide();
+
+	memset(m2, 0, 4*sizeof(float));
+	matvec4_c(m0, m1, m2);	
+	LOG("matvec4_c = |%.2f, %.2f, %.2f, %f|\n", m2[0], m2[1], m2[2], m2[3]);
+
+	memset(m2, 0, 4*sizeof(float));
+	matvec4_neon(m0, m1, m2);	
+	LOG("matvec4_neon = |%.2f, %.2f, %.2f, %f|\n", m2[0], m2[1], m2[2], m2[3]);
+	
+	LOG("matvec4: c=%i \t neon=%i \t rate=%.2f \n", m4t[1] - m4t[0], m4t[2] - m4t[1], 
+		(float)(m4t[1] - m4t[0]) / (float)(m4t[2] - m4t[1]));
+
+
+}
+
+/*
+ * Benchmark entry point.
+ *
+ * Logs whether RUNFAST mode is enabled, seeds rand() from the current time,
+ * runs the matrix and vector benchmarks, then runs every entry of test1[]
+ * and test2[] and logs one results row per entry.
+ *
+ * The tables are organised in groups of three (reference, *_c, *_neon), so
+ * the "Rate" column divides the time of the first entry of the group
+ * (index i - i % 3) by the time of the current entry.
+ *
+ * argc, argv: unused.
+ * Returns 0.
+ */
+int main(int argc, char** argv)
+{	
+	// Loop bounds come from the tables themselves so they cannot drift out
+	// of sync when entries are added or removed.
+	const int num_test1 = (int)(sizeof(test1) / sizeof(test1[0]));
+	const int num_test2 = (int)(sizeof(test2) / sizeof(test2[0]));
+	int i, ii;
+
+	(void)argc;
+	(void)argv;
+#if 1
+	LOG("RUNFAST: Disabled \n");
+#else
+	LOG("RUNFAST: Enabled \n");
+	enable_runfast();
+#endif
+	srand(time(NULL));
+
+#if 1
+	//test single argument functions:
+	LOG("------------------------------------------------------------------------------------------------------\n");	
+	LOG("MATRIX FUNCTION TESTS \n");	
+	LOG("------------------------------------------------------------------------------------------------------\n");	
+	
+	test_matrixfunc();
+	test_vectorfunc();
+
+	LOG("------------------------------------------------------------------------------------------------------\n");	
+	LOG("CMATH FUNCTION TESTS \n");	
+	LOG("------------------------------------------------------------------------------------------------------\n");	
+	LOG("Function\tRange\t\tNumber\tABS Max Error\tREL Max Error\tRMS Error\tTime\tRate\n");	
+	LOG("------------------------------------------------------------------------------------------------------\n");	
+	for(i = 0; i < num_test1; i++){
+		test_mathfunc1(&test1[i]);	
+		
+		ii = i - (i % 3);	// index of the reference entry of this group of three
+		LOG("%s\t", test1[i].name);
+		LOG("[%.2f, %.2f]\t", test1[i].rng0, test1[i].rng1);
+		LOG("%i\t", test1[i].num);
+		LOG("%.2e\t", test1[i].emaxabs);
+		LOG("%.2e%%\t", test1[i].emaxrel);
+		LOG("%.2e\t", test1[i].erms);
+		LOG("%i\t", test1[i].time);
+		LOG("x%.2f\t", (float)test1[ii].time / test1[i].time);
+		LOG("\n");
+	}
+	for(i = 0; i < num_test2; i++){
+		test_mathfunc2(&test2[i]);
+	
+		ii = i - (i % 3);	// index of the reference entry of this group of three
+		
+		LOG("%s\t", test2[i].name);
+		LOG("[%.2f, %.2f]\t", test2[i].rng0, test2[i].rng1);
+		LOG("%i\t", test2[i].num);
+		LOG("%.2e\t", test2[i].emaxabs);
+		LOG("%.2e%%\t", test2[i].emaxrel);
+		LOG("%.2e\t", test2[i].erms);
+		LOG("%i\t", test2[i].time);
+		LOG("x%.2f\t", (float)test2[ii].time / test2[i].time);
+		LOG("\n");
+	}
+	
+#else
+
+	// Disabled alternative: dump sinhf / sinhf_c / sinhf_neon side by side.
+	float x = 0;
+	for(x = -M_PI_2; x < M_PI_2; x+= 0.01)
+	{
+		LOG("x=%.2f\t in=%.2f\t c=%.2f\t neon=%.2f \n", x, sinhf(x), sinhf_c(x), sinhf_neon(x));
+	}
+
+#endif
+	
+	return 0;
+} 
--- a/deps/math-neon/source/math_neon.h
+++ b/deps/math-neon/source/math_neon.h
@ -33,7 +33,7 @@ THE SOFTWARE.
 //Default Floating Point value ABI: 0=softfp, 1=hardfp. Only effects *_neon routines.
 //You can access the hardfp versions directly via the *_hard suffix. 
 //You can access the softfp versions directly via the *_soft suffix. 
-#define __MATH_FPABI 	0	
+#define __MATH_FPABI 	1	

 #endif

@ -84,6 +84,10 @@ THE SOFTWARE.
 #define modf_neon		modf_neon_hfp
 #define sqrtf_neon		sqrtf_neon_hfp
 #define invsqrtf_neon	invsqrtf_neon_hfp
+
+#define dot2_neon		dot2_neon_hfp
+#define dot3_neon		dot3_neon_hfp
+#define dot4_neon		dot4_neon_hfp
 #else
 #define sinf_neon		sinf_neon_sfp
 #define cosf_neon		cosf_neon_sfp
--- a/deps/math-neon/source/math_sqrtfv.c
+++ b/deps/math-neon/source/math_sqrtfv.c
@ -100,7 +100,7 @@ void sqrtfv_c(float *x, int n, float *r)

 void sqrtfv_neon(float *x, int n, float *r)
 {
-#if 0
+#ifdef __MATH_NEON
 	asm volatile (

 	"tst 			r1, #1 					\n\t"	//r1 & 1
--- a/deps/vitaGL/Makefile
+++ b/deps/vitaGL/Makefile
@ -17,6 +17,10 @@ AR      = $(PREFIX)-gcc-ar
 CFLAGS  = -g -Wl,-q -O2 -ffast-math -mtune=cortex-a9 -mfpu=neon -flto -ftree-vectorize -DSTB_DXT_IMPLEMENTATION
 ASFLAGS = $(CFLAGS)

+ifeq ($(NO_DEBUG),1)
+CFLAGS  += -DSKIP_ERROR_HANDLING
+endif
+
 all: $(TARGET).a

 $(TARGET).a: $(OBJS)
--- a/deps/vitaGL/source/custom_shaders.c
+++ b/deps/vitaGL/source/custom_shaders.c
@ -357,8 +357,7 @@ void glUniformMatrix4fv(GLint location, GLsizei count, GLboolean transpose, cons
 * ------------------------------
 */

-// Equivalent of glBindAttribLocation but for sceGxm architecture
-void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type) {
+void vglBindPackedAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type, GLuint offset) {
 	// Grabbing passed program
 	program *p = &progs[prog - 1];
 	SceGxmVertexAttribute *attributes = &p->attr[index];
@ -369,7 +368,7 @@ void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const

 	// Setting stream index and offset values
 	attributes->streamIndex = index;
-	attributes->offset = 0;
+	attributes->offset = offset;

 	// Detecting attribute format and size
 	int bpe;
@ -396,7 +395,12 @@ void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const
 		p->attr_num = index + 1;
 }

-// Equivalent of glVertexAttribLocation but for sceGxm architecture
+// Equivalent of glBindAttribLocation but for sceGxm architecture; forwards to vglBindPackedAttribLocation with an offset of 0
+void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type) {
+	vglBindPackedAttribLocation(prog, index, name, num, type, 0);
+}
+
+// Equivalent of glVertexAttribPointer but for sceGxm architecture
 void vglVertexAttribPointer(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, GLuint count, const GLvoid *pointer) {
 #ifndef SKIP_ERROR_HANDLING
 	// Error handling
--- a/deps/vitaGL/source/textures.c
+++ b/deps/vitaGL/source/textures.c
@ -101,6 +101,7 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt

 	SceGxmTextureFormat tex_format;
 	uint8_t data_bpp = 0;
+	uint8_t fast_store = GL_FALSE;

 	// Support for legacy GL1.0 internalFormat
 	switch (internalFormat) {
@ -157,7 +158,8 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt
 		switch (type) {
 		case GL_UNSIGNED_BYTE:
 			data_bpp = 3;
-			read_cb = readRGB;
+			if (internalFormat == GL_RGB) fast_store = GL_TRUE;
+			else read_cb = readRGB;
 			break;
 		default:
 			error = GL_INVALID_ENUM;
@ -168,7 +170,8 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt
 		switch (type) {
 		case GL_UNSIGNED_BYTE:
 			data_bpp = 4;
-			read_cb = readRGBA;
+			if (internalFormat == GL_RGBA) fast_store = GL_TRUE;
+			else read_cb = readRGBA;
 			break;
 		case GL_UNSIGNED_SHORT_5_5_5_1:
 			data_bpp = 2;
@ -236,7 +239,7 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt
 		tex->type = internalFormat;
 		tex->write_cb = write_cb;
 		if (level == 0)
-			if (tex->write_cb) gpu_alloc_texture(width, height, tex_format, data, tex, data_bpp, read_cb, write_cb);
+			if (tex->write_cb) gpu_alloc_texture(width, height, tex_format, data, tex, data_bpp, read_cb, write_cb, fast_store);
 			else gpu_alloc_compressed_texture(width, height, tex_format, data, tex, data_bpp, read_cb);
 		else {
 			gpu_alloc_mipmaps(level, tex);
--- a/deps/vitaGL/source/utils/gpu_utils.c
+++ b/deps/vitaGL/source/utils/gpu_utils.c
@ -255,7 +255,7 @@ void gpu_free_texture(texture *tex) {
 	tex->valid = 0;
 }

-void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t)) {
+void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t), uint8_t fast_store) {
 	// If there's already a texture in passed texture object we first dealloc it
 	if (tex->valid)
 		gpu_free_texture(tex);
@ -274,13 +274,22 @@ void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const
 			int i, j;
 			uint8_t *src = (uint8_t *)data;
 			uint8_t *dst;
-			for (i = 0; i < h; i++) {
-				dst = ((uint8_t *)texture_data) + (ALIGN(w, 8) * bpp) * i;
-				for (j = 0; j < w; j++) {
-					uint32_t clr = read_cb(src);
-					write_cb(dst, clr);
-					src += src_bpp;
-					dst += bpp;
+			if (fast_store) { // Internal Format and Data Format are the same, we can just use memcpy for better performance
+				uint32_t line_size = w * bpp;
+				for (i = 0; i < h; i++) {
+					dst = ((uint8_t *)texture_data) + (ALIGN(w, 8) * bpp) * i;
+					memcpy(dst, src, line_size);
+					src += line_size;
+				}
+			} else { // Different internal and data formats, we need to go with slower callbacks system
+				for (i = 0; i < h; i++) {
+					dst = ((uint8_t *)texture_data) + (ALIGN(w, 8) * bpp) * i;
+					for (j = 0; j < w; j++) {
+						uint32_t clr = read_cb(src);
+						write_cb(dst, clr);
+						src += src_bpp;
+						dst += bpp;
+					}
 				}
 			}
 		} else
--- a/deps/vitaGL/source/utils/gpu_utils.h
+++ b/deps/vitaGL/source/utils/gpu_utils.h
@ -82,7 +82,7 @@ void gpu_pool_init(uint32_t temp_pool_size);
 int tex_format_to_bytespp(SceGxmTextureFormat format);

 // Alloc a texture
-void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t));
+void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t), uint8_t fast_store);

 // Alloc a compresseed texture
 void gpu_alloc_compressed_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *));
--- a/deps/vitaGL/source/vitaGL.h
+++ b/deps/vitaGL/source/vitaGL.h
@ -378,6 +378,7 @@ void vglVertexPointerMapped(const GLvoid *pointer);

 // VGL_EXT_gxp_shaders extension implementation
 void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type);
+void vglBindPackedAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type, GLuint offset);
 void vglVertexAttribPointer(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, GLuint count, const GLvoid *pointer);
 void vglVertexAttribPointerMapped(GLuint index, const GLvoid *pointer);