mirror of
https://github.com/libretro/RetroArch
synced 2025-03-29 22:20:21 +00:00
Merge pull request #10750 from libretro/vitagl
VitaGL and Math-neon update
This commit is contained in:
commit
a10abdd929
225
deps/math-neon/README
vendored
225
deps/math-neon/README
vendored
@ -36,134 +36,133 @@ Email: lachlan.ts@gmail.com
|
||||
|
||||
PSVITA performance test results:
|
||||
|
||||
RUNFAST: Enabled
|
||||
RUNFAST: Disabled
|
||||
------------------------------------------------------------------------------------------------------
|
||||
MATRIX FUNCTION TESTS
|
||||
------------------------------------------------------------------------------------------------------
|
||||
matmul2_c =
|
||||
|-7.16, 9.42|
|
||||
|17.86, -10.70|
|
||||
|-14.56, 5.96|
|
||||
|-15.35, 10.50|
|
||||
matmul2_neon =
|
||||
|-7.16, 9.42|
|
||||
|17.86, -10.70|
|
||||
matmul2: c=183985 neon=87480 rate=2.10
|
||||
matvec2_c = |-7.16, 17.86|
|
||||
matvec2_neon = |-7.16, 17.86|
|
||||
matvec2: c=98178 neon=66040 rate=1.49
|
||||
|-14.56, 5.96|
|
||||
|-15.35, 10.50|
|
||||
matmul2: c=174924 neon=64490 rate=2.71
|
||||
matvec2_c = |-14.56, -15.35|
|
||||
matvec2_neon = |-14.56, -15.35|
|
||||
matvec2: c=88957 neon=58337 rate=1.52
|
||||
matmul3_c =
|
||||
|11.14, -0.78, -3.98|
|
||||
|16.56, 17.96, 23.58|
|
||||
|8.73, -0.18, 1.57|
|
||||
|-21.39, -4.68, -1.74|
|
||||
|-8.66, -8.97, 1.83|
|
||||
|15.88, 0.30, -2.23|
|
||||
matmul3_neon =
|
||||
|11.14, -0.78, -3.98|
|
||||
|16.56, 17.96, 23.58|
|
||||
|8.73, -0.18, 1.57|
|
||||
matmul3: c=551838 neon=340292 rate=1.62
|
||||
matvec3_c = |11.14, 16.56, 8.73|
|
||||
matvec3_neon = |11.14, 16.56, 8.73|
|
||||
matvec3: c=98178 neon=66040 rate=1.49
|
||||
|-21.39, -4.68, -1.74|
|
||||
|-8.66, -8.97, 1.83|
|
||||
|15.88, 0.30, -2.23|
|
||||
matmul3: c=552486 neon=297268 rate=1.86
|
||||
matvec3_c = |-21.39, -8.66, 15.88|
|
||||
matvec3_neon = |-21.39, -8.66, 15.88|
|
||||
matvec3: c=184104 neon=128780 rate=1.43
|
||||
matmul4_c =
|
||||
|17.91, -23.96, 1.86, 16.53|
|
||||
|4.10, -18.16, 4.17, 29.06|
|
||||
|6.92, -1.60, 3.12, 27.81|
|
||||
|-15.13, -7.46, -17.91, 22.49|
|
||||
|-13.65, -1.80, -12.92, 6.56|
|
||||
|-10.21, 9.47, 2.73, 14.79|
|
||||
|0.97, 11.69, -0.64, -12.87|
|
||||
|20.06, 6.77, 35.61, -0.02|
|
||||
matmul4_neon =
|
||||
|17.91, -23.96, 1.86, 16.53|
|
||||
|4.10, -18.16, 4.17, 29.06|
|
||||
|6.92, -1.60, 3.12, 27.81|
|
||||
|-15.13, -7.46, -17.91, 22.49|
|
||||
matmul4: c=1316131 neon=315444 rate=4.17
|
||||
matvec4_c = |17.91, 4.10, 6.92, -15.126419|
|
||||
matvec4_neon = |17.91, 4.10, 6.92, -15.126419|
|
||||
matvec4: c=98178 neon=66040 rate=1.49
|
||||
|-13.65, -1.80, -12.92, 6.56|
|
||||
|-10.21, 9.47, 2.73, 14.79|
|
||||
|0.97, 11.69, -0.64, -12.87|
|
||||
|20.06, 6.77, 35.61, -0.02|
|
||||
matmul4: c=1315568 neon=254227 rate=5.17
|
||||
matvec4_c = |-13.65, -10.21, 0.97, 20.058556|
|
||||
matvec4_neon = |-13.65, -10.21, 0.97, 20.058556|
|
||||
matvec4: c=331712 neon=147196 rate=2.25
|
||||
|
||||
dot2_c = 5.804099
|
||||
dot2_neon = 5.804099
|
||||
dot2: c=291526 neon=307025 rate=0.95
|
||||
normalize2_c = [0.97, 0.24]
|
||||
normalize2_neon = [0.97, 0.24]
|
||||
normalize2: c=1058588 neon=965696 rate=1.10
|
||||
dot2_c = -10.903330
|
||||
dot2_neon = -10.903330
|
||||
dot2: c=230295 neon=168799 rate=1.36
|
||||
normalize2_c = [-0.74, 0.67]
|
||||
normalize2_neon = [-0.74, 0.67]
|
||||
normalize2: c=950716 neon=965780 rate=0.98
|
||||
|
||||
dot3_c = -0.817487
|
||||
dot3_neon = -0.817487
|
||||
dot3: c=322094 neon=444834 rate=0.72
|
||||
normalize3_c = [0.50, 0.12, -0.86]
|
||||
normalize3_neon = [0.50, 0.12, -0.86]
|
||||
normalize3: c=1257201 neon=1134375 rate=1.11
|
||||
cross3_c = [-13.16, -17.29, -10.19]
|
||||
cross3_neon = [-13.16, -17.29, -10.19]
|
||||
cross3: c=705298 neon=766477 rate=0.92
|
||||
dot3_c = -4.226746
|
||||
dot3_neon = -4.226746
|
||||
dot3: c=306957 neon=337316 rate=0.91
|
||||
normalize3_c = [-0.69, 0.62, -0.38]
|
||||
normalize3_neon = [-0.69, 0.62, -0.38]
|
||||
normalize3: c=1180950 neon=1134557 rate=1.04
|
||||
cross3_c = [-9.67, -19.39, -14.24]
|
||||
cross3_neon = [-9.67, -19.39, -14.24]
|
||||
cross3: c=659558 neon=766896 rate=0.86
|
||||
|
||||
dot4_c = -7.880241
|
||||
dot4_neon = -7.880241
|
||||
dot4: c=414431 neon=506460 rate=0.82
|
||||
normalize4_c = [0.45, 0.11, -0.77, -0.44]
|
||||
normalize4_neon = [0.45, 0.11, -0.77, -0.44]
|
||||
normalize4: c=1410727 neon=1102802 rate=1.28
|
||||
dot4_c = 2.782796
|
||||
dot4_neon = 2.782796
|
||||
dot4: c=414233 neon=276068 rate=1.50
|
||||
normalize4_c = [-0.59, 0.53, -0.32, -0.52]
|
||||
normalize4_neon = [-0.59, 0.53, -0.32, -0.52]
|
||||
normalize4: c=1364294 neon=1103327 rate=1.24
|
||||
|
||||
------------------------------------------------------------------------------------------------------
|
||||
CMATH FUNCTION TESTS
|
||||
------------------------------------------------------------------------------------------------------
|
||||
Function Range Number ABS Max Error REL Max Error RMS Error Time Rate
|
||||
------------------------------------------------------------------------------------------------------
|
||||
sinf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 647042739 x1.00
|
||||
sinf_c [-3.14, 3.14] 500000 7.75e-07 1.00e+02% 4.09e-07 646276691 x1.00
|
||||
sinf_neon [-3.14, 3.14] 500000 1.00e+00 1.00e+02% 7.07e-01 645546381 x1.00
|
||||
cosf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 644742077 x1.00
|
||||
cosf_c [-3.14, 3.14] 500000 7.75e-07 6.74e-01% 4.15e-07 643957358 x1.00
|
||||
cosf_neon [-3.14, 3.14] 500000 1.00e+00 1.00e+02% 7.06e-01 643211256 x1.00
|
||||
tanf [-0.79, 0.79] 500000 0.00e+00 0.00e+00% 0.00e+00 642444112 x1.00
|
||||
tanf_c [-0.79, 0.79] 500000 2.98e-06 7.94e-04% 1.31e-06 641628507 x1.00
|
||||
tanf_neon [-0.79, 0.79] 500000 1.00e+00 1.00e+02% nan 640740514 x1.00
|
||||
asinf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 639560380 x1.00
|
||||
asinf_c [-1.00, 1.00] 500000 5.54e-05 1.06e-02% nan 638453383 x1.00
|
||||
asinf_neon [-1.00, 1.00] 500000 1.57e+00 1.00e+02% 6.84e-01 637349653 x1.00
|
||||
acosf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 636078992 x1.00
|
||||
acosf_c [-1.00, 1.00] 500000 5.56e-05 6.46e-03% nan 634934201 x1.00
|
||||
acosf_neon [-1.00, 1.00] 500000 1.57e+00 1.02e+05% 6.84e-01 633793585 x1.00
|
||||
atanf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 632835241 x1.00
|
||||
atanf_c [-1.00, 1.00] 500000 1.67e-04 2.12e-02% 7.40e-05 632142823 x1.00
|
||||
atanf_neon [-1.00, 1.00] 500000 7.85e-01 0.00e+00% nan 631387330 x1.00
|
||||
sinhf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 630142014 x1.00
|
||||
sinhf_c [-3.14, 3.14] 500000 1.91e-06 1.52e-01% 1.85e-07 628992714 x1.00
|
||||
sinhf_neon [-3.14, 3.14] 500000 1.15e+01 1.00e+02% 4.55e+00 627998454 x1.00
|
||||
coshf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 626869866 x1.00
|
||||
coshf_c [-3.14, 3.14] 500000 9.54e-07 2.38e-05% 1.64e-07 625829657 x1.00
|
||||
coshf_neon [-3.14, 3.14] 500000 1.06e+01 9.14e+01% 3.92e+00 624873969 x1.00
|
||||
tanhf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 623689093 x1.00
|
||||
tanhf_c [-3.14, 3.14] 500000 1.20e-05 2.48e-01% 5.48e-06 622547097 x1.00
|
||||
tanhf_neon [-3.14, 3.14] 500000 9.96e-01 1.00e+02% 8.26e-01 621506812 x1.00
|
||||
expf [0.00, 10.00] 500000 0.00e+00 0.00e+00% 0.00e+00 620497304 x1.00
|
||||
expf_c [0.00, 10.00] 500000 9.77e-03 6.15e-05% 1.64e-03 619569554 x1.00
|
||||
expf_neon [0.00, 10.00] 500000 2.20e+04 1.00e+02% 4.92e+03 618761400 x1.00
|
||||
logf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 617882765 x1.00
|
||||
logf_c [1.00, 1000.00] 500000 6.20e-06 1.62e-02% 9.83e-07 617087810 x1.00
|
||||
logf_neon [1.00, 1000.00] 500000 9.49e+01 inf% 9.39e+01 616388420 x1.00
|
||||
log10f [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 615405364 x1.00
|
||||
log10f_c [1.00, 1000.00] 500000 2.86e-06 6.68e-03% 4.79e-07 614442585 x1.00
|
||||
log10f_neon [1.00, 1000.00] 500000 4.12e+01 inf% 4.07e+01 613671782 x1.00
|
||||
floorf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 611113689 x1.00
|
||||
floorf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 608159325 x1.00
|
||||
floorf_neon [1.00, 1000.00] 5000000 2.00e+00 2.00e+02% 1.42e-02 604769008 x1.01
|
||||
ceilf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 601342443 x1.00
|
||||
ceilf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 598387998 x1.00
|
||||
ceilf_neon [1.00, 1000.00] 5000000 2.00e+00 1.00e+02% 1.02e+00 594959710 x1.01
|
||||
fabsf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 592068236 x1.00
|
||||
fabsf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 589808748 x1.00
|
||||
fabsf_neon [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 587712180 x1.01
|
||||
sqrtf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 586496654 x1.00
|
||||
sqrtf_c [1.00, 1000.00] 500000 2.33e-04 1.06e-03% 8.69e-05 585470866 x1.00
|
||||
sqrtf_neon [1.00, 1000.00] 500000 0.00e+00 0.00e+00% nan 584594551 x1.00
|
||||
invsqrtf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 583492213 x1.00
|
||||
invsqrtf_c [1.00, 1000.00] 500000 4.35e-06 4.78e-04% 2.00e-07 582448164 x1.00
|
||||
invsqrtf_neon [1.00, 1000.00] 500000 0.00e+00 0.00e+00% nan 581642365 x1.00
|
||||
atan2f [0.10, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 83594269 x1.00
|
||||
atan2f_c [0.10, 10.00] 10000 1.73e-04 2.23e-02% 0.00e+00 85383651 x0.98
|
||||
atan2f_neon [0.10, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 87387055 x0.96
|
||||
powf [1.00, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 93430489 x1.00
|
||||
powf_c [1.00, 10.00] 10000 1.08e+05 4.37e-03% 0.00e+00 96726976 x0.97
|
||||
powf_neon [1.00, 10.00] 10000 9.97e+09 1.00e+02% 0.00e+00 100185753 x0.93
|
||||
fmodf [1.00, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 101653673 x1.00
|
||||
fmodf_c [1.00, 10.00] 10000 9.90e+00 8.06e-02% 0.00e+00 103177551 x0.99
|
||||
fmodf_neon [1.00, 10.00] 10000 9.99e+00 1.00e+02% 0.00e+00 104771240 x0.97
|
||||
|
||||
sinf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1394459996 x1.00
|
||||
sinf_c [-3.14, 3.14] 500000 7.75e-07 1.00e+02% 4.09e-07 1395128226 x1.00
|
||||
sinf_neon [-3.14, 3.14] 500000 8.34e-07 1.00e+02% 4.09e-07 1395853554 x1.00
|
||||
cosf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1396644271 x1.00
|
||||
cosf_c [-3.14, 3.14] 500000 7.75e-07 6.74e-01% 4.15e-07 1397360321 x1.00
|
||||
cosf_neon [-3.14, 3.14] 500000 8.34e-07 6.74e-01% 4.16e-07 1398126872 x1.00
|
||||
tanf [-0.79, 0.79] 500000 0.00e+00 0.00e+00% 0.00e+00 1398889596 x1.00
|
||||
tanf_c [-0.79, 0.79] 500000 2.98e-06 7.94e-04% 1.31e-06 1399704712 x1.00
|
||||
tanf_neon [-0.79, 0.79] 500000 1.91e-06 3.62e-04% 6.66e-07 1400612899 x1.00
|
||||
asinf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1401838993 x1.00
|
||||
asinf_c [-1.00, 1.00] 500000 5.54e-05 1.06e-02% nan 1402745512 x1.00
|
||||
asinf_neon [-1.00, 1.00] 500000 4.66e-05 8.90e-03% nan 1403967661 x1.00
|
||||
acosf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1405317842 x1.00
|
||||
acosf_c [-1.00, 1.00] 500000 5.56e-05 6.46e-03% nan 1406294753 x1.00
|
||||
acosf_neon [-1.00, 1.00] 500000 4.67e-05 6.35e-03% nan 1407598039 x1.00
|
||||
atanf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1408314869 x1.00
|
||||
atanf_c [-1.00, 1.00] 500000 1.67e-04 2.12e-02% 7.40e-05 1408872421 x1.00
|
||||
atanf_neon [-1.00, 1.00] 500000 1.67e-04 2.12e-02% 7.40e-05 1409736652 x1.00
|
||||
sinhf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1411101066 x1.00
|
||||
sinhf_c [-3.14, 3.14] 500000 1.91e-06 1.52e-01% 1.85e-07 1412173492 x1.00
|
||||
sinhf_neon [-3.14, 3.14] 500000 1.91e-06 1.52e-01% 1.90e-07 1413205410 x1.00
|
||||
coshf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1414417802 x1.00
|
||||
coshf_c [-3.14, 3.14] 500000 9.54e-07 2.38e-05% 1.64e-07 1415426083 x1.00
|
||||
coshf_neon [-3.14, 3.14] 500000 1.91e-06 2.22e-05% 1.68e-07 1416412636 x1.00
|
||||
tanhf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1417684273 x1.00
|
||||
tanhf_c [-3.14, 3.14] 500000 1.20e-05 2.48e-01% 5.48e-06 1418659628 x1.00
|
||||
tanhf_neon [-3.14, 3.14] 500000 2.38e-07 2.47e-01% 5.40e-08 1419650721 x1.00
|
||||
expf [0.00, 10.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1420706074 x1.00
|
||||
expf_c [0.00, 10.00] 500000 9.77e-03 6.15e-05% 1.64e-03 1421444150 x1.00
|
||||
expf_neon [0.00, 10.00] 500000 9.77e-03 6.58e-05% 1.64e-03 1422203499 x1.00
|
||||
logf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1423106698 x1.00
|
||||
logf_c [1.00, 1000.00] 500000 6.20e-06 1.62e-02% 9.83e-07 1423735174 x1.00
|
||||
logf_neon [1.00, 1000.00] 500000 7.63e-06 1.03e-02% 1.07e-06 1424434406 x1.00
|
||||
log10f [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1425516892 x1.00
|
||||
log10f_c [1.00, 1000.00] 500000 2.86e-06 6.68e-03% 4.79e-07 1426200368 x1.00
|
||||
log10f_neon [1.00, 1000.00] 500000 3.34e-06 6.68e-03% 4.84e-07 1426966844 x1.00
|
||||
floorf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1429081993 x1.00
|
||||
floorf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1430839273 x1.00
|
||||
floorf_neon [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1433474766 x1.00
|
||||
ceilf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1435602956 x1.00
|
||||
ceilf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1437403711 x1.00
|
||||
ceilf_neon [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1440044970 x1.00
|
||||
fabsf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1441265630 x1.00
|
||||
fabsf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1442491716 x1.00
|
||||
fabsf_neon [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1443680744 x1.00
|
||||
sqrtf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1444844144 x1.00
|
||||
sqrtf_c [1.00, 1000.00] 500000 2.33e-04 1.06e-03% 8.69e-05 1445710342 x1.00
|
||||
sqrtf_neon [1.00, 1000.00] 500000 7.63e-06 2.91e-05% 1.60e-06 1446544637 x1.00
|
||||
invsqrtf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1446995307 x1.00
|
||||
invsqrtf_c [1.00, 1000.00] 500000 4.35e-06 4.78e-04% 2.00e-07 1447471977 x1.00
|
||||
invsqrtf_neon [1.00, 1000.00] 500000 1.19e-07 2.12e-05% 4.81e-09 1447987675 x1.00
|
||||
atan2f [0.10, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 1449713108 x1.00
|
||||
atan2f_c [0.10, 10.00] 10000 1.73e-04 2.23e-02% 0.00e+00 1451276575 x1.00
|
||||
atan2f_neon [0.10, 10.00] 10000 1.67e-04 2.12e-02% 0.00e+00 1453093260 x1.00
|
||||
powf [1.00, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 1458606663 x1.00
|
||||
powf_c [1.00, 10.00] 10000 1.08e+05 4.37e-03% 0.00e+00 1461584933 x1.00
|
||||
powf_neon [1.00, 10.00] 10000 1.36e+05 5.88e-03% 0.00e+00 1464702743 x1.00
|
||||
fmodf [1.00, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 1466022029 x1.00
|
||||
fmodf_c [1.00, 10.00] 10000 9.90e+00 8.06e-02% 0.00e+00 1467403015 x1.00
|
||||
fmodf_neon [1.00, 10.00] 10000 9.97e+00 8.06e-02% 0.00e+00 1468767755 x1.00
|
||||
|
689
deps/math-neon/math_debug.c
vendored
Normal file
689
deps/math-neon/math_debug.c
vendored
Normal file
@ -0,0 +1,689 @@
|
||||
/*
|
||||
Math-NEON: Neon Optimised Math Library based on cmath
|
||||
Contact: lachlan.ts@gmail.com
|
||||
Copyright (C) 2009 Lachlan Tychsen - Smith aka Adventus
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 3 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
*/
|
||||
|
||||
|
||||
#include <math_neon.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#ifdef WIN32
|
||||
#include <time.h>
|
||||
#else
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#define randf() (rand() / (RAND_MAX + 1.0f))
|
||||
|
||||
/*
 * Appends a printf-style formatted message to the benchmark log file.
 *
 * format: printf-style format string; the variadic arguments must match it.
 *
 * The message is formatted straight into the file stream, so there is no
 * fixed-size intermediate buffer that a long message could overflow.
 * Logging is best-effort: if the log file cannot be opened the message is
 * silently dropped.
 */
void LOG(const char *format, ...) {
	va_list arg;
	FILE *log = fopen("ux0:/data/mathneon.log", "a+");

	if (log == NULL)
		return;

	va_start(arg, format);
	vfprintf(log, format, arg);
	va_end(arg);

	fclose(log);
}
|
||||
|
||||
/* Description and results of one accuracy/speed test of a float -> float function. */
struct test1_s {
	const char *name;        /* label of the function under test */
	float (*func)(float);    /* the function under test */
	float (*bench)(float);   /* the reference function it is compared against */
	float rng0;              /* first input value of the sweep */
	float rng1;              /* exclusive upper bound of the sweep */
	int num;                 /* divisor used to derive the sweep step */
	float emaxabs;           /* largest absolute error found */
	float xmaxabs;           /* input at which emaxabs was found */
	float emaxrel;           /* largest relative error found, in percent */
	float xmaxrel;           /* input at which emaxrel was found */
	float erms;              /* root-mean-square error */
	int time;                /* time to execute num functions */
};
|
||||
|
||||
/* Description and results of one accuracy/speed test of a (float, float) -> float function. */
struct test2_s {
	const char *name;               /* label of the function under test */
	float (*func)(float, float);    /* the function under test */
	float (*bench)(float, float);   /* the reference function it is compared against */
	float rng0;                     /* first input value of the sweep (both arguments) */
	float rng1;                     /* exclusive upper bound of the sweep (both arguments) */
	int num;                        /* divisor used to derive the sweep step */
	float emaxabs;                  /* largest absolute error found */
	float xmaxabs;                  /* first argument at which emaxabs was found */
	float emaxrel;                  /* largest relative error found, in percent */
	float xmaxrel;                  /* first argument at which emaxrel was found */
	float erms;                     /* root-mean-square error */
	int time;                       /* time to execute num functions */
};
|
||||
|
||||
|
||||
float invsqrtf(float x){
|
||||
return (1.0f / sqrtf(x));
|
||||
}
|
||||
|
||||
typedef struct test1_s test1_t;
|
||||
typedef struct test2_s test2_t;
|
||||
|
||||
test1_t test1[51] =
|
||||
{
|
||||
{"sinf ", sinf, sinf, -M_PI, M_PI, 500000},
|
||||
{"sinf_c ", sinf_c, sinf, -M_PI, M_PI, 500000},
|
||||
{"sinf_neon ", sinf_neon, sinf, -M_PI, M_PI, 500000},
|
||||
|
||||
{"cosf ", cosf, cosf, -M_PI, M_PI, 500000},
|
||||
{"cosf_c ", cosf_c, cosf, -M_PI, M_PI, 500000},
|
||||
{"cosf_neon ", cosf_neon, cosf, -M_PI, M_PI, 500000},
|
||||
|
||||
{"tanf ", tanf, tanf, -M_PI_4, M_PI_4, 500000, 0, 0, 0},
|
||||
{"tanf_c ", tanf_c, tanf, -M_PI_4, M_PI_4, 500000, 0, 0, 0},
|
||||
{"tanf_neon ", tanf_neon, tanf, -M_PI_4, M_PI_4, 500000, 0, 0, 0},
|
||||
|
||||
{"asinf ", asinf, asinf, -1, 1, 500000, 0, 0, 0},
|
||||
{"asinf_c ", asinf_c, asinf, -1, 1, 500000, 0, 0, 0},
|
||||
{"asinf_neon ", asinf_neon, asinf, -1, 1, 500000, 0, 0, 0},
|
||||
|
||||
{"acosf ", acosf, acosf, -1, 1, 500000, 0, 0, 0},
|
||||
{"acosf_c ", acosf_c, acosf, -1, 1, 500000, 0, 0, 0},
|
||||
{"acosf_neon ", acosf_neon, acosf, -1, 1, 500000, 0, 0, 0},
|
||||
|
||||
{"atanf ", atanf, atanf, -1, 1, 500000, 0, 0, 0},
|
||||
{"atanf_c ", atanf_c, atanf, -1, 1, 500000, 0, 0, 0},
|
||||
{"atanf_neon ", atanf_neon, atanf, -1, 1, 500000, 0, 0, 0},
|
||||
|
||||
{"sinhf ", sinhf, sinhf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
{"sinhf_c ", sinhf_c, sinhf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
{"sinhf_neon ", sinhf_neon, sinhf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
|
||||
{"coshf ", coshf, coshf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
{"coshf_c ", coshf_c, coshf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
{"coshf_neon ", coshf_neon, coshf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
|
||||
{"tanhf ", tanhf, tanhf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
{"tanhf_c ", tanhf_c, tanhf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
{"tanhf_neon ", tanhf_neon, tanhf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
|
||||
{"expf ", expf, expf, 0, 10, 500000, 0, 0, 0},
|
||||
{"expf_c ", expf_c, expf, 0, 10, 500000, 0, 0, 0},
|
||||
{"expf_neon ", expf_neon, expf, 0, 10, 500000, 0, 0, 0},
|
||||
|
||||
{"logf ", logf, logf, 1, 1000, 500000, 0, 0, 0},
|
||||
{"logf_c ", logf_c, logf, 1, 1000, 500000, 0, 0, 0},
|
||||
{"logf_neon ", logf_neon, logf, 1, 1000, 500000, 0, 0, 0},
|
||||
|
||||
{"log10f ", log10f, log10f, 1, 1000, 500000, 0, 0, 0},
|
||||
{"log10f_c ", log10f_c, log10f, 1, 1000, 500000, 0, 0, 0},
|
||||
{"log10f_neon ", log10f_neon,log10f, 1, 1000, 500000, 0, 0, 0},
|
||||
|
||||
{"floorf ", floorf, floorf, 1, 1000, 5000000, 0, 0, 0},
|
||||
{"floorf_c ", floorf_c, floorf, 1, 1000, 5000000, 0, 0, 0},
|
||||
{"floorf_neon", floorf_neon,floorf, 1, 1000, 5000000, 0, 0, 0},
|
||||
|
||||
{"ceilf ", ceilf, ceilf, 1, 1000, 5000000, 0, 0, 0},
|
||||
{"ceilf_c ", ceilf_c, ceilf, 1, 1000, 5000000, 0, 0, 0},
|
||||
{"ceilf_neon", ceilf_neon, ceilf, 1, 1000, 5000000, 0, 0, 0},
|
||||
|
||||
{"fabsf ", fabsf, fabsf, 1, 1000, 5000000, 0, 0, 0},
|
||||
{"fabsf_c ", fabsf_c, fabsf, 1, 1000, 5000000, 0, 0, 0},
|
||||
{"fabsf_neon", fabsf_neon, fabsf, 1, 1000, 5000000, 0, 0, 0},
|
||||
|
||||
{"sqrtf ", sqrtf, sqrtf, 1, 1000, 500000, 0, 0, 0},
|
||||
{"sqrtf_c ", sqrtf_c, sqrtf, 1, 1000, 500000, 0, 0, 0},
|
||||
{"sqrtf_neon ", sqrtf_neon, sqrtf, 1, 1000, 500000, 0, 0, 0},
|
||||
|
||||
{"invsqrtf ", invsqrtf, invsqrtf, 1, 1000, 500000, 0, 0, 0},
|
||||
{"invsqrtf_c ", invsqrtf_c, invsqrtf, 1, 1000, 500000, 0, 0, 0},
|
||||
{"invsqrtf_neon ", invsqrtf_neon, invsqrtf, 1, 1000, 500000, 0, 0, 0},
|
||||
};
|
||||
|
||||
test2_t test2[9] =
|
||||
{
|
||||
{"atan2f ", atan2f, atan2f, 0.1, 10, 10000, 0, 0, 0},
|
||||
{"atan2f_c ", atan2f_c, atan2f, 0.1, 10, 10000, 0, 0, 0},
|
||||
{"atan2f_neon ", atan2f_neon,atan2f, 0.1, 10, 10000, 0, 0, 0},
|
||||
|
||||
{"powf ", powf, powf, 1, 10, 10000, 0, 0, 0},
|
||||
{"powf_c ", powf_c, powf, 1, 10, 10000, 0, 0, 0},
|
||||
{"powf_neon ", powf_neon, powf, 1, 10, 10000, 0, 0, 0},
|
||||
|
||||
{"fmodf ", fmodf, fmodf, 1, 10, 10000, 0, 0, 0},
|
||||
{"fmodf_c ", fmodf_c, fmodf, 1, 10, 10000, 0, 0, 0},
|
||||
{"fmodf_neon ", fmodf_neon, fmodf, 1, 10, 10000, 0, 0, 0},
|
||||
|
||||
};
|
||||
|
||||
|
||||
void
|
||||
test_mathfunc1(test1_t *tst)
|
||||
{
|
||||
|
||||
float x;
|
||||
float dx = (tst->rng1 - tst->rng0) / ((float)tst->num);
|
||||
#ifndef WIN32
|
||||
struct rusage ru;
|
||||
#endif
|
||||
|
||||
tst->emaxabs = tst->xmaxabs = 0;
|
||||
tst->emaxrel = tst->xmaxrel = 0;
|
||||
tst->erms = 0;
|
||||
for(x = tst->rng0; x < tst->rng1 ; x += dx){
|
||||
float r = (tst->func)((float)x);
|
||||
float rr = (tst->bench)((float)x);
|
||||
float dr = fabs(r - rr);
|
||||
float drr = dr * (100.0f / rr);
|
||||
tst->erms += dr*dr;
|
||||
if (dr > tst->emaxabs){
|
||||
tst->emaxabs = dr;
|
||||
tst->xmaxabs = x;
|
||||
}
|
||||
if (drr > tst->emaxrel){
|
||||
tst->emaxrel = drr;
|
||||
tst->xmaxrel = x;
|
||||
}
|
||||
}
|
||||
tst->erms = sqrt(tst->erms / ((float) tst->num));
|
||||
|
||||
#ifdef WIN32
|
||||
tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000);
|
||||
#else
|
||||
tst->time = sceKernelGetSystemTimeWide();
|
||||
#endif
|
||||
|
||||
for(x = tst->rng0; x < tst->rng1 ; x += dx){
|
||||
(tst->func)((float)x);
|
||||
}
|
||||
|
||||
#ifdef WIN32
|
||||
tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) - tst->time;
|
||||
#else
|
||||
tst->time = sceKernelGetSystemTimeWide();
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
test_mathfunc2(test2_t *tst)
|
||||
{
|
||||
float x, y;
|
||||
float rng = tst->rng1 - tst->rng0;
|
||||
float d = (rng * rng) / ((float) tst->num);
|
||||
#ifndef WIN32
|
||||
struct rusage ru;
|
||||
#endif
|
||||
|
||||
tst->emaxabs = tst->xmaxabs = 0;
|
||||
tst->emaxrel = tst->xmaxrel = 0;
|
||||
for(y = (tst->rng0); y < (tst->rng1) ; y += d){
|
||||
for(x = (tst->rng0); x < (tst->rng1); x += d){
|
||||
float r = (tst->func)((float)x, y);
|
||||
float rr = (tst->bench)((float)x, y);
|
||||
float dr = fabs(r - rr);
|
||||
float drr = dr * (100.0f / rr);
|
||||
if (dr > tst->emaxabs){
|
||||
tst->emaxabs = dr;
|
||||
tst->xmaxabs = x;
|
||||
}
|
||||
if (drr > tst->emaxrel && fabsf(rr) > 0.0001){
|
||||
tst->emaxrel = drr;
|
||||
tst->xmaxrel = x;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef WIN32
|
||||
tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) ;
|
||||
#else
|
||||
tst->time = sceKernelGetSystemTimeWide();
|
||||
#endif
|
||||
|
||||
for(y = tst->rng0; y < tst->rng1 ; y += d){
|
||||
for(x = tst->rng0; x < tst->rng1 ; x += d){
|
||||
(tst->func)((float)x, (float)y);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef WIN32
|
||||
tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) - tst->time;
|
||||
#else
|
||||
tst->time = sceKernelGetSystemTimeWide();
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
/*
 * Writes the "<name>: c=... neon=... rate=..." summary line for one
 * benchmark. t[0], t[1] and t[2] are the timestamps taken before the C loop,
 * between the two loops and after the NEON loop.
 */
static void log_vec_timing(const char *name, const long long t[3])
{
	int c_time    = (int)(t[1] - t[0]);
	int neon_time = (int)(t[2] - t[1]);

	LOG("%s: c=%i \t neon=%i \t rate=%.2f \n", name, c_time, neon_time,
		(float)c_time / (float)neon_time);
}

/*
 * Runs c_stmt and then neon_stmt `count` times each and records the three
 * surrounding timestamps in t[0..2]. The timestamps are kept at full width
 * and only narrowed once the differences have been taken.
 */
#define VEC_TIME_PAIR(t, count, c_stmt, neon_stmt)                     \
	do {                                                               \
		(t)[0] = sceKernelGetSystemTimeWide();                         \
		for (int it_ = 0; it_ < (count); it_++) { c_stmt; }            \
		(t)[1] = sceKernelGetSystemTimeWide();                         \
		for (int it_ = 0; it_ < (count); it_++) { neon_stmt; }         \
		(t)[2] = sceKernelGetSystemTimeWide();                         \
	} while (0)

/*
 * Benchmarks the 2-, 3- and 4-component vector routines (dot, normalize,
 * cross) in their C and NEON variants on random input and writes, for each
 * routine, both results and the timing comparison to the log.
 */
void test_vectorfunc()
{
	float v0[4], v1[4], d[4];
	const int testnum = 5000000;
	long long t[3];
	float r;

	/* Random components in [-5, 5). */
	for (int i = 0; i < 4; i++) {
		v0[i] = 10*randf() - 5;
		v1[i] = 10*randf() - 5;
		d[i]  = 10*randf() - 5;
	}

	LOG("\n");

	/* dot 2 */
	VEC_TIME_PAIR(t, testnum, r = dot2_c(v0, v1), r = dot2_neon(v0, v1));
	r = dot2_c(v0, v1);
	LOG("dot2_c = %f\n", r);
	r = dot2_neon(v0, v1);
	LOG("dot2_neon = %f\n", r);
	log_vec_timing("dot2", t);

	/* normalize 2 */
	VEC_TIME_PAIR(t, testnum, normalize2_c(v0, d), normalize2_neon(v0, d));
	normalize2_c(v0, d);
	LOG("normalize2_c = [%.2f, %.2f]\n", d[0], d[1]);
	normalize2_neon(v0, d);
	LOG("normalize2_neon = [%.2f, %.2f]\n", d[0], d[1]);
	log_vec_timing("normalize2", t);
	LOG("\n");

	/* dot 3 */
	VEC_TIME_PAIR(t, testnum, r = dot3_c(v0, v1), r = dot3_neon(v0, v1));
	r = dot3_c(v0, v1);
	LOG("dot3_c = %f\n", r);
	r = dot3_neon(v0, v1);
	LOG("dot3_neon = %f\n", r);
	log_vec_timing("dot3", t);

	/* normalize 3 */
	VEC_TIME_PAIR(t, testnum, normalize3_c(v0, d), normalize3_neon(v0, d));
	normalize3_c(v0, d);
	LOG("normalize3_c = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);
	normalize3_neon(v0, d);
	LOG("normalize3_neon = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);
	log_vec_timing("normalize3", t);

	/* cross 3 */
	VEC_TIME_PAIR(t, testnum, cross3_c(v0, v1, d), cross3_neon(v0, v1, d));
	cross3_c(v0, v1, d);
	LOG("cross3_c = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);
	cross3_neon(v0, v1, d);
	LOG("cross3_neon = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);
	log_vec_timing("cross3", t);
	LOG("\n");

	/* dot 4 */
	VEC_TIME_PAIR(t, testnum, r = dot4_c(v0, v1), r = dot4_neon(v0, v1));
	r = dot4_c(v0, v1);
	LOG("dot4_c = %f\n", r);
	r = dot4_neon(v0, v1);
	LOG("dot4_neon = %f\n", r);
	log_vec_timing("dot4", t);

	/* normalize 4 */
	VEC_TIME_PAIR(t, testnum, normalize4_c(v0, d), normalize4_neon(v0, d));
	normalize4_c(v0, d);
	LOG("normalize4_c = [%.2f, %.2f, %.2f, %.2f]\n", d[0], d[1], d[2], d[3]);
	normalize4_neon(v0, d);
	LOG("normalize4_neon = [%.2f, %.2f, %.2f, %.2f]\n", d[0], d[1], d[2], d[3]);
	log_vec_timing("normalize4", t);
	LOG("\n");
}

#undef VEC_TIME_PAIR
|
||||
|
||||
|
||||
|
||||
void test_matrixfunc()
|
||||
{
|
||||
float m0[16], m1[16], m2[16];
|
||||
int m2t[3], m3t[3], m4t[3];
|
||||
|
||||
int i;
|
||||
int testnum = 1000000;
|
||||
struct rusage ru;
|
||||
|
||||
for(int i=0;i<16;i++)
|
||||
{
|
||||
m0[i] = 10.0f * randf() - 5.0f;
|
||||
m1[i] = 10.0f * randf() - 5.0f;
|
||||
m2[i] = 10.0f * randf() - 5.0f;
|
||||
}
|
||||
|
||||
|
||||
//matmul2
|
||||
m2t[0] = sceKernelGetSystemTimeWide();
|
||||
for(i = 0; i < testnum; i++){
|
||||
matmul2_c(m0, m1, m2);
|
||||
}
|
||||
m2t[1] = sceKernelGetSystemTimeWide();
|
||||
for(i = 0; i < testnum; i++){
|
||||
matmul2_neon(m0, m1, m2);
|
||||
}
|
||||
m2t[2] = sceKernelGetSystemTimeWide();
|
||||
|
||||
matmul2_c(m0, m1, m2);
|
||||
LOG("matmul2_c = \n");
|
||||
LOG("\t\t\t|%.2f, %.2f|\n", m2[0], m2[2]);
|
||||
LOG("\t\t\t|%.2f, %.2f|\n", m2[1], m2[3]);
|
||||
|
||||
matmul2_neon(m0, m1, m2);
|
||||
LOG("matmul2_neon = \n");
|
||||
LOG("\t\t\t|%.2f, %.2f|\n", m2[0], m2[2]);
|
||||
LOG("\t\t\t|%.2f, %.2f|\n", m2[1], m2[3]);
|
||||
|
||||
LOG("matmul2: c=%i \t neon=%i \t rate=%.2f \n", m2t[1] - m2t[0], m2t[2] - m2t[1],
|
||||
(float)(m2t[1] - m2t[0]) / (float)(m2t[2] - m2t[1]));
|
||||
|
||||
|
||||
//matvec2
|
||||
m2t[0] = sceKernelGetSystemTimeWide();
|
||||
for(i = 0; i < testnum; i++){
|
||||
matvec2_c(m0, m1, m2);
|
||||
}
|
||||
m2t[1] = sceKernelGetSystemTimeWide();
|
||||
for(i = 0; i < testnum; i++){
|
||||
matvec2_neon(m0, m1, m2);
|
||||
}
|
||||
m2t[2] = sceKernelGetSystemTimeWide();
|
||||
|
||||
memset(m2, 0, 4*sizeof(float));
|
||||
matvec2_c(m0, m1, m2);
|
||||
LOG("matvec2_c = |%.2f, %.2f|\n", m2[0], m2[1]);
|
||||
|
||||
memset(m2, 0, 4*sizeof(float));
|
||||
matvec2_neon(m0, m1, m2);
|
||||
LOG("matvec2_neon = |%.2f, %.2f|\n", m2[0], m2[1]);
|
||||
|
||||
LOG("matvec2: c=%i \t neon=%i \t rate=%.2f \n", m2t[1] - m2t[0], m2t[2] - m2t[1],
|
||||
(float)(m2t[1] - m2t[0]) / (float)(m2t[2] - m2t[1]));
|
||||
|
||||
//MAT3
|
||||
m3t[0] = sceKernelGetSystemTimeWide();
|
||||
for(i = 0; i < testnum; i++){
|
||||
matmul3_c(m0, m1, m2);
|
||||
}
|
||||
m3t[1] = sceKernelGetSystemTimeWide();
|
||||
for(i = 0; i < testnum; i++){
|
||||
matmul3_neon(m0, m1, m2);
|
||||
}
|
||||
m3t[2] = sceKernelGetSystemTimeWide();
|
||||
|
||||
memset(m2, 0, 9*sizeof(float));
|
||||
matmul3_c(m0, m1, m2);
|
||||
LOG("matmul3_c =\n");
|
||||
LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[0], m2[3], m2[6]);
|
||||
LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[1], m2[4], m2[7]);
|
||||
LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[2], m2[5], m2[8]);
|
||||
|
||||
memset(m2, 0, 9*sizeof(float));
|
||||
matmul3_neon(m0, m1, m2);
|
||||
LOG("matmul3_neon =\n");
|
||||
LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[0], m2[3], m2[6]);
|
||||
LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[1], m2[4], m2[7]);
|
||||
LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[2], m2[5], m2[8]);
|
||||
|
||||
LOG("matmul3: c=%i \t neon=%i \t rate=%.2f \n", m3t[1] - m3t[0], m3t[2] - m3t[1],
|
||||
(float)(m3t[1] - m3t[0]) / (float)(m3t[2] - m3t[1]));
|
||||
|
||||
//matvec3
|
||||
m3t[0] = sceKernelGetSystemTimeWide();
|
||||
for(i = 0; i < testnum; i++){
|
||||
matvec3_c(m0, m1, m2);
|
||||
}
|
||||
m3t[1] = sceKernelGetSystemTimeWide();
|
||||
for(i = 0; i < testnum; i++){
|
||||
matvec3_neon(m0, m1, m2);
|
||||
}
|
||||
m3t[2] = sceKernelGetSystemTimeWide();
|
||||
|
||||
memset(m2, 0, 4*sizeof(float));
|
||||
matvec3_c(m0, m1, m2);
|
||||
LOG("matvec3_c = |%.2f, %.2f, %.2f|\n", m2[0], m2[1], m2[2]);
|
||||
|
||||
memset(m2, 0, 4*sizeof(float));
|
||||
matvec3_neon(m0, m1, m2);
|
||||
LOG("matvec3_neon = |%.2f, %.2f, %.2f|\n", m2[0], m2[1], m2[2]);
|
||||
|
||||
LOG("matvec3: c=%i \t neon=%i \t rate=%.2f \n", m3t[1] - m3t[0], m3t[2] - m3t[1],
|
||||
(float)(m3t[1] - m3t[0]) / (float)(m3t[2] - m3t[1]));
|
||||
|
||||
//MAT4
|
||||
m4t[0] = sceKernelGetSystemTimeWide();
|
||||
for(i = 0; i < testnum; i++){
|
||||
matmul4_c(m0, m1, m2);
|
||||
}
|
||||
m4t[1] = sceKernelGetSystemTimeWide();
|
||||
for(i = 0; i < testnum; i++){
|
||||
matmul4_neon(m0, m1, m2);
|
||||
}
|
||||
m4t[2] = sceKernelGetSystemTimeWide();
|
||||
|
||||
memset(m2, 0, 16*sizeof(float));
|
||||
matmul4_c(m0, m1, m2);
|
||||
LOG("matmul4_c =\n");
|
||||
LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[0], m2[4], m2[8], m2[12]);
|
||||
LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[1], m2[5], m2[9], m2[13]);
|
||||
LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[2], m2[6], m2[10], m2[14]);
|
||||
LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[3], m2[7], m2[11], m2[15]);
|
||||
|
||||
memset(m2, 0, 16*sizeof(float));
|
||||
matmul4_neon(m0, m1, m2);
|
||||
LOG("matmul4_neon =\n");
|
||||
LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[0], m2[4], m2[8], m2[12]);
|
||||
LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[1], m2[5], m2[9], m2[13]);
|
||||
LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[2], m2[6], m2[10], m2[14]);
|
||||
LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[3], m2[7], m2[11], m2[15]);
|
||||
|
||||
LOG("matmul4: c=%i \t neon=%i \t rate=%.2f \n", m4t[1] - m4t[0], m4t[2] - m4t[1],
|
||||
(float)(m4t[1] - m4t[0]) / (float)(m4t[2] - m4t[1]));
|
||||
|
||||
//matvec4
|
||||
m4t[0] = sceKernelGetSystemTimeWide();
|
||||
for(i = 0; i < testnum; i++){
|
||||
matvec4_c(m0, m1, m2);
|
||||
}
|
||||
m4t[1] = sceKernelGetSystemTimeWide();
|
||||
for(i = 0; i < testnum; i++){
|
||||
matvec4_neon(m0, m1, m2);
|
||||
}
|
||||
m4t[2] = sceKernelGetSystemTimeWide();
|
||||
|
||||
memset(m2, 0, 4*sizeof(float));
|
||||
matvec4_c(m0, m1, m2);
|
||||
LOG("matvec4_c = |%.2f, %.2f, %.2f, %f|\n", m2[0], m2[1], m2[2], m2[3]);
|
||||
|
||||
memset(m2, 0, 4*sizeof(float));
|
||||
matvec4_neon(m0, m1, m2);
|
||||
LOG("matvec4_neon = |%.2f, %.2f, %.2f, %f|\n", m2[0], m2[1], m2[2], m2[3]);
|
||||
|
||||
LOG("matvec4: c=%i \t neon=%i \t rate=%.2f \n", m4t[1] - m4t[0], m4t[2] - m4t[1],
|
||||
(float)(m4t[1] - m4t[0]) / (float)(m4t[2] - m4t[1]));
|
||||
|
||||
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
|
||||
int i, ii;
|
||||
#if 1
|
||||
LOG("RUNFAST: Disabled \n");
|
||||
#else
|
||||
LOG("RUNFAST: Enabled \n");
|
||||
enable_runfast();
|
||||
#endif
|
||||
srand(time(NULL));
|
||||
|
||||
#if 1
|
||||
//test single argument functions:
|
||||
LOG("------------------------------------------------------------------------------------------------------\n");
|
||||
LOG("MATRIX FUNCTION TESTS \n");
|
||||
LOG("------------------------------------------------------------------------------------------------------\n");
|
||||
|
||||
test_matrixfunc();
|
||||
test_vectorfunc();
|
||||
|
||||
LOG("------------------------------------------------------------------------------------------------------\n");
|
||||
LOG("CMATH FUNCTION TESTS \n");
|
||||
LOG("------------------------------------------------------------------------------------------------------\n");
|
||||
LOG("Function\tRange\t\tNumber\tABS Max Error\tREL Max Error\tRMS Error\tTime\tRate\n");
|
||||
LOG("------------------------------------------------------------------------------------------------------\n");
|
||||
for(i = 0; i < 51; i++){
|
||||
test_mathfunc1(&test1[i]);
|
||||
|
||||
ii = i - (i % 3);
|
||||
LOG("%s\t", test1[i].name);
|
||||
LOG("[%.2f, %.2f]\t", test1[i].rng0, test1[i].rng1);
|
||||
LOG("%i\t", test1[i].num);
|
||||
LOG("%.2e\t", test1[i].emaxabs);
|
||||
LOG("%.2e%%\t", test1[i].emaxrel);
|
||||
LOG("%.2e\t", test1[i].erms);
|
||||
LOG("%i\t", test1[i].time);
|
||||
LOG("x%.2f\t", (float)test1[ii].time / test1[i].time);
|
||||
LOG("\n");
|
||||
}
|
||||
for(i = 0; i < 9; i++){
|
||||
test_mathfunc2(&test2[i]);
|
||||
|
||||
ii = i - (i % 3);
|
||||
|
||||
LOG("%s\t", test2[i].name);
|
||||
LOG("[%.2f, %.2f]\t", test2[i].rng0, test2[i].rng1);
|
||||
LOG("%i\t", test2[i].num);
|
||||
LOG("%.2e\t", test2[i].emaxabs);
|
||||
LOG("%.2e%%\t", test2[i].emaxrel);
|
||||
LOG("%.2e\t", test2[i].erms);
|
||||
LOG("%i\t", test2[i].time);
|
||||
LOG("x%.2f\t", (float)test2[ii].time / test2[i].time);
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
|
||||
float x = 0;
|
||||
for(x = -M_PI_2; x < M_PI_2; x+= 0.01)
|
||||
{
|
||||
LOG("x=%.2f\t in=%.2f\t c=%.2f\t neon=%.2f \n", x, sinhf(x), sinhf_c(x), sinhf_neon(x));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
6
deps/math-neon/source/math_neon.h
vendored
6
deps/math-neon/source/math_neon.h
vendored
@ -33,7 +33,7 @@ THE SOFTWARE.
|
||||
//Default Floating Point value ABI: 0=softfp, 1=hardfp. Only affects *_neon routines.
|
||||
//You can access the hardfp versions directly via the *_hard suffix.
|
||||
//You can access the softfp versions directly via the *_soft suffix.
|
||||
#define __MATH_FPABI 0
|
||||
#define __MATH_FPABI 1
|
||||
|
||||
#endif
|
||||
|
||||
@ -84,6 +84,10 @@ THE SOFTWARE.
|
||||
#define modf_neon modf_neon_hfp
|
||||
#define sqrtf_neon sqrtf_neon_hfp
|
||||
#define invsqrtf_neon invsqrtf_neon_hfp
|
||||
|
||||
#define dot2_neon dot2_neon_hfp
|
||||
#define dot3_neon dot3_neon_hfp
|
||||
#define dot4_neon dot4_neon_hfp
|
||||
#else
|
||||
#define sinf_neon sinf_neon_sfp
|
||||
#define cosf_neon cosf_neon_sfp
|
||||
|
2
deps/math-neon/source/math_sqrtfv.c
vendored
2
deps/math-neon/source/math_sqrtfv.c
vendored
@ -100,7 +100,7 @@ void sqrtfv_c(float *x, int n, float *r)
|
||||
|
||||
void sqrtfv_neon(float *x, int n, float *r)
|
||||
{
|
||||
#if 0
|
||||
#ifdef __MATH_NEON
|
||||
asm volatile (
|
||||
|
||||
"tst r1, #1 \n\t" //r1 & 1
|
||||
|
4
deps/vitaGL/Makefile
vendored
4
deps/vitaGL/Makefile
vendored
@ -17,6 +17,10 @@ AR = $(PREFIX)-gcc-ar
|
||||
CFLAGS = -g -Wl,-q -O2 -ffast-math -mtune=cortex-a9 -mfpu=neon -flto -ftree-vectorize -DSTB_DXT_IMPLEMENTATION
|
||||
ASFLAGS = $(CFLAGS)
|
||||
|
||||
ifeq ($(NO_DEBUG),1)
|
||||
CFLAGS += -DSKIP_ERROR_HANDLING
|
||||
endif
|
||||
|
||||
all: $(TARGET).a
|
||||
|
||||
$(TARGET).a: $(OBJS)
|
||||
|
12
deps/vitaGL/source/custom_shaders.c
vendored
12
deps/vitaGL/source/custom_shaders.c
vendored
@ -357,8 +357,7 @@ void glUniformMatrix4fv(GLint location, GLsizei count, GLboolean transpose, cons
|
||||
* ------------------------------
|
||||
*/
|
||||
|
||||
// Equivalent of glBindAttribLocation but for sceGxm architecture
|
||||
void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type) {
|
||||
void vglBindPackedAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type, GLuint offset) {
|
||||
// Grabbing passed program
|
||||
program *p = &progs[prog - 1];
|
||||
SceGxmVertexAttribute *attributes = &p->attr[index];
|
||||
@ -369,7 +368,7 @@ void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const
|
||||
|
||||
// Setting stream index and offset values
|
||||
attributes->streamIndex = index;
|
||||
attributes->offset = 0;
|
||||
attributes->offset = offset;
|
||||
|
||||
// Detecting attribute format and size
|
||||
int bpe;
|
||||
@ -396,7 +395,12 @@ void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const
|
||||
p->attr_num = index + 1;
|
||||
}
|
||||
|
||||
// Equivalent of glVertexAttribLocation but for sceGxm architecture
|
||||
// Equivalent of glBindAttribLocation but for sceGxm architecture
|
||||
// Convenience wrapper: forwards every argument unchanged to
// vglBindPackedAttribLocation and passes 0 as the trailing offset argument.
void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type) {
|
||||
vglBindPackedAttribLocation(prog, index, name, num, type, 0);
|
||||
}
|
||||
|
||||
// Equivalent of glVertexAttribPointer but for sceGxm architecture
|
||||
void vglVertexAttribPointer(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, GLuint count, const GLvoid *pointer) {
|
||||
#ifndef SKIP_ERROR_HANDLING
|
||||
// Error handling
|
||||
|
9
deps/vitaGL/source/textures.c
vendored
9
deps/vitaGL/source/textures.c
vendored
@ -101,6 +101,7 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt
|
||||
|
||||
SceGxmTextureFormat tex_format;
|
||||
uint8_t data_bpp = 0;
|
||||
uint8_t fast_store = GL_FALSE;
|
||||
|
||||
// Support for legacy GL1.0 internalFormat
|
||||
switch (internalFormat) {
|
||||
@ -157,7 +158,8 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt
|
||||
switch (type) {
|
||||
case GL_UNSIGNED_BYTE:
|
||||
data_bpp = 3;
|
||||
read_cb = readRGB;
|
||||
if (internalFormat == GL_RGB) fast_store = GL_TRUE;
|
||||
else read_cb = readRGB;
|
||||
break;
|
||||
default:
|
||||
error = GL_INVALID_ENUM;
|
||||
@ -168,7 +170,8 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt
|
||||
switch (type) {
|
||||
case GL_UNSIGNED_BYTE:
|
||||
data_bpp = 4;
|
||||
read_cb = readRGBA;
|
||||
if (internalFormat == GL_RGBA) fast_store = GL_TRUE;
|
||||
else read_cb = readRGBA;
|
||||
break;
|
||||
case GL_UNSIGNED_SHORT_5_5_5_1:
|
||||
data_bpp = 2;
|
||||
@ -236,7 +239,7 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt
|
||||
tex->type = internalFormat;
|
||||
tex->write_cb = write_cb;
|
||||
if (level == 0)
|
||||
if (tex->write_cb) gpu_alloc_texture(width, height, tex_format, data, tex, data_bpp, read_cb, write_cb);
|
||||
if (tex->write_cb) gpu_alloc_texture(width, height, tex_format, data, tex, data_bpp, read_cb, write_cb, fast_store);
|
||||
else gpu_alloc_compressed_texture(width, height, tex_format, data, tex, data_bpp, read_cb);
|
||||
else {
|
||||
gpu_alloc_mipmaps(level, tex);
|
||||
|
25
deps/vitaGL/source/utils/gpu_utils.c
vendored
25
deps/vitaGL/source/utils/gpu_utils.c
vendored
@ -255,7 +255,7 @@ void gpu_free_texture(texture *tex) {
|
||||
tex->valid = 0;
|
||||
}
|
||||
|
||||
void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t)) {
|
||||
void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t), uint8_t fast_store) {
|
||||
// If there's already a texture in passed texture object we first dealloc it
|
||||
if (tex->valid)
|
||||
gpu_free_texture(tex);
|
||||
@ -274,13 +274,22 @@ void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const
|
||||
int i, j;
|
||||
uint8_t *src = (uint8_t *)data;
|
||||
uint8_t *dst;
|
||||
for (i = 0; i < h; i++) {
|
||||
dst = ((uint8_t *)texture_data) + (ALIGN(w, 8) * bpp) * i;
|
||||
for (j = 0; j < w; j++) {
|
||||
uint32_t clr = read_cb(src);
|
||||
write_cb(dst, clr);
|
||||
src += src_bpp;
|
||||
dst += bpp;
|
||||
if (fast_store) { // Internal Format and Data Format are the same, we can just use memcpy for better performance
|
||||
uint32_t line_size = w * bpp;
|
||||
for (i = 0; i < h; i++) {
|
||||
dst = ((uint8_t *)texture_data) + (ALIGN(w, 8) * bpp) * i;
|
||||
memcpy(dst, src, line_size);
|
||||
src += line_size;
|
||||
}
|
||||
} else { // Different internal and data formats, we need to go with slower callbacks system
|
||||
for (i = 0; i < h; i++) {
|
||||
dst = ((uint8_t *)texture_data) + (ALIGN(w, 8) * bpp) * i;
|
||||
for (j = 0; j < w; j++) {
|
||||
uint32_t clr = read_cb(src);
|
||||
write_cb(dst, clr);
|
||||
src += src_bpp;
|
||||
dst += bpp;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else
|
||||
|
2
deps/vitaGL/source/utils/gpu_utils.h
vendored
2
deps/vitaGL/source/utils/gpu_utils.h
vendored
@ -82,7 +82,7 @@ void gpu_pool_init(uint32_t temp_pool_size);
|
||||
int tex_format_to_bytespp(SceGxmTextureFormat format);
|
||||
|
||||
// Alloc a texture
|
||||
void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t));
|
||||
void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t), uint8_t fast_store);
|
||||
|
||||
// Alloc a compressed texture
|
||||
void gpu_alloc_compressed_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *));
|
||||
|
1
deps/vitaGL/source/vitaGL.h
vendored
1
deps/vitaGL/source/vitaGL.h
vendored
@ -378,6 +378,7 @@ void vglVertexPointerMapped(const GLvoid *pointer);
|
||||
|
||||
// VGL_EXT_gxp_shaders extension implementation
|
||||
void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type);
|
||||
void vglBindPackedAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type, GLuint offset);
|
||||
void vglVertexAttribPointer(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, GLuint count, const GLvoid *pointer);
|
||||
void vglVertexAttribPointerMapped(GLuint index, const GLvoid *pointer);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user