mirror of
https://github.com/libretro/RetroArch
synced 2025-01-18 13:23:40 +00:00
a581c9ac86
use redefine macros in RGL and RetroArch for them - nice speedup
219 lines
7.3 KiB
C
219 lines
7.3 KiB
C
/*********************************************************************************
|
|
* Copyright (C) 2008-2010 by Konstantinos Margaritis <markos@codex.gr> *
|
|
* All rights reserved. *
|
|
* *
|
|
* Redistribution and use in source and binary forms, with or without *
|
|
* modification, are permitted provided that the following conditions are met: *
|
|
* 1. Redistributions of source code must retain the above copyright *
|
|
* notice, this list of conditions and the following disclaimer. *
|
|
* 2. Redistributions in binary form must reproduce the above copyright *
|
|
* notice, this list of conditions and the following disclaimer in the *
|
|
* documentation and/or other materials provided with the distribution. *
|
|
* 3. Neither the name of the Codex nor the *
|
|
* names of its contributors may be used to endorse or promote products *
|
|
* derived from this software without specific prior written permission. *
|
|
* *
|
|
* THIS SOFTWARE IS PROVIDED BY CODEX ''AS IS'' AND ANY *
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
|
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
|
|
* DISCLAIMED. IN NO EVENT SHALL CODEX BE LIABLE FOR ANY *
|
|
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
|
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
|
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
|
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
|
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
|
|
*********************************************************************************/
|
|
|
|
#include <stdio.h>
|
|
#include <limits.h>
|
|
#include "altivec_mem.h"
|
|
|
|
void *vec_memcpy(void *dstpp, const void *srcpp, size_t len)
|
|
{
|
|
const uint8_t *src = srcpp;
|
|
uint8_t *dst = dstpp;
|
|
|
|
if (len >= sizeof(word_t))
|
|
{
|
|
// Prefetch some stuff
|
|
READ_PREFETCH_START1(src);
|
|
WRITE_PREFETCH_START2(dst);
|
|
|
|
// Copy until dst is word aligned
|
|
int al = copy_fwd_until_dst_word_aligned(dst, src), l;
|
|
|
|
if (al)
|
|
{
|
|
src += sizeof(word_t) - al;
|
|
dst += sizeof(word_t) - al;
|
|
len -= sizeof(word_t) - al;
|
|
}
|
|
|
|
// Now dst is word aligned. We'll continue by word copying, but
|
|
// for this we have to know the word-alignment of src also.
|
|
int srcoffset = ((word_t)(src) % sizeof(word_t)), sh_l, sh_r;
|
|
sh_l = srcoffset * CHAR_BIT;
|
|
sh_r = CHAR_BIT * sizeof(word_t) - sh_l;
|
|
|
|
// Take the word-aligned long pointers of src and dest.
|
|
word_t *dstl = (word_t *)(dst);
|
|
const word_t *srcl = (word_t *)(src - srcoffset);
|
|
|
|
if (len >= SIMD_PACKETSIZE)
|
|
{
|
|
// While we're not 16-byte aligned, move in 4-byte long steps.
|
|
|
|
al = (word_t)dstl % SIMD_PACKETSIZE;
|
|
if (al)
|
|
{
|
|
copy_fwd_until_dst_simd_aligned(dstl, srcl, srcoffset, al, sh_l, sh_r);
|
|
srcl += (SIMD_PACKETSIZE - al)/WORDS_IN_PACKET;
|
|
src = (uint8_t *) srcl + srcoffset;
|
|
dstl += (SIMD_PACKETSIZE - al)/WORDS_IN_PACKET;
|
|
len -= SIMD_PACKETSIZE - al;
|
|
}
|
|
|
|
// Now, dst is 16byte aligned. We can use SIMD if len >= 16
|
|
l = len / SIMD_PACKETSIZE;
|
|
len -= l * SIMD_PACKETSIZE;
|
|
if (((word_t)(src) % SIMD_PACKETSIZE) == 0)
|
|
copy_fwd_rest_blocks_aligned(dstl, src, l);
|
|
else
|
|
copy_fwd_rest_blocks_unaligned(dstl, src, srcoffset, sh_l, sh_r, l);
|
|
src += l*SIMD_PACKETSIZE;
|
|
dstl += l * WORDS_IN_PACKET;
|
|
srcl = (word_t *)(src - srcoffset);
|
|
}
|
|
// Stop the prefetching
|
|
PREFETCH_STOP1;
|
|
PREFETCH_STOP2;
|
|
//#endif
|
|
|
|
// Copy the remaining bytes using word-copying
|
|
// Handle alignment as appropriate
|
|
l = len / sizeof(word_t);
|
|
len -= l * sizeof(word_t);
|
|
|
|
if (srcoffset == 0)
|
|
{
|
|
copy_fwd_rest_words_aligned(dstl, srcl, l);
|
|
srcl += l;
|
|
src = (uint8_t *) srcl;
|
|
}
|
|
else
|
|
{
|
|
copy_fwd_rest_words_unaligned(dstl, srcl, sh_l, sh_r, l);
|
|
srcl += l;
|
|
src = (uint8_t *) srcl + srcoffset;
|
|
}
|
|
|
|
dstl += l;
|
|
|
|
// For the end copy we have to use char * pointers.
|
|
dst = (uint8_t *) dstl;
|
|
}
|
|
|
|
// Copy the remaining bytes
|
|
copy_fwd_rest_bytes(dst, src, len);
|
|
|
|
return dstpp;
|
|
}
|
|
|
|
void *vec_memcpy_aligned(void *dstpp, const void *srcpp, size_t len)
|
|
{
|
|
const uint8_t *src = srcpp;
|
|
uint8_t *dst = dstpp;
|
|
|
|
if (len >= sizeof(word_t))
|
|
{
|
|
// Prefetch some stuff
|
|
READ_PREFETCH_START1(src);
|
|
WRITE_PREFETCH_START2(dst);
|
|
|
|
// Take the word-aligned long pointers of src and dest.
|
|
word_t *dstl = (word_t *)(dst);
|
|
const word_t *srcl = (word_t *)(src);
|
|
int l;
|
|
|
|
#ifdef LIBFREEVEC_SIMD_ENGINE
|
|
if (len >= SIMD_PACKETSIZE)
|
|
{
|
|
l = len / SIMD_PACKETSIZE;
|
|
len -= l * SIMD_PACKETSIZE;
|
|
// Now, dst is 16byte aligned. We can use SIMD if len >= 16
|
|
copy_fwd_rest_blocks_aligned(dstl, src, l);
|
|
}
|
|
#endif
|
|
|
|
// Copy the remaining bytes using word-copying
|
|
// Handle alignment as appropriate
|
|
l = len / sizeof(word_t);
|
|
copy_fwd_rest_words_aligned(dstl, srcl, l);
|
|
srcl += l;
|
|
dstl += l;
|
|
len -= l * sizeof(word_t);
|
|
// For the end copy we have to use char * pointers.
|
|
src = (uint8_t *) srcl;
|
|
dst = (uint8_t *) dstl;
|
|
}
|
|
|
|
// Stop the prefetching
|
|
PREFETCH_STOP1;
|
|
|
|
PREFETCH_STOP2;
|
|
|
|
// Copy the remaining bytes
|
|
copy_fwd_rest_bytes(dst, src, len);
|
|
|
|
return dstpp;
|
|
}
|
|
|
|
void *vec_memset(void *s, int p, size_t len)
|
|
{
|
|
uint8_t* ptr = s;
|
|
uint8_t __attribute__ ((aligned(16))) P = p;
|
|
if (len >= sizeof(word_t))
|
|
{
|
|
word_t pw = charmask(P);
|
|
|
|
size_t al = ((size_t)ptr) % sizeof(word_t);
|
|
if (al)
|
|
{
|
|
memset_fwd_until_dst_word_aligned(ptr, P, al);
|
|
ptr += sizeof(word_t) - al;
|
|
len -= sizeof(word_t) - al;
|
|
}
|
|
|
|
int l;
|
|
word_t *ptr_w = (word_t *)(ptr);
|
|
if (len >= SIMD_PACKETSIZE)
|
|
{
|
|
// ptr is now word (32/64bit) aligned, memset until ptr is SIMD aligned
|
|
al = (word_t) ptr_w % SIMD_PACKETSIZE;
|
|
if (al)
|
|
{
|
|
memset_fwd_until_simd_aligned(ptr_w, pw, al);
|
|
ptr_w += (SIMD_PACKETSIZE - al)/WORDS_IN_PACKET;
|
|
len -= SIMD_PACKETSIZE - al;
|
|
}
|
|
// ptr is now 128-bit aligned
|
|
// perform set using SIMD
|
|
|
|
l = len / SIMD_PACKETSIZE;
|
|
len -= l * SIMD_PACKETSIZE;
|
|
memset_set_blocks(ptr_w, pw, P, l);
|
|
ptr_w += l * WORDS_IN_PACKET;
|
|
}
|
|
// memset the remaining words
|
|
l = len / sizeof(word_t);
|
|
len -= l * sizeof(word_t);
|
|
memset_rest_words(ptr_w, pw, l);
|
|
ptr_w += l;
|
|
ptr = (uint8_t *)ptr_w;
|
|
}
|
|
// Handle the remaining bytes
|
|
memset_rest_bytes(ptr, P, len);
|
|
return s;
|
|
}
|