/*
 * unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
 *
 * Inputs:
 * - sr1 already contains space of source region
 * - sr2 already contains space of destination region
 *
 * Returns:
 * - number of bytes that could not be copied.
 *   On success, this will be zero.
 *
 * This code is based on a C-implementation of a copy routine written by
 * Randolph Chung, which in turn was derived from the glibc.
 *
 * Several strategies are tried to get the best performance for various
 * conditions. In the optimal case, we copy by loops that copy 32 or 16 bytes
 * at a time using general registers. Unaligned copies are handled either by
 * aligning the destination and then using the shift-and-write method, or in
 * a few cases by falling back to a byte-at-a-time copy.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of
 * memcpy (written in C) is actually quite fast already. This routine is able
 * to beat it by 30-40% for aligned copies because of the loop unrolling, but
 * in some cases the glibc version is still slightly faster. This lends more
 * credibility that gcc can generate very good code as long as we are careful.
 *
 * Possible optimizations:
 * - add cache prefetching
 * - try not to use the post-increment address modifiers; they may create
 *   additional interlocks. Assumption is that those were only efficient on
 *   old machines (pre PA8000 processors)
 */
dst = arg0
src = arg1
len = arg2
end = arg3
t1 = r19
t2 = r20
t3 = r21
t4 = r22
srcspc = sr1
dstspc = sr2
t0 = r1
a1 = t1
a2 = t2
a3 = t3
a0 = t4
save_src = ret0
save_dst = ret1
save_len = r31
ENTRY_CFI(pa_memcpy) /* Last destination address */
add dst,len,end
/* short copy with less than 16 bytes? */
cmpib,COND(>>=),n 15,len,.Lbyte_loop
/* same alignment? */
xor src,dst,t0
extru t0,31,2,t1
cmpib,<>,n 0,t1,.Lunaligned_copy
#ifdef CONFIG_64BIT /* only do 64-bit copies if we can get aligned. */
extru t0,31,3,t1
cmpib,<>,n 0,t1,.Lalign_loop32
/* loop until we are 64-bit aligned */
.Lalign_loop64:
extru dst,31,3,t1
cmpib,=,n 0,t1,.Lcopy_loop_16_start
20: ldb,ma 1(srcspc,src),t1
21: stb,ma t1,1(dstspc,dst)
b .Lalign_loop64
ldo -1(len),len
/* src and dst are not aligned the same way. */ /* need to go the hard way */
.Lunaligned_copy: /* align until dst is 32bit-word-aligned */
extru dst,31,2,t1
cmpib,=,n 0,t1,.Lcopy_dstaligned
20: ldb 0(srcspc,src),t1
ldo 1(src),src
21: stb,ma t1,1(dstspc,dst)
b .Lunaligned_copy
ldo -1(len),len
.Lcda_rdfault:
.Lcda_finish: /* calculate new src, dst and len and jump to byte-copy loop */ sub dst,save_dst,t0
add save_src,t0,src
b .Lbyte_loop sub save_len,t0,len
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.