        movb    %cl,%al
        shrl    $2,%ecx                 /* copy longword-wise */
        cld
        rep movsl
        movb    %al,%cl
        andb    $3,%cl                  /* copy remaining bytes */
        rep movsb

This code can move about 40MB/s on our 133MHz Pentium (Triton Chipset, 60ns EDO RAM, 256KB PB cache) when run as a user-level program. The movs instruction, when prefixed by rep, moves data from the area pointed to by the %esi ("source index") register to the area pointed to by the %edi ("destination index") register. The number of items to move is held in the %ecx ("count") register. The code speeds up the copy by using movsl (move 32-bit longword) until at most 3 bytes remain, and then uses movsb (move byte) to copy the rest.
        cmpl    $63,%ecx
        jbe     unrolled_tail
        .align 2,0x90
unrolled_loop:
        movl    32(%esi),%eax           /* prefetch next cache line */
        cmpl    $67,%ecx
        jbe     unrolled_tmp            /* and one more if we have */
        movl    64(%esi),%eax           /* >= 68 bytes to move */
        .align 2,0x90
unrolled_tmp:
        movl    0(%esi),%eax            /* load in pairs */
        movl    4(%esi),%edx
        movl    %eax,0(%edi)            /* store in pairs */
        movl    %edx,4(%edi)
        movl    8(%esi),%eax
        movl    12(%esi),%edx
        movl    %eax,8(%edi)
        movl    %edx,12(%edi)
        movl    16(%esi),%eax
        movl    20(%esi),%edx
        movl    %eax,16(%edi)
        movl    %edx,20(%edi)
        movl    24(%esi),%eax
        movl    28(%esi),%edx
        movl    %eax,24(%edi)
        movl    %edx,28(%edi)
        movl    32(%esi),%eax
        movl    36(%esi),%edx
        movl    %eax,32(%edi)
        movl    %edx,36(%edi)
        movl    40(%esi),%eax
        movl    44(%esi),%edx
        movl    %eax,40(%edi)
        movl    %edx,44(%edi)
        movl    48(%esi),%eax
        movl    52(%esi),%edx
        movl    %eax,48(%edi)
        movl    %edx,52(%edi)
        movl    56(%esi),%eax
        movl    60(%esi),%edx
        movl    %eax,56(%edi)
        movl    %edx,60(%edi)
        addl    $-64,%ecx
        addl    $64,%esi
        addl    $64,%edi
        cmpl    $63,%ecx
        ja      unrolled_loop
unrolled_tail:                          /* this part same as libc */
        movl    %ecx,%eax
        shrl    $2,%ecx
        cld
        rep movsl
        movl    %eax,%ecx
        andl    $3,%ecx
        rep movsb

Note that it also attempts to prefetch the upcoming cache lines by loading from src+32 and, when at least 68 bytes remain to be moved, from src+64 as well. (We are assuming the cache line size is 32 bytes.)
This version gives us up to 60MB/s on the same machine, or a 50% speedup, if we unroll the loop enough.
        cmpl    $63,%ecx
        jbe     unrolled_tail
4:
        pushl   %ecx
        cmpl    $1792,%ecx              /* prefetch up to 1792 bytes */
        jbe     2f                      /* (1792 = 2048 - 256) */
        movl    $1792,%ecx
2:
        subl    %ecx,0(%esp)
        cmpl    $256,%ecx
        jb      5f
        pushl   %esi
        pushl   %ecx
        .align 4,0x90
3:
        movl    0(%esi),%eax
        movl    32(%esi),%eax
        movl    64(%esi),%eax
        movl    96(%esi),%eax
        movl    128(%esi),%eax
        movl    160(%esi),%eax
        movl    192(%esi),%eax
        movl    224(%esi),%eax
        addl    $256,%esi
        subl    $256,%ecx
        cmpl    $256,%ecx
        jae     3b
        popl    %ecx
        popl    %esi
5:
        .align 2,0x90
unrolled_loop:
        fildq   0(%esi)
        fildq   8(%esi)
        fildq   16(%esi)
        fildq   24(%esi)                /* load 8 quad (64-bit) words */
        fildq   32(%esi)
        fildq   40(%esi)
        fildq   48(%esi)
        fildq   56(%esi)
        fistpq  56(%edi)
        fistpq  48(%edi)
        fistpq  40(%edi)
        fistpq  32(%edi)                /* store them in reverse order */
        fistpq  24(%edi)
        fistpq  16(%edi)
        fistpq  8(%edi)
        fistpq  0(%edi)
        addl    $-64,%ecx
        addl    $64,%esi
        addl    $64,%edi
        cmpl    $63,%ecx
        ja      unrolled_loop
        popl    %eax
        addl    %eax,%ecx
        cmpl    $64,%ecx
        jae     4b
unrolled_tail:                          /* this part same as libc */
        movl    %ecx,%eax
        shrl    $2,%ecx
        cld
        rep movsl
        movl    %eax,%ecx
        andl    $3,%ecx
        rep movsb

The Intel x86 floating-point unit has eight 80-bit registers organized as a stack. The fildq (floating-point integer load quadword) instruction loads a 64-bit integer into an 80-bit register, converting it into floating point in the process. (Note that there is no data loss, since the 80-bit floating-point format has a 64-bit significand.) The fistpq (floating-point integer store and pop quadword) instruction does the opposite.
This version can move up to 80MB/s, or a 100% speedup, on the same machine. The speed doesn't seem to go up much with even more unrolling.
It seems like copying 64 bytes using FP registers (the first blue line) is the best solution.
Note that using floating-point registers doesn't help at all, as the best they can do is to match the speed of libc bcopy. Also the maximum bandwidth is only about half that of Dell's 133-MHz Pentium above.