Add ACKNOWLEDGEMENTS. Replace memcpy with advsimd implementation.

This commit is contained in:
Kao Makino
2021-08-23 19:12:52 -07:00
parent 16e05fad9d
commit ab318880a5
2 changed files with 92 additions and 104 deletions

View File

@@ -562,3 +562,28 @@ folly_memcpy:
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Arm Limited (optimized-routines)
MIT License
Copyright (c) 1999-2019, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,13 +1,13 @@
/* /*
* memcpy - copy memory area * memcpy - copy memory area
* *
* Copyright (c) 2012-2020, Arm Limited. * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT * SPDX-License-Identifier: MIT
*/ */
/* Assumptions: /* Assumptions:
* *
* ARMv8-a, AArch64, unaligned accesses. * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
* *
*/ */
@@ -25,21 +25,18 @@
#define B_l x8 #define B_l x8
#define B_lw w8 #define B_lw w8
#define B_h x9 #define B_h x9
#define C_l x10
#define C_lw w10 #define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l x16
#define F_h x17
#define G_l count
#define G_h dst
#define H_l src
#define H_h srcend
#define tmp1 x14 #define tmp1 x14
#define A_q q0
#define B_q q1
#define C_q q2
#define D_q q3
#define E_q q4
#define F_q q5
#define G_q q6
#define H_q q7
/* This implementation handles overlaps and supports both memcpy and memmove /* This implementation handles overlaps and supports both memcpy and memmove
from a single entry point. It uses unaligned accesses and branchless from a single entry point. It uses unaligned accesses and branchless
sequences to keep the code small, simple and improve performance. sequences to keep the code small, simple and improve performance.
@@ -49,7 +46,7 @@
check is negligible since it is only required for large copies. check is negligible since it is only required for large copies.
Large copies use a software pipelined loop processing 64 bytes per iteration. Large copies use a software pipelined loop processing 64 bytes per iteration.
The destination pointer is 16-byte aligned to minimize unaligned accesses. The source pointer is 16-byte aligned to minimize unaligned accesses.
The loop tail is handled by always copying 64 bytes from the end. The loop tail is handled by always copying 64 bytes from the end.
*/ */
@@ -68,10 +65,10 @@ ENTRY (memcpy)
/* Small copies: 0..32 bytes. */ /* Small copies: 0..32 bytes. */
cmp count, 16 cmp count, 16
b.lo L(copy16) b.lo L(copy16)
ldp A_l, A_h, [src] ldr A_q, [src]
ldp D_l, D_h, [srcend, -16] ldr B_q, [srcend, -16]
stp A_l, A_h, [dstin] str A_q, [dstin]
stp D_l, D_h, [dstend, -16] str B_q, [dstend, -16]
ret ret
/* Copy 8-15 bytes. */ /* Copy 8-15 bytes. */
@@ -109,134 +106,100 @@ L(copy0):
.p2align 4 .p2align 4
/* Medium copies: 33..128 bytes. */ /* Medium copies: 33..128 bytes. */
L(copy32_128): L(copy32_128):
ldp A_l, A_h, [src] ldp A_q, B_q, [src]
ldp B_l, B_h, [src, 16] ldp C_q, D_q, [srcend, -32]
ldp C_l, C_h, [srcend, -32]
ldp D_l, D_h, [srcend, -16]
cmp count, 64 cmp count, 64
b.hi L(copy128) b.hi L(copy128)
stp A_l, A_h, [dstin] stp A_q, B_q, [dstin]
stp B_l, B_h, [dstin, 16] stp C_q, D_q, [dstend, -32]
stp C_l, C_h, [dstend, -32]
stp D_l, D_h, [dstend, -16]
ret ret
.p2align 4 .p2align 4
/* Copy 65..128 bytes. */ /* Copy 65..128 bytes. */
L(copy128): L(copy128):
ldp E_l, E_h, [src, 32] ldp E_q, F_q, [src, 32]
ldp F_l, F_h, [src, 48]
cmp count, 96 cmp count, 96
b.ls L(copy96) b.ls L(copy96)
ldp G_l, G_h, [srcend, -64] ldp G_q, H_q, [srcend, -64]
ldp H_l, H_h, [srcend, -48] stp G_q, H_q, [dstend, -64]
stp G_l, G_h, [dstend, -64]
stp H_l, H_h, [dstend, -48]
L(copy96): L(copy96):
stp A_l, A_h, [dstin] stp A_q, B_q, [dstin]
stp B_l, B_h, [dstin, 16] stp E_q, F_q, [dstin, 32]
stp E_l, E_h, [dstin, 32] stp C_q, D_q, [dstend, -32]
stp F_l, F_h, [dstin, 48]
stp C_l, C_h, [dstend, -32]
stp D_l, D_h, [dstend, -16]
ret ret
.p2align 4
/* Copy more than 128 bytes. */ /* Copy more than 128 bytes. */
L(copy_long): L(copy_long):
/* Use backwards copy if there is an overlap. */ /* Use backwards copy if there is an overlap. */
sub tmp1, dstin, src sub tmp1, dstin, src
cbz tmp1, L(copy0)
cmp tmp1, count cmp tmp1, count
b.lo L(copy_long_backwards) b.lo L(copy_long_backwards)
/* Copy 16 bytes and then align dst to 16-byte alignment. */ /* Copy 16 bytes and then align src to 16-byte alignment. */
ldr D_q, [src]
ldp D_l, D_h, [src] and tmp1, src, 15
and tmp1, dstin, 15 bic src, src, 15
bic dst, dstin, 15 sub dst, dstin, tmp1
sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */ add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16] ldp A_q, B_q, [src, 16]
stp D_l, D_h, [dstin] str D_q, [dstin]
ldp B_l, B_h, [src, 32] ldp C_q, D_q, [src, 48]
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */ subs count, count, 128 + 16 /* Test and readjust count. */
b.ls L(copy64_from_end) b.ls L(copy64_from_end)
L(loop64): L(loop64):
stp A_l, A_h, [dst, 16] stp A_q, B_q, [dst, 16]
ldp A_l, A_h, [src, 16] ldp A_q, B_q, [src, 80]
stp B_l, B_h, [dst, 32] stp C_q, D_q, [dst, 48]
ldp B_l, B_h, [src, 32] ldp C_q, D_q, [src, 112]
stp C_l, C_h, [dst, 48] add src, src, 64
ldp C_l, C_h, [src, 48] add dst, dst, 64
stp D_l, D_h, [dst, 64]!
ldp D_l, D_h, [src, 64]!
subs count, count, 64 subs count, count, 64
b.hi L(loop64) b.hi L(loop64)
/* Write the last iteration and copy 64 bytes from the end. */ /* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end): L(copy64_from_end):
ldp E_l, E_h, [srcend, -64] ldp E_q, F_q, [srcend, -64]
stp A_l, A_h, [dst, 16] stp A_q, B_q, [dst, 16]
ldp A_l, A_h, [srcend, -48] ldp A_q, B_q, [srcend, -32]
stp B_l, B_h, [dst, 32] stp C_q, D_q, [dst, 48]
ldp B_l, B_h, [srcend, -32] stp E_q, F_q, [dstend, -64]
stp C_l, C_h, [dst, 48] stp A_q, B_q, [dstend, -32]
ldp C_l, C_h, [srcend, -16]
stp D_l, D_h, [dst, 64]
stp E_l, E_h, [dstend, -64]
stp A_l, A_h, [dstend, -48]
stp B_l, B_h, [dstend, -32]
stp C_l, C_h, [dstend, -16]
ret ret
.p2align 4
/* Large backwards copy for overlapping copies. /* Large backwards copy for overlapping copies.
Copy 16 bytes and then align dst to 16-byte alignment. */ Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards): L(copy_long_backwards):
ldp D_l, D_h, [srcend, -16] cbz tmp1, L(copy0)
and tmp1, dstend, 15 ldr D_q, [srcend, -16]
sub srcend, srcend, tmp1 and tmp1, srcend, 15
bic srcend, srcend, 15
sub count, count, tmp1 sub count, count, tmp1
ldp A_l, A_h, [srcend, -16] ldp A_q, B_q, [srcend, -32]
stp D_l, D_h, [dstend, -16] str D_q, [dstend, -16]
ldp B_l, B_h, [srcend, -32] ldp C_q, D_q, [srcend, -64]
ldp C_l, C_h, [srcend, -48]
ldp D_l, D_h, [srcend, -64]!
sub dstend, dstend, tmp1 sub dstend, dstend, tmp1
subs count, count, 128 subs count, count, 128
b.ls L(copy64_from_start) b.ls L(copy64_from_start)
L(loop64_backwards): L(loop64_backwards):
stp A_l, A_h, [dstend, -16] str B_q, [dstend, -16]
ldp A_l, A_h, [srcend, -16] str A_q, [dstend, -32]
stp B_l, B_h, [dstend, -32] ldp A_q, B_q, [srcend, -96]
ldp B_l, B_h, [srcend, -32] str D_q, [dstend, -48]
stp C_l, C_h, [dstend, -48] str C_q, [dstend, -64]!
ldp C_l, C_h, [srcend, -48] ldp C_q, D_q, [srcend, -128]
stp D_l, D_h, [dstend, -64]! sub srcend, srcend, 64
ldp D_l, D_h, [srcend, -64]!
subs count, count, 64 subs count, count, 64
b.hi L(loop64_backwards) b.hi L(loop64_backwards)
/* Write the last iteration and copy 64 bytes from the start. */ /* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start): L(copy64_from_start):
ldp G_l, G_h, [src, 48] ldp E_q, F_q, [src, 32]
stp A_l, A_h, [dstend, -16] stp A_q, B_q, [dstend, -32]
ldp A_l, A_h, [src, 32] ldp A_q, B_q, [src]
stp B_l, B_h, [dstend, -32] stp C_q, D_q, [dstend, -64]
ldp B_l, B_h, [src, 16] stp E_q, F_q, [dstin, 32]
stp C_l, C_h, [dstend, -48] stp A_q, B_q, [dstin]
ldp C_l, C_h, [src]
stp D_l, D_h, [dstend, -64]
stp G_l, G_h, [dstin, 48]
stp A_l, A_h, [dstin, 32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
ret ret
END (memcpy) END (memcpy)