/* OpenSolaris_b135/lib/libc/amd64/gen/strcpy.s */

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
 */

/*
 *	str[n]cpy - copy [n] chars from second operand into first operand
 */
#include "SYS.h"
#include "proc64_id.h"

#define LABEL(s) .strcpy/**/s

/*
 * Entry and setup, shared by strcpy and strncpy (selected at build time
 * via USE_AS_STRNCPY).
 *
 * Register roles from here on:
 *	rdi = dest, rax = original dest (the return value)
 *	rsi = src rounded down to a 16-byte boundary
 *	rcx/r9 = src offset within that 16-byte chunk
 *	rdx = scratch / pmovmskb null-byte bitmask
 *	r8  = remaining byte count (strncpy only)
 */
#ifdef USE_AS_STRNCPY
	ENTRY(strncpy)
	test	%edx, %edx			/* n == 0? */
	jz	LABEL(strncpy_exitz)		/* yes: return dest untouched */
	mov	%rdx, %r8			/* r8 = remaining count */
#else
	ENTRY(strcpy)				/* (char *, const char *) */
	xor	%rdx, %rdx
#endif
	mov	%esi, %ecx
	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
	and	$0xf, %rcx			/* rcx = src offset within chunk */
	mov	%rdi, %rax			/* save destination address for return value */


	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char checks */
	pcmpeqb	(%rsi), %xmm0			/* check 16 bytes in src for null */
	pmovmskb %xmm0, %edx
	shr	%cl, %edx			/* adjust for offset from 16byte boundary */
	test	%edx, %edx			/* edx will be 0 if chars are non-null */
	jnz	LABEL(less16bytes)		/* null char found in first 16 bytes examined */
#ifdef USE_AS_STRNCPY
	/*
	 * Check if the count is satisfied in first 16 bytes examined.
	 */
	lea	-16(%r8, %rcx), %r11		/* r11 = n - (16 - src offset) */
	cmp	$0, %r11
	jle	LABEL(less16bytes)
#endif
	mov	%rcx, %r9			/* rsi alignment offset */
	or	%edi, %ecx			/* fold in dest's low bits */
	and	$0xf, %ecx			/* zero only if both are 16-byte aligned */
	lea	-16(%r9), %r10
	jz	LABEL(ashr_0)			/* src and dest are both 16 byte aligned */

	neg	%r10				/* max src bytes remaining in current dqword */

	pxor	%xmm0, %xmm0			/* clear %xmm0, may be polluted by unaligned operation */
	pcmpeqb	16(%rsi), %xmm0			/* check next 16 bytes in src for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(less32bytes)		/* null char found in first 32 bytes examined */

#ifdef USE_AS_STRNCPY
	/*
	 * If strncpy count <= 16 go to exit case
	 */
	sub	$16, %r8
	jbe	LABEL(less32bytes_strncpy_truncation)
#endif
	/*
	 * At least 16 bytes to copy to destination string. Move them now.
	 * Don't worry about alignment.
	 */
	mov	(%rsi, %r9), %rdx		/* copy 16 bytes from the original */
	mov	%rdx, (%rdi)			/* (unaligned) src via two 8-byte moves */
	mov	8(%rsi, %r9), %rdx
	mov	%rdx, 8(%rdi)

	/*
	 * so far destination rdi may be aligned by 16, re-calculate rsi and
	 * jump to corresponding src/dest relative offset case.
	 * 	rcx is offset of rsi
	 * 	rdx is offset of rdi
	 */
	and	$0xfffffffffffffff0, %rdi	/* force rdi 16 byte align */
	mov	%rax, %rdx			/* rax contains original rdi */
	xor	%rdi, %rdx			/* same effect as "and $0xf, %rdx" */
#ifdef USE_AS_STRNCPY
	/*
	 * Will now do 16 byte aligned stores. Stores may overlap some bytes
	 * (ie store twice) if destination was unaligned. Compensate here.
	 */
	add	%rdx, %r8			/* compensate for overlap */
#endif

	add	$16, %rdi			/* next 16 bytes for dest */

	/*
	 * align src to 16-byte boundary. Could be up or down depending on
	 * whether src offset - dest offset > 0 (up) or
	 *  src offset - dest offset < 0 (down).
	 */
	sub	%rdx, %r9			/* src offset - dest offset */

	lea	16(%r9, %rsi), %rsi
	mov	%esi, %ecx			/* for new src offset */
	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */

	and	$0xf, %ecx			/* new src offset is 0 if rsi/rdi have same alignment */
	jz	LABEL(ashr_0)

#ifdef USE_AS_STRNCPY
	xor	%edx, %edx			/* In case unaligned_exit is taken */
#endif
	/*
	 * Jump to case corresponding to source/dest string relative offsets
	 * Index = (16 + (src offset - dest offset)) % 16
	 * unaligned_table holds 32-bit displacements relative to its own base.
	 */
	lea	-16(%rcx), %r10
	mov	%rcx, %r9
	neg	%r10				/* max src bytes remaining in current dqword */
	lea	LABEL(unaligned_table)(%rip), %r11
	movslq	(%r11, %rcx, 4), %rcx		/* fetch handler displacement */
	lea	(%r11, %rcx), %rcx		/* handler address */
	jmp	*%rcx

/*
 * ashr_0 handles the following cases:
 * 	src alignment offset = dest alignment offset
 *
 * Both pointers are 16-byte aligned here, so whole chunks move with
 * aligned loads/stores.  %xmm0 stays all-zero across iterations: a
 * pcmpeqb that finds no null byte produces an all-zero result, so the
 * register can be reused as the null comparator without re-clearing it.
 */
	.p2align 5
LABEL(ashr_0):
#ifdef USE_AS_STRNCPY
	sub	$16, %r8			/* would this chunk exhaust the count? */
 	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi), %xmm1		/* fetch 16 bytes from src string */
	movdqa	%xmm1, (%rdi)		/* store 16 bytes into dest string */
	add	$16, %rsi
	add	$16, %rdi
	pcmpeqb	(%rsi), %xmm0		/* check 16 bytes in src for a null */
	pmovmskb %xmm0, %edx

	test	%edx, %edx		/* edx will be 0 if chars are non-null */
	jnz	LABEL(aligned_16bytes)	/* exit tail */

/*
 * Main aligned copy loop, unrolled four times.  %rcx indexes from the
 * current %rsi/%rdi; the chunk at (%rsi, %rcx) was already scanned for
 * a null byte at the bottom of the previous iteration.
 */
LABEL(ashr_0_loop):
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0	/* pre-scan the next chunk */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa  (%rsi, %rcx), %xmm1
	movdqa  %xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb  (%rsi, %rcx), %xmm0
	pmovmskb  %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa  (%rsi, %rcx), %xmm1
	movdqa  %xmm1, (%rdi, %rcx)

	add	$16, %rcx
	pcmpeqb  (%rsi, %rcx), %xmm0
	pmovmskb  %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa  (%rsi, %rcx), %xmm1
	movdqa  %xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb  (%rsi, %rcx), %xmm0
	pmovmskb  %xmm0, %edx
	test	%edx, %edx
	jz	LABEL(ashr_0_loop)
	jmp	LABEL(aligned_exit)


/*
 * ashr_15 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 15
 *
 * Based on the setup above there is no null byte from (%r9 + %rsi) to the
 * end of the current aligned 16-byte source chunk.  Each iteration merges
 * the tail of one aligned source chunk with the head of the next and
 * stores one aligned 16-byte chunk to the destination.  %xmm0 stays
 * all-zero while no null byte is seen (pcmpeqb with no match yields 0).
 */
	.p2align 4
LABEL(ashr_15):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count used up within this chunk? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_15_use_sse2)

	.p2align 4
LABEL(ashr_15_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* fetch next aligned src chunk */
	pcmpeqb	%xmm3, %xmm0			/* scan it for a null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-assembled palignr (for assemblers without SSSE3 support) */
	#palignr $15, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0f

	movdqa	%xmm3, (%rdi, %rcx)		/* store realigned 16 bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $15, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0f

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_15_use_ssse3)

	/* SSE2 fallback: same merge done with byte shifts and por */
	.p2align 4
LABEL(ashr_15_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* scan next aligned chunk for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$15, %xmm2			/* tail of current chunk -> low bytes */
	pslldq	$1, %xmm3			/* head of next chunk -> high bytes */
	por	%xmm2, %xmm3			/* merged, destination-aligned chunk */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$15, %xmm2
	pslldq	$1, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_15_use_sse2)


/*
 * ashr_14 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 14
 *
 * Based on the setup above there is no null byte from (%r9 + %rsi) to the
 * end of the current aligned 16-byte source chunk.  Each iteration merges
 * the tail of one aligned source chunk with the head of the next and
 * stores one aligned 16-byte chunk to the destination.  %xmm0 stays
 * all-zero while no null byte is seen (pcmpeqb with no match yields 0).
 */
	.p2align 4
LABEL(ashr_14):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count used up within this chunk? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_14_use_sse2)

	.p2align 4
LABEL(ashr_14_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* fetch next aligned src chunk */
	pcmpeqb	%xmm3, %xmm0			/* scan it for a null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-assembled palignr (for assemblers without SSSE3 support) */
	#palignr $14, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0e

	movdqa	%xmm3, (%rdi, %rcx)		/* store realigned 16 bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $14, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0e

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_14_use_ssse3)

	/* SSE2 fallback: same merge done with byte shifts and por */
	.p2align 4
LABEL(ashr_14_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* scan next aligned chunk for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$14, %xmm2			/* tail of current chunk -> low bytes */
	pslldq	$2, %xmm3			/* head of next chunk -> high bytes */
	por	%xmm2, %xmm3			/* merged, destination-aligned chunk */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$14, %xmm2
	pslldq	$2, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_14_use_sse2)


/*
 * ashr_13 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 13
 *
 * Based on the setup above there is no null byte from (%r9 + %rsi) to the
 * end of the current aligned 16-byte source chunk.  Each iteration merges
 * the tail of one aligned source chunk with the head of the next and
 * stores one aligned 16-byte chunk to the destination.  %xmm0 stays
 * all-zero while no null byte is seen (pcmpeqb with no match yields 0).
 */
	.p2align 4
LABEL(ashr_13):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count used up within this chunk? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_13_use_sse2)

	.p2align 4
LABEL(ashr_13_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* fetch next aligned src chunk */
	pcmpeqb	%xmm3, %xmm0			/* scan it for a null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-assembled palignr (for assemblers without SSSE3 support) */
	#palignr $13, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0d

	movdqa	%xmm3, (%rdi, %rcx)		/* store realigned 16 bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $13, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0d

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_13_use_ssse3)

	/* SSE2 fallback: same merge done with byte shifts and por */
	.p2align 4
LABEL(ashr_13_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* scan next aligned chunk for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$13, %xmm2			/* tail of current chunk -> low bytes */
	pslldq	$3, %xmm3			/* head of next chunk -> high bytes */
	por	%xmm2, %xmm3			/* merged, destination-aligned chunk */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$13, %xmm2
	pslldq	$3, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_13_use_sse2)


/*
 * ashr_12 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 12
 *
 * Based on the setup above there is no null byte from (%r9 + %rsi) to the
 * end of the current aligned 16-byte source chunk.  Each iteration merges
 * the tail of one aligned source chunk with the head of the next and
 * stores one aligned 16-byte chunk to the destination.  %xmm0 stays
 * all-zero while no null byte is seen (pcmpeqb with no match yields 0).
 */
	.p2align 4
LABEL(ashr_12):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count used up within this chunk? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_12_use_sse2)

	.p2align 4
LABEL(ashr_12_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* fetch next aligned src chunk */
	pcmpeqb	%xmm3, %xmm0			/* scan it for a null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-assembled palignr (for assemblers without SSSE3 support) */
	#palignr $12, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0c

	movdqa	%xmm3, (%rdi, %rcx)		/* store realigned 16 bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $12, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0c

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_12_use_ssse3)

	/* SSE2 fallback: same merge done with byte shifts and por */
	.p2align 4
LABEL(ashr_12_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* scan next aligned chunk for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$12, %xmm2			/* tail of current chunk -> low bytes */
	pslldq	$4, %xmm3			/* head of next chunk -> high bytes */
	por	%xmm2, %xmm3			/* merged, destination-aligned chunk */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$12, %xmm2
	pslldq	$4, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_12_use_sse2)


/*
 * ashr_11 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 11
 *
 * Based on the setup above there is no null byte from (%r9 + %rsi) to the
 * end of the current aligned 16-byte source chunk.  Each iteration merges
 * the tail of one aligned source chunk with the head of the next and
 * stores one aligned 16-byte chunk to the destination.  %xmm0 stays
 * all-zero while no null byte is seen (pcmpeqb with no match yields 0).
 */
	.p2align 4
LABEL(ashr_11):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count used up within this chunk? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_11_use_sse2)

	.p2align 4
LABEL(ashr_11_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* fetch next aligned src chunk */
	pcmpeqb	%xmm3, %xmm0			/* scan it for a null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-assembled palignr (for assemblers without SSSE3 support) */
	#palignr $11, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0b

	movdqa	%xmm3, (%rdi, %rcx)		/* store realigned 16 bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $11, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0b

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_11_use_ssse3)

	/* SSE2 fallback: same merge done with byte shifts and por */
	.p2align 4
LABEL(ashr_11_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* scan next aligned chunk for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$11, %xmm2			/* tail of current chunk -> low bytes */
	pslldq	$5, %xmm3			/* head of next chunk -> high bytes */
	por	%xmm2, %xmm3			/* merged, destination-aligned chunk */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$11, %xmm2
	pslldq	$5, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_11_use_sse2)


/*
 * ashr_10 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 10
 *
 * Based on the setup above there is no null byte from (%r9 + %rsi) to the
 * end of the current aligned 16-byte source chunk.  Each iteration merges
 * the tail of one aligned source chunk with the head of the next and
 * stores one aligned 16-byte chunk to the destination.  %xmm0 stays
 * all-zero while no null byte is seen (pcmpeqb with no match yields 0).
 */
	.p2align 4
LABEL(ashr_10):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count used up within this chunk? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_10_use_sse2)

	.p2align 4
LABEL(ashr_10_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* fetch next aligned src chunk */
	pcmpeqb	%xmm3, %xmm0			/* scan it for a null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-assembled palignr (for assemblers without SSSE3 support) */
	#palignr $10, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0a

	movdqa	%xmm3, (%rdi, %rcx)		/* store realigned 16 bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $10, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0a

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_10_use_ssse3)

	/* SSE2 fallback: same merge done with byte shifts and por */
	.p2align 4
LABEL(ashr_10_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* scan next aligned chunk for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$10, %xmm2			/* tail of current chunk -> low bytes */
	pslldq	$6, %xmm3			/* head of next chunk -> high bytes */
	por	%xmm2, %xmm3			/* merged, destination-aligned chunk */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$10, %xmm2
	pslldq	$6, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_10_use_sse2)


/*
 * ashr_9 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 9
 *
 * Based on the setup above there is no null byte from (%r9 + %rsi) to the
 * end of the current aligned 16-byte source chunk.  Each iteration merges
 * the tail of one aligned source chunk with the head of the next and
 * stores one aligned 16-byte chunk to the destination.  %xmm0 stays
 * all-zero while no null byte is seen (pcmpeqb with no match yields 0).
 */
	.p2align 4
LABEL(ashr_9):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count used up within this chunk? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_9_use_sse2)

	.p2align 4
LABEL(ashr_9_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* fetch next aligned src chunk */
	pcmpeqb	%xmm3, %xmm0			/* scan it for a null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-assembled palignr (for assemblers without SSSE3 support) */
	#palignr $9, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x09

	movdqa	%xmm3, (%rdi, %rcx)		/* store realigned 16 bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $9, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x09

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_9_use_ssse3)

	/* SSE2 fallback: same merge done with byte shifts and por */
	.p2align 4
LABEL(ashr_9_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* scan next aligned chunk for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$9, %xmm2			/* tail of current chunk -> low bytes */
	pslldq	$7, %xmm3			/* head of next chunk -> high bytes */
	por	%xmm2, %xmm3			/* merged, destination-aligned chunk */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$9, %xmm2
	pslldq	$7, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_9_use_sse2)


/*
 * ashr_8 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 8
 *
 * Based on the setup above there is no null byte from (%r9 + %rsi) to the
 * end of the current aligned 16-byte source chunk.  Each iteration merges
 * the tail of one aligned source chunk with the head of the next and
 * stores one aligned 16-byte chunk to the destination.  %xmm0 stays
 * all-zero while no null byte is seen (pcmpeqb with no match yields 0).
 */
	.p2align 4
LABEL(ashr_8):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count used up within this chunk? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_8_use_sse2)

	.p2align 4
LABEL(ashr_8_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* fetch next aligned src chunk */
	pcmpeqb	%xmm3, %xmm0			/* scan it for a null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-assembled palignr (for assemblers without SSSE3 support) */
	#palignr $8, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x08

	movdqa	%xmm3, (%rdi, %rcx)		/* store realigned 16 bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $8, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x08

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_8_use_ssse3)

	/* SSE2 fallback: same merge done with byte shifts and por */
	.p2align 4
LABEL(ashr_8_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* scan next aligned chunk for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$8, %xmm2			/* tail of current chunk -> low bytes */
	pslldq	$8, %xmm3			/* head of next chunk -> high bytes */
	por	%xmm2, %xmm3			/* merged, destination-aligned chunk */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$8, %xmm2
	pslldq	$8, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_8_use_sse2)


/*
 * ashr_7 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 7
 *
 * Based on the setup above there is no null byte from (%r9 + %rsi) to the
 * end of the current aligned 16-byte source chunk.  Each iteration merges
 * the tail of one aligned source chunk with the head of the next and
 * stores one aligned 16-byte chunk to the destination.  %xmm0 stays
 * all-zero while no null byte is seen (pcmpeqb with no match yields 0).
 */
	.p2align 4
LABEL(ashr_7):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count used up within this chunk? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_7_use_sse2)

	.p2align 4
LABEL(ashr_7_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* fetch next aligned src chunk */
	pcmpeqb	%xmm3, %xmm0			/* scan it for a null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-assembled palignr (for assemblers without SSSE3 support) */
	#palignr $7, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x07

	movdqa	%xmm3, (%rdi, %rcx)		/* store realigned 16 bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $7, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x07

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_7_use_ssse3)

	/* SSE2 fallback: same merge done with byte shifts and por */
	.p2align 4
LABEL(ashr_7_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* scan next aligned chunk for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$7, %xmm2			/* tail of current chunk -> low bytes */
	pslldq	$9, %xmm3			/* head of next chunk -> high bytes */
	por	%xmm2, %xmm3			/* merged, destination-aligned chunk */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$7, %xmm2
	pslldq	$9, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_7_use_sse2)


/*
 * ashr_6 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 6
 *
 * Based on the setup above there is no null byte from (%r9 + %rsi) to the
 * end of the current aligned 16-byte source chunk.  Each iteration merges
 * the tail of one aligned source chunk with the head of the next and
 * stores one aligned 16-byte chunk to the destination.  %xmm0 stays
 * all-zero while no null byte is seen (pcmpeqb with no match yields 0).
 */
	.p2align 4
LABEL(ashr_6):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count used up within this chunk? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_6_use_sse2)

	.p2align 4
LABEL(ashr_6_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* fetch next aligned src chunk */
	pcmpeqb	%xmm3, %xmm0			/* scan it for a null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-assembled palignr (for assemblers without SSSE3 support) */
	#palignr $6, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x06

	movdqa	%xmm3, (%rdi, %rcx)		/* store realigned 16 bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $6, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x06

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_6_use_ssse3)

	/* SSE2 fallback: same merge done with byte shifts and por */
	.p2align 4
LABEL(ashr_6_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* scan next aligned chunk for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$6, %xmm2			/* tail of current chunk -> low bytes */
	pslldq	$10, %xmm3			/* head of next chunk -> high bytes */
	por	%xmm2, %xmm3			/* merged, destination-aligned chunk */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$6, %xmm2
	pslldq	$10, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_6_use_sse2)


/*
 * ashr_5 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 5
 *
 * Based on the setup above there is no null byte from (%r9 + %rsi) to the
 * end of the current aligned 16-byte source chunk.  Each iteration merges
 * the tail of one aligned source chunk with the head of the next and
 * stores one aligned 16-byte chunk to the destination.  %xmm0 stays
 * all-zero while no null byte is seen (pcmpeqb with no match yields 0).
 */
	.p2align 4
LABEL(ashr_5):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count used up within this chunk? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_5_use_sse2)

	.p2align 4
LABEL(ashr_5_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* fetch next aligned src chunk */
	pcmpeqb	%xmm3, %xmm0			/* scan it for a null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-assembled palignr (for assemblers without SSSE3 support) */
	#palignr $5, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x05

	movdqa	%xmm3, (%rdi, %rcx)		/* store realigned 16 bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $5, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x05

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_5_use_ssse3)

	/* SSE2 fallback: same merge done with byte shifts and por */
	.p2align 4
LABEL(ashr_5_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* scan next aligned chunk for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$5, %xmm2			/* tail of current chunk -> low bytes */
	pslldq	$11, %xmm3			/* head of next chunk -> high bytes */
	por	%xmm2, %xmm3			/* merged, destination-aligned chunk */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$5, %xmm2
	pslldq	$11, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_5_use_sse2)


/*
 * ashr_4 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 4
 *
 * Based on the setup above there is no null byte from (%r9 + %rsi) to the
 * end of the current aligned 16-byte source chunk.  Each iteration merges
 * the tail of one aligned source chunk with the head of the next and
 * stores one aligned 16-byte chunk to the destination.  %xmm0 stays
 * all-zero while no null byte is seen (pcmpeqb with no match yields 0).
 */
	.p2align 4
LABEL(ashr_4):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count used up within this chunk? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_4_use_sse2)

	.p2align 4
LABEL(ashr_4_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* fetch next aligned src chunk */
	pcmpeqb	%xmm3, %xmm0			/* scan it for a null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-assembled palignr (for assemblers without SSSE3 support) */
	#palignr $4, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x04

	movdqa	%xmm3, (%rdi, %rcx)		/* store realigned 16 bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $4, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x04

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_4_use_ssse3)

	/* SSE2 fallback: same merge done with byte shifts and por */
	.p2align 4
LABEL(ashr_4_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* scan next aligned chunk for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$4, %xmm2			/* tail of current chunk -> low bytes */
	pslldq	$12, %xmm3			/* head of next chunk -> high bytes */
	por	%xmm2, %xmm3			/* merged, destination-aligned chunk */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second copy of the unrolled loop body */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$4, %xmm2
	pslldq	$12, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_4_use_sse2)


/*
 * ashr_3 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 3
 *
 * That is, the bytes from (%rsi + %r9) up to the start of the current
 * 16-byte source chunk are already known to contain no null byte.
 */
	.p2align 4
LABEL(ashr_3):
	/* Same structure as ashr_4 above, with a shift amount of 3. */
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count nearly exhausted? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_3_use_sse2)

	.p2align 4
LABEL(ashr_3_use_ssse3):
	/* two-way unrolled loop; each half realigns and stores 16 bytes */
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned 16 src bytes */
	pcmpeqb	%xmm3, %xmm0			/* 0xff marks each null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)		/* null found: byte-exact tail */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)	/* count exhausted */
#endif

	/* palignr is hand-encoded below for pre-SSSE3 assemblers */
	#palignr $3, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x03

	movdqa	%xmm3, (%rdi, %rcx)		/* store 16 realigned bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second unrolled iteration, identical to the first */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $3, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x03

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_3_use_ssse3)

	.p2align 4
LABEL(ashr_3_use_sse2):
	/* SSE2 fallback: emulate palignr with psrldq/pslldq/por. */
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* null in next 16 src bytes? */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$3, %xmm2			/* low part: current chunk >> 3 */
	pslldq	$13, %xmm3			/* high part: next chunk << 13 */
	por	%xmm2, %xmm3			/* merged, realigned 16 bytes */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second unrolled iteration */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$3, %xmm2
	pslldq	$13, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_3_use_sse2)


/*
 * ashr_2 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 2
 *
 * That is, the bytes from (%rsi + %r9) up to the start of the current
 * 16-byte source chunk are already known to contain no null byte.
 */
	.p2align 4
LABEL(ashr_2):
	/* Same structure as ashr_4 above, with a shift amount of 2. */
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count nearly exhausted? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_2_use_sse2)

	.p2align 4
LABEL(ashr_2_use_ssse3):
	/* two-way unrolled loop; each half realigns and stores 16 bytes */
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned 16 src bytes */
	pcmpeqb	%xmm3, %xmm0			/* 0xff marks each null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)		/* null found: byte-exact tail */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)	/* count exhausted */
#endif

	/* palignr is hand-encoded below for pre-SSSE3 assemblers */
	#palignr $2, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x02

	movdqa	%xmm3, (%rdi, %rcx)		/* store 16 realigned bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second unrolled iteration, identical to the first */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $2, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x02

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_2_use_ssse3)

	.p2align 4
LABEL(ashr_2_use_sse2):
	/* SSE2 fallback: emulate palignr with psrldq/pslldq/por. */
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* null in next 16 src bytes? */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$2, %xmm2			/* low part: current chunk >> 2 */
	pslldq	$14, %xmm3			/* high part: next chunk << 14 */
	por	%xmm2, %xmm3			/* merged, realigned 16 bytes */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second unrolled iteration */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$2, %xmm2
	pslldq	$14, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_2_use_sse2)


/*
 * ashr_1 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 1
 *
 * That is, the bytes from (%rsi + %r9) up to the start of the current
 * 16-byte source chunk are already known to contain no null byte.
 */
	.p2align 4
LABEL(ashr_1):
	/* Same structure as ashr_4 above, with a shift amount of 1. */
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count nearly exhausted? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_1_use_sse2)

	.p2align 4
LABEL(ashr_1_use_ssse3):
	/* two-way unrolled loop; each half realigns and stores 16 bytes */
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned 16 src bytes */
	pcmpeqb	%xmm3, %xmm0			/* 0xff marks each null byte */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)		/* null found: byte-exact tail */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)	/* count exhausted */
#endif

	/* palignr is hand-encoded below for pre-SSSE3 assemblers */
	#palignr $1, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x01

	movdqa	%xmm3, (%rdi, %rcx)		/* store 16 realigned bytes */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second unrolled iteration, identical to the first */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif
	#palignr $1, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x01

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_1_use_ssse3)

	.p2align 4
LABEL(ashr_1_use_sse2):
	/* SSE2 fallback: emulate palignr with psrldq/pslldq/por. */
	pcmpeqb 16(%rsi, %rcx), %xmm0		/* null in next 16 src bytes? */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$1, %xmm2			/* low part: current chunk >> 1 */
	pslldq	$15, %xmm3			/* high part: next chunk << 15 */
	por	%xmm2, %xmm3			/* merged, realigned 16 bytes */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second unrolled iteration */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
 	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$1, %xmm2
	pslldq	$15, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_1_use_sse2)


	/*
	 * Exit tail code:
	 * Up to 32 bytes are copied in the case of strcpy.
	 */
	.p2align 4
LABEL(less32bytes):
	xor	%ecx, %ecx
LABEL(unaligned_exit):
	/*
	 * Reached from the ashr_* loops.  %edx holds the pmovmskb null
	 * bitmask for the aligned source chunk (it may be 0 when strncpy
	 * exits on count alone).  The shift by %r10 below presumably lines
	 * the mask bits up with destination byte positions (%r10 is the
	 * src/dst offset delta) — confirm against the ashr dispatch code.
	 */
	add	%r9, %rsi		/* r9 holds offset of rsi */
	mov	%rcx, %r9		/* save chunk index across the shift */
	mov	%r10, %rcx
	shl	%cl, %edx		/* after shl, calculate the exact number to be filled */
	mov	%r9, %rcx		/* restore chunk index */
	.p2align 4
LABEL(aligned_exit):
	add	%rcx, %rdi		/* locate exact address for rdi */
LABEL(less16bytes):
	add	%rcx, %rsi		/* locate exact address for rsi */
LABEL(aligned_16bytes):
#ifdef USE_AS_STRNCPY
	/*
	 * Null found in 16bytes checked. Set bit in bitmask corresponding to
	 * the strncpy count argument. We will copy to the null (inclusive)
	 * or count whichever comes first.
	 */
	mov	$1, %r9d
	lea	-1(%r8), %rcx
	shl	%cl, %r9d		/* shift is mod 32: valid only for r8 <= 32 */
	cmp	$32, %r8
	ja	LABEL(strncpy_tail)	/* count beyond this tail; no count bit */
	or	%r9d, %edx
LABEL(strncpy_tail):
#endif
	/*
	 * Check to see if BSF is fast on this processor. If not, use a
	 * different exit tail.
	 */
	testb	$USE_BSF, .memops_method(%rip)
	jz	LABEL(AMD_exit)
	bsf	%rdx, %rcx		/* Find byte with null char */
	lea	LABEL(tail_table)(%rip), %r11
	movslq	(%r11, %rcx, 4), %rcx	/* table holds 32-bit self-relative offsets */
	lea	(%r11, %rcx), %rcx
	jmp	*%rcx			/* tail_N copies N+1 bytes */

#ifdef USE_AS_STRNCPY
	/*
	 * Count reached before null found.
	 */
	.p2align 4
LABEL(less32bytes_strncpy_truncation):
	xor	%ecx, %ecx
LABEL(strncpy_truncation_unaligned):
	add	%r9, %rsi		/* next src char to copy */
LABEL(strncpy_truncation_aligned):
	/*
	 * The copy loops subtracted 16 from %r8 and it went non-positive;
	 * adding 16 back yields the remaining count (1..16).  Jump through
	 * tail_table[r8-1] to copy exactly r8 bytes.  No null byte was
	 * seen, so the destination is not null-terminated — standard
	 * strncpy truncation semantics.
	 */
	add	%rcx, %rdi
	add	%rcx, %rsi
	add	$16, %r8		/* compensation */
	lea	-1(%r8), %rcx
	lea	LABEL(tail_table)(%rip), %r11
	movslq	(%r11, %rcx, 4), %rcx
	lea	(%r11, %rcx), %rcx
	jmp	*%rcx

	.p2align 4
LABEL(strncpy_exitz):
	/* count was zero: copy nothing, return the destination pointer */
	mov	%rdi, %rax
	ret
#endif

	.p2align 4
LABEL(AMD_exit):
	/*
	 * Exit tail for processors where BSF is slow: locate the lowest
	 * set bit of %edx (first null or count-limit byte) with a test
	 * chain instead.  %dl covers bytes 0-7; falls through to tail_7
	 * when bit 7 is the first one set.
	 */
	test	%dl, %dl
	jz	LABEL(AMD_exit_more_8)	/* first set bit is above byte 7 */
	test	$0x01, %dl
	jnz	LABEL(tail_0)
	test	$0x02, %dl
	jnz	LABEL(tail_1)
	test	$0x04, %dl
	jnz	LABEL(tail_2)
	test	$0x08, %dl
	jnz	LABEL(tail_3)
	test	$0x10, %dl
	jnz	LABEL(tail_4)
	test	$0x20, %dl
	jnz	LABEL(tail_5)
	test	$0x40, %dl
	jnz	LABEL(tail_6)

	.p2align 4
LABEL(tail_7):				/* 8 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
#ifdef USE_AS_STRNCPY
	mov	$8, %cl			/* bytes consumed */
	sub	$8, %r8
	jnz	LABEL(strncpy_fill_tail)	/* zero-fill remainder */
#endif
	ret

#ifdef USE_AS_STRNCPY
	/*
	 * Null terminated src string shorter than count. Fill the rest of the
	 * destination with null chars.
	 * On entry: %cl = bytes already copied, %r8 = bytes left to fill,
	 * %rax = return value (original dest).
	 */
	.p2align 4
LABEL(strncpy_fill_tail):
	mov	%rax, %rdx		/* preserve return value */
	movzx	%cl, %rax		/* rax = bytes already copied */
	mov	%r8, %rcx
	add	%rax, %rdi		/* rdi = first byte to zero */
	xor	%eax, %eax		/* fill value = 0 */
	shr	$3, %ecx		/* qword count; NOTE(review): 32-bit op
					   truncates counts >= 2^35 bytes —
					   confirm n is bounded by callers */
	jz	LABEL(strncpy_fill_less_8)

	rep	stosq			/* zero 8 bytes at a time */
LABEL(strncpy_fill_less_8):
	mov	%r8, %rcx
	and	$7, %rcx		/* 0..7 leftover bytes */
	jz	LABEL(strncpy_fill_return)
LABEL(strncpy_fill_less_7):
	sub	$1, %ecx
	mov	%al, (%rdi, %rcx)	/* zero leftovers back-to-front */
	jnz	LABEL(strncpy_fill_less_7)
LABEL(strncpy_fill_return):
	mov	%rdx, %rax		/* return original dest */
	ret
#endif

	/*
	 * tail_N stubs: copy exactly N+1 bytes from %rsi to %rdi, using
	 * overlapping loads/stores to avoid byte loops.  For strncpy,
	 * %cl records the bytes consumed; if the count is not yet
	 * exhausted the remainder of the destination is zero-filled.
	 */
	.p2align 4
LABEL(tail_0):				/* 1 byte */
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
#ifdef USE_AS_STRNCPY
	mov	$1, %cl
	sub	$1, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_1):				/* 2 bytes */
	mov	(%rsi), %cx
	mov	%cx, (%rdi)
#ifdef USE_AS_STRNCPY
	mov	$2, %cl
	sub	$2, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_2):				/* 3 bytes */
	mov	(%rsi), %cx		/* two overlapping word stores */
	mov	%cx, (%rdi)
	mov	1(%rsi), %cx
	mov	%cx, 1(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$3, %cl
	sub	$3, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_3):				/* 4 bytes */
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
#ifdef USE_AS_STRNCPY
	mov	$4, %cl
	sub	$4, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_4):				/* 5 bytes */
	mov	(%rsi), %ecx		/* two overlapping dword stores */
	mov	%ecx, (%rdi)
	mov	1(%rsi), %edx
	mov	%edx, 1(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$5, %cl
	sub	$5, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_5):				/* 6 bytes */
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	2(%rsi), %edx
	mov	%edx, 2(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$6, %cl
	sub	$6, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_6):				/* 7 bytes */
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	3(%rsi), %edx
	mov	%edx,3(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$7, %cl
	sub	$7, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_8):				/* 9 bytes */
	mov	(%rsi), %rcx		/* qword + overlapping dword store */
	mov	%rcx, (%rdi)
	mov	5(%rsi), %edx
	mov	%edx, 5(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$9, %cl
	sub	$9, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(AMD_exit_more_8):
	/* %dh covers bytes 8-15; falls through to tail_15 (16 bytes) */
	test	%dh, %dh
	jz	LABEL(AMD_exit_more_16)	/* first set bit is above byte 15 */
	test	$0x01, %dh
	jnz	LABEL(tail_8)
	test	$0x02, %dh
	jnz	LABEL(tail_9)
	test	$0x04, %dh
	jnz	LABEL(tail_10)
	test	$0x08, %dh
	jnz	LABEL(tail_11)
	test	$0x10, %dh
	jnz	LABEL(tail_12)
	test	$0x20, %dh
	jnz	LABEL(tail_13)
	test	$0x40, %dh
	jnz	LABEL(tail_14)

	.p2align 4
LABEL(tail_15):				/* 16 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$16, %cl		/* bytes consumed */
	sub	$16, %r8
	jnz	LABEL(strncpy_fill_tail)	/* zero-fill remainder */
#endif
	ret

	/*
	 * 10..15-byte tails: one qword store plus an overlapping dword or
	 * qword store; see the comment above tail_0 for the strncpy part.
	 */
	.p2align 4
LABEL(tail_9):				/* 10 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	6(%rsi), %edx
	mov	%edx, 6(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$10, %cl
	sub	$10, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_10):				/* 11 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	7(%rsi), %edx
	mov	%edx, 7(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$11, %cl
	sub	$11, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_11):				/* 12 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %edx
	mov	%edx, 8(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$12, %cl
	sub	$12, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_12):				/* 13 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	5(%rsi), %rcx
	mov	%rcx, 5(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$13, %cl
	sub	$13, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_13):				/* 14 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	6(%rsi), %rcx
	mov	%rcx, 6(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$14, %cl
	sub	$14, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_14):				/* 15 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	7(%rsi), %rcx
	mov	%rcx, 7(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$15, %cl
	sub	$15, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(AMD_exit_more_16):
	/*
	 * Shift bits 16-31 down; %dl now covers bytes 16-23.  Falls
	 * through to tail_23 (24 bytes).
	 */
	shr	$16, %edx
	test	%dl, %dl
	jz	LABEL(AMD_exit_more_24)	/* first set bit is above byte 23 */
	test	$0x01, %dl
	jnz	LABEL(tail_16)
	test	$0x02, %dl
	jnz	LABEL(tail_17)
	test	$0x04, %dl
	jnz	LABEL(tail_18)
	test	$0x08, %dl
	jnz	LABEL(tail_19)
	test	$0x10, %dl
	jnz	LABEL(tail_20)
	test	$0x20, %dl
	jnz	LABEL(tail_21)
	test	$0x40, %dl
	jnz	LABEL(tail_22)

	.p2align 4
LABEL(tail_23):				/* 24 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$24, %cl		/* bytes consumed */
	sub	$24, %r8
	jnz	LABEL(strncpy_fill_tail)	/* zero-fill remainder */
#endif
	ret

	/*
	 * 17..23-byte tails: two qword stores plus an overlapping trailing
	 * store; see the comment above tail_0 for the strncpy part.
	 */
	.p2align 4
LABEL(tail_16):				/* 17 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %cl
	mov	%cl, 16(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$17, %cl
	sub	$17, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_17):				/* 18 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %cx
	mov	%cx, 16(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$18, %cl
	sub	$18, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_18):				/* 19 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	15(%rsi), %ecx		/* overlapping dword covers 15-18 */
	mov	%ecx,15(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$19, %cl
	sub	$19, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_19):				/* 20 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %ecx
	mov	%ecx, 16(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$20, %cl
	sub	$20, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_20):				/* 21 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	13(%rsi), %rcx		/* overlapping qword covers 13-20 */
	mov	%rcx, 13(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$21, %cl
	sub	$21, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_21):				/* 22 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	14(%rsi), %rcx
	mov	%rcx, 14(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$22, %cl
	sub	$22, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_22):				/* 23 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	15(%rsi), %rcx
	mov	%rcx, 15(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$23, %cl
	sub	$23, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(AMD_exit_more_24):
	/*
	 * After the shr $16 in AMD_exit_more_16, %dh covers bytes 24-31.
	 * Falls through to tail_31 (32 bytes).
	 */
	test	$0x01, %dh
	jnz	LABEL(tail_24)
	test	$0x02, %dh
	jnz	LABEL(tail_25)
	test	$0x04, %dh
	jnz	LABEL(tail_26)
	test	$0x08, %dh
	jnz	LABEL(tail_27)
	test	$0x10, %dh
	jnz	LABEL(tail_28)
	test	$0x20, %dh
	jnz	LABEL(tail_29)
	test	$0x40, %dh
	jnz	LABEL(tail_30)

	.p2align 4
LABEL(tail_31):				/* 32 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	24(%rsi), %rdx
	mov	%rdx, 24(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$32, %cl		/* bytes consumed */
	sub	$32, %r8
	jnz	LABEL(strncpy_fill_tail)	/* zero-fill remainder */
#endif
	ret

	/*
	 * 25..31-byte tails: three qword stores plus an overlapping
	 * trailing store; see the comment above tail_0 for the strncpy
	 * part.
	 */
	.p2align 4
LABEL(tail_24):				/* 25 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	21(%rsi), %edx		/* overlapping dword covers 21-24 */
	mov	%edx, 21(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$25, %cl
	sub	$25, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_25):				/* 26 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	22(%rsi), %edx
	mov	%edx, 22(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$26, %cl
	sub	$26, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_26):				/* 27 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	23(%rsi), %edx
	mov	%edx, 23(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$27, %cl
	sub	$27, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_27):				/* 28 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	24(%rsi), %edx
	mov	%edx, 24(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$28, %cl
	sub	$28, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_28):				/* 29 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	21(%rsi), %rdx		/* overlapping qword covers 21-28 */
	mov	%rdx, 21(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$29, %cl
	sub	$29, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_29):				/* 30 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	22(%rsi), %rdx
	mov	%rdx, 22(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$30, %cl
	sub	$30, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_30):				/* 31 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	23(%rsi), %rdx
	mov	%rdx, 23(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$31, %cl
	sub	$31, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.pushsection .rodata
	.p2align 4
LABEL(tail_table):
	/*
	 * 32-bit offsets of the tail_N copy stubs, relative to tail_table;
	 * entry N copies N+1 bytes.  Indexed by bsf of the null/count
	 * bitmask, or by (remaining count - 1) on the truncation path.
	 */
	.int	LABEL(tail_0) - LABEL(tail_table)	/* 1 byte */
	.int	LABEL(tail_1) - LABEL(tail_table)
	.int	LABEL(tail_2) - LABEL(tail_table)
	.int	LABEL(tail_3) - LABEL(tail_table)
	.int	LABEL(tail_4) - LABEL(tail_table)
	.int	LABEL(tail_5) - LABEL(tail_table)
	.int	LABEL(tail_6) - LABEL(tail_table)
	.int	LABEL(tail_7) - LABEL(tail_table)
	.int	LABEL(tail_8) - LABEL(tail_table)
	.int	LABEL(tail_9) - LABEL(tail_table)
	.int	LABEL(tail_10) - LABEL(tail_table)
	.int	LABEL(tail_11) - LABEL(tail_table)
	.int	LABEL(tail_12) - LABEL(tail_table)
	.int	LABEL(tail_13) - LABEL(tail_table)
	.int	LABEL(tail_14) - LABEL(tail_table)
	.int	LABEL(tail_15) - LABEL(tail_table)
	.int	LABEL(tail_16) - LABEL(tail_table)
	.int	LABEL(tail_17) - LABEL(tail_table)
	.int	LABEL(tail_18) - LABEL(tail_table)
	.int	LABEL(tail_19) - LABEL(tail_table)
	.int	LABEL(tail_20) - LABEL(tail_table)
	.int	LABEL(tail_21) - LABEL(tail_table)
	.int	LABEL(tail_22) - LABEL(tail_table)
	.int	LABEL(tail_23) - LABEL(tail_table)
	.int	LABEL(tail_24) - LABEL(tail_table)
	.int	LABEL(tail_25) - LABEL(tail_table)
	.int	LABEL(tail_26) - LABEL(tail_table)
	.int	LABEL(tail_27) - LABEL(tail_table)
	.int	LABEL(tail_28) - LABEL(tail_table)
	.int	LABEL(tail_29) - LABEL(tail_table)
	.int	LABEL(tail_30) - LABEL(tail_table)
	.int	LABEL(tail_31) - LABEL(tail_table)	/* 32 bytes */

	.p2align 4
LABEL(unaligned_table):
	/*
	 * Dispatch table for the unaligned copy loops; indexed by
	 * (16 + src offset - dest offset) % 16.  Entries are 32-bit
	 * offsets relative to unaligned_table.
	 */
	.int	LABEL(ashr_0) - LABEL(unaligned_table)
	.int	LABEL(ashr_1) - LABEL(unaligned_table)
	.int	LABEL(ashr_2) - LABEL(unaligned_table)
	.int	LABEL(ashr_3) - LABEL(unaligned_table)
	.int	LABEL(ashr_4) - LABEL(unaligned_table)
	.int	LABEL(ashr_5) - LABEL(unaligned_table)
	.int	LABEL(ashr_6) - LABEL(unaligned_table)
	.int	LABEL(ashr_7) - LABEL(unaligned_table)
	.int	LABEL(ashr_8) - LABEL(unaligned_table)
	.int	LABEL(ashr_9) - LABEL(unaligned_table)
	.int	LABEL(ashr_10) - LABEL(unaligned_table)
	.int	LABEL(ashr_11) - LABEL(unaligned_table)
	.int	LABEL(ashr_12) - LABEL(unaligned_table)
	.int	LABEL(ashr_13) - LABEL(unaligned_table)
	.int	LABEL(ashr_14) - LABEL(unaligned_table)
	.int	LABEL(ashr_15) - LABEL(unaligned_table)
	.popsection

#ifdef USE_AS_STRNCPY
	/* emit ELF symbol-size information for the entry point */
	SET_SIZE(strncpy)
#else
	SET_SIZE(strcpy)			/* (char *, const char *) */
#endif