/*********************************************************************************
 * libs/libc/machine/x86_64/gnu/arch_memset_avx2.S
 *
 * Copyright (c) 2014, Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *  * this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above copyright notice,
 *  * this list of conditions and the following disclaimer in the documentation
 *  * and/or other materials provided with the distribution.
 *
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *  * may be used to endorse or promote products derived from this software
 *  * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *********************************************************************************/

/*********************************************************************************
 * Included Files
 *********************************************************************************/

#include "cache.h"

/*********************************************************************************
 * Pre-processor Definitions
 *********************************************************************************/

#ifndef L
#  define L(label) .L##label
#endif

#ifndef ALIGN
#  define ALIGN(n) .p2align n
#endif

#define ENTRY(__f)        \
  .text;                  \
  .global __f;            \
  .balign 16;             \
  .type __f, @function;   \
__f:                      \
  .cfi_startproc;

#define END(__f)          \
  .cfi_endproc;           \
  .size __f, .- __f;

/*********************************************************************************
 * Public Functions
 *********************************************************************************/

  .section .text.avx2,"ax",@progbits

ENTRY(memset)
  /* Return value is the destination pointer; replicate the fill byte
   * into all eight bytes of %rcx (0x0101010101010101 * byte value).
   */

  movq  %rdi, %rax
  and   $0xff, %rsi
  mov   $0x0101010101010101, %rcx
  imul  %rsi, %rcx

  /* Requests shorter than 16 bytes: dispatch on the size bits and
   * fill with overlapping stores of the matching width.
   */

  cmpq  $16, %rdx
  jae   L(16bytesormore)
  testb $8, %dl
  jnz   L(8_15bytes)
  testb $4, %dl
  jnz   L(4_7bytes)
  testb $2, %dl
  jnz   L(2_3bytes)
  testb $1, %dl
  jz    1f
  movb  %cl, (%rdi)
1:
  ret

L(8_15bytes):
  movq  %rcx, (%rdi)
  movq  %rcx, -8(%rdi, %rdx)
  ret

L(4_7bytes):
  movl  %ecx, (%rdi)
  movl  %ecx, -4(%rdi, %rdx)
  ret

L(2_3bytes):
  movw  %cx, (%rdi)
  movw  %cx, -2(%rdi, %rdx)
  ret

  ALIGN (4)

  /* 16..256 bytes: broadcast the pattern into %xmm0/%ymm0 and cover
   * the buffer with overlapping unaligned stores from both ends.
   */

L(16bytesormore):
  movd   %rcx, %xmm0
  pshufd $0, %xmm0, %xmm0
  movdqu %xmm0, (%rdi)
  movdqu %xmm0, -16(%rdi, %rdx)
  cmpq   $32, %rdx
  jbe    L(done)
  movdqu %xmm0, 16(%rdi)
  movdqu %xmm0, -32(%rdi, %rdx)
  cmpq   $64, %rdx
  jbe    L(done)
  movdqu %xmm0, 32(%rdi)
  movdqu %xmm0, 48(%rdi)
  movdqu %xmm0, -64(%rdi, %rdx)
  movdqu %xmm0, -48(%rdi, %rdx)
  cmpq   $128, %rdx
  jbe    L(done)
  vpbroadcastb %xmm0, %ymm0
  vmovdqu %ymm0, 64(%rdi)
  vmovdqu %ymm0, 96(%rdi)
  vmovdqu %ymm0, -128(%rdi, %rdx)
  vmovdqu %ymm0, -96(%rdi, %rdx)
  cmpq   $256, %rdx
  jbe    L(done)

  ALIGN (4)

  /* More than 256 bytes: the 128-byte head and tail are already
   * filled above, so compute %rcx = first 128-byte boundary past the
   * start and %rdx = 128-byte-aligned end, then fill the middle in
   * 128-byte blocks.
   */

  leaq  128(%rdi), %rcx
  andq  $-128, %rcx
  movq  %rdx, %r8
  addq  %rdi, %rdx
  andq  $-128, %rdx
  cmpq  %rcx, %rdx
  je    L(done)

  /* %r8 still holds the original size: fills larger than the shared
   * cache use non-temporal stores so the buffer does not evict the
   * entire cache.
   */

#ifdef SHARED_CACHE_SIZE
  cmp   $SHARED_CACHE_SIZE, %r8
#else
  cmp   __x86_64_shared_cache_size(%rip), %r8
#endif
  ja    L(non_temporal_loop)

  ALIGN (4)
L(normal_loop):
  vmovdqa %ymm0, (%rcx)
  vmovdqa %ymm0, 32(%rcx)
  vmovdqa %ymm0, 64(%rcx)
  vmovdqa %ymm0, 96(%rcx)
  addq  $128, %rcx
  cmpq  %rcx, %rdx
  jne   L(normal_loop)
  jmp   L(done)

  ALIGN (4)
L(non_temporal_loop):
  movntdq %xmm0, (%rcx)
  movntdq %xmm0, 16(%rcx)
  movntdq %xmm0, 32(%rcx)
  movntdq %xmm0, 48(%rcx)
  movntdq %xmm0, 64(%rcx)
  movntdq %xmm0, 80(%rcx)
  movntdq %xmm0, 96(%rcx)
  movntdq %xmm0, 112(%rcx)
  leaq  128(%rcx), %rcx
  cmpq  %rcx, %rdx
  jne   L(non_temporal_loop)

  /* We used non-temporal stores, so we need a fence here. */

  sfence

L(done):
  /* We used the ymm registers, and that can break SSE2 performance
   * unless you do this.
   */

  vzeroupper
  ret
END(memset)