//go:build amd64
// +build amd64

#include "textflag.h"

// func siluAVX2Asm(x *float32, n int)
//
// In-place SiLU over a float32 buffer: x[i] = x[i] * 1/(1 + exp(-x[i])).
//
// Syntax/ABI: Go assembler (Plan 9 operand order, destination last),
// stack-based ABI0; frame $0, 16 bytes of arguments.
//
// In:
//   x+0(FP)  pointer to the first float32 (loaded and stored unaligned)
//   n+8(FP)  element count
//
// Only complete groups of 8 floats are processed. The trailing n%8
// elements are left untouched by this routine.
// NOTE(review): presumably the Go caller handles that tail with scalar
// code - confirm against the call site.
//
// exp(-x) is evaluated with a range-reduced polynomial:
//   t = clamp(-x, expLo, expHi)
//   k = floor(t*log2(e) + 0.5)
//   r = t - k*C1 - k*C2                 (two-step subtraction of k*ln2)
//   p = 1 + r + r^2 * (((((P0*r + P1)*r + P2)*r + P3)*r + P4)*r + P5)
//   exp(t) = p * 2^k, with 2^k built as the bit pattern (k + 127) << 23
//
// Register roles:
//   DI  = cursor into x            CX  = elements remaining
//   Y0  = original x               Y1  = t, then reduced r
//   Y2  = k as float (scratch)     Y3  = scratch / r^2 / denominator
//   Y4  = polynomial accumulator   Y5  = scratch (k as float, coefficients)
//   Y6  = k as int32, then 2^k     Y7  = exponent bias (127)
//   Y8  = 1.0    Y9  = C2    Y10 = C1    Y11 = 0.5
//   Y12 = log2(e)   Y13 = expLo   Y14 = expHi   Y15 = sign-bit mask
//
// Clobbers: DI, CX, Y0-Y15, flags.
TEXT ·siluAVX2Asm(SB), NOSPLIT, $0-16
	MOVQ x+0(FP), DI
	MOVQ n+8(FP), CX
	CMPQ CX, $0
	JLE  exit                              // n <= 0: return before any AVX instruction runs

	// Loop-invariant constants, broadcast to all 8 lanes once.
	VBROADCASTSS ·expHi(SB), Y14
	VBROADCASTSS ·expLo(SB), Y13
	VBROADCASTSS ·log2EF(SB), Y12
	VBROADCASTSS ·halfConst(SB), Y11
	VBROADCASTSS ·expC1(SB), Y10
	VBROADCASTSS ·expC2(SB), Y9
	VBROADCASTSS ·oneConst(SB), Y8
	VPBROADCASTD ·signMaskConst(SB), Y15
	VPBROADCASTD ·expBiasConst(SB), Y7     // hoisted: Y7 is otherwise unused

loop:
	// Invariant: CX = elements not yet processed, DI points at the next one.
	CMPQ CX, $8
	JL   done

	VMOVUPS (DI), Y0                       // Y0 = x (kept for the final multiply)
	VMOVAPS Y0, Y1
	VXORPS  Y15, Y1, Y1                    // t = -x (flip sign bit)
	VMINPS  Y14, Y1, Y1                    // t = min(t, expHi)
	VMAXPS  Y13, Y1, Y1                    // t = max(t, expLo)

	// k = floor(t*log2(e) + 0.5)
	VMULPS    Y12, Y1, Y2
	VADDPS    Y11, Y2, Y2
	VROUNDPS  $1, Y2, Y2                   // imm 1 = round toward -inf
	VCVTPS2DQ Y2, Y6                       // Y6 = k as int32
	VCVTDQ2PS Y6, Y5                       // Y5 = k as float

	// r = t - k*C1 - k*C2
	VMULPS Y10, Y5, Y3
	VSUBPS Y3, Y1, Y1
	VMULPS Y9, Y5, Y3
	VSUBPS Y3, Y1, Y1

	VMULPS Y1, Y1, Y3                      // Y3 = r^2

	// Horner evaluation of the degree-5 polynomial in r. The coefficients
	// are re-broadcast each iteration because no YMM register is left to
	// keep them resident.
	VBROADCASTSS ·polyP0(SB), Y4
	VMULPS       Y1, Y4, Y4
	VBROADCASTSS ·polyP1(SB), Y5
	VADDPS       Y5, Y4, Y4
	VMULPS       Y1, Y4, Y4
	VBROADCASTSS ·polyP2(SB), Y5
	VADDPS       Y5, Y4, Y4
	VMULPS       Y1, Y4, Y4
	VBROADCASTSS ·polyP3(SB), Y5
	VADDPS       Y5, Y4, Y4
	VMULPS       Y1, Y4, Y4
	VBROADCASTSS ·polyP4(SB), Y5
	VADDPS       Y5, Y4, Y4
	VMULPS       Y1, Y4, Y4
	VBROADCASTSS ·polyP5(SB), Y5
	VADDPS       Y5, Y4, Y4
	VMULPS       Y3, Y4, Y4                // poly * r^2
	VADDPS       Y1, Y4, Y4                // + r
	VADDPS       Y8, Y4, Y4                // + 1

	// Scale by 2^k: (k + 127) << 23 is the float32 bit pattern of 2^k.
	VPADDD Y7, Y6, Y6
	VPSLLD $23, Y6, Y6
	VMULPS Y6, Y4, Y4                      // Y4 = exp(-x)

	VADDPS Y8, Y4, Y3                      // denom = 1 + exp(-x)
	VDIVPS Y3, Y8, Y3                      // sigmoid = 1 / denom
	VMULPS Y0, Y3, Y0                      // x * sigmoid(x)
	VMOVUPS Y0, (DI)                       // store result over the input

	ADDQ $32, DI                           // 8 floats * 4 bytes
	SUBQ $8, CX
	JMP  loop

done:
	// Clear the dirty upper YMM halves so the SSE-encoded Go code we return
	// to does not pay AVX/SSE transition or false-dependency penalties.
	VZEROUPPER

exit:
	RET

// Constants: one float32 (or int32) each, broadcast at run time.

// Clamp bounds for the exp argument (about +/-88.376).
DATA ·expHi+0(SB)/4, $0x42b0c0a5
GLOBL ·expHi(SB), RODATA, $4

DATA ·expLo+0(SB)/4, $0xc2b0c0a5
GLOBL ·expLo(SB), RODATA, $4

// log2(e), about 1.442695.
DATA ·log2EF+0(SB)/4, $0x3fb8aa3b
GLOBL ·log2EF(SB), RODATA, $4

// 0.5
DATA ·halfConst+0(SB)/4, $0x3f000000
GLOBL ·halfConst(SB), RODATA, $4

// ln(2) split into a high part (0.693359375) and a small correction
// (about -2.1219e-4) for the two-step range reduction.
DATA ·expC1+0(SB)/4, $0x3f318000
GLOBL ·expC1(SB), RODATA, $4

DATA ·expC2+0(SB)/4, $0xb95e8083
GLOBL ·expC2(SB), RODATA, $4

// Polynomial coefficients P0..P5, highest-order term first.
DATA ·polyP0+0(SB)/4, $0x39506967
GLOBL ·polyP0(SB), RODATA, $4

DATA ·polyP1+0(SB)/4, $0x3ab743ce
GLOBL ·polyP1(SB), RODATA, $4

DATA ·polyP2+0(SB)/4, $0x3c088908
GLOBL ·polyP2(SB), RODATA, $4

DATA ·polyP3+0(SB)/4, $0x3d2aa9c1
GLOBL ·polyP3(SB), RODATA, $4

DATA ·polyP4+0(SB)/4, $0x3e2aaaaa
GLOBL ·polyP4(SB), RODATA, $4

DATA ·polyP5+0(SB)/4, $0x3f000000
GLOBL ·polyP5(SB), RODATA, $4

// 1.0
DATA ·oneConst+0(SB)/4, $0x3f800000
GLOBL ·oneConst(SB), RODATA, $4

// IEEE-754 single-precision sign bit; XOR with it negates a float.
DATA ·signMaskConst+0(SB)/4, $0x80000000
GLOBL ·signMaskConst(SB), RODATA, $4

// float32 exponent bias (127) as an int32.
DATA ·expBiasConst+0(SB)/4, $0x0000007f
GLOBL ·expBiasConst(SB), RODATA, $4