| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- //go:build amd64
- // +build amd64
- #include "textflag.h"
// func siluAVX2Asm(x *float32, n int)
//
// In-place SiLU over a float32 buffer, eight lanes per iteration:
//
//	x[i] = x[i] * sigmoid(x[i]) = x[i] / (1 + exp(-x[i]))
//
// Only complete groups of 8 elements are processed. The trailing n%8
// elements are left untouched, and n < 8 (including n <= 0) is a no-op.
// NOTE(review): the Go caller presumably finishes that tail itself and
// verifies AVX2 support before calling -- confirm against the wrapper.
//
// exp(t) with t = -x is approximated as 2^k * p(r):
//
//	t = clamp(t, expLo, expHi)
//	k = floor(t*log2EF + 0.5)
//	r = t - k*expC1 - k*expC2
//	p = (((((P0*r + P1)*r + P2)*r + P3)*r + P4)*r + P5)*r*r + r + 1
//
// and 2^k is built directly in the float exponent field: (k + 127) << 23.
//
// Register use:
//
//	DI        pointer to the current group of 8 floats
//	CX        elements remaining
//	Y0        x (kept until the final multiply)
//	Y1        t, then the reduced argument r
//	Y2        k as float
//	Y3        scratch (k*C, r*r, denominator)
//	Y4        polynomial accumulator / exp(-x)
//	Y5        broadcast polynomial coefficient
//	Y6        k as int32, then 2^k
//	Y7        exponent bias (int32)
//	Y8..Y15   1.0, expC2, expC1, 0.5, log2EF, expLo, expHi, sign mask
TEXT ·siluAVX2Asm(SB), NOSPLIT, $0-16
	MOVQ x+0(FP), DI
	MOVQ n+8(FP), CX

	// Fewer than 8 elements (or n <= 0): nothing is processed, so return
	// before any vector register is touched.
	CMPQ CX, $8
	JL   done

	// Loop-invariant constants, broadcast once to all 8 lanes.
	VBROADCASTSS ·expHi(SB), Y14
	VBROADCASTSS ·expLo(SB), Y13
	VBROADCASTSS ·log2EF(SB), Y12
	VBROADCASTSS ·halfConst(SB), Y11
	VBROADCASTSS ·expC1(SB), Y10
	VBROADCASTSS ·expC2(SB), Y9
	VBROADCASTSS ·oneConst(SB), Y8
	VPBROADCASTD ·signMaskConst(SB), Y15
	VPBROADCASTD ·expBiasConst(SB), Y7

loop:
	// Invariant: CX >= 8 here.
	VMOVUPS (DI), Y0                 // Y0 = x (unaligned load)
	VXORPS  Y15, Y0, Y1              // t = -x (flip the sign bit)
	VMINPS  Y14, Y1, Y1              // t = min(t, expHi)
	VMAXPS  Y13, Y1, Y1              // t = max(t, expLo)

	// k = floor(t*log2EF + 0.5)
	VMULPS   Y12, Y1, Y2
	VADDPS   Y11, Y2, Y2
	VROUNDPS $1, Y2, Y2              // imm 1 = round toward -inf (floor)
	VCVTPS2DQ Y2, Y6                 // k as int32, used for the 2^k scale below

	// r = t - k*expC1 - k*expC2. Y2 is already an exact integer-valued
	// float, so it is used directly instead of converting Y6 back.
	VMULPS Y10, Y2, Y3
	VSUBPS Y3, Y1, Y1
	VMULPS Y9, Y2, Y3
	VSUBPS Y3, Y1, Y1
	VMULPS Y1, Y1, Y3                // Y3 = r*r

	// Horner evaluation of the degree-5 polynomial in r.
	VBROADCASTSS ·polyP0(SB), Y4
	VMULPS       Y1, Y4, Y4
	VBROADCASTSS ·polyP1(SB), Y5
	VADDPS       Y5, Y4, Y4
	VMULPS       Y1, Y4, Y4
	VBROADCASTSS ·polyP2(SB), Y5
	VADDPS       Y5, Y4, Y4
	VMULPS       Y1, Y4, Y4
	VBROADCASTSS ·polyP3(SB), Y5
	VADDPS       Y5, Y4, Y4
	VMULPS       Y1, Y4, Y4
	VBROADCASTSS ·polyP4(SB), Y5
	VADDPS       Y5, Y4, Y4
	VMULPS       Y1, Y4, Y4
	VBROADCASTSS ·polyP5(SB), Y5
	VADDPS       Y5, Y4, Y4
	VMULPS       Y3, Y4, Y4          // p *= r*r
	VADDPS       Y1, Y4, Y4          // p += r
	VADDPS       Y8, Y4, Y4          // p += 1

	// exp(-x) = p * 2^k, with 2^k assembled in the exponent field.
	VPADDD Y7, Y6, Y6                // k + bias
	VPSLLD $23, Y6, Y6               // shift into the float32 exponent bits
	VMULPS Y6, Y4, Y4                // Y4 = exp(-x)

	// silu(x) = x * (1 / (1 + exp(-x)))
	VADDPS  Y8, Y4, Y3               // 1 + exp(-x)
	VDIVPS  Y3, Y8, Y3               // sigmoid(x) = 1 / (1 + exp(-x))
	VMULPS  Y0, Y3, Y0               // x * sigmoid(x)
	VMOVUPS Y0, (DI)                 // store back in place

	ADDQ $32, DI                     // advance 8 floats
	SUBQ $8, CX
	CMPQ CX, $8
	JGE  loop

	// Clear the upper YMM halves so that the legacy-SSE code emitted by the
	// Go compiler does not pay AVX/SSE transition penalties after we return.
	VZEROUPPER

done:
	RET
// Read-only scalar constants for siluAVX2Asm. Each symbol is one 32-bit
// value that the routine broadcasts to all lanes. The float values are
// stored as raw IEEE-754 single-precision bit patterns.

// Clamp range applied to the exp() argument.
DATA ·expHi+0(SB)/4, $0x42b0c0a5 // ~ +88.3762665
GLOBL ·expHi(SB), RODATA, $4
DATA ·expLo+0(SB)/4, $0xc2b0c0a5 // ~ -88.3762665
GLOBL ·expLo(SB), RODATA, $4

// Range reduction: k = floor(t*log2EF + 0.5), r = t - k*expC1 - k*expC2.
DATA ·log2EF+0(SB)/4, $0x3fb8aa3b // ~ 1.442695 (log2(e))
GLOBL ·log2EF(SB), RODATA, $4
DATA ·halfConst+0(SB)/4, $0x3f000000 // 0.5
GLOBL ·halfConst(SB), RODATA, $4
DATA ·expC1+0(SB)/4, $0x3f318000 // 0.693359375 (high part of ln 2)
GLOBL ·expC1(SB), RODATA, $4
DATA ·expC2+0(SB)/4, $0xb95e8083 // ~ -2.1219444e-4 (low part of ln 2)
GLOBL ·expC2(SB), RODATA, $4

// Polynomial coefficients, highest-order term first (P0 is applied first
// in the Horner chain).
DATA ·polyP0+0(SB)/4, $0x39506967 // ~ 1.9875692e-4
GLOBL ·polyP0(SB), RODATA, $4
DATA ·polyP1+0(SB)/4, $0x3ab743ce // ~ 1.3982000e-3
GLOBL ·polyP1(SB), RODATA, $4
DATA ·polyP2+0(SB)/4, $0x3c088908 // ~ 8.3334519e-3
GLOBL ·polyP2(SB), RODATA, $4
DATA ·polyP3+0(SB)/4, $0x3d2aa9c1 // ~ 4.1665796e-2
GLOBL ·polyP3(SB), RODATA, $4
DATA ·polyP4+0(SB)/4, $0x3e2aaaaa // ~ 1.6666665e-1
GLOBL ·polyP4(SB), RODATA, $4
DATA ·polyP5+0(SB)/4, $0x3f000000 // 0.5
GLOBL ·polyP5(SB), RODATA, $4

// Miscellaneous: unity, float32 sign bit, and the float32 exponent bias.
DATA ·oneConst+0(SB)/4, $0x3f800000 // 1.0
GLOBL ·oneConst(SB), RODATA, $4
DATA ·signMaskConst+0(SB)/4, $0x80000000 // sign bit only; XOR negates
GLOBL ·signMaskConst(SB), RODATA, $4
DATA ·expBiasConst+0(SB)/4, $0x0000007f // integer 127
GLOBL ·expBiasConst(SB), RODATA, $4
|