// silu_avx2.s - AVX2 SiLU kernel (Go assembler, amd64)
  1. //go:build amd64
  2. // +build amd64
  3. #include "textflag.h"
  // func siluAVX2Asm(x *float32, n int)
  //
  // In-place SiLU over a float32 buffer, eight lanes per iteration:
  //
  //     x[i] = x[i] * (1 / (1 + exp(-x[i])))
  //
  // exp(-x) is approximated as 2^k * p(r): the argument is clamped to the
  // expLo..expHi range, split into an integer power of two k and a remainder
  // r = t - k*ln2 (ln2 applied in two parts, expC1 + expC2), and p(r) is a
  // polynomial with coefficients polyP0..polyP5.
  //
  // Only whole groups of eight elements are processed. The trailing n mod 8
  // elements are left unmodified, and nothing is touched when n < 8.
  // NOTE(review): presumably the Go caller handles that tail - confirm.
  //
  // Loads and stores are unaligned (VMOVUPS), so x needs no special alignment.
  //
  // Register roles:
  //     DI   pointer to the current group of eight floats
  //     CX   elements still to process (signed)
  //     Y0   original x                Y4   polynomial accumulator / exp(-x)
  //     Y1   clamp(-x), then r         Y5   scratch (k as float, coefficients)
  //     Y2   floor(t*log2e + 0.5)      Y6   k as int32, then bit pattern of 2^k
  //     Y3   scratch (products, r*r, denominator)
  //     Y7   exponent bias   Y8  1.0     Y9  expC2    Y10 expC1   Y11 0.5
  //     Y12  log2(e)         Y13 expLo   Y14 expHi    Y15 sign-bit mask
  TEXT ·siluAVX2Asm(SB), NOSPLIT, $0-16
      MOVQ x+0(FP), DI
      MOVQ n+8(FP), CX

      // Fewer than eight elements (including n <= 0): nothing to do, and no
      // vector register has been touched yet, so return directly.
      CMPQ CX, $8
      JL   done

      // Loop-invariant constants, broadcast once.
      VBROADCASTSS ·expHi(SB), Y14
      VBROADCASTSS ·expLo(SB), Y13
      VBROADCASTSS ·log2EF(SB), Y12
      VBROADCASTSS ·halfConst(SB), Y11
      VBROADCASTSS ·expC1(SB), Y10
      VBROADCASTSS ·expC2(SB), Y9
      VBROADCASTSS ·oneConst(SB), Y8
      VPBROADCASTD ·signMaskConst(SB), Y15
      VPBROADCASTD ·expBiasConst(SB), Y7

  loop:
      // Invariant: CX >= 8 on entry to every iteration.
      VMOVUPS (DI), Y0                  // x
      VXORPS  Y15, Y0, Y1               // t = -x (flip the sign bit)
      VMINPS  Y14, Y1, Y1               // t = min(t, expHi)
      VMAXPS  Y13, Y1, Y1               // t = max(t, expLo)

      // k = floor(t * log2(e) + 0.5)
      VMULPS    Y12, Y1, Y2
      VADDPS    Y11, Y2, Y2
      VROUNDPS  $1, Y2, Y2              // imm 1 = round toward -inf
      VCVTPS2DQ Y2, Y6                  // k as int32 (kept for the 2^k scale)
      VCVTDQ2PS Y6, Y5                  // k as float

      // r = t - k*expC1 - k*expC2
      VMULPS Y10, Y5, Y3
      VSUBPS Y3, Y1, Y1
      VMULPS Y9, Y5, Y3
      VSUBPS Y3, Y1, Y1

      VMULPS Y1, Y1, Y3                 // r*r

      // Horner evaluation: q = ((((P0*r + P1)*r + P2)*r + P3)*r + P4)*r + P5.
      // The coefficients are re-broadcast each iteration because every other
      // YMM register is already holding a constant or a live value.
      VBROADCASTSS ·polyP0(SB), Y4
      VMULPS       Y1, Y4, Y4
      VBROADCASTSS ·polyP1(SB), Y5
      VADDPS       Y5, Y4, Y4
      VMULPS       Y1, Y4, Y4
      VBROADCASTSS ·polyP2(SB), Y5
      VADDPS       Y5, Y4, Y4
      VMULPS       Y1, Y4, Y4
      VBROADCASTSS ·polyP3(SB), Y5
      VADDPS       Y5, Y4, Y4
      VMULPS       Y1, Y4, Y4
      VBROADCASTSS ·polyP4(SB), Y5
      VADDPS       Y5, Y4, Y4
      VMULPS       Y1, Y4, Y4
      VBROADCASTSS ·polyP5(SB), Y5
      VADDPS       Y5, Y4, Y4

      // p(r) = q*r*r + r + 1
      VMULPS Y3, Y4, Y4
      VADDPS Y1, Y4, Y4
      VADDPS Y8, Y4, Y4

      // Build 2^k by placing (k + bias) in the float exponent field.
      VPADDD Y7, Y6, Y6
      VPSLLD $23, Y6, Y6
      VMULPS Y6, Y4, Y4                 // exp(-x) = p(r) * 2^k

      VADDPS Y8, Y4, Y3                 // 1 + exp(-x)
      VDIVPS Y3, Y8, Y3                 // sigmoid(x) = 1 / (1 + exp(-x))
      VMULPS Y0, Y3, Y0                 // x * sigmoid(x)
      VMOVUPS Y0, (DI)

      ADDQ $32, DI                      // advance by 8 floats
      SUBQ $8, CX
      CMPQ CX, $8
      JGE  loop

      // Clear the upper YMM halves so SSE code that runs after we return does
      // not pay AVX->SSE transition penalties.
      VZEROUPPER

  done:
      RET
  // Read-only 4-byte constants; siluAVX2Asm broadcasts each one to all eight
  // lanes of a YMM register. Values are IEEE-754 single-precision bit
  // patterns except expBiasConst, which is a plain int32.

  // --- General helpers ---
  DATA ·oneConst+0(SB)/4, $0x3F800000        // 1.0
  GLOBL ·oneConst(SB), RODATA, $4
  DATA ·halfConst+0(SB)/4, $0x3F000000       // 0.5
  GLOBL ·halfConst(SB), RODATA, $4
  DATA ·signMaskConst+0(SB)/4, $0x80000000   // float32 sign bit
  GLOBL ·signMaskConst(SB), RODATA, $4
  DATA ·expBiasConst+0(SB)/4, $0x0000007F    // 127, float32 exponent bias
  GLOBL ·expBiasConst(SB), RODATA, $4

  // --- exp() argument clamp and range reduction ---
  DATA ·expHi+0(SB)/4, $0x42B0C0A5           // ~ +88.37626
  GLOBL ·expHi(SB), RODATA, $4
  DATA ·expLo+0(SB)/4, $0xC2B0C0A5           // ~ -88.37626
  GLOBL ·expLo(SB), RODATA, $4
  DATA ·log2EF+0(SB)/4, $0x3FB8AA3B          // ~ 1.442695, log2(e)
  GLOBL ·log2EF(SB), RODATA, $4
  DATA ·expC1+0(SB)/4, $0x3F318000           // 0.693359375 (expC1 + expC2 ~ ln 2)
  GLOBL ·expC1(SB), RODATA, $4
  DATA ·expC2+0(SB)/4, $0xB95E8083           // ~ -2.1219444e-4
  GLOBL ·expC2(SB), RODATA, $4

  // --- Polynomial coefficients, highest power first ---
  DATA ·polyP0+0(SB)/4, $0x39506967          // ~ 1.9875691e-4
  GLOBL ·polyP0(SB), RODATA, $4
  DATA ·polyP1+0(SB)/4, $0x3AB743CE          // ~ 1.3981999e-3
  GLOBL ·polyP1(SB), RODATA, $4
  DATA ·polyP2+0(SB)/4, $0x3C088908          // ~ 8.3334519e-3
  GLOBL ·polyP2(SB), RODATA, $4
  DATA ·polyP3+0(SB)/4, $0x3D2AA9C1          // ~ 4.1665796e-2
  GLOBL ·polyP3(SB), RODATA, $4
  DATA ·polyP4+0(SB)/4, $0x3E2AAAAA          // ~ 1.6666665e-1
  GLOBL ·polyP4(SB), RODATA, $4
  DATA ·polyP5+0(SB)/4, $0x3F000000          // 0.5
  GLOBL ·polyP5(SB), RODATA, $4