acc_amd64.s
// generated by go run gen.go; DO NOT EDIT

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// fl is short for floating point math. fx is short for fixed point math.

DATA flAlmost65536<>+0x00(SB)/8, $0x477fffff477fffff
DATA flAlmost65536<>+0x08(SB)/8, $0x477fffff477fffff
DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000
DATA flOne<>+0x08(SB)/8, $0x3f8000003f800000
DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff
DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff

// scatterAndMulBy0x101 is a PSHUFB mask that brings the low four bytes of an
// XMM register to the low byte of that register's four uint32 values. It
// duplicates those bytes, effectively multiplying each uint32 by 0x101.
//
// It transforms a little-endian 16-byte XMM value from
//	ijkl????????????
// to
//	ii00jj00kk00ll00
DATA scatterAndMulBy0x101<>+0x00(SB)/8, $0x8080010180800000
DATA scatterAndMulBy0x101<>+0x08(SB)/8, $0x8080030380800202

// gather is a PSHUFB mask that brings the second-lowest byte of the XMM
// register's four uint32 values to the low four bytes of that register.
//
// It transforms a little-endian 16-byte XMM value from
//	?i???j???k???l??
// to
//	ijkl000000000000
DATA gather<>+0x00(SB)/8, $0x808080800d090501
DATA gather<>+0x08(SB)/8, $0x8080808080808080
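
// A Go model of the PSHUFB semantics relied on by both masks above (a
// sketch, not part of this package): for each destination byte, a mask
// byte with its high bit (0x80) set yields zero; otherwise it selects
// src[m&15].
//
//	func pshufb(src, mask [16]byte) (dst [16]byte) {
//		for i, m := range mask {
//			if m&0x80 == 0 {
//				dst[i] = src[m&15]
//			}
//		}
//		return dst
//	}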

DATA fxAlmost65536<>+0x00(SB)/8, $0x0000ffff0000ffff
DATA fxAlmost65536<>+0x08(SB)/8, $0x0000ffff0000ffff
DATA inverseFFFF<>+0x00(SB)/8, $0x8000800180008001
DATA inverseFFFF<>+0x08(SB)/8, $0x8000800180008001

GLOBL flAlmost65536<>(SB), (NOPTR+RODATA), $16
GLOBL flOne<>(SB), (NOPTR+RODATA), $16
GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16
GLOBL scatterAndMulBy0x101<>(SB), (NOPTR+RODATA), $16
GLOBL gather<>(SB), (NOPTR+RODATA), $16
GLOBL fxAlmost65536<>(SB), (NOPTR+RODATA), $16
GLOBL inverseFFFF<>(SB), (NOPTR+RODATA), $16

// func haveSSE4_1() bool
TEXT ·haveSSE4_1(SB), NOSPLIT, $0
	MOVQ $1, AX
	CPUID

	// SSE4.1 support is indicated by bit 19 of the ECX output of CPUID
	// leaf 1.
	SHRQ $19, CX
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET
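
// On the Go side, haveSSE4_1's result would typically be cached once at
// startup to choose between the SIMD and pure-Go code paths, e.g. (a
// hypothetical variable name, not necessarily this package's):
//
//	var haveAccumulateSIMD = haveSSE4_1()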

// ----------------------------------------------------------------------------

// func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32)
//
// XMM registers. Variable names are per
// https://github.com/google/font-rs/blob/master/src/accumulate.c
//
// xmm0 scratch
// xmm1 x
// xmm2 y, z
// xmm3 -
// xmm4 -
// xmm5 fxAlmost65536
// xmm6 gather
// xmm7 offset
// xmm8 scatterAndMulBy0x101
// xmm9 fxAlmost65536
// xmm10 inverseFFFF
TEXT ·fixedAccumulateOpOverSIMD(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ dst_len+8(FP), BX
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R10

	// Sanity check that len(dst) >= len(src).
	CMPQ BX, R10
	JLT fxAccOpOverEnd

	// R10 = len(src) &^ 3
	// R11 = len(src)
	MOVQ R10, R11
	ANDQ $-4, R10

	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of a uint16.
	MOVOU fxAlmost65536<>(SB), X5

	// gather := XMM(see above) // PSHUFB shuffle mask.
	// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
	// inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
	MOVOU gather<>(SB), X6
	MOVOU scatterAndMulBy0x101<>(SB), X8
	MOVOU fxAlmost65536<>(SB), X9
	MOVOU inverseFFFF<>(SB), X10

	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
	XORPS X7, X7

	// i := 0
	MOVQ $0, R9

fxAccOpOverLoop4:
	// for i < (len(src) &^ 3)
	CMPQ R9, R10
	JAE fxAccOpOverLoop1

	// x = XMM(s0, s1, s2, s3)
	//
	// Where s0 is src[i+0], s1 is src[i+1], etc.
	MOVOU (SI), X1

	// scratch = XMM(0, s0, s1, s2)
	// x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
	MOVOU X1, X0
	PSLLO $4, X0
	PADDD X0, X1

	// scratch = XMM(0, 0, 0, 0)
	// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
	// x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
	XORPS X0, X0
	SHUFPS $0x40, X1, X0
	PADDD X0, X1
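
	// A scalar Go sketch of the two shifted adds above (illustrative
	// only): a logarithmic-time inclusive prefix sum over four lanes.
	//
	//	func prefixSum4(s [4]uint32) [4]uint32 {
	//		// Add the vector shifted up by one lane.
	//		x := [4]uint32{s[0], s[0] + s[1], s[1] + s[2], s[2] + s[3]}
	//		// Add the vector shifted up by two lanes.
	//		return [4]uint32{x[0], x[1], x[0] + x[2], x[1] + x[3]}
	//	}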

	// x += offset
	PADDD X7, X1

	// y = abs(x)
	// y >>= 2 // Shift by 2*ϕ - 16.
	// y = min(y, fxAlmost65536)
	PABSD X1, X2
	PSRLL $2, X2
	PMINUD X5, X2
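
	// In scalar Go terms (a sketch): the accumulated value carries 2*ϕ
	// fractional bits, and the shift of 2 equals 2*ϕ - 16 for ϕ = 9, so
	//
	//	y := abs(x) >> 2
	//	if y > 0xffff {
	//		y = 0xffff
	//	}
	//
	// maps the accumulated coverage onto a 16-bit alpha value.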

	// z = convertToInt32(y)
	// No-op.

	// Blend over the dst's prior value. SIMD for i in 0..3:
	//
	// dstA := uint32(dst[i]) * 0x101
	// maskA := z@i
	// outA := dstA*(0xffff-maskA)/0xffff + maskA
	// dst[i] = uint8(outA >> 8)
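	//
	// As a scalar Go function (a sketch; the name is illustrative):
	//
	//	func blendOver(dst uint8, maskA uint32) uint8 {
	//		dstA := uint32(dst) * 0x101
	//		return uint8((dstA*(0xffff-maskA)/0xffff + maskA) >> 8)
	//	}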
	//
	// First, set X0 to dstA*(0xffff-maskA).
	MOVL (DI), X0
	PSHUFB X8, X0
	MOVOU X9, X11
	PSUBL X2, X11
	PMULLD X11, X0

	// We implement uint32 division by 0xffff as multiplication by a magic
	// constant (0x80008001) and then a shift by a magic constant (47).
	// See TestDivideByFFFF for a justification.
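	//
	// A scalar Go sketch of that magic division, exact for every uint32 x
	// (the function name is illustrative):
	//
	//	func div65535(x uint32) uint32 {
	//		return uint32((uint64(x) * 0x80008001) >> 47)
	//	}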
	//
	// That multiplication widens from uint32 to uint64, so we have to
	// duplicate and shift our four uint32s from one XMM register (X0) to
	// two XMM registers (X0 and X11).
	//
	// Move the second and fourth uint32s in X0 to be the first and third
	// uint32s in X11.
	MOVOU X0, X11
	PSRLQ $32, X11

	// Multiply by magic, shift by magic.
	PMULULQ X10, X0
	PMULULQ X10, X11
	PSRLQ $47, X0
	PSRLQ $47, X11

	// Merge the two registers back to one, X11, and add maskA.
	PSLLQ $32, X11
	XORPS X0, X11
	PADDD X11, X2

	// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
	PSHUFB X6, X2
	MOVL X2, (DI)

	// offset = XMM(x@3, x@3, x@3, x@3)
	MOVOU X1, X7
	SHUFPS $0xff, X1, X7

	// i += 4
	// dst = dst[4:]
	// src = src[4:]
	ADDQ $4, R9
	ADDQ $4, DI
	ADDQ $16, SI
	JMP fxAccOpOverLoop4

fxAccOpOverLoop1:
	// for i < len(src)
	CMPQ R9, R11
	JAE fxAccOpOverEnd

	// x = src[i] + offset
	MOVL (SI), X1
	PADDD X7, X1

	// y = abs(x)
	// y >>= 2 // Shift by 2*ϕ - 16.
	// y = min(y, fxAlmost65536)
	PABSD X1, X2
	PSRLL $2, X2
	PMINUD X5, X2

	// z = convertToInt32(y)
	// No-op.

	// Blend over the dst's prior value.
	//
	// dstA := uint32(dst[0]) * 0x101
	// maskA := z
	// outA := dstA*(0xffff-maskA)/0xffff + maskA
	// dst[0] = uint8(outA >> 8)
	MOVBLZX (DI), R12
	IMULL $0x101, R12
	MOVL X2, R13
	MOVL $0xffff, AX
	SUBL R13, AX
	MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX.
	MOVL $0x80008001, BX // Dividing by 0xffff is done by multiplying by a magic constant...
	MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX.
	SHRL $15, DX // ...and then shifting by another magic constant (47 - 32 = 15).
	ADDL DX, R13
	SHRL $8, R13
	MOVB R13, (DI)

	// offset = x
	MOVOU X1, X7

	// i += 1
	// dst = dst[1:]
	// src = src[1:]
	ADDQ $1, R9
	ADDQ $1, DI
	ADDQ $4, SI
	JMP fxAccOpOverLoop1

fxAccOpOverEnd:
	RET

// ----------------------------------------------------------------------------

// func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32)
//
// XMM registers. Variable names are per
// https://github.com/google/font-rs/blob/master/src/accumulate.c
//
// xmm0 scratch
// xmm1 x
// xmm2 y, z
// xmm3 -
// xmm4 -
// xmm5 fxAlmost65536
// xmm6 gather
// xmm7 offset
// xmm8 -
// xmm9 -
// xmm10 -
TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ dst_len+8(FP), BX
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R10

	// Sanity check that len(dst) >= len(src).
	CMPQ BX, R10
	JLT fxAccOpSrcEnd

	// R10 = len(src) &^ 3
	// R11 = len(src)
	MOVQ R10, R11
	ANDQ $-4, R10

	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of a uint16.
	MOVOU fxAlmost65536<>(SB), X5

	// gather := XMM(see above) // PSHUFB shuffle mask.
	MOVOU gather<>(SB), X6

	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
	XORPS X7, X7

	// i := 0
	MOVQ $0, R9

fxAccOpSrcLoop4:
	// for i < (len(src) &^ 3)
	CMPQ R9, R10
	JAE fxAccOpSrcLoop1

	// x = XMM(s0, s1, s2, s3)
	//
	// Where s0 is src[i+0], s1 is src[i+1], etc.
	MOVOU (SI), X1

	// scratch = XMM(0, s0, s1, s2)
	// x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
	MOVOU X1, X0
	PSLLO $4, X0
	PADDD X0, X1

	// scratch = XMM(0, 0, 0, 0)
	// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
	// x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
	XORPS X0, X0
	SHUFPS $0x40, X1, X0
	PADDD X0, X1

	// x += offset
	PADDD X7, X1

	// y = abs(x)
	// y >>= 2 // Shift by 2*ϕ - 16.
	// y = min(y, fxAlmost65536)
	PABSD X1, X2
	PSRLL $2, X2
	PMINUD X5, X2

	// z = convertToInt32(y)
	// No-op.

	// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
	// copy(dst[:4], low4BytesOf(z))
	PSHUFB X6, X2
	MOVL X2, (DI)

	// offset = XMM(x@3, x@3, x@3, x@3)
	MOVOU X1, X7
	SHUFPS $0xff, X1, X7

	// i += 4
	// dst = dst[4:]
	// src = src[4:]
	ADDQ $4, R9
	ADDQ $4, DI
	ADDQ $16, SI
	JMP fxAccOpSrcLoop4

fxAccOpSrcLoop1:
	// for i < len(src)
	CMPQ R9, R11
	JAE fxAccOpSrcEnd

	// x = src[i] + offset
	MOVL (SI), X1
	PADDD X7, X1

	// y = abs(x)
	// y >>= 2 // Shift by 2*ϕ - 16.
	// y = min(y, fxAlmost65536)
	PABSD X1, X2
	PSRLL $2, X2
	PMINUD X5, X2

	// z = convertToInt32(y)
	// No-op.

	// dst[0] = uint8(z>>8)
	MOVL X2, BX
	SHRL $8, BX
	MOVB BX, (DI)

	// offset = x
	MOVOU X1, X7

	// i += 1
	// dst = dst[1:]
	// src = src[1:]
	ADDQ $1, R9
	ADDQ $1, DI
	ADDQ $4, SI
	JMP fxAccOpSrcLoop1

fxAccOpSrcEnd:
	RET

// ----------------------------------------------------------------------------

// func fixedAccumulateMaskSIMD(buf []uint32)
//
// XMM registers. Variable names are per
// https://github.com/google/font-rs/blob/master/src/accumulate.c
//
// xmm0 scratch
// xmm1 x
// xmm2 y, z
// xmm3 -
// xmm4 -
// xmm5 fxAlmost65536
// xmm6 -
// xmm7 offset
// xmm8 -
// xmm9 -
// xmm10 -
TEXT ·fixedAccumulateMaskSIMD(SB), NOSPLIT, $0-24
	MOVQ buf_base+0(FP), DI
	MOVQ buf_len+8(FP), BX
	MOVQ buf_base+0(FP), SI
	MOVQ buf_len+8(FP), R10

	// R10 = len(src) &^ 3
	// R11 = len(src)
	MOVQ R10, R11
	ANDQ $-4, R10

	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of a uint16.
	MOVOU fxAlmost65536<>(SB), X5

	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
	XORPS X7, X7

	// i := 0
	MOVQ $0, R9

fxAccMaskLoop4:
	// for i < (len(src) &^ 3)
	CMPQ R9, R10
	JAE fxAccMaskLoop1

	// x = XMM(s0, s1, s2, s3)
	//
	// Where s0 is src[i+0], s1 is src[i+1], etc.
	MOVOU (SI), X1

	// scratch = XMM(0, s0, s1, s2)
	// x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
	MOVOU X1, X0
	PSLLO $4, X0
	PADDD X0, X1

	// scratch = XMM(0, 0, 0, 0)
	// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
	// x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
	XORPS X0, X0
	SHUFPS $0x40, X1, X0
	PADDD X0, X1

	// x += offset
	PADDD X7, X1

	// y = abs(x)
	// y >>= 2 // Shift by 2*ϕ - 16.
	// y = min(y, fxAlmost65536)
	PABSD X1, X2
	PSRLL $2, X2
	PMINUD X5, X2

	// z = convertToInt32(y)
	// No-op.

	// copy(dst[:4], z)
	MOVOU X2, (DI)

	// offset = XMM(x@3, x@3, x@3, x@3)
	MOVOU X1, X7
	SHUFPS $0xff, X1, X7

	// i += 4
	// dst = dst[4:]
	// src = src[4:]
	ADDQ $4, R9
	ADDQ $16, DI
	ADDQ $16, SI
	JMP fxAccMaskLoop4

fxAccMaskLoop1:
	// for i < len(src)
	CMPQ R9, R11
	JAE fxAccMaskEnd

	// x = src[i] + offset
	MOVL (SI), X1
	PADDD X7, X1

	// y = abs(x)
	// y >>= 2 // Shift by 2*ϕ - 16.
	// y = min(y, fxAlmost65536)
	PABSD X1, X2
	PSRLL $2, X2
	PMINUD X5, X2

	// z = convertToInt32(y)
	// No-op.

	// dst[0] = uint32(z)
	MOVL X2, (DI)

	// offset = x
	MOVOU X1, X7

	// i += 1
	// dst = dst[1:]
	// src = src[1:]
	ADDQ $1, R9
	ADDQ $4, DI
	ADDQ $4, SI
	JMP fxAccMaskLoop1

fxAccMaskEnd:
	RET

// ----------------------------------------------------------------------------

// func floatingAccumulateOpOverSIMD(dst []uint8, src []float32)
//
// XMM registers. Variable names are per
// https://github.com/google/font-rs/blob/master/src/accumulate.c
//
// xmm0 scratch
// xmm1 x
// xmm2 y, z
// xmm3 flSignMask
// xmm4 flOne
// xmm5 flAlmost65536
// xmm6 gather
// xmm7 offset
// xmm8 scatterAndMulBy0x101
// xmm9 fxAlmost65536
// xmm10 inverseFFFF
TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
	MOVQ dst_base+0(FP), DI
	MOVQ dst_len+8(FP), BX
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R10

	// Sanity check that len(dst) >= len(src).
	CMPQ BX, R10
	JLT flAccOpOverEnd

	// R10 = len(src) &^ 3
	// R11 = len(src)
	MOVQ R10, R11
	ANDQ $-4, R10

	// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
	// "Round To Zero".
	STMXCSR mxcsrOrig-8(SP)
	MOVL mxcsrOrig-8(SP), AX
	ORL $0x6000, AX
	MOVL AX, mxcsrNew-4(SP)
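
	// With those bits set, CVTPS2PL truncates toward zero, which matches
	// Go's int32(f) conversion semantics: for example,
	// int32(float32(65535.9)) is 65535.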

	// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
	// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
	// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
	MOVOU flSignMask<>(SB), X3
	MOVOU flOne<>(SB), X4
	MOVOU flAlmost65536<>(SB), X5

	// gather := XMM(see above) // PSHUFB shuffle mask.
	// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
	// inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
	MOVOU gather<>(SB), X6
	MOVOU scatterAndMulBy0x101<>(SB), X8
	MOVOU fxAlmost65536<>(SB), X9
	MOVOU inverseFFFF<>(SB), X10

	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
	XORPS X7, X7

	// i := 0
	MOVQ $0, R9

flAccOpOverLoop4:
	// for i < (len(src) &^ 3)
	CMPQ R9, R10
	JAE flAccOpOverLoop1

	// x = XMM(s0, s1, s2, s3)
	//
	// Where s0 is src[i+0], s1 is src[i+1], etc.
	MOVOU (SI), X1

	// scratch = XMM(0, s0, s1, s2)
	// x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
	MOVOU X1, X0
	PSLLO $4, X0
	ADDPS X0, X1

	// scratch = XMM(0, 0, 0, 0)
	// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
	// x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
	XORPS X0, X0
	SHUFPS $0x40, X1, X0
	ADDPS X0, X1

	// x += offset
	ADDPS X7, X1

	// y = x & flSignMask
	// y = min(y, flOne)
	// y = mul(y, flAlmost65536)
	MOVOU X3, X2
	ANDPS X1, X2
	MINPS X4, X2
	MULPS X5, X2
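
	// In scalar Go terms (a sketch): clamp the accumulated coverage to
	// [0, 1] and scale it towards a 16-bit alpha value, with almost65536
	// being the float32 constant above:
	//
	//	y := x
	//	if y < 0 {
	//		y = -y
	//	}
	//	if y > 1 {
	//		y = 1
	//	}
	//	y *= almost65536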

	// z = convertToInt32(y)
	LDMXCSR mxcsrNew-4(SP)
	CVTPS2PL X2, X2
	LDMXCSR mxcsrOrig-8(SP)

	// Blend over the dst's prior value. SIMD for i in 0..3:
	//
	// dstA := uint32(dst[i]) * 0x101
	// maskA := z@i
	// outA := dstA*(0xffff-maskA)/0xffff + maskA
	// dst[i] = uint8(outA >> 8)
	//
	// First, set X0 to dstA*(0xffff-maskA).
	MOVL (DI), X0
	PSHUFB X8, X0
	MOVOU X9, X11
	PSUBL X2, X11
	PMULLD X11, X0

	// We implement uint32 division by 0xffff as multiplication by a magic
	// constant (0x80008001) and then a shift by a magic constant (47).
	// See TestDivideByFFFF for a justification.
	//
	// That multiplication widens from uint32 to uint64, so we have to
	// duplicate and shift our four uint32s from one XMM register (X0) to
	// two XMM registers (X0 and X11).
	//
	// Move the second and fourth uint32s in X0 to be the first and third
	// uint32s in X11.
	MOVOU X0, X11
	PSRLQ $32, X11

	// Multiply by magic, shift by magic.
	PMULULQ X10, X0
	PMULULQ X10, X11
	PSRLQ $47, X0
	PSRLQ $47, X11

	// Merge the two registers back to one, X11, and add maskA.
	PSLLQ $32, X11
	XORPS X0, X11
	PADDD X11, X2

	// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
	PSHUFB X6, X2
	MOVL X2, (DI)

	// offset = XMM(x@3, x@3, x@3, x@3)
	MOVOU X1, X7
	SHUFPS $0xff, X1, X7

	// i += 4
	// dst = dst[4:]
	// src = src[4:]
	ADDQ $4, R9
	ADDQ $4, DI
	ADDQ $16, SI
	JMP flAccOpOverLoop4

flAccOpOverLoop1:
	// for i < len(src)
	CMPQ R9, R11
	JAE flAccOpOverEnd

	// x = src[i] + offset
	MOVL (SI), X1
	ADDPS X7, X1

	// y = x & flSignMask
	// y = min(y, flOne)
	// y = mul(y, flAlmost65536)
	MOVOU X3, X2
	ANDPS X1, X2
	MINPS X4, X2
	MULPS X5, X2

	// z = convertToInt32(y)
	LDMXCSR mxcsrNew-4(SP)
	CVTPS2PL X2, X2
	LDMXCSR mxcsrOrig-8(SP)

	// Blend over the dst's prior value.
	//
	// dstA := uint32(dst[0]) * 0x101
	// maskA := z
	// outA := dstA*(0xffff-maskA)/0xffff + maskA
	// dst[0] = uint8(outA >> 8)
	MOVBLZX (DI), R12
	IMULL $0x101, R12
	MOVL X2, R13
	MOVL $0xffff, AX
	SUBL R13, AX
	MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX.
	MOVL $0x80008001, BX // Dividing by 0xffff is done by multiplying by a magic constant...
	MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX.
	SHRL $15, DX // ...and then shifting by another magic constant (47 - 32 = 15).
	ADDL DX, R13
	SHRL $8, R13
	MOVB R13, (DI)

	// offset = x
	MOVOU X1, X7

	// i += 1
	// dst = dst[1:]
	// src = src[1:]
	ADDQ $1, R9
	ADDQ $1, DI
	ADDQ $4, SI
	JMP flAccOpOverLoop1

flAccOpOverEnd:
	RET

// ----------------------------------------------------------------------------

// func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)
//
// XMM registers. Variable names are per
// https://github.com/google/font-rs/blob/master/src/accumulate.c
//
// xmm0 scratch
// xmm1 x
// xmm2 y, z
// xmm3 flSignMask
// xmm4 flOne
// xmm5 flAlmost65536
// xmm6 gather
// xmm7 offset
// xmm8 -
// xmm9 -
// xmm10 -
TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
	MOVQ dst_base+0(FP), DI
	MOVQ dst_len+8(FP), BX
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R10

	// Sanity check that len(dst) >= len(src).
	CMPQ BX, R10
	JLT flAccOpSrcEnd

	// R10 = len(src) &^ 3
	// R11 = len(src)
	MOVQ R10, R11
	ANDQ $-4, R10

	// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
	// "Round To Zero".
	STMXCSR mxcsrOrig-8(SP)
	MOVL mxcsrOrig-8(SP), AX
	ORL $0x6000, AX
	MOVL AX, mxcsrNew-4(SP)

	// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
	// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
	// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
	MOVOU flSignMask<>(SB), X3
	MOVOU flOne<>(SB), X4
	MOVOU flAlmost65536<>(SB), X5

	// gather := XMM(see above) // PSHUFB shuffle mask.
	MOVOU gather<>(SB), X6

	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
	XORPS X7, X7

	// i := 0
	MOVQ $0, R9

flAccOpSrcLoop4:
	// for i < (len(src) &^ 3)
	CMPQ R9, R10
	JAE flAccOpSrcLoop1

	// x = XMM(s0, s1, s2, s3)
	//
	// Where s0 is src[i+0], s1 is src[i+1], etc.
	MOVOU (SI), X1

	// scratch = XMM(0, s0, s1, s2)
	// x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
	MOVOU X1, X0
	PSLLO $4, X0
	ADDPS X0, X1

	// scratch = XMM(0, 0, 0, 0)
	// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
	// x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
	XORPS X0, X0
	SHUFPS $0x40, X1, X0
	ADDPS X0, X1

	// x += offset
	ADDPS X7, X1

	// y = x & flSignMask
	// y = min(y, flOne)
	// y = mul(y, flAlmost65536)
	MOVOU X3, X2
	ANDPS X1, X2
	MINPS X4, X2
	MULPS X5, X2

	// z = convertToInt32(y)
	LDMXCSR mxcsrNew-4(SP)
	CVTPS2PL X2, X2
	LDMXCSR mxcsrOrig-8(SP)

	// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
	// copy(dst[:4], low4BytesOf(z))
	PSHUFB X6, X2
	MOVL X2, (DI)

	// offset = XMM(x@3, x@3, x@3, x@3)
	MOVOU X1, X7
	SHUFPS $0xff, X1, X7

	// i += 4
	// dst = dst[4:]
	// src = src[4:]
	ADDQ $4, R9
	ADDQ $4, DI
	ADDQ $16, SI
	JMP flAccOpSrcLoop4

flAccOpSrcLoop1:
	// for i < len(src)
	CMPQ R9, R11
	JAE flAccOpSrcEnd

	// x = src[i] + offset
	MOVL (SI), X1
	ADDPS X7, X1

	// y = x & flSignMask
	// y = min(y, flOne)
	// y = mul(y, flAlmost65536)
	MOVOU X3, X2
	ANDPS X1, X2
	MINPS X4, X2
	MULPS X5, X2

	// z = convertToInt32(y)
	LDMXCSR mxcsrNew-4(SP)
	CVTPS2PL X2, X2
	LDMXCSR mxcsrOrig-8(SP)

	// dst[0] = uint8(z>>8)
	MOVL X2, BX
	SHRL $8, BX
	MOVB BX, (DI)

	// offset = x
	MOVOU X1, X7

	// i += 1
	// dst = dst[1:]
	// src = src[1:]
	ADDQ $1, R9
	ADDQ $1, DI
	ADDQ $4, SI
	JMP flAccOpSrcLoop1

flAccOpSrcEnd:
	RET

// ----------------------------------------------------------------------------

// func floatingAccumulateMaskSIMD(dst []uint32, src []float32)
//
// XMM registers. Variable names are per
// https://github.com/google/font-rs/blob/master/src/accumulate.c
//
// xmm0 scratch
// xmm1 x
// xmm2 y, z
// xmm3 flSignMask
// xmm4 flOne
// xmm5 flAlmost65536
// xmm6 -
// xmm7 offset
// xmm8 -
// xmm9 -
// xmm10 -
TEXT ·floatingAccumulateMaskSIMD(SB), NOSPLIT, $8-48
	MOVQ dst_base+0(FP), DI
	MOVQ dst_len+8(FP), BX
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R10

	// Sanity check that len(dst) >= len(src).
	CMPQ BX, R10
	JLT flAccMaskEnd

	// R10 = len(src) &^ 3
	// R11 = len(src)
	MOVQ R10, R11
	ANDQ $-4, R10

	// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
	// "Round To Zero".
	STMXCSR mxcsrOrig-8(SP)
	MOVL mxcsrOrig-8(SP), AX
	ORL $0x6000, AX
	MOVL AX, mxcsrNew-4(SP)

	// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
	// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
	// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
	MOVOU flSignMask<>(SB), X3
	MOVOU flOne<>(SB), X4
	MOVOU flAlmost65536<>(SB), X5

	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
	XORPS X7, X7

	// i := 0
	MOVQ $0, R9

flAccMaskLoop4:
	// for i < (len(src) &^ 3)
	CMPQ R9, R10
	JAE flAccMaskLoop1

	// x = XMM(s0, s1, s2, s3)
	//
	// Where s0 is src[i+0], s1 is src[i+1], etc.
	MOVOU (SI), X1

	// scratch = XMM(0, s0, s1, s2)
	// x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
	MOVOU X1, X0
	PSLLO $4, X0
	ADDPS X0, X1

	// scratch = XMM(0, 0, 0, 0)
	// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
	// x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
	XORPS X0, X0
	SHUFPS $0x40, X1, X0
	ADDPS X0, X1

	// x += offset
	ADDPS X7, X1

	// y = x & flSignMask
	// y = min(y, flOne)
	// y = mul(y, flAlmost65536)
	MOVOU X3, X2
	ANDPS X1, X2
	MINPS X4, X2
	MULPS X5, X2

	// z = convertToInt32(y)
	LDMXCSR mxcsrNew-4(SP)
	CVTPS2PL X2, X2
	LDMXCSR mxcsrOrig-8(SP)

	// copy(dst[:4], z)
	MOVOU X2, (DI)

	// offset = XMM(x@3, x@3, x@3, x@3)
	MOVOU X1, X7
	SHUFPS $0xff, X1, X7

	// i += 4
	// dst = dst[4:]
	// src = src[4:]
	ADDQ $4, R9
	ADDQ $16, DI
	ADDQ $16, SI
	JMP flAccMaskLoop4

flAccMaskLoop1:
	// for i < len(src)
	CMPQ R9, R11
	JAE flAccMaskEnd

	// x = src[i] + offset
	MOVL (SI), X1
	ADDPS X7, X1

	// y = x & flSignMask
	// y = min(y, flOne)
	// y = mul(y, flAlmost65536)
	MOVOU X3, X2
	ANDPS X1, X2
	MINPS X4, X2
	MULPS X5, X2

	// z = convertToInt32(y)
	LDMXCSR mxcsrNew-4(SP)
	CVTPS2PL X2, X2
	LDMXCSR mxcsrOrig-8(SP)

	// dst[0] = uint32(z)
	MOVL X2, (DI)

	// offset = x
	MOVOU X1, X7

	// i += 1
	// dst = dst[1:]
	// src = src[1:]
	ADDQ $1, R9
	ADDQ $4, DI
	ADDQ $4, SI
	JMP flAccMaskLoop1

flAccMaskEnd:
	RET