// sse.cpp
/*! \file SSE.cpp
SIMD functionality, mainly for 32 bit interleaved complex integer type (CPX)
*/
/************************************************************************************************
Copyright 2008 Gregory W Heckler
This file is part of the GPS Software Defined Radio (GPS-SDR)
The GPS-SDR is free software; you can redistribute it and/or modify it under the terms of the
GNU General Public License as published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GPS-SDR is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License along with GPS-SDR; if not,
write to the:
Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
************************************************************************************************/
// __asm
// (
// "mov eax, 0x1 \n\t"
//
// "mov eax, edx \n\t"
// "leave \n\t"
// "ret \n\t"
// ".att_syntax \n\t"
// );
#include "includes.h"
/*!
 * In-place element-wise 16-bit add: A[i] += B[i] for i in [0, cnt).
 * SSE2 path adds 8 int16 per iteration (paddw, wrap-around arithmetic),
 * then a scalar loop finishes the remainder. The unaligned branch uses
 * movupd loads/stores; the aligned branch requires both A and B to be
 * 16-byte aligned.
 *
 * NOTE(review): the asm fetches A/B (and, in the aligned branch, the
 * counters) through hard-coded [ebp+N]/[ebp-N] frame offsets. That only
 * works for 32-bit x86 with a frame pointer and a specific, unoptimized
 * stack layout — confirm before changing locals or compiler flags.
 * NOTE(review): movupd/movapd are double-precision moves used here on
 * integer data; presumably chosen for encoding convenience — movdqu/movdqa
 * would be the idiomatic forms.
 */
void sse_add(int16 *A, int16 *B, int32 cnt)
{
	int32 cnt1;	// number of 8-element SIMD iterations
	int32 cnt2;	// leftover elements processed by the scalar tail loop
	cnt1 = cnt / 8;
	cnt2 = (cnt - (8*cnt1));
	if(((int)A%16) || ((int)B%16)) // unaligned version
	{
		__asm
		(
			".intel_syntax noprefix \n\t" //Set up for loop
			"mov edi, [ebp+8] \n\t" //Address of A (hard-coded frame offset)
			"mov esi, [ebp+12] \n\t" //Address of B (hard-coded frame offset)
			"mov ecx, %[cnt1] \n\t"//"mov ecx, [ebp-20] \n\t" //Counter 1
			"jecxz Z%= \n\t" //Skip SIMD loop when fewer than 8 elements
			"L%=: \n\t"
			"movupd xmm0, [edi] \n\t" //Load 8 int16 from A (unaligned)
			"movupd xmm1, [esi] \n\t" //Load 8 int16 from B (unaligned)
			"paddw xmm0, xmm1 \n\t" //A + B (wrap-around 16-bit add)
			"movupd [edi], xmm0 \n\t" //Store result back into A
			"add edi, 16 \n\t"
			"add esi, 16 \n\t"
			"loop L%= \n\t" //Loop if not done
			"Z%=: \n\t"
			"mov ecx, %[cnt2] \n\t"//"mov ecx, [ebp-16] \n\t" //Counter 2
			"jecxz ZZ%= \n\t"
			"mov eax, 0 \n\t"
			"LL%=: \n\t" //Finish off the remainder with scalar instructions
			"mov ax, [edi] \n\t"
			"add ax, [esi] \n\t" //Scalar 16-bit add, same wrap-around semantics
			"mov [edi], ax \n\t"
			"add esi, 2 \n\t"
			"add edi, 2 \n\t"
			"loop LL%= \n\t"
			"ZZ%=: \n\t"
			"EMMS \n\t" //Clear MMX state (precautionary; only SSE regs used here)
			".att_syntax \n\t"
			:
			: "m" (A), "m" (B), "m" (cnt), [cnt1] "r" (cnt1), [cnt2] "r" (cnt2)//: "m" (A), "m" (B), "m" (cnt), "m" (cnt1), "m" (cnt2)
			: "%eax", "%ecx", "%edi", "%esi"
		);
	}
	else
	{
		__asm
		(
			".intel_syntax noprefix \n\t" //Set up for loop
			"mov edi, [ebp+8] \n\t" //Address of A (hard-coded frame offset)
			"mov esi, [ebp+12] \n\t" //Address of B (hard-coded frame offset)
			"mov ecx, [ebp-20] \n\t" //Counter 1 (assumed location of cnt1)
			"jecxz Z%= \n\t"
			"L%=: \n\t"
			"movapd xmm0, [edi] \n\t" //Load 8 int16 from A (aligned)
			"paddw xmm0, [esi] \n\t" //A + B (wrap-around 16-bit add, memory operand needs alignment)
			"movapd [edi], xmm0 \n\t" //Store result back into A
			"add edi, 16 \n\t"
			"add esi, 16 \n\t"
			"loop L%= \n\t" //Loop if not done
			"Z%=: \n\t"
			"mov ecx, [ebp-16] \n\t" //Counter 2 (assumed location of cnt2)
			"jecxz ZZ%= \n\t"
			"mov eax, 0 \n\t"
			"LL%=: \n\t" //Finish off the remainder with scalar instructions
			"mov ax, [edi] \n\t"
			"add ax, [esi] \n\t"
			"mov [edi], ax \n\t"
			"add esi, 2 \n\t"
			"add edi, 2 \n\t"
			"loop LL%= \n\t"
			"ZZ%=: \n\t"
			"EMMS \n\t" //Clear MMX state (precautionary; only SSE regs used here)
			".att_syntax \n\t"
			:
			: "m" (A), "m" (B), "m" (cnt), "m" (cnt1), "m" (cnt2)
			: "%eax", "%ecx", "%edi", "%esi"
		);//end __asm
	}//end if
}
/*!
 * In-place element-wise 16-bit subtract: A[i] -= B[i] for i in [0, cnt).
 * SSE2 path subtracts 8 int16 per iteration (psubw, wrap-around
 * arithmetic), then a scalar loop finishes the remainder. Unaligned
 * branch uses movupd; aligned branch requires A and B 16-byte aligned.
 *
 * NOTE(review): all operands are read through hard-coded [ebp+N]/[ebp-N]
 * frame offsets — fragile; only valid for 32-bit x86 with a frame pointer
 * and this exact local layout. Confirm before changing locals or flags.
 */
void sse_sub(int16 *A, int16 *B, int32 cnt)
{
	int32 cnt1;	// number of 8-element SIMD iterations
	int32 cnt2;	// leftover elements processed by the scalar tail loop
	cnt1 = cnt / 8;
	cnt2 = (cnt - (8*cnt1));
	if(((int)A%16) || ((int)B%16)) // unaligned version
	{
		__asm
		(
			".intel_syntax noprefix \n\t" //Set up for loop
			"mov edi, [ebp+8] \n\t" //Address of A (hard-coded frame offset)
			"mov esi, [ebp+12] \n\t" //Address of B (hard-coded frame offset)
			"mov ecx, [ebp-20] \n\t" //Counter 1 (assumed location of cnt1)
			"jecxz Z%= \n\t" //Skip SIMD loop when fewer than 8 elements
			"L%=: \n\t"
			"movupd xmm0, [edi] \n\t" //Load 8 int16 from A (unaligned)
			"movupd xmm1, [esi] \n\t" //Load 8 int16 from B (unaligned)
			"psubw xmm0, xmm1 \n\t" //A - B (wrap-around 16-bit subtract)
			"movupd [edi], xmm0 \n\t" //Store result back into A
			"add edi, 16 \n\t"
			"add esi, 16 \n\t"
			"loop L%= \n\t" //Loop if not done
			"Z%=: \n\t"
			"mov ecx, [ebp-16] \n\t" //Counter 2 (assumed location of cnt2)
			"jecxz ZZ%= \n\t"
			"mov eax, 0 \n\t"
			"LL%=: \n\t" //Finish off the remainder with scalar instructions
			"mov ax, [edi] \n\t"
			"sub ax, [esi] \n\t" //Scalar 16-bit subtract
			"mov [edi], ax \n\t"
			"add esi, 2 \n\t"
			"add edi, 2 \n\t"
			"loop LL%= \n\t"
			"ZZ%=: \n\t"
			"EMMS \n\t" //Clear MMX state (precautionary; only SSE regs used here)
			".att_syntax \n\t"
			:
			: "m" (A), "m" (B), "m" (cnt), "m" (cnt1), "m" (cnt2)
			: "%eax", "%ecx", "%edi", "%esi"
		);
	}
	else
	{
		__asm
		(
			".intel_syntax noprefix \n\t" //Set up for loop
			"mov edi, [ebp+8] \n\t" //Address of A (hard-coded frame offset)
			"mov esi, [ebp+12] \n\t" //Address of B (hard-coded frame offset)
			"mov ecx, [ebp-20] \n\t" //Counter 1 (assumed location of cnt1)
			"jecxz Z%= \n\t"
			"L%=: \n\t"
			"movapd xmm0, [edi] \n\t" //Load 8 int16 from A (aligned)
			"psubw xmm0, [esi] \n\t" //A - B (memory operand needs 16-byte alignment)
			"movapd [edi], xmm0 \n\t" //Store result back into A
			"add edi, 16 \n\t"
			"add esi, 16 \n\t"
			"loop L%= \n\t" //Loop if not done
			"Z%=: \n\t"
			"mov ecx, [ebp-16] \n\t" //Counter 2 (assumed location of cnt2)
			"jecxz ZZ%= \n\t"
			"mov eax, 0 \n\t"
			"LL%=: \n\t" //Finish off the remainder with scalar instructions
			"mov ax, [edi] \n\t"
			"sub ax, [esi] \n\t"
			"mov [edi], ax \n\t"
			"add esi, 2 \n\t"
			"add edi, 2 \n\t"
			"loop LL%= \n\t"
			"ZZ%=: \n\t"
			"EMMS \n\t" //Clear MMX state (precautionary; only SSE regs used here)
			".att_syntax \n\t"
			:
			: "m" (A), "m" (B), "m" (cnt), "m" (cnt1), "m" (cnt2)
			: "%eax", "%ecx", "%edi", "%esi"
		);//end __asm
	}//end if
}
/*!
 * In-place element-wise 16-bit multiply: A[i] *= B[i] for i in [0, cnt),
 * keeping only the low 16 bits of each product (pmullw semantics; the
 * scalar tail's imul likewise stores only ax). Unaligned branch uses
 * movupd; aligned branch requires A and B 16-byte aligned.
 *
 * NOTE(review): all operands are read through hard-coded [ebp+N]/[ebp-N]
 * frame offsets — fragile; only valid for 32-bit x86 with a frame pointer
 * and this exact local layout. Confirm before changing locals or flags.
 */
void sse_mul(int16 *A, int16 *B, int32 cnt)
{
	int32 cnt1;	// number of 8-element SIMD iterations
	int32 cnt2;	// leftover elements processed by the scalar tail loop
	cnt1 = cnt / 8;
	cnt2 = (cnt - (8*cnt1));
	if(((int)A%16) || ((int)B%16)) // unaligned version
	{
		__asm
		(
			".intel_syntax noprefix \n\t" //Set up for loop
			"mov edi, [ebp+8] \n\t" //Address of A (hard-coded frame offset)
			"mov esi, [ebp+12] \n\t" //Address of B (hard-coded frame offset)
			"mov ecx, [ebp-20] \n\t" //Counter 1 (assumed location of cnt1)
			"jecxz Z%= \n\t" //Skip SIMD loop when fewer than 8 elements
			"L%=: \n\t"
			"movupd xmm0, [edi] \n\t" //Load 8 int16 from A (unaligned)
			"movupd xmm1, [esi] \n\t" //Load 8 int16 from B (unaligned)
			"pmullw xmm0, xmm1 \n\t" //A * B, low 16 bits of each product
			"movupd [edi], xmm0 \n\t" //Store result back into A
			"add edi, 16 \n\t"
			"add esi, 16 \n\t"
			"loop L%= \n\t" //Loop if not done
			"Z%=: \n\t"
			"mov ecx, [ebp-16] \n\t" //Counter 2 (assumed location of cnt2)
			"jecxz ZZ%= \n\t"
			"mov eax, 0 \n\t"
			"LL%=: \n\t" //Finish off the remainder with scalar instructions
			"mov ax, [edi] \n\t"
			"imul ax, [esi] \n\t" //Scalar multiply; only low 16 bits kept in ax
			"mov [edi], ax \n\t"
			"add esi, 2 \n\t"
			"add edi, 2 \n\t"
			"loop LL%= \n\t"
			"ZZ%=: \n\t"
			"EMMS \n\t" //Clear MMX state (precautionary; only SSE regs used here)
			".att_syntax \n\t"
			:
			: "m" (A), "m" (B), "m" (cnt), "m" (cnt1), "m" (cnt2)
			: "%eax", "%ecx", "%edi", "%esi"
		);
	}
	else
	{
		__asm
		(
			".intel_syntax noprefix \n\t" //Set up for loop
			"mov edi, [ebp+8] \n\t" //Address of A (hard-coded frame offset)
			"mov esi, [ebp+12] \n\t" //Address of B (hard-coded frame offset)
			"mov ecx, [ebp-20] \n\t" //Counter 1 (assumed location of cnt1)
			"jecxz Z%= \n\t"
			"L%=: \n\t"
			"movapd xmm0, [edi] \n\t" //Load 8 int16 from A (aligned)
			"pmullw xmm0, [esi] \n\t" //A * B, low 16 bits (memory operand needs alignment)
			"movapd [edi], xmm0 \n\t" //Store result back into A
			"add edi, 16 \n\t"
			"add esi, 16 \n\t"
			"loop L%= \n\t" //Loop if not done
			"Z%=: \n\t"
			"mov ecx, [ebp-16] \n\t" //Counter 2 (assumed location of cnt2)
			"jecxz ZZ%= \n\t"
			"mov eax, 0 \n\t"
			"LL%=: \n\t" //Finish off the remainder with scalar instructions
			"mov ax, [edi] \n\t"
			"imul ax, [esi] \n\t" //Scalar multiply; only low 16 bits kept in ax
			"mov [edi], ax \n\t"
			"add esi, 2 \n\t"
			"add edi, 2 \n\t"
			"loop LL%= \n\t"
			"ZZ%=: \n\t"
			"EMMS \n\t" //Clear MMX state (precautionary; only SSE regs used here)
			".att_syntax \n\t"
			:
			: "m" (A), "m" (B), "m" (cnt), "m" (cnt1), "m" (cnt2)
			: "%eax", "%ecx", "%edi", "%esi"
		);//end __asm
	}//end if
}
/*!
 * 16-bit integer dot product: returns sum over i of A[i]*B[i] as int32,
 * using pmaddwd to multiply-accumulate pairs of int16 into 32-bit lanes.
 * Unaligned path processes 24 elements/iteration (3 xmm regs, movupd);
 * aligned path processes 56 elements/iteration (7 xmm regs, movapd with
 * memory operands). After the SIMD loop, the 4 dword lanes of xmm0 are
 * summed horizontally into edx via movd/psrldq, then a scalar loop (with
 * movsx sign extension) folds in the leftover elements.
 *
 * NOTE(review): operands are addressed via hard-coded [ebp-40]/[ebp-36]
 * offsets and the result is written straight to [ebp-32] (assumed to be
 * `temp`) rather than through the "=m"(temp) constraint — only valid for
 * 32-bit x86, frame pointer, this exact local layout. Verify before any
 * change to locals or compiler flags.
 */
int32 sse_dot(int16 *A, int16 *B, int32 cnt)
{
	int32 cnt1;	// SIMD iterations (24 or 56 elements each, per branch)
	int32 cnt2;	// leftover elements for the scalar loop
	int32 temp;	// result cell; asm writes it via [ebp-32]
	if(((int32)A%16) || ((int32)B%16)) //If the memory locations are not 16 byte aligned use slower movupd instruction
	{
		cnt1 = cnt / 24;
		cnt2 = (cnt - (24*cnt1));
		__asm
		(
			".intel_syntax noprefix \n\t" //Set up for loop
			"mov edi, [ebp+8] \n\t" //Address of A
			"mov esi, [ebp+12] \n\t" //Address of B
			"mov ecx, [ebp-40] \n\t" //Counter 1 (assumed location of cnt1)
			"pxor xmm0, xmm0 \n\t" //Clear the running sum (accumulator)
			"jecxz Z%= \n\t"
			"L%=: \n\t"
			"movupd xmm1, [esi] \n\t" //Load from B (unaligned)
			"movupd xmm2, [esi+16] \n\t" //Load from B
			"movupd xmm3, [esi+32] \n\t" //Load from B
			"movupd xmm4, [edi] \n\t" //Load from A
			"movupd xmm5, [edi+16] \n\t" //Load from A
			"movupd xmm6, [edi+32] \n\t" //Load from A
			"pmaddwd xmm1, xmm4 \n\t" //Multiply and accumulate pairs into dwords
			"pmaddwd xmm2, xmm5 \n\t" //Multiply and accumulate
			"pmaddwd xmm3, xmm6 \n\t" //Multiply and accumulate
			"paddd xmm1, xmm3 \n\t" //Add into accumulator (efficiently)
			"paddd xmm0, xmm2 \n\t"
			"paddd xmm0, xmm1 \n\t"
			"add esi, 48 \n\t" //24 int16 = 48 bytes per iteration
			"add edi, 48 \n\t"
			"loop L%= \n\t" //Loop if not done
			"Z%=: \n\t" //Horizontal sum of the 4 dword lanes of xmm0 into edx
			"movd ebx, xmm0 \n\t" //lane 0 to ebx
			"psrldq xmm0, 4 \n\t" //shift next lane down
			"movd eax, xmm0 \n\t" //lane 1 into eax
			"add eax, ebx \n\t" //running sum now in eax
			"mov edx, eax \n\t" //move into edx
			"psrldq xmm0, 4 \n\t" //shift next lane down
			"movd ebx, xmm0 \n\t" //lane 2 to ebx
			"psrldq xmm0, 4 \n\t" //shift next lane down
			"movd eax, xmm0 \n\t" //lane 3 into eax
			"add eax, ebx \n\t" //running sum now in eax
			"add edx, eax \n\t" //add to edx
			"mov ecx, [ebp-36] \n\t" //Counter 2 (assumed location of cnt2)
			"jecxz ZZ%= \n\t"
			"LL%=: \n\t" //Really finish off loop with non SIMD instructions
			"mov bx, [edi] \n\t" //Move 16 bits into bx
			"movsx ebx, bx \n\t" //Sign extend to 32 bits
			"mov ax, [esi] \n\t" //Move 16 bits into ax
			"movsx eax, ax \n\t" //Sign extend to 32 bits
			"imul ebx, eax \n\t" //Multiply
			"add edx, ebx \n\t" //Add into accumulator
			"add esi, 2 \n\t"
			"add edi, 2 \n\t"
			"loop LL%= \n\t"
			"ZZ%=: \n\t"
			"EMMS \n\t" //Clear MMX state (precautionary; only SSE regs used here)
			"mov [ebp-32], edx \n\t" //Store result into temp (assumed frame slot)
			".att_syntax \n\t"
			: "=m"(temp)
			: "m" (A), "m" (B), "m" (cnt), "m" (cnt1), "m" (cnt2)
			: "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi"
		);//end __asm
	}
	else //use faster movapd instruction
	{
		cnt1 = cnt / 56;
		cnt2 = (cnt - (56*cnt1));
		__asm
		(
			//Set up for loop
			".intel_syntax noprefix \n\t" //Set up for loop
			"mov edi, [ebp+8] \n\t" //Address of A
			"mov esi, [ebp+12] \n\t" //Address of B
			"mov ecx, [ebp-40] \n\t" //Counter 1 (assumed location of cnt1)
			"pxor xmm0, xmm0 \n\t" //Clear the running sum (accumulator)
			"jecxz Z%= \n\t"
			"L%=: \n\t"
			"movapd xmm1, [esi] \n\t" //Load from B (aligned)
			"movapd xmm2, [esi+16] \n\t" //Load from B
			"movapd xmm3, [esi+32] \n\t" //Load from B
			"movapd xmm4, [esi+48] \n\t" //Load from B
			"movapd xmm5, [esi+64] \n\t" //Load from B
			"movapd xmm6, [esi+80] \n\t" //Load from B
			"movapd xmm7, [esi+96] \n\t" //Load from B
			"pmaddwd xmm1, [edi] \n\t" //Multiply-accumulate against A (aligned memory operands)
			"pmaddwd xmm2, [edi+16] \n\t"
			"pmaddwd xmm3, [edi+32] \n\t"
			"pmaddwd xmm4, [edi+48] \n\t"
			"pmaddwd xmm5, [edi+64] \n\t"
			"pmaddwd xmm6, [edi+80] \n\t"
			"pmaddwd xmm7, [edi+96] \n\t"
			"paddd xmm0, xmm7 \n\t" //Reduction tree into xmm0
			"paddd xmm1, xmm2 \n\t"
			"paddd xmm3, xmm4 \n\t"
			"paddd xmm5, xmm6 \n\t"
			"paddd xmm1, xmm3 \n\t"
			"paddd xmm0, xmm5 \n\t"
			"paddd xmm0, xmm1 \n\t"
			"add esi, 112 \n\t" //56 int16 = 112 bytes per iteration
			"add edi, 112 \n\t"
			"loop L%= \n\t" // Loop if not done
			"Z%=: \n\t" // Horizontal sum of the 4 dword lanes of xmm0 into edx
			"movd ebx, xmm0 \n\t" // lane 0 to ebx
			"psrldq xmm0, 4 \n\t" // shift next lane down
			"movd eax, xmm0 \n\t" // lane 1 into eax
			"add eax, ebx \n\t" // running sum now in eax
			"mov edx, eax \n\t" // move into temp
			"psrldq xmm0, 4 \n\t"
			"movd ebx, xmm0 \n\t" // lane 2 to ebx
			"psrldq xmm0, 4 \n\t" // shift next lane down
			"movd eax, xmm0 \n\t" // lane 3 into eax
			"add eax, ebx \n\t" // running sum now in eax
			"add edx, eax \n\t" // add into temp
			"mov ecx, [ebp-36] \n\t" //Counter 2 (assumed location of cnt2)
			"jecxz ZZ%= \n\t"
			"LL%=: \n\t" //Really finish off loop with non SIMD instructions
			"mov bx, [edi] \n\t" //Move 16 bits into bx
			"movsx ebx, bx \n\t" //Sign extend to 32 bits
			"mov ax, [esi] \n\t" //Move 16 bits into ax
			"movsx eax, ax \n\t" //Sign extend to 32 bits
			"imul ebx, eax \n\t" //Multiply
			"add edx, ebx \n\t" //Add into accumulator
			"add esi, 2 \n\t"
			"add edi, 2 \n\t"
			"loop LL%= \n\t"
			"ZZ%=: \n\t"
			"EMMS \n\t" //Clear MMX state (precautionary; only SSE regs used here)
			"mov [ebp-32], edx \n\t" //Store result into temp (assumed frame slot)
			".att_syntax \n\t"
			: "=m"(temp)
			: "m" (A), "m" (B), "m" (cnt), "m" (cnt1), "m" (cnt2)
			: "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi"
		);
	}
	return(temp);
}
/*!
 * In-place complex conjugate of cnt CPX values (int16 I/Q pairs):
 * multiplies each 32-bit element by the word pattern {+1, -1} so the
 * imaginary part is negated. SSE loop handles 4 CPX (16 bytes) per
 * iteration via xmm7; MMX tail handles one CPX at a time via mm7.
 *
 * NOTE(review): temp/counters are addressed via hard-coded [ebp-N]
 * offsets — only valid for 32-bit x86 with a frame pointer and this
 * exact local layout. Also note the clobber list names %edx but not the
 * MMX/XMM registers actually used; EMMS at the end clears MMX state.
 */
void sse_conj(CPX *A, int32 cnt)
{
	int32 cnt1;	// 4-element SIMD iterations
	int32 cnt2;	// leftover elements for the MMX loop
	int32 temp = 0xffff0001; //[1, -1]: low word = +1, high word = -1
	cnt1 = cnt/4;
	cnt2 = cnt-4*cnt1;
	__asm
	(
		".intel_syntax noprefix \n\t" //Set up for loop
		"mov edi, [ebp+8] \n\t" //Address of A source1
		"mov ecx, [ebp-24] \n\t" //Counter (assumed location of cnt1)
		"movd mm7, [ebp-16] \n\t" //mm7 = {1,-1} multiplier (from temp)
		"punpckldq mm7, mm7 \n\t" //replicate to both CPX slots of mm7
		"movd xmm7, [ebp-16] \n\t" //xmm7 = {1,-1} multiplier (from temp)
		"punpckldq xmm7, xmm7 \n\t" //replicate across...
		"punpckldq xmm7, xmm7 \n\t" //...all four CPX slots of xmm7
		"jecxz Z%= \n\t"
		"L%=: \n\t"
		" movupd xmm0, [edi] \n\t" //Load 4 CPX from A
		" pmullw xmm0, xmm7 \n\t" //Multiply by {1,-1} to negate the imaginary words
		" movupd [edi], xmm0 \n\t" //Move into A
		" add edi, 16 \n\t" //Move in array
		"loop L%= \n\t" //Loop if not done
		"Z%=: \n\t"
		"mov ecx, [ebp-20] \n\t" //Counter 2 (assumed location of cnt2)
		"jecxz ZZ%= \n\t"
		"LL%=: \n\t" //One CPX per iteration with MMX
		" movd mm0, [edi] \n\t" //Load one CPX from A
		" pmullw mm0, mm7 \n\t" //Multiply by {1,-1} to negate the imaginary word
		" movd [edi], mm0 \n\t" //Move into A
		" add edi, 4 \n\t"
		"loop LL%= \n\t"
		"ZZ%=: \n\t"
		"EMMS \n\t" //Required: exit MMX state after using mm registers
		".att_syntax \n\t"
		:
		: "m" (A), "m" (cnt), "m" (cnt1), "m" (cnt2), "m" (temp)
		: "%ecx", "%edx", "%edi"
	);
}
/*!
 * In-place complex multiply: A[i] *= B[i] for cnt CPX (int16 I/Q)
 * elements, no rounding or shift. Each product is formed with
 * pmaddwd after shuffling B into [Re Im Im Re] order and applying the
 * sign mask M so the result dwords are (ReA*ReB - ImA*ImB) and
 * (ReA*ImB + ImA*ReB); packssdw saturates back to int16.
 *
 * NOTE(review): cnt1/cnt2 are computed and then immediately overridden
 * to 0/cnt, so the 4-wide SIMD loop never runs and every element goes
 * through the per-element loop — this looks like a deliberate
 * workaround (the 4-wide path writes only 8 of 16 result bytes per
 * iteration). Do NOT remove the dead stores: the asm reads locals via
 * fixed [ebp-N] offsets, so changing the locals shifts the layout.
 */
void sse_cmul(CPX *A, CPX *B, int32 cnt)
{
	int32 cnt1;	// SIMD iterations (forced to 0 below)
	int32 cnt2;	// per-element iterations (forced to cnt below)
	volatile int32 M[4] = {0xffff0001, 0x00010001, 0xffff0001, 0x00010001}; //word pattern {1,-1,1,1,1,-1,1,1}
	cnt1 = cnt/4;
	cnt2 = cnt-4*cnt1;
	cnt1 = 0;	//override: skip the 4-wide loop (see note above)
	cnt2 = cnt;	//override: process everything in the per-element loop
	__asm
	(
		".intel_syntax noprefix \n\t" //Set up for loop
		"mov edi, [ebp+8] \n\t" //Address of A
		"mov esi, [ebp+12] \n\t" //Address of B
		"mov ecx, [ebp-52] \n\t" //Counter 1 (assumed location of cnt1; always 0 here)
		"movupd xmm7,[ebp-44] \n\t" //xmm7 = sign mask M = {1,-1,1,1,1,-1,1,1}
		"jecxz Z%= \n\t"
		"L%=: \n\t" //4 CPX per iteration (currently disabled via cnt1 = 0)
		"movlpd xmm0, [edi] \n\t" //Copy 2 CPX from A
		"movlpd xmm1, [edi+8] \n\t" //Copy next 2 CPX from A
		"movlpd xmm3, [esi] \n\t" //Copy 2 CPX from B
		"movlpd xmm4, [esi+8] \n\t" //Copy next 2 CPX from B
		"punpckldq xmm0, xmm0 \n\t" //Copy low 32 bits to high 32 bits
		"punpckldq xmm1, xmm1 \n\t" //Copy low 32 bits to high 32 bits
		"punpckldq xmm3, xmm3 \n\t" //Copy low 32 bits to high 32 bits
		"punpckldq xmm4, xmm4 \n\t" //Copy low 32 bits to high 32 bits
		"pshuflw xmm3, xmm3, 0x14 \n\t" //Shuffle Low 64 bits to get [Re Im Im Re]
		"pshuflw xmm4, xmm4, 0x14 \n\t" //Shuffle Low 64 bits to get [Re Im Im Re]
		"pshufhw xmm3, xmm3, 0x14 \n\t" //Shuffle High 64 bits to get [Re Im Im Re]
		"pshufhw xmm4, xmm4, 0x14 \n\t" //Shuffle High 64 bits to get [Re Im Im Re]
		"pmullw xmm3, xmm7 \n\t" //Apply sign mask to get [Re Im -Im Re]
		"pmullw xmm4, xmm7 \n\t" //Apply sign mask to get [Re Im -Im Re]
		"pmaddwd xmm0, xmm3 \n\t" //Complex multiply and add
		"pmaddwd xmm1, xmm4 \n\t" //Complex multiply and add
		"packssdw xmm0, xmm0 \n\t" //Saturate dwords back to int16 in low 64 bits
		"packssdw xmm1, xmm1 \n\t" //Saturate dwords back to int16 in low 64 bits
		"movsd [edi], xmm0 \n\t" //Move into A
		"movsd [edi+8], xmm1 \n\t" //Move into A
		"add edi, 16 \n\t" //Move in array
		"add esi, 16 \n\t" //Move in array
		"loop L%= \n\t" // Loop if not done
		"Z%=: \n\t"
		"mov ecx, [ebp-48] \n\t" //Counter 2 (assumed location of cnt2; = cnt here)
		"jecxz ZZ%= \n\t"
		"LL%=: \n\t" //One CPX per iteration
		"movlpd xmm0, [edi] \n\t" //Copy from A
		"movlpd xmm1, [esi] \n\t" //Copy from B
		"punpckldq xmm0, xmm0 \n\t" //Copy low 32 bits to high 32 bits
		"punpckldq xmm1, xmm1 \n\t" //Copy low 32 bits to high 32 bits
		"pshuflw xmm1, xmm1, 0x14\n\t" //Shuffle Low 64 bits to get [Re Im Im Re]
		"pmullw xmm1, xmm7 \n\t" //Apply sign mask to get [Re Im -Im Re]
		"pmaddwd xmm0, xmm1 \n\t" //Complex multiply and add
		"packssdw xmm0, xmm0 \n\t" //Saturate back to int16 in low 32 bits
		"movd [edi], xmm0 \n\t" //Move one CPX result into A
		"add edi, 4 \n\t"
		"add esi, 4 \n\t"
		"loop LL%= \n\t"
		"ZZ%=: \n\t"
		"EMMS \n\t" //Clear MMX state (precautionary; only SSE regs used here)
		".att_syntax \n\t"
		:
		: "m" (A), "m" (B), "m" (cnt), "m" (cnt1), "m" (cnt2)
		: "%ecx", "%edi", "%esi"
	);
}
/*!
 * In-place complex multiply with rounding and arithmetic shift:
 * A[i] = (A[i] * B[i] + 2^(shift-1)) >> shift, for cnt CPX elements.
 * Same pmaddwd-based complex product as sse_cmul, then adds the round
 * constant (broadcast in xmm5) and shifts each dword right by `shift`
 * (count held in xmm6) before saturating back to int16.
 *
 * NOTE(review): operands are addressed via hard-coded [ebp+20]/[ebp-N]
 * offsets (shift is the 4th stack argument; M, round, cnt1, cnt2 are
 * assumed frame slots) — only valid for 32-bit x86 with a frame pointer
 * and this exact layout. NOTE(review): shift == 0 makes
 * `round = 1 << (shift-1)` shift by -1 (undefined behavior) — callers
 * presumably always pass shift >= 1; confirm.
 */
void sse_cmuls(CPX *A, CPX *B, int32 cnt, int32 shift)
{
	int32 cnt1;	// 4-wide SIMD iterations
	int32 cnt2;	// leftover elements for the per-element loop
	int32 round;	// 2^(shift-1), added before shifting to round-to-nearest
	volatile int32 M[4] = {0xffff0001, 0x00010001, 0xffff0001, 0x00010001}; //word pattern {1,-1,1,1,1,-1,1,1}
	cnt1 = cnt/4;
	cnt2 = cnt-4*cnt1;
	round = 1 << (shift-1);
	__asm
	(
		".intel_syntax noprefix \n\t" //Set up for loop
		"mov edi, [ebp+8] \n\t" //Address of A
		"mov esi, [ebp+12] \n\t" //Address of B
		"mov ecx, [ebp-56] \n\t" //Counter 1 (assumed location of cnt1)
		"movupd xmm7,[ebp-44] \n\t" //xmm7 = sign mask M = {1,-1,1,1,1,-1,1,1}
		"movss xmm6, [ebp+20] \n\t" //xmm6 = shift count (4th stack argument)
		"movss xmm5, [ebp-48] \n\t" //xmm5 = round constant 2^(shift-1)
		"punpckldq xmm5, xmm5 \n\t" //broadcast round constant...
		"punpcklqdq xmm5, xmm5 \n\t" //...to all four dword lanes
		"jecxz Z%= \n\t"
		"L%=: \n\t" //4 CPX per iteration
		"movlpd xmm0, [edi] \n\t" //Copy 2 CPX from A
		"movlpd xmm1, [edi+8] \n\t" //Copy next 2 CPX from A
		"movlpd xmm3, [esi] \n\t" //Copy 2 CPX from B
		"movlpd xmm4, [esi+8] \n\t" //Copy next 2 CPX from B
		"punpckldq xmm0, xmm0 \n\t" //Copy low 32 bits to high 32 bits
		"punpckldq xmm1, xmm1 \n\t" //Copy low 32 bits to high 32 bits
		"punpckldq xmm3, xmm3 \n\t" //Copy low 32 bits to high 32 bits
		"punpckldq xmm4, xmm4 \n\t" //Copy low 32 bits to high 32 bits
		"pshuflw xmm3, xmm3, 0x14 \n\t" //Shuffle Low 64 bits to get [Re Im Im Re]
		"pshuflw xmm4, xmm4, 0x14 \n\t" //Shuffle Low 64 bits to get [Re Im Im Re]
		"pshufhw xmm3, xmm3, 0x14 \n\t" //Shuffle High 64 bits to get [Re Im Im Re]
		"pshufhw xmm4, xmm4, 0x14 \n\t" //Shuffle High 64 bits to get [Re Im Im Re]
		"pmullw xmm3, xmm7 \n\t" //Apply sign mask to get [Re Im -Im Re]
		"pmullw xmm4, xmm7 \n\t" //Apply sign mask to get [Re Im -Im Re]
		"pmaddwd xmm0, xmm3 \n\t" //Complex multiply and add
		"pmaddwd xmm1, xmm4 \n\t" //Complex multiply and add
		"paddd xmm0, xmm5 \n\t" //Add in 2^(shift-1)
		"paddd xmm1, xmm5 \n\t" //Add in 2^(shift-1)
		"psrad xmm0, xmm6 \n\t" //Arithmetic shift right by `shift` bits
		"psrad xmm1, xmm6 \n\t" //Arithmetic shift right by `shift` bits
		"packssdw xmm0, xmm0 \n\t" //Saturate dwords back to int16 in low 64 bits
		"packssdw xmm1, xmm1 \n\t" //Saturate dwords back to int16 in low 64 bits
		"movlpd [edi], xmm0 \n\t" //Move into A
		"movlpd [edi+8], xmm1 \n\t" //Move into A
		"add edi, 16 \n\t" //Move in array
		"add esi, 16 \n\t" //Move in array
		"loop L%= \n\t" //Loop if not done
		"Z%=: \n\t"
		"mov ecx, [ebp-52] \n\t" //Counter 2 (assumed location of cnt2)
		"jecxz ZZ%= \n\t"
		"LL%=: \n\t" //One CPX per iteration
		"movlpd xmm0, [edi] \n\t" //Copy from A
		"movlpd xmm1, [esi] \n\t" //Copy from B
		"punpckldq xmm0, xmm0 \n\t" //Copy low 32 bits to high 32 bits
		"punpckldq xmm1, xmm1 \n\t" //Copy low 32 bits to high 32 bits
		"pshuflw xmm1, xmm1, 0x14\n\t" //Shuffle Low 64 bits to get [Re Im Im Re]
		"pmullw xmm1, xmm7 \n\t" //Apply sign mask to get [Re Im -Im Re]
		"pmaddwd xmm0, xmm1 \n\t" //Complex multiply and add
		"paddd xmm0, xmm5 \n\t" //Add in 2^(shift-1)
		"psrad xmm0, xmm6 \n\t" //Arithmetic shift right by `shift` bits
		"packssdw xmm0, xmm0 \n\t" //Saturate back to int16 in low 32 bits
		"movd [edi], xmm0 \n\t" //Move one CPX result into A
		"add edi, 4 \n\t"
		"add esi, 4 \n\t"
		"loop LL%= \n\t"
		"ZZ%=: \n\t"
		"EMMS \n\t" //Clear MMX state (precautionary; only SSE regs used here)
		".att_syntax \n\t"
		:
		: "m" (A), "m" (B), "m" (cnt), "m" (cnt1), "m" (cnt2), "m" (shift), "m" (round)
		: "%ecx", "%edi", "%esi"
	);
}
/*!
 * Complex multiply with rounding/shift, result to a third array:
 * C[i] = (A[i] * B[i] + 2^(shift-1)) >> shift, for cnt CPX elements.
 * Identical arithmetic to sse_cmuls, but A and B are left untouched and
 * results are written through eax (= C).
 *
 * NOTE(review): operands are addressed via hard-coded [ebp+N]/[ebp-N]
 * offsets (C at [ebp+16], shift at [ebp+24]; M, round, cnt1, cnt2 are
 * assumed frame slots) — only valid for 32-bit x86 with a frame pointer
 * and this exact layout. NOTE(review): shift == 0 makes
 * `round = 1 << (shift-1)` shift by -1 (undefined behavior) — callers
 * presumably always pass shift >= 1; confirm.
 */
void sse_cmulsc(CPX *A, CPX *B, CPX *C, int32 cnt, int32 shift)
{
	int32 cnt1;	// 4-wide SIMD iterations
	int32 cnt2;	// leftover elements for the per-element loop
	int32 round;	// 2^(shift-1), added before shifting to round-to-nearest
	volatile int32 M[4] = {0xffff0001, 0x00010001, 0xffff0001, 0x00010001}; //word pattern {1,-1,1,1,1,-1,1,1}
	cnt1 = cnt/4;
	cnt2 = cnt-4*cnt1;
	round = 1 << (shift-1);
	__asm
	(
		".intel_syntax noprefix \n\t" //Set up for loop
		"mov edi, [ebp+8] \n\t" //Address of A
		"mov esi, [ebp+12] \n\t" //Address of B
		"mov eax, [ebp+16] \n\t" //Address of C (destination)
		"mov ecx, [ebp-56] \n\t" //Counter 1 (assumed location of cnt1)
		"movupd xmm7,[ebp-44] \n\t" //xmm7 = sign mask M = {1,-1,1,1,1,-1,1,1}
		"movss xmm6, [ebp+24] \n\t" //xmm6 = shift count (5th stack argument)
		"movss xmm5, [ebp-48] \n\t" //xmm5 = round constant 2^(shift-1)
		"punpckldq xmm5, xmm5 \n\t" //broadcast round constant...
		"punpcklqdq xmm5, xmm5 \n\t" //...to all four dword lanes
		"jecxz Z%= \n\t"
		"L%=: \n\t" //4 CPX per iteration
		"movlpd xmm0, [edi] \n\t" //Copy 2 CPX from A
		"movlpd xmm1, [edi+8] \n\t" //Copy next 2 CPX from A
		"movlpd xmm3, [esi] \n\t" //Copy 2 CPX from B
		"movlpd xmm4, [esi+8] \n\t" //Copy next 2 CPX from B
		"punpckldq xmm0, xmm0 \n\t" //Copy low 32 bits to high 32 bits
		"punpckldq xmm1, xmm1 \n\t" //Copy low 32 bits to high 32 bits
		"punpckldq xmm3, xmm3 \n\t" //Copy low 32 bits to high 32 bits
		"punpckldq xmm4, xmm4 \n\t" //Copy low 32 bits to high 32 bits
		"pshuflw xmm3, xmm3, 0x14 \n\t" //Shuffle Low 64 bits to get [Re Im Im Re]
		"pshuflw xmm4, xmm4, 0x14 \n\t" //Shuffle Low 64 bits to get [Re Im Im Re]
		"pshufhw xmm3, xmm3, 0x14 \n\t" //Shuffle High 64 bits to get [Re Im Im Re]
		"pshufhw xmm4, xmm4, 0x14 \n\t" //Shuffle High 64 bits to get [Re Im Im Re]
		"pmullw xmm3, xmm7 \n\t" //Apply sign mask to get [Re Im -Im Re]
		"pmullw xmm4, xmm7 \n\t" //Apply sign mask to get [Re Im -Im Re]
		"pmaddwd xmm0, xmm3 \n\t" //Complex multiply and add
		"pmaddwd xmm1, xmm4 \n\t" //Complex multiply and add
		"paddd xmm0, xmm5 \n\t" //Add in 2^(shift-1)
		"paddd xmm1, xmm5 \n\t" //Add in 2^(shift-1)
		"psrad xmm0, xmm6 \n\t" //Arithmetic shift right by `shift` bits
		"psrad xmm1, xmm6 \n\t" //Arithmetic shift right by `shift` bits
		"packssdw xmm0, xmm0 \n\t" //Saturate dwords back to int16 in low 64 bits
		"packssdw xmm1, xmm1 \n\t" //Saturate dwords back to int16 in low 64 bits
		"movlpd [eax], xmm0 \n\t" //Move into C
		"movlpd [eax+8], xmm1 \n\t" //Move into C
		"add edi, 16 \n\t" //Move in array
		"add esi, 16 \n\t" //Move in array
		"add eax, 16 \n\t"
		"loop L%= \n\t" //Loop if not done
		"Z%=: \n\t"
		"mov ecx, [ebp-52] \n\t" //Counter 2 (assumed location of cnt2)
		"jecxz ZZ%= \n\t"
		"LL%=: \n\t" //One CPX per iteration
		"movlpd xmm0, [edi] \n\t" //Copy from A
		"movlpd xmm1, [esi] \n\t" //Copy from B
		"punpckldq xmm0, xmm0 \n\t" //Copy low 32 bits to high 32 bits
		"punpckldq xmm1, xmm1 \n\t" //Copy low 32 bits to high 32 bits
		"pshuflw xmm1, xmm1, 0x14\n\t" //Shuffle Low 64 bits to get [Re Im Im Re]
		"pmullw xmm1, xmm7 \n\t" //Apply sign mask to get [Re Im -Im Re]
		"pmaddwd xmm0, xmm1 \n\t" //Complex multiply and add
		"paddd xmm0, xmm5 \n\t" //Add in 2^(shift-1)
		"psrad xmm0, xmm6 \n\t" //Arithmetic shift right by `shift` bits
		"packssdw xmm0, xmm0 \n\t" //Saturate back to int16 in low 32 bits
		"movd [eax], xmm0 \n\t" //Move one CPX result into C
		"add edi, 4 \n\t"
		"add esi, 4 \n\t"
		"add eax, 4 \n\t"
		"loop LL%= \n\t"
		"ZZ%=: \n\t"
		"EMMS \n\t" //Clear MMX state (precautionary; only SSE regs used here)
		".att_syntax \n\t"
		:
		: "m" (A), "m" (B), "m" (C), "m" (cnt), "m" (cnt1), "m" (cnt2), "m" (shift), "m" (round)
		: "%eax", "%ecx", "%edi", "%esi"
	);
}
  659. void sse_cacc(CPX *A, MIX *B, int32 cnt, int32 *iaccum, int32 *baccum)
  660. {
  661. int32 cnt1;
  662. int32 cnt2;
  663. if(((int)A%16) || ((int)B%16))
  664. {
  665. cnt1 = cnt / 6;
  666. cnt2 = (cnt - (6*cnt1));
  667. __asm
  668. (
  669. ".intel_syntax noprefix \n\t" //Set up for loop
  670. "mov edi, [ebp+8] \n\t" //Address of A
  671. "mov esi, [ebp+12] \n\t" //Address of B
  672. "mov ecx, [ebp-20] \n\t" //Counter 1
  673. "pxor xmm0, xmm0 \n\t" //Clear the running sum
  674. "pxor mm0, mm0 \n\t" //Clear the running sum
  675. "jecxz Z%= \n\t"
  676. "L%=: \n\t"
  677. " movlpd xmm1, [edi] \n\t" //load IF data
  678. " movlpd xmm2, [edi+8] \n\t" //load IF data
  679. " movlpd xmm3, [edi+16] \n\t" //load IF data
  680. " movupd xmm4, [esi] \n\t" //load Sine data
  681. " movupd xmm5, [esi+16] \n\t" //load Sine data
  682. " movupd xmm6, [esi+32] \n\t" //load Sine data
  683. " punpckldq xmm1, xmm1 \n\t" //copies bits 0..31 to 32..63 and bits 32..63 to 64..95 and 65..127
  684. " punpckldq xmm2, xmm2 \n\t" //copies bits 0..63 to 64..127
  685. " punpckldq xmm3, xmm3 \n\t" //copies bits 0..63 to 64..127
  686. " pmaddwd xmm1, xmm4 \n\t" //multiply and add, result in xmm1
  687. " pmaddwd xmm2, xmm5 \n\t" //multiply and add, result in xmm2
  688. " pmaddwd xmm3, xmm6 \n\t" //multiply and add, result in xmm3
  689. " paddd xmm0, xmm3 \n\t" //Add into accumulator (efficiently)
  690. " paddd xmm1, xmm2 \n\t"
  691. " paddd xmm0, xmm1 \n\t"
  692. " add edi, 24 \n\t" //move in complex sine by 24 bytes
  693. " add esi, 48 \n\t" //move in IF array by 48 bytes
  694. "loop L%= \n\t" //Loop if not done
  695. "Z%=: \n\t"
  696. "mov ecx, [ebp-16] \n\t"
  697. "jecxz ZZ%= \n\t"
  698. "LL%=: \n\t"
  699. " movd mm1, [edi] \n\t" //load IF data
  700. " movq mm2, [esi] \n\t"
  701. " punpckldq mm1, mm1 \n\t" //copy bottom 32 bits of IF data into high 32 bits
  702. " pmaddwd mm1, mm2 \n\t" //perform mmx complex multiply
  703. " paddd mm0, mm1 \n\t" //add into accumulator
  704. " add edi, 4 \n\t" //move in complex sine by 4 bytes
  705. " add esi, 8 \n\t" //move in IF array by 8 bytes
  706. "loop LL%= \n\t"
  707. "ZZ%=: \n\t"
  708. "movdq2q mm1, xmm0 \n\t"
  709. "punpckhqdq xmm0, xmm0 \n\t" //move bits 64..127 of xmm0 into 0..63 of xmm0
  710. "movdq2q mm2, xmm0 \n\t"
  711. "paddd mm0, mm1 \n\t" //add together
  712. "paddd mm0, mm2 \n\t" //add" punpckldq xmm1, xmm1 \n\t" //copies bits 0..31 to 32..63 and bits 32..63 to 64..95 and 65..127 together
  713. "mov eax, [ebp+20] \n\t"
  714. "movd [eax], mm0 \n\t"
  715. "punpckhdq mm0, mm0 \n\t"
  716. "mov eax, [ebp+24] \n\t"
  717. "movd [eax], mm0 \n\t"
  718. "EMMS \n\t"
  719. ".att_syntax \n\t"
  720. :
  721. : "m" (A), "m" (B), "m" (cnt), "m" (iaccum), "m" (baccum), "m" (cnt1), "m" (cnt2)
  722. : "%eax", "%ecx", "%edi", "%esi"
  723. );//end __asm
  724. }
  725. else
  726. {
  727. cnt1 = cnt / 12;
  728. cnt2 = (cnt - (12*cnt1));
  729. __asm
  730. (
  731. ".intel_syntax noprefix \n\t" //Set up for loop
  732. "mov edi, [ebp+8] \n\t" //Address of A
  733. "mov esi, [ebp+12] \n\t" //Address of B
  734. "mov ecx, [ebp-20] \n\t" //Counter 1
  735. "pxor xmm0, xmm0 \n\t" //Clear the running sum
  736. "pxor mm0, mm0 \n\t" //Clear the running sum
  737. "jecxz AZ%= \n\t"
  738. "AL%=: \n\t"
  739. " movlpd xmm1, [edi] \n\t" //load IF data
  740. " movlpd xmm2, [edi+8] \n\t" //load IF data
  741. " movlpd xmm3, [edi+16] \n\t" //load IF data
  742. " movlpd xmm4, [edi+24] \n\t" //load IF data
  743. " movlpd xmm5, [edi+32] \n\t" //load IF data
  744. " movlpd xmm6, [edi+40] \n\t" //load IF data
  745. " punpckldq xmm1, xmm1 \n\t" //copies bits 0..31 to 32..63 and bits 32..63 to 64..95 and 65..127
  746. " punpckldq xmm2, xmm2 \n\t" //copies bits 0..63 to 64..127
  747. " punpckldq xmm3, xmm3 \n\t" //copies bits 0..63 to 64..127
  748. " punpckldq xmm4, xmm4 \n\t" //copies bits 0..63 to 64..127
  749. " punpckldq xmm5, xmm5 \n\t" //copies bits 0..63 to 64..127
  750. " punpckldq xmm6, xmm6 \n\t" //copies bits 0..63 to 64..127
  751. " pmaddwd xmm1, [esi] \n\t" //multiply and add, result in xmm1
  752. " pmaddwd xmm2, [esi+16] \n\t" //multiply and add, result in xmm2
  753. " pmaddwd xmm3, [esi+32] \n\t" //multiply and add, result in xmm3
  754. " pmaddwd xmm4, [esi+48] \n\t" //multiply and add, result in xmm4
  755. " pmaddwd xmm5, [esi+64] \n\t" //multiply and add, result in xmm5
  756. " pmaddwd xmm6, [esi+80] \n\t" //multiply and add, result in xmm6
  757. " paddd xmm1, xmm2 \n\t" //Add into accumulator (efficiently)
  758. " paddd xmm3, xmm4 \n\t"
  759. " paddd xmm5, xmm6 \n\t"
  760. " paddd xmm1, xmm3 \n\t"
  761. " paddd xmm0, xmm5 \n\t"
  762. " paddd xmm0, xmm1 \n\t"
  763. " add edi, 48 \n\t" //move in complex sine by 56 bytes
  764. " add esi, 96 \n\t" //move in IF array by 112 bytes
  765. "loop AL%= \n\t" // Loop if not done
  766. "AZ%=: \n\t"
  767. "mov ecx, [ebp-16] \n\t"
  768. "jecxz AZZ%= \n\t"
  769. "ALL%=: \n\t"
  770. " movq mm1, [edi] \n\t" //load IF data
  771. " punpckldq mm1, mm1 \n\t" //copy bottom 32 bits of IF data into high 32 bits
  772. " pmaddwd mm1, [esi] \n\t" //perform mmx complex multiply
  773. " paddd mm0, mm1 \n\t" //add into accumulator
  774. " add edi, 4 \n\t" //move in complex sine by 4 bytes
  775. " add esi, 8 \n\t" //move in IF array by 8 bytes
  776. "loop ALL%= \n\t"
  777. "AZZ%=: \n\t"
  778. "movdq2q mm1, xmm0 \n\t"
  779. "punpckhqdq xmm0, xmm0 \n\t" //move bits 64..127 of xmm0 into 0..63 of xmm0
  780. "movdq2q mm2, xmm0 \n\t"
  781. "paddd mm0, mm1 \n\t" //add together
  782. "paddd mm0, mm2 \n\t" //add together
  783. "mov eax, [ebp+20] \n\t"
  784. "movd [eax], mm0 \n\t"
  785. "punpckhdq mm0, mm0 \n\t"
  786. "mov eax, [ebp+24] \n\t"
  787. "movd [eax], mm0 \n\t"
  788. "EMMS \n\t"
  789. ".att_syntax \n\t"
  790. :
  791. : "m" (A), "m" (B), "m" (cnt), "m" (iaccum), "m" (baccum), "m" (cnt1), "m" (cnt2)
  792. : "%eax", "%ecx", "%edi", "%esi"
  793. );//end __asm
  794. }//end if
  795. }
  796. //!< A must hold baseband data, E,P,L must hold PRN data
void sse_prn_accum(CPX *A, CPX *E, CPX *P, CPX *L, int32 cnt, CPX *accum)
{
	/* Correlate the baseband samples in A against the early (E), prompt (P)
	 * and late (L) PRN replicas using MMX, writing the three complex sums to
	 * accum[0..2] as saturated 16-bit I/Q pairs.
	 * NOTE(review): the PRN arrays are used as 16-bit select masks with
	 * pand/pandn - each word is presumably 0x0000 (sample added) or 0xFFFF
	 * (sample subtracted); confirm against the PRN generation code. */
	int32 cnt1;
	int32 cnt2;
	cnt1 = cnt / 2;          //iterations of the 2-samples-per-pass loop
	cnt2 = (cnt - (2*cnt1)); //leftover samples for the 1-sample loop
	cnt1 = 0;                //NOTE(review): the unrolled loop is disabled here;
	cnt2 = cnt;              //all samples run through the 1-sample loop - confirm this is intentional
	/* NOTE(review): the asm reads the arguments via fixed [ebp+8..28] offsets
	 * and the locals via [ebp-20]/[ebp-16]; this assumes a frame pointer and
	 * an exact stack layout, so it breaks under optimization or a different
	 * compiler - verify build flags. MMX regs and written memory are not in
	 * the clobber list either. */
	__asm
	(
	".intel_syntax noprefix \n\t" //Set up for loop
	"mov esi, [ebp+8] \n\t" //Address of A (baseband data)
	"mov eax, [ebp+12] \n\t" //Address of E (early PRN mask)
	"mov ebx, [ebp+16] \n\t" //Address of P (prompt PRN mask)
	"mov edx, [ebp+20] \n\t" //Address of L (late PRN mask)
	"mov ecx, [ebp-20] \n\t" //Counter 1 (cnt1, currently always 0)
	"pxor mm5, mm5 \n\t" //Clear the running sum for E
	"pxor mm6, mm6 \n\t" //Clear the running sum for P
	"pxor mm7, mm7 \n\t" //Clear the running sum for L
	"jecxz Z%= \n\t" //Skip the 2-sample loop when cnt1 == 0
	"L%=: \n\t"
	" movq mm0, [esi] \n\t" //load 2 IF samples (4 x int16 I,Q,I,Q)
	" movq mm1, [eax] \n\t" //load E mask
	" movq mm2, [ebx] \n\t" //load P mask
	" movq mm3, [edx] \n\t" //load L mask
	" movq mm4, mm0 \n\t" //keep an unmasked copy of the IF data
	" pand mm0, mm1 \n\t" //IF words where E mask is set
	" pandn mm1, mm4 \n\t" //IF words where E mask is clear
	" paddsw mm5, mm1 \n\t" //E += words with mask clear
	" psubsw mm5, mm0 \n\t" //E -= words with mask set (net: +/-IF per PRN chip)
	" movq mm0, mm4 \n\t" //reload IF data
	" pand mm0, mm2 \n\t" //IF words where P mask is set
	" pandn mm2, mm4 \n\t" //IF words where P mask is clear
	" paddsw mm6, mm2 \n\t" //P += words with mask clear
	" psubsw mm6, mm0 \n\t" //P -= words with mask set
	" movq mm0, mm4 \n\t" //reload IF data
	" pand mm0, mm3 \n\t" //IF words where L mask is set
	" pandn mm3, mm4 \n\t" //IF words where L mask is clear
	" paddsw mm7, mm3 \n\t" //L += words with mask clear
	" psubsw mm7, mm0 \n\t" //L -= words with mask set
	" add esi, 8 \n\t" //advance baseband by 2 samples (8 bytes)
	" add eax, 8 \n\t" //advance E mask by 8 bytes
	" add ebx, 8 \n\t" //advance P mask by 8 bytes
	" add edx, 8 \n\t" //advance L mask by 8 bytes
	"loop L%= \n\t" //Loop if not done
	"Z%=: \n\t"
	"mov ecx, [ebp-16] \n\t" //Counter 2 (cnt2, currently == cnt)
	"jecxz ZZ%= \n\t" //Skip when no samples remain
	"LL%=: \n\t"
	" movd mm0, [esi] \n\t" //load 1 IF sample (int16 I,Q)
	" movd mm1, [eax] \n\t" //load E mask word pair
	" movd mm2, [ebx] \n\t" //load P mask word pair
	" movd mm3, [edx] \n\t" //load L mask word pair
	" movq mm4, mm0 \n\t" //keep an unmasked copy of the IF data
	" pand mm0, mm1 \n\t" //IF words where E mask is set
	" pandn mm1, mm4 \n\t" //IF words where E mask is clear
	" paddsw mm5, mm1 \n\t" //E += words with mask clear
	" psubsw mm5, mm0 \n\t" //E -= words with mask set
	" movq mm0, mm4 \n\t" //reload IF data
	" pand mm0, mm2 \n\t" //IF words where P mask is set
	" pandn mm2, mm4 \n\t" //IF words where P mask is clear
	" paddsw mm6, mm2 \n\t" //P += words with mask clear
	" psubsw mm6, mm0 \n\t" //P -= words with mask set
	" movq mm0, mm4 \n\t" //reload IF data
	" pand mm0, mm3 \n\t" //IF words where L mask is set
	" pandn mm3, mm4 \n\t" //IF words where L mask is clear
	" paddsw mm7, mm3 \n\t" //L += words with mask clear
	" psubsw mm7, mm0 \n\t" //L -= words with mask set
	" add esi, 4 \n\t" //advance baseband by 1 sample (4 bytes)
	" add eax, 4 \n\t" //advance E mask by 4 bytes
	" add ebx, 4 \n\t" //advance P mask by 4 bytes
	" add edx, 4 \n\t" //advance L mask by 4 bytes
	"loop LL%= \n\t" //Loop if not done
	"ZZ%=: \n\t"
	"mov esi, [ebp+28] \n\t" //esi = accum
	"movq mm0, mm5 \n\t"
	"punpckhdq mm0, mm0 \n\t" //bring the high I/Q pair of the E sum down
	"paddsw mm5, mm0 \n\t" //fold the two partial sums into one I/Q pair
	"movd [esi], mm5 \n\t" //accum[0] = early sum (int16 I,Q)
	"add esi, 4 \n\t"
	"movq mm0, mm6 \n\t"
	"punpckhdq mm0, mm0 \n\t" //bring the high I/Q pair of the P sum down
	"paddsw mm6, mm0 \n\t" //fold the two partial sums into one I/Q pair
	"movd [esi], mm6 \n\t" //accum[1] = prompt sum
	"add esi, 4 \n\t"
	"movq mm0, mm7 \n\t"
	"punpckhdq mm0, mm0 \n\t" //bring the high I/Q pair of the L sum down
	"paddsw mm7, mm0 \n\t" //fold the two partial sums into one I/Q pair
	"movd [esi], mm7 \n\t" //accum[2] = late sum
	"EMMS \n\t" //leave the MMX state clean for later FPU use
	".att_syntax \n\t"
	:
	: "m" (A), "m" (E), "m" (P), "m" (L), "m" (cnt), "m" (accum), "m" (cnt1), "m" (cnt2)
	: "%eax", "%ebx", "%ecx", "%edx", "%esi"
	);//end __asm
}
  893. //!< A must hold baseband data, E,P,L must hold PRN data
void sse_prn_accum_new(CPX *A, MIX *E, MIX *P, MIX *L, int32 cnt, CPX_ACCUM *accum)
{
	/* Correlate the baseband samples in A against the early/prompt/late PRN
	 * replicas with pmaddwd (complex multiply-accumulate), writing three
	 * 32-bit I/Q sums to accum[0..2].
	 * NOTE(review): assumes each MIX entry is 4 x int16 arranged so that
	 * pmaddwd against a duplicated (I,Q,I,Q) sample yields the complex
	 * product terms - confirm against the MIX layout.
	 * NOTE(review): arguments are read via fixed [ebp+8..28] offsets; this
	 * depends on the exact frame layout (frame pointer present, no
	 * optimization) - verify build flags. MMX regs and written memory are
	 * not in the clobber list either. */
	__asm
	(
	".intel_syntax noprefix \n\t" //Set up for loop
	"mov esi, [ebp+8] \n\t" //Address of A (baseband data)
	"mov eax, [ebp+12] \n\t" //Address of E
	"mov ebx, [ebp+16] \n\t" //Address of P
	"mov edx, [ebp+20] \n\t" //Address of L
	"mov ecx, [ebp+24] \n\t" //Value of cnt
	"pxor mm5, mm5 \n\t" //Clear the running sum for E
	"pxor mm6, mm6 \n\t" //Clear the running sum for P
	"pxor mm7, mm7 \n\t" //Clear the running sum for L
	"jecxz Z%= \n\t" //Nothing to do when cnt == 0
	"L%=: \n\t"
	" movd mm0, [esi] \n\t" //load one IF sample (int16 I,Q)
	" movq mm1, [eax] \n\t" //load E data (4 x int16)
	" movq mm2, [ebx] \n\t" //load P data
	" movq mm3, [edx] \n\t" //load L data
	" punpckldq mm0, mm0 \n\t" //copy low 32 bits to high 32 bits -> (I,Q,I,Q)
	" pmaddwd mm1, mm0 \n\t" //complex multiply E by IF
	" pmaddwd mm2, mm0 \n\t" //complex multiply P by IF
	" pmaddwd mm3, mm0 \n\t" //complex multiply L by IF
	" paddd mm5, mm1 \n\t" //add into E accumulator
	" paddd mm6, mm2 \n\t" //add into P accumulator
	" paddd mm7, mm3 \n\t" //add into L accumulator
	" add esi, 4 \n\t" //move in baseband data by one sample (4 bytes)
	" add eax, 8 \n\t" //move in PRN-E array by one sample (8 bytes)
	" add ebx, 8 \n\t" //move in PRN-P array by one sample (8 bytes)
	" add edx, 8 \n\t" //move in PRN-L array by one sample (8 bytes)
	"loop L%= \n\t" //Loop if not done
	"Z%=: \n\t"
	"mov esi, [ebp+28] \n\t" //esi = accum
	"movq [esi], mm5 \n\t" //accum[0] = early sum (int32 I,Q)
	"add esi, 8 \n\t"
	"movq [esi], mm6 \n\t" //accum[1] = prompt sum
	"add esi, 8 \n\t"
	"movq [esi], mm7 \n\t" //accum[2] = late sum
	"EMMS \n\t" //leave the MMX state clean for later FPU use
	".att_syntax \n\t"
	:
	: "m" (A), "m" (E), "m" (P), "m" (L), "m" (cnt), "m" (accum)
	: "%eax", "%ebx", "%ecx", "%edx", "%esi"
	);//end __asm
}