// Crypto++
00001 // panama.cpp - written and placed in the public domain by Wei Dai 00002 00003 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM panama.cpp" to generate MASM code 00004 00005 #include "pch.h" 00006 00007 #ifndef CRYPTOPP_GENERATE_X64_MASM 00008 00009 #include "panama.h" 00010 #include "misc.h" 00011 #include "cpu.h" 00012 00013 NAMESPACE_BEGIN(CryptoPP) 00014 00015 template <class B> 00016 void Panama<B>::Reset() 00017 { 00018 memset(m_state, 0, m_state.SizeInBytes()); 00019 #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 00020 m_state[17] = HasSSSE3(); 00021 #endif 00022 } 00023 00024 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM 00025 00026 #ifdef CRYPTOPP_X64_MASM_AVAILABLE 00027 extern "C" { 00028 void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y); 00029 } 00030 #elif CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 00031 00032 #ifdef CRYPTOPP_GENERATE_X64_MASM 00033 Panama_SSE2_Pull PROC FRAME 00034 rex_push_reg rdi 00035 alloc_stack(2*16) 00036 save_xmm128 xmm6, 0h 00037 save_xmm128 xmm7, 10h 00038 .endprolog 00039 #else 00040 #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code 00041 void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) 00042 { 00043 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY 00044 asm __volatile__ 00045 ( 00046 ".intel_syntax noprefix;" 00047 AS_PUSH_IF86( bx) 00048 #else 00049 AS2( mov AS_REG_1, count) 00050 AS2( mov AS_REG_2, state) 00051 AS2( mov AS_REG_3, z) 00052 AS2( mov AS_REG_4, y) 00053 #endif 00054 #endif // #ifdef CRYPTOPP_GENERATE_X64_MASM 00055 00056 #if CRYPTOPP_BOOL_X86 00057 #define REG_loopEnd [esp] 00058 #elif defined(CRYPTOPP_GENERATE_X64_MASM) 00059 #define REG_loopEnd rdi 00060 #else 00061 #define REG_loopEnd r8 00062 #endif 00063 00064 AS2( shl AS_REG_1, 5) 00065 ASJ( jz, 5, f) 00066 AS2( mov AS_REG_6d, [AS_REG_2+4*17]) 00067 AS2( add AS_REG_1, AS_REG_6) 00068 00069 #if CRYPTOPP_BOOL_X64 00070 AS2( mov REG_loopEnd, AS_REG_1) 00071 
#else 00072 AS1( push ebp) 00073 AS1( push AS_REG_1) 00074 #endif 00075 00076 AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+0*16]) 00077 AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_2+1*16]) 00078 AS2( movdqa xmm2, XMMWORD_PTR [AS_REG_2+2*16]) 00079 AS2( movdqa xmm3, XMMWORD_PTR [AS_REG_2+3*16]) 00080 AS2( mov eax, dword ptr [AS_REG_2+4*16]) 00081 00082 ASL(4) 00083 // gamma and pi 00084 #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 00085 AS2( test AS_REG_6, 1) 00086 ASJ( jnz, 6, f) 00087 #endif 00088 AS2( movdqa xmm6, xmm2) 00089 AS2( movss xmm6, xmm3) 00090 ASS( pshufd xmm5, xmm6, 0, 3, 2, 1) 00091 AS2( movd xmm6, eax) 00092 AS2( movdqa xmm7, xmm3) 00093 AS2( movss xmm7, xmm6) 00094 ASS( pshufd xmm6, xmm7, 0, 3, 2, 1) 00095 #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 00096 ASJ( jmp, 7, f) 00097 ASL(6) 00098 AS2( movdqa xmm5, xmm3) 00099 AS3( palignr xmm5, xmm2, 4) 00100 AS2( movd xmm6, eax) 00101 AS3( palignr xmm6, xmm3, 4) 00102 ASL(7) 00103 #endif 00104 00105 AS2( movd AS_REG_1d, xmm2) 00106 AS1( not AS_REG_1d) 00107 AS2( movd AS_REG_7d, xmm3) 00108 AS2( or AS_REG_1d, AS_REG_7d) 00109 AS2( xor eax, AS_REG_1d) 00110 00111 #define SSE2_Index(i) ASM_MOD(((i)*13+16), 17) 00112 00113 #define pi(i) \ 00114 AS2( movd AS_REG_1d, xmm7)\ 00115 AS2( rol AS_REG_1d, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\ 00116 AS2( mov [AS_REG_2+SSE2_Index(ASM_MOD(5*(i), 17))*4], AS_REG_1d) 00117 00118 #define pi4(x, y, z, a, b, c, d) \ 00119 AS2( pcmpeqb xmm7, xmm7)\ 00120 AS2( pxor xmm7, x)\ 00121 AS2( por xmm7, y)\ 00122 AS2( pxor xmm7, z)\ 00123 pi(a)\ 00124 ASS( pshuflw xmm7, xmm7, 1, 0, 3, 2)\ 00125 pi(b)\ 00126 AS2( punpckhqdq xmm7, xmm7)\ 00127 pi(c)\ 00128 ASS( pshuflw xmm7, xmm7, 1, 0, 3, 2)\ 00129 pi(d) 00130 00131 pi4(xmm1, xmm2, xmm3, 1, 5, 9, 13) 00132 pi4(xmm0, xmm1, xmm2, 2, 6, 10, 14) 00133 pi4(xmm6, xmm0, xmm1, 3, 7, 11, 15) 00134 pi4(xmm5, xmm6, xmm0, 4, 8, 12, 16) 00135 00136 // output keystream and update buffer here to hide partial memory stalls between pi and theta 00137 AS2( 
movdqa xmm4, xmm3) 00138 AS2( punpcklqdq xmm3, xmm2) // 1 5 2 6 00139 AS2( punpckhdq xmm4, xmm2) // 9 10 13 14 00140 AS2( movdqa xmm2, xmm1) 00141 AS2( punpcklqdq xmm1, xmm0) // 3 7 4 8 00142 AS2( punpckhdq xmm2, xmm0) // 11 12 15 16 00143 00144 // keystream 00145 AS2( test AS_REG_3, AS_REG_3) 00146 ASJ( jz, 0, f) 00147 AS2( movdqa xmm6, xmm4) 00148 AS2( punpcklqdq xmm4, xmm2) 00149 AS2( punpckhqdq xmm6, xmm2) 00150 AS2( test AS_REG_4, 15) 00151 ASJ( jnz, 2, f) 00152 AS2( test AS_REG_4, AS_REG_4) 00153 ASJ( jz, 1, f) 00154 AS2( pxor xmm4, [AS_REG_4]) 00155 AS2( pxor xmm6, [AS_REG_4+16]) 00156 AS2( add AS_REG_4, 32) 00157 ASJ( jmp, 1, f) 00158 ASL(2) 00159 AS2( movdqu xmm0, [AS_REG_4]) 00160 AS2( movdqu xmm2, [AS_REG_4+16]) 00161 AS2( pxor xmm4, xmm0) 00162 AS2( pxor xmm6, xmm2) 00163 AS2( add AS_REG_4, 32) 00164 ASL(1) 00165 AS2( test AS_REG_3, 15) 00166 ASJ( jnz, 3, f) 00167 AS2( movdqa XMMWORD_PTR [AS_REG_3], xmm4) 00168 AS2( movdqa XMMWORD_PTR [AS_REG_3+16], xmm6) 00169 AS2( add AS_REG_3, 32) 00170 ASJ( jmp, 0, f) 00171 ASL(3) 00172 AS2( movdqu XMMWORD_PTR [AS_REG_3], xmm4) 00173 AS2( movdqu XMMWORD_PTR [AS_REG_3+16], xmm6) 00174 AS2( add AS_REG_3, 32) 00175 ASL(0) 00176 00177 // buffer update 00178 AS2( lea AS_REG_1, [AS_REG_6 + 32]) 00179 AS2( and AS_REG_1, 31*32) 00180 AS2( lea AS_REG_7, [AS_REG_6 + (32-24)*32]) 00181 AS2( and AS_REG_7, 31*32) 00182 00183 AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8]) 00184 AS2( pxor xmm3, xmm0) 00185 ASS( pshufd xmm0, xmm0, 2, 3, 0, 1) 00186 AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3) 00187 AS2( pxor xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8]) 00188 AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0) 00189 00190 AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8]) 00191 AS2( pxor xmm1, xmm4) 00192 AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1) 00193 AS2( pxor xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8]) 00194 AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8], 
xmm4) 00195 00196 // theta 00197 AS2( movdqa xmm3, XMMWORD_PTR [AS_REG_2+3*16]) 00198 AS2( movdqa xmm2, XMMWORD_PTR [AS_REG_2+2*16]) 00199 AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_2+1*16]) 00200 AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+0*16]) 00201 00202 #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 00203 AS2( test AS_REG_6, 1) 00204 ASJ( jnz, 8, f) 00205 #endif 00206 AS2( movd xmm6, eax) 00207 AS2( movdqa xmm7, xmm3) 00208 AS2( movss xmm7, xmm6) 00209 AS2( movdqa xmm6, xmm2) 00210 AS2( movss xmm6, xmm3) 00211 AS2( movdqa xmm5, xmm1) 00212 AS2( movss xmm5, xmm2) 00213 AS2( movdqa xmm4, xmm0) 00214 AS2( movss xmm4, xmm1) 00215 ASS( pshufd xmm7, xmm7, 0, 3, 2, 1) 00216 ASS( pshufd xmm6, xmm6, 0, 3, 2, 1) 00217 ASS( pshufd xmm5, xmm5, 0, 3, 2, 1) 00218 ASS( pshufd xmm4, xmm4, 0, 3, 2, 1) 00219 #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 00220 ASJ( jmp, 9, f) 00221 ASL(8) 00222 AS2( movd xmm7, eax) 00223 AS3( palignr xmm7, xmm3, 4) 00224 AS2( movq xmm6, xmm3) 00225 AS3( palignr xmm6, xmm2, 4) 00226 AS2( movq xmm5, xmm2) 00227 AS3( palignr xmm5, xmm1, 4) 00228 AS2( movq xmm4, xmm1) 00229 AS3( palignr xmm4, xmm0, 4) 00230 ASL(9) 00231 #endif 00232 00233 AS2( xor eax, 1) 00234 AS2( movd AS_REG_1d, xmm0) 00235 AS2( xor eax, AS_REG_1d) 00236 AS2( movd AS_REG_1d, xmm3) 00237 AS2( xor eax, AS_REG_1d) 00238 00239 AS2( pxor xmm3, xmm2) 00240 AS2( pxor xmm2, xmm1) 00241 AS2( pxor xmm1, xmm0) 00242 AS2( pxor xmm0, xmm7) 00243 AS2( pxor xmm3, xmm7) 00244 AS2( pxor xmm2, xmm6) 00245 AS2( pxor xmm1, xmm5) 00246 AS2( pxor xmm0, xmm4) 00247 00248 // sigma 00249 AS2( lea AS_REG_1, [AS_REG_6 + (32-4)*32]) 00250 AS2( and AS_REG_1, 31*32) 00251 AS2( lea AS_REG_7, [AS_REG_6 + 16*32]) 00252 AS2( and AS_REG_7, 31*32) 00253 00254 AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*16]) 00255 AS2( movdqa xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*16]) 00256 AS2( movdqa xmm6, xmm4) 00257 AS2( punpcklqdq xmm4, xmm5) 00258 AS2( punpckhqdq xmm6, xmm5) 00259 AS2( pxor xmm3, xmm4) 00260 AS2( pxor xmm2, xmm6) 
00261 00262 AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+1*16]) 00263 AS2( movdqa xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+1*16]) 00264 AS2( movdqa xmm6, xmm4) 00265 AS2( punpcklqdq xmm4, xmm5) 00266 AS2( punpckhqdq xmm6, xmm5) 00267 AS2( pxor xmm1, xmm4) 00268 AS2( pxor xmm0, xmm6) 00269 00270 // loop 00271 AS2( add AS_REG_6, 32) 00272 AS2( cmp AS_REG_6, REG_loopEnd) 00273 ASJ( jne, 4, b) 00274 00275 // save state 00276 AS2( mov [AS_REG_2+4*16], eax) 00277 AS2( movdqa XMMWORD_PTR [AS_REG_2+3*16], xmm3) 00278 AS2( movdqa XMMWORD_PTR [AS_REG_2+2*16], xmm2) 00279 AS2( movdqa XMMWORD_PTR [AS_REG_2+1*16], xmm1) 00280 AS2( movdqa XMMWORD_PTR [AS_REG_2+0*16], xmm0) 00281 00282 #if CRYPTOPP_BOOL_X86 00283 AS2( add esp, 4) 00284 AS1( pop ebp) 00285 #endif 00286 ASL(5) 00287 00288 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY 00289 AS_POP_IF86( bx) 00290 ".att_syntax prefix;" 00291 : 00292 #if CRYPTOPP_BOOL_X64 00293 : "D" (count), "S" (state), "d" (z), "c" (y) 00294 : "%r8", "%r9", "r10", "%eax", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7" 00295 #else 00296 : "c" (count), "d" (state), "S" (z), "D" (y) 00297 : "%eax", "memory", "cc" 00298 #endif 00299 ); 00300 #endif 00301 #ifdef CRYPTOPP_GENERATE_X64_MASM 00302 movdqa xmm6, [rsp + 0h] 00303 movdqa xmm7, [rsp + 10h] 00304 add rsp, 2*16 00305 pop rdi 00306 ret 00307 Panama_SSE2_Pull ENDP 00308 #else 00309 } 00310 #endif 00311 #endif // #ifdef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 00312 00313 #ifndef CRYPTOPP_GENERATE_X64_MASM 00314 00315 template <class B> 00316 void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *y) 00317 { 00318 word32 bstart = m_state[17]; 00319 word32 *const aPtr = m_state; 00320 word32 cPtr[17]; 00321 00322 #define bPtr ((byte *)(aPtr+20)) 00323 00324 // reorder the state for SSE2 00325 // a and c: 4 8 12 16 | 3 7 11 15 | 2 6 10 14 | 1 5 9 13 | 0 00326 // xmm0 xmm1 xmm2 xmm3 eax 00327 #define a(i) aPtr[((i)*13+16) % 17] // 13 is 
inverse of 4 mod 17 00328 #define c(i) cPtr[((i)*13+16) % 17] 00329 // b: 0 4 | 1 5 | 2 6 | 3 7 00330 #define b(i, j) b##i[(j)*2%8 + (j)/4] 00331 00332 // output 00333 #define OA(i) z[i] = ConditionalByteReverse(B::ToEnum(), a(i+9)) 00334 #define OX(i) z[i] = y[i] ^ ConditionalByteReverse(B::ToEnum(), a(i+9)) 00335 // buffer update 00336 #define US(i) {word32 t=b(0,i); b(0,i)=ConditionalByteReverse(B::ToEnum(), p[i])^t; b(25,(i+6)%8)^=t;} 00337 #define UL(i) {word32 t=b(0,i); b(0,i)=a(i+1)^t; b(25,(i+6)%8)^=t;} 00338 // gamma and pi 00339 #define GP(i) c(5*i%17) = rotlFixed(a(i) ^ (a((i+1)%17) | ~a((i+2)%17)), ((5*i%17)*((5*i%17)+1)/2)%32) 00340 // theta and sigma 00341 #define T(i,x) a(i) = c(i) ^ c((i+1)%17) ^ c((i+4)%17) ^ x 00342 #define TS1S(i) T(i+1, ConditionalByteReverse(B::ToEnum(), p[i])) 00343 #define TS1L(i) T(i+1, b(4,i)) 00344 #define TS2(i) T(i+9, b(16,i)) 00345 00346 while (count--) 00347 { 00348 if (z) 00349 { 00350 if (y) 00351 { 00352 OX(0); OX(1); OX(2); OX(3); OX(4); OX(5); OX(6); OX(7); 00353 y += 8; 00354 } 00355 else 00356 { 00357 OA(0); OA(1); OA(2); OA(3); OA(4); OA(5); OA(6); OA(7); 00358 } 00359 z += 8; 00360 } 00361 00362 word32 *const b16 = (word32 *)(bPtr+((bstart+16*32) & 31*32)); 00363 word32 *const b4 = (word32 *)(bPtr+((bstart+(32-4)*32) & 31*32)); 00364 bstart += 32; 00365 word32 *const b0 = (word32 *)(bPtr+((bstart) & 31*32)); 00366 word32 *const b25 = (word32 *)(bPtr+((bstart+(32-25)*32) & 31*32)); 00367 00368 if (p) 00369 { 00370 US(0); US(1); US(2); US(3); US(4); US(5); US(6); US(7); 00371 } 00372 else 00373 { 00374 UL(0); UL(1); UL(2); UL(3); UL(4); UL(5); UL(6); UL(7); 00375 } 00376 00377 GP(0); 00378 GP(1); 00379 GP(2); 00380 GP(3); 00381 GP(4); 00382 GP(5); 00383 GP(6); 00384 GP(7); 00385 GP(8); 00386 GP(9); 00387 GP(10); 00388 GP(11); 00389 GP(12); 00390 GP(13); 00391 GP(14); 00392 GP(15); 00393 GP(16); 00394 00395 T(0,1); 00396 00397 if (p) 00398 { 00399 TS1S(0); TS1S(1); TS1S(2); TS1S(3); TS1S(4); TS1S(5); TS1S(6); 
TS1S(7); 00400 p += 8; 00401 } 00402 else 00403 { 00404 TS1L(0); TS1L(1); TS1L(2); TS1L(3); TS1L(4); TS1L(5); TS1L(6); TS1L(7); 00405 } 00406 00407 TS2(0); TS2(1); TS2(2); TS2(3); TS2(4); TS2(5); TS2(6); TS2(7); 00408 } 00409 m_state[17] = bstart; 00410 } 00411 00412 namespace Weak { 00413 template <class B> 00414 size_t PanamaHash<B>::HashMultipleBlocks(const word32 *input, size_t length) 00415 { 00416 this->Iterate(length / this->BLOCKSIZE, input); 00417 return length % this->BLOCKSIZE; 00418 } 00419 00420 template <class B> 00421 void PanamaHash<B>::TruncatedFinal(byte *hash, size_t size) 00422 { 00423 this->ThrowIfInvalidTruncatedSize(size); 00424 00425 PadLastBlock(this->BLOCKSIZE, 0x01); 00426 00427 HashEndianCorrectedBlock(this->m_data); 00428 00429 this->Iterate(32); // pull 00430 00431 FixedSizeSecBlock<word32, 8> buf; 00432 this->Iterate(1, NULL, buf, NULL); 00433 00434 memcpy(hash, buf, size); 00435 00436 this->Restart(); // reinit for next use 00437 } 00438 } 00439 00440 template <class B> 00441 void PanamaCipherPolicy<B>::CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length) 00442 { 00443 assert(length==32); 00444 memcpy(m_key, key, 32); 00445 } 00446 00447 template <class B> 00448 void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byte *iv, size_t length) 00449 { 00450 assert(length==32); 00451 this->Reset(); 00452 this->Iterate(1, m_key); 00453 if (iv && IsAligned<word32>(iv)) 00454 this->Iterate(1, (const word32 *)iv); 00455 else 00456 { 00457 FixedSizeSecBlock<word32, 8> buf; 00458 if (iv) 00459 memcpy(buf, iv, 32); 00460 else 00461 memset(buf, 0, 32); 00462 this->Iterate(1, buf); 00463 } 00464 00465 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) 00466 if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2() && !IsP4()) // SSE2 code is slower on P4 Prescott 00467 Panama_SSE2_Pull(32, this->m_state, NULL, NULL); 00468 else 00469 #endif 00470 this->Iterate(32); 00471 } 00472 00473 
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64 00474 template <class B> 00475 unsigned int PanamaCipherPolicy<B>::GetAlignment() const 00476 { 00477 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) 00478 if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2()) 00479 return 16; 00480 else 00481 #endif 00482 return 1; 00483 } 00484 #endif 00485 00486 template <class B> 00487 void PanamaCipherPolicy<B>::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) 00488 { 00489 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) 00490 if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2()) 00491 Panama_SSE2_Pull(iterationCount, this->m_state, (word32 *)output, (const word32 *)input); 00492 else 00493 #endif 00494 this->Iterate(iterationCount, NULL, (word32 *)output, (const word32 *)input); 00495 } 00496 00497 template class Panama<BigEndian>; 00498 template class Panama<LittleEndian>; 00499 00500 template class Weak::PanamaHash<BigEndian>; 00501 template class Weak::PanamaHash<LittleEndian>; 00502 00503 template class PanamaCipherPolicy<BigEndian>; 00504 template class PanamaCipherPolicy<LittleEndian>; 00505 00506 NAMESPACE_END 00507 00508 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM