init: v1.0.0

2026-05-27 23:03:00 +08:00
commit 8d97f750eb
466 changed files with 80067 additions and 0 deletions
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 Markku-Juhani O. Saarinen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,18 @@
+# Makefile
+# 2018-04-20  Markku-Juhani O. Saarinen <mjos@iki.fi>
+
+BIN		= xtest
+OBJS		= sm4ni.o sm4_ref.o testmain.o
+DIST		= sm4ni
+
+CC		= gcc
+# sm4ni.c uses AES-NI and byte-shuffle intrinsics, so the host CPU selected
+# by -march=native has to provide them.
+CFLAGS		= -Wall -Ofast -march=native -DSM4NI_UNROLL
+
+# "clean" names a command, not a file: keep it working even when a file
+# called "clean" exists in this directory.
+.PHONY: clean
+
+$(BIN): $(OBJS)
+	$(CC) $(LDFLAGS) -o $(BIN) $(OBJS) $(LIBS)
+
+# Every source file includes sm4_ref.h, so rebuild all objects when it changes.
+$(OBJS): sm4_ref.h
+
+.c.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+	rm -f $(OBJS) $(BIN)
@@ -0,0 +1,66 @@
+# sm4ni
+
+Demonstration that AES-NI instructions and affine transforms can be used
+to create a fast, vectorized, constant-time implementation of the Chinese
+Encryption Standard SM4.
+
+## Background and Theory
+
+SM4 is the Chinese Standard Encryption Algorithm. It is a block cipher 
+with a 128-bit key and 128-bit block size. For more information, see
+the [Internet Draft](https://www.ietf.org/id/draft-ribose-cfrg-sm4).
+The use of SM4 is now mandated for certain applications within China.
+ARM is introducing special SM4 instructions in its future architectures.
+
+This note shows how to use Intel vector instructions to create about 2-3
+times faster **constant time** implementation. The trick is to use affine 
+transforms to emulate the SM4 S-Box with the AES S-Box. The S-Boxes are
+both based on finite field inversion, but use different affine transforms 
+and even polynomial basis for the finite field. However, different 
+polynomial bases are affine isomorphic. 
+
+We combine various linear operations into two affine transforms (one on
+each side), A1 and A2. Here an affine transform consists of a multiplication
+by an 8x8 binary matrix M followed by the addition of an 8-bit constant C.
+```
+SM4-S(x) = A2(AES-S(A1(x)))
+A1(x) = M1*x + C1
+A2(x) = M2*x + C2
+```
+We note that each affine transform can be constructed from XOR of two 
+4x8-bit table lookups, which we implement with constant time byte 
+shuffle instructions (each 16-entry table is in a single 128-bit register).
+For parallel AES S-Box lookups we use the `AESENCLAST` instruction 
+(nominally intended for AES last round) in order to avoid AES MDS matrix 
+expansion.
+
+Due to the structure of SM4, we are processing 4 blocks in parallel.
+This means that CBC cannot be implemented this way, but faster parallelizable
+modes like CTR, GCM, and OCB are fine. This code example only implements
+the block encryption function (block decryption is essentially equivalent
+but unneeded for decryption with CTR, GCM, OCB) and uses Intel C intrinsics.
+The fast block encryption code is in `sm4ni.c`.
+
+## Testing
+
+Just clone or extract the distribution and run:
+```
+$ make
+gcc -Wall -Ofast -march=native  -c sm4ni.c -o sm4ni.o
+gcc -Wall -Ofast -march=native  -c sm4_ref.c -o sm4_ref.o
+gcc -Wall -Ofast -march=native  -c testmain.c -o testmain.o
+gcc  -o xtest sm4ni.o sm4_ref.o testmain.o 
+
+$ ./xtest 
+SM4 reference     60.906 MB/s
+Vector SM4NI     160.666 MB/s
+```
+Of course, support for AES-NI is required. This benchmark indicates 264%
+speed for the new implementation (and it is constant time!). Your
+architecture may give very different results. Further optimizations are
+possible.
+
+## Notes
+
+This is part of ongoing research work, and I think I am the first person who
+discovered this trick. So please give me some credit if you use this.
+
@@ -0,0 +1,179 @@
+// sm4_ref.c
+// 2018-04-20  Markku-Juhani O. Saarinen <mjos@iki.fi>
+
+// Reference implementation of SM4, the Chinese Encryption Standard.
+// Adapted from Internet Draft draft-ribose-cfrg-sm4 with some modifications.
+
+#include "sm4_ref.h"
+
+/* Operations */
+/*
+ * Rotate the 32-bit value x left by n bits.
+ *
+ * x must have type uint32_t: a wider or signed operand would not wrap at
+ * 32 bits. Both shift counts are reduced modulo 32, so n == 0 and n == 32
+ * return x unchanged instead of evaluating an out-of-range shift. For n in
+ * 1..31 the result is the plain (x << n) | (x >> (32 - n)).
+ */
+#define ROTL32(x, n) (((x) << ((n) & 31)) | ((x) >> ((32 - (n)) & 31)))
+
+static const uint32_t sm4_ck[32] = {  // key-schedule constants: sm4_ck[i] is XORed into round i of sm4_key_schedule()
+    0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269,
+    0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9,
+    0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249,
+    0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9,
+    0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229,
+    0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299,
+    0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209,
+    0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+};
+
+static const uint8_t sm4_sbox[256] = {  // byte substitution table; indexed with one byte of a 32-bit word at a time
+    0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7,
+    0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05,
+    0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3,
+    0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
+    0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A,
+    0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62,
+    0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95,
+    0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6,
+    0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA,
+    0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8,
+    0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B,
+    0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35,
+    0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2,
+    0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87,
+    0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52,
+    0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E,
+    0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5,
+    0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1,
+    0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55,
+    0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3,
+    0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60,
+    0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F,
+    0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F,
+    0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51,
+    0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F,
+    0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8,
+    0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD,
+    0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0,
+    0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E,
+    0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84,
+    0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20,
+    0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48
+};
+
+static const uint32_t sm4_fk[4] = {  // XORed into the four big-endian key words before the schedule rounds start
+  0xA3B1BAC6, 0x56AA3350, 0x677D9197, 0xB27022DC
+};
+
+/*
+ * Read 32-bit word number n from b (words are counted from the start of b)
+ * as a big-endian value: byte 4n is the most significant, byte 4n+3 the
+ * least significant.
+ */
+static uint32_t load_u32_be(const uint8_t *b, uint32_t n)
+{
+  const uint8_t *p = &b[4 * n];
+  uint32_t v = p[0];
+
+  v = (v << 8) | p[1];
+  v = (v << 8) | p[2];
+  v = (v << 8) | p[3];
+  return v;
+}
+
+/* Write v to b[0..3] in big-endian order (most significant byte in b[0]). */
+static void store_u32_be(uint32_t v, uint8_t *b)
+{
+  int i;
+
+  /* peel bytes off the low end, filling the buffer back to front */
+  for (i = 3; i >= 0; i--)
+  {
+    b[i] = (uint8_t)v;
+    v >>= 8;
+  }
+}
+
+/*
+ * Expand a key into SM4_KEY_SCHEDULE round keys.
+ *
+ * key - key bytes; four big-endian 32-bit words (16 bytes) are read from it
+ * rk  - output; receives SM4_KEY_SCHEDULE 32-bit round keys
+ */
+void sm4_key_schedule(const uint8_t key[], uint32_t rk[])
+{
+  uint32_t w[4];   /* sliding window over the four most recent schedule words */
+  uint32_t in, s;
+  int r, j;
+
+  for (j = 0; j < 4; j++)
+  {
+    w[j] = load_u32_be(key, j) ^ sm4_fk[j];
+  }
+
+  /* T' */
+  for (r = 0; r < SM4_KEY_SCHEDULE; r++)
+  {
+    /* slot (r & 3) holds the oldest word; the other three feed this round */
+    in = w[(r + 1) & 3] ^ w[(r + 2) & 3] ^ w[(r + 3) & 3] ^ sm4_ck[r];
+
+    /* Nonlinear operation tau: every byte goes through the S-box */
+    s  = (uint32_t)sm4_sbox[(in >> 24) & 0xFF] << 24;
+    s |= (uint32_t)sm4_sbox[(in >> 16) & 0xFF] << 16;
+    s |= (uint32_t)sm4_sbox[(in >>  8) & 0xFF] <<  8;
+    s |= (uint32_t)sm4_sbox[in & 0xFF];
+
+    /* Linear operation L'; the result replaces the oldest word */
+    w[r & 3] ^= s ^ ROTL32(s, 13) ^ ROTL32(s, 23);
+    rk[r] = w[r & 3];
+  }
+}
+
+/*
+ * Four consecutive rounds, updating x0, x1, x2 and x3 in that order.
+ * ka..kd are the round-key indices used by the four rounds and T is the
+ * round function. The expansion expects x0..x3 and rk to be in scope.
+ */
+#define SM4_ROUNDS(ka, kb, kc, kd, T)   \
+  do {                                  \
+    x0 ^= T(x1 ^ x2 ^ x3 ^ rk[ka]);     \
+    x1 ^= T(x0 ^ x2 ^ x3 ^ rk[kb]);     \
+    x2 ^= T(x0 ^ x1 ^ x3 ^ rk[kc]);     \
+    x3 ^= T(x0 ^ x1 ^ x2 ^ rk[kd]);     \
+  } while(0)
+
+/*
+ * Round function: substitute each byte of x through the S-box, then apply
+ * the linear transform L to the substituted word.
+ */
+static uint32_t sm4_t(uint32_t x)
+{
+  const uint32_t b3 = sm4_sbox[(x >> 24) & 0xFF];
+  const uint32_t b2 = sm4_sbox[(x >> 16) & 0xFF];
+  const uint32_t b1 = sm4_sbox[(x >>  8) & 0xFF];
+  const uint32_t b0 = sm4_sbox[x & 0xFF];
+  const uint32_t s = (b3 << 24) | (b2 << 16) | (b1 << 8) | b0;
+
+  /* L(s) = s ^ (s <<< 2) ^ (s <<< 10) ^ (s <<< 18) ^ (s <<< 24) */
+  return s ^ ROTL32(s, 2) ^ ROTL32(s, 10) ^ ROTL32(s, 18) ^ ROTL32(s, 24);
+}
+
+/*
+ * Encrypt one 16-byte block.
+ *
+ * rk         - round keys, consumed in index order 0..31
+ * plaintext  - 16 input bytes, read as four big-endian words
+ * ciphertext - 16 output bytes; the state words are written in reverse
+ *              order (x3 first, x0 last)
+ *
+ * The whole input is loaded before anything is stored, so plaintext and
+ * ciphertext may point at the same buffer.
+ */
+void sm4_encrypt(const uint32_t rk[SM4_KEY_SCHEDULE],
+    const uint8_t *plaintext, uint8_t *ciphertext)
+{
+  uint32_t x0 = load_u32_be(plaintext, 0);
+  uint32_t x1 = load_u32_be(plaintext, 1);
+  uint32_t x2 = load_u32_be(plaintext, 2);
+  uint32_t x3 = load_u32_be(plaintext, 3);
+  int r;
+
+  /* 32 rounds, four per step, round keys ascending */
+  for (r = 0; r < 32; r += 4)
+  {
+    SM4_ROUNDS(r, r + 1, r + 2, r + 3, sm4_t);
+  }
+
+  store_u32_be(x3, ciphertext);
+  store_u32_be(x2, ciphertext + 4);
+  store_u32_be(x1, ciphertext + 8);
+  store_u32_be(x0, ciphertext + 12);
+}
+
+/*
+ * Decrypt one 16-byte block: the same round structure as sm4_encrypt(),
+ * with the round keys consumed in the opposite order (31 down to 0).
+ *
+ * rk         - round keys
+ * ciphertext - 16 input bytes, read as four big-endian words
+ * plaintext  - 16 output bytes; the state words are written in reverse
+ *              order (x3 first, x0 last)
+ *
+ * The whole input is loaded before anything is stored, so ciphertext and
+ * plaintext may point at the same buffer.
+ */
+void sm4_decrypt(const uint32_t rk[SM4_KEY_SCHEDULE],
+    const uint8_t *ciphertext, uint8_t *plaintext)
+{
+  uint32_t x0 = load_u32_be(ciphertext, 0);
+  uint32_t x1 = load_u32_be(ciphertext, 1);
+  uint32_t x2 = load_u32_be(ciphertext, 2);
+  uint32_t x3 = load_u32_be(ciphertext, 3);
+  int r;
+
+  /* 32 rounds, four per step, round keys descending */
+  for (r = 28; r >= 0; r -= 4)
+  {
+    SM4_ROUNDS(r + 3, r + 2, r + 1, r, sm4_t);
+  }
+
+  store_u32_be(x3, plaintext);
+  store_u32_be(x2, plaintext + 4);
+  store_u32_be(x1, plaintext + 8);
+  store_u32_be(x0, plaintext + 12);
+}
+
@@ -0,0 +1,23 @@
+// sm4_ref.h
+// 2018-04-20  Markku-Juhani O. Saarinen <mjos@iki.fi>
+
+#ifndef SM4_REF_H
+#define SM4_REF_H
+
+#include <stdint.h>
+
+#define SM4_BLOCK_SIZE    16    // bytes per cipher block
+#define SM4_KEY_SIZE      16    // bytes per key
+#define SM4_KEY_SCHEDULE  32    // number of 32-bit round keys
+
+// reference implementation based on the internet draft
+
+// Expand the SM4_KEY_SIZE-byte key into SM4_KEY_SCHEDULE round keys in rk.
+void sm4_key_schedule(const uint8_t key[SM4_KEY_SIZE],
+    uint32_t rk[SM4_KEY_SCHEDULE]);
+
+// Encrypt one SM4_BLOCK_SIZE-byte block from plaintext into ciphertext
+// using the round keys in rk. The two buffers may be the same.
+void sm4_encrypt(const uint32_t rk[SM4_KEY_SCHEDULE],
+    const uint8_t *plaintext, uint8_t *ciphertext);
+
+// Decrypt one SM4_BLOCK_SIZE-byte block from ciphertext into plaintext
+// using the round keys in rk. The two buffers may be the same.
+void sm4_decrypt(const uint32_t rk[SM4_KEY_SCHEDULE],
+    const uint8_t *ciphertext, uint8_t *plaintext);
+
+#endif
+
@@ -0,0 +1,173 @@
+// sm4ni.c
+// 2018-04-20  Markku-Juhani O. Saarinen <mjos@iki.fi>
+
+// Vectorized implementation of SM4. Uses affine transformations and AES NI
+// to implement the SM4 S-Box.
+
+#include "sm4_ref.h"
+#include <x86intrin.h>
+
+// Encrypt 4 blocks (64 bytes) in ECB mode.
+//
+// rk   32 round keys (the caller in testmain.c fills them with
+//      sm4_key_schedule()).
+// src  64 bytes of input: four consecutive 16-byte blocks. Only read.
+// dst  64 bytes of output, written block by block in the same order.
+//      NOTE: the const qualifiers on src and dst are the wrong way round.
+//      The signature is left untouched so that it keeps matching the
+//      declaration callers already use; dst IS written.
+//
+// Neither pointer needs any particular alignment: the data is moved with
+// unaligned vector loads and stores. All of src is loaded before the first
+// byte of dst is stored, so src and dst may be the same buffer.
+//
+// Internally the four blocks are transposed so that t0..t3 each hold the
+// same state word of all four blocks, one block per 32-bit lane.
+
+void sm4_encrypt4(const uint32_t rk[32], void *src, const void *dst)
+{
+    // nibble mask
+    const __m128i c0f __attribute__((aligned(0x10))) =
+        { 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F };
+
+    // flip all bytes in all 32-bit words
+    const __m128i flp __attribute__((aligned(0x10))) =
+        { 0x0405060700010203, 0x0C0D0E0F08090A0B };
+
+    // inverse shift rows
+    const __m128i shr __attribute__((aligned(0x10))) =
+        { 0x0B0E0104070A0D00, 0x0306090C0F020508 };
+
+    // Affine transform 1 (low and high nibbles)
+    const __m128i m1l __attribute__((aligned(0x10))) =
+        { 0x9197E2E474720701, 0xC7C1B4B222245157 };
+    const __m128i m1h __attribute__((aligned(0x10))) =
+        { 0xE240AB09EB49A200, 0xF052B91BF95BB012 };
+
+    // Affine transform 2 (low and high nibbles)
+    const __m128i m2l __attribute__((aligned(0x10))) =
+        { 0x5B67F2CEA19D0834, 0xEDD14478172BBE82 };
+    const __m128i m2h __attribute__((aligned(0x10))) =
+        { 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF };
+
+    // left rotations of 32-bit words by 8-bit increments
+    const __m128i r08 __attribute__((aligned(0x10))) =
+        { 0x0605040702010003, 0x0E0D0C0F0A09080B };
+    const __m128i r16 __attribute__((aligned(0x10))) =
+        { 0x0504070601000302, 0x0D0C0F0E09080B0A };
+    const __m128i r24 __attribute__((aligned(0x10))) =
+        { 0x0407060500030201, 0x0C0F0E0D080B0A09 };
+
+    const __m128i *in = (const __m128i *) src;
+    __m128i *out = (__m128i *) dst;     // written despite the const on dst
+    __m128i x, y, t0, t1, t2, t3;
+    __m128i b0, b1, b2, b3;
+    uint32_t k;
+    int i;
+
+    // Load the four blocks (no alignment assumed) and byte-swap every
+    // 32-bit word, as the input words are big-endian.
+    b0 = _mm_shuffle_epi8(_mm_loadu_si128(in + 0), flp);
+    b1 = _mm_shuffle_epi8(_mm_loadu_si128(in + 1), flp);
+    b2 = _mm_shuffle_epi8(_mm_loadu_si128(in + 2), flp);
+    b3 = _mm_shuffle_epi8(_mm_loadu_si128(in + 3), flp);
+
+    // 4x4 transpose of 32-bit words: tN = word N of blocks 0, 1, 2, 3.
+    x  = _mm_unpacklo_epi32(b0, b1);
+    y  = _mm_unpacklo_epi32(b2, b3);
+    t0 = _mm_unpacklo_epi64(x, y);
+    t1 = _mm_unpackhi_epi64(x, y);
+    x  = _mm_unpackhi_epi32(b0, b1);
+    y  = _mm_unpackhi_epi32(b2, b3);
+    t2 = _mm_unpacklo_epi64(x, y);
+    t3 = _mm_unpackhi_epi64(x, y);
+
+    // SM4_TAU_L1: apply the round function to all 16 bytes of x in place
+    // (y is scratch).
+    //  1. inner affine transform: two 16-entry nibble lookups (m1l, m1h)
+    //  2. inverse ShiftRows shuffle, so that the ShiftRows performed by
+    //     AESENCLAST leaves every byte where it started
+    //  3. AESENCLAST with round key c0f: AES S-box on every byte, then
+    //     XOR with 0x0F
+    //  4. outer affine transform (m2l, m2h); the and-not on the low
+    //     nibble cancels the 0x0F XOR from step 3, which never touched
+    //     the high nibble
+    //  5. linear transform x ^ (x<<<2) ^ (x<<<10) ^ (x<<<18) ^ (x<<<24),
+    //     built from byte rotations (r08, r16, r24) and one 2-bit rotate
+
+#define SM4_TAU_L1 do { \
+    y = _mm_and_si128(x, c0f);              \
+    y = _mm_shuffle_epi8(m1l, y);           \
+    x = _mm_srli_epi64(x, 4);               \
+    x = _mm_and_si128(x, c0f);              \
+    x = _mm_shuffle_epi8(m1h, x) ^ y;       \
+    x = _mm_shuffle_epi8(x, shr);           \
+    x = _mm_aesenclast_si128(x, c0f);       \
+    y = _mm_andnot_si128(x, c0f);           \
+    y = _mm_shuffle_epi8(m2l, y);           \
+    x = _mm_srli_epi64(x, 4);               \
+    x = _mm_and_si128(x, c0f);              \
+    x = _mm_shuffle_epi8(m2h, x) ^ y;       \
+    y = x ^ _mm_shuffle_epi8(x, r08) ^      \
+        _mm_shuffle_epi8(x, r16);           \
+    y = _mm_slli_epi32(y, 2) ^              \
+        _mm_srli_epi32(y, 30);              \
+    x = x ^ y ^ _mm_shuffle_epi8(x, r24);   \
+} while (0)
+
+#ifndef SM4NI_UNROLL
+
+    // not unrolled
+
+    for (i = 0; i < 32; i++) {
+
+        k = rk[i];
+        x = t1 ^ t2 ^ t3 ^ _mm_set1_epi32((int) k);
+        SM4_TAU_L1;
+
+        // rotate registers
+        x ^= t0;
+        t0 = t1;
+        t1 = t2;
+        t2 = t3;
+        t3 = x;
+    }
+
+#else
+
+    // unrolled version: four rounds per iteration, no register rotation
+
+    for (i = 0; i < 32;) {
+
+        k = rk[i++];
+        x = t1 ^ t2 ^ t3 ^ _mm_set1_epi32((int) k);
+        SM4_TAU_L1;
+        t0 ^= x;
+
+        k = rk[i++];
+        x = t0 ^ t2 ^ t3 ^ _mm_set1_epi32((int) k);
+        SM4_TAU_L1;
+        t1 ^= x;
+
+        k = rk[i++];
+        x = t0 ^ t1 ^ t3 ^ _mm_set1_epi32((int) k);
+        SM4_TAU_L1;
+        t2 ^= x;
+
+        k = rk[i++];
+        x = t0 ^ t1 ^ t2 ^ _mm_set1_epi32((int) k);
+        SM4_TAU_L1;
+        t3 ^= x;
+    }
+
+#endif
+
+#undef SM4_TAU_L1
+
+    // Transpose back; each output block is (t3, t2, t1, t0) of its lane,
+    // i.e. the state words in reverse order.
+    x  = _mm_unpacklo_epi32(t3, t2);
+    y  = _mm_unpacklo_epi32(t1, t0);
+    b0 = _mm_unpacklo_epi64(x, y);
+    b1 = _mm_unpackhi_epi64(x, y);
+    x  = _mm_unpackhi_epi32(t3, t2);
+    y  = _mm_unpackhi_epi32(t1, t0);
+    b2 = _mm_unpacklo_epi64(x, y);
+    b3 = _mm_unpackhi_epi64(x, y);
+
+    // Back to big-endian words, stored without any alignment requirement.
+    _mm_storeu_si128(out + 0, _mm_shuffle_epi8(b0, flp));
+    _mm_storeu_si128(out + 1, _mm_shuffle_epi8(b1, flp));
+    _mm_storeu_si128(out + 2, _mm_shuffle_epi8(b2, flp));
+    _mm_storeu_si128(out + 3, _mm_shuffle_epi8(b3, flp));
+}
+
@@ -0,0 +1,128 @@
+// testmain.c
+// 2018-04-20  Markku-Juhani O. Saarinen <mjos@iki.fi>
+
+#include <stdio.h>
+#include <time.h>
+#include <stdlib.h>
+#include <string.h>
+
+// reference implementation
+#include "sm4_ref.h"
+
+// AES-NI / SSSE3 implementation, encrypt 4 blocks at once
+void sm4_encrypt4(const uint32_t rk[32], void *src, const void *dst);
+
+// high-precision time
+//
+// Returns the CPU time consumed so far by this process, in seconds.
+// Prints a diagnostic and terminates the program if the clock cannot be
+// read.
+
+static double clk_now(void)
+{
+    struct timespec ts;
+
+    // You may consider CLOCK_MONOTONIC and CLOCK_MONOTONIC_RAW here too
+    if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts) != 0) {
+        perror("clock_gettime()");
+        exit(-1);
+    }
+    return ((double) ts.tv_sec) + 1E-9 * ((double) ts.tv_nsec);
+}
+
+// test speed and validity
+//
+// 1. Checks sm4_encrypt() against a known key/ciphertext pair (the key is
+//    also used as the plaintext) and checks that sm4_decrypt() inverts it.
+// 2. Checks sm4_encrypt4() against four reference encryptions.
+// 3. Times each implementation for at least two seconds of measured time
+//    and prints its throughput.
+//
+// Returns 0 on success and -1 as soon as one of the checks fails.
+
+int main(int argc, char **argv)
+{
+    // test vectors from the standard
+
+    const uint8_t key[16] = {
+        0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF,
+        0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10
+    };
+
+    const uint8_t ref[16] = {
+        0x68, 0x1E, 0xDF, 0x34, 0xD2, 0x06, 0x96, 0x5E,
+        0x86, 0xB3, 0xE9, 0x4F, 0x53, 0x6E, 0x42, 0x46
+    };
+
+    uint32_t rk[32];
+    uint8_t buf1[64], buf2[64];
+    double start, elapsed, bytes;
+    int i, off, reps;
+
+    memset(buf1, 0x55, sizeof(buf1));
+    memset(buf2, 0xAA, sizeof(buf2));
+
+    // Test reference implementation with a test vector.
+
+    sm4_key_schedule(key, rk);
+
+    sm4_encrypt(rk, key, buf1);
+    if (memcmp(buf1, ref, 16) != 0) {
+        fprintf(stderr, "sm4_encrypt() test failed.\n");
+        return -1;
+    }
+
+    sm4_decrypt(rk, buf1, buf2);
+    if (memcmp(buf2, key, 16) != 0) {
+        fprintf(stderr, "sm4_decrypt() test failed.\n");
+        return -1;
+    }
+
+    // Test the SM4NI four-block version against reference implementation.
+
+    for (i = 0; i < 64; i++)
+        buf1[i] = i;
+
+    // encrypt 4 blocks at once to buf2
+    sm4_encrypt4(rk, buf1, buf2);
+
+    // individual blocks in place (with ref algorithm)
+    for (off = 0; off < 64; off += 16)
+        sm4_encrypt(rk, buf1 + off, buf1 + off);
+
+    if (memcmp(buf1, buf2, 64) != 0) {
+        fprintf(stderr, "sm4_encrypt4() test failed.\n");
+        return -1;
+    }
+
+    // bench reference implementation: the batch size doubles after every
+    // timing check until two seconds have been measured
+
+    bytes = 0;
+    reps = 16;
+    start = clk_now();
+    do {
+
+        for (i = 0; i < reps; i++) {
+            for (off = 0; off < 64; off += 16)
+                sm4_encrypt(rk, buf1 + off, buf1 + off);
+        }
+        bytes += 64 * reps;
+        reps <<= 1;
+        elapsed = clk_now() - start;
+    } while (elapsed < 2.0);
+
+    printf("SM4 reference %10.3f MB/s\n", (bytes / 1E6) / elapsed);
+
+    // bench vector implementation, same scheme
+
+    bytes = 0;
+    reps = 16;
+    start = clk_now();
+    do {
+
+        for (i = 0; i < reps; i++) {
+            sm4_encrypt4(rk, buf1, buf1);
+        }
+        bytes += 64 * reps;
+        reps <<= 1;
+        elapsed = clk_now() - start;
+    } while (elapsed < 2.0);
+
+    printf("Vector SM4NI  %10.3f MB/s\n", (bytes / 1E6) / elapsed);
+
+
+    return 0;
+}
+
+