+/*
+ * Mersenne Twister (MT19937) parameters: a state of 624 32-bit words
+ * with a "twist" offset of 397 — the classic MT19937 constants.
+ * MT_SPACE (227) is the split point used by the unrolled regeneration
+ * loops to avoid modulus operations.
+ */
+#define MT_SIZE 624
+#define MT_PERIOD 397
+#define MT_SPACE (MT_SIZE - MT_PERIOD)
+
+/* Generator state, and the index of the next word to be handed out. */
+static uint32_t mt_state[MT_SIZE];
+static size_t mt_index = 0;
+
+/*
+ * Regenerate all MT_SIZE words of the MT19937 state in one pass.
+ * NOTE(review): the tail of this function (the wrap-around step for
+ * mt_state[MT_SIZE - 1], which must use mt_state[0]) is not visible in
+ * this hunk — confirm it follows below.
+ */
+static GMQCC_INLINE void mt_generate(void) {
+ /*
+ * The loop has been unrolled here: the original paper and implementation
+ * called for the following code:
+ * for (register unsigned i = 0; i < MT_SIZE; ++i) {
+ * register uint32_t load;
+ * load = (0x80000000 & mt_state[i]) // most significant bit (bit 31)
+ * load |= (0x7FFFFFFF & mt_state[(i + 1) % MT_SIZE]) // least significant 31 bits
+ *
+ * mt_state[i] = mt_state[(i + MT_PERIOD) % MT_SIZE] ^ (load >> 1);
+ *
+ * if (load & 1) mt_state[i] ^= 0x9908B0DF;
+ * }
+ *
+ * This essentially is a waste: we have two modulus operations, and
+ * a branch that is executed every iteration from [0, MT_SIZE).
+ *
+ * Please see: http://www.quadibloc.com/crypto/co4814.htm for more
+ * information on how this clever trick works.
+ */
+ /*
+ * matrix[y & 1] replaces the "if (load & 1)" branch above: XOR with 0
+ * when the low bit is clear, with the MT19937 twist constant
+ * 0x9908B0DF when it is set.  (The trailing lowercase 'f' below is a
+ * style nit only — same value.)
+ */
+ static const uint32_t matrix[2] = {
+ 0x00000000,
+ 0x9908B0Df
+ };
+ /*
+ * `register` is only a hint (and ignored by modern compilers); these
+ * are the scratch word and the state index used by the loops below.
+ * The original comment's claim about forced prefetch instructions is
+ * not something the keyword guarantees.
+ */
+ register uint32_t y;
+ register uint32_t i;
+
+ /*
+ * First region: indices [0, MT_SPACE), where i + MT_PERIOD stays in
+ * bounds so no modulus is needed.  (MT_SPACE is 227 and MT_SIZE is
+ * 624 — the original comment's "226"/"634" figures were off by one.)
+ *
+ * NOTE(review): BUG — the low mask in the two statements below is
+ * 0x7FFFFFF (seven F's, i.e. 0x07FFFFFF), not 0x7FFFFFFF as used in
+ * the second loop further down; bits 27..30 of the neighbouring word
+ * are dropped, so the output diverges from reference MT19937.
+ *
+ * NOTE(review): BUG — MT_SPACE (227) is odd, so on the final pass
+ * (i == 226) the in-body `i ++` makes the second statement process
+ * i == 227, where mt_state[i + MT_PERIOD] reads mt_state[624] — one
+ * past the end of the array (out-of-bounds read).  Index 227 is then
+ * recomputed, with the correct mask, by the loop below.
+ */
+ for (i = 0; i < MT_SPACE; ++i) {
+ y = (0x80000000 & mt_state[i]) | (0x7FFFFFF & mt_state[i + 1]);
+ mt_state[i] = mt_state[i + MT_PERIOD] ^ (y >> 1) ^ matrix[y & 1];
+
+ i ++; /* loop unroll */
+
+ y = (0x80000000 & mt_state[i]) | (0x7FFFFFF & mt_state[i + 1]);
+ mt_state[i] = mt_state[i + MT_PERIOD] ^ (y >> 1) ^ matrix[y & 1];
+ }
+
+ /*
+ * Second region: indices [MT_SPACE, MT_SIZE - 1), where i - MT_SPACE
+ * stays in bounds.  623 - 227 = 396 = 2*2*3*3*11 iterations (the
+ * original comment's "632" was a typo), evenly divisible by the
+ * unroll factor of 11.
+ */
+ i = MT_SPACE;
+ while (i < MT_SIZE - 1) {
+ /*
+ * The body is expanded 11 times by hand — no macros required, and
+ * the whole loop still fits in the instruction cache.
+ */
+ y = (0x80000000 & mt_state[i]) | (0x7FFFFFFF & mt_state[i + 1]);
+ mt_state[i] = mt_state[i - MT_SPACE] ^ (y >> 1) ^ matrix[y & 1];
+ ++i;
+ y = (0x80000000 & mt_state[i]) | (0x7FFFFFFF & mt_state[i + 1]);
+ mt_state[i] = mt_state[i - MT_SPACE] ^ (y >> 1) ^ matrix[y & 1];
+ ++i;
+ y = (0x80000000 & mt_state[i]) | (0x7FFFFFFF & mt_state[i + 1]);
+ mt_state[i] = mt_state[i - MT_SPACE] ^ (y >> 1) ^ matrix[y & 1];
+ ++i;
+ y = (0x80000000 & mt_state[i]) | (0x7FFFFFFF & mt_state[i + 1]);
+ mt_state[i] = mt_state[i - MT_SPACE] ^ (y >> 1) ^ matrix[y & 1];
+ ++i;
+ y = (0x80000000 & mt_state[i]) | (0x7FFFFFFF & mt_state[i + 1]);
+ mt_state[i] = mt_state[i - MT_SPACE] ^ (y >> 1) ^ matrix[y & 1];
+ ++i;
+ y = (0x80000000 & mt_state[i]) | (0x7FFFFFFF & mt_state[i + 1]);
+ mt_state[i] = mt_state[i - MT_SPACE] ^ (y >> 1) ^ matrix[y & 1];
+ ++i;
+ y = (0x80000000 & mt_state[i]) | (0x7FFFFFFF & mt_state[i + 1]);
+ mt_state[i] = mt_state[i - MT_SPACE] ^ (y >> 1) ^ matrix[y & 1];
+ ++i;
+ y = (0x80000000 & mt_state[i]) | (0x7FFFFFFF & mt_state[i + 1]);
+ mt_state[i] = mt_state[i - MT_SPACE] ^ (y >> 1) ^ matrix[y & 1];
+ ++i;
+ y = (0x80000000 & mt_state[i]) | (0x7FFFFFFF & mt_state[i + 1]);
+ mt_state[i] = mt_state[i - MT_SPACE] ^ (y >> 1) ^ matrix[y & 1];
+ ++i;
+ y = (0x80000000 & mt_state[i]) | (0x7FFFFFFF & mt_state[i + 1]);
+ mt_state[i] = mt_state[i - MT_SPACE] ^ (y >> 1) ^ matrix[y & 1];
+ ++i;
+ y = (0x80000000 & mt_state[i]) | (0x7FFFFFFF & mt_state[i + 1]);
+ mt_state[i] = mt_state[i - MT_SPACE] ^ (y >> 1) ^ matrix[y & 1];
+ ++i;
+ }