// Security library: DH key exchange + XTEA-CTR cipher for DJGPP // // Diffie-Hellman uses the RFC 2409 Group 2 (1024-bit) safe prime with // Montgomery multiplication for modular exponentiation. Private exponents // are 256 bits for fast computation on 486-class hardware. // // XTEA in CTR mode provides symmetric encryption. No lookup tables, // no key schedule -- just shifts, adds, and XORs. #include #include #include #include #include #include #include #include "security.h" // ======================================================================== // Internal defines // ======================================================================== #define BN_BITS 1024 #define BN_WORDS (BN_BITS / 32) #define BN_BYTES (BN_BITS / 8) #define DH_PRIVATE_BITS 256 #define DH_PRIVATE_BYTES (DH_PRIVATE_BITS / 8) #define XTEA_ROUNDS 32 #define XTEA_DELTA 0x9E3779B9 // ======================================================================== // Types // ======================================================================== typedef struct { uint32_t w[BN_WORDS]; } BigNumT; struct SecDhS { BigNumT privateKey; BigNumT publicKey; BigNumT sharedSecret; bool hasKeys; bool hasSecret; }; struct SecCipherS { uint32_t key[4]; uint32_t nonce[2]; uint32_t counter[2]; }; typedef struct { uint32_t key[4]; uint32_t counter[2]; bool seeded; } RngStateT; // ======================================================================== // Static globals // ======================================================================== // RFC 2409 Group 2 (1024-bit MODP) prime, little-endian word order static const BigNumT sDhPrime = { .w = { 0x39E38FAF, 0xCDB1CEDC, 0x51FF5DB8, 0x85E28A20, 0x1E9C284F, 0x2BB72AE0, 0x60F89D81, 0x4E664FD5, 0x45E6F3A1, 0x92F2129E, 0xB8E51B21, 0x35C7D431, 0x14A0C959, 0x137E2179, 0x5BE0CD19, 0x7A51F1D7, 0xF25F1468, 0x302B0A6D, 0xCD3A431B, 0xEF9519B3, 0x8E3404DD, 0x514A0879, 0x3B139B22, 0x020BBEA6, 0x8A67CC74, 0x29024E08, 0x80DC1CD1, 0xC4C6628B, 0x2168C234, 0xC90FDAA2, 0xFFFFFFFF, 
0xFFFFFFFF }}; // Generator g = 2 static const BigNumT sDhGenerator = { .w = { 2 } }; // Montgomery constants (computed lazily on first DH operation). // These are expensive to compute (~2048 shift-and-subtract operations for R2) // but only need to be done once since we always use the same prime. // R = 2^1024 in Montgomery arithmetic; R^2 mod p is the conversion factor. // m0inv = -p[0]^(-1) mod 2^32 is the Montgomery reduction constant. static BigNumT sDhR2; static uint32_t sDhM0Inv; static bool sDhInited = false; // RNG state static RngStateT sRng = { .seeded = false }; // ======================================================================== // Static prototypes (alphabetical) // ======================================================================== static int bnAdd(BigNumT *result, const BigNumT *a, const BigNumT *b); static int bnBit(const BigNumT *a, int n); static int bnBitLength(const BigNumT *a); static void bnClear(BigNumT *a); static int bnCmp(const BigNumT *a, const BigNumT *b); static void bnCopy(BigNumT *dst, const BigNumT *src); static void bnFromBytes(BigNumT *a, const uint8_t *buf); static void bnModExp(BigNumT *result, const BigNumT *base, const BigNumT *exp, const BigNumT *mod, uint32_t m0inv, const BigNumT *r2); static void bnMontMul(BigNumT *result, const BigNumT *a, const BigNumT *b, const BigNumT *mod, uint32_t m0inv); static void bnSet(BigNumT *a, uint32_t val); static int bnShiftLeft1(BigNumT *a); static int bnSub(BigNumT *result, const BigNumT *a, const BigNumT *b); static void bnToBytes(uint8_t *buf, const BigNumT *a); static uint32_t computeM0Inv(uint32_t m0); static void computeR2(BigNumT *r2, const BigNumT *m); static void dhInit(void); static void secureZero(void *ptr, int len); static void xteaEncryptBlock(uint32_t v[2], const uint32_t key[4]); // ======================================================================== // BigNum functions (alphabetical) // ======================================================================== 
static int __attribute__((unused)) bnAdd(BigNumT *result, const BigNumT *a, const BigNumT *b) { uint64_t carry = 0; for (int i = 0; i < BN_WORDS; i++) { uint64_t sum = (uint64_t)a->w[i] + b->w[i] + carry; result->w[i] = (uint32_t)sum; carry = sum >> 32; } return (int)carry; } static int bnBit(const BigNumT *a, int n) { return (a->w[n / 32] >> (n % 32)) & 1; } static int bnBitLength(const BigNumT *a) { for (int i = BN_WORDS - 1; i >= 0; i--) { if (a->w[i]) { uint32_t v = a->w[i]; int bits = i * 32; while (v) { bits++; v >>= 1; } return bits; } } return 0; } static void bnClear(BigNumT *a) { memset(a->w, 0, sizeof(a->w)); } static int bnCmp(const BigNumT *a, const BigNumT *b) { for (int i = BN_WORDS - 1; i >= 0; i--) { if (a->w[i] > b->w[i]) { return 1; } if (a->w[i] < b->w[i]) { return -1; } } return 0; } static void bnCopy(BigNumT *dst, const BigNumT *src) { memcpy(dst->w, src->w, sizeof(dst->w)); } static void bnFromBytes(BigNumT *a, const uint8_t *buf) { for (int i = 0; i < BN_WORDS; i++) { int j = (BN_WORDS - 1 - i) * 4; a->w[i] = ((uint32_t)buf[j] << 24) | ((uint32_t)buf[j + 1] << 16) | ((uint32_t)buf[j + 2] << 8) | (uint32_t)buf[j + 3]; } } // Modular exponentiation using Montgomery multiplication. // // Montgomery multiplication replaces expensive modular reduction (division // by a 1024-bit number) with cheaper additions and right-shifts, at the // cost of converting operands to/from "Montgomery form" (multiply by R mod m). // For exponentiation where we do hundreds of multiplications with the same // modulus, the conversion cost is amortized and the net speedup is ~3-5x // over schoolbook multiply-then-reduce on a 486. // // Uses left-to-right binary (square-and-multiply) scanning of the exponent. // For a 256-bit private exponent, this is ~256 squarings + ~128 multiplies // on average (half the bits are 1). 
static void bnModExp(BigNumT *result, const BigNumT *base, const BigNumT *exp, const BigNumT *mod, uint32_t m0inv, const BigNumT *r2) {
    BigNumT montBase;
    BigNumT montResult;
    BigNumT one;
    int     bits;
    bool    started;

    // Plain integer 1: used both to enter and to leave Montgomery form
    bnSet(&one, 1);

    // Convert base to Montgomery form: montBase = base * R mod m
    bnMontMul(&montBase, base, r2, mod, m0inv);

    // Initialize montResult to 1 in Montgomery form (= R mod m).
    // This is also the final answer when exp is zero (the loop never runs).
    bnMontMul(&montResult, &one, r2, mod, m0inv);

    // Left-to-right binary square-and-multiply. Squaring is skipped until
    // the first 1 bit has been seen; that bit just loads montBase.
    // NOTE(review): the multiply runs only for 1 bits, so execution time
    // depends on the exponent's bit pattern.
    bits    = bnBitLength(exp);
    started = false;

    for (int i = bits - 1; i >= 0; i--) {
        if (started) {
            bnMontMul(&montResult, &montResult, &montResult, mod, m0inv);
        }

        if (bnBit(exp, i)) {
            if (!started) {
                bnCopy(&montResult, &montBase);
                started = true;
            } else {
                bnMontMul(&montResult, &montResult, &montBase, mod, m0inv);
            }
        }
    }

    // Convert back from Montgomery form: result = montResult * 1 * R^(-1) mod m
    bnMontMul(result, &montResult, &one, mod, m0inv);

    // The temporaries are derived from the base and the (private) exponent;
    // do not leave them behind on the stack.
    secureZero(&montBase, sizeof(montBase));
    secureZero(&montResult, sizeof(montResult));
}


// Montgomery multiplication: computes (a * b * R^(-1)) mod m without division.
//
// The algorithm processes one word of 'a' per outer iteration (32 words for
// 1024-bit numbers). For each word:
//   1. Accumulate a[i] * b into the temporary product t
//   2. Compute the Montgomery reduction factor u = t[0] * m0inv (mod 2^32)
//   3. Add u * mod to t and shift right by 32 bits (the division by 2^32)
//
// The shift-and-reduce avoids explicit modular reduction. After all 32
// iterations, the result is in [0, 2m), so a single conditional subtraction
// brings it into [0, m). This is the CIOS (Coarsely Integrated Operand
// Scanning) variant, which is cache-friendly because it accesses 'b' and
// 'mod' sequentially in the inner loops.
static void bnMontMul(BigNumT *result, const BigNumT *a, const BigNumT *b, const BigNumT *mod, uint32_t m0inv) { uint32_t t[BN_WORDS + 1]; uint32_t u; uint64_t carry; uint64_t prod; uint64_t sum; memset(t, 0, sizeof(t)); for (int i = 0; i < BN_WORDS; i++) { // Step 1: t += a[i] * b carry = 0; for (int j = 0; j < BN_WORDS; j++) { prod = (uint64_t)a->w[i] * b->w[j] + t[j] + carry; t[j] = (uint32_t)prod; carry = prod >> 32; } t[BN_WORDS] += (uint32_t)carry; // Step 2: Montgomery reduction factor u = t[0] * m0inv; // Step 3: t = (t + u * mod) >> 32 // First word: result is zero by construction, take carry only prod = (uint64_t)u * mod->w[0] + t[0]; carry = prod >> 32; // Remaining words: shift result left by one position for (int j = 1; j < BN_WORDS; j++) { prod = (uint64_t)u * mod->w[j] + t[j] + carry; t[j - 1] = (uint32_t)prod; carry = prod >> 32; } sum = (uint64_t)t[BN_WORDS] + carry; t[BN_WORDS - 1] = (uint32_t)sum; t[BN_WORDS] = (uint32_t)(sum >> 32); } // Copy result memcpy(result->w, t, BN_WORDS * sizeof(uint32_t)); // Conditional subtract if result >= mod if (t[BN_WORDS] || bnCmp(result, mod) >= 0) { bnSub(result, result, mod); } } static void bnSet(BigNumT *a, uint32_t val) { bnClear(a); a->w[0] = val; } static int bnShiftLeft1(BigNumT *a) { uint32_t carry = 0; for (int i = 0; i < BN_WORDS; i++) { uint32_t newCarry = a->w[i] >> 31; a->w[i] = (a->w[i] << 1) | carry; carry = newCarry; } return carry; } static int bnSub(BigNumT *result, const BigNumT *a, const BigNumT *b) { uint64_t borrow = 0; for (int i = 0; i < BN_WORDS; i++) { uint64_t diff = (uint64_t)a->w[i] - b->w[i] - borrow; result->w[i] = (uint32_t)diff; borrow = (diff >> 63) & 1; } return (int)borrow; } static void bnToBytes(uint8_t *buf, const BigNumT *a) { for (int i = 0; i < BN_WORDS; i++) { int j = (BN_WORDS - 1 - i) * 4; uint32_t w = a->w[i]; buf[j] = (uint8_t)(w >> 24); buf[j + 1] = (uint8_t)(w >> 16); buf[j + 2] = (uint8_t)(w >> 8); buf[j + 3] = (uint8_t)(w); } } // 
======================================================================== // Helper functions (alphabetical) // ======================================================================== // Compute -m0^(-1) mod 2^32 using Newton's method for modular inverse. // Starting from x=1 (which is always a valid initial approximation for // odd m0), each iteration doubles the number of correct bits. After 5 // iterations we have 32 correct bits (1->2->4->8->16->32). This is the // standard approach for computing the Montgomery constant. static uint32_t computeM0Inv(uint32_t m0) { uint32_t x = 1; for (int i = 0; i < 5; i++) { x = x * (2 - m0 * x); } // Return -m0^(-1) mod 2^32 return ~x + 1; } // Compute R^2 mod m where R = 2^1024. This is the Montgomery domain // conversion factor. We compute it by repeated doubling (shift left by 1) // with modular reduction, which is simple but takes 2048 iterations. // Only done once at initialization time. static void computeR2(BigNumT *r2, const BigNumT *m) { bnSet(r2, 1); for (int i = 0; i < 2 * BN_BITS; i++) { int carry = bnShiftLeft1(r2); if (carry || bnCmp(r2, m) >= 0) { bnSub(r2, r2, m); } } } static void dhInit(void) { if (sDhInited) { return; } sDhM0Inv = computeM0Inv(sDhPrime.w[0]); computeR2(&sDhR2, &sDhPrime); sDhInited = true; } // Volatile pointer prevents the compiler from optimizing away the zeroing // as a dead store. Critical for clearing key material -- without volatile, // the compiler sees that ptr is about to be freed and removes the memset. static void secureZero(void *ptr, int len) { volatile uint8_t *p = (volatile uint8_t *)ptr; for (int i = 0; i < len; i++) { p[i] = 0; } } // XTEA block cipher: encrypts an 8-byte block in-place. The Feistel network // uses 32 rounds (vs TEA's 32 or 64). Each round mixes the halves using // shifts, adds, and XORs -- no S-boxes, no lookup tables, no key schedule. 
// The delta constant (golden ratio * 2^32) ensures each round uses a // different effective key, preventing slide attacks. static void xteaEncryptBlock(uint32_t v[2], const uint32_t key[4]) { uint32_t v0 = v[0]; uint32_t v1 = v[1]; uint32_t sum = 0; for (int i = 0; i < XTEA_ROUNDS; i++) { v0 += (((v1 << 4) ^ (v1 >> 5)) + v1) ^ (sum + key[sum & 3]); sum += XTEA_DELTA; v1 += (((v0 << 4) ^ (v0 >> 5)) + v0) ^ (sum + key[(sum >> 11) & 3]); } v[0] = v0; v[1] = v1; } // ======================================================================== // RNG functions (alphabetical) // ======================================================================== // Mix additional entropy into the RNG state. XOR-folding into the key is // simple and cannot reduce entropy (XOR with random data is a bijection). // The re-mix step (encrypting the key with itself) diffuses the new entropy // across all key bits so that even a single byte of good entropy improves // the entire key state. void secRngAddEntropy(const uint8_t *data, int len) { for (int i = 0; i < len; i++) { ((uint8_t *)sRng.key)[i % 16] ^= data[i]; } // Re-mix: encrypt the key with itself uint32_t block[2]; block[0] = sRng.key[0] ^ sRng.key[2]; block[1] = sRng.key[1] ^ sRng.key[3]; xteaEncryptBlock(block, sRng.key); sRng.key[0] ^= block[0]; sRng.key[1] ^= block[1]; block[0] = sRng.key[2] ^ sRng.key[0]; block[1] = sRng.key[3] ^ sRng.key[1]; xteaEncryptBlock(block, sRng.key); sRng.key[2] ^= block[0]; sRng.key[3] ^= block[1]; } // Generate pseudorandom bytes using XTEA-CTR DRBG. Each call encrypts // the monotonically increasing counter with the RNG key, producing 8 bytes // of keystream per block. The counter never repeats (64-bit space), so // the output is a pseudorandom stream as long as the key has sufficient // entropy. Auto-seeds from hardware entropy on first use as a safety net. 
void secRngBytes(uint8_t *buf, int len) { if (!sRng.seeded) { uint8_t entropy[16]; int got = secRngGatherEntropy(entropy, sizeof(entropy)); secRngSeed(entropy, got); } uint32_t block[2]; int pos = 0; while (pos < len) { block[0] = sRng.counter[0]; block[1] = sRng.counter[1]; xteaEncryptBlock(block, sRng.key); int take = len - pos; if (take > 8) { take = 8; } memcpy(buf + pos, block, take); pos += take; // Increment counter if (++sRng.counter[0] == 0) { sRng.counter[1]++; } } } // Gather hardware entropy from the PIT (Programmable Interval Timer) and // BIOS tick count. The PIT runs at 1.193182 MHz, so its LSBs change rapidly // and provide ~10 bits of entropy per read (depending on timing jitter). // The BIOS tick at 18.2 Hz adds a few more bits. Two PIT readings with // the intervening code execution provide some jitter. Total: roughly 20 // bits of real entropy -- not enough alone, but sufficient to seed the DRBG // when supplemented by user interaction timing. int secRngGatherEntropy(uint8_t *buf, int len) { int out = 0; outportb(0x43, 0x00); uint8_t pitLo = inportb(0x40); uint8_t pitHi = inportb(0x40); // BIOS tick count (18.2 Hz) uint32_t ticks = _farpeekl(_dos_ds, 0x46C); if (out < len) { buf[out++] = pitLo; } if (out < len) { buf[out++] = pitHi; } if (out < len) { buf[out++] = (uint8_t)(ticks); } if (out < len) { buf[out++] = (uint8_t)(ticks >> 8); } if (out < len) { buf[out++] = (uint8_t)(ticks >> 16); } if (out < len) { buf[out++] = (uint8_t)(ticks >> 24); } // Second PIT reading for jitter outportb(0x43, 0x00); pitLo = inportb(0x40); pitHi = inportb(0x40); if (out < len) { buf[out++] = pitLo; } if (out < len) { buf[out++] = pitHi; } return out; } // Initialize the RNG from entropy. The 64-byte discard at the end is // standard DRBG practice -- it advances the state past any weak initial // output that might leak information about the seed material. 
void secRngSeed(const uint8_t *entropy, int len) { memset(&sRng, 0, sizeof(sRng)); // XOR-fold entropy into the key for (int i = 0; i < len; i++) { ((uint8_t *)sRng.key)[i % 16] ^= entropy[i]; } // Derive counter from key bits sRng.counter[0] = sRng.key[2] ^ sRng.key[0]; sRng.counter[1] = sRng.key[3] ^ sRng.key[1]; sRng.seeded = true; // Mix state by generating and discarding 64 bytes uint8_t discard[64]; sRng.seeded = true; // prevent recursion in secRngBytes secRngBytes(discard, sizeof(discard)); secureZero(discard, sizeof(discard)); } // ======================================================================== // DH functions (alphabetical) // ======================================================================== // Compute the shared secret from the remote side's public key. // Validates that the remote key is in [2, p-2] to prevent small-subgroup // attacks (keys of 0, 1, or p-1 would produce trivially guessable secrets). // The shared secret is remote^private mod p, which both sides compute // independently and arrive at the same value (the DH property). int secDhComputeSecret(SecDhT *dh, const uint8_t *remotePub, int len) { BigNumT remote; BigNumT two; if (!dh || !remotePub) { return SEC_ERR_PARAM; } if (len != SEC_DH_KEY_SIZE) { return SEC_ERR_PARAM; } if (!dh->hasKeys) { return SEC_ERR_NOT_READY; } dhInit(); bnFromBytes(&remote, remotePub); // Validate remote public key: must be in range [2, p-2] bnSet(&two, 2); if (bnCmp(&remote, &two) < 0 || bnCmp(&remote, &sDhPrime) >= 0) { secureZero(&remote, sizeof(remote)); return SEC_ERR_PARAM; } // shared = remote^private mod p bnModExp(&dh->sharedSecret, &remote, &dh->privateKey, &sDhPrime, sDhM0Inv, &sDhR2); dh->hasSecret = true; secureZero(&remote, sizeof(remote)); return SEC_SUCCESS; } SecDhT *secDhCreate(void) { SecDhT *dh = (SecDhT *)calloc(1, sizeof(SecDhT)); return dh; } // Derive a symmetric key from the 128-byte shared secret by XOR-folding. 
// This is a simple key derivation function: each byte of the secret is // XOR'd into the output key at position (i % keyLen). For a 16-byte XTEA // key, each output byte is the XOR of 8 secret bytes, providing good // mixing. A proper KDF (HKDF, etc.) would be better but adds complexity // and code size for marginal benefit in this use case. int secDhDeriveKey(SecDhT *dh, uint8_t *key, int keyLen) { uint8_t secretBytes[BN_BYTES]; if (!dh || !key || keyLen <= 0) { return SEC_ERR_PARAM; } if (!dh->hasSecret) { return SEC_ERR_NOT_READY; } if (keyLen > BN_BYTES) { keyLen = BN_BYTES; } bnToBytes(secretBytes, &dh->sharedSecret); // XOR-fold 128-byte shared secret down to keyLen bytes memset(key, 0, keyLen); for (int i = 0; i < BN_BYTES; i++) { key[i % keyLen] ^= secretBytes[i]; } secureZero(secretBytes, sizeof(secretBytes)); return SEC_SUCCESS; } // secureZero the entire struct before freeing to prevent the private key // from lingering in freed memory where it could be read by a later malloc. void secDhDestroy(SecDhT *dh) { if (dh) { secureZero(dh, sizeof(SecDhT)); free(dh); } } // Generate a DH keypair: random 256-bit private key, then compute // public = g^private mod p. The private key is only 256 bits (not 1024) // to keep exponentiation fast on 486-class hardware. With Montgomery // multiplication, this takes ~256 squarings + ~128 multiplies, each // operating on 32-word (1024-bit) numbers. 
int secDhGenerateKeys(SecDhT *dh) { if (!dh) { return SEC_ERR_PARAM; } dhInit(); // Generate 256-bit random private key bnClear(&dh->privateKey); secRngBytes((uint8_t *)dh->privateKey.w, DH_PRIVATE_BYTES); // Ensure private key >= 2 if (bnBitLength(&dh->privateKey) <= 1) { dh->privateKey.w[0] = 2; } // public = g^private mod p bnModExp(&dh->publicKey, &sDhGenerator, &dh->privateKey, &sDhPrime, sDhM0Inv, &sDhR2); dh->hasKeys = true; dh->hasSecret = false; return SEC_SUCCESS; } int secDhGetPublicKey(SecDhT *dh, uint8_t *buf, int *len) { if (!dh || !buf || !len) { return SEC_ERR_PARAM; } if (*len < SEC_DH_KEY_SIZE) { return SEC_ERR_PARAM; } if (!dh->hasKeys) { return SEC_ERR_NOT_READY; } bnToBytes(buf, &dh->publicKey); *len = SEC_DH_KEY_SIZE; return SEC_SUCCESS; } // ======================================================================== // Cipher functions (alphabetical) // ======================================================================== SecCipherT *secCipherCreate(const uint8_t *key) { SecCipherT *c; if (!key) { return 0; } c = (SecCipherT *)calloc(1, sizeof(SecCipherT)); if (!c) { return 0; } memcpy(c->key, key, SEC_XTEA_KEY_SIZE); return c; } // CTR-mode encryption/decryption. The counter is encrypted to produce // an 8-byte keystream block, which is XOR'd with the data. The counter // increments after each block. Because XOR is its own inverse, the same // function handles both encryption and decryption. // // IMPORTANT: the counter is internal state that advances with each call. // The secLink layer ensures that TX and RX use separate cipher instances // with separate counters, and that the same counter value is never reused // with the same key (which would be catastrophic for CTR mode security). 
void secCipherCrypt(SecCipherT *c, uint8_t *data, int len) {
    uint32_t       block[2];
    const uint8_t *keystream = (const uint8_t *)block;

    if (!c || !data || len <= 0) {
        return;
    }

    uint8_t *cursor    = data;
    int      remaining = len;

    while (remaining > 0) {
        // Encrypt counter to generate keystream
        block[0] = c->counter[0];
        block[1] = c->counter[1];
        xteaEncryptBlock(block, c->key);

        // XOR keystream with data; a short final chunk consumes a whole
        // counter value even though only part of the block is used
        int chunk = (remaining < 8) ? remaining : 8;
        for (int k = 0; k < chunk; k++) {
            cursor[k] ^= keystream[k];
        }
        cursor    += chunk;
        remaining -= chunk;

        // Increment the 64-bit counter (low word first, carry into high)
        c->counter[0]++;
        if (c->counter[0] == 0) {
            c->counter[1]++;
        }
    }
}


// Wipes the key, nonce and counter, then frees the instance.
// A NULL c is ignored.
void secCipherDestroy(SecCipherT *c) {
    if (!c) {
        return;
    }

    secureZero(c, sizeof(SecCipherT));
    free(c);
}


// Records the nonce and restarts the counter from it, so the next
// secCipherCrypt() call begins at counter value (nonceHi:nonceLo).
// A NULL c is ignored.
void secCipherSetNonce(SecCipherT *c, uint32_t nonceLo, uint32_t nonceHi) {
    if (c) {
        c->nonce[0]   = nonceLo;
        c->counter[0] = nonceLo;
        c->nonce[1]   = nonceHi;
        c->counter[1] = nonceHi;
    }
}