diff options
Diffstat (limited to 'A5.1/CUDA/calculate_chain_kernel.cu')
-rw-r--r-- | A5.1/CUDA/calculate_chain_kernel.cu | 130 |
1 files changed, 130 insertions, 0 deletions
diff --git a/A5.1/CUDA/calculate_chain_kernel.cu b/A5.1/CUDA/calculate_chain_kernel.cu new file mode 100644 index 0000000..0c6ebec --- /dev/null +++ b/A5.1/CUDA/calculate_chain_kernel.cu @@ -0,0 +1,130 @@ +/* + * Calculation of chains for A5/1 rainbow table cracking. + * + * + * Loosely based on: A pedagogical implementation of A5/1. + * + * Copyright (C) 1998-1999: Marc Briceno, Ian Goldberg, and David Wagner + * + * See accompanying file A5.1.c for original version and full copyright + * + * + * Modified and optimized for running on CUDA. + * + * Copyright (C) 2009: Ingo Albrecht <prom@berlin.ccc.de> + * + */ + +// XXX: Undefine for 64bit platform. You will also have to fix up printfs. +#define BITSIZE_32 + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <math.h> + +/* Masks for the three shift registers */ +#define R1MASK 0x07FFFF /* 19 bits, numbered 0..18 */ +#define R2MASK 0x3FFFFF /* 22 bits, numbered 0..21 */ +#define R3MASK 0x7FFFFF /* 23 bits, numbered 0..22 */ + +/* Middle bit of each of the three shift registers, for clock control */ +#define R1MID 0x000100 /* bit 8 */ +#define R2MID 0x000400 /* bit 10 */ +#define R3MID 0x000400 /* bit 10 */ + +/* Feedback taps, for clocking the shift registers. */ +#define R1TAPS 0x072000 /* bits 18,17,16,13 */ +#define R2TAPS 0x300000 /* bits 21,20 */ +#define R3TAPS 0x700080 /* bits 22,21,20,7 */ + +/* Output taps, for output generation */ +#define R1OUT 0x040000 /* bit 18 (the high bit) */ +#define R2OUT 0x200000 /* bit 21 (the high bit) */ +#define R3OUT 0x400000 /* bit 22 (the high bit) */ + +typedef unsigned char byte; +#ifdef BITSIZE_32 +typedef unsigned long uint32; +typedef unsigned long long uint64; +#else +typedef unsigned int uint32; +typedef unsigned long uint64; +#endif + +typedef unsigned int bit; + +__device__ bit parity32(uint32 x) { + x ^= x>>16; + x ^= x>>8; + x ^= x>>4; + x ^= x>>2; + x ^= x>>1; + return x&1; +} + +__device__ uint32 clockone(uint32 reg, uint32 mask, uint32 taps) { + uint32 t = reg & taps; + reg = (reg << 1) & mask; + reg |= parity32(t); + return reg; +} + +__device__ bit majority(uint32 R1, uint32 R2, uint32 R3) { + int sum; + sum = ((R1&R1MID) >> 8) + ((R2&R2MID) >> 10) + ((R3&R3MID) >> 10); + if (sum >= 2) + return 1; + else + return 0; +} + +__device__ bit getbit(uint32 R1, uint32 R2, uint32 R3) { + return ((R1&R1OUT) >> 18) ^ ((R2&R2OUT) >> 21) ^ ((R3&R3OUT) >> 22); +} + +__device__ uint64 calculate_link (uint64 input, uint32 count) { + uint64 result; + int i; + + /* Reduction function. */ + uint32 R1 = ((input >> (22 + 23))^count) & R1MASK; + uint32 R2 = ((input >> 23)^count) & R2MASK; + uint32 R3 = (input^count) & R3MASK; + + result = getbit(R1, R2, R3); + for(i=1;i<64;i++) { + // Yes, virginia, we only need to clock 63 times for 64 bits of output + + // clock() + bit maj = majority(R1, R2, R3); + uint32 T1 = clockone(R1, R1MASK, R1TAPS); + uint32 T2 = clockone(R2, R2MASK, R2TAPS); + uint32 T3 = clockone(R3, R3MASK, R3TAPS); + + if (((R1&R1MID)!=0) == maj) + R1 = T1; + if (((R2&R2MID)!=0) == maj) + R2 = T2; + if (((R3&R3MID)!=0) == maj) + R3 = T3; + + result = (result << 1)| getbit(R1, R2, R3); + } + return result; +} + +__global__ void crunch(uint64* results, uint32 index) { + uint32 tid = blockIdx.x * blockDim.x + threadIdx.x; + + uint64 state = results[tid]; + + uint32 i; + for(i = 0; i < OPERATIONS_PER_RUN; i++) { + state = calculate_link(state, index - i); + } + + results[tid] = state; +} + |