/*
 * Driver program for a CUDA-based A5/1 rainbow table generator.
 *
 * Copyright (C) 2009: Ingo Albrecht <prom@berlin.ccc.de>
 */

#ifndef TEST_INTERMEDIATES
/* values below are for normal runs */

/*
 * These values are appropriate for a Quadro FX 570M.
 *
 * Before running this on different hardware, you
 * should decrease OPERATIONS_PER_RUN and then
 * increase it incrementally until you get
 * run lengths approaching 5 seconds.
 *
 * Thread and block count should be selected
 * so that they almost hit the register bound.
 *
 * If you want to tune the code for your card,
 * you should do it incrementally, keeping
 * the run length below 5 seconds, or your
 * graphics subsystem might go wonky.
 */
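
/*
 * Illustrative note (added to this listing, not the original author's text):
 * each kernel launch executes OPERATIONS_PER_RUN steps per chain, so run
 * length scales roughly linearly with that value. If a run took, say, 20
 * seconds on some card, cutting OPERATIONS_PER_RUN to a quarter of its value
 * should bring it close to the 5 second target; the host loop then simply
 * issues correspondingly more, shorter launches.
 */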

// number of threads per block
#define NUM_THREADS 32

// number of blocks to schedule
#define NUM_BLOCKS  32

// how long each run should be in cycles.
// must be a power of two for now.
#define OPERATIONS_PER_RUN  32768

#else
// values below are for intermediate testing

#define NUM_THREADS 10
#define NUM_BLOCKS  1

#define OPERATIONS_PER_RUN 32768

#endif

// total operations per chain (2^21)
#define OPERATIONS_PER_CHAIN 2097152

// number of chains to be computed
#define NUM_CHAINS (NUM_THREADS * NUM_BLOCKS)
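
// Sanity check and reference arithmetic (added to this listing, not part of
// the original source): with the normal-run values above this computes
// 32 threads * 32 blocks = 1024 chains, each advanced by 2^21 steps in
// 2097152 / 32768 = 64 kernel launches. The host loop below uses integer
// division and would silently drop a partial run, so the run length must
// divide the chain length exactly.
#if (OPERATIONS_PER_CHAIN % OPERATIONS_PER_RUN) != 0
#error "OPERATIONS_PER_RUN must evenly divide OPERATIONS_PER_CHAIN"
#endif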


#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <cutil.h>

#include "calculate_chain_kernel.cu"

int
main(int argc, char **argv) {
  CUT_DEVICE_INIT(argc, argv);

  uint32 i;

  uint64 start = 0; // XXX put your start vector here
  
  printf("Computing %d chains divided into %d blocks of %d threads, starting at 0x%16.16llx\n",
         NUM_CHAINS, NUM_BLOCKS, NUM_THREADS, start);

  uint32 num_runs = OPERATIONS_PER_CHAIN / OPERATIONS_PER_RUN;
  
  printf("Will execute %d runs of %d steps each.\n", num_runs, OPERATIONS_PER_RUN);

  // create a timer for the whole run
  unsigned int total_timer = 0;
  CUT_SAFE_CALL(cutCreateTimer(&total_timer));
  
  // compute size of state
  uint32  s_results = NUM_CHAINS * sizeof(uint64);
  
  // allocate and initialize host memory
  uint64* h_results = (uint64*) calloc(1, s_results);
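  // Defensive check (added; not in the original source): fail cleanly if the
  // host buffer could not be allocated before it is written below.
  if (h_results == NULL) {
    fprintf(stderr, "failed to allocate %u bytes for host results\n",
            (unsigned int) s_results);
    return 1;
  }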
  for(i = 0; i < NUM_CHAINS; i++) {
    h_results[i] = start + i;
  }
    
  // allocate and initialize device memory
  uint64* d_results;
  CUDA_SAFE_CALL(cudaMalloc((void**)&d_results, s_results));

  CUT_SAFE_CALL(cutStartTimer(total_timer));
  
  CUDA_SAFE_CALL(cudaMemcpy(d_results, h_results, s_results, cudaMemcpyHostToDevice));

  double total_run_time = 0.0;

  uint32 run;
  for(run = 0; run < num_runs; run++) {
    unsigned int run_timer = 0;
    CUT_SAFE_CALL(cutCreateTimer(&run_timer));
    
    // The kernel is told the chain position at which this run starts; the
    // index counts down from OPERATIONS_PER_CHAIN - 1 towards 0 as the runs
    // progress.
    uint32 index = OPERATIONS_PER_CHAIN - 1 - run * OPERATIONS_PER_RUN;

#ifdef TEST_INTERMEDIATES
    // print intermediates (for testing against calculate_chains_dump)
    for(i = 0; i < NUM_CHAINS; i++) {
      printf("results[%d] = 0x%16.16llx\n", i, h_results[i]);
    }
#endif
    
    printf("Run %3.3d/%3.3d, starting at index 0x%6.6x... ", run+1, num_runs, index);

    fflush(stdout);
    // brief pause between runs, presumably to let the graphics subsystem
    // catch up between long kernel launches
    usleep(500*1000);
    
    CUT_SAFE_CALL(cutStartTimer(run_timer));

#ifdef TEST_INTERMEDIATES    
    CUDA_SAFE_CALL(cudaMemcpy(d_results, h_results, s_results, cudaMemcpyHostToDevice));
#endif
    
    dim3 gridDims(NUM_BLOCKS, 1, 1);
    dim3 blockDims(NUM_THREADS, 1, 1);
    crunch<<<gridDims, blockDims>>>(d_results, index);
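    // Optional launch check (added; not in the original source): report
    // errors from the launch itself, e.g. an invalid configuration, before
    // blocking on cudaThreadSynchronize() below.
    cudaError_t launch_err = cudaGetLastError();
    if (launch_err != cudaSuccess) {
      fprintf(stderr, "crunch kernel launch failed: %s\n",
              cudaGetErrorString(launch_err));
      return 1;
    }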
    
    CUDA_SAFE_CALL(cudaThreadSynchronize());

#ifdef TEST_INTERMEDIATES
    CUDA_SAFE_CALL(cudaMemcpy(h_results, d_results, s_results, cudaMemcpyDeviceToHost));
#endif
    
    CUT_SAFE_CALL(cutStopTimer(run_timer));
    
    float run_time = cutGetTimerValue(run_timer);
    printf("%f ms.\n", run_time);
    total_run_time += run_time;
    fflush(stdout);

    
    CUT_SAFE_CALL(cutDeleteTimer(run_timer));
  }

  CUDA_SAFE_CALL(cudaMemcpy(h_results, d_results, s_results, cudaMemcpyDeviceToHost));

  CUT_SAFE_CALL(cutStopTimer(total_timer));

  // free device memory
  CUDA_SAFE_CALL(cudaFree(d_results));

  // print results
  for(i = 0; i < NUM_CHAINS; i++) {
    printf("results[%d] = 0x%16.16llx\n", i, h_results[i]);
  }

  // free host memory  
  free(h_results);

  // report total time
  printf("Total time: %f ms, %f spent crunching\n", cutGetTimerValue(total_timer), total_run_time);

  // delete the whole-run timer  
  CUT_SAFE_CALL(cutDeleteTimer(total_timer));
  
  return 0;
}