Advertisement
Guest User

Untitled

a guest
Dec 2nd, 2012
26
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #include "aes_context.hh"
  2. #include "aes_kernel.h"
  3.  
  4. #include <assert.h>
  5. #include <cutil_inline.h>
  6.  
  7. #define AES_BLOCK_SIZE 16
  8. #define THREADS_PER_BLK 256 // in order to load t box into shared memory on parallel
  9.  
  10. aes_context::aes_context(device_context *dev_ctx)
  11. {
  12. dev_ctx_ = dev_ctx;
  13. for (unsigned i = 0; i <= MAX_STREAM; i++) {
  14. streams[i].out = 0;
  15. streams[i].out_d = 0;
  16. streams[i].out_len = 0;
  17. }
  18. }
/* Destructor: intentionally empty. Device buffers handed out via
 * pool->alloc() are presumably reclaimed by the per-stream
 * cuda_mem_pool's owner, not here -- confirm against device_context. */
aes_context::~aes_context() {

}
  22.  
  23. void aes_context::cbc_encrypt(const void *memory_start,
  24. const unsigned long in_pos,
  25. const unsigned long keys_pos,
  26. const unsigned long ivs_pos,
  27. const unsigned long pkt_offset_pos,
  28. const unsigned long tot_in_len,
  29. unsigned char *out,
  30. const unsigned long num_flows,
  31. const unsigned long tot_out_len,
  32. const unsigned int stream_id,
  33. const unsigned int bits)
  34. {
  35. assert(bits == 128);
  36. assert(dev_ctx_->get_state(stream_id) == READY);
  37. dev_ctx_->set_state(stream_id, WAIT_KERNEL);
  38.  
  39. /* Allocate memory on device */
  40. uint8_t *in_d;
  41. uint8_t *keys_d;
  42. uint8_t *ivs_d;
  43. uint32_t *pkt_offset_d;
  44. void *memory_d;
  45.  
  46. //Calculate # of cuda blks required
  47. unsigned int threads_per_blk = THREADS_PER_BLK;
  48. int num_blks = (num_flows + threads_per_blk - 1) / threads_per_blk;
  49. cuda_mem_pool *pool = dev_ctx_->get_cuda_mem_pool(stream_id);
  50.  
  51. memory_d = pool->alloc(tot_in_len);
  52. cudaMemcpyAsync(memory_d, memory_start, tot_in_len,
  53. cudaMemcpyHostToDevice, dev_ctx_->get_stream(stream_id));
  54.  
  55. dev_ctx_->clear_checkbits(stream_id, num_blks);
  56.  
  57. streams[stream_id].out_d = (uint8_t*)pool->alloc(tot_out_len);
  58.  
  59. in_d = (uint8_t *) memory_d + in_pos;
  60. keys_d = (uint8_t *) memory_d + keys_pos;
  61. ivs_d = (uint8_t *) memory_d + ivs_pos;
  62. pkt_offset_d = (uint32_t *) ((uint8_t *)memory_d + pkt_offset_pos);
  63.  
  64. /* Call cbc kernel function to do encryption */
  65. if (dev_ctx_->use_stream()) {
  66. AES_cbc_128_encrypt_gpu(in_d,
  67. streams[stream_id].out_d,
  68. pkt_offset_d,
  69. keys_d,
  70. ivs_d,
  71. num_flows,
  72. dev_ctx_->get_dev_checkbits(stream_id),
  73. threads_per_blk,
  74. dev_ctx_->get_stream(stream_id));
  75. } else {
  76. AES_cbc_128_encrypt_gpu(in_d,
  77. streams[stream_id].out_d,
  78. pkt_offset_d,
  79. keys_d,
  80. ivs_d,
  81. num_flows,
  82. dev_ctx_->get_dev_checkbits(stream_id),
  83. threads_per_blk);
  84.  
  85. }
  86.  
  87. assert(cudaGetLastError() == cudaSuccess);
  88.  
  89. streams[stream_id].out = out;
  90. streams[stream_id].out_len = tot_out_len;
  91.  
  92. /* Copy data back from device to host */
  93. if (!dev_ctx_->use_stream()) {
  94. sync(stream_id);
  95. }
  96. }
  97.  
/*
 * Advance the given stream's pipeline: kernel -> result copy -> READY.
 *
 * block == true : wait for the pending work; if copy_result and a kernel
 *                 was in flight, also start and wait for the device->host
 *                 result copy. Always returns true with the slot READY.
 * block == false: poll once; returns true only when the slot has reached
 *                 READY (or needed no copy), false while work or the
 *                 result copy is still in flight.
 * copy_result   : when false, the device output is abandoned and the slot
 *                 is marked READY as soon as the kernel finishes.
 *
 * NOTE(review): callers in this file use sync(stream_id) and
 * sync(stream_id, true), so `block`/`copy_result` presumably have default
 * values in the header declaration -- confirm there.
 */
bool aes_context::sync(const unsigned int stream_id,
		       const bool block,
		       const bool copy_result)
{
	if (block) {
		/* Wait for whatever is currently queued on this stream. */
		dev_ctx_->sync(stream_id, true);
		if (copy_result && dev_ctx_->get_state(stream_id) == WAIT_KERNEL) {
			/* Kernel done: queue the ciphertext copy-back, then
			 * wait for it to drain before marking READY. */
			cutilSafeCall(cudaMemcpyAsync(streams[stream_id].out,
						      streams[stream_id].out_d,
						      streams[stream_id].out_len,
						      cudaMemcpyDeviceToHost,
						      dev_ctx_->get_stream(stream_id)));
			dev_ctx_->set_state(stream_id, WAIT_COPY);
			dev_ctx_->sync(stream_id, true);
			dev_ctx_->set_state(stream_id, READY);
		} else if (dev_ctx_->get_state(stream_id) == WAIT_COPY) {
			/* A previously queued copy has now drained. */
			dev_ctx_->set_state(stream_id, READY);
		}
		return true;
	} else {
		/* Non-blocking: bail out while the stream is still busy. */
		if (!dev_ctx_->sync(stream_id, false))
			return false;

		if (dev_ctx_->get_state(stream_id) == WAIT_KERNEL) {
			//if no need for data copy
			if (!copy_result) {
				dev_ctx_->set_state(stream_id, READY);
				return true;
			}

			/* Queue the result copy; completion is observed by a
			 * later sync() call in the WAIT_COPY branch. */
			cutilSafeCall(cudaMemcpyAsync(streams[stream_id].out,
						      streams[stream_id].out_d,
						      streams[stream_id].out_len,
						      cudaMemcpyDeviceToHost,
						      dev_ctx_->get_stream(stream_id)));
			dev_ctx_->set_state(stream_id, WAIT_COPY);

		} else if (dev_ctx_->get_state(stream_id) == WAIT_COPY) {
			dev_ctx_->set_state(stream_id, READY);
			return true;
		} else if (dev_ctx_->get_state(stream_id) == READY) {
			return true;
		} else {
			/* Unknown state: programming error. */
			assert(0);
		}
	}
	return false;
}
  146.  
  147. void aes_context::ecb_128_encrypt(const void *memory_start,
  148. const unsigned long in_pos,
  149. const unsigned long keys_pos,
  150. const unsigned long pkt_index_pos,
  151. const unsigned long data_size,
  152. unsigned char *out,
  153. const unsigned long block_count,
  154. const unsigned long num_flows,
  155. unsigned int stream_id)
  156. {
  157. assert(dev_ctx_->get_state(stream_id) == READY);
  158.  
  159. unsigned long total_len = block_count * AES_BLOCK_SIZE;
  160. uint8_t *in_d;
  161. uint8_t *keys_d;
  162. uint16_t *pkt_index_d;
  163. void * memory_d;
  164.  
  165. cuda_mem_pool *pool = dev_ctx_->get_cuda_mem_pool(stream_id);
  166. memory_d = pool->alloc(data_size);
  167. cudaMemcpyAsync(memory_d, memory_start, data_size,
  168. cudaMemcpyHostToDevice, dev_ctx_->get_stream(stream_id));
  169.  
  170. streams[stream_id].out_d = (uint8_t *) pool->alloc(total_len);
  171.  
  172. in_d = (uint8_t *) memory_d + in_pos;
  173. keys_d = (uint8_t *) memory_d + keys_pos;
  174. pkt_index_d = (uint16_t *) ((uint8_t *) memory_d + pkt_index_pos);
  175.  
  176. unsigned int threads_per_blk = THREADS_PER_BLK;
  177.  
  178. if (dev_ctx_->use_stream())
  179. AES_ecb_128_encrypt_gpu(in_d,
  180. streams[stream_id].out_d,
  181. keys_d,
  182. pkt_index_d,
  183. block_count,
  184. threads_per_blk,
  185. dev_ctx_->get_stream(stream_id));
  186. else
  187. AES_ecb_128_encrypt_gpu(in_d,
  188. streams[stream_id].out_d,
  189. keys_d,
  190. pkt_index_d,
  191. block_count,
  192. threads_per_blk);
  193.  
  194. dev_ctx_->set_state(stream_id, WAIT_KERNEL);
  195.  
  196. streams[stream_id].out = out;
  197. streams[stream_id].out_len = total_len;
  198.  
  199. if (!dev_ctx_->use_stream()) {
  200. sync(stream_id);
  201. }
  202. }
  203.  
  204. void aes_context::ecb_128_encrypt_nocopy(const void *memory_start,
  205. const unsigned long in_pos,
  206. const unsigned long keys_pos,
  207. const unsigned long pkt_index_pos,
  208. const unsigned long data_size,
  209. unsigned char *out,
  210. const unsigned long block_count,
  211. const unsigned long num_flows,
  212. unsigned int stream_id)
  213. {
  214. assert(dev_ctx_->get_state(stream_id) == READY);
  215.  
  216. unsigned long total_len = block_count * AES_BLOCK_SIZE;
  217. uint8_t *in_d;
  218. uint8_t *keys_d;
  219. uint16_t *pkt_index_d;
  220. void * memory_d;
  221.  
  222. cuda_mem_pool *pool = dev_ctx_->get_cuda_mem_pool(stream_id);
  223.  
  224. memory_d = pool->alloc(data_size);
  225. streams[stream_id].out_d = (uint8_t*)pool->alloc(total_len);
  226. in_d = (uint8_t *) memory_d + in_pos;
  227. keys_d = (uint8_t *) memory_d + keys_pos;
  228. pkt_index_d = (uint16_t *) ((uint8_t *) memory_d + pkt_index_pos);
  229.  
  230. unsigned int threads_per_blk = THREADS_PER_BLK;
  231.  
  232. if (dev_ctx_->use_stream())
  233. AES_ecb_128_encrypt_gpu(in_d,
  234. streams[stream_id].out_d,
  235. keys_d,
  236. pkt_index_d,
  237. block_count,
  238. threads_per_blk,
  239. dev_ctx_->get_stream(stream_id));
  240. else
  241. AES_ecb_128_encrypt_gpu(in_d,
  242. streams[stream_id].out_d,
  243. keys_d, pkt_index_d,
  244. block_count,
  245. threads_per_blk);
  246.  
  247. assert(cudaGetLastError() == cudaSuccess);
  248. dev_ctx_->set_state(stream_id, WAIT_KERNEL);
  249.  
  250. streams[stream_id].out = out;
  251. streams[stream_id].out_len = total_len;
  252.  
  253. if (!dev_ctx_->use_stream()) {
  254. sync(stream_id);
  255. }
  256. }
  257.  
  258.  
/*
 * Decrypt block_count 16-byte blocks with AES-128-CBC on the GPU.
 *
 * Ciphertext, keys, IVs and a per-block packet-index table (uint16_t)
 * are packed into one host region at memory_start, located via the *_pos
 * byte offsets. The encryption keys are first expanded in place into
 * decryption keys on the host, then the whole region is copied to the
 * device and the decrypt kernel is launched (one thread per block,
 * 512 threads per CUDA block here, unlike THREADS_PER_BLK elsewhere).
 *
 * NOTE(review): the key-preparation loop writes through a pointer derived
 * from the const `memory_start` -- the caller's buffer is mutated despite
 * the const qualifier. Confirm callers expect their key material to be
 * overwritten in place.
 *
 * NOTE(review): with use_stream() disabled and stream_id > 0, neither
 * launch branch matches and the assert(0) fires -- presumably stream_id
 * is always 0 in non-stream mode; confirm.
 */
void aes_context::cbc_decrypt(const void *memory_start,
			      const unsigned long in_pos,
			      const unsigned long keys_pos,
			      const unsigned long ivs_pos,
			      const unsigned long pkt_index_pos,
			      const unsigned long data_size,
			      unsigned char *out,
			      const unsigned long block_count,
			      const unsigned long num_flows,
			      const unsigned int stream_id,
			      const unsigned int bits)
{
	assert(bits==128);
	assert(dev_ctx_->get_state(stream_id) == READY);
	dev_ctx_->set_state(stream_id, WAIT_KERNEL);

	unsigned long total_len = block_count * AES_BLOCK_SIZE;
	uint8_t *in_d;
	uint8_t *keys_d;
	uint8_t *ivs_d;
	uint16_t *pkt_index_d;
	void *memory_d;
	/* One thread per AES block; round the grid size up. */
	unsigned int threads_per_blk = 512;
	int num_blks = (block_count + threads_per_blk - 1) / threads_per_blk;

	//transfor encrypt key to decrypt key
	//basically it generates round key and take the last
	//CUDA code will generate the round key in reverse order
	for (unsigned i = 0; i < num_flows; i++) {
		/* In-place expansion: each flow's key occupies bits/8 bytes. */
		uint8_t *key = (uint8_t *)memory_start + keys_pos + i * bits / 8;
		AES_decrypt_key_prepare(key, key, bits);
	}

	dev_ctx_->clear_checkbits(stream_id, num_blks);

	cuda_mem_pool *pool = dev_ctx_->get_cuda_mem_pool(stream_id);
	memory_d = pool->alloc(data_size);
	cudaMemcpyAsync(memory_d,
			memory_start,
			data_size,
			cudaMemcpyHostToDevice,
			dev_ctx_->get_stream(stream_id));

	streams[stream_id].out_d = (uint8_t*)pool->alloc(total_len);

	/* Resolve per-section device pointers from the byte offsets. */
	in_d = (uint8_t *) memory_d + in_pos;
	keys_d = (uint8_t *) memory_d + keys_pos;
	ivs_d = (uint8_t *) memory_d + ivs_pos;
	pkt_index_d = (uint16_t *) ((uint8_t *) memory_d + pkt_index_pos);

	assert(bits == 128);
	/* Launch on the slot's own stream when streams are in use (stream 0
	 * falls back to the default-stream variant without a stream arg). */
	if (dev_ctx_->use_stream() && stream_id > 0) {
		AES_cbc_128_decrypt_gpu(in_d,
					streams[stream_id].out_d,
					keys_d,
					ivs_d,
					pkt_index_d,
					block_count,
					dev_ctx_->get_dev_checkbits(stream_id),
					threads_per_blk,
					dev_ctx_->get_stream(stream_id));
	} else if (stream_id == 0) {
		AES_cbc_128_decrypt_gpu(in_d,
					streams[stream_id].out_d,
					keys_d,
					ivs_d,
					pkt_index_d,
					block_count,
					dev_ctx_->get_dev_checkbits(stream_id),
					threads_per_blk);
	} else {
		/* Unsupported combination: non-stream mode with stream_id > 0. */
		assert(0);
	}

	/* Catch launch-configuration errors immediately. */
	assert(cudaGetLastError() == cudaSuccess);

	streams[stream_id].out = out;
	streams[stream_id].out_len = total_len;

	/* Without stream support, block and copy the plaintext back now. */
	if (!dev_ctx_->use_stream()) {
		sync(stream_id, true);
	}
}
Advertisement
RAW Paste Data Copied
Advertisement