{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://www.nvidia.com/dli\"> <img src=\"images/DLI Header.png\" alt=\"Header\" style=\"width: 400px;\"/> </a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Introduction to CUDA Python with Numba\n",
"\n",
"The **[CUDA](https://en.wikipedia.org/wiki/CUDA)** compute platform enables remarkable application acceleration by enabling developers to execute code in a massively parallel fashion on NVIDA GPUs.\n",
"\n",
"**[Numba](http://numba.pydata.org/)** is a just-in-time Python function compiler that exposes a simple interface for accelerating numerically-focused Python functions. Numba is a very attractive option for Python programmers wishing to GPU accelerate their applications without needing to write C/C++ code, especially for developers already performing computationally heavy operations on NumPy arrays. Numba can be used to accelerate Python functions for the CPU, as well as for NVIDIA GPUs. **The focus of this course is the fundamental techniques needed to GPU-accelerate Python applications using Numba.**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Course Structure\n",
"\n",
"This course is divided into **three** main sections:\n",
"\n",
"- _Introduction to CUDA Python with Numba_\n",
"- _Custom CUDA Kernels in Python with Numba_\n",
"- _Multidimensional Grids and Shared Memory for CUDA Python with Numba_\n",
"\n",
"Each section contains a final assessment problem, the successful completion of which will enable you to earn a Certificate of Competency for the course. Each section also contains an appendix with advanced materials for those of you with interest."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Introduction to CUDA Python with Numba\n",
"\n",
"In this first section you will learn first how to use Numba to compile functions for the CPU, and will receive an introduction to the inner workings of the Numba compiler. You will then proceed to learn how to GPU accelerate element-wise NumPy array functions, along with some techniques for efficiently moving data between a CPU host and GPU device.\n",
"\n",
"By the end of the first session you will be able to GPU accelerate Python code that performs element-wise operations on NumPy arrays."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Custom CUDA Kernels in Python with Numba\n",
"\n",
"In the second section you will expand your abilities to be able to launch arbitrary, not just element-wise, numerically focused functions in parallel on the GPU by writing custom CUDA kernels. In service of this goal you will learn about how NVIDIA GPUs execute code in parallel. Additionally, you will be exposed to several fundamental parallel programming techniques including how to coordinate the work of parallel threads, and how to address race conditions. You will also learn techniques for debugging code that executes on the GPU.\n",
"\n",
"By the end of the second section you will be ready to GPU accelerate an incredible range of numerically focused functions on 1D data sets."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Multidimensional Grids and Shared Memory for CUDA Python with Numba\n",
"\n",
"In the third section you will begin working in parallel with 2D data, and will learn how to utilize an on-chip memory space on the GPU called shared memory.\n",
"\n",
"By the end of the third section, you will be able to write GPU accelerated code in Python using Numba on 1D and 2D datasets while utilizing several of the most important optimization strategies for writing consistently fast GPU accelerated code."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Course Prerequisites\n",
"\n",
"* Competency writing Python, specifically, writing and invoking functions, working with variables, loops, and conditionals, and imports.\n",
"* Familiarity with the NumPy Python library for numerically-focused Python. If you have never used NumPy, but are familiar with Python, you will likely find the use of NumPy in this session straightforward. Comments and links are provided where some clarification might be helpful.\n",
"* A high level understanding of some computer science terms like memory allocation, value types, latency, and processing cores.\n",
"* A basic understanding of what vectors and matrices are, and also matrix multiplication."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Objectives for this Section\n",
"\n",
"By the time you complete this section you will be able to:\n",
"\n",
"- Use Numba to compile Python functions for the CPU.\n",
"- Understand how Numba compiles Python functions.\n",
"- GPU accelerate NumPy ufuncs.\n",
"- GPU accelerate hand-written vectorized functions.\n",
"- Optimize data transfers between the CPU host and GPU device."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## What is Numba?\n",
"\n",
"Numba is a **just-in-time**, **type-specializing**, **function compiler** for accelerating **numerically-focused** Python for either a CPU or GPU. That's a long list, so let's break down those terms:\n",
"\n",
" * **function compiler**: Numba compiles Python functions, not entire applications, and not parts of functions. Numba does not replace your Python interpreter, but is just another Python module that can turn a function into a (usually) faster function. \n",
    " * **type-specializing**: Numba speeds up your function by generating a specialized implementation for the specific data types you are using. Python functions are designed to operate on generic data types, which makes them very flexible, but also very slow. In practice, you will only call a function with a small number of argument types, so Numba will generate a fast implementation for each set of types.\n",
" * **just-in-time**: Numba translates functions when they are first called. This ensures the compiler knows what argument types you will be using. This also allows Numba to be used interactively in a Jupyter notebook just as easily as a traditional application.\n",
" * **numerically-focused**: Currently, Numba is focused on numerical data types, like `int`, `float`, and `complex`. There is very limited string processing support, and many string use cases are not going to work well on the GPU. To get best results with Numba, you will likely be using NumPy arrays."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Requirements for Using Numba\n",
"\n",
"Numba supports a wide range of operating systems:\n",
"\n",
" * Windows 7 and later, 32 and 64-bit\n",
" * macOS 10.9 and later, 64-bit\n",
" * Linux (most anything >= RHEL 5), 32-bit and 64-bit\n",
"\n",
"and Python versions:\n",
"\n",
    " * Python 2.7, or Python 3.4 and later\n",
" * NumPy 1.10 and later\n",
"\n",
"and a very wide range of hardware:\n",
"\n",
"* x86, x86_64/AMD64 CPUs\n",
"* NVIDIA CUDA GPUs (Compute capability 3.0 and later, CUDA 8.0 and later)\n",
"* AMD GPUs (experimental patches)\n",
"* ARM (experimental patches)\n",
"\n",
"For this course, we will be using Linux 64-bit and CUDA 9."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Aside: CUDA C/C++ vs. Numba vs. pyCUDA\n",
"\n",
"By no means is Numba the only way to program with CUDA. By far the most common way to program in CUDA is with the CUDA C/C++ language extensions. With regards to Python, [pyCUDA](https://documen.tician.de/pycuda/) is, in addition to Numba, an alternative to GPU accelerating Python code. We will remained focused on Numba throughout this course, but a quick comparison of the three options just named is worth a mention before we get started, just for a little context.\n",
"\n",
"**CUDA C/C++**:\n",
"- The most common, performant, and flexible way to utilize CUDA\n",
"- Accelerates C/C++ applications\n",
"\n",
"**pyCUDA**:\n",
"- Exposes the entire CUDA C/C++ API\n",
"- Is the most performant CUDA option available for Python\n",
"- Requires writing C code in your Python, and in general, a lot of code modifications\n",
"\n",
"**Numba**:\n",
"- Potentially less performant than pyCUDA\n",
"- Does not (yet?) expose the entire CUDA C/C++ API\n",
"- Still enables massive acceleration, often with very little code modification\n",
"- Allows developers the convenience of writing code directly in Python\n",
"- Also optimizes Python code for the CPU"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## First Steps: Compile for the CPU\n",
"\n",
"If you recall Numba can be used to optimize code for either a CPU or GPU. As an introduction, and before moving onto GPU acceleration, let's write our first Numba function and compile it for the **CPU**. In doing so we will get an easy entrance into Numba syntax, and will also have an opportunity a little later on to compare the performance of CPU optimized Numba code to GPU acclerated Numba code.\n",
"\n",
"The Numba compiler is typically enabled by applying a [**function decorator**](https://en.wikipedia.org/wiki/Python_syntax_and_semantics#Decorators) to a Python function. Decorators are function modifiers that transform the Python functions they decorate, using a very simple syntax. Here we will use Numba's CPU compilation decorator `@jit`:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from numba import jit\n",
"import math\n",
"\n",
"# This is the function decorator syntax and is equivalent to `hypot = jit(hypot)`.\n",
"# The Numba compiler is just a function you can call whenever you want!\n",
"@jit\n",
"def hypot(x, y):\n",
" # Implementation from https://en.wikipedia.org/wiki/Hypot\n",
" x = abs(x);\n",
" y = abs(y);\n",
" t = min(x, y);\n",
" x = max(x, y);\n",
" t = t / x;\n",
" return x * math.sqrt(1+t*t)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's try out our hypotenuse calculation:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5.0"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hypot(3.0, 4.0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will go in to more detail below about what happens when `hypot` is called, but for now know that the first time we call `hypot`, the compiler is triggered and compiles a machine code implementation of the function for float inputs. Numba also saves the original Python implementation of the function in the `.py_func` attribute, so we can call the original Python code to make sure we get the same answer:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5.0"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hypot.py_func(3.0, 4.0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Benchmarking\n",
"\n",
"An important part of using Numba is measuring the performance of your new code. Let's see if we actually sped anything up. The easiest way to do this in a Jupyter notebook, like the one this session is run in, is to use the [`%timeit` magic function](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-timeit). Let's first measure the speed of the original Python:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"669 ns ± 0.53 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n"
]
}
],
"source": [
"%timeit hypot.py_func(3.0, 4.0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `%timeit` magic runs the statement many times to get an accurate estimate of the run time. It also returns the best time by default, which is useful to reduce the probability that random background events affect your measurement. The best of 3 approach also ensures that the compilation time on the first call doesn't skew the results:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%timeit hypot(3.0, 4.0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Numba did a pretty good job with this function. It's certainly faster than the pure Python version. Of course, the `hypot` function is already present in the Python module, let's see how it compares:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%timeit math.hypot(3.0, 4.0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Python's built-in is even faster than Numba! This is because Numba does introduce some overhead to each function call that is larger than the function call overhead of Python itself. Extremely fast functions (like the above one) will be hurt by this. (As an aside, if you call one Numba function from another one, there is very little function overhead, sometimes even zero if the compiler inlines the function into the other one. In short, always benchmark your functions for evidence of speed up.)"
]
},
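  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To make the nested-call point concrete, here is a minimal sketch (the function `hypot_sum` is ours, purely for illustration): a second `@jit` function that calls `hypot` in a loop, so the Python-level dispatch overhead is paid once for the outer call rather than once per `hypot` call:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "@jit\n",
    "def hypot_sum(n):\n",
    "    # Numba-to-Numba calls compile to direct (often inlined) machine-code calls.\n",
    "    total = 0.0\n",
    "    for i in range(n):\n",
    "        total += hypot(3.0, 4.0)\n",
    "    return total\n",
    "\n",
    "hypot_sum(1000)  # the first call triggers compilation\n",
    "%timeit hypot_sum(1000)"
   ]
  },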
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Exercise: Use Numba to Compile a Function for the CPU\n",
"\n",
"The following function uses [the Monte Carlo Method to determine Pi](https://academo.org/demos/estimating-pi-monte-carlo/) (source code from the [Numba homepage](http://numba.pydata.org/)). The function itself is already working so don't worry about the mathematical implementation details.\n",
"\n",
"Complete the two `TODO`s in order to compile `monte_carlo_pi` with Numba before executing the following 3 cells which will:\n",
"\n",
" 1. Confirm the compiled version is behaving the same as the uncompiled version.\n",
" 2. Benchmark the uncompiled version.\n",
" 3. Benchmark the compiled version.\n",
"\n",
"If you get stuck, check out [the solution](../edit/solutions/monte_carlo_pi_solution.py)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"nsamples = 1000000"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# TODO: Import Numba's just-in-time compiler function\n",
"import random\n",
"\n",
"# TODO: Use the Numba compiler to compile this function\n",
"def monte_carlo_pi(nsamples):\n",
" acc = 0\n",
" for i in range(nsamples):\n",
" x = random.random()\n",
" y = random.random()\n",
" if (x**2 + y**2) < 1.0:\n",
" acc += 1\n",
" return 4.0 * acc / nsamples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# We will use numpy's `testing` library to confirm compiled and uncompiled versions run the same\n",
"from numpy import testing\n",
"\n",
"# This assertion will fail until you successfully complete the exercise one cell above\n",
"testing.assert_almost_equal(monte_carlo_pi(nsamples), monte_carlo_pi.py_func(nsamples), decimal=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%timeit monte_carlo_pi(nsamples)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%timeit monte_carlo_pi.py_func(nsamples)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## How Numba Works\n",
"\n",
"Now that you've gotton your hands a little dirty using the Numba compiler, let's take a look at what is actually going on under the hood. The first time we called our Numba-wrapped `hypot` function, the following process was initiated:\n",
"\n",
"\n",
"\n",
"We can see the result of type inference by using the `.inspect_types()` method, which prints an annotated version of the source code:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
"source": [
"hypot.inspect_types()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that Numba's type names tend to mirror [the NumPy type names](https://docs.scipy.org/doc/numpy-1.13.0/user/basics.types.html), so a Python `float` is a `float64` (also called \"double precision\" in other languages). Taking a look at the data types can sometimes be important in GPU code because the performance of `float32` and `float64` computations can (depending on the GPU) be very different on CUDA devices. If your algorithm can obtain correct results using `float32`, then you should probably use that data type, because casting to `float64` can, depending on the GPU type, dramatically slow down the function."
]
},
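  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick illustration of type specialization (a minimal sketch; `signatures` is the attribute on Numba-compiled functions that lists the compiled specializations), we can call `hypot` with `float32` inputs and watch Numba compile a second machine-code version alongside the `float64` one:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Calling with a new set of argument types triggers a new specialization.\n",
    "hypot(np.float32(3.0), np.float32(4.0))\n",
    "\n",
    "# One compiled signature per distinct set of argument types seen so far.\n",
    "print(hypot.signatures)"
   ]
  },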
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Object and nopython Modes\n",
"\n",
"Numba cannot compile all Python code. Some functions don't have a Numba-translation, and some kinds of Python types can't be efficiently compiled at all (yet). For example, Numba does not support dictionaries (as of this writing). Here let's try to compile some Python code that Numba does not yet know how to compile:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"@jit\n",
"def cannot_compile(x):\n",
" return x['key']\n",
"\n",
"cannot_compile(dict(key='value'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Given what we just said, you might be surpised that the cell above executed without any problems. This is because by default, Numba will fall back to a mode, called **object mode**, which does not do type-specialization. Object mode exists to enable other Numba functionality, but in many cases, you want Numba to tell you if type inference fails. You can force **nopython mode** (the other compilation mode) by passing the `nopython` argument to the decorator:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"@jit(nopython=True)\n",
"def cannot_compile(x):\n",
" return x['key']\n",
"\n",
"cannot_compile(dict(key='value'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we get an exception when Numba tries to compile the function, and if you scroll down to the end of the exception output you will see an error that describes the underlying problem:\n",
"```\n",
"- argument 0: cannot determine Numba type of <class 'dict'>\n",
"```\n",
"\n",
"**Using `nopython` mode is the recommended and best practice way to use `jit` as it leads to the best performance.**\n",
"\n",
"Numba provides another decorator `njit` which is an alias for `jit(nopython=True)`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from numba import njit\n",
"\n",
"@njit\n",
"def cannot_compile(x):\n",
" return x['key']\n",
"\n",
"cannot_compile(dict(key='value'))"
]
},
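  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a sketch of the usual remedy, passing values with types Numba can infer (here a plain scalar) lets similar logic compile cleanly in nopython mode:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "@njit\n",
    "def can_compile(value):\n",
    "    # Scalars and NumPy arrays have well-defined Numba types, so type inference succeeds.\n",
    "    return value * 2\n",
    "\n",
    "can_compile(21)"
   ]
  },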
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Please refer to [the Numba documentation](https://numba.pydata.org/numba-doc/dev/reference/pysupported.html) for an exhaustive account of Numba-supported Python."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction to Numba for the GPU with NumPy Universal Functions (ufuncs)\n",
"\n",
"We will begin our coverage of GPU programming in Numba with how to compile [NumPy Universal functions \\(or ufuncs\\)](https://docs.scipy.org/doc/numpy-1.15.1/reference/ufuncs.html) for the GPU."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The most important thing to know about GPU programming as we get started is that GPU hardware is designed for *data parallelism*. Maximum throughput is achieved when the GPU is computing the same operations on many different elements at once.\n",
"\n",
"NumPy Universal functions, which perform the same operation on every element in a NumPy array, are naturally data parallel, so they are a natural fit for GPU programming."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Review of NumPy Universal Functions (ufuncs)\n",
"\n",
"Familiarity with NumPy ufuncs is a prerequisite of this course, but in case you are unfamiliar with them, or in case it has been a while, here is a very brief introduction. If, at the end of this brief introduction, you don't feel comfortable with the basic NumPy mechanisms for array creation and ufuncs, consider the ~1 hour [NumPy Quickstart Tutorial](https://docs.scipy.org/doc/numpy/user/quickstart.html).\n",
"\n",
"NumPy has the concept of universal functions (\"ufuncs\"), which are functions that can take NumPy arrays of varying dimensions, or scalars, and operate on them element-by-element.\n",
"\n",
"As an example we'll use the NumPy `add` ufunc to demonstrate the basic ufunc mechanism:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"a = np.array([1, 2, 3, 4])\n",
"b = np.array([10, 20, 30, 40])\n",
"\n",
"np.add(a, b) # Returns a new NumPy array resulting from adding every element in `a` to every element in `b`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ufuncs also can combine scalars with arrays:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"np.add(a, 100) # Returns a new NumPy array resulting from adding 100 to every element in `a`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Arrays of different, but compatible dimensions can also be combined via a technique called [*broadcasting*](https://docs.scipy.org/doc/numpy-1.15.0/user/basics.broadcasting.html). The lower dimensional array will be replicated to match the dimensionality of the higher dimensional array. If needed, check out the docs for [`numpy.arange`](https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.arange.html) and [`numpy.ndarray.reshape`](https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.ndarray.reshape.html), both will be used several times throughout this course:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"c = np.arange(4*4).reshape((4,4))\n",
"print('c:', c)\n",
"\n",
"np.add(b, c)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Making ufuncs for the GPU\n",
"\n",
"Numba has the ability to create *compiled* ufuncs, typically a not-so-straighforward process involving C code. With Numba you simply implement a scalar function to be performed on all the inputs, decorate it with `@vectorize`, and Numba will figure out the broadcast rules for you. For those of you familiar with [NumPy's `vectorize`](https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.vectorize.html), Numba's `vectorize` decorator will be very familiar."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this very first example we will use the `@vectorize` decorator to compile and optimize a ufunc for the **CPU**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from numba import vectorize\n",
"\n",
"@vectorize\n",
"def add_ten(num):\n",
" return num + 10 # This scalar operation will be performed on each element"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"nums = np.arange(10)\n",
"add_ten(nums) # pass the whole array into the ufunc, it performs the operation on each element"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We are generating a ufunc that uses CUDA on the GPU with the addition of giving an **explicit type signature** and setting the `target` attribute. The type signature argument describes what types to use both for the ufuncs arguments and return value:\n",
"```python\n",
"'return_value_type(argument1_value_type, argument2_value_type, ...)'\n",
"```\n",
"\n",
"Please see the Numba docs for more on [available types](https://numba.pydata.org/numba-doc/dev/reference/types.html), as well as for additional information on [writing ufuncs with more than one signature](https://numba.pydata.org/numba-doc/dev/user/vectorize.html)\n",
"\n",
"Here is a simple example of a ufunc that will be compiled for a CUDA enabled GPU device. It expects two `int64` values and return also an `int64` value:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"@vectorize(['int64(int64, int64)'], target='cuda') # Type signature and target are required for the GPU\n",
"def add_ufunc(x, y):\n",
" return x + y"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"add_ufunc(a, b)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For such a simple function call, a lot of things just happened! Numba just automatically:\n",
"\n",
" * Compiled a CUDA kernel to execute the ufunc operation in parallel over all the input elements.\n",
" * Allocated GPU memory for the inputs and the output.\n",
" * Copied the input data to the GPU.\n",
" * Executed the CUDA kernel (GPU function) with the correct kernel dimensions given the input sizes.\n",
" * Copied the result back from the GPU to the CPU.\n",
" * Returned the result as a NumPy array on the host.\n",
" \n",
"Compared to an implementation in C, the above is remarkably more concise.\n",
"\n",
"You might be wondering how fast our simple example is on the GPU? Let's see:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%timeit np.add(b, c) # NumPy on CPU"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%timeit add_ufunc(b, c) # Numba on GPU"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Wow, the GPU is *a lot slower* than the CPU?? For the time being this is to be expected because we have (deliberately) misused the GPU in several ways in this example. How we have misused the GPU will help clarify what kinds of problems are well-suited for GPU computing, and which are best left to be performed on the CPU:\n",
"\n",
" * **Our inputs are too small**: the GPU achieves performance through parallelism, operating on thousands of values at once. Our test inputs have only 4 and 16 integers, respectively. We need a much larger array to even keep the GPU busy.\n",
" * **Our calculation is too simple**: Sending a calculation to the GPU involves quite a bit of overhead compared to calling a function on the CPU. If our calculation does not involve enough math operations (often called \"arithmetic intensity\"), then the GPU will spend most of its time waiting for data to move around.\n",
    " * **We copy the data to and from the GPU**: While in some scenarios, paying the cost of copying data to and from the GPU can be worth it for a single function, often it will be preferable to run several GPU operations in sequence. In those cases, it makes sense to send data to the GPU and keep it there until all of our processing is complete.\n",
    " * **Our data types are larger than necessary**: Our example uses `int64` when we probably don't need it. Scalar code using 32-bit and 64-bit data types runs at basically the same speed on the CPU, and for integer types the difference on the GPU may not be drastic, but 64-bit floating point data types may have a significant performance cost on the GPU, depending on the GPU type. Basic arithmetic on 64-bit floats can be anywhere from 2x (Pascal-architecture Tesla) to 24x (Maxwell-architecture GeForce) slower than 32-bit floats. If you are using more modern GPUs (Volta, Turing, Ampere), then this could be far less of a concern. NumPy defaults to 64-bit data types when creating arrays, so it is important to set the [`dtype`](https://docs.scipy.org/doc/numpy-1.14.0/reference/arrays.dtypes.html) attribute or use the [`ndarray.astype()`](https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.ndarray.astype.html) method to pick 32-bit types when you need them.\n",
" \n",
" \n",
"Given the above, let's try an example that is faster on the GPU by performing an operation with much greater arithmetic intensity, on a much larger input, and using a 32-bit data type.\n",
"\n",
"**Please note:** Not all NumPy code will work on the GPU, and, as in the following example, we will need to use the `math` library's `pi` and `exp` instead of NumPy's. Please see [the Numba docs](https://numba.pydata.org/numba-doc/latest/reference/numpysupported.html) for extensive coverage of NumPy support on the GPU."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import math # Note that for the CUDA target, we need to use the scalar functions from the math module, not NumPy\n",
"\n",
"SQRT_2PI = np.float32((2*math.pi)**0.5) # Precompute this constant as a float32. Numba will inline it at compile time.\n",
"\n",
"@vectorize(['float32(float32, float32, float32)'], target='cuda')\n",
"def gaussian_pdf(x, mean, sigma):\n",
" '''Compute the value of a Gaussian probability density function at x with given mean and sigma.'''\n",
" return math.exp(-0.5 * ((x - mean) / sigma)**2) / (sigma * SQRT_2PI)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"# Evaluate the Gaussian a million times!\n",
"x = np.random.uniform(-3, 3, size=1000000).astype(np.float32)\n",
"mean = np.float32(0.0)\n",
"sigma = np.float32(1.0)\n",
"\n",
"# Quick test on a single element just to make sure it works\n",
"gaussian_pdf(x[0], 0.0, 1.0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import scipy.stats # for definition of gaussian distribution, so we can compare CPU to GPU time\n",
"norm_pdf = scipy.stats.norm\n",
"%timeit norm_pdf.pdf(x, loc=mean, scale=sigma)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%timeit gaussian_pdf(x, mean, sigma)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"That's a pretty large improvement, even including the overhead of copying all the data to and from the GPU. Ufuncs that use special functions (`exp`, `sin`, `cos`, etc) on large data sets run especially well on the GPU.\n",
"\n",
"To complete our comparison, let's define and time our `gaussian_pdf` function when optimized by Numba for the **CPU**:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"@vectorize\n",
"def cpu_gaussian_pdf(x, mean, sigma):\n",
" '''Compute the value of a Gaussian probability density function at x with given mean and sigma.'''\n",
" return math.exp(-0.5 * ((x - mean) / sigma)**2) / (sigma * SQRT_2PI)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%timeit cpu_gaussian_pdf(x, mean, sigma)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"That's much faster than the uncompiled CPU version, but much slower than the GPU accelerated one."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CUDA Device Functions\n",
"\n",
"Ufuncs are really quite fantastic if and when you want to perform element wise operations, which is a very common task. There are any number of functions however, that do not fit this description. To compile functions for the GPU that are **not** element wise, vectorized functions, we use `numba.cuda.jit`. In the next section of this course we work extensively with `numba.cuda.jit`, but for now, let us demonstrate how to use it to decorate a helper function, to be utilized by a GPU accelerated ufunc, so that you are not required to cram all your logic into a single ufunc defintion.\n",
"\n",
"Notice that `polar_to_cartesian` below does not require a type signature, and also, that it is passed two scalar values, unlike the vectorized ufuncs we have been using (and like `polar_distance` below) which expect NumPy arrays as arguments.\n",
"\n",
"The argument `device=True` indicates that the decorated function can **only** be called from a function running on the GPU, and not from CPU host code:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from numba import cuda\n",
"\n",
"@cuda.jit(device=True)\n",
"def polar_to_cartesian(rho, theta):\n",
" x = rho * math.cos(theta)\n",
" y = rho * math.sin(theta)\n",
" return x, y\n",
"\n",
"@vectorize(['float32(float32, float32, float32, float32)'], target='cuda')\n",
"def polar_distance(rho1, theta1, rho2, theta2):\n",
" x1, y1 = polar_to_cartesian(rho1, theta1) # We can use device functions inside our GPU ufuncs\n",
" x2, y2 = polar_to_cartesian(rho2, theta2)\n",
" \n",
" return ((x1 - x2)**2 + (y1 - y2)**2)**0.5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"n = 1000000\n",
"rho1 = np.random.uniform(0.5, 1.5, size=n).astype(np.float32)\n",
"theta1 = np.random.uniform(-np.pi, np.pi, size=n).astype(np.float32)\n",
"rho2 = np.random.uniform(0.5, 1.5, size=n).astype(np.float32)\n",
"theta2 = np.random.uniform(-np.pi, np.pi, size=n).astype(np.float32)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"polar_distance(rho1, theta1, rho2, theta2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that the CUDA compiler aggressively inlines device functions, so there is generally no overhead for function calls. Similarly, the \"tuple\" returned by `polar_to_cartesian` is not actually created as a Python object, but represented temporarily as a struct, which is then optimized away by the compiler."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Allowed Python on the GPU\n",
"\n",
"Compared to Numba on the CPU (which is already limited), Numba on the GPU has more limitations. Supported Python includes:\n",
"\n",
"* `if`/`elif`/`else`\n",
"* `while` and `for` loops\n",
"* Basic math operators\n",
"* Selected functions from the `math` and `cmath` modules\n",
"* Tuples\n",
"\n",
"See [the Numba manual](http://numba.pydata.org/numba-doc/latest/cuda/cudapysupported.html) for more details."
]
},
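  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To tie the list above together, here is a small sketch of a GPU ufunc that uses a loop, conditionals, and a `math` module function, all of which are supported on the device (the ufunc `soft_clip` and its exact math are ours, purely illustrative):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import math\n",
    "import numpy as np\n",
    "from numba import vectorize\n",
    "\n",
    "@vectorize(['float32(float32)'], target='cuda')\n",
    "def soft_clip(x):\n",
    "    # A loop, conditionals, and math.sin: all allowed in GPU-compiled code.\n",
    "    acc = 0.0\n",
    "    for i in range(4):\n",
    "        acc += math.sin(x + i)\n",
    "    if acc > 1.0:\n",
    "        acc = 1.0\n",
    "    elif acc < -1.0:\n",
    "        acc = -1.0\n",
    "    return acc\n",
    "\n",
    "soft_clip(np.arange(8, dtype=np.float32))"
   ]
  },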
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Exercise: GPU Accelerate a Function\n",
"\n",
"Let's GPU accelerate a \"zero suppression\" function. A common operation when working with waveforms is to force all sample values below a certain absolute magnitude to be zero, as a way to eliminate low amplitude noise. Let's make some sample data:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# This allows us to plot right here in the notebook\n",
"%matplotlib inline\n",
"\n",
"# Hacking up a noisy pulse train\n",
"from matplotlib import pyplot as plt\n",
"\n",
"n = 100000\n",
"noise = np.random.normal(size=n) * 3\n",
"pulses = np.maximum(np.sin(np.arange(n) / (n / 23)) - 0.3, 0.0)\n",
"waveform = ((pulses * 300) + noise).astype(np.int16)\n",
"plt.plot(waveform)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now decorate this `zero_suppress` function to run as a vectorized ufunc on the CUDA device. Check out [the solution](../edit/solutions/zero_suppress_solution.py) if you get stuck."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def zero_suppress(waveform_value, threshold):\n",
" if waveform_value < threshold:\n",
" result = 0\n",
" else:\n",
" result = waveform_value\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# This will throw an error until you successfully vectorize the `zero_suppress` function above.\n",
"# The noise on the baseline should disappear when zero_suppress is implemented\n",
"plt.plot(zero_suppress(waveform, 15))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Managing GPU Memory\n",
"\n",
"So far we have used NumPy arrays on the CPU as inputs and outputs to our GPU functions. As a convenience, Numba has been automatically transferring this data to the GPU for us so that it can be operated on by the GPU. With this implicit data transfer Numba, acting conservatively, will automatically transfer the data back to the CPU after processing. As you can imagine, this is a very time intensive operation.\n",
"\n",
"The [CUDA Best Practices Guide](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html) indicates:\n",
"\n",
"> **High Priority**: Minimize data transfer between the host and the device, even if it means running some kernels on the device that do not show performance gains when compared with running them on the host CPU.\n",
"\n",
"With this in mind, we ought to consider how to prevent this automatic data transfer back to the host so that we can perform additional work on the data, only paying the price of copying it back to the host when we are truly ready.\n",
"\n",
"The way to do this is to create **CUDA Device Arrays** and pass them to our GPU functions. Device arrays will not be automatically transfered back to the host after processing, and can be reused as we wish on the device before ultimately, and only if necessary, sending them, or parts of them, back to the host.\n",
"\n",
"To demonstrate, let's create our example addition ufunc again:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"@vectorize(['float32(float32, float32)'], target='cuda')\n",
"def add_ufunc(x, y):\n",
" return x + y"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
"source": [
"n = 100000\n",
"x = np.arange(n).astype(np.float32)\n",
"y = 2 * x"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%timeit add_ufunc(x, y) # Baseline performance with host arrays"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `numba.cuda` module includes a function that will copy host data to the GPU and return a CUDA device array. Note that below when we try to print the content of the device array, we only get information about the array, and not its actual contents. This is because the data is on the device, and we would need to transfer it back to the host in order to print its values, which we will show how to do later:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from numba import cuda\n",
"\n",
"x_device = cuda.to_device(x)\n",
"y_device = cuda.to_device(y)\n",
"\n",
"print(x_device)\n",
"print(x_device.shape)\n",
"print(x_device.dtype)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Device arrays can be passed to CUDA functions just like NumPy arrays, but without the copy overhead:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%timeit add_ufunc(x_device, y_device)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because `x_device` and `y_device` are already on the device, this benchmark is much faster.\n",
"\n",
"We are, however, still allocating a device array for the output of the ufunc and copying it back to the host, even though in the cell above we are not actually assigning the array to a variable. To avoid this, we can create the output array with the [`numba.cuda.device_array()`](https://numba.pydata.org/numba-doc/dev/cuda-reference/memory.html#numba.cuda.device_array) function:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"out_device = cuda.device_array(shape=(n,), dtype=np.float32) # does not initialize the contents, like np.empty()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"And then we can use a special `out` keyword argument to the ufunc to specify the output buffer:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%timeit add_ufunc(x_device, y_device, out=out_device)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This call to `add_ufunc` does not involve any data transfers between the host and device and therefore runs the fastest. If and when we want to bring a device array back to the host memory, we can use the `copy_to_host()` method:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"out_host = out_device.copy_to_host()\n",
"print(out_host[:10])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You may be thinking that we are not comparing apples to apples here since we have not been benchmarking the `to_device` calls when using the device arrays although the implicit data transfers are being counted towards the benchmarking when we use host arrays `a` and `b`, and you would be correct. Of course our `add_func` function is not particularly well suited for the GPU as discussed earlier. The above was only intended to demonstrate how the transfers can be eliminated.\n",
"\n",
"Be sure to benchmark your data transfers when exploring whether or not a trip to the GPU is worth it.\n",
"\n",
"Also, Numba provides additional methods for managing device memory and data transfer, check out [the docs](https://numba.pydata.org/numba-doc/dev/cuda/memory.html) for full details."
]
},
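  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Following up on the benchmarking advice above, here is a sketch (the helper `add_roundtrip` is ours, purely illustrative) that times the full round trip, copy in, compute, copy out, making the transfer cost explicit:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def add_roundtrip(x, y):\n",
    "    # Count both transfers and the kernel, for a fair comparison with host arrays.\n",
    "    x_d = cuda.to_device(x)\n",
    "    y_d = cuda.to_device(y)\n",
    "    out_d = cuda.device_array_like(x_d)\n",
    "    add_ufunc(x_d, y_d, out=out_d)\n",
    "    return out_d.copy_to_host()\n",
    "\n",
    "%timeit add_roundtrip(x, y)"
   ]
  },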
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Exercise: Optimize Memory Movement\n",
"\n",
"Given these ufuncs:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import math\n",
"\n",
"@vectorize(['float32(float32, float32, float32)'], target='cuda')\n",
"def make_pulses(i, period, amplitude):\n",
" return max(math.sin(i / period) - 0.3, 0.0) * amplitude\n",
"\n",
"n = 100000\n",
"noise = (np.random.normal(size=n) * 3).astype(np.float32)\n",
"t = np.arange(n, dtype=np.float32)\n",
"period = n / 23"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As it currently stands in the cell below, there is an unnecessary data roundtrip back to the host and then back again to the device in between the calls to `make_pulses` and `add_ufunc`.\n",
"\n",
"Update the cell below to use device allocations so that there is only one copy to device before the call to `make_pulses` and one copy back to host after the call to `add_ufunc`. Check out [the solution](../edit/solutions/make_pulses_solution.py) if you get stuck."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pulses = make_pulses(t, period, 100.0)\n",
"waveform = add_ufunc(pulses, noise)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from matplotlib import pyplot as plt\n",
"plt.plot(waveform)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Assessment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following exercise will require you to utilize everything you've learned so far to GPU-accelerate neural network calculations. Unlike previous exercises, there will not be any solution code available to you. Just like in this section, the other 2 notebooks in this course also have assessment problems. For those of you who successfully complete all 3, you will receive a **certificate of competency** in the course.\n",
"\n",
"**Please read the directions carefully before beginning your work to ensure the best chance at successfully completing the assessment.**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Accelerate Neural Network Calculations\n",
"\n",
"You will be refactoring a simple version of some code that performs work needed to create a hidden layer in a neural network. It normalizes grayscale values, weighs them, and applies an activation function.\n",
"\n",
"Your task is to move this work to the GPU using the techniques you've learned while retaining the correctness of the calculations."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Imports and Initialize Values"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run this cell to import required libraries and intitialize values before beginning your work below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# You should not modify this cell, it contains imports and initial values needed to do work on either\n",
"# the CPU or the GPU.\n",
"\n",
"import numpy as np\n",
"from numba import cuda, vectorize\n",
"\n",
"# Our hidden layer will contain 1M neurons.\n",
"# When you assess your work below, this value will be automatically set to 100M.\n",
"n = 1000000\n",
"\n",
"greyscales = np.floor(np.random.uniform(0, 255, n).astype(np.float32))\n",
"weights = np.random.normal(.5, .1, n).astype(np.float32)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### GPU Accelerate"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You will need to make modifications to each of the 3 cells in this section before assessing your work below. Follow the instructions in the comments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# As you will recall, `numpy.exp` works on the CPU, but, cannot be used in GPU implmentations.\n",
"# This import will work for the CPU-only boilerplate code provided below, but\n",
"# you will need to modify this import before your GPU implementation will work.\n",
"from numpy import exp"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Modify these 3 function calls to run on the GPU.\n",
"def normalize(grayscales):\n",
" return grayscales / 255\n",
"\n",
"def weigh(values, weights):\n",
" return values * weights\n",
" \n",
"def activate(values):\n",
" return ( exp(values) - exp(-values) ) / ( exp(values) + exp(-values) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Modify the body of this function to optimize data transfers and therefore speed up performance.\n",
"# As a constraint, even after you move work to the GPU, make this function return a host array.\n",
"def create_hidden_layer(n, greyscales, weights, exp, normalize, weigh, activate):\n",
" \n",
" normalized = normalize(greyscales)\n",
" weighted = weigh(normalized, weights)\n",
" activated = activate(weighted)\n",
" \n",
" # The assessment mechanism will expect `activated` to be a host array, so,\n",
" # even after you refactor this code to run on the GPU, make sure to explicitly copy\n",
" # `activated` back to the host.\n",
" return activated"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Check Your Work"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Feel free in this section to check your work and debug as needed before running the assessment below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# You probably don't need to edit this cell, unless you change the name of any of the values being passed as\n",
"# arguments to `create_hidden_layer` below.\n",
"arguments = {\"n\":n,\n",
" \"greyscales\": greyscales,\n",
" \"weights\": weights,\n",
" \"exp\": exp,\n",
" \"normalize\": normalize,\n",
" \"weigh\": weigh,\n",
" \"activate\": activate}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Use this cell (and feel free to create others) to self-assess your function\n",
"a = create_hidden_layer(**arguments)\n",
"print(a)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run the Assessment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run the following 2 cells to assess your work."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from assessment import assess"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"assess(create_hidden_layer, arguments)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Credit for Your Work"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"After successfully passing the assessment above, revisit the webpage where you launched this interactive environment and click on the **\"ASSESS TASK\"** button as shown in the screenshot below. Doing so will give you credit for this part of the workshop that counts towards earning a **certificate of competency** for the entire course."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary\n",
"\n",
"Now that you have completed this session you are able to:\n",
"\n",
"- Use Numba to compile Python functions for the CPU\n",
"- Understand how Numba compiles functions\n",
"- GPU accelerate NumPy ufuncs\n",
"- GPU accelerate hand-written vectorized functions\n",
"- Optimize memory transfers between the CPU host and GPU device"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Content\n",
"\n",
"To download the contents of this notebook, execute the following cell and then click the download link below. Note: If you run this notebook on a local Jupyter server, you can expect some of the file path links in the notebook to be broken as they are shaped to our own platform. You can still navigate to the files through the Jupyter file navigator."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"!tar -zcvf section1.tar.gz ."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[Download files from this section.](files/section1.tar.gz)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Appendix: Generalized Ufuncs\n",
"\n",
"Ufuncs broadcast a scalar function over array inputs but what if you want to broadcast a lower dimensional array function over a higher dimensional array? This is called a *generalized ufunc* (\"gufunc\"), and it opens up a whole new frontier for applying ufuncs.\n",
"\n",
"Generalized ufuncs are a little more tricky because they need a *signature* (not to be confused with the Numba type signature) that shows the index ordering when dealing with multiple inputs. Fully explaining \"gufunc\" signatures is beyond the scope of this tutorial, but you can learn more from:\n",
"\n",
"* The NumPy docs on gufuncs: https://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html\n",
"* The Numba docs on gufuncs: http://numba.pydata.org/numba-doc/latest/user/vectorize.html#the-guvectorize-decorator\n",
"* The Numba docs on CUDA gufuncs: http://numba.pydata.org/numba-doc/latest/cuda/ufunc.html#generalized-cuda-ufuncs\n",
"\n",
"Let's write our own normalization function. This will take an array input and compute the L2 norm along the last dimension. Generalized ufuncs take their output array as the last argument, rather than returning a value. If the output is a scalar, then we will still receive an array that is one dimension less than the array input. For example, computing the row sums of an array will return a 1 dimensional array for 2D array input, or 2D array for 3D array input."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from numba import guvectorize\n",
"import math\n",
"\n",
"@guvectorize(['(float32[:], float32[:])'], # have to include the output array in the type signature\n",
" '(i)->()', # map a 1D array to a scalar output\n",
" target='cuda')\n",
"def l2_norm(vec, out):\n",
" acc = 0.0\n",
" for value in vec:\n",
" acc += value**2\n",
" out[0] = math.sqrt(acc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To test this, let's construct some points on the unit circle:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"angles = np.random.uniform(-np.pi, np.pi, 10)\n",
"coords = np.stack([np.cos(angles), np.sin(angles)], axis=1)\n",
"print(coords)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As expected, the L2 norm is 1.0, up to rounding errors:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"l2_norm(coords)"
]
},
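  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As one more sketch, the row-sums example mentioned above maps onto the same pattern: a `(i)->()` signature that reduces away the last dimension (the function `row_sums` is ours, purely illustrative):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "@guvectorize(['(float32[:], float32[:])'],\n",
    "             '(i)->()',\n",
    "             target='cuda')\n",
    "def row_sums(row, out):\n",
    "    acc = 0.0\n",
    "    for value in row:\n",
    "        acc += value\n",
    "    out[0] = acc\n",
    "\n",
    "m = np.arange(12, dtype=np.float32).reshape(3, 4)\n",
    "row_sums(m)  # a 1D array of length 3: one sum per row"
   ]
  },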
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://www.nvidia.com/dli\"> <img src=\"images/DLI Header.png\" alt=\"Header\" style=\"width: 400px;\"/> </a>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}