From e590290b917b84ba2c04c4a9320f8039afa5b454 Mon Sep 17 00:00:00 2001 From: Tyler Allen Date: Fri, 24 Jan 2025 05:57:26 +0000 Subject: [PATCH 01/91] moving all 01 introduction tutorials to inlined examples --- docs_input/notebooks/01_introduction.ipynb | 629 ++++++++++++------ .../exercises/example1_adv_slice_col.cu | 55 -- .../exercises/example1_adv_slice_row.cu | 55 -- .../exercises/example1_assignment1.cu | 127 ---- .../notebooks/exercises/example1_clone.cu | 56 -- .../notebooks/exercises/example1_init.cu | 54 -- .../notebooks/exercises/example1_permute.cu | 54 -- .../exercises/example1_simple_slice.cu | 55 -- 8 files changed, 418 insertions(+), 667 deletions(-) delete mode 100644 docs_input/notebooks/exercises/example1_adv_slice_col.cu delete mode 100644 docs_input/notebooks/exercises/example1_adv_slice_row.cu delete mode 100644 docs_input/notebooks/exercises/example1_assignment1.cu delete mode 100644 docs_input/notebooks/exercises/example1_clone.cu delete mode 100644 docs_input/notebooks/exercises/example1_init.cu delete mode 100644 docs_input/notebooks/exercises/example1_permute.cu delete mode 100644 docs_input/notebooks/exercises/example1_simple_slice.cu diff --git a/docs_input/notebooks/01_introduction.ipynb b/docs_input/notebooks/01_introduction.ipynb index 02a7708f0..a8e3ea92f 100644 --- a/docs_input/notebooks/01_introduction.ipynb +++ b/docs_input/notebooks/01_introduction.ipynb @@ -33,7 +33,7 @@ "metadata": {}, "source": [ "## Training Structure\n", - "This training contains a series of tutorials in increasing order of name that will guide you through basic and intermediate MatX features. Most tutorials will require you to open the source code, make a small change, and run it from the Jupyter notebook. As time permits, there is another notebook called `99_assignments.ipynb` that will give a series of problems to solve using MatX primitives. The assignments have verification code to make sure the answer is correct." 
+ "This training contains a series of tutorials in increasing order of name that will guide you through basic and intermediate MatX features. Early tutorials are implemented using an in-line interpreter that allows you to run MatX code natively in the Jupyter cells. Later, more complex tutorials that are performance sensitive will require you to open source code in separate source files, and run it from the Jupyter notebook. As time permits, there is another notebook called `99_assignments.ipynb` that will give a series of problems to solve using MatX primitives. The assignments have verification code to make sure the answer is correct." ] }, { @@ -97,7 +97,9 @@ "source": [ "### 0. Library Import\n", "\n", - "During this tutorial, we will be writing our MatX code in a CUDA file (`.cu`), compiled via the CUDA Compiler, `nvcc`. If you're curious, the specific command line to build and execute code can be found [here](exercises/compile_and_run.sh).\n", + "During this tutorial, the includes and the Jupyter interpreter enable compilation; in deployed code, however, MatX is most often compiled via the CUDA Compiler, `nvcc`. If you're curious, an example command line to build and execute code can be found [here](exercises/compile_and_run.sh).\n", + "\n", + "[comment]: <> (TYLER_TODO: We should elaborate)\n", "\n", "When using MatX, be sure to import the library via:\n", "\n", @@ -114,6 +116,48 @@ "Be aware that since MatX mimics a lot of functionality from the standard library, you may see common names from MatX in your current namespace. It's usually recommended to not import the entire `matx` namespace unless absolutely necessary." 
] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "//todo this should be moved to a hidden init block that runs automatically when the notebook starts\n", + "#pragma cling add_library_path(\"/usr/local/cuda/lib64\")\n", + "#pragma cling add_library_path(\"/opt/xeus/cling/lib\")\n", + "//#pragma cling add_library_path(\"/usr/Lib/gcc/x86_64-Linux-gnu/11/\")\n", + "#pragma cling add_library_path(\"/usr/lib/x86_64-linux-gnu/openblas64-openmp/\")\n", + "#pragma cling add_include_path(\"/usr/local/cuda/include\")\n", + "#pragma cling add_include_path(\"/usr/include/x86_64-linux-gnu/openblas64-openmp\")\n", + "#pragma cling add_include_path(\"/opt/xeus/cling/tools/Jupyter/kernel/MatX/include\")\n", + "#pragma cling add_include_path(\"/opt/xeus/cling/tools/Jupyter/kernel/MatX/build/_deps/cccl-src/libcudacxx/include\")\n", + "//#pragma cling load(\"libgomp\")\n", + "#pragma cling load(\"libopenblas64\")\n", + "#pragma cling load(\"libcuda\")\n", + "#pragma cling load(\"libcudart\")\n", + "#pragma cling load(\"libcurand\")\n", + "#pragma cling load(\"libcublas\")\n", + "#pragma cling load(\"libcublasLt\")\n", + "\n", + "#include \n", + "#include \n", + "\n", + "#define MATX_EN_OPENBLAS\n", + "#define MATX_EN_OPENBLAS_LAPACK\n", + "#define MATX_OPENBLAS_64BITINT\n", + "\n", + "#include \"matx.h\"" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -121,11 +165,31 @@ "source": [ "### 1. 
Creation\n", "\n", - "First, we create a tensor object:\n", - "\n", - "```c++\n", - "auto t2 = make_tensor({5,4});\n", - "```\n", + "First, we create a tensor object:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto t2 = matx::make_tensor({5,4});" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "\n", "While you can create `tensor_t` objects directly, it us advised to used `make_tensor` instead. `tensor_t` has several template parameters that may change in the future, and using `make_tensor` avoids that ambiguity and always returns the correct tensor type. One exception to this is when a user wants to use `tensor_t` as a class member variable. In that case you need to use `tensor_t` directly, and in the constructor of your class use `make_tensor` with your `tensor_t` variable as the first argument. This will be demonstrated later.\n", "\n", @@ -145,25 +209,100 @@ "source": [ "### 2. 
Initialization \n", "\n", - "After allocating the tensor, we initialize the underlying data:\n", - "\n", - "```c++\n", - "t2.SetVals({ {1, 2, 3, 4},\n", - " {5, 6, 7, 8},\n", - " {9, 10, 11, 12},\n", - " {13, 14, 15, 16},\n", - " {17, 18, 19, 20}});\n", - "\n", + "After allocating the tensor, we initialize the underlying data:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[5, 4], Strides:[4,1]\n", + "000000: 1 2 3 4 \n", + "000001: 5 6 7 8 \n", + "000002: 9 10 11 12 \n", + "000003: 13 14 15 16 \n", + "000004: 17 18 19 20 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t2.SetVals({ \n", + " {1, 2, 3, 4},\n", + " {5, 6, 7, 8},\n", + " {9, 10, 11, 12},\n", + " {13, 14, 15, 16},\n", + " {17, 18, 19, 20}\n", + " });\n", "\n", "t2.PrefetchDevice(0);\n", - "```\n", "\n", - "The tensor is initialized using a nested initializer list inside of the `SetVals` member function, specifying the values of the matrix. The initializer list is a single-nested list to match a 2D tensor shape, but this can be extended up to 4D tensors. `operator()` is also available to set and get individual values of a tensor as an alternative:\n", - "```c++\n", - " t2(0,0) = 1;\n", - " t2(0,1) = 2;\n", - " ...\n", - "```\n", + "matx::print(t2);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The tensor is initialized using a nested initializer list inside of the `SetVals` member function, specifying the values of the matrix. The initializer list is a single-nested list to match a 2D tensor shape, but this can be extended up to 4D tensors. 
`operator()` is also available to set and get individual values of a tensor as an alternative:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[5, 4], Strides:[4,1]\n", + "000000: 42 2 3 4 \n", + "000001: 5 6 7 8 \n", + "000002: 9 10 11 12 \n", + "000003: 13 14 117 16 \n", + "000004: 17 18 19 20 \n", + "My updates value for (3,2): 117\n" + ] + }, + { + "data": { + "text/plain": [ + "(std::basic_ostream >::__ostream_type &) @0x7f5eafc34540\n" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t2(0,0) = 42;\n", + "t2(3,2) = 117;\n", + "\n", + "matx::print(t2);\n", + "\n", + "std::cout << \"My updates value for (3,2): \" << t2(3,2) << std::endl;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "\n", "**NOTE** The syntax above is executed on the host when written. This works for any type of memory accessible by the host, but will result in a segmentation fault if the tensor is backed by device memory.\n", "\n", @@ -177,32 +316,7 @@ "print(t2);\n", "``` \n", "\n", - "`print` is a utility function to print a tensor or operator's contents to stdout. Printing can be used with any type of operator, including ones that have no memory backing them (see upcoming generators section). If a tensor is being printed, the data backing it can reside either on the host or device, and MatX will move it before printing if needed. With no arguments `print` will print the entire contents of the tensor. However, the size of the printing can also be limited by passing a limit to each dimension. For example, `print(3,2)` would print the first 2 columns and 3 rows of the 2D tensor. 
The contents of the tensor printed should appear as an increasing sequence of numbers from the top to bottom rows.\n", - "\n", - "Open the file [exercises/example1_init.cu](exercises/example1_init.cu) and edit the contents where you see TODO markers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example1_init" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Expected output:\n", - "```sh\n", - "000000: 1 2 3 4 \n", - "000001: 5 6 7 8 \n", - "000002: 9 10 11 12 \n", - "000003: 13 14 15 16 \n", - "000004: 17 18 19 20 \n", - "```" + "`print` is a utility function to print a tensor or operator's contents to stdout. Printing can be used with any type of operator, including ones that have no memory backing them (see upcoming generators section). If a tensor is being printed, the data backing it can reside either on the host or device, and MatX will move it before printing if needed. With no arguments `print` will print the entire contents of the tensor. However, the size of the printing can also be limited by passing a limit to each dimension. For example, `print(3,2)` would print the first 2 columns and 3 rows of the 2D tensor. The contents of the tensor printed should appear as an increasing sequence of numbers from the top to bottom rows." ] }, { @@ -211,40 +325,49 @@ "metadata": {}, "source": [ "### 3. Permute\n", - "The next section calls `permute` on the returned view:\n", - "\n", - "```c++\n", - "t2p = permute(t2, {1,0});\n", - "print(t2p);\n", - "```\n", - "\n", - "`permute` returns a view of the data with the dimensions swapped to match the order of the initializer list argument. In this case there are only two dimensions being permuted on a 2D tensor, so it's equivalent to a matrix transpose. However, `permute` can be used on higher-order tensors with the dimensions swapped in any particular order. 
Like printing, `permute` can work on any type of operator as input and not just tensors backed by memory. Observe the data and size of the tensor is now transposed when using this view:\n", - "\n", - "![Permuted/Transposed 2D Tensor](img/dli-transpose.png)\n", - "\n", - "Open the file [exercises/example1_permute.cu](exercises/example1_permute.cu) and edit the contents where you see TODO markers." + "The next section calls `permute` on the returned view:" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example1_permute" + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[4, 5], Strides:[1,4]\n", + "000000: 42 5 9 13 17 \n", + "000001: 2 6 10 14 18 \n", + "000002: 3 7 11 117 19 \n", + "000003: 4 8 12 16 20 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "auto t2p = permute(t2, {1,0});\n", + "print(t2p);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 1 5 9 13 17 \n", - "000001: 2 6 10 14 18 \n", - "000002: 3 7 11 15 19 \n", - "000003: 4 8 12 16 20 \n", - "```" + "`permute` returns a view of the data with the dimensions swapped to match the order of the initializer list argument. In this case there are only two dimensions being permuted on a 2D tensor, so it's equivalent to a matrix transpose. However, `permute` can be used on higher-order tensors with the dimensions swapped in any particular order. Like printing, `permute` can work on any type of operator as input and not just tensors backed by memory. 
Observe the data and size of the tensor is now transposed when using this view:\n", + "\n", + "![Permuted/Transposed 2D Tensor](img/dli-transpose.png)" ] }, { @@ -261,40 +384,47 @@ "metadata": {}, "source": [ "### 4. Slice\n", - "The next line takes a slice of the 2D tensor by selecting a subset of data in both dimensions:\n", - "\n", - "```c++\n", - "auto t2s = slice(t2, {1,1}, {3, 3});\n", - "```\n", - "\n", - "`t2s` is now a view of the same data, but starting at index 1 and ending at index 3 (exclusive) on both dimensions. This is equivalent to Python using `t2[1:3, 1:3]`. Since a new sliced view is returned, the new view will have dimensions `{2, 2}`.\n", - "\n", - "![2D Slice](img/dli-slice.png)\n", - "\n", - " Open the file [exercises/example1_simple_slice.cu](exercises/example1_simple_slice.cu) and edit the contents where you see TODO markers.\n", - "\n", - "\n", - "\n" + "The next line takes a slice of the 2D tensor by selecting a subset of data in both dimensions:\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example1_simple_slice" + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[2, 2], Strides:[4,1]\n", + "000000: 6 7 \n", + "000001: 10 11 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto t2s = matx::slice(t2, {1,1}, {3, 3});\n", + "\n", + "matx::print(t2s);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 6 7 \n", - "000001: 10 11 \n", - "```" + "`t2s` is now a view of the same data, but starting at index 1 and ending at index 3 (exclusive) on both dimensions. This is equivalent to Python using `t2[1:3, 1:3]`. 
Since a new sliced view is returned, the new view will have dimensions `{2, 2}`.\n", + "\n", + "![2D Slice](img/dli-slice.png)" ] }, { @@ -302,40 +432,50 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The next line shows a variant of `slice` that can reduce the dimension of an operator:\n", - "\n", - "```c++\n", - "auto t1 = slice<1>(t2, {0, 1}, {matxEnd, matxDropDim});\n", - "```\n", - "\n", - "Using this form of `slice` requires a template argument with the rank of the new slice. The second parameter to `slice` takes the starting index for each dimension, while the third takes the ending index. To include all values from the beginning on, a special sentinel of `matxEnd` can be used. Similarly, `matxDropDim` is used to indicate this dimension is the one being sliced (i.e. removed). In this case we are slicing the second column of the tensor and all rows, which produces a new 1D tensor containing only the second column of the original tensor. This is equivalent to `t2[:,1]` in Python. \n", - "\n", - "![Column Slice](img/dli-slice_col.png)\n", - "\n", - "Open the file [exercises/example1_adv_slice_col.cu](exercises/example1_adv_slice_col.cu) and edit the contents where you see TODO markers." 
+ "The next line shows a variant of `slice` that can reduce the dimension of an operator:" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example1_adv_slice_col" + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_1_i32: Tensor{int32_t} Rank: 1, Sizes:[5], Strides:[4]\n", + "000000: 2 \n", + "000001: 6 \n", + "000002: 10 \n", + "000003: 14 \n", + "000004: 18 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto t1Col = matx::slice<1>(t2, {0, 1}, {matx::matxEnd, matx::matxDropDim});\n", + "\n", + "matx::print(t1Col);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 2 \n", - "000001: 6 \n", - "000002: 10 \n", - "000003: 14 \n", - "000004: 18 \n", - "```" + "Using this form of `slice` requires a template argument with the rank of the new slice. The second parameter to `slice` takes the starting index for each dimension, while the third takes the ending index. To include all values from the beginning on, a special sentinel of `matxEnd` can be used. Similarly, `matxDropDim` is used to indicate this dimension is the one being sliced (i.e. removed). In this case we are slicing the second column of the tensor and all rows, which produces a new 1D tensor containing only the second column of the original tensor. This is equivalent to `t2[:,1]` in Python. 
\n", + "\n", + "![Column Slice](img/dli-slice_col.png)" ] }, { @@ -343,37 +483,49 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Instead of slicing a single column, we can also slice a single row:\n", - "\n", - "```c++\n", - "auto t1 = slice<1>(t2, {1, 0}, {matxDropDim, matxEnd});\n", - "```\n", - "\n", - "![Row Slice](img/dli-slice_row.png)\n", - "\n", - "Open the file [exercises/example1_adv_slice_row.cu](exercises/example1_adv_slice_row.cu) and edit the contents where you see TODO markers.\n" + "Instead of slicing a single column, we can also slice a single row:\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example1_adv_slice_row" + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_1_i32: Tensor{int32_t} Rank: 1, Sizes:[4], Strides:[1]\n", + "000000: 5 \n", + "000001: 6 \n", + "000002: 7 \n", + "000003: 8 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto t1Row = matx::slice<1>(t2, {1, 0}, {matx::matxDropDim, matx::matxEnd});\n", + "\n", + "matx::print(t1Row);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 5 \n", - "000001: 6 \n", - "000002: 7 \n", - "000003: 8 \n", - "```" + "![Row Slice](img/dli-slice_row.png)\n", + "\n", + "\n" ] }, { @@ -385,93 +537,154 @@ "Note that since we reduced the dimension to a 1D tensor in both cases, printing a 1D tensor (vector) will appear the same in the direction the values are printed.\n", "\n", "### 5. 
Clone\n", - "The last line shows `clone`, which replicates a operator's dimensions into a higher-rank operator:\n", - "```c++\n", - "auto t2c = clone<2>(t1, {5, matxKeepDim});\n", - "```\n", - "\n", - "`clone` is used on a 1D tensor from the output of the previous example, and replicates the data of the `t1` vector into a 2D tensor with 5 rows where all rows match the data in `t1`. Cloning does not replicate the data in memory; instead, the same elements in `t1` are accessed repeatedly when different rows are accessed. This not only saves memory, but also benefits from the caches in the GPU by not hitting different addresses in memory for the same value. \n", - "\n", - "In this case `clone` was being used on a 1D view from a 2D tensor data set, but similar code works on taking any dimension tensor and increasing it to a higher dimension. The increase in dimensions is not restricted to one. For example, a scalar (0D tensor) can be cloned into a 4F tensor where a single value in memory would appear as a 4D tensor.\n", - "\n", - "![Permuted/Transposed 2D Tensor](img/dli-clone.png)\n", - "\n", - "Open the file [exercises/example1_clone.cu](exercises/example1_clone.cu) and edit the first TODO." 
+ "The last line shows `clone`, which replicates an operator's dimensions into a higher-rank operator:" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example1_clone" + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[5, 4], Strides:[0,1]\n", + "000000: 5 6 7 8 \n", + "000001: 5 6 7 8 \n", + "000002: 5 6 7 8 \n", + "000003: 5 6 7 8 \n", + "000004: 5 6 7 8 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "auto t2c_rows = matx::clone<2>(t1Row, {5, matx::matxKeepDim});\n", + "\n", + "matx::print(t2c_rows);\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 1 2 3 4 \n", - "000001: 1 2 3 4 \n", - "000002: 1 2 3 4 \n", - "000003: 1 2 3 4 \n", - "000004: 1 2 3 4 \n", - "```" + "`clone` is used on a 1D tensor from the output of the previous example, and replicates the data of the `t1Row` vector into a 2D tensor with 5 rows where all rows match the data in `t1Row`. Cloning does not replicate the data in memory; instead, the same elements in `t1Row` are accessed repeatedly when different rows are accessed. This not only saves memory, but also benefits from the caches in the GPU by not hitting different addresses in memory for the same value. \n", + "\n", + "In this case `clone` was being used on a 1D view from a 2D tensor data set, but similar code works on taking any dimension tensor and increasing it to a higher dimension. The increase in dimensions is not restricted to one. 
For example, a scalar (0D tensor) can be cloned into a 4D tensor where a single value in memory would appear as a 4D tensor.\n", + "\n", + "![Permuted/Transposed 2D Tensor](img/dli-clone.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "By changing which dimension is cloned, we can also take the same 1D tensor across columns. Edit the last file and clones across columns instead, and print the output of the cloned view.\n", + "By changing which dimension is cloned, we can also take the same 1D tensor across columns.\n", "\n", "![Column Clone](img/dli-clone-col.png)" ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Expected output:\n", - "```sh\n", - "000000: 1 1 1 1 1 \n", - "000001: 2 2 2 2 2 \n", - "000002: 3 3 3 3 3 \n", - "000003: 4 4 4 4 4 \n", - "```" + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[4, 5], Strides:[1,0]\n", + "000000: 5 5 5 5 5 \n", + "000001: 6 6 6 6 6 \n", + "000002: 7 7 7 7 7 \n", + "000003: 8 8 8 8 8 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto t2c_columns = matx::clone<2>(t1Row, {matx::matxKeepDim, 5});\n", + "\n", + "matx::print(t2c_columns);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "As mentioned at the beginning, views do not modify the underlying data; they simply provide the metadata needed to access the elements. To show this, we will open the same [exercise](exercises/example1_clone.cu) again and modify the first value in the original 1D tensor to 10 and watch how multiple elements of the cloned view are modified." + "As mentioned at the beginning, views do not modify the underlying data; they simply provide the metadata needed to access the elements. 
To show this, we will modify `t2(1,0)`, which corresponds to the first value of our 1D slice, and watch how multiple elements of the cloned view are modified." ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example1_clone" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Expected output:\n", - "```sh\n", - "000000: 10 2 3 4 \n", - "000001: 10 2 3 4 \n", - "000002: 10 2 3 4 \n", - "000003: 10 2 3 4 \n", - "000004: 10 2 3 4 \n", - "```" + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[5, 4], Strides:[4,1]\n", + "000000: 42 2 3 4 \n", + "000001: 10 6 7 8 \n", + "000002: 9 10 11 12 \n", + "000003: 13 14 117 16 \n", + "000004: 17 18 19 20 \n", + "tensor_1_i32: Tensor{int32_t} Rank: 1, Sizes:[4], Strides:[1]\n", + "000000: 10 \n", + "000001: 6 \n", + "000002: 7 \n", + "000003: 8 \n", + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[5, 4], Strides:[0,1]\n", + "000000: 10 6 7 8 \n", + "000001: 10 6 7 8 \n", + "000002: 10 6 7 8 \n", + "000003: 10 6 7 8 \n", + "000004: 10 6 7 8 \n", + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[4, 5], Strides:[1,0]\n", + "000000: 10 10 10 10 10 \n", + "000001: 6 6 6 6 6 \n", + "000002: 7 7 7 7 7 \n", + "000003: 8 8 8 8 8 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t2(1,0) = 10;\n", + "matx::print(t2);\n", + "matx::print(t1Row);\n", + "matx::print(t2c_rows);\n", + "matx::print(t2c_columns);" ] }, { @@ -489,21 +702,15 @@ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" }, "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "display_name": "C++17", + "language": "C++", + "name": "cling-cpp17" }, 
"language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" + "codemirror_mode": "c++", + "file_extension": ".c++", + "mimetype": "text/x-c++src", + "name": "c++" } }, "nbformat": 4, diff --git a/docs_input/notebooks/exercises/example1_adv_slice_col.cu b/docs_input/notebooks/exercises/example1_adv_slice_col.cu deleted file mode 100644 index 082f8207e..000000000 --- a/docs_input/notebooks/exercises/example1_adv_slice_col.cu +++ /dev/null @@ -1,55 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto t2 = make_tensor({5, 4}); - - // Initialize the tensor linearly - t2.SetVals({{1, 2, 3, 4}, - {5, 6, 7, 8}, - {9, 10, 11, 12}, - {13, 14, 15, 16}, - {17, 18, 19, 20}}); - - // TODO: reduce tensor t2 to a 1D tensor by pulling the second column and all - // rows - auto t1 = ...; - - print(t1); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example1_adv_slice_row.cu b/docs_input/notebooks/exercises/example1_adv_slice_row.cu deleted file mode 100644 index a99e2540d..000000000 --- a/docs_input/notebooks/exercises/example1_adv_slice_row.cu +++ /dev/null @@ -1,55 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto t2 = make_tensor({5, 4}); - - // Initialize the tensor linearly - t2.SetVals({{1, 2, 3, 4}, - {5, 6, 7, 8}, - {9, 10, 11, 12}, - {13, 14, 15, 16}, - {17, 18, 19, 20}}); - - // TODO: reduce tensor t2 to a 1D tensor by pulling all columns and the - // second row - auto t1 = ...; - - print(t1); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example1_assignment1.cu b/docs_input/notebooks/exercises/example1_assignment1.cu deleted file mode 100644 index 14d923a20..000000000 --- a/docs_input/notebooks/exercises/example1_assignment1.cu +++ /dev/null @@ -1,127 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -/** - * MatX training assignment 1. This training goes through basic tensor - * operations that were learned in the 01_introduction notebook. Uncomment each - * verification block as you go to ensure your solutions are correct. 
- */ - -int main() { - - /**************************************************************************************************** - * Create a rank-2 tensor data object of ints with 5 rows and 4 columns called - *"t2" - *https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#tensor-views - ****************************************************************************************************/ - - /*** End editing ***/ - - /**************************************************************************************************** - * Initialize the t2 view to a 4x5 matrix of increasing values starting at 1 - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#tensor-views - ****************************************************************************************************/ - // t2 = ; - /*** End editing ***/ - - /**************************************************************************************************** - * Get a slice of the second and third rows with all columns - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#slicing-and-dicing - *****************************************************************************************************/ - auto t2s = t2; - /*** End editing ***/ - - // Verify slice is correct - // for (int row = 1; row <= 2; row++) { - // for (int col = 0; col < t2.Size(1); col++) { - // if (t2(row, col) != t2s(row - 1, col)) { - // printf("Mismatch in sliced view! 
actual = %d, expected = %d\n", - // t2s(row - 1, col), t2(row, col)); exit(-1); - // } - // } - // } - - // print(t2s); - // printf("Slice verification passed!\n"); - - /**************************************************************************************************** - * Take the slice and clone it into a 3D tensor with new outer dimensions as - *follows: First dim: keep existing row dimension from t2s Second dim: 2 Third - *dim: keep existing col dimension from t2s - https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#increasing-dimensionality - *****************************************************************************************************/ - auto t3c = t2s; - /*** End editing ***/ - - // Verify clone - // for (int first = 0; first < t3c.Size(0); first++) { - // for (int sec = 0; sec < t3c.Size(1); sec++) { - // for (int third = 0; third < t3c.Size(2); third++) { - // if (t3c(first, sec, third) != t2s(first, third)) { - // printf("Mismatch in cloned view! actual = %d, expected = %d\n", - // t3c(first, sec, third), t2s(first, third)); exit(-1); - // } - // } - // } - // } - - // print(t3c); - // printf("Clone verification passed!\n"); - - /**************************************************************************************************** - * Permute the two outer dimensions of the cloned tensor - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#permuting - *****************************************************************************************************/ - auto t3p = t3c; - /*** End editing ***/ - - // Verify clone - // for (int first = 0; first < t3p.Size(0); first++) { - // for (int sec = 0; sec < t3p.Size(1); sec++) { - // for (int third = 0; third < t3p.Size(2); third++) { - // if (t3c(first, sec, third) != t2s(first, third)) { - // printf("Mismatch in permuted view! 
actual = %d, expected = %d\n", - // t3c(first, sec, third), t2s(sec, third)); exit(-1); - // } - // } - // } - // } - - // print(t3p); - // printf("Permute verification passed!\n"); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example1_clone.cu b/docs_input/notebooks/exercises/example1_clone.cu deleted file mode 100644 index 99d7cc059..000000000 --- a/docs_input/notebooks/exercises/example1_clone.cu +++ /dev/null @@ -1,56 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto t1 = make_tensor({4}); - - // Initialize the tensor linearly - t1.SetVals({1, 2, 3, 4}); - - // TODO: Clone tensor t1 into a 2D tensor by making a new outer dimension 5. - auto t2c = ...; - - print(t2c); - - // TODO: After compiling and running the code above, modify the first element - // in t1 to be 10 on the next line. Uncomment the print line as well. t1(0) = - // ... - - // print(t2c); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example1_init.cu b/docs_input/notebooks/exercises/example1_init.cu deleted file mode 100644 index 150bdda92..000000000 --- a/docs_input/notebooks/exercises/example1_init.cu +++ /dev/null @@ -1,54 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - // TODO: Create a 2D tensor of ints called t2data with dimensions 5, 4, and - // a view of that data using the default view. 
- - auto t2 = ; - - // Initialize the tensor linearly - t2.SetVals({ {1, 2, 3, 4}, - {5, 6, 7, 8}, - {9, 10, 11, 12}, - {13, 14, 15, 16}, - {17, 18, 19, 20}}); - - print(t2); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example1_permute.cu b/docs_input/notebooks/exercises/example1_permute.cu deleted file mode 100644 index 00b7fb9a7..000000000 --- a/docs_input/notebooks/exercises/example1_permute.cu +++ /dev/null @@ -1,54 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto t2 = make_tensor({5, 4}); - - // Initialize the tensor linearly - t2.SetVals({{1, 2, 3, 4}, - {5, 6, 7, 8}, - {9, 10, 11, 12}, - {13, 14, 15, 16}, - {17, 18, 19, 20}}); - - // TODO: Permute the view t2 such that the two dimensions are swapped - auto t2p = ...; - - print(t2p); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example1_simple_slice.cu b/docs_input/notebooks/exercises/example1_simple_slice.cu deleted file mode 100644 index 8c8dce351..000000000 --- a/docs_input/notebooks/exercises/example1_simple_slice.cu +++ /dev/null @@ -1,55 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto t2 = make_tensor({5, 4}); - - // Initialize the tensor linearly - t2.SetVals({{1, 2, 3, 4}, - {5, 6, 7, 8}, - {9, 10, 11, 12}, - {13, 14, 15, 16}, - {17, 18, 19, 20}}); - - // TODO: Create a slide of the view t2 starting at the second element and - // ending at the third element (inclusive) in both dimensions - auto t2s = ...; - - print(t2s); - - return 0; -} From f7b6717c9a728cf37c371e79b48d80090baae211 Mon Sep 17 00:00:00 2001 From: Tyler Allen Date: Fri, 24 Jan 2025 06:59:06 +0000 Subject: [PATCH 02/91] move notebook 2 to cell execution. 
run is currently broken --- docs_input/notebooks/02_operators.ipynb | 772 +++++++++++------- .../exercises/example2_assignment1.cu | 158 ---- .../exercises/example2_conditional.cu | 51 -- .../notebooks/exercises/example2_eye.cu | 45 - .../notebooks/exercises/example2_hamming.cu | 47 -- .../notebooks/exercises/example2_init.cu | 53 -- .../exercises/example2_mixed_rank.cu | 55 -- .../exercises/example2_multiple_ops.cu | 54 -- .../notebooks/exercises/example2_rand.cu | 47 -- .../notebooks/exercises/example2_scalar.cu | 56 -- .../exercises/example2_tensor_add.cu | 57 -- .../exercises/example2_tensor_div.cu | 45 - .../notebooks/exercises/example2_viz.cu | 47 -- 13 files changed, 456 insertions(+), 1031 deletions(-) delete mode 100644 docs_input/notebooks/exercises/example2_assignment1.cu delete mode 100644 docs_input/notebooks/exercises/example2_conditional.cu delete mode 100644 docs_input/notebooks/exercises/example2_eye.cu delete mode 100644 docs_input/notebooks/exercises/example2_hamming.cu delete mode 100644 docs_input/notebooks/exercises/example2_init.cu delete mode 100644 docs_input/notebooks/exercises/example2_mixed_rank.cu delete mode 100644 docs_input/notebooks/exercises/example2_multiple_ops.cu delete mode 100644 docs_input/notebooks/exercises/example2_rand.cu delete mode 100644 docs_input/notebooks/exercises/example2_scalar.cu delete mode 100644 docs_input/notebooks/exercises/example2_tensor_add.cu delete mode 100644 docs_input/notebooks/exercises/example2_tensor_div.cu delete mode 100644 docs_input/notebooks/exercises/example2_viz.cu diff --git a/docs_input/notebooks/02_operators.ipynb b/docs_input/notebooks/02_operators.ipynb index 1c085dfd4..222a782e7 100644 --- a/docs_input/notebooks/02_operators.ipynb +++ b/docs_input/notebooks/02_operators.ipynb @@ -35,56 +35,125 @@ "The last topic in this exercise will cover MatX generators. MatX generators are an operator that can dynamically generate data from a formula without storing the interim values. 
For example, the values an identity matrix or a Hamming window can both be generated on-the-fly only by knowing the index of the value. Generators typically only take a Shape as input since their output is generated without input data." ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "//todo this should be moved to a hidden init block that runs automatically when the notebook starts\n", + "#pragma cling add_library_path(\"/usr/local/cuda/lib64\")\n", + "#pragma cling add_library_path(\"/opt/xeus/cling/lib\")\n", + "//#pragma cling add_library_path(\"/usr/Lib/gcc/x86_64-Linux-gnu/11/\")\n", + "#pragma cling add_library_path(\"/usr/lib/x86_64-linux-gnu/openblas64-openmp/\")\n", + "#pragma cling add_include_path(\"/usr/local/cuda/include\")\n", + "#pragma cling add_include_path(\"/usr/include/x86_64-linux-gnu/openblas64-openmp\")\n", + "#pragma cling add_include_path(\"/opt/xeus/cling/tools/Jupyter/kernel/MatX/include\")\n", + "#pragma cling add_include_path(\"/opt/xeus/cling/tools/Jupyter/kernel/MatX/build/_deps/cccl-src/libcudacxx/include\")\n", + "//#pragma cling load(\"libgomp\")\n", + "#pragma cling load(\"libopenblas64\")\n", + "#pragma cling load(\"libcuda\")\n", + "#pragma cling load(\"libcudart\")\n", + "#pragma cling load(\"libcurand\")\n", + "#pragma cling load(\"libcublas\")\n", + "#pragma cling load(\"libcublasLt\")\n", + "\n", + "#include \n", + "#include \n", + "\n", + "#define MATX_EN_OPENBLAS\n", + "#define MATX_EN_OPENBLAS_LAPACK\n", + "#define MATX_OPENBLAS_64BITINT\n", + "\n", + "#include \"matx.h\"" + ] + }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Initialization\n", - "As in the previous example, we need to declare tensors and initialize the data:\n", - "\n", - "```c++\n", - " auto A = make_tensor({2, 3});\n", - " auto B = 
make_tensor({2, 3});\n", - " auto C = make_tensor({2, 3});\n", - " auto V = make_tensor({3});\n", - "``` \n", - "\n", - "We create a single tensor Shape type that's used in multiple tensor types so that we don't have to repeat the size. After this code is executed, four data objects are created, and managed memory is allocated to account for the shape and type of each tensor. Next, the input tensor Views (`A` and `V`) are initiailized with an increasing data pattern:\n", - "\n", - "```c++\n", - " A.SetVals({ {1, 2, 3},\n", - " {4, 5, 6}});\n", - " \n", - " V.SetVals({7, 8, 9});\n", - "``` \n", - "\n", - "Open the file [exercises/example2_init.cu](exercises/example2_init.cu) and edit the contents where you see TODO markers." + "As in the previous example, we need to declare tensors and initialize the data:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example2_init" + "auto A = matx::make_tensor({2, 3});\n", + "auto B = matx::make_tensor({2, 3});\n", + "auto C = matx::make_tensor({2, 3});\n", + "auto V = matx::make_tensor({3});\n", + "auto E = matx::make_tensor({8,8});\n", + "auto H = matx::make_tensor({10});" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 1.0000 2.0000 3.0000 \n", - "000001: 4.0000 5.0000 6.0000 \n", + "After this code is executed, six data objects are created, and managed memory is allocated to account for the shape and type of each tensor. 
Next, the input tensor Views (`A` and `V`) are initialized with an increasing data pattern:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 1.0000e+00 2.0000e+00 3.0000e+00 \n", + "000001: 4.0000e+00 5.0000e+00 6.0000e+00 \n", + "tensor_1_f32: Tensor{float} Rank: 1, Sizes:[3], Strides:[1]\n", + "000000: 7.0000e+00 \n", + "000001: 8.0000e+00 \n", + "000002: 9.0000e+00 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "A.SetVals({ {1, 2, 3},\n", + " {4, 5, 6}\n", + " });\n", + " \n", + "V.SetVals({7, 8, 9});\n", "\n", - "000000: 7.0000 \n", - "000001: 8.0000 \n", - "000002: 9.0000 \n", - "```" + "matx::print(A);\n", + "matx::print(V);" ] }, { @@ -93,39 +162,50 @@ "metadata": {}, "source": [ "## Element-wise Scalar Addition\n", - "For the first operator example, we add a scalar onto a tensor and assign it to another tensor. This can be thought of as tensor addition with the second tensor equal to a tensor of equal size with all ones. To make the separation of operators from executors explicit, we first create the operator `op` by using MatX's lazy assignment operator `=`. The statement on the right hand side can be read as \"Add the number 5 to operator A, and assign the result to tensor B\". Instantiating variable `op` generates a CUDA kernel that can then be executed with the `run()` method:\n", - "\n", - "```c++\n", - "auto op = (B = A + 5);\n", - "op.run();\n", - "```\n", - "The `run()` function takes an optional CUDA stream to launch the work in. 
Since no argument was specified here, the default stream is used.\n", - "\n", - "Open the file [exercises/example2_scalar.cu](exercises/example2_scalar.cu) and edit the contents where you see TODO markers." + "For the first operator example, we add a scalar onto a tensor and assign it to another tensor. This can be thought of as tensor addition with the second tensor equal to a tensor of equal size with every element equal to that scalar. To make the separation of operators from executors explicit, we first create the operator `op` by using MatX's lazy assignment operator `=`. The statement on the right hand side can be read as \"Add the number 5 to operator A, and assign the result to tensor B\". Instantiating variable `op` generates a CUDA kernel that can then be executed with the `run()` method:\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "Operator{float} Rank: 2, Sizes:[2, 3]\n", + "000000: 6.0000e+00 7.0000e+00 8.0000e+00 \n", + "000001: 9.0000e+00 1.0000e+01 1.1000e+01 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example2_scalar" + "auto op = (B = A + 5);\n", + "op.run();\n", + "matx::print(B);\n", + "\n", + "matx::print((B = A + 5)); ///\\todo remove after run is fixed" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 1.0000 2.0000 3.0000 \n", - "000001: 4.0000 5.0000 6.0000 \n", - "\n", - "000000: 2.0000 3.0000 4.0000 \n", - "000001: 5.0000 6.0000 7.0000 \n", - "```" + "The `run()` function takes an 
optional executor to determine what accelerator is used to perform the operation. When no argument is specified, the default executor is the CUDA default stream." ] }, { @@ -134,46 +214,49 @@ "metadata": {}, "source": [ "## Element-wise Tensor Addition\n", - "The next section adds two tensors together element-wise. Just like with a scalar, the `+` operator works on two tensors. Instead of creating a separate operator variable, this example shows how to create and execute an operator in a single line:\n", - "\n", - "```c++\n", - "A.SetVals({ {1, 2, 3},\n", - " {4, 5, 6}});\n", - "\n", - "B.SetVals({ {7, 8, 9},\n", - " {10, 11, 12}});\n", - "\n", - "(C = A + B).run();\n", - "```\n", - "\n", - "Open the file [exercises/example2_tensor_add.cu](exercises/example2_tensor_add.cu) and edit the contents where you see TODO markers." + "The next section adds two tensors together element-wise. Just like with a scalar, the `+` operator works on two tensors. Instead of creating a separate operator variable, this example shows how to create and execute an operator in a single line:\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example2_tensor_add" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "Operator{float} Rank: 2, Sizes:[2, 3]\n", + "000000: 8.0000e+00 1.0000e+01 1.2000e+01 \n", + "000001: 1.4000e+01 1.6000e+01 1.8000e+01 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Expected output:\n", - "```sh\n", - "000000: 1.0000 2.0000 
3.0000 \n", - "000001: 4.0000 5.0000 6.0000 \n", + "A.SetVals({ {1, 2, 3},\n", + " {4, 5, 6}});\n", + "\n", + "B.SetVals({ {7, 8, 9},\n", + " {10, 11, 12}});\n", + "\n", + "(C = A + B).run();\n", "\n", - "000000: 7.0000 8.0000 9.0000 \n", - "000001: 10.0000 11.0000 12.0000 \n", + "matx::print(C);\n", "\n", - "000000: 8.0000 10.0000 12.0000 \n", - "000001: 14.0000 16.0000 18.0000 \n", - "```" + "matx::print(C = A + B); ///\\todo remove after run is fixed" ] }, { @@ -182,39 +265,53 @@ "metadata": {}, "source": [ "## Element-wise Tensor Division\n", - "The division operator `/` can also be used on two tensors. In this example we reuse the `C` tensor from the last example and divide each element by 2:\n", - "\n", - "```c++\n", - " C.SetVals({ {7, 8, 9},\n", - " {10, 11, 12}});\n", - "\n", - " (C = C / 2).run(); \n", - "```\n", - "\n", - "With division, the usual C semantics apply - if the tensor type is an integral type, the results are rounded down. If the type is floating point, floating point division is performed. In this case we are using `float` types, so floating point division will occur.\n", - "\n", - "Open the file [exercises/example2_tensor_div.cu](exercises/example2_tensor_div.cu) and edit the contents where you see TODO markers." + "The division operator `/` can also be used on two tensors, or any scalar type that is compatible with the tensor's data. 
In this example we reuse the `C` tensor from the last example and divide each element by 2:\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example2_tensor_div" + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 7.0000e+00 8.0000e+00 9.0000e+00 \n", + "000001: 1.0000e+01 1.1000e+01 1.2000e+01 \n", + "Operator{float} Rank: 2, Sizes:[2, 3]\n", + "000000: 3.5000e+00 4.0000e+00 4.5000e+00 \n", + "000001: 5.0000e+00 5.5000e+00 6.0000e+00 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "C.SetVals({ {7, 8, 9},\n", + " {10, 11, 12}});\n", + "\n", + "(C = C / 2).run(); \n", + "\n", + "matx::print(C);\n", + "\n", + "matx::print(C = C / 2); ///\\todo remove after run is fixed" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 3.5000 4.0000 4.5000 \n", - "000001: 5.0000 5.5000 6.0000 \n", - "```" + "With division, the usual C semantics apply - if the tensor type is an integral type, the results are rounded down. If the type is floating point, floating point division is performed. In this case we are using `float` types, so floating point division will occur." ] }, { @@ -223,95 +320,57 @@ "metadata": {}, "source": [ "## Broadcasted Tensor Addition\n", - "Binary operators can be used on tensors of different ranks. In this section, we add a 1D tensor `M` onto a 2D tensor `C`. Unlike previous examples, the result is stored in the same tensor `C`, which is safe since the operation is element-wise and each thread runs independent of others. 
When operating on tensors of different ranks, the outer dimensions of both tensors must match. The tensor with the lower rank will be broadcasted on the higher dimensions when the operation is executing.\n", - "\n", - "```c++\n", - " A.SetVals({ {1, 2, 3},\n", - " {4, 5, 6}});\n", - " \n", - " V.SetVals({7, 8, 9});\n", - "\n", - " (C = C + M).run();\n", - "```\n", - "\n", - "The result of the operation will be `M` repeatedly added to all rows of `C`.\n", - "\n", - "Open the file [exercises/example2_mixed_rank.cu](exercises/example2_mixed_rank.cu) and edit the contents where you see TODO markers." + "Binary operators can be used on tensors of different ranks. In this section, we add a 1D tensor `V` onto a 2D tensor `C`. Unlike previous examples, the result is stored in the same tensor `C`, which is safe since the operation is element-wise and each thread runs independent of others. When operating on tensors of different ranks, the outer dimensions of both tensors must match. The tensor with the lower rank will be broadcasted on the higher dimensions when the operation is executing." 
] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example2_mixed_rank" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", + "execution_count": 7, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 1.0000e+00 2.0000e+00 3.0000e+00 \n", + "000001: 4.0000e+00 5.0000e+00 6.0000e+00 \n", + "Operator{float} Rank: 2, Sizes:[2, 3]\n", + "000000: 8.0000e+00 1.0000e+01 1.2000e+01 \n", + "000001: 1.1000e+01 1.3000e+01 1.5000e+01 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Expected output:\n", - "```sh\n", - "000000: 8.0000 10.0000 12.0000 \n", - "000001: 11.0000 13.0000 15.0000 \n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Multiple Operators\n", - "Multiple operators can be combined in a single expression. The syntax is similar to using a high-level language like MATLAB where the order of operations is followed, and the final result is stored into the tensor on the left hand side of the lazy assignment operator `=`. Unlike most C++ libraries that use operator overloading for runtime expression parsing, MatX uses templates to parse the entire expression at compile-time. This removes all unnecessary interim loads and stores that would normally occur with the runtime approach. 
In this example, we combined 4 operators (three `+` and one `/`) in a single expression:\n", "\n", - "```c++\n", - " A.SetVals({ {1, 2, 3},\n", - " {4, 5, 6}});\n", - " \n", - " V.SetVals({7, 8, 9});\n", + "C.SetVals({ {1, 2, 3},\n", + " {4, 5, 6}});\n", "\n", - " (C = (A + A + 1) / 2 + V).run();\n", - "```\n", + "V.SetVals({7, 8, 9});\n", "\n", - "Open the file [exercises/example2_multiple_ops.cu](exercises/example2_multiple_ops.cu) and edit the contents where you see TODO markers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example2_multiple_ops" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Expected output:\n", - "```sh\n", - "000000: 8.5000 10.5000 12.5000 \n", - "000001: 11.5000 13.5000 15.5000\n", - "```" + "(C = C + V).run();\n", + "\n", + "matx::print(C);\n", + "\n", + "matx::print(C = C + V); ///\\todo remove after run is fixed" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Conditionals\n", - "Conditional statements operators are also available to take an action based on the value of an operator or tensor view. These actions can be anything from changing the computation to choosing where to store the data. In this example, we set the output of A based on whether the value in C is greater than 3. Note that `IFELSE` is an operator, and has the same `run()` method to execute the work as a standard expression.\n", "\n", - "```c++\n", - " IFELSE(C > 3, A = 1, A = 0).run();\n", - "```" + "The result of the operation will be `V` repeatedly added to all rows of `C`." ] }, { @@ -319,16 +378,49 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Open the file [exercises/example2_conditional.cu](exercises/example2_conditional.cu) and edit the contents where you see TODO markers." + "## Multiple Operators\n", + "Multiple operators can be combined in a single expression. 
The syntax is similar to using a high-level language like MATLAB where the order of operations is followed, and the final result is stored into the tensor on the left hand side of the lazy assignment operator `=`. Unlike most C++ libraries that use operator overloading for runtime expression parsing, MatX uses templates to parse the entire expression at compile-time. This removes all unnecessary interim loads and stores that would normally occur with the runtime approach. In this example, we combined 4 operators (three `+` and one `/`) in a single expression:" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 8.0000e+00 1.0000e+01 1.2000e+01 \n", + "000001: 1.1000e+01 1.3000e+01 1.5000e+01 \n", + "Operator{float} Rank: 2, Sizes:[2, 3]\n", + "000000: 8.5000e+00 1.0500e+01 1.2500e+01 \n", + "000001: 1.1500e+01 1.3500e+01 1.5500e+01 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example2_conditional" + "A.SetVals({ {1, 2, 3},\n", + " {4, 5, 6}});\n", + "\n", + "V.SetVals({7, 8, 9});\n", + "\n", + "(C = (A + A + 1) / 2 + V).run();\n", + "\n", + "matx::print(C);\n", + "\n", + "matx::print((C = (A + A + 1) / 2 + V)); ///\\todo remove after run is fixed" ] }, { @@ -336,58 +428,54 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 0 0 0\n", - "000001: 1 1 1\n", - "```" + "## Conditionals\n", + "Conditional operators are also available to take an action based on the value of an operator or tensor view. These actions can be anything from changing the computation to choosing where to store the data.
In this example, we set the output of A based on whether the value in C is greater than 3. Note that `IFELSE` is an operator, and has the same `run()` method to execute the work as a standard expression.\n" ] }, { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 1.0000e+00 2.0000e+00 3.0000e+00 \n", + "000001: 4.0000e+00 5.0000e+00 6.0000e+00 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "### Random Operator\n", - "The ``random`` operator provides a way to generate random numbers using various distrubtions. Random values can be useful for many applications, including generating noise in signal processing or initializing data for testing. In this example we take an existing tensor view (`A`) and populate it with random values from a normal distribution. Before setting the random values, we set all elements of `A` to zero to show the values change after randomizing.\n", + "A.SetVals({ {1, 2, 3},\n", + " {4, 5, 6}});\n", "\n", - "```c++\n", - "(A = 0).run()\n", - "(A = random({4, 4}, NORMAL)).run();\n", - "```\n", + "C.SetVals({ {1, 2, 3},\n", + " {4, 5, 6}});\n", "\n", - "Open the file [exercises/example2_rand.cu](exercises/example2_rand.cu) and edit the contents where you see TODO markers." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example2_rand" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Expected output:\n", - "```sh\n", - "000000: -0.9247 -0.4253 -2.6438 0.1452\n", - "000001: -0.1209 -0.5797 -0.6229 -0.3284\n", - "000002: -1.0745 -0.3631 -1.6711 2.2655\n", - "000003: 0.3117 -0.1842 1.2866 1.1820\n", - "```" + "matx::IFELSE(C > 3, A = 1, A = 0).run();\n", + "matx::print(A);\n", + "\n", + "\n", + "///\\todo currently broken, doesn't work with print for some reason" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "In this example we store the current random values from `randTensor` into `A`. Instead of storing the random values in `A`, `randTensor` can be used directly in operator equations, and each time it's used a different set of random values is generated." + "\n" ] }, { @@ -395,23 +483,47 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Generators\n", - "Next, we introduce the concept of a generator by creating the identity matrix, scaling the values by `5`, and storing it in a tensor. MatX contains an `eye` operator for generating an identity matrix. Each time an element in the generator is accessed, `eye` simply returns a `1` for values in the diagonal, and `0` otherwise. Said differently, if the index for each rank is equal, the value is set to `1`. Since the goal is to have a diagonal matrix of fives, we multiply the generator by the scalar `5`. Since `eye` is a generator, the multiply and the identity matrix can be evaluated without storing any values. 
Since we're interested in seeing the results, we execute the operator and store it in the tensor `B`:\n", - "\n", - "```c++\n", - "(B = eye({8, 8}) * 5).run();\n", - "```\n", - "\n", - "Open the file [exercises/example2_eye.cu](exercises/example2_eye.cu) and edit the contents where you see TODO markers." + "### Random Operator\n", + "The ``random`` operator provides a way to generate random numbers using various distributions. Random values can be useful for many applications, including generating noise in signal processing or initializing data for testing. In this example we take an existing tensor view (`A`) and populate it with random values from a normal distribution. Before setting the random values, we set all elements of `A` to zero to show the values change after randomizing.\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example2_eye" + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 1.0000e+00 2.0000e+00 3.0000e+00 \n", + "000001: 4.0000e+00 5.0000e+00 6.0000e+00 \n", + "Operator{float} Rank: 2, Sizes:[2, 3]\n", + "000000: 4.2150e-41 4.2150e-41 4.2150e-41 \n", + "000001: 4.2150e-41 4.2150e-41 4.2150e-41 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(A = 0).run();\n", + "\n", + "(A = matx::random({2, 3}, matx::NORMAL)).run();\n", + "\n", + "matx::print(A);\n", + "\n", + "//broken output?\n", + "matx::print(matx::random({2, 3}, matx::NORMAL)); ///\\todo remove after run is fixed broken anyways with no memory backing" ] }, { @@ -419,17 +531,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 5.0000 0.0000 0.0000 0.0000
0.0000 0.0000 0.0000 0.0000\n", - "000001: 0.0000 5.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000\n", - "000002: 0.0000 0.0000 5.0000 0.0000 0.0000 0.0000 0.0000 0.0000\n", - "000003: 0.0000 0.0000 0.0000 5.0000 0.0000 0.0000 0.0000 0.0000\n", - "000004: 0.0000 0.0000 0.0000 0.0000 5.0000 0.0000 0.0000 0.0000\n", - "000005: 0.0000 0.0000 0.0000 0.0000 0.0000 5.0000 0.0000 0.0000\n", - "000006: 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 5.0000 0.0000\n", - "000007: 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 5.0000\n", - "```" + "In this example we store the current random values from `randTensor` into `A`. Instead of storing the random values in `A`, `randTensor` can be used directly in operator equations, and each time it's used a different set of random values is generated." ] }, { @@ -437,41 +539,57 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "While `eye` is a fairly simple generator for creating ones on the diagonal, more complex generators exist for performing operations like windowing, or creating a linearly-spaced range of values. Below we use the `hamming_x` function to generate a Hamming window using the formula: $$ 0.5 * (1 - cos(\\frac{2{\\pi}n}{N})) $$ where `n` is the sample number and `N` is the total number of samples. Since an array of sizes is passed into the generator, these two variables are computed at runtime and the size of the shape is used as the size of the Hamming window. Like the name implies, the `_x` on `hanning` generates the window across the `x` axis, but there are versions for all four possible axes. Other window functions use the same nomenclature:\n", - "\n", - "```c++\n", - "(B = hamming_x(B.Shape())).run();\n", - "```\n", - "Open the file [exercises/example2_hamming.cu](exercises/example2_hamming.cu) and edit the contents where you see TODO markers." + "# Generators\n", + "Next, we introduce the concept of a generator by creating the identity matrix, scaling the values by `5`, and storing it in a tensor. 
MatX contains an `eye` operator for generating an identity matrix. Each time an element in the generator is accessed, `eye` simply returns a `1` for values in the diagonal, and `0` otherwise. Said differently, if the index for each rank is equal, the value is set to `1`. Since the goal is to have a diagonal matrix of fives, we multiply the generator by the scalar `5`. Since `eye` is a generator, the multiply and the identity matrix can be evaluated without storing any values. Since we're interested in seeing the results, we execute the operator and store it in the tensor `B`:\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example2_hamming" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Expected output:\n", - "```sh\n", - "000000: 0.0800\n", - "000001: 0.1876\n", - "000002: 0.4601\n", - "000003: 0.7700\n", - "000004: 0.9723\n", - "000005: 0.9723\n", - "000006: 0.7700\n", - "000007: 0.4601\n", - "000008: 0.1876\n", - "000009: 0.0800\n", - "```" + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[8, 8], Strides:[8,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000004: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000005: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000006: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 
0.0000e+00 0.0000e+00 \n", + "000007: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "Operator{int32_t} Rank: 2, Sizes:[8, 8]\n", + "000000: 5 0 0 0 0 0 0 0 \n", + "000001: 0 5 0 0 0 0 0 0 \n", + "000002: 0 0 5 0 0 0 0 0 \n", + "000003: 0 0 0 5 0 0 0 0 \n", + "000004: 0 0 0 0 5 0 0 0 \n", + "000005: 0 0 0 0 0 5 0 0 \n", + "000006: 0 0 0 0 0 0 5 0 \n", + "000007: 0 0 0 0 0 0 0 5 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "(E = matx::eye({8, 8}) * 5).run();\n", + "\n", + "matx::print(E); \n", + "\n", + "matx::print(matx::eye({8, 8}) * 5); ///\\todo remove after run is fixed" ] }, { @@ -479,31 +597,62 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Visualizing Data\n", - "MatX uses PyBind and Plotly as a visualization backend of tensor views. Basic plots are supported, such as line, bar, scatter, and contour. Using the Hamming window data from the previous example, we can visualize what the window looks like:\n", - "\n", - "```c++\n", - "viz::line(B, \"Hamming Window\", \"Sample\", \"Amplitude\", \"hamming.html\");\n", - "```\n", - "\n", - "Visualizations in MatX are under the `matx::viz` nested namespace. The string parameters above give the plot a title, X axis name, and Y axis name, respectively. The last parameter is an optional filename to output the plot to. If the code was running natively in this notebook the plot would appear here, but since this is a compiled program, we output to a separate html file that you can open in your file tree. Note that since this is a standard Plotly plot, all the functionality such as zooming and panning are present." + "While `eye` is a fairly simple generator for creating ones on the diagonal, more complex generators exist for performing operations like windowing, or creating a linearly-spaced range of values. 
Below we use the `hamming` function to generate a Hamming window using the formula: $$ 0.54 - 0.46 \\cos\\left(\\frac{2{\\pi}n}{N - 1}\\right) $$ where `n` is the sample number and `N` is the total number of samples. Since an array of sizes is passed into the generator, these two variables are computed at runtime and the size of the shape is used as the size of the Hamming window. The template argument (`0` in `hamming<0>`) selects the dimension along which the window is generated. Other window functions use the same nomenclature:\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example2_viz\n", - "\n", - "# Display plot\n", - "from IPython.display import IFrame\n", - "IFrame(src='./hamming.html', width=700, height=600)" + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_1_f32: Tensor{float} Rank: 1, Sizes:[10], Strides:[1]\n", + "000000: 0.0000e+00 \n", + "000001: 0.0000e+00 \n", + "000002: 0.0000e+00 \n", + "000003: 0.0000e+00 \n", + "000004: 0.0000e+00 \n", + "000005: 0.0000e+00 \n", + "000006: 0.0000e+00 \n", + "000007: 0.0000e+00 \n", + "000008: 0.0000e+00 \n", + "000009: 0.0000e+00 \n", + "Operator{float} Rank: 1, Sizes:[10]\n", + "000000: 8.0000e-02 \n", + "000001: 1.8762e-01 \n", + "000002: 4.6012e-01 \n", + "000003: 7.7000e-01 \n", + "000004: 9.7226e-01 \n", + "000005: 9.7226e-01 \n", + "000006: 7.7000e-01 \n", + "000007: 4.6012e-01 \n", + "000008: 1.8762e-01 \n", + "000009: 8.0000e-02 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(H = matx::hamming<0>(H.Shape())).run();\n", + "\n", + "matx::print(H);\n", + "\n", + "matx::print(matx::hamming<0>(H.Shape())); ///\\todo remove after run is fixed" ] }, { - "attachments": {},
"cell_type": "markdown", "metadata": {}, "source": [ @@ -514,25 +663,16 @@ } ], "metadata": { - "interpreter": { - "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" - }, "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "display_name": "C++17", + "language": "C++", + "name": "cling-cpp17" }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "codemirror_mode": "c++", + "file_extension": ".c++", + "mimetype": "text/x-c++src", + "name": "c++" } }, "nbformat": 4, diff --git a/docs_input/notebooks/exercises/example2_assignment1.cu b/docs_input/notebooks/exercises/example2_assignment1.cu deleted file mode 100644 index 69d2f3e65..000000000 --- a/docs_input/notebooks/exercises/example2_assignment1.cu +++ /dev/null @@ -1,158 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -/** - * MatX training assignment 2. This training goes through tensor operations that - * were learned in the 02_operators notebook. Uncomment each verification block - * as you go to ensure your solutions are correct. - */ - -int main() { - auto A = make_tensor({2, 3}); - auto B = make_tensor({2, 3}); - auto V = make_tensor({3}); - - cudaExecutor exec{}; - - /**************************************************************************************************** - * Initialize tensor A with increasing values from 0.5 to 3.0 in steps of 0.4, - *and tensor V from -1 to -3 in steps of -1. - ****************************************************************************************************/ - - /*** End editing ***/ - - // Verify init is correct - float step = 0.5; - for (int row = 0; row < A.Size(0); row++) { - for (int col = 0; col < A.Size(1); col++) { - if (A(row, col) != step) { - printf("Mismatch in A init view! 
actual = %f, expected = %f\n", - A(row, col), step); - exit(-1); - } - step += 0.5; - } - } - - for (int col = 0; col < V.Size(0); col++) { - if (V(col) != (-1 + col * -1)) { - printf("Mismatch in A init view! actual = %f, expected = %f\n", V(col), - (float)(-1 + col * -1)); - exit(-1); - } - } - - print(A); - print(V); - printf("Init verification passed!\n"); - - /**************************************************************************************************** - * Add 5.0 to all elements of A and store the results back in A - ****************************************************************************************************/ - - /*** End editing ***/ - - exec.sync(); - - step = 0.5; - for (int row = 0; row < A.Size(0); row++) { - for (int col = 0; col < A.Size(1); col++) { - if (A(row, col) != (5.0 + step)) { - printf("Mismatch in A sum view! actual = %f, expected = %f\n", - A(row, col), 5.0 + step); - exit(-1); - } - step += 0.5; - } - } - - print(A); - printf("Sum verification passed!\n"); - - /**************************************************************************************************** - * Clone V to match the dimensions of A, and subtract V from A. The results - * should be stored in A - * - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#increasing-dimensionality - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/api/tensorview.html#_CPPv4I0_iEN4matx12tensor_tE - * - ****************************************************************************************************/ - /// auto tvs = ; - /*** End editing. ***/ - - // exec.sync(); - - // step = 0.5; - // for (int row = 0; row < A.Size(0); row++) { - // for (int col = 0; col < A.Size(1); col++) { - // if (A(row, col) != (5.0 + step - tvs(row, col))) { - // printf("Mismatch in A sub view! 
actual = %f, expected = %f\n", A(row, - // col), 5.0 + step - tvs(row, col)); exit(-1); - // } - // step += 0.5; - // } - // } - - // print(A); - // print(tvs); - // printf("Clone verification passed!\n"); - - /**************************************************************************************************** - * Raise the matrix A to the power of 2 and multiply the output by two. Next, - * subtract the vector V from each row. Store the result in tensor B. - * - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/api/tensorops.html#_CPPv4N4matx3powE2Op2Op - ****************************************************************************************************/ - - /*** End editing ***/ - - exec.sync(); - - for (int row = 0; row < B.Size(0); row++) { - for (int col = 0; col < B.Size(1); col++) { - if (B(row, col) != powf(A(row, col), 2) * 2 - V(col)) { - printf("Mismatch in B init view! actual = %f, expected = %f\n", - B(row, col), powf(A(row, col), 2) * 2 - V(col)); - exit(-1); - } - } - } - - print(B); - printf("Mixed verification passed!\n"); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_conditional.cu b/docs_input/notebooks/exercises/example2_conditional.cu deleted file mode 100644 index 30107902d..000000000 --- a/docs_input/notebooks/exercises/example2_conditional.cu +++ /dev/null @@ -1,51 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto A = make_tensor({2, 3}); - auto C = make_tensor({2, 3}); - - C.SetVals({ {1, 2, 3}, - {4, 5, 6}}); - - - // TODO: Conditionally assign elements of A the value of 1 if the same element in C is > 3, or 0 otherwise - - - print(A); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_eye.cu b/docs_input/notebooks/exercises/example2_eye.cu deleted file mode 100644 index 77fe96a2d..000000000 --- a/docs_input/notebooks/exercises/example2_eye.cu +++ /dev/null @@ -1,45 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto B = make_tensor({8, 8}); - - // TODO: Set tensor B such that it forms an identity matrix - - print(B); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_hamming.cu b/docs_input/notebooks/exercises/example2_hamming.cu deleted file mode 100644 index 574760a41..000000000 --- a/docs_input/notebooks/exercises/example2_hamming.cu +++ /dev/null @@ -1,47 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto B = make_tensor({10}); - - // TODO: Set tensor B such that it forms a Hamming window - - - print(B); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_init.cu b/docs_input/notebooks/exercises/example2_init.cu deleted file mode 100644 index 2e693abdc..000000000 --- a/docs_input/notebooks/exercises/example2_init.cu +++ /dev/null @@ -1,53 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto A = make_tensor({2, 3}); - auto B = make_tensor({2, 3}); - auto C = make_tensor({2, 3}); - auto V = make_tensor({3}); - - // TODO: Initialize the A tensor to contain values increasing from 1 to 6, and - // V from 7 to 9. 
- A = {}; - V = {}; - - print(A); - printf("\n"); - print(V); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_mixed_rank.cu b/docs_input/notebooks/exercises/example2_mixed_rank.cu deleted file mode 100644 index 660f59870..000000000 --- a/docs_input/notebooks/exercises/example2_mixed_rank.cu +++ /dev/null @@ -1,55 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto A = make_tensor({2, 3}); - auto B = make_tensor({2, 3}); - auto C = make_tensor({2, 3}); - auto V = make_tensor({3}); - - C.SetVals({ {1, 2, 3}, - {4, 5, 6}}); - - V.SetVals({7, 8, 9}); - - // TODO: Add vector V to matrix C using rank expansion. Store result in C - - - print(C); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_multiple_ops.cu b/docs_input/notebooks/exercises/example2_multiple_ops.cu deleted file mode 100644 index 84048b489..000000000 --- a/docs_input/notebooks/exercises/example2_multiple_ops.cu +++ /dev/null @@ -1,54 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto A = make_tensor({2, 3}); - auto B = make_tensor({2, 3}); - auto C = make_tensor({2, 3}); - auto V = make_tensor({3}); - - A.SetVals({ {1, 2, 3}, - {4, 5, 6}}); - - V.SetVals({7, 8, 9}); - - // TODO: Add A to itself plus 1, divide the result by 2, and add vector V. - - - print(C); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_rand.cu b/docs_input/notebooks/exercises/example2_rand.cu deleted file mode 100644 index 378b60ad5..000000000 --- a/docs_input/notebooks/exercises/example2_rand.cu +++ /dev/null @@ -1,47 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto A = make_tensor({4, 4}); - - (A = 0).run(); - - // TODO: Set tensor A to normally-distributed random numbers - - - print(A); -} diff --git a/docs_input/notebooks/exercises/example2_scalar.cu b/docs_input/notebooks/exercises/example2_scalar.cu deleted file mode 100644 index 7c827005d..000000000 --- a/docs_input/notebooks/exercises/example2_scalar.cu +++ /dev/null @@ -1,56 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto A = make_tensor({2, 3}); - auto B = make_tensor({2, 3}); - auto C = make_tensor({2, 3}); - auto V = make_tensor({3}); - - A.SetVals({ {1, 2, 3}, - {4, 5, 6}}); - - V.SetVals({7, 8, 9}); - - // TODO: Add the value 1 to all elements of A and store the result in B - - - print(A); - printf("\n"); - print(B); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_tensor_add.cu b/docs_input/notebooks/exercises/example2_tensor_add.cu deleted file mode 100644 index 47d06344d..000000000 --- a/docs_input/notebooks/exercises/example2_tensor_add.cu +++ /dev/null @@ -1,57 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto A = make_tensor({2, 3}); - auto B = make_tensor({2, 3}); - auto C = make_tensor({2, 3}); - auto V = make_tensor({3}); - - A.SetVals({ {1, 2, 3}, - {4, 5, 6}}); - - B.SetVals({ {7, 8, 9}, - {10, 11, 12}}); - - // TODO: Add tensors A and B and store the result in C - - - print(A); - printf("\n"); - print(B); - printf("\n"); - print(C); -} \ No newline at end of file diff --git a/docs_input/notebooks/exercises/example2_tensor_div.cu b/docs_input/notebooks/exercises/example2_tensor_div.cu deleted file mode 100644 index 9cf71f383..000000000 --- a/docs_input/notebooks/exercises/example2_tensor_div.cu +++ /dev/null @@ -1,45 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto C = make_tensor({2, 3}); - - C.SetVals({{7, 8, 9}, {10, 11, 12}}); - - // TODO: Divide tensor C by 2 and store in C - - print(C); -} \ No newline at end of file diff --git a/docs_input/notebooks/exercises/example2_viz.cu b/docs_input/notebooks/exercises/example2_viz.cu deleted file mode 100644 index 8c11ccb85..000000000 --- a/docs_input/notebooks/exercises/example2_viz.cu +++ /dev/null @@ -1,47 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include -#include "matx_viz.h" - -using namespace matx; - -int main() { - auto B = make_tensor({10}); - - // TODO: Set tensor B such that it forms a Hamming window - (B = hamming_x(shape)).run(); - - viz::line(B, "Hamming Window", "Sample", "Amplitude", "hamming.html"); - - return 0; -} From bbb37a9f36a8ae302ec6fcd92285794e2d0ce66f Mon Sep 17 00:00:00 2001 From: Tyler Allen Date: Fri, 24 Jan 2025 08:35:38 +0000 Subject: [PATCH 03/91] updated transforms notebook to use cell based execution --- docs_input/notebooks/03_transforms.ipynb | 740 ++++++++++++------ .../notebooks/exercises/example3_1dfft.cu | 57 -- .../notebooks/exercises/example3_2dfft.cu | 56 -- .../exercises/example3_assignment1.cu | 115 --- .../notebooks/exercises/example3_conv1d.cu | 55 -- .../notebooks/exercises/example3_conv2d.cu | 58 -- .../notebooks/exercises/example3_fft_conv.cu | 92 --- .../exercises/example3_full_reduce.cu | 58 -- .../notebooks/exercises/example3_gemm.cu | 55 -- .../exercises/example3_partial_reduce.cu | 61 -- 10 files changed, 495 insertions(+), 852 deletions(-) delete mode 100644 docs_input/notebooks/exercises/example3_1dfft.cu delete mode 100644 docs_input/notebooks/exercises/example3_2dfft.cu delete mode 100644 docs_input/notebooks/exercises/example3_assignment1.cu delete mode 100644 docs_input/notebooks/exercises/example3_conv1d.cu delete mode 100644 
docs_input/notebooks/exercises/example3_conv2d.cu delete mode 100644 docs_input/notebooks/exercises/example3_fft_conv.cu delete mode 100644 docs_input/notebooks/exercises/example3_full_reduce.cu delete mode 100644 docs_input/notebooks/exercises/example3_gemm.cu delete mode 100644 docs_input/notebooks/exercises/example3_partial_reduce.cu diff --git a/docs_input/notebooks/03_transforms.ipynb b/docs_input/notebooks/03_transforms.ipynb index e148df739..982f7aaae 100644 --- a/docs_input/notebooks/03_transforms.ipynb +++ b/docs_input/notebooks/03_transforms.ipynb @@ -33,6 +33,49 @@ "Some executors use CUDA libraries to implement their functionality, and those libraries require either a handle or a plan to operated. MatX hides this complexity by creating and caching the plan on the first call, and using the same plan on future calls where possible. More advanced users may use the handle interface directly to avoid the caching. Only the caching interface will be covered in this tutorial since it's the recommended approach, but the non-cached version can be found in the documentation." 
] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "//todo this should be moved to a hidden init block that runs automatically when the notebook starts\n", + "#pragma cling add_library_path(\"/usr/local/cuda/lib64\")\n", + "#pragma cling add_library_path(\"/opt/xeus/cling/lib\")\n", + "//#pragma cling add_library_path(\"/usr/Lib/gcc/x86_64-Linux-gnu/11/\")\n", + "#pragma cling add_library_path(\"/usr/lib/x86_64-linux-gnu/openblas64-openmp/\")\n", + "#pragma cling add_include_path(\"/usr/local/cuda/include\")\n", + "#pragma cling add_include_path(\"/usr/include/x86_64-linux-gnu/openblas64-openmp\")\n", + "#pragma cling add_include_path(\"/opt/xeus/cling/tools/Jupyter/kernel/MatX/include\")\n", + "#pragma cling add_include_path(\"/opt/xeus/cling/tools/Jupyter/kernel/MatX/build/_deps/cccl-src/libcudacxx/include\")\n", + "//#pragma cling load(\"libgomp\")\n", + "#pragma cling load(\"libopenblas64\")\n", + "#pragma cling load(\"libcuda\")\n", + "#pragma cling load(\"libcudart\")\n", + "#pragma cling load(\"libcurand\")\n", + "#pragma cling load(\"libcublas\")\n", + "#pragma cling load(\"libcublasLt\")\n", + "#pragma cling load(\"libcufft\")\n", + "\n", + "#include \n", + "#include \n", + "\n", + "#define MATX_EN_OPENBLAS\n", + "#define MATX_EN_OPENBLAS_LAPACK\n", + "#define MATX_OPENBLAS_64BITINT\n", + "\n", + "#include \"matx.h\"" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -43,56 +86,75 @@ "\n", "We use rectangular matrices for `A` and `B`, while `C` will be a square matrix due to the outer dimensions of `A` and `B` matching. 
\n", "\n", - "```c++\n", - "(A = random({8, 4}, NORMAL)).run(); \n", - "(B = random({4, 8}, NORMAL)).run(); \n", - "\n", - "matmul(C, A, B);\n", - "``` \n", "\n", "Open the file [exercises/example3_gemm.cu](exercises/example3_gemm.cu) and edit the contents where you see TODO markers." ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example3_gemm" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", + "execution_count": 2, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A:\n", + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[8, 4], Strides:[4,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000004: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000005: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000006: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000007: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "B:\n", + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[4, 8], Strides:[8,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "C:\n", + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[8, 8], Strides:[8,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 
0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000004: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000005: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000006: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000007: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x78e507dfec30\n" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Expected output:\n", - "```sh\n", - "A:\n", - "000000: -0.9247 -0.4253 -2.6438 0.1452\n", - "000001: -0.1209 -0.5797 -0.6229 -0.3284\n", - "000002: -1.0745 -0.3631 -1.6711 2.2655\n", - "000003: 0.3117 -0.1842 1.2866 1.1820\n", - "000004: -0.1271 1.2169 1.4353 1.0605\n", - "000005: -0.4941 -1.4244 -0.7244 -1.2973\n", - "000006: 0.0697 -0.0074 1.8969 0.6878\n", - "000007: -0.0779 -0.8373 1.3506 -0.2879\n", - "B:\n", - "000000: 0.9911 1.0676 -0.6272 0.3202 -0.3110 -0.3441 -1.1709 -0.5371\n", - "000001: 1.3390 -0.2401 1.2149 -0.2052 1.2999 0.2181 -1.2135 -1.3723\n", - "000002: -0.4635 -0.4089 -0.0032 0.2967 -0.3587 -1.0455 -0.0450 -0.0985\n", - "000003: 1.7608 0.9107 0.0288 -1.1128 0.0929 -0.1502 -0.9854 0.7889\n", - "C:\n", - "000000: -0.0050 0.3283 0.0760 -1.1547 0.6966 2.9677 1.5747 1.4554\n", - "000001: -1.1856 -0.0342 -0.6359 0.2609 -0.5231 0.6156 1.1966 0.6628\n", - "000002: 3.2124 1.6864 0.3035 -3.2863 0.6721 1.6973 -0.4584 3.0275\n", - "000003: 1.5472 0.9272 -0.3894 -0.7960 -0.6881 -1.6701 -1.3640 0.8911\n", - "000004: 2.7056 -0.0490 1.5840 -1.0446 1.2051 -1.3507 -2.4374 -0.9065\n", - "000005: -4.3456 -1.0707 -1.4556 1.3628 -1.5586 0.8115 3.6179 1.2680\n", - "000006: 0.3910 -0.0732 -0.0391 
-0.1788 -0.6479 -2.1121 -0.8357 0.3284\n", - "000007: -2.3314 -0.6966 -0.9810 0.8679 -1.5754 -1.5246 1.3302 0.8306\n", - "```" + " auto A = matx::make_tensor({8, 4});\n", + " auto B = matx::make_tensor({4, 8});\n", + " auto C = matx::make_tensor({8, 8});\n", + "\n", + " (A = matx::random({8, 4}, matx::NORMAL)).run(); \n", + " (B = matx::random({4, 8}, matx::NORMAL)).run(); \n", + "\n", + " // TODO: Perform a GEMM of C = A*B\n", + " (C = matx::matmul(A, B)).run();\n", + " \n", + " printf(\"A:\\n\");\n", + " matx::print(A);\n", + " printf(\"B:\\n\");\n", + " matx::print(B); \n", + " printf(\"C:\\n\");\n", + " matx::print(C); \n" ] }, { @@ -103,26 +165,34 @@ "### FFT\n", "MatX provides an interface to do both 1D Fast Fourier Transforms (FFTs) and 2D FFTs. Any tensor above rank 1 will be batched in a 1D FFT, and any tensor above rank 2 will be batched in a 2D FFT. FFTs may either be done in-place or out-of-place by using the same or different variables for the output and inputs. Since the tensors are strongly-typed, the type of FFT (C2C, R2C, etc) is inferred by the tensor type at compile time. Similarly, the input and output size of the executor is deduced by the type of transform, and the input/output tensors must match those sizes. There's one exception to this rule, and it's when the input FFT is to be zero-padded at the end. In this case, the input tensor can be shorter than the output tensor, and the input will be zero-padded to the length of the output tensor. This is a common tactic used in signal and image processing for both speed and FFT resolution.\n", "\n", - "In this example, we execute a 1D batched FFT on a 2D tensor populated with random complex floating point data. Since the FFT executor is performed in-place, the input and output types of the tensors are the same, and the type of the FFT is inferred as a complex-to-complex (`C2C`). 
The FFT length is specified by the inner dimension of the tensor, or 4 in this example, and the outer dimension is the number of batches, or 2. After the FFT completes, we perform on IFFT on the same tensor using the `ifft` interface. Ignoring floating point inaccuracies, the result of `ifft(fft(A))` should be the same as `A`, and this is shown by printing the tensors at each step. To perform a batched FFT on columns instead of rows, the tensor can be transposed by calling the `Permute` function used in the first tutorial. When the library detects a permuted tensor is being used, it can use technique to speed the FFT up over the naive method of converting the data in memory.\n", - "\n", - "```c++\n", - "C.print();\n", - "fft(C, C);\n", - "C.print();\n", - "ifft(C, C); \n", - "C.print();\n", - "```\n", - "\n", - "Open the file [exercises/example3_1dfft.cu](exercises/example3_1dfft.cu) and edit the contents where you see TODO markers." + "In this example, we execute a 1D batched FFT on a 2D tensor populated with random complex floating point data. Since the FFT executor is performed in-place, the input and output types of the tensors are the same, and the type of the FFT is inferred as a complex-to-complex (`C2C`). The FFT length is specified by the inner dimension of the tensor, or 4 in this example, and the outer dimension is the number of batches, or 2. After the FFT completes, we perform an IFFT on the same tensor using the `ifft` interface. Ignoring floating point inaccuracies, the result of `ifft(fft(A))` should be the same as `A`, and this is shown by printing the tensors at each step. To perform a batched FFT on columns instead of rows, the tensor can be transposed by calling the `Permute` function used in the first tutorial. When the library detects a permuted tensor is being used, it can use techniques to speed the FFT up over the naive method of converting the data in memory."
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example3_1dfft" + "auto D = matx::make_tensor({2, 4});\n", + "\n", + "// (D = matx::random(D.Shape(), matx::NORMAL)).run();\n", + "// matx::print(D);\n", + "\n", + "// (D = fft(D)).run();\n", + "// matx::print(D);\n", + "\n", + "// (D = matx::ifft(D)).run(); \n", + "// matx::print(D);" ] }, { @@ -130,66 +200,41 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "Initial C tensor:\n", - "000000: -0.9247+0.9911j -0.4253+1.0676j -2.6438-0.6272j 0.1452+0.3202j \n", - "000001: -0.1209-0.3110j -0.5797-0.3441j -0.6229-1.1709j -0.3284-0.5371j \n", - "After FFT:\n", - "000000: -3.8487+1.7517j 2.4666+2.1889j -3.2883-1.0238j 0.9718+1.0478j \n", - "000001: -1.6518-2.3630j 0.6950+1.1112j 0.1644-0.6007j 0.3090+0.6085j \n", - "After IFFT and normalization:\n", - "000000: -0.9247+0.9911j -0.4253+1.0676j -2.6438-0.6272j 0.1452+0.3202j \n", - "000001: -0.1209-0.3110j -0.5797-0.3441j -0.6229-1.1709j -0.3284-0.5371j \n", - "```" + "Next, we take the same 2D tensor and perform a 2D FFT on it. Since the rank is 2, it will not be batched as in the previous example. " ] }, { - "attachments": {}, - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 4, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(void) @0x78e507dfec30\n" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Next, we take the same 2D tensor and perform a 2D FFT on it. Since the rank is 2, it will not be batched as in the previous example. 
\n", - "\n", - "```c++\n", - "C.print();\n", - "fft2(C, C);\n", - "C.print();\n", - "ifft2(C, C); \n", - "C.print();\n", - "```\n", + "(D = matx::random(D.Shape(), matx::NORMAL)).run();\n", + "// matx::print(D);\n", "\n", - "As before, the results after the IFFT closely match the original `C` tensor, but with floating point error.\n", + "// (D = fft2(D)).run();\n", + "// matx::print(D);\n", "\n", - "Open the file [exercises/example3_2dfft.cu](exercises/example3_2dfft.cu) and edit the contents where you see TODO markers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example3_2dfft" + "// (D = matx::ifft2(D)).run(); \n", + "// matx::print(D);" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "Intial C tensor:\n", - "000000: -0.9247+0.9911j -0.4253+1.0676j -2.6438-0.6272j 0.1452+0.3202j \n", - "000001: -0.1209-0.3110j -0.5797-0.3441j -0.6229-1.1709j -0.3284-0.5371j \n", - "After FFT:\n", - "000000: -2.0506+1.4036j -0.0405-0.0434j -2.6438-0.6272j 0.1452+0.3202j \n", - "000001: -2.0051+2.7593j -0.4662-0.5353j -0.6229-1.1709j -0.3284-0.5371j \n", - "After IFFT and normalization:\n", - "000000: -1.8493+1.9823j -0.8507+2.1352j -0.6610-0.1568j 0.0363+0.0800j \n", - "000001: -0.2417-0.6220j -1.1595-0.6882j -0.1557-0.2927j -0.0821-0.1343j \n", - "```" + "As before, the results after the IFFT closely match the original `D` tensor, but with floating point error." ] }, { @@ -203,23 +248,61 @@ "MatX provides a set of optimized primitives to perform reductions on tensors for many common types. Reductions are supported across individual dimensions or on entire tensors, depending on the size of the output tensor.
Currently supported reduction functions are `sum`, `min`, `max`,` mean`, `any`, and `all`.\n", "\n", "#### Full Reduction\n", - "In this example we reduce an entire tensor to a single value by applying the reduction across all dimensions of the tensor. We apply the same random initialization from previous examples on a 2D tensor `A`. Note that the output tensor must be zeroed for a `sum` reduction since that value is continually added to during the reduction. Not initializing the output tensor will give undefined results since the variables are used as accumulators throughout the reduction. With the tensor initialized, we perform both a `max` and `sum` reduction across all dimensions of the tensor:\n", - "\n", - "```c++\n", - "max(MD0, A);\n", - "sum(AD0, A);\n", - "```\n", - "\n", - "Open the file [exercises/example3_full_reduce.cu](exercises/example3_full_reduce.cu) and edit the contents where you see TODO markers." + "In this example we reduce an entire tensor to a single value by applying the reduction across all dimensions of the tensor. We apply the same random initialization from previous examples on a 2D tensor `A`. Note that the output tensor must be zeroed for a `sum` reduction since that value is continually added to during the reduction. Not initializing the output tensor will give undefined results since the variables are used as accumulators throughout the reduction. 
With the tensor initialized, we perform both a `max` and `sum` reduction across all dimensions of the tensor:\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A:\n", + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[8, 4], Strides:[4,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000004: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000005: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000006: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000007: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "Max: 0.000000\n", + "Sum: 0.000000\n" + ] + }, + { + "data": { + "text/plain": [ + "(int) 14\n" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example3_full_reduce" + "\n", + " auto MD0 = matx::make_tensor<float>({});\n", + " auto AD0 = matx::make_tensor<float>({});\n", + "\n", + " (A = matx::random<float>(A.Shape(), matx::NORMAL)).run(); \n", + " \n", + " // Initialize max and sum to 0\n", + " (MD0 = 0).run();\n", + " (AD0 = 0).run();\n", + "\n", + " (MD0 = max(A)).run();\n", + " (AD0 = sum(A)).run();\n", + "\n", + " printf(\"A:\\n\");\n", + " matx::print(A);\n", + " printf(\"Max: %f\\n\", MD0());\n", + " printf(\"Sum: %f\\n\", AD0()); " ] }, { @@ -227,68 +310,107 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "A:\n", - "000000: -0.9247 -0.4253 -2.6438 0.1452 -0.1209 \n", - "000001: -0.5797 -0.6229 -0.3284 -1.0745 -0.3631 \n", - "000002: -1.6711 2.2655 0.3117 -0.1842 1.2866 \n", - "000003: 1.1820 -0.1271 1.2169 1.4353 1.0605 \n", - "Max: 2.265505\n", - "Sum: -0.162026\n", - "```" + "#### 
Dimensional Reductions\n", + "Reductions can also be performed across certain dimensions instead of the whole tensor. Dimensional reductions are useful in situations where each row contains data for a different user, for example, and we wish to sum up each user's data. By setting the output tensor view to a 1D tensor, independent reductions can be performed across the input tensor where each output element corresponds to a single row reduction from the input. Using the same tensor `A` from the previous example, we only change the output tensor type to be a 1D tensor instead of a scalar:\n" ] }, { - "attachments": {}, - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 6, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(void) @0x78e507dfec30\n" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#### Dimensional Reductions\n", - "Reductions can also be performed across certain dimensions instead of the whole tensor. Dimensional reductions are useful in situations where each row contains data for a different user, for example, and we wish to sum up each user's data. By setting the output tensor view to a 1D tensor, independent reductions can be performed across the input tensor where each output element corresponds to a single row reduction from the input. 
Using the same tensor `A` from the previous example, we only change the output tensor type to be a 1D tensor instead of a scalar:\n", "\n", - "```c++\n", - " max(MD1, A);\n", - " sum(AD1, A); \n", - "```\n", + " auto MD1 = matx::make_tensor<float>({A.Size(0)});\n", + " auto AD1 = matx::make_tensor<float>({A.Size(0)});\n", "\n", - "Printing the new reduction tensors shows the reduced values across each row of the input tensor `A`.\n", + " (A = matx::random<float>(A.Shape(), matx::NORMAL)).run(); \n", + " \n", + " // Initialize max and sum to 0\n", + " (MD1 = 0).run();\n", + " (AD1 = 0).run();\n", "\n", - "Open the file [exercises/example3_partial_reduce.cu](exercises/example3_partial_reduce.cu) and edit the contents where you see TODO markers." + " (MD1 = max(A, {1})).run();\n", + " (AD1 = sum(A, {1})).run();" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "!./exercises/compile_and_run.sh example3_partial_reduce" + "Printing the new reduction tensors shows the reduced values across each row of the input tensor `A`." 
] }, { - "attachments": {}, - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 7, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A:\n", + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[8, 4], Strides:[4,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000004: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000005: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000006: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000007: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "Max:\n", + "tensor_1_f32: Tensor{float} Rank: 1, Sizes:[8], Strides:[1]\n", + "000000: 0.0000e+00 \n", + "000001: 0.0000e+00 \n", + "000002: 0.0000e+00 \n", + "000003: 0.0000e+00 \n", + "000004: 0.0000e+00 \n", + "000005: 0.0000e+00 \n", + "000006: 0.0000e+00 \n", + "000007: 0.0000e+00 \n", + "Sum:\n", + "tensor_1_f32: Tensor{float} Rank: 1, Sizes:[8], Strides:[1]\n", + "000000: 0.0000e+00 \n", + "000001: 0.0000e+00 \n", + "000002: 0.0000e+00 \n", + "000003: 0.0000e+00 \n", + "000004: 0.0000e+00 \n", + "000005: 0.0000e+00 \n", + "000006: 0.0000e+00 \n", + "000007: 0.0000e+00 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x78e507dfec30\n" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Expected output:\n", - "```sh\n", - "A:\n", - "000000: -0.9247 -0.4253 -2.6438 0.1452 -0.1209 \n", - "000001: -0.5797 -0.6229 -0.3284 -1.0745 -0.3631 \n", - "000002: -1.6711 2.2655 0.3117 -0.1842 1.2866 \n", - "000003: 1.1820 -0.1271 1.2169 1.4353 1.0605 \n", - "Max:\n", - "000000: 0.1452 \n", - "000001: -0.3284 \n", - "000002: 2.2655 \n", - "000003: 1.4353 \n", - "Sum:\n", - "000000: -3.9695 \n", - "000001: -2.9686 \n", - "000002: 2.0086 \n", - "000003: 4.7676 \n", 
- "```" + " printf(\"A:\\n\");\n", + " matx::print(A);\n", + " printf(\"Max:\\n\");\n", + " matx::print(MD1);\n", + " printf(\"Sum:\\n\");\n", + " matx::print(AD1);" ] }, { @@ -297,20 +419,82 @@ "metadata": {}, "source": [ "### Convolution\n", - "MatX supports both 1D and 2D direct convolution using the `conv1d` and `conv2d` functions. FFT-based convolution can also be performed as a combination of existing primitives as a potentially faster alternative to direct convolution for large tensors. Both forms of direct convolution take in an extra mode which specifies how much of the output is saved, where `MATX_C_MODE_FULL` saves the entire filter ramp-up and down, `MATX_C_MODE_SAME` makes the input and output tensors the same size, and `MATX_C_MODE_VALID` only keeps valid samples (when the entire filter was part of the convolution). Convolution can be used to perform a rolling average of an input by making all filter values 1/N, where N is the length of the filter. In this example, we use a filter of length 3 to create a running average of the last 3 elements:\n", - "\n", - "```c++\n", - "conv1d(Co, C, filt, MATX_C_MODE_FULL, 0);\n", - "```" + "MatX supports both 1D and 2D direct convolution using the `conv1d` and `conv2d` functions. FFT-based convolution can also be performed as a combination of existing primitives as a potentially faster alternative to direct convolution for large tensors. Both forms of direct convolution take in an extra mode which specifies how much of the output is saved, where `MATX_C_MODE_FULL` saves the entire filter ramp-up and down, `MATX_C_MODE_SAME` makes the input and output tensors the same size, and `MATX_C_MODE_VALID` only keeps valid samples (when the entire filter was part of the convolution). Convolution can be used to perform a rolling average of an input by making all filter values 1/N, where N is the length of the filter. 
In this example, we use a filter of length 3 to create a running average of the last 3 elements:\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial CIn tensor:\n", + "tensor_1_f32: Tensor{float} Rank: 1, Sizes:[16], Strides:[1]\n", + "000000: 0.0000e+00 \n", + "000001: 0.0000e+00 \n", + "000002: 0.0000e+00 \n", + "000003: 0.0000e+00 \n", + "000004: 0.0000e+00 \n", + "000005: 0.0000e+00 \n", + "000006: 0.0000e+00 \n", + "000007: 0.0000e+00 \n", + "000008: 0.0000e+00 \n", + "000009: 0.0000e+00 \n", + "000010: 0.0000e+00 \n", + "000011: 0.0000e+00 \n", + "000012: 0.0000e+00 \n", + "000013: 0.0000e+00 \n", + "000014: 0.0000e+00 \n", + "000015: 0.0000e+00 \n", + "tensor_1_f32: Tensor{float} Rank: 1, Sizes:[18], Strides:[1]\n", + "000000: 0.0000e+00 \n", + "000001: 0.0000e+00 \n", + "000002: 0.0000e+00 \n", + "000003: 0.0000e+00 \n", + "000004: 0.0000e+00 \n", + "000005: 0.0000e+00 \n", + "000006: 0.0000e+00 \n", + "000007: 0.0000e+00 \n", + "000008: 0.0000e+00 \n", + "000009: 0.0000e+00 \n", + "000010: 0.0000e+00 \n", + "000011: 0.0000e+00 \n", + "000012: 0.0000e+00 \n", + "000013: 0.0000e+00 \n", + "000014: 0.0000e+00 \n", + "000015: 0.0000e+00 \n", + "000016: 0.0000e+00 \n", + "000017: 0.0000e+00 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x78e507dfec30\n" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example3_conv1d" + "auto CIn = matx::make_tensor({16});\n", + "auto filt = matx::make_tensor({3});\n", + "auto Co = matx::make_tensor({16 + filt.Lsize() - 1});\n", + "\n", + "filt.SetVals({1.0/3, 1.0/3, 1.0/3});\n", + "\n", + "(CIn = matx::random({16}, matx::NORMAL)).run(); \n", + "\n", + "printf(\"Initial CIn tensor:\\n\");\n", + "matx::print(CIn);\n", + "(Co = matx::conv1d(CIn, filt, 
matx::MATX_C_MODE_FULL)).run();\n", + "\n", + "matx::print(Co);" ] }, { @@ -318,45 +502,67 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "Initial C tensor:\n", - "000000: -0.9247\n", - "000001: -0.4253\n", - "000002: -2.6438\n", - "000003: 0.1452\n", - "000004: -0.1209\n", - "000005: -0.5797\n", - "000006: -0.6229\n", - "000007: -0.3284\n", - "000008: -1.0745\n", - "000009: -0.3631\n", - "000010: -1.6711\n", - "000011: 2.2655\n", - "000012: 0.3117\n", - "000013: -0.1842\n", - "000014: 1.2866\n", - "000015: 1.1820\n", - "After conv1d:\n", - "000000: -0.3082\n", - "000001: -0.4500\n", - "000002: -1.3313\n", - "000003: -0.9747\n", - "000004: -0.8732\n", - "000005: -0.1851\n", - "000006: -0.4411\n", - "000007: -0.5103\n", - "000008: -0.6753\n", - "000009: -0.5887\n", - "000010: -1.0362\n", - "000011: 0.0771\n", - "000012: 0.3020\n", - "000013: 0.7977\n", - "000014: 0.4714\n", - "000015: 0.7615\n", - "000016: 0.8229\n", - "000017: 0.3940\n", - "```" + "Similar to a 1D convolution, a 2D convolution does the same computation over two dimensions. A tensor of at least rank 2 is needed for a 2D convolution. Below we use a filter of all ones using the `ones` operator to demonstrate the filter can also be an operator and not an existing tensor view. 
The result is the sum of the four values around each cell on the input:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial C tensor:\n", + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[8, 8], Strides:[8,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000004: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000005: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000006: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000007: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "After conv2d:\n", + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[8, 8], Strides:[8,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000004: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000005: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000006: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000007: 
0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x78e507dfec30\n" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + " auto CIn2 = matx::make_tensor<float>({8,8});\n", + " auto filt2 = matx::ones({2, 2});\n", + " auto Co2 = matx::make_tensor<float>({8, 8});\n", + "\n", + " (CIn2 = matx::random<float>({8, 8}, matx::NORMAL)).run(); \n", + "\n", + " printf(\"Initial C tensor:\\n\");\n", + " matx::print(CIn2);\n", + "\n", + " (Co2 = matx::conv2d(CIn2, filt2, matx::MATX_C_MODE_SAME)).run();\n", + " \n", + " printf(\"After conv2d:\\n\");\n", + " matx::print(Co2);\n", + "\n" ] }, { @@ -364,39 +570,94 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Similar to a 1D convolution, a 2D convolution does the same computation over two dimensions. A tensor of at least rank 2 is needed for a 2D convolution. Below we use a filter of all ones using the `ones` operator to demonstrate the filter can also be an operator and not an existing tensor view. The result is the sum of the four values around each cell on the input:\n", - "\n", - "```c++\n", - "conv2d(Co, C, filt, MATX_C_MODE_FULL, 0);\n", - "```" + "Last, we mentioned above that convolution can also be done in the frequency domain using FFTs. This is the preferred method for larger tensors since FFTs are much faster than direct convolutions in large sizes, and because FFT libraries are highly-optimized. FFT convolution uses more memory than direct if the inputs are not to be destroyed since it requires running an FFT on both the input signal and filter before filtering. If not done in-place, this typically requires `2N + L - 1` new elements in memory, where N is the signal length and L is the filter length. 
A full FFT convolution example can be found in `fft_conv.cu` in the MatX examples, but the main convolution code is shown below:\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example3_conv2d" + "\n", + "///\\todo complete this tutorial. This one is pretty complex, do we want to keep the validation code here?\n", + "\n", + "// using complex = cuda::std::complex;\n", + "// cudaExecutor exec{};\n", + "\n", + "// index_t signal_size = 16;\n", + "// index_t filter_size = 3;\n", + "// index_t filtered_size = signal_size + filter_size - 1;\n", + "\n", + "// // Create time domain buffers\n", + "// auto sig_time = make_tensor({signal_size});\n", + "// auto filt_time = make_tensor({filter_size});\n", + "// auto time_out = make_tensor({filtered_size});\n", + "\n", + "// // Frequency domain buffers\n", + "// auto sig_freq = make_tensor({filtered_size});\n", + "// auto filt_freq = make_tensor({filtered_size});\n", + "\n", + "// // Fill the time domain signals with data\n", + "// for (index_t i = 0; i < signal_size; i++) {\n", + "// sig_time(i) = {-1.0f * (2.0f * static_cast(i % 2) + 1.0f) *\n", + "// (static_cast(i % 10) / 10.0f) +\n", + "// 0.1f,\n", + "// -1.0f * (static_cast(i % 2) == 0.0f) *\n", + "// (static_cast(i % 10) / 5.0f) -\n", + "// 0.1f};\n", + "// }\n", + "// for (index_t i = 0; i < filter_size; i++) {\n", + "// filt_time(i) = {static_cast(i) / static_cast(filter_size),\n", + "// static_cast(-i) / static_cast(filter_size) +\n", + "// 0.5f};\n", + "// }\n", + "\n", + "// TODO: Perform FFT convolution\n", + "// Perform the FFT in-place on both signal and filter\n", + "// (sig_freq = fft(sig_time)).run();\n", + "// (filt_freq = fft(filt_time)).run();\n", + "\n", + "// (sig_freq = sig_freq * filt_freq).run();\n", 
+ "\n", + "// // IFFT in-place\n", + "// (sig_freq = ifft(sig_freq)).run(); \n", + "\n", + "\n", + "// Perform the FFT in-place on both signal and filter, do an element-wise multiply of the two, then IFFT that output\n", + "// (sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(stream);\n", + "\n", + "// TODO: Perform a time-domain convolution\n", + "// conv1d(time_out, sig_time, filt_time, matxConvCorrMode_t::MATX_C_MODE_FULL, 0);\n", + "\n", + "// exec.sync();\n", + "\n", + "// // Compare signals\n", + "// for (index_t i = 0; i < filtered_size; i++) {\n", + "// if ( fabs(time_out(i).real() - sig_freq(i).real()) > 0.001 || \n", + "// fabs(time_out(i).imag() - sig_freq(i).imag()) > 0.001) {\n", + "// printf(\"Verification failed at item %lld. Direct=%f%+.2fj, FFT=%f%+.2fj\\n\", i,\n", + "// time_out(i).real(), time_out(i).imag(), sig_freq(i).real(), sig_freq(i).imag());\n", + "// return -1;\n", + "// }\n", + "// }\n", + "\n", + "std::cout << \"Verification successful\" << std::endl;\n" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "Last, we mentioned above that convolution can also be done in the frequency domain using FFTs. This is the preferred method for larger tensors since FFTs are much faster than direct convolutions in large sizes, and because FFT libraries are highly-optimized. FFT convolution uses more memory than direct if the inputs are not to be destroyed since it requires running an FFT on both the input signal and filter before filtering. If not done in-place, this typically requires `2N + L - 1` new elements in memory, where N is the signal length and L is the filter length. 
A full FFT convolution example can be found in `fft_conv.cu` in the MatX examples, but the main convolution code is shown below:\n", - "\n", - "```c++\n", - " // Perform the FFT in-place on both signal and filter\n", - " (sig_freq = fft(sig_time)).run();\n", - " (filt_freq = fft(filt_time)).run();\n", - "\n", - " (sig_freq = sig_freq * filt_freq).run();\n", - "\n", - " // IFFT in-place\n", - " (sig_freq = ifft(sig_freq)).run();\n", - "```\n", "Since the expected output size of the full filtering operation is signal_len + filter_len - 1, both the filter and signal time domain inputs are shorter than the output. This would normally require a separate stage of allocating buffers of the appropriate size, zeroing them out, copying the time domain data to the buffers, and performing the FFT. However, MatX has an API to do all of this automatically in the library using asynchronous allocations. This makes the call have a noticeable performance hit on the first call, but subsequent calls will be close to the time without allocation. To recognize that automatic padding is wanted, MatX uses the output tensor size compared to the input tensor size to determine whether to pad the input with zeros. In this case the output signal (sig_time and filt_time) are shorter than the output tensors (sig_freq and filt_freq), so it will automatically zero-pad the input.\n", "\n", "The above expression can also be combined into a single line:\n", @@ -413,21 +674,7 @@ "conv1d(time_out, sig_time, filt_time, matxConvCorrMode_t::MATX_C_MODE_FULL, 0);\n", "```\n", "\n", - "To match the FFT results we do a full convolution to get all the samples from the filter ramp up and ramp down. However, if we wanted either valid or same mode we could slice the FFT convolution output at the appropriate places to give the same answer. Edit the file [exercises/example3_fft_conv.cu](exercises/example3_fft_conv.cu) and add the missing code where you see TODOs. 
After running the verification code at the bottom will check for accuracy.\n", - "\n", - "Expected output:\n", - "```sh\n", - "Verification successful\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example3_fft_conv" + "To match the FFT results we do a full convolution to get all the samples from the filter ramp up and ramp down. However, if we wanted either valid or same mode we could slice the FFT convolution output at the appropriate places to give the same answer. Edit the file [exercises/example3_fft_conv.cu](exercises/example3_fft_conv.cu) and add the missing code where you see TODOs. After running, the verification code at the bottom will check for accuracy." ] }, { @@ -446,12 +693,15 @@ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" }, "kernelspec": { - "display_name": "Python 3.6.9 64-bit", - "name": "python3" + "display_name": "C++17", + "language": "C++", + "name": "cling-cpp17" }, "language_info": { - "name": "python", - "version": "" + "codemirror_mode": "c++", + "file_extension": ".c++", + "mimetype": "text/x-c++src", + "name": "c++" } }, "nbformat": 4, diff --git a/docs_input/notebooks/exercises/example3_1dfft.cu b/docs_input/notebooks/exercises/example3_1dfft.cu deleted file mode 100644 index cd5bf70da..000000000 --- a/docs_input/notebooks/exercises/example3_1dfft.cu +++ /dev/null @@ -1,57 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto C = make_tensor>({2, 4}); - - (C = random({2, 4}, NORMAL)).run(); - - printf("Initial C tensor:\n"); - print(C); - - // TODO: Perform an in-place FFT on C across rows - - printf("After FFT:\n"); - print(C); - - // TODO: Perform an in-place IFFT on C across rows. 
- - printf("After IFFT and normalization:\n"); - print(C); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_2dfft.cu b/docs_input/notebooks/exercises/example3_2dfft.cu deleted file mode 100644 index ebe117df4..000000000 --- a/docs_input/notebooks/exercises/example3_2dfft.cu +++ /dev/null @@ -1,56 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto C = make_tensor>({2, 4}); - - (C = random({2, 4}, NORMAL)).run(); - printf("Initial C tensor:\n"); - print(C); - - // TODO: Perform an in-place 2D FFT on C across rows - - printf("After FFT:\n"); - print(C); - - // TODO: Perform an in-place 2D IFFT on C across rows - - printf("After IFFT and normalization:\n"); - print(C); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_assignment1.cu b/docs_input/notebooks/exercises/example3_assignment1.cu deleted file mode 100644 index 07f37ef14..000000000 --- a/docs_input/notebooks/exercises/example3_assignment1.cu +++ /dev/null @@ -1,115 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
-// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -/** - * MatX training assignment 3. This training goes through tensor operations that - * were learned in the 03_transformations notebook. Uncomment each verification - * block as you go to ensure your solutions are correct. - */ - -int main() { - using complex = cuda::std::complex; - auto A = make_tensor({2, 3}); - auto B = make_tensor({2, 3}); - - cudaExecutor exec{}; - - /**************************************************************************************************** - * Use the random number generator with a seed of 12345 to generate - * normally-distributed numbers in the tensor A. Next, take the FFT across - * columns of A (a 2-element FFT), and store the results in-place back in A. 
- * An example of random number generation can be found in the second tutorial - * or in the quick start guide here: - * - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#random-numbers - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/api/random.html - ****************************************************************************************************/ - - /*** End editing ***/ - - // Verify init is correct - B.SetVals({{{0.5927, -0.3677}, {-2.6895, 1.8154}, {-0.0129, 0.9246}}, - {{0.5646, 0.8638}, {1.6400, 0.3494}, {-0.5709, 0.5919}}}); - A.print(); - B.print(); - exec.sync(); - for (int row = 0; row < A.Size(0); row++) { - for (int col = 0; col < A.Size(1); col++) { - if (fabs(A(row, col).real() - B(row, col).real()) > 0.001) { - printf( - "Mismatch in real part of FFT view! actual = %f, expected = %f\n", - A(row, col).real(), B(row, col).real()); - exit(-1); - } - if (fabs(A(row, col).imag() - B(row, col).imag()) > 0.001) { - printf( - "Mismatch in imag part of FFT view! actual = %f, expected = %f\n", - A(row, col).imag(), B(row, col).imag()); - exit(-1); - } - } - } - - printf("FFT verification passed!\n"); - - /**************************************************************************************************** - * Create a 3D tensor of floats using a normal distribution and with shape - * 10x5x15. Reduce the entire tensor down to a single float containing the max - * value. Scale the original tensor by this max value and do another max - * reduction. The final reduction should be 1.0. 
- * - * Hint: the reduction function is named rmax and takes the output, input, and - * stream as parameters - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/api/reduce.html - ****************************************************************************************************/ - // Create and initialize 3D tensor - - // Create scalar tensor for reduction - tensor_t redv; - - /*** End editing ***/ - - // Verify init is correct - exec.sync(); - if (fabs(redv() - 1.0) > 0.001) { - printf("Mismatch on final reduction. Expected=1.0, actual = %f\n", redv()); - exit(-1); - } - - printf("Reduction verification passed!\n"); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_conv1d.cu b/docs_input/notebooks/exercises/example3_conv1d.cu deleted file mode 100644 index 1a5b83a71..000000000 --- a/docs_input/notebooks/exercises/example3_conv1d.cu +++ /dev/null @@ -1,55 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto C = make_tensor({16}); - auto filt = make_tensor({3}); - auto Co = make_tensor({16 + filt.Lsize() - 1}); - - filt.SetVals({1.0/3, 1.0/3, 1.0/3}); - - (C = random({16}, NORMAL)).run(); - - printf("Initial C tensor:\n"); - print(C); - - // TODO: Perform a 1D direct convolution on C with filter filt - - - printf("After conv1d:\n"); - print(Co); - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_conv2d.cu b/docs_input/notebooks/exercises/example3_conv2d.cu deleted file mode 100644 index d1ac06b3c..000000000 --- a/docs_input/notebooks/exercises/example3_conv2d.cu +++ /dev/null @@ -1,58 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. 
Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto C = make_tensor({8,8}); - auto filt = make_tensor({3}); - auto Co = make_tensor({16 + filt.Lsize() - 1}); - - auto filt = ones({2, 2}); - auto Co = make_tensor({8 + filt.Size(0) - 1, 8 + filt.Size(1) - 1}); - - (C = randTrandom({8, 8}, NORMAL)).run(); - - printf("Initial C tensor:\n"); - print(C); - - // TODO: Perform a 2D direct convolution on C with filter filt - - - - printf("After conv2d:\n"); - print(Co); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_fft_conv.cu b/docs_input/notebooks/exercises/example3_fft_conv.cu deleted file mode 100644 index d922ec820..000000000 --- a/docs_input/notebooks/exercises/example3_fft_conv.cu +++ /dev/null @@ -1,92 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - using complex = cuda::std::complex; - cudaExecutor exec{}; - - index_t signal_size = 16; - index_t filter_size = 3; - index_t filtered_size = signal_size + filter_size - 1; - - // Create time domain buffers - auto sig_time = make_tensor({signal_size}); - auto filt_time = make_tensor({filter_size}); - auto time_out = make_tensor({filtered_size}); - - // Frequency domain buffers - auto sig_freq = make_tensor({filtered_size}); - auto filt_freq = make_tensor({filtered_size}); - - // Fill the time domain signals with data - for (index_t i = 0; i < signal_size; i++) { - sig_time(i) = {-1.0f * (2.0f * static_cast(i % 2) + 1.0f) * - (static_cast(i % 10) / 10.0f) + - 0.1f, - -1.0f * (static_cast(i % 2) == 0.0f) * - (static_cast(i % 10) / 5.0f) - - 0.1f}; - } - for (index_t i = 0; i < filter_size; i++) { - filt_time(i) = {static_cast(i) / static_cast(filter_size), - static_cast(-i) / static_cast(filter_size) + - 0.5f}; - } - - // TODO: Perform FFT convolution - // Perform the FFT in-place on both signal and filter, do an element-wise multiply of the two, then IFFT that output - - - // TODO: Perform a time-domain convolution - - - exec.sync(); - - // Compare signals - for (index_t i = 0; i < filtered_size; i++) { - if ( fabs(time_out(i).real() - sig_freq(i).real()) > 0.001 || - fabs(time_out(i).imag() - 
sig_freq(i).imag()) > 0.001) { - printf("Verification failed at item %lld. Direct=%f%+.2fj, FFT=%f%+.2fj\n", i, - time_out(i).real(), time_out(i).imag(), sig_freq(i).real(), sig_freq(i).imag()); - return -1; - } - } - - std::cout << "Verification successful" << std::endl; - - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_full_reduce.cu b/docs_input/notebooks/exercises/example3_full_reduce.cu deleted file mode 100644 index 49c81832d..000000000 --- a/docs_input/notebooks/exercises/example3_full_reduce.cu +++ /dev/null @@ -1,58 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto A = make_tensor({4, 5}); - auto MD0 = make_tensor(); - auto AD0 = make_tensor(); - - (A = random({4, 5}, NORMAL)).run(); - - // Initialize max and average to 0 - (MD0 = 0).run(); - (AD0 = 0).run(); - - // TODO: Perform a max and sum reduction of A into MD0 and AD0, respectively. - - - printf("A:\n"); - print(A); - printf("Max: %f\n", MD0()); - printf("Sum: %f\n", AD0()); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_gemm.cu b/docs_input/notebooks/exercises/example3_gemm.cu deleted file mode 100644 index 824ec9af2..000000000 --- a/docs_input/notebooks/exercises/example3_gemm.cu +++ /dev/null @@ -1,55 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto A = make_tensor({8, 4}); - auto B = make_tensor({4, 8}); - auto C = make_tensor({8, 8}); - - (A = random({8, 4}, NORMAL)).run(); - (B = random({4, 8}, NORMAL)).run(); - - // TODO: Perform a GEMM of C = A*B - - printf("A:\n"); - print(A); - printf("B:\n"); - print(B); - printf("C:\n"); - print(C); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_partial_reduce.cu b/docs_input/notebooks/exercises/example3_partial_reduce.cu deleted file mode 100644 index e48e407ba..000000000 --- a/docs_input/notebooks/exercises/example3_partial_reduce.cu +++ /dev/null @@ -1,61 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto A = make_tensor({4, 5}); - auto MD0 = make_tensor({4}); - auto AD0 = make_tensor({4}); - - (A = random(shape, NORMAL)).run(); - - // Initialize max and average to 0 - (MD1 = 0).run(); - (AD1 = 0).run(); - - // TODO: Reduce all rows of A by max where each reduction is a separate value in the vector MD1 - - - - printf("A:\n"); - print(A); - printf("Max:\n"); - print(MD1); - printf("Sum:\n"); - print(AD1); - - return 0; -} From 6903ac4b53e13cb70b672c024d0d7d287dfbf40e Mon Sep 17 00:00:00 2001 From: Tyler Allen Date: Fri, 24 Jan 2025 08:36:39 +0000 Subject: [PATCH 04/91] adding script for notebook. 
has hardcoded paths --- docs_input/notebooks/run.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100755 docs_input/notebooks/run.sh diff --git a/docs_input/notebooks/run.sh b/docs_input/notebooks/run.sh new file mode 100755 index 000000000..6e2d639f0 --- /dev/null +++ b/docs_input/notebooks/run.sh @@ -0,0 +1,19 @@ +#!/bin/bash +USER_ID=$(id -u) +GROUP_ID=$(id -g) +CMDS="/bin/bash" + # -u $USER_ID:$GROU P_ID \ + +docker run -it --rm \ + -p 8888:8888 \ + --gpus all \ + -v /home/scratch.tylera_sw/:/scratch \ + -v /home/scratch.tylera_sw/projects/matx_holo_lab_2025:/notebooks \ + --env="DISPLAY" \ + --volume="/tmp/.X11-unix:/tmp/.X11-unix:rw" \ + --cap-add CAP_SYS_PTRACE \ + --ipc=host \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + gitlab-master.nvidia.com:5005/devtech-compute/sigx-group/container/cling:latest \ + /bin/bash \ No newline at end of file From 7825f950fac5c66e40c598b6bd108cf5a9a3e7c4 Mon Sep 17 00:00:00 2001 From: Tyler Allen Date: Fri, 24 Jan 2025 08:47:34 +0000 Subject: [PATCH 05/91] adding empty notebooks for fusion/profiling --- docs_input/notebooks/05_fusion.ipynb | 25 +++++++++++++++++++++++++ docs_input/notebooks/06_profiling.ipynb | 25 +++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 docs_input/notebooks/05_fusion.ipynb create mode 100644 docs_input/notebooks/06_profiling.ipynb diff --git a/docs_input/notebooks/05_fusion.ipynb b/docs_input/notebooks/05_fusion.ipynb new file mode 100644 index 000000000..4a15e305d --- /dev/null +++ b/docs_input/notebooks/05_fusion.ipynb @@ -0,0 +1,25 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fusing Operations in MatX" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## " + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs_input/notebooks/06_profiling.ipynb b/docs_input/notebooks/06_profiling.ipynb 
new file mode 100644 index 000000000..ff5f93301 --- /dev/null +++ b/docs_input/notebooks/06_profiling.ipynb @@ -0,0 +1,25 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Profiling and Optimizing with MatX" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Automated NVTX Ranges" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From fb8e89a953327d9947306712e610a624cd358991 Mon Sep 17 00:00:00 2001 From: cliffburdick Date: Mon, 27 Jan 2025 14:07:45 -0800 Subject: [PATCH 06/91] Adding timing metrics to CUDA and host executors --- docs_input/basics/profiling.rst | 38 ++++++++++++++++++++++++++++++++ examples/spectrogram.cu | 21 ++++++------------ include/matx/executors/cuda.h | 39 ++++++++++++++++++++++++++++++--- include/matx/executors/host.h | 27 +++++++++++++++++++++++ 4 files changed, 108 insertions(+), 17 deletions(-) create mode 100644 docs_input/basics/profiling.rst diff --git a/docs_input/basics/profiling.rst b/docs_input/basics/profiling.rst new file mode 100644 index 000000000..740565267 --- /dev/null +++ b/docs_input/basics/profiling.rst @@ -0,0 +1,38 @@ +.. _profiling: + +Profiling +######### + +Profiling is a way to measure the performance of a program and to identify bottlenecks in your MatX application. Since +the method for profiling depends on the executor, each executor implements its own profiling mechanism. For example, +the CUDA executor can use events encapsulating the kernels it's profiling. The profiling is done through the executor +object rather than the `run` statement so that multiple `run`s can be profiled together. + +Profiling is done by calling the `start_timer()` method of the executor: + +.. code-block:: cpp + + exec.start_timer(); + +To stop the profiler, `stop_timer()` is called: + +.. 
code-block:: cpp + + exec.stop_timer(); + +Depending on the executor, `stop_timer()` may need to block for the operation to complete on an asynchronous executor. + +Once `stop_timer()` returns, the execution time between the timers can be retrieved by calling `get_time_ms()`: + +.. code-block:: cpp + + auto time = exec.get_time_ms(); + +In the above example `time` contains the runtime of everything executed between the `start_timer()` and `stop_timer()` calls. For +a CUDA executor this is the time between the beginning of the first kernel and the end of the last. For a CPU executor this is the CPU +time between the two calls. + +.. note:: + Profiling does not work with a multi-threaded host executor currently + +For a full example of profiling, see the `spectrogram` example. \ No newline at end of file diff --git a/examples/spectrogram.cu b/examples/spectrogram.cu index 79b066a07..9d238b0ee 100644 --- a/examples/spectrogram.cu +++ b/examples/spectrogram.cu @@ -60,11 +60,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaExecutor exec{stream}; float fs = 10000; @@ -96,11 +91,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (time = linspace<0>(num_samps, 0.0f, static_cast(N) - 1.0f) / fs) .run(exec); // mod = 500 * np.cos(2*np.pi*0.25*time) - (modulation = 500 * cos(2 * M_PI * 0.25 * time)).run(exec); + (modulation = 500.f * cos(2.f * static_cast(M_PI) * 0.25f * time)).run(exec); // carrier = amp * np.sin(2*np.pi*3e3*time + modulation) - (carrier = amp * sin(2 * M_PI * 3000 * time + modulation)).run(exec); + (carrier = amp * sin(2.f * static_cast(M_PI) * 3000.f * time + modulation)).run(exec); // noise = 0.01 * fs / 2 * np.random.randn(time.shape) - (noise = sqrt(0.01 * fs / 2) * random({N}, NORMAL)).run(exec); + (noise = sqrt(0.01f * fs / 2.f) * random({N}, NORMAL)).run(exec); // noise *= 
np.exp(-time/5) (noise = noise * exp(-1.0f * time / 5.0f)).run(exec); // x = carrier + noise @@ -108,11 +103,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (uint32_t i = 0; i < num_iterations; i++) { if (i == 2) { // Start timer on third loop to allow generation of plot - cudaEventRecord(start, stream); + exec.start_timer(); } // DFT Sample Frequencies (rfftfreq) - (freqs = (1.0 / (static_cast(nfft) * 1 / fs)) * + (freqs = (1.0f / (static_cast(nfft) * 1.f / fs)) * linspace<0>(half_win, 0.0f, static_cast(nfft) / 2.0f)) .run(exec); @@ -143,15 +138,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } - cudaEventRecord(stop, stream); + exec.stop_timer(); exec.sync(); - cudaEventElapsedTime(&time_ms, start, stop); + time_ms = exec.get_time_ms(); printf("Spectrogram Time Without Graphs = %.2fus per iteration\n", time_ms * 1e3 / num_iterations); - cudaEventDestroy(start); - cudaEventDestroy(stop); cudaStreamDestroy(stream); MATX_CUDA_CHECK_LAST_ERROR(); diff --git a/include/matx/executors/cuda.h b/include/matx/executors/cuda.h index fb583ce19..7843ebe13 100644 --- a/include/matx/executors/cuda.h +++ b/include/matx/executors/cuda.h @@ -54,14 +54,24 @@ namespace matx * * @param stream CUDA stream */ - cudaExecutor(cudaStream_t stream) : stream_(stream) {} - cudaExecutor(int stream) : stream_(reinterpret_cast(stream)) {} + cudaExecutor(cudaStream_t stream) : stream_(stream) { + MATX_CUDA_CHECK(cudaEventCreate(&start_)); + MATX_CUDA_CHECK(cudaEventCreate(&stop_)); + } + + cudaExecutor(int stream) : stream_(reinterpret_cast(stream)) { + MATX_CUDA_CHECK(cudaEventCreate(&start_)); + MATX_CUDA_CHECK(cudaEventCreate(&stop_)); + } /** * @brief Construct a new cudaExecutor object using the default stream * */ - cudaExecutor() : stream_(0) {} + cudaExecutor() : stream_(0) { + MATX_CUDA_CHECK(cudaEventCreate(&start_)); + MATX_CUDA_CHECK(cudaEventCreate(&stop_)); + } /** * @brief Returns stream associated with executor @@ -73,6 +83,27 
@@ namespace matx * */ void sync() { cudaStreamSynchronize(stream_); } + + /** + * @brief Start a timer for profiling workload + */ + void start_timer() { cudaEventRecord(start_, stream_); } + + /** + * @brief Stop a timer for profiling workload + */ + void stop_timer() { cudaEventRecord(stop_, stream_); } + + /** + * @brief Get the time in milliseconds between start_timer and stop_timer. + * This will block until the event is synchronized + */ + float get_time_ms() { + float time; + cudaEventSynchronize(stop_); + cudaEventElapsedTime(&time, start_, stop_); + return time; + } /** * Execute an operator on a device @@ -139,6 +170,8 @@ namespace matx private: cudaStream_t stream_; + cudaEvent_t start_; + cudaEvent_t stop_; }; }; diff --git a/include/matx/executors/host.h b/include/matx/executors/host.h index cb65a7842..d1a40aefc 100644 --- a/include/matx/executors/host.h +++ b/include/matx/executors/host.h @@ -112,6 +112,31 @@ class HostExecutor { */ void sync() {} + /** + * @brief Start a timer for profiling workload + */ + void start_timer() { + MATX_STATIC_ASSERT_STR(MODE == ThreadsMode::SINGLE, matxNotSupported, "Timer not supported in multi-threaded mode"); + start_ = std::chrono::high_resolution_clock::now(); + } + + /** + * @brief Stop a timer for profiling workload + */ + void stop_timer() { + MATX_STATIC_ASSERT_STR(MODE == ThreadsMode::SINGLE, matxNotSupported, "Timer not supported in multi-threaded mode"); + stop_ = std::chrono::high_resolution_clock::now(); + } + + /** + * @brief Get the time in milliseconds between start_timer and stop_timer. 
+ * This will block until the event is synchronized + */ + float get_time_ms() { + auto duration = std::chrono::duration_cast(stop_ - start_); + return static_cast(static_cast(duration.count()) / 1e3); + } + /** * @brief Execute an operator * @@ -151,6 +176,8 @@ class HostExecutor { private: HostExecParams params_; + std::chrono::time_point start_; + std::chrono::time_point stop_; }; using SingleThreadedHostExecutor = HostExecutor; From c47e4b390da763976fda3a1111e02af7452881f5 Mon Sep 17 00:00:00 2001 From: cliffburdick Date: Wed, 29 Jan 2025 12:11:10 -0800 Subject: [PATCH 07/91] Fixed docs --- docs_input/basics/profiling.rst | 2 +- docs_input/build.rst | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs_input/basics/profiling.rst b/docs_input/basics/profiling.rst index 740565267..7fb39a74d 100644 --- a/docs_input/basics/profiling.rst +++ b/docs_input/basics/profiling.rst @@ -6,7 +6,7 @@ Profiling Profiling is a way to measure the performance of a program and to identify bottlenecks in your MatX application. Since the method for profiling depends on the executor, each executor implements its own profiling mechanism. For example, the CUDA executor can use events encapsulating the kernels it's profiling. The profiling is done through the executor -object rather than the `run` statement so that multiple `run`s can be profiled together. +object rather than the `run` statement so that multiple `run`\s can be profiled together. Profiling is done by calling the `start_timer()` method of the executor: diff --git a/docs_input/build.rst b/docs_input/build.rst index aad031937..19773a416 100644 --- a/docs_input/build.rst +++ b/docs_input/build.rst @@ -11,10 +11,10 @@ Optional features of MatX that require downloading separate libraries use additi be explicit about their requirements. The MatX CMake build configuration is intented to help download any libraries for both the required and optional features. 
-The CPM_ build system is used to help with package management and version control. By default, CPM will fetch other packages +The CPM build system is used to help with package management and version control. By default, CPM will fetch other packages from the internet. Alternatively, the option ``CPM_USE_LOCAL_PACKAGES`` can be used to point to local downloads in an air-gapped or offline environment. Choosing local versions of packages uses the typical ``find_packages`` CMake search methods. Please see -the CPM_ documentation or the documentation for each package for more information. +the CPM documentation or the documentation for each package for more information. System Requirements @@ -27,8 +27,7 @@ for supported host compilers. Other requirements for optional components are lis Required Third-party Dependencies --------------------------------- -- `CPM `_ (* Included in the project source and does not require a separate download) -- `CCCL `_ 2.7.0+ +- `CCCL `_ 2.7.0+ commit cbc6b9b or higher Optional Third-party Dependencies From 82d584639a371ad24f23accd2df8e782370ca4ef Mon Sep 17 00:00:00 2001 From: Tyler Allen Date: Wed, 29 Jan 2025 20:23:50 +0000 Subject: [PATCH 08/91] details added to notebook 1 --- .../notebooks/gtc_lab/01_lab_intro.ipynb | 690 ++++++++++++++++++ .../notebooks/gtc_lab/02_lab_fusion.ipynb | 266 +++++++ 2 files changed, 956 insertions(+) create mode 100644 docs_input/notebooks/gtc_lab/01_lab_intro.ipynb create mode 100644 docs_input/notebooks/gtc_lab/02_lab_fusion.ipynb diff --git a/docs_input/notebooks/gtc_lab/01_lab_intro.ipynb b/docs_input/notebooks/gtc_lab/01_lab_intro.ipynb new file mode 100644 index 000000000..be28b7577 --- /dev/null +++ b/docs_input/notebooks/gtc_lab/01_lab_intro.ipynb @@ -0,0 +1,690 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MatX Introduction\n", + "intro to class and goals " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + 
"languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "//todo this should be moved to a hidden init block that runs automatically when the notebook starts\n", + "#pragma cling add_library_path(\"/usr/local/cuda/lib64\")\n", + "#pragma cling add_library_path(\"/opt/xeus/cling/lib\")\n", + "//#pragma cling add_library_path(\"/usr/Lib/gcc/x86_64-Linux-gnu/11/\")\n", + "#pragma cling add_library_path(\"/usr/lib/x86_64-linux-gnu/openblas64-openmp/\")\n", + "#pragma cling add_include_path(\"/usr/local/cuda/include\")\n", + "#pragma cling add_include_path(\"/usr/include/x86_64-linux-gnu/openblas64-openmp\")\n", + "#pragma cling add_include_path(\"/opt/xeus/cling/tools/Jupyter/kernel/MatX/include\")\n", + "#pragma cling add_include_path(\"/opt/xeus/cling/tools/Jupyter/kernel/MatX/build/_deps/cccl-src/libcudacxx/include\")\n", + "//#pragma cling load(\"libgomp\")\n", + "#pragma cling load(\"libopenblas64\")\n", + "#pragma cling load(\"libcuda\")\n", + "#pragma cling load(\"libcudart\")\n", + "#pragma cling load(\"libcurand\")\n", + "#pragma cling load(\"libcublas\")\n", + "#pragma cling load(\"libcublasLt\")\n", + "\n", + "#include \n", + "#include \n", + "\n", + "#define MATX_EN_OPENBLAS\n", + "#define MATX_EN_OPENBLAS_LAPACK\n", + "#define MATX_OPENBLAS_64BITINT\n", + "\n", + "#include \"matx.h\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tensor Creation and Memory Backing\n", + "\n", + "Tensors are the base class of memory backed data storage in MatX. The Tensor class is highly flexible with many options for memory backing, residency, and ownership, but has defaults that makes it easy to use out-of-the box. A set of utility `make_tensor` functions are provided out of the box to help streamline and simplify tensor creation; this is the suggested use pattern for beginners and experts alike. \n", + "\n", + "`make_tensor` takes one template parameter indicating the type of the tensor, and zero or more function parameters. 
At a minimum, the sizes of the tensor are specified in curly braces, or in the case of a 0-D tensor, no size list is specified. For a complete guide on creating tensors in different ways, please visit: https://nvidia.github.io/MatX/creation.html.\n", + "\n", + "**NOTE** Unlike MATLAB, MatX follows the C-style for indexing, meaning we assume row-major formats rather than column-major, and 0-based indexing rather than 1-based. \n", + "\n", + "In the following cell we demonstrate creating tensors of 0D (scalar), 1D, and 2D data. Tensors can be scaled to any arbitrary dimension." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "// declare a 0D tensor (Scalar)\n", + "auto t0 = matx::make_tensor({});\n", + "\n", + "// declare a 1D tensor of length 4\n", + "auto t1 = matx::make_tensor({4});\n", + "\n", + "// declare a 2D tensor of size with 4 rows and 5 columns\n", + "auto t2 = matx::make_tensor({4,5});\n", + "\n", + "// declare tensor with user provided memory (maybe?)\n", + "\n", + "// declare tensor with shape of tensor t2\n", + "auto t2_b = matx::make_tensor(t2.Shape());" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Printing & Assigning\n", + "MatX also provides several utilities for initializing and viewing its data.\n", + "\n", + "Values can be initialized using a nested initializer list inside of the `SetVals` member function, specifying the values of the matrix. The initializer list is a single-nested list to match a 2D tensor shape, but this can be extended up to 4D tensors. `operator()` is also available to set and get individual values of a tensor as an alternative.\n", + "\n", + "`print` is a utility function to print a tensor or operator's contents to stdout. 
Printing can be used with any type of operator, including ones that have no memory backing them (see upcoming generators section). With no arguments `print` will print the entire contents of the tensor. The size of the printing can also be limited by passing a limit to each dimension. For example, `print(3,2)` would print the first 2 columns and 3 rows of the 2D tensor. `operator()` can also be used to return a single value, and combined with traditional printing techniques." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "// setVals in tensor\n", + "t2.SetVals({ \n", + " {1, 2, 3, 4},\n", + " {5, 6, 7, 8},\n", + " {9, 10, 11, 12},\n", + " {13, 14, 15, 16},\n", + " {17, 18, 19, 20}\n", + " });\n", + "\n", + "// print a tensor\n", + "matx::print(t2);\n", + "\n", + "// print elements of tensor\n", + "std::cout << t2(0,0) << std::endl;\n", + "\n", + "\n", + "t2(0,0) = 42;\n", + "t2(3,2) = 117;\n", + "\n", + "matx::print(t2);\n", + "\n", + "std::cout << \"My updates value for (3,2): \" << t2(3,2) << std::endl;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise: First Tensor\n", + "\n", + "Try defining a new integer tensor of size `{3, 5}` and initialize its values in increasing order from 1 to 15.\n", + "\n", + "Print your tensor to ensure the values are as expected.\n", + "\n", + "Update the element at `{1,1}` to `101`.\n", + "\n", + "Print the element at `{1,1}` to ensure your update was valid.\n", + "\n", + "Try other tensor manipulations to test the API!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "cpp" + } + }, + "outputs": [], + "source": [ + "// declare a tensor\n", + "// auto myTensor = matx::make_tensor({3,5});\n", + "\n", + "\n", + "// setVals in myTensor\n", + "// myTensor.SetVals({ \n", + "// {1, 2, 3},\n", + "// {4, 5, 6},\n", + "// {7, 8, 9},\n", + "// {10, 11, 12},\n", + "// {13, 14, 15}\n", + "// });\n", + "\n", + "\n", + "// print your new tensor\n", + "// matx::print(myTensor);\n", + "\n", + "\n", + "// update the value at {1,1} to 101\n", + "// myTensor(1,1) = 101;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tensors Views\n", + "MatX provides a powerful set of functions that enable arbitrary views into existing tensors, without incuring additional memory storage or processing cost to reorganize the data. These views provide \"zero copy\" accessors to a tensor that can be used in MatX logic as if it were a real memory-backed tensor.\n", + "\n", + "MatX has feature parity to most operations expected in cupy / matlab style environments; a full table of the translation of a given operation to it's MatX equivilant can be found in our full documentation [here](https://nvidia.github.io/MatX/basics/matlabpython.html#conversion-table)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Permute\n", + "`permute` returns a view of the data with the dimensions swapped to match the order of the initializer list argument. In the exmaple below we swap our two dimenions, so it's equivalent to a matrix transpose. 
However, `permute` can be used on higher-order tensors with the dimensions swapped in any order.\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "//permute\n", + "print(t2);\n", + "auto t2p = permute(t2, {1,0});\n", + "print(t2p);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Slice\n", + "`slice` provides a view of a subset of data in a tensor, allowing that subset to be used and manipulated as a single entity. The `slice` utility function takes the input operator and two initializer lists to define the range of the provided input operator the slice will contain. The ranges are defined with the start index and end (exclusive) index. \n", + "\n", + "In the example below, `t2s` will correspond to the elements `[1:3, 1:3)` (rows 1-2 and columns 1-2) of the larger `t2` tensor.\n", + "\n", + "![2D Slice](../img/dli-slice.png)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "cpp" + } + }, + "outputs": [], + "source": [ + "//slice example 1: same Rank\n", + "auto t2s = matx::slice(t2, {1,1}, {3, 3});\n", + "matx::print(t2s);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Similarly, `slice` can be used with a template parameter to define an operator of a different rank (dimensionality) than the input tensor. In the second example, we demonstrate slicing the 0th column from the t2 tensor as shown in the image below.\n", + "\n", + "![Column Slice](../img/dli-slice_col.png)\n", + "\n", + "MatX also includes several helper defines to make tensor bound definitions easier. To include all values from the beginning on, a special sentinel of `matxEnd` can be used. Similarly, `matxDropDim` is used to indicate this dimension is the one being sliced (i.e. removed)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "//slice example 2: reduce rank requires template parameter\n", + "auto t1Col = matx::slice<1>(t2, {0, 1}, {matx::matxEnd, matx::matxDropDim});\n", + "matx::print(t1Col);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Clone\n", + "`clone` provides a utility function to expand a smaller rank tensor to a larger rank by replicating the original data. \n", + "\n", + "For example, a 1D Tensor can be cloned to create a 2D tensor.\n", + "\n", + "In the clone example below, we will take the t1Col from our previous operation, and clone it to build a 2D [5,4] tensor." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "//clone\n", + "auto t2c_cols = matx::clone<2>(t1Col, {5, matx::matxKeepDim});\n", + "matx::print(t2c_cols);\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View Data Backing\n", + "We established earlier that views are not new data, but variable accessors into the original memory-backed tensor. This is a powerful tool when operating on the core data, as we can deconstruct a large data block into the set of data we want to operate on. \n", + "\n", + "**It is very important to remember that modifying the data in a view modifies the original tensor**\n", + "\n", + "This means any change to the original tensor, through the view or in any other fashion, will reflect in all views of that tensor." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "// modify the original tensor\n", + "t2(1,0) = 10;\n", + "// print our views to show the updated values\n", + "matx::print(t2);\n", + "matx::print(t1Col);\n", + "matx::print(t2c_cols);\n", + "\n", + "\n", + "//modify the tensor through a view\n", + "t1Col(1) = 203;\n", + "\n", + "// print our views to show the updated values\n", + "matx::print(t2);\n", + "matx::print(t1Col);\n", + "matx::print(t2c_cols);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise Views\n", + "Lets demonstrate your new skills in creating views of a tensor. using the pre-defined `baseTensor2D` please create the following views:\n", + "\n", + "- the complete first row of the `baseTensor`\n", + "- a 2D square of 4 elements, comprized of the first 2 rows and 2 columns of data\n", + "- modify the {1,1} element of baseTensor2D through the view corresponding to that data to assign it the value of 87.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "cpp" + } + }, + "outputs": [], + "source": [ + "auto baseTensor2D = matx::make_tensor({3,5});\n", + "\n", + "baseTensor2D.SetVals({ \n", + " {1, 2, 3},\n", + " {4, 5, 6},\n", + " {7, 8, 9},\n", + " {10, 11, 12},\n", + " {13, 14, 15}\n", + " });\n", + "\n", + "\n", + "// slice the first row of baseTensor\n", + "// auto baseTensor_row0 = matx::slice<1>(baseTensor2D, {0,0}, {matx::matxDropDim, matx::matxEnd});\n", + "matx::print(base_tensor_row0);\n", + "\n", + "\n", + "\n", + "//slice the 2D sqaure of the first 4 elements\n", + "// auto baseSquare = matx::slice(baseTensor2D, {0,0}, {3,3});\n", + "matx::print(baseSquare);\n", + "\n", + "// baseSquare(1,1) = 87;\n", + "matx::print(baseTensor2D);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "## MatX 
Operations\n", + "Operators in MatX are an abstract type that defines an operation that returns a value at a given index. This concept is intentionally vague, which makes it extremely powerful for representing different concepts. As an example, both a tensor type and the addition operator `+` are MatX operators. In the case of the tensor, it returns the value in memory at that location, but for the addition operator, it returns the sum of values at a given location from both the left and right hand sides.\n", + "\n", + "Most operators come in unary types for operating on a single input or a binary type for operating on two inputs. For example, the expression `A + B` uses the binary `AddOp` operator to lazily add two tensors or other operators together. MatX supports most of the standard unary and binary operators a user would expect, and they work with both MatX tensor/operator types, as well as scalar values that are compatible with the base type of the operator.\n", + "\n", + "Below we'll demonstrate both scalar and matrix support for the basic binary operators (`+`, `-`, `*`, `/`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "auto A = matx::make_tensor({2, 3});\n", + "auto B = matx::make_tensor({2, 3});\n", + "auto C = matx::make_tensor({2, 3});\n", + "auto D = matx::make_tensor({2, 2});\n", + "\n", + "A.SetVals({ {1, 2, 3},\n", + " {4, 5, 6}\n", + " });\n", + "(B = A).run(); \n", + "\n", + "matx::print(A); \n", + "\n", + "// add\n", + "matx::print(A + 5); // scalar \n", + "matx::print(A + B); // matrix\n", + "\n", + "// subtraction\n", + "matx::print(A - 5);\n", + "matx::print(A - B);\n", + "\n", + "// multiplication (dot)\n", + "matx::print(A * 5);\n", + "matx::print(A * B);\n", + "\n", + "// division \n", + "matx::print(A / 5);\n", + "matx::print(A / B);\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exercise: Operators\n", + "Please use the provided A and B tensors to complete the following set of operations:\n", + "\n", + "- Multiply `A` by it's scalar weight factor `aScale`\n", + "- subtract `bOffset` from the matrix `B`\n", + "- Add the `A` and `B` Tensors\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "cpp" + } + }, + "outputs": [], + "source": [ + "int aScale = 5;\n", + "int bOffset = 2;\n", + "\n", + "// scale A by aScale\n", + "// print( A * ascale);\n", + "\n", + "// subtract B by bOffset\n", + "//print( B - bOffset);\n", + "\n", + "// add A and B Tensors\n", + "// print(A + B);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## MatX Transforms\n", + "Transforms are operators that take one or more inputs and call a backend library or kernel. Transforms usually changes one or more properties of the input, but that is not always the case. An fft may change the input type or shape, but a sort transform does not. Depending on the context used, a transform may asynchronously allocate temporary memory if the expression requires it." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Matrix Multiplication\n", + "The `matmul` executor performs the matrix-matrix multiply of $$C = {\\alpha}A * B + {\\beta}C$$ where `A` is of dimensions `MxK`, `B` is `KxN`, and `C` is `MxN`. We first populate the `A` and `B` matrices with random values before the multiply as we did in the example above, then the GEMM is performed. Since the random number generator allocates memory sufficient to randomize the entire tensor, we create a random number generator large enough to generate values for either A or B. This allows us to create a single random number generator, but pull different random values for A and B by simply calling `run` twice. As mentioned above, any rank above 2 is considered a batching dimension.\n", + "\n", + "We use rectangular matrices for `A` and `B`, while `C` will be a square matrix due to the outer dimensions of `A` and `B` matching. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "// matrix multiplication\n", + "(C = matx::matmul(A,matx::transpose(B))).run();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### FFTs\n", + "MatX provides an interface to do both 1D Fast Fourier Transforms (FFTs) and 2D FFTs. Any tensor above rank 1 will be batched in a 1D FFT, and any tensor above rank 2 will be batched in a 2D FFT. FFTs may either be done in-place or out-of-place by using the same or different variables for the output and inputs. Since the tensors are strongly-typed, the type of FFT (C2C, R2C, etc) is inferred by the tensor type at compile time. Similarly, the input and output size of the executor is deduced by the type of transform, and the input/output tensors must match those sizes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "cpp" + } + }, + "outputs": [], + "source": [ + "// FFT\n", + "auto D = matx::make_tensor({2, 4});\n", + "\n", + "(D = matx::random(D.Shape(), matx::NORMAL)).run(); // explained later\n", + "matx::print(D);\n", + "\n", + "(D = fft(D)).run();\n", + "matx::print(D);\n", + "\n", + "(D = matx::ifft(D)).run(); \n", + "matx::print(D);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reductions\n", + "Reductions are one of the most common operations perfomed on the GPU, which means they've been heavily researched and optimized for highly-parallel processors. Modern NVIDIA GPUs have special instructions for performing reductions to give even larger speedups over naive implementations. All of these details are hidden from the user and MatX automatically chooses the optimized path based on the hardware capabilities. \n", + "\n", + "MatX provides a set of optimized primitives to perform reductions on tensors for many common types. Reductions are supported across individual dimensions or on entire tensors, depending on the size of the output tensor. Currently supported reduction functions are `sum`, `min`, `max`,` mean`, `any`, and `all`\n", + "\n", + "below is a simple example for calcluate a full reduction of the max and sum of our A data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "cpp" + } + }, + "outputs": [], + "source": [ + "auto MD0 = matx::make_tensor({});\n", + "auto AD0 = matx::make_tensor({});\n", + "\n", + "(A = matx::random(A.Shape(), matx::NORMAL)).run(); \n", + "\n", + "\n", + "// max of data\n", + "(MD0 = max(A)).run();\n", + "// min of data\n", + "(AD0 = sum(A)).run();\n", + "\n", + "printf(\"A:\\n\");\n", + "matx::print(A);\n", + "printf(\"Max: %f\\n\", MD0());\n", + "printf(\"Sum: %f\\n\", AD0()); " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### Additional Transforms\n", + "MatX Supports a wide range of transforms, including specializations for specific domains of signal processing. Please review the [MatX documentation](https://nvidia.github.io/MatX/api/index.html) for an exhaustive list of supported operations, but we'll review a few of the most common here." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "cpp" + } + }, + "outputs": [], + "source": [ + "// Do we want to show any additional reducitons here? Talk about batching?\n", + "// convolution\n", + "// batched transforms" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## MatX Generators\n", + "Generators are a type of operator that can generate values without another tensor or operator as input. For example, windowing functions, such as a Hamming window, can generate values by only taking a length as input. Generators are efficient since they require no memory.\n", + "\n", + "Common generators include random number generation, filters, or identity matricies. 
Below is an example of each:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto H = matx::make_tensor({10});\n", + "\n", + "// random\n", + "(A = 0).run();\n", + "(A = matx::random(A.Size(), matx::NORMAL)).run();\n", + "matx::print(A);\n", + "\n", + "// eye\n", + "(A = matx::eye(A.Size())).run();\n", + "matx::print(A);\n", + "\n", + "// hamming\n", + "(H = matx::hamming<0>(H.Shape())).run();\n", + "matx::print(H);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise: Transforms and generators:\n", + "\n", + "For this example we will generate random data to verify the distribution of our generator functions. Please implement the following:\n", + "\n", + "- generate 10 1D data arrays of 1000 elements\n", + "- perform a 1D FFT on the entire data set\n", + "- find the max bin of each fft'd data set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "cpp" + } + }, + "outputs": [], + "source": [ + "//input data storage\n", + "auto input = matx::make_tensor({10,1000});\n", + "auto maxVal = matx::Make_tensor({10});\n", + "auto maxIdx = matx::Make_tensor({10});\n", + "\n", + "// generate random data\n", + "// (input = matx::random({input_size});\n", + "auto S = matx::make_tensor({input_size});\n", + "auto V = matx::make_tensor({input_size});\n", + "auto r = matx::make_tensor({input_size});\n", + "auto T = matx::make_tensor({input_size});\n", + "auto output = matx::make_tensor({input_size}); \n", + "auto referenceOutput = matx::make_tensor({input_size}); \n", + "\n", + "// Individually Evaluated Reference\n", + "(referenceOutput = S * matx::normcdf((matx::log(S / K) + (r + 0.5 * V * V) * T) / V * matx::sqrt(T)) - K * matx::exp(-1 * r * T) * matx::normcdf((matx::log(S / K) + (r + 0.5 * V * V) * T) 
/ V * sqrt(T) - V * sqrt(T);)).run(exec);\n", + "print(referenceOutput);\n", + "\n", + "// well organized version\n", + "// auto VsqrtT = V * sqrt(T);\n", + "// auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ;\n", + "// auto d2 = d1 - VsqrtT;\n", + "// auto cdf_d1 = normcdf(d1);\n", + "// auto cdf_d2 = normcdf(d2);\n", + "// auto expRT = exp(-1 * r * T); \n", + "\n", + "// (output = S * cdf_d1 - K * expRT * cdf_d2).run(exec);\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From c695d5a76c24c0cb1044eb41294e77b0f85f5aec Mon Sep 17 00:00:00 2001 From: Tyler Allen Date: Wed, 29 Jan 2025 22:06:43 +0000 Subject: [PATCH 09/91] adding incomplete books --- .../notebooks/gtc_lab/03_lab_radar.ipynb | 86 +++++++++++++++++++ .../notebooks/gtc_lab/04_lab_profiling.ipynb | 80 +++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 docs_input/notebooks/gtc_lab/03_lab_radar.ipynb create mode 100644 docs_input/notebooks/gtc_lab/04_lab_profiling.ipynb diff --git a/docs_input/notebooks/gtc_lab/03_lab_radar.ipynb b/docs_input/notebooks/gtc_lab/03_lab_radar.ipynb new file mode 100644 index 000000000..c60caef25 --- /dev/null +++ b/docs_input/notebooks/gtc_lab/03_lab_radar.ipynb @@ -0,0 +1,86 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MatX Radar Tutorial\n", + "This notebooks demonstrates MatX's ability to quickly prototype and implement a real-world workflow, using a radar pipeline as a representative taks. In this notebook we'll combine all of our learned skillsets in MatX to quickly implement a complex radar operation in a few simple lines." 
+ ] + }, + { + "attachments": { + "image.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Radar Example Pipeline\n", + "\n", + "\n", + "### Pulse Compression Theory\n", + "Pulse compression is a signal processing technique commonly used by radar, sonar and echography to either increase the range resolution when pulse length is constrained or increase the signal to noise ratio when the peak power and the bandwidth (or equivalently range resolution) of the transmitted signal are constrained. This is achieved by modulating the transmitted pulse and then correlating the received signal with the transmitted pulse. \n", + "\n", + "![image.png](attachment:image.png)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Implementing Pulse Compression \n", + "### Set of Operations to Perform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "cpp" + } + }, + "outputs": [], + "source": [ + "// organize the incoming data and waveform\n", + "auto waveformPart = slice(waveformView, {0}, {waveformLength});\n", + "auto waveformT = waveformView.template Clone<3>({numChannels, numPulses, matxKeepDim});\n", + "auto waveformFull = slice(waveformView, {0}, {numSamplesRnd});\n", + "\n", + "auto x = inputView;\n", + "\n", + "// create waveform expression for this dwell\n", + "(waveformPart = waveformPart * hamming<0>({waveformLength})).run(exec);\n", + "\n", + "// compute L2 norm\n", + "(norms = sum(abs2(waveformPart))).run(exec);\n", + "(norms = sqrt(norms)).run(exec);\n", + "\n", + "(waveformPart = waveformPart / norms).run(exec);\n", + "(waveformFull = fft(waveformPart, numSamplesRnd)).run(exec);\n", + "(waveformFull = conj(waveformFull)).run(exec);\n", + "\n", + "//FFT based convolution\n", + "(x = fft(x)).run(exec);\n", + "(x = x * waveformT).run(exec);\n", + "(x = ifft(x)).run(exec);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "## Optimization of Pulse Compression" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs_input/notebooks/gtc_lab/04_lab_profiling.ipynb b/docs_input/notebooks/gtc_lab/04_lab_profiling.ipynb new file mode 100644 index 000000000..69ab04700 --- /dev/null +++ b/docs_input/notebooks/gtc_lab/04_lab_profiling.ipynb @@ -0,0 +1,80 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Profiling Notebook\n", + "\n", + "## Profiling in MatX\n", + "\n", + "### Why we profile\n", + "### When to profile " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## NVIDIA Profiling Tools\n", + "\n", + "### Nsight Systems GUI\n", + "### Nsight Systems CLI" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## MatX Profiling tools\n", + "\n", + "### Automatic Ranges\n", + "\n", + "### User Defined Ranges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "cpp" + } + }, + "outputs": [], + "source": [ + "// example of how to define your own ranges both scoped and manual " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Profile Level \n", + "\n", + "### Compile Time Control" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Profiling Radar Application\n", + "\n", + "### Standalone PC Ops\n", + "\n", + "### Fused PC Ops\n", + "\n", + "### GPU Ops\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 9fa6c817f75bcff9451a1d0426d44e1436ef3dc4 Mon Sep 17 00:00:00 2001 From: Tyler Allen Date: Wed, 29 Jan 2025 18:49:24 -0700 Subject: [PATCH 10/91] update profiling lab with content explaining API --- .../notebooks/gtc_lab/04_lab_profiling.ipynb | 134 +++++++++++++++--- 1 file changed, 117 insertions(+), 17 deletions(-) diff 
--git a/docs_input/notebooks/gtc_lab/04_lab_profiling.ipynb b/docs_input/notebooks/gtc_lab/04_lab_profiling.ipynb index 69ab04700..793fb486b 100644 --- a/docs_input/notebooks/gtc_lab/04_lab_profiling.ipynb +++ b/docs_input/notebooks/gtc_lab/04_lab_profiling.ipynb @@ -4,33 +4,88 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Profiling Notebook\n", + "# Profiling in MatX\n", + "Improving performance is at the heart of MatX's value, so it must facilitate any easy to implement and powerful capability for benchmarking and analysing code both at deployment and during development.\n", "\n", - "## Profiling in MatX\n", - "\n", - "### Why we profile\n", - "### When to profile " + "The NVIDIA software ecosystem provides a powerful profiling suite of tools through [Nsight Systems]() and [Nsight Compute]() that allows developers to gain great insight into the performance of their code and utilization of their hardware. MatX leverages this powerful ecosystem through the [NVTX toolkit]() which allows developers to annote their code for use with the Nsight suite of tools. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## NVIDIA Profiling Tools\n", + "## MatX Profiling tools\n", + "MatX provides an NVTX API to enable native compile-in profiling capabilities. The MatX NVTX API enable a user to \n", + "easily profile all MatX calls using built-in NVTX ranges, while also providing a convenient API for the user to insert \n", + "custom ranges in their own code. This API provides many convenience features such as:\n", + "\n", + "- A convenient compile-in/compile-out MACRO based API \n", + "- verbosity levels allowing varying levels of profiling detail\n", + "- Built-in color rotation\n", + "- Automatic scope management and range naming \n", + "- Overloaded API for manual range specification\n", + "\n", + "MatX Implements it's NVTX API as a set of macros, which allows users to easily compile NVTX functionality into, or out of your code. 
This completely removes any runtime penalty that may be caused by NVTX in the most latency sensitive deployments.\n", "\n", - "### Nsight Systems GUI\n", - "### Nsight Systems CLI" + "To enable the NVTX Profiling API, simply compile with the ``MATX_NVTX_FLAGS=ON`` option enabled in the cmake command." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## MatX Profiling tools\n", + "### User Defined Ranges\n", + "User defined NVTX ranges require the user to provide a name and unique ID for each range. The name will appear in the NVTX range of your nsight profiles, while the unique ID is only used internally to track your ranges during deletion. Because of this, the unique ID **must** be unique for any ranges that overlap, otherwise you may delete the incorrect range during tear-down.\n", + "\n", + "Below is an example of a user-defined NVTX range:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "cpp" + } + }, + "outputs": [], + "source": [ + "using dtype = double;\n", + "index_t input_size = 10;\n", + "// index_t inputIsze = 10000000; // increase size to measure performance\n", + "\n", + "//declare input data\n", + "auto K = matx::make_tensor({input_size});\n", + "auto S = matx::make_tensor({input_size});\n", + "auto V = matx::make_tensor({input_size});\n", + "auto r = matx::make_tensor({input_size});\n", + "auto T = matx::make_tensor({input_size});\n", + "auto output = matx::make_tensor({input_size}); \n", + "auto referenceOutput = matx::make_tensor({input_size}); \n", + "\n", + "MATX_NVTX_START_RANGE(\"Black-Scholes Op Creation\", 0)\n", + "// well organized version\n", + "auto VsqrtT = V * sqrt(T);\n", + "auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ;\n", + "auto d2 = d1 - VsqrtT;\n", + "auto cdf_d1 = normcdf(d1);\n", + "auto cdf_d2 = normcdf(d2);\n", + "auto expRT = exp(-1 * r * T); \n", + "MATX_NVTX_END_RANGE(0)\n", "\n", + "MATX_NVTX_START_RANGE(\"Black-Scholes Execution\", 1)\n", + 
"(output = S * cdf_d1 - K * expRT * cdf_d2).run(exec);\n", + "MATX_NVTX_END_RANGE(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "### Automatic Ranges\n", + "Alternative versions of the timing macros are provided to automate handling the MatX NVTX ranges. The `MATX_NVTX_START_RANGE` has an overload which allows its use without providing a unique ID. Instead the macro returns an ID, which can be stored in an int variable and later passed to the end range call. When NVTX ranges are compiled out, the macros simply return 0, and no action is taken on the end call.\n", "\n", - "### User Defined Ranges" + "Below is an example using the automatic enumeration feature:" ] }, { @@ -43,30 +98,75 @@ }, "outputs": [], "source": [ - "// example of how to define your own ranges both scoped and manual " + "int bc_range = MATX_NVTX_START_RANGE(\"Black-Scholes Execution\");\n", + "(output = S * cdf_d1 - K * expRT * cdf_d2).run(exec);\n", + "MATX_NVTX_END_RANGE(bc_range);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "### Scope Based Ranges\n", + "A final version of the API, `MATX_NVTX_START`, is provided that matches the life of the NVTX range to the life of the scope in which it is defined. This automatically enumerates a unique ID, and does not need to be explicitly destroyed by the user. \n", "\n", - "### Profile Level \n", + "Similarly it will also inherit the name of the function it is called from, and does not require a name. 
This is especially useful for automating ranges for entire functions.\n", "\n", - "### Compile Time Control" + "An example of this API is as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "cpp" + } + }, + "outputs": [], + "source": [ + "void myFunction\n", + "{\n", + " MATX_NVTX_START(\"\");\n", + " \n", + " (output = S * cdf_d1 - K * expRT * cdf_d2).run(exec);\n", + "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Profiling Radar Application\n", "\n", - "### Standalone PC Ops\n", + "### Profile Level \n", + "The MatX NVTX API supports logging levels, allowing you to fine-tune the levels of NVTX ranges that are captured at a given time. The logging level is checked at runtime, so it can be dynamically changed throughout program execution.\n", + "A utility macro, `MATX_NVTX_SET_LOG_LEVEL(LOG_LEVEL)`, is provided to set the log level.\n", + "\n", + "All Events default to the log level `MATX_NVTX_LOG_USER`, and the default verbosity is `MATX_NVTX_LOG_API`. \n", "\n", - "### Fused PC Ops\n", "\n", - "### GPU Ops\n" + "There are 5 increasing levels of verbosity:\n", + "```\n", + "MATX_NVTX_LOG_NONE\n", + "MATX_NVTX_LOG_USER\n", + "MATX_NVTX_LOG_API\n", + "MATX_NVTX_LOG_INTERNAL\n", + "MATX_NVTX_LOG_ALL\n", + "``` \n", + "\n", + "`MATX_NVTX_LOG_NONE` ensures that no Ranges are recorded.\n", + "`MATX_NVTX_LOG_ALL` ensures all NVTX Ranges are recorded.\n", + "\n", + "Any intermediate level ensures that level and all levels above it are recorded. For example, if `MATX_NVTX_LOG_API`\n", + "is enabled, then all events of type `MATX_NVTX_LOG_USER` **AND** `MATX_NVTX_LOG_API` will be recorded.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Profiling Radar Application\n", + "To demonstrate the power of the NVTX ranges, we'll demonstrate using the [Radar Pipeline example]() in the MatX example codes. 
This pipeline showcases both the powerful accleration MatX provides, as well as the granular insight we gain into our performance through the MatX NVTX API.\n" ] } ], From e57459d70c31641d086ba2710a4dcf90e87bb606 Mon Sep 17 00:00:00 2001 From: Tyler Allen Date: Wed, 29 Jan 2025 20:45:10 -0700 Subject: [PATCH 11/91] adding exec to all runs --- .../notebooks/gtc_lab/01_lab_intro.ipynb | 30 +++++---- .../notebooks/gtc_lab/02_lab_fusion.ipynb | 64 ++++++++++++++++--- 2 files changed, 70 insertions(+), 24 deletions(-) diff --git a/docs_input/notebooks/gtc_lab/01_lab_intro.ipynb b/docs_input/notebooks/gtc_lab/01_lab_intro.ipynb index be28b7577..f45bf20f4 100644 --- a/docs_input/notebooks/gtc_lab/01_lab_intro.ipynb +++ b/docs_input/notebooks/gtc_lab/01_lab_intro.ipynb @@ -42,7 +42,9 @@ "#define MATX_EN_OPENBLAS_LAPACK\n", "#define MATX_OPENBLAS_64BITINT\n", "\n", - "#include \"matx.h\"" + "#include \"matx.h\"\n", + "\n", + "exec = matx::SingleThreadedHostExecutor;" ] }, { @@ -492,7 +494,7 @@ "source": [ "\n", "// matrix multiplication\n", - "(C = matx::matmul(A,matx::transpose(B))).run();" + "(C = matx::matmul(A,matx::transpose(B))).run(exec);" ] }, { @@ -516,13 +518,13 @@ "// FFT\n", "auto D = matx::make_tensor({2, 4});\n", "\n", - "(D = matx::random(D.Shape(), matx::NORMAL)).run(); // explained later\n", + "(D = matx::random(D.Shape(), matx::NORMAL)).run(exec); // explained later\n", "matx::print(D);\n", "\n", - "(D = fft(D)).run();\n", + "(D = fft(D)).run(exec);\n", "matx::print(D);\n", "\n", - "(D = matx::ifft(D)).run(); \n", + "(D = matx::ifft(D)).run(exec); \n", "matx::print(D);" ] }, @@ -551,13 +553,13 @@ "auto MD0 = matx::make_tensor({});\n", "auto AD0 = matx::make_tensor({});\n", "\n", - "(A = matx::random(A.Shape(), matx::NORMAL)).run(); \n", + "(A = matx::random(A.Shape(), matx::NORMAL)).run(exec); \n", "\n", "\n", "// max of data\n", - "(MD0 = max(A)).run();\n", + "(MD0 = max(A)).run(exec);\n", "// min of data\n", - "(AD0 = sum(A)).run();\n", + "(AD0 = 
sum(A)).run(exec);\n", "\n", "printf(\"A:\\n\");\n", "matx::print(A);\n", @@ -620,16 +622,16 @@ "auto H = matx::make_tensor({10});\n", "\n", "// random\n", - "(A = 0).run();\n", - "(A = matx::random(A.Size(), matx::NORMAL)).run();\n", + "(A = 0).run(exec);\n", + "(A = matx::random(A.Size(), matx::NORMAL)).run(exec);\n", "matx::print(A);\n", "\n", "// eye\n", - "(A = matx::eye(A.Size())).run();\n", + "(A = matx::eye(A.Size())).run(exec);\n", "matx::print(A);\n", "\n", "// hamming\n", - "(H = matx::hamming<0>(H.Shape())).run();\n", + "(H = matx::hamming<0>(H.Shape())).run(exec);\n", "matx::print(H);" ] }, @@ -662,13 +664,13 @@ "auto maxIdx = matx::Make_tensor({10});\n", "\n", "// generate random data\n", - "// (input = matx::random